{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from sqlalchemy import create_engine\n", "from sklearn import preprocessing\n", "import mysql.connector\n", "from pandas import DataFrame" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "Triples_target_final = pd.read_csv(\"Triples_target_final.tsv\", sep='\\t')\n", "Triples_target_final = Triples_target_final.drop([\"Unnamed: 0\"],axis=1)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "triplets_total = pd.read_csv('triplets_total.csv', sep=';')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "triplets_total = triplets_total.drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "disease_id = triplets_total[\"disease_PwB\"]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def convert(lista):\n", " return tuple(i for i in lista)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0030567', 'C0030567', 'C0030567', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0030567', 'C0035579', 'C0035579', 'C0020538', 'C0030567', 'C0030567', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0035579', 'C0035579', 'C0035579', 'C0030567', 'C0020538', 'C0002892', 'C0002892', 'C0002892', 'C0002892', 'C0002892', 'C0035579', 'C0030567', 'C0030567', 'C0002395', 'C0002395', 'C0002395', 'C0002395', 'C0030567', 'C0030567', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0025202', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0020538')\n" ] } ], "source": [ "# Driver function\n", "lista = disease_id\n", "print(convert(lista))\n", "list_disease_id = convert(lista)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "dis_gen = f'''SELECT disease_id, gene_id FROM disnet_biolayer.disease_gene\n", "where sio_id != 'SIO_001120'\n", "and sio_id != 'NO_CURATED'\n", "and disease_id IN {list_disease_id}\n", ";'''" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "dis_gen=pd.read_sql(dis_gen, con=disnet_db_ares)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "dis_gen_num_gen = dis_gen.groupby(['disease_id']).count()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "dis_gen_num_gen = dis_gen_num_gen.reset_index()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "581.8571428571429" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dis_gen_num_gen[\"gene_id\"].mean()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 7.000000\n", "mean 581.857143\n", "std 464.301780\n", "min 3.000000\n", "25% 267.500000\n", "50% 559.000000\n", "75% 854.500000\n", "max 1267.000000\n", "Name: gene_id, dtype: float64" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dis_gen_num_gen[\"gene_id\"].describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#### pw via gene" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "dis_gen_pw = f'''\n", "SELECT dg.disease_id,gp.pathway_id\n", " FROM disnet_biolayer.disease_gene dg \n", " JOIN disnet_biolayer.disease ds ON ds.disease_id = dg.disease_id\n", " JOIN disnet_biolayer.tmp_gene_pathway gp ON gp.gene_id = dg.gene_id\n", " where dg.disease_id in {list_disease_id}\n", "'''" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "dis_gen_pw=pd.read_sql(dis_gen_pw, con=disnet_db_ares)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "dis_gen_pw = dis_gen_pw.groupby(['disease_id']).count()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "dis_gen_pw = dis_gen_pw.reset_index()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1847.142857142857" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dis_gen_pw[\"pathway_id\"].mean()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 7.000000\n", "mean 1847.142857\n", "std 1531.396251\n", "min 3.000000\n", "25% 795.000000\n", "50% 1640.000000\n", "75% 2748.500000\n", "max 4200.000000\n", "Name: pathway_id, dtype: float64" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dis_gen_pw[\"pathway_id\"].describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#### pw direct" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
disease_idpathway_id
0C0020538WP554
1C0018799WP1544
2C0018799WP1528
3C0027947WP229
4C0013369WP229
.........
659C0268274WP4153
660C0085131WP4153
661C0036161WP4153
662C0268275WP4153
663C0162666WP4236
\n", "

664 rows × 2 columns

\n", "
" ], "text/plain": [ " disease_id pathway_id\n", "0 C0020538 WP554\n", "1 C0018799 WP1544\n", "2 C0018799 WP1528\n", "3 C0027947 WP229\n", "4 C0013369 WP229\n", ".. ... ...\n", "659 C0268274 WP4153\n", "660 C0085131 WP4153\n", "661 C0036161 WP4153\n", "662 C0268275 WP4153\n", "663 C0162666 WP4236\n", "\n", "[664 rows x 2 columns]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dis_path_direct = pd.read_csv('disease_pathway.tsv', sep='\\t')\n", "dis_path_direct" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "triplets_total_fil = triplets_total.drop([\"disease_no_PwB\",\"drug_id\"],axis=1)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "triplets_total_fil = triplets_total_fil.rename(columns={\"disease_PwB\": \"disease_id\"})" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "dr_pw = triplets_total_fil.merge(dis_path_direct,on=\"disease_id\",how= \"inner\")" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "dr_pw = dr_pw.groupby(['disease_id']).count()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "dr_pw = dr_pw.reset_index()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "42.0" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dr_pw[\"pathway_id\"].mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "### drug" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "drug = f'''SELECT disease_id,drug_id FROM disnet_drugslayer.drug_disease\n", "where disease_id in {list_disease_id}'''" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "drug=pd.read_sql(drug, con=disnet_db_ares)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "drug = drug.groupby(['disease_id']).count()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "drug = drug.reset_index()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "685.5714285714286" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drug[\"drug_id\"].mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "### symptom" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "sint = f'''SELECT DISTINCT\n", " ds.disease_id,s.cui as symptom \n", " FROM disnet_biolayer.disease ds \n", " JOIN edsssdb.layersmappings lm on ds.disease_id = lm.cui\n", " JOIN edsssdb.disease_symptom dsy ON lm.disnet_id = dsy.disease_id\n", " JOIN edsssdb.symptom s ON dsy.cui = s.cui\n", " where ds.disease_id in {list_disease_id}\n", "'''" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "sint_all=pd.read_sql(sint, con=disnet_db_ares)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "sint_all = sint_all.groupby(['disease_id']).count()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "sint_all = sint_all.reset_index()" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "83.71428571428571" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sint_all[\"symptom\"].mean()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }