{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from sqlalchemy import create_engine\n", "from sklearn import preprocessing\n", "import mysql.connector\n", "from pandas import DataFrame" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Database handle passed as `con=` to every pd.read_sql call in this notebook.\n", "# The connection URL (credentials included) is read from the DISNET_DB_URL environment\n", "# variable so that no secret is stored in the notebook; a missing variable raises KeyError here.\n", "# NOTE(review): assumes a SQLAlchemy URL such as mysql+mysqlconnector://user:password@host:3306 - confirm.\n", "disnet_db_ares = create_engine(os.environ[\"DISNET_DB_URL\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "Triples_target_final = pd.read_csv(\"Triples_target_final.tsv\", sep='\\t')\n", "Triples_target_final = Triples_target_final.drop([\"Unnamed: 0\"],axis=1)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "triplets_total = pd.read_csv('triplets_total.csv', sep=';')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "triplets_total = triplets_total.drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "disease_id = triplets_total[\"disease_no_PwB\"]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "def convert(lista):\n", "    \"\"\"Return the items of `lista` as a tuple, keeping their order and any duplicates.\"\"\"\n", "    return tuple(lista)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('C0018802', 'C0018802', 'C0004238', 'C0011881', 'C0001206', 'C0020649', 'C0024586', 'C0010674', 'C0011881', 'C0004238', 'C0003962', 'C0013604', 'C0020428', 'C0020473', 'C0018801', 'C0004096', 'C0009324', 'C0010346', 'C0011849', 'C0020598', 'C0020626', 'C0024141', 'C0026769', 'C0028754', 'C0029456', 'C0029458', 'C0015371', 'C0020598', 'C0035086', 'C0011881', 'C0002395', 'C0026769', 'C0020598', 'C0020626', 'C0029456', 'C0029458', 'C0020598', 'C0020626', 'C0020598', 'C0020626', 'C0035086', 'C0003862', 'C0003873', 'C0004153', 'C0004604', 'C0009443', 'C0026764', 'C0001144', 'C0001261', 
'C0003175', 'C0006277', 'C0006309', 'C0010674', 'C0011581', 'C0018081', 'C0023860', 'C0027404', 'C0030193', 'C0016053', 'C0184567', 'C0242422', 'C0600177', 'C1621958', 'C0027051', 'C0028754', 'C0038454', 'C0038454', 'C0036337', 'C0036341', 'C0042870', 'C0242422', 'C1739363', 'C0042847', 'C0162316', 'C0006114', 'C0033860', 'C0042847', 'C0085682', 'C0184567', 'C0242422', 'C0026769', 'C0030567', 'C0036341', 'C0497327', 'C0036341', 'C0497327', 'C0042870', 'C3536984', 'C0039621', 'C0085682', 'C0042870', 'C0085682', 'C1527383', 'C0027051', 'C0029408', 'C0030193', 'C0032463', 'C0040460', 'C0149931', 'C0393735', 'C0948089', 'C0031099', 'C0031350', 'C0032064', 'C0032285', 'C0034362', 'C0035854', 'C0037199', 'C0039128', 'C0042029', 'C0026764', 'C0036341', 'C1269683', 'C0036341', 'C0497327', 'C1269683', 'C0002395')\n" ] } ], "source": [ "# Driver function\n", "lista = disease_id\n", "print(convert(lista))\n", "# The queries below interpolate this tuple's repr into `IN {...}`;\n", "# that is only valid SQL when the tuple holds two or more IDs.\n", "list_disease_id = convert(lista)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# The SQL text keeps its own name so that the read_sql cell below can be re-run\n", "# (previously the DataFrame overwrote the query string).\n", "dis_gen_query = f'''SELECT disease_id, gene_id FROM disnet_biolayer.disease_gene\n", "where sio_id != 'SIO_001120'\n", "and sio_id != 'NO_CURATED'\n", "and disease_id IN {list_disease_id}\n", ";'''" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "dis_gen = pd.read_sql(dis_gen_query, con=disnet_db_ares)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# count() gives the number of disease_gene rows per disease.\n", "# NOTE(review): this equals the number of genes only if (disease_id, gene_id) pairs are unique - confirm.\n", "dis_gen_num_gen = dis_gen.groupby(['disease_id']).count().reset_index()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "228.91780821917808" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dis_gen_num_gen[\"gene_id\"].mean()" ] }, { "cell_type": "code", "execution_count": 28, 
"metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 73.000000\n", "mean 228.917808\n", "std 301.082480\n", "min 1.000000\n", "25% 14.000000\n", "50% 79.000000\n", "75% 392.000000\n", "max 1369.000000\n", "Name: gene_id, dtype: float64" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dis_gen_num_gen[\"gene_id\"].describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#### pw via gene" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "dis_gen_pw = f'''\n", "SELECT dg.disease_id,gp.pathway_id\n", " FROM disnet_biolayer.disease_gene dg \n", " JOIN disnet_biolayer.disease ds ON ds.disease_id = dg.disease_id\n", " JOIN disnet_biolayer.tmp_gene_pathway gp ON gp.gene_id = dg.gene_id\n", " where dg.disease_id in {list_disease_id}\n", "'''" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "dis_gen_pw=pd.read_sql(dis_gen_pw, con=disnet_db_ares)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "dis_gen_pw = dis_gen_pw.groupby(['disease_id']).count()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "dis_gen_pw = dis_gen_pw.reset_index()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "766.2361111111111" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dis_gen_pw[\"pathway_id\"].mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#### pw direct" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dis_path_direct = pd.read_csv('disease_pathway.tsv', sep='\\t')" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "triplets_total_fil = 
triplets_total.drop([\"disease_PwB\",\"drug_id\"],axis=1)" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "triplets_total_fil = triplets_total_fil.rename(columns={\"disease_no_PwB\": \"disease_id\"})" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "# disease_id repeats in triplets_total_fil (the ID tuple printed earlier contains duplicates).\n", "# Keep one row per disease before merging; otherwise each pathway is counted once per\n", "# repeated row and the per-disease pathway count is inflated.\n", "diseases_unique = triplets_total_fil.drop_duplicates(subset=\"disease_id\")\n", "dr_pw = diseases_unique.merge(dis_path_direct,on=\"disease_id\",how= \"inner\")" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "# Aggregate into a new name: re-running this cell must not count an already aggregated frame.\n", "dr_pw_counts = dr_pw.groupby(['disease_id']).count().reset_index()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dr_pw_counts[\"pathway_id\"].mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Drugs per disease" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "# The SQL text keeps its own name so that the read_sql cell below can be re-run.\n", "drug_query = f'''SELECT disease_id,drug_id FROM disnet_drugslayer.drug_disease\n", "where disease_id in {list_disease_id}'''" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "drug = pd.read_sql(drug_query, con=disnet_db_ares)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "# Aggregate into a new name: re-running this cell must not count an already aggregated frame.\n", "drug_counts = drug.groupby(['disease_id']).count().reset_index()" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "441.88607594936707" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drug_counts[\"drug_id\"].mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ 
"### sintomas" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "sint = f'''SELECT DISTINCT\n", " ds.disease_id,s.cui as symptom \n", " FROM disnet_biolayer.disease ds \n", " JOIN edsssdb.layersmappings lm on ds.disease_id = lm.cui\n", " JOIN edsssdb.disease_symptom dsy ON lm.disnet_id = dsy.disease_id\n", " JOIN edsssdb.symptom s ON dsy.cui = s.cui\n", " where ds.disease_id in {list_disease_id}\n", "'''" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "sint_all=pd.read_sql(sint, con=disnet_db_ares)" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "sint_all = sint_all.groupby(['disease_id']).count()" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": [ "sint_all = sint_all.reset_index()" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "76.81333333333333" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sint_all[\"symptom\"].mean()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }