{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from sqlalchemy import create_engine\n", "from sklearn import preprocessing\n", "import mysql.connector\n", "from pandas import DataFrame" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "Triples_target_final = pd.read_csv(\"./Data/Input/DISNET/Triples_target_final.tsv\", sep='\\t')\n", "Triples_target_final = Triples_target_final.drop([\"Unnamed: 0\"],axis=1)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "triplets_total = pd.read_csv('./Data/Input/DISNET/triplets_total.csv', sep=';')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "triplets_total = triplets_total.drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "disease_id = triplets_total[\"disease_PwB\"]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def convert(lista):\n", " return tuple(i for i in lista)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0030567', 'C0030567', 'C0030567', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0030567', 'C0035579', 'C0035579', 'C0020538', 'C0030567', 'C0030567', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0035579', 'C0035579', 'C0035579', 'C0030567', 'C0020538', 'C0002892', 'C0002892', 'C0002892', 'C0002892', 'C0002892', 'C0035579', 'C0030567', 'C0030567', 'C0002395', 'C0002395', 'C0002395', 'C0002395', 'C0030567', 'C0030567', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0025202', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0020538')\n" ] } ], "source": [ "# Driver function\n", "lista = disease_id\n", "print(convert(lista))\n", "list_disease_id = convert(lista)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "dis_gen = f'''SELECT disease_id, gene_id FROM disnet_biolayer.disease_gene\n", "where sio_id != 'SIO_001120'\n", "and sio_id != 'NO_CURATED'\n", "and disease_id IN {list_disease_id}\n", ";'''" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "dis_gen=pd.read_sql(dis_gen, con=disnet_db_ares)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "dis_gen_num_gen = dis_gen.groupby(['disease_id']).count()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "dis_gen_num_gen = dis_gen_num_gen.reset_index()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "581.8571428571429" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dis_gen_num_gen[\"gene_id\"].mean()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 7.000000\n", "mean 581.857143\n", "std 464.301780\n", "min 3.000000\n", "25% 267.500000\n", "50% 559.000000\n", "75% 854.500000\n", "max 1267.000000\n", "Name: gene_id, dtype: float64" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dis_gen_num_gen[\"gene_id\"].describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#### pw via gene" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "dis_gen_pw = f'''\n", "SELECT dg.disease_id,gp.pathway_id\n", " FROM disnet_biolayer.disease_gene dg \n", " JOIN disnet_biolayer.disease ds ON ds.disease_id = dg.disease_id\n", " JOIN disnet_biolayer.tmp_gene_pathway gp ON gp.gene_id = dg.gene_id\n", " where dg.disease_id in {list_disease_id}\n", "'''" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "dis_gen_pw=pd.read_sql(dis_gen_pw, con=disnet_db_ares)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "dis_gen_pw = dis_gen_pw.groupby(['disease_id']).count()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "dis_gen_pw = dis_gen_pw.reset_index()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1847.142857142857" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dis_gen_pw[\"pathway_id\"].mean()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 7.000000\n", "mean 1847.142857\n", "std 1531.396251\n", "min 3.000000\n", "25% 795.000000\n", "50% 1640.000000\n", "75% 2748.500000\n", "max 4200.000000\n", "Name: pathway_id, dtype: float64" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dis_gen_pw[\"pathway_id\"].describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#### pw direct" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dis_path_direct = pd.read_csv('./Data/Input/DISNET/disease_pathway.tsv', sep='\\t')" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "triplets_total_fil = triplets_total.drop([\"disease_no_PwB\",\"drug_id\"],axis=1)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "triplets_total_fil = triplets_total_fil.rename(columns={\"disease_PwB\": \"disease_id\"})" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "dr_pw = triplets_total_fil.merge(dis_path_direct,on=\"disease_id\",how= \"inner\")" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "dr_pw = dr_pw.groupby(['disease_id']).count()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "dr_pw = dr_pw.reset_index()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "42.0" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dr_pw[\"pathway_id\"].mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "### drug" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "drug = f'''SELECT disease_id,drug_id FROM disnet_drugslayer.drug_disease\n", "where disease_id in {list_disease_id}'''" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "drug=pd.read_sql(drug, con=disnet_db_ares)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "drug = drug.groupby(['disease_id']).count()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "drug = drug.reset_index()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "685.5714285714286" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drug[\"drug_id\"].mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "### symptom" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "sint = f'''SELECT DISTINCT\n", " ds.disease_id,s.cui as symptom \n", " FROM disnet_biolayer.disease ds \n", " JOIN edsssdb.layersmappings lm on ds.disease_id = lm.cui\n", " JOIN edsssdb.disease_symptom dsy ON lm.disnet_id = dsy.disease_id\n", " JOIN edsssdb.symptom s ON dsy.cui = s.cui\n", " where ds.disease_id in {list_disease_id}\n", "'''" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "sint_all=pd.read_sql(sint, con=disnet_db_ares)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "sint_all = sint_all.groupby(['disease_id']).count()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "sint_all = sint_all.reset_index()" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "83.71428571428571" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sint_all[\"symptom\"].mean()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }