{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from sqlalchemy import create_engine\n",
    "from sklearn import preprocessing\n",
    "import mysql.connector\n",
    "from pandas import DataFrame"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "Triples_target_final = pd.read_csv(\"Triples_target_final.tsv\", sep='\\t')\n",
    "Triples_target_final = Triples_target_final.drop([\"Unnamed: 0\"],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "triplets_total = pd.read_csv('triplets_total.csv', sep=';')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "triplets_total = triplets_total.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "disease_id = triplets_total[\"disease_PwB\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert(lista):\n",
    "    return tuple(i for i in lista)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0030567', 'C0030567', 'C0030567', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0030567', 'C0035579', 'C0035579', 'C0020538', 'C0030567', 'C0030567', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0035579', 'C0035579', 'C0035579', 'C0030567', 'C0020538', 'C0002892', 'C0002892', 'C0002892', 'C0002892', 'C0002892', 'C0035579', 'C0030567', 'C0030567', 'C0002395', 'C0002395', 'C0002395', 'C0002395', 'C0030567', 'C0030567', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0025202', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0020538')\n"
     ]
    }
   ],
   "source": [
    "# Driver function\n",
    "lista = disease_id\n",
    "print(convert(lista))\n",
    "list_disease_id = convert(lista)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_gen = f'''SELECT disease_id, gene_id FROM disnet_biolayer.disease_gene\n",
    "where sio_id != 'SIO_001120'\n",
    "and sio_id != 'NO_CURATED'\n",
    "and disease_id IN {list_disease_id}\n",
    ";'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_gen=pd.read_sql(dis_gen, con=disnet_db_ares)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_gen_num_gen = dis_gen.groupby(['disease_id']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_gen_num_gen = dis_gen_num_gen.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "581.8571428571429"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dis_gen_num_gen[\"gene_id\"].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count       7.000000\n",
       "mean      581.857143\n",
       "std       464.301780\n",
       "min         3.000000\n",
       "25%       267.500000\n",
       "50%       559.000000\n",
       "75%       854.500000\n",
       "max      1267.000000\n",
       "Name: gene_id, dtype: float64"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dis_gen_num_gen[\"gene_id\"].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#### pw via gene"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_gen_pw = f'''\n",
    "SELECT dg.disease_id,gp.pathway_id\n",
    "        FROM disnet_biolayer.disease_gene dg \n",
    "    JOIN disnet_biolayer.disease ds ON ds.disease_id = dg.disease_id\n",
    "    JOIN disnet_biolayer.tmp_gene_pathway gp ON gp.gene_id = dg.gene_id\n",
    "    where dg.disease_id in {list_disease_id}\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_gen_pw=pd.read_sql(dis_gen_pw, con=disnet_db_ares)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_gen_pw = dis_gen_pw.groupby(['disease_id']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_gen_pw = dis_gen_pw.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1847.142857142857"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dis_gen_pw[\"pathway_id\"].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count       7.000000\n",
       "mean     1847.142857\n",
       "std      1531.396251\n",
       "min         3.000000\n",
       "25%       795.000000\n",
       "50%      1640.000000\n",
       "75%      2748.500000\n",
       "max      4200.000000\n",
       "Name: pathway_id, dtype: float64"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dis_gen_pw[\"pathway_id\"].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#### pw direct"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>disease_id</th>\n",
       "      <th>pathway_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>C0020538</td>\n",
       "      <td>WP554</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>C0018799</td>\n",
       "      <td>WP1544</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>C0018799</td>\n",
       "      <td>WP1528</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>C0027947</td>\n",
       "      <td>WP229</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>C0013369</td>\n",
       "      <td>WP229</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>659</th>\n",
       "      <td>C0268274</td>\n",
       "      <td>WP4153</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>660</th>\n",
       "      <td>C0085131</td>\n",
       "      <td>WP4153</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>661</th>\n",
       "      <td>C0036161</td>\n",
       "      <td>WP4153</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>662</th>\n",
       "      <td>C0268275</td>\n",
       "      <td>WP4153</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>663</th>\n",
       "      <td>C0162666</td>\n",
       "      <td>WP4236</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>664 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    disease_id pathway_id\n",
       "0     C0020538      WP554\n",
       "1     C0018799     WP1544\n",
       "2     C0018799     WP1528\n",
       "3     C0027947      WP229\n",
       "4     C0013369      WP229\n",
       "..         ...        ...\n",
       "659   C0268274     WP4153\n",
       "660   C0085131     WP4153\n",
       "661   C0036161     WP4153\n",
       "662   C0268275     WP4153\n",
       "663   C0162666     WP4236\n",
       "\n",
       "[664 rows x 2 columns]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dis_path_direct = pd.read_csv('disease_pathway.tsv', sep='\\t')\n",
    "dis_path_direct"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "triplets_total_fil = triplets_total.drop([\"disease_no_PwB\",\"drug_id\"],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "triplets_total_fil = triplets_total_fil.rename(columns={\"disease_PwB\": \"disease_id\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "dr_pw = triplets_total_fil.merge(dis_path_direct,on=\"disease_id\",how= \"inner\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "dr_pw = dr_pw.groupby(['disease_id']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "dr_pw = dr_pw.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "42.0"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dr_pw[\"pathway_id\"].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### drug"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug = f'''SELECT disease_id,drug_id FROM disnet_drugslayer.drug_disease\n",
    "where disease_id in {list_disease_id}'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug=pd.read_sql(drug, con=disnet_db_ares)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug = drug.groupby(['disease_id']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug = drug.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "685.5714285714286"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drug[\"drug_id\"].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### symptom"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "sint = f'''SELECT DISTINCT\n",
    "        ds.disease_id,s.cui as symptom \n",
    "        FROM disnet_biolayer.disease ds \n",
    "            JOIN edsssdb.layersmappings lm on ds.disease_id = lm.cui\n",
    "            JOIN edsssdb.disease_symptom dsy ON lm.disnet_id = dsy.disease_id\n",
    "            JOIN edsssdb.symptom s ON dsy.cui = s.cui\n",
    "       where ds.disease_id in {list_disease_id}\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "sint_all=pd.read_sql(sint, con=disnet_db_ares)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "sint_all = sint_all.groupby(['disease_id']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "sint_all = sint_all.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "83.71428571428571"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sint_all[\"symptom\"].mean()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}