{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f28f5e21-ca09-42c9-b7c3-1868d37db298",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import networkx as nx\n",
    "from tabulate import tabulate\n",
    "from networkx.algorithms import bipartite\n",
    "import random\n",
    "from scipy.stats import norm\n",
    "from itertools import combinations\n",
    "import re\n",
    "from itertools import product\n",
    "import sys\n",
    "sys.path.append('schizophrenia/functions_network_medicine_schizo.py')\n",
    "import functions_network_medicine_schizo"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "99573d30-ec91-4703-8cc2-1597e6ed6fc3",
   "metadata": {},
   "source": [
    "### Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1dbaebbd-1ad3-4483-8418-7bf44a41db5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "#nodes\n",
    "pro = pd.read_csv('data/nodes/pro.tsv', sep=\"\\t\")\n",
    "gen = pd.read_csv('data/nodes/gen.tsv', sep=\"\\t\")\n",
    "dru = pd.read_csv('data/nodes/dru.tsv', sep=\"\\t\")\n",
    "dis = pd.read_csv('data/nodes/dis.tsv', sep=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "8db5703d-6d8e-4295-8e34-895fe8b6d1ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "#links\n",
    "pro_pro = pd.read_csv('data/links/pro_pro.tsv', sep=\"\\t\")\n",
    "dis_gen = pd.read_csv('data/links/dis_gen.tsv', sep=\"\\t\")\n",
    "dse_sym = pd.read_csv('data/dse_sym_limpio.tsv', sep=\"\\t\")\n",
    "dis_dru_the = pd.read_csv('data/links/dis_dru_the.tsv', sep=\"\\t\")\n",
    "gen_pro = pd.read_csv('data/links/gen_pro.tsv', sep=\"\\t\")\n",
    "dru_pro = pd.read_csv('data/links/dru_pro.tsv', sep=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "d33a5a56-e66c-4a5d-a695-b5ef8972f9cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "#file with SPLs between all PPI nodes\n",
    "spl = pd.read_csv('files/SPL PPI.csv', index_col='Source')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2519e85f-388b-4146-9906-aa450cfd4316",
   "metadata": {},
   "source": [
    "### Interactome"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "5160b911-92a7-4687-8c8c-dfe38f1d0372",
   "metadata": {},
   "outputs": [],
   "source": [
    "G_ppi = nx.from_pandas_edgelist(pro_pro,'prA','prB')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ebc315c4-08ad-4f50-849e-0e4bfac34b1f",
   "metadata": {},
   "source": [
    "### Identification of disease module"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1cf1179a-49fe-4202-9f60-1600bb0a5634",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dictionaries to store the results for each disease\n",
    "dis_gen_dict = {}  # Seed genes\n",
    "dis_lcc = {}  # Module size\n",
    "dis_lcc_dp = {}  # Results for statistical validation dp\n",
    "dis_lcc_ndp = {}  # Results for statistical validation ndp\n",
    "prots_interactome_dict = {} # Disease proteins in interactome\n",
    "\n",
    "dis_total = dis_dru_the['dis'].unique().tolist()\n",
    "\n",
    "for dis in dis_total:\n",
    "    # Seed genes\n",
    "    genes = functions_network_medicine_schizo.genes_dis(dis, dis_gen)\n",
    "    dis_gen_dict[dis] = genes\n",
    "    \n",
    "    # Disease proteins \n",
    "    prots = functions_network_medicine_schizo.pro_gen_dict(genes, gen_pro)\n",
    "    \n",
    "    # Disease proteins in interactome\n",
    "    prots_interactome = functions_network_medicine_schizo.gen_pro_PPI(prots, pro_pro)\n",
    "    prots_interactome_dict[dis] = prots_interactome\n",
    "    \n",
    "    # Module size\n",
    "    SG_dis = G_ppi.subgraph(prots_interactome)\n",
    "    \n",
    "    if SG_dis: # if disease has at least one protein in its module\n",
    "        lcc = functions_network_medicine_schizo.lcc(SG_dis)\n",
    "        dis_lcc[dis] = lcc\n",
    "        \n",
    "        # Statistical validation ndp\n",
    "        lcc_ndp = functions_network_medicine_schizo.lcc_simulation(SG_dis, lcc, G_ppi, dp=False)\n",
    "        dis_lcc_ndp[dis] = lcc_ndp\n",
    "        \n",
    "        # Statistical validation dp\n",
    "        lcc_dp = functions_network_medicine_schizo.lcc_simulation(SG_dis, lcc, G_ppi, dp=True)\n",
    "        dis_lcc_dp[dis] = lcc_dp   \n",
    "        \n",
    "    else:\n",
    "        dis_lcc[dis] = []\n",
    "        dis_lcc_ndp[dis] = []\n",
    "        dis_lcc_dp[dis] = []\n",
    "\n",
    "\n",
    "results = {\n",
    "    \"ID\": dis_total,\n",
    "    \"Seed genes\": [len(dis_gen_dict[dis]) for dis in dis_total],\n",
    "    \"Genes in PPI\": len(prots_interactome_dict[dis]) for dis in dis_total],\n",
    "    \"Genes in LCC\": [len(dis_lcc[dis]) for dis in dis_total],\n",
    "    \"Relative LCC\": [len(dis_lcc[dis]) / len(functions_network_medicine.pro_gen_dict(dis_gen_dict[dis], gen_pro)) if len(functions_network_medicine.pro_gen_dict(dis_gen_dict[dis], gen_pro)) != 0 else 0 for dis in dis_total],\n",
    "    \"Mean random LCC (ndp)\": [dis_lcc_ndp[dis][0] for dis in dis_total],\n",
    "    \"Std random LCC (ndp)\": [dis_lcc_ndp[dis][1] for dis in dis_total],\n",
    "    \"Z-score (ndp)\": [dis_lcc_ndp[dis][2] for dis in dis_total],\n",
    "    \"Mean random LCC (dp)\": [dis_lcc_dp[dis][0] for dis in dis_total],\n",
    "    \"Std random LCCs (dp)\": [dis_lcc_dp[dis][1] for dis in dis_total], \n",
    "    \"Z-score (dp)\": [dis_lcc_dp[dis][2] for dis in dis_total]\n",
    "}\n",
    "\n",
    "dis_module_df = pd.DataFrame(results)\n",
    "dis_module_df.to_csv(\"Module.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "92f1b21e-eb2c-4a35-b9a0-1562e4751677",
   "metadata": {},
   "source": [
    "### Disease filtering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "480659cf-1dd8-409c-a2d5-418a16e97bb2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter of diseases with a significant module size\n",
    "significative = dis_module_df[(dis_module_df['Z-score (dp)'] > 1.65) & (dis_module_df['Z-score (ndp)'] > 1.65)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "115a0101-7ed4-4ef4-959e-ab07c7681c61",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter of diseases that have more than 15 pathological proteins in its module belonging to the interactome\n",
    "\n",
    "for i, lcc in enumerate(significative[\"Genes in LCC\"]):\n",
    "    if lcc < 15: \n",
    "        disease_no_sig = significative[\"ID\"][i]\n",
    "        \n",
    "        row = significative[significative['ID'] == disease_no_sig]\n",
    "        \n",
    "        significative = significative.drop(row.index)\n",
    "        \n",
    "significative.reset_index(drop=True, inplace=True)\n",
    "significative.to_csv(\"Module signif.csv\", index = False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "194a51fd-885e-4e82-96a5-aa2a77250ab1",
   "metadata": {},
   "source": [
    "### Closest distance disease-drug"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "722cf051-f5ee-4473-9674-e5519d8616ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "significative = pd.read_csv('Module signif.csv', sep=\",\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "1553f0f2-c28e-4dd3-8643-f791be9e1dca",
   "metadata": {},
   "outputs": [],
   "source": [
    "#DataFrame with drugs and their targets:\n",
    "total_drug_list = set()\n",
    "for drug in dru_pro[\"dru\"]:\n",
    "     if drug in set(dis_dru_the[\"dru\"].values): # drugs for which we have information about what diseases they treat\n",
    "            list_drugs_total.add(drug)\n",
    "targets_total = functions_network_medicine.targets(drug_list_total, dru_pro)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dfdd0ff9-b286-43b1-8c9a-7f6e067cf066",
   "metadata": {},
   "outputs": [],
   "source": [
    "results = [] #list with the results of each disease for the observed distance\n",
    "\n",
    "for i, dis in enumerate(df_pro_dis_total_filt[\"dis\"]):\n",
    "    SG_dis = G_ppi.subgraph(df_pro_dis_total_filt[\"pro_ppi\"][i]) #disease subnetwork\n",
    "    lcc = funciones_network_medicine.lcc(SG_dis) # disease module\n",
    "    proximity_obs = funciones_network_medicine.proximity(targets_total, spl, lcc, G_ppi) # DF with drugs and distance to disease\n",
    "    \n",
    "    for _, row in proximity_obs.iterrows():\n",
    "        results.append({\"ID\": dis, \"Drugs\": row[\"Fármacos\"], \"Closest distance\": row[\"dc\"]})\n",
    "\n",
    "df_distance = pd.DataFrame(results)\n",
    "\n",
    "df_distance.to_csv(\"Closest distance.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5a496aeb-4674-417a-ac3e-5a8323e148ed",
   "metadata": {},
   "source": [
    "### Proximity disease-drug"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "aaefdc73-2962-46c7-a510-8fe22adf0e70",
   "metadata": {},
   "outputs": [],
   "source": [
    "# random target modules\n",
    "target_modules_list_random_drugs = functions_network_medicine_schizo.calculate_random_drug_target_modules(1000, targets_total, G_ppi)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d58deeea-711d-45d8-9ed9-91bbbeef16f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(columns=[\"ID\", \"Drugs\", \"Dc_mean\", \"Dc_std\"])\n",
    "\n",
    "# empty file in which I will add the results\n",
    "with open(\"Closest_distance_rand.csv\", 'w', newline='') as file:\n",
    "    writer = csv.writer(file)\n",
    "    writer.writerow([\"ID\", \"Drugs\", \"Dc_mean\", \"Dc_std\"])\n",
    "\n",
    "# determination of closest distance in random set of disease module and target module\n",
    "for i, dis in enumerate(significative[\"ID\"]):\n",
    "    genes = functions_network_medicine_schizo.genes_dis(dis, dis_gen)\n",
    "    prots = functions_network_medicine_schizo.pro_gen_dict(genes, gen_pro)\n",
    "    pro_ppi = functions_network_medicine_schizo.gen_pro_PPI(prots, pro_pro)\n",
    "    SG_dis = G_ppi.subgraph(pro_ppi)\n",
    "    lcc = functions_network_medicine_schizo.lcc(SG_dis)\n",
    "    proximity_rand = functions_network_medicine_schizo.proximity_random(targets_total, spl, lcc, G_ppi, 1000, lista_modulos_diana_farmacos_aleatorios)\n",
    "        \n",
    "    results = []\n",
    "    for _, row in proximity_rand.iterrows():\n",
    "        results.append({\"ID\": dis, \"Drugs\": row[\"Drugs\"], \"Dc_mean\": row[\"dc_mean\"], \"Dc_std\": row[\"dc_std\"]})\n",
    "    \n",
    "    with open(\"Closest_distance_rand.csv\", 'a', newline='') as file:\n",
    "        writer = csv.writer(file)\n",
    "        for result in results:\n",
    "            writer.writerow([result[\"ID\"], result[\"Drugs\"], result[\"Dc_mean\"], result[\"Dc_std\"]])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "c96edf3d-41dc-4837-9893-a9ae1d5fa2f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "rand_prox = pd.read_csv(\"Closest_distance_rand.csv\", sep = \",\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "084b52bd-b94b-4508-b29e-f131c72837ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "distance=pd.read_csv(\"Closest distance.csv\", sep=\",\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "672acf4e-7d8e-421b-bfcb-5d05b4be4fa9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Join df_obs and df_prox based on the ID and Drugs columns\n",
    "df_merged = pd.merge(distance, rand_prox, on=['ID', 'Drugs'])\n",
    "\n",
    "# dc_zscore\n",
    "df_merged['Dc_zscore'] = (df_merged['Closest distance'] - df_merged['Dc_mean']) / df_merged['Dc_std']\n",
    "\n",
    "# New column indicating if each drug is used for the treatment of the disease\n",
    "df_merged['Treatment'] = df_merged.apply(lambda row: functions_network_medicine_schizo.determine_treatment(row, dis_dru_the), axis=1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "92dc074e-4ebc-4466-92a2-9a8893e9a6f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_merged.to_csv(\"results/Proximity_results.csv\", index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "3586e5aa-4877-4d70-b986-5bb92d881bc0",
   "metadata": {},
   "outputs": [],
   "source": [
    "proximity = pd.read_csv(\"Proximity_results.csv\", sep = \",\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d0a5d2a3-b632-4874-bed6-67ed6e3e4714",
   "metadata": {},
   "source": [
    "### Statistical analysis of the proximity results across all the diseases"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "id": "5b03a68c-307e-42b5-8209-a27c44dafb00",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter the data to include only relevant columns\n",
    "df_relevant = proximity[['Closest distance', 'Dc_zscore', 'Treatment']]\n",
    "\n",
    "# Compute descriptive statistics for 'Closest distance' and 'Dc_zscore' grouped by 'Treatment'\n",
    "describe_closest_distance = df_relevant.groupby('Treatment')['Closest distance'].describe()\n",
    "describe_dc_zscore = df_relevant.groupby('Treatment')['Dc_zscore'].describe()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "id": "52b62828-5c27-4578-bb46-9a2b5de60258",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Describe de Closest distance:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>min</th>\n",
       "      <th>25%</th>\n",
       "      <th>50%</th>\n",
       "      <th>75%</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Treatment</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>unknown</th>\n",
       "      <td>516993.0</td>\n",
       "      <td>1.741093</td>\n",
       "      <td>0.590947</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.4</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>yes</th>\n",
       "      <td>12162.0</td>\n",
       "      <td>1.473756</td>\n",
       "      <td>0.691416</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.6</td>\n",
       "      <td>2.0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              count      mean       std  min  25%  50%  75%  max\n",
       "Treatment                                                       \n",
       "unknown    516993.0  1.741093  0.590947  0.0  1.4  2.0  2.0  4.0\n",
       "yes         12162.0  1.473756  0.691416  0.0  1.0  1.6  2.0  3.0"
      ]
     },
     "execution_count": 118,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(\"\\nDescribe Closest distance:\")\n",
    "describe_closest_distance.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "id": "099c6bab-f16f-4b16-bb17-99a86a298498",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Describe de Dc_zscore:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>min</th>\n",
       "      <th>25%</th>\n",
       "      <th>50%</th>\n",
       "      <th>75%</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Treatment</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>unknown</th>\n",
       "      <td>510797.0</td>\n",
       "      <td>-0.159867</td>\n",
       "      <td>4.989091</td>\n",
       "      <td>-187.636997</td>\n",
       "      <td>-0.709030</td>\n",
       "      <td>0.123404</td>\n",
       "      <td>0.733509</td>\n",
       "      <td>164.760255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>yes</th>\n",
       "      <td>12053.0</td>\n",
       "      <td>-1.295405</td>\n",
       "      <td>8.074283</td>\n",
       "      <td>-163.309391</td>\n",
       "      <td>-1.741129</td>\n",
       "      <td>-0.465569</td>\n",
       "      <td>0.400364</td>\n",
       "      <td>104.948135</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              count      mean       std         min       25%       50%  \\\n",
       "Treatment                                                                 \n",
       "unknown    510797.0 -0.159867  4.989091 -187.636997 -0.709030  0.123404   \n",
       "yes         12053.0 -1.295405  8.074283 -163.309391 -1.741129 -0.465569   \n",
       "\n",
       "                75%         max  \n",
       "Treatment                        \n",
       "unknown    0.733509  164.760255  \n",
       "yes        0.400364  104.948135  "
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(\"\\nDescribe Dc_zscore:\")\n",
    "describe_dc_zscore.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}