Upload New File

2636d29c · Maria Marin · 83affd2a · 2636d29c · 2636d29c · 2636d29c
Commit 2636d29c authored Mar 30, 2024 by Maria Marin
13 changed files
--- a/analysis/README.md
+++ b/analysis/README.md
+# Analysis
+
+## Repository content
+
+| NAME | DESCRIPTION |
+|------|-------------|
+| [figures]() | Directory with the figures generated as a result of the analysis |
+| [files]() | Directory with the intermediate files used during the analysis |
+| [results]() | Directory with the files generated as a result of the analysis |
+| [schizophrenia]() | Directory with the Jupyter Notebooks and Python scripts employed to study drug repurposing candidates for schizophrenia |
+| [total diseases analysis.ipynb]() | Jupyter Notebook used to analyze the disease module size and the disease-drug proximity for 495 diseases, as a preliminary step to obtain the proximity distribution files (in results) that are used in the analysis carried out for schizophrenia |
--- a/analysis/figures/LCC analysis schizophrenia.png
+++ b/analysis/figures/LCC analysis schizophrenia.png
--- a/analysis/figures/proximity schizophrenia.svg
+++ b/analysis/figures/proximity schizophrenia.svg
+<?xml version="1.0" encoding="utf-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+  "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns:xlink="http://www.w3.org/1999/xlink" width="475.2pt" height="360pt" viewBox="0 0 475.2 360" xmlns="http://www.w3.org/2000/svg" version="1.1">
+ <metadata>
+  <rdf:RDF xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://creativecommons.org/ns#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+   <cc:Work>
+    <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage"/>
+    <dc:date>2024-03-25T13:59:19.102751</dc:date>
+    <dc:format>image/svg+xml</dc:format>
+    <dc:creator>
+     <cc:Agent>
+      <dc:title>Matplotlib v3.8.0, https://matplotlib.org/</dc:title>
+     </cc:Agent>
+    </dc:creator>
+   </cc:Work>
+  </rdf:RDF>
+ </metadata>
+ <defs>
+  <style type="text/css">*{stroke-linejoin: round; stroke-linecap: butt}</style>
+ </defs>
+ <g id="figure_1">
+  <g id="patch_1">
+   <path d="M 0 360 
+L 475.2 360 
+L 475.2 0 
+L 0 0 
+z
+" style="fill: #ffffff"/>
+  </g>
+ </g>
+</svg>
--- a/analysis/figures/schizophrenia module.png
+++ b/analysis/figures/schizophrenia module.png
--- a/analysis/results/Filtering_by_DGE_results.csv
+++ b/analysis/results/Filtering_by_DGE_results.csv
+Drugs,Gene symbol,log(Fold Change),Corrected pvalue
+CHEMBL413,FGF2,0.262194915301407,0.0001663143107123
+CHEMBL629,NTRK2,0.215842149854464,0.0005334841403452
+CHEMBL717,ABCB1,-0.19259430589577,0.0011518294091511
+CHEMBL472,ABCA1,0.18596862999644,8.3409820679652e-05
+CHEMBL608,ABCA1,0.18596862999644,8.3409820679652e-05
+CHEMBL86304,MAOA,0.141364018163515,0.0007876002881378
+CHEMBL1201168,MAOA,0.141364018163515,0.0007876002881378
+CHEMBL1574,MAOA,0.141364018163515,0.0007876002881378
+CHEMBL1201201,MAOA,0.141364018163515,0.0007876002881378
+CHEMBL972,MAOA,0.141364018163515,0.0007876002881378
+CHEMBL278819,MAOA,0.141364018163515,0.0007876002881378
+CHEMBL1089,MAOA,0.141364018163515,0.0007876002881378
+CHEMBL673,MAOA,0.141364018163515,0.0007876002881378
+CHEMBL750,MAOA,0.141364018163515,0.0007876002881378
+CHEMBL37744,MAOA,0.141364018163515,0.0007876002881378
+CHEMBL941,DDR1,0.124057530881635,0.0079332374104671
+CHEMBL607,GRIN2C,0.122802787266293,0.0057675281998525
+CHEMBL930,GLUL,0.105690036807368,0.0101258664300799
+CHEMBL90593,PPARA,0.0995849269559933,0.0094860300493365
+CHEMBL264374,PPARA,0.0995849269559933,0.0094860300493365
+CHEMBL457,PPARA,0.0995849269559933,0.0094860300493365
+CHEMBL1297,PPARA,0.0995849269559933,0.0094860300493365
+CHEMBL23588,PPARA,0.0995849269559933,0.0094860300493365
+CHEMBL633,PPARA,0.0995849269559933,0.0094860300493365
+CHEMBL557555,PPARA,0.0995849269559933,0.0094860300493365
+CHEMBL565,PPARA,0.0995849269559933,0.0094860300493365
+CHEMBL521,PPARA,0.0995849269559933,0.0094860300493365
+CHEMBL6,PPARA,0.0995849269559933,0.0094860300493365
+CHEMBL672,PPARA,0.0995849269559933,0.0094860300493365
+CHEMBL3137309,BCL2,0.0931726845904655,0.0138753870077204
+CHEMBL521,BCL2,0.0931726845904655,0.0138753870077204
+CHEMBL428647,BCL2,0.0931726845904655,0.0138753870077204
+CHEMBL887,BCL2,0.0931726845904655,0.0138753870077204
+CHEMBL1088977,CBS,0.0802567480856852,0.0111248339342028
+CHEMBL1286,SV2A,-0.0753815769457941,0.0050610575822867
+CHEMBL607400,SV2A,-0.0753815769457941,0.0050610575822867
+CHEMBL113,PIK3CB,-0.0654377622012529,0.0380665474160753
+CHEMBL1200733,ATP2A2,-0.0573001962598098,0.0164116692300517
+CHEMBL701,GABBR2,-0.055808229356042,0.0347092303857617
+CHEMBL896,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL1423,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL471,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL479,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL631,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL83,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL12713,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL517,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL43,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL998,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL623,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL2,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL533,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL296419,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL1008,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL473,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL640,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL709,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL462605,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL723,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL1294,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL16,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL71,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL532,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL6966,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL652,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL184412,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL1107,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL157101,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL611,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL11,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL1628227,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL17157,KCNH2,-0.0508555360118699,0.0233144571011437
+CHEMBL41,KCNH2,-0.0508555360118699,0.0233144571011437
--- a/analysis/results/Proximity_results.csv
+++ b/analysis/results/Proximity_results.csv
--- a/analysis/results/Repurposing candidates schizophrenia.csv
+++ b/analysis/results/Repurposing candidates schizophrenia.csv
+Drugs,Closest distance,Dc_zscore
+CHEMBL623,0.3,-115.57962872808126
+CHEMBL113,0.3636363636363636,-103.5128460079722
+CHEMBL941,0.5,-94.87934739957429
+CHEMBL1628227,0.3529411764705882,-90.84425264943393
+CHEMBL17157,0.4285714285714285,-87.50885056097165
+CHEMBL83,0.4285714285714285,-85.95038070535077
+CHEMBL930,0.5,-78.56768686211016
+CHEMBL23588,0.5,-77.73657144185727
+CHEMBL640,0.3333333333333333,-75.75625189325385
+CHEMBL709,0.5,-74.36576498963095
+CHEMBL278819,0.3333333333333333,-65.01494476738038
+CHEMBL652,0.5,-51.71753701225944
+CHEMBL413,0.3333333333333333,-22.765103095067783
+CHEMBL43,0.5,-8.866920486283584
--- a/analysis/schizophrenia/DGE.ipynb
+++ b/analysis/schizophrenia/DGE.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "eed38b7b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "import pandas as pd\n",
+    "from Bio import Entrez\n",
+    "import GEOparse\n",
+    "import geopandas as gpd\n",
+    "import os\n",
+    "import pandas as pd\n",
+    "from GEOparse import GEOparse\n",
+    "import gzip\n",
+    "import shutil\n",
+    "import funciones_network_medicine\n",
+    "import networkx as nx"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "947d7a7a-7b53-4c14-8c9f-7d25ccfecdca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#nodes\n",
+    "pro = pd.read_csv('../data/nodes/pro.tsv', sep=\"\\t\")\n",
+    "gen = pd.read_csv('../data/nodes/gen.tsv', sep=\"\\t\")\n",
+    "dru = pd.read_csv('../data/nodes/dru.tsv', sep=\"\\t\")\n",
+    "dis = pd.read_csv('../data/nodes/dis.tsv', sep=\"\\t\")\n",
+    "#links\n",
+    "pro_pro = pd.read_csv('../data/links/pro_pro.tsv', sep=\"\\t\")\n",
+    "dis_gen = pd.read_csv('../data/links/dis_gen.tsv', sep=\"\\t\")\n",
+    "dse_sym = pd.read_csv('data/dse_sym_limpio.tsv', sep=\"\\t\")\n",
+    "dis_dru_the = pd.read_csv('../data/links/dis_dru_the.tsv', sep=\"\\t\")\n",
+    "gen_pro = pd.read_csv('../data/links/gen_pro.tsv', sep=\"\\t\")\n",
+    "dru_pro = pd.read_csv('../data/links/dru_pro.tsv', sep=\"\\t\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f939592a-7c9c-4ec4-b20a-03c07439a56b",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "### Interactome"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "37a4ff6e-e3e6-4b0f-ad8d-d3aef695027d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "G_ppi = nx.from_pandas_edgelist(pro_pro,'prA','prB')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "94600860-ca3b-4f30-a2fe-530e719426d9",
+   "metadata": {},
+   "source": [
+    "#### List of genes in LCC"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "5f2370f1-9d66-43e2-9b25-88c32e75a3ac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gen_schizo = funciones_network_medicine.genes_enf(\"C0036341\", dis_gen)\n",
+    "dict_schizo= funciones_network_medicine.pro_gen_dict(gen_schizo, gen_pro)\n",
+    "dict_schizo_PPI = funciones_network_medicine.gen_pro_PPI(dict_schizo, pro_pro)\n",
+    "SG_schizo= funciones_network_medicine.SG(dict_schizo_PPI, G_ppi)\n",
+    "lcc_schizo= funciones_network_medicine.lcc(SG_schizo)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "4c71bf7a-847f-4b63-919c-4d81207d61cd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list_num = []\n",
+    "for pro in lcc_schizo:\n",
+    "    for i,pro2 in enumerate(gen_pro[\"pro\"]):\n",
+    "        if pro == pro2:\n",
+    "            list_num.append(gen_pro[\"gen\"][i])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "8a56ffb7-76fa-451f-9849-ca96309c8fec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list_genes = []\n",
+    "for num_gen in set(list_num):\n",
+    "    for i,num_gen2 in enumerate(gen[\"id\"]):\n",
+    "        if num_gen == num_gen2:\n",
+    "            gen_names = gen[\"name\"][i]\n",
+    "            list_genes.append(gen_names) "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "67209750-1411-45a3-9ad9-5a92f8521c90",
+   "metadata": {},
+   "source": [
+    "#### Microarray GE analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "dd844f76-955b-4708-9b54-a6617044cfe1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dge = pd.read_csv('../files/Microarray_SCZ_metaanalysis_092017.csv', sep=\",\") #Gandal et al. 2018"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "ce169023-6da3-4e67-8c4e-a38c5e31a60b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Genes from the Gandal study that are found in the schizophrenia module\n",
+    "\n",
+    "filtered_dge = dge[dge[\"symbol\"].isin(list_genes)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "6be40bf1-e2b7-4470-8196-34620b7147f8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "            Unnamed: 0      beta        SE         p       fdr  symbol\n",
+      "7      ENSG00000001084  0.038813  0.029858  0.195025  0.405474    GCLC\n",
+      "23     ENSG00000002822 -0.051322  0.019815  0.010259  0.058147  MAD1L1\n",
+      "24     ENSG00000002834 -0.000855  0.014219  0.952121  0.976082   LASP1\n",
+      "110    ENSG00000006128 -0.200785  0.047132  0.000031  0.000998    TAC1\n",
+      "126    ENSG00000006611  0.071682  0.030708  0.020512  0.093734   USH1C\n",
+      "...                ...       ...       ...       ...       ...     ...\n",
+      "12208  ENSG00000256269 -0.047590  0.016506  0.004341  0.032269    HMBS\n",
+      "12222  ENSG00000257017 -0.022547  0.024117  0.350880  0.577942      HP\n",
+      "12251  ENSG00000259207  0.002088  0.020028  0.917063  0.960690   ITGB3\n",
+      "12298  ENSG00000262683 -0.008338  0.019593  0.670838  0.824663    FHIT\n",
+      "12386  ENSG00000273079  0.019514  0.026687  0.465453  0.676388  GRIN2B\n",
+      "\n",
+      "[769 rows x 6 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(filtered_dge)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "89f27194-032e-4f1e-b6d1-1f0a2b4c5abc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Genes with significantly higher or lower expression in schizophrenia patients.\n",
+    "\n",
+    "fdr_filtered_dge = filtered_dge[filtered_dge[\"fdr\"] <= 0.05] #pvalue fixed with False Discovery Rate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "7e59d5d1-7a75-4d8a-b447-49ea639b3db3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "            Unnamed: 0      beta        SE             p       fdr    symbol\n",
+      "110    ENSG00000006128 -0.200785  0.047132  3.066855e-05  0.000998      TAC1\n",
+      "148    ENSG00000007168 -0.061780  0.021111  3.801441e-03  0.029186  PAFAH1B1\n",
+      "161    ENSG00000007372  0.174375  0.040167  2.189604e-05  0.000770      PAX6\n",
+      "219    ENSG00000010256 -0.096030  0.019028  9.619559e-07  0.000083    UQCRC1\n",
+      "260    ENSG00000011405  0.102244  0.036488  5.545087e-03  0.038023   PIK3C2A\n",
+      "...                ...       ...       ...           ...       ...       ...\n",
+      "11459  ENSG00000204525 -0.110543  0.034336  1.485081e-03  0.014967     HLA-C\n",
+      "11471  ENSG00000204580  0.124058  0.035651  6.085740e-04  0.007933      DDR1\n",
+      "11678  ENSG00000214113 -0.057975  0.021102  6.522134e-03  0.042600     LYRM4\n",
+      "11906  ENSG00000231925 -0.053500  0.016371  1.262639e-03  0.013331     TAPBP\n",
+      "12208  ENSG00000256269 -0.047590  0.016506  4.341083e-03  0.032269      HMBS\n",
+      "\n",
+      "[178 rows x 6 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(fdr_filtered_dge)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "8bc007a4-2597-4f4a-a542-cced89d231b0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#sort based on logFC (in absolute value)\n",
+    "dge_sorted = fdr_filtered_dge.reindex(fdr_filtered_dge['beta'].abs().sort_values(ascending=False).index)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "45f91300-2b2c-46bc-99f9-78e31ce0a5b3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dge_sorted.to_csv(\"../files/dge_sorted.csv\", index = False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "4984712e-1730-4e6c-a063-cbdf29092f8b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dge_sorted = pd.read_csv(\"../files/dge_sorted.csv\", sep=\",\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b91a30b7-0708-4a06-8c32-b3faf08046c6",
+   "metadata": {},
+   "source": [
+    "### Drug targets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "f86487dd-724d-4c97-95a8-962a5236ca2e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#DataFrame with drugs and their targets:\n",
+    "drugs_with_indication = set(dis_dru_the[\"dru\"].values) # drugs for which we have information about what diseases they treat\n",
+    "total_drug_list = set()\n",
+    "for drug in dru_pro[\"dru\"]:\n",
+    "    if drug in drugs_with_indication:\n",
+    "        total_drug_list.add(drug)\n",
+    "targets_total = funciones_network_medicine.targets(total_drug_list, dru_pro)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "a99abc85-90d9-481f-ab68-3c7687868ecb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# DataFrame with drugs that have at least one target protein in the LCC with significant differential expression.\n",
+    "\n",
+    "gen_dict_ = dict(zip(gen['name'], gen['id']))\n",
+    "\n",
+    "drugs = []\n",
+    "genes = []\n",
+    "pvalues = []\n",
+    "logfc = []\n",
+    "\n",
+    "for i, gene_symbol in enumerate(dge_sorted[\"symbol\"]):\n",
+    "    gene_id = gen_dict_.get(gene_symbol)\n",
+    "    proteins = []\n",
+    "    for j, gene2 in enumerate(gen_pro[\"gen\"]):\n",
+    "        if gene_id == gene2:\n",
+    "            proteins.append(gen_pro[\"pro\"][j])\n",
+    "    for z, drug in enumerate(targets_total[\"Fármacos\"]):\n",
+    "        target_list = targets_total[\"Dianas\"][z]\n",
+    "        for protein in proteins:\n",
+    "            if protein in target_list:\n",
+    "                drugs.append(drug)\n",
+    "                genes.append(gene_symbol) \n",
+    "                pvalues.append(dge_sorted[\"fdr\"][i])\n",
+    "                logfc.append(dge_sorted[\"beta\"][i])\n",
+    "\n",
+    "results = {\n",
+    "    \"Drugs\": drugs,\n",
+    "    \"Gene symbol\": genes,\n",
+    "    \"log(Fold Change)\": logfc,\n",
+    "    \"Corrected pvalue\": pvalues,\n",
+    "}\n",
+    "\n",
+    "dge_drugs = pd.DataFrame(results)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "69dd8106-15d9-44d9-86f6-54511a38c8d8",
+   "metadata": {},
+   "source": [
+    "### Assessment of module membership of co-expressed genes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "401ee7e6-7bf7-41a8-813a-227976418d08",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "kme = pd.read_csv(\"../files/gandal_2018a_kMEs.csv\", sep = \",\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "e91afef6-7790-44a0-a71d-026f858a9604",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dge_genes = []\n",
+    "\n",
+    "for symbol in dge_drugs[\"Gene symbol\"]:\n",
+    "    for i,symbol2 in enumerate(kme[\"external_gene_id\"]):\n",
+    "        if symbol == symbol2:\n",
+    "            module = kme[\"Module.name\"][i]\n",
+    "            if module.startswith(\"CD\"):\n",
+    "                j = int(module[2:])  # Extract the module number (e.g. 1, 2, 3, ...)\n",
+    "                if j != 0:\n",
+    "                    # Presumably the kME columns are read as text with a decimal comma (TODO confirm);\n",
+    "                    # convert to float so the threshold is a numeric comparison, not a lexicographic one.\n",
+    "                    kme_value = float(str(kme.iloc[i, j + 1]).replace(',', '.'))\n",
+    "                    if kme_value > 0.5:\n",
+    "                        dge_genes.append(symbol)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "196811f9-dc94-4722-b67a-b261aa561c5e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "kme_filtrado = dge_drugs[dge_drugs['Gene symbol'].isin(set(dge_genes))]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "6cc25eae-272d-4992-a493-d97b43fdd42c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "kme_filtrado.to_csv(\"../results/Filtering_by_DGE_results.csv\", index = False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/analysis/schizophrenia/README.md
+++ b/analysis/schizophrenia/README.md
+# Schizophrenia
+
+Python scripts and Jupyter Notebooks used to apply Network Medicine concepts
+in order to characterize the disease module of schizophrenia and determine the proximity between the neurological condition and drugs.
+
+## Repository content
+
+| NAME | DESCRIPTION |
+|------|-------------|
+| [DGE.ipynb]() | Jupyter Notebook used to analyze the Differential Gene Expression (DGE) data |
+| [disease module and proximity.ipynb]() | Jupyter Notebook used to characterize the disease module of schizophrenia and determine the closest distance and proximity between the neurological condition and drugs |
+| [repurposing.ipynb]() | Jupyter Notebook used to identify drug repurposing candidates for schizophrenia based on the results obtained in the differential gene expression, distance and proximity analyses |
+| [functions_network_medicine_schizo.py]() | Python script with the functions implemented in the Jupyter Notebooks |
+
+## Methodology of the analysis
+
+### Characterization of the disease module
+
+1. Generation of the **interactome**.
+2. Definition of **pathological proteins**.
+3. Development of the **subgraph** of the disease.
+4. Identification of the **module** of the disease.
+5. **Statistical validation** of the disease modules.
+
+### Determination of disease-drug proximity
+
+1. **Distance** between disease modules and drugs: closest distance *\(d<sub>c</sub>\)*.
+2. **Proximity** between disease modules and drugs: distance z-score.
+
+### Differential Gene Expression analysis
+
+Identification of genes that may be potential therapeutic targets depending on whether they meet the following criteria:
+
+1. Differentially expressed in patients with schizophrenia
+2. Significantly correlated with the genes belonging to its co-expression module in different psychiatric and neurological diseases (PNDs).
+3. Part of disease module. 
+
+Data regarding the DGE and the co-expression modules were obtained, respectively, from the supplementary material of ... and ...
+
+## Criteria for identifying drug repurposing candidates: 
+
+1. Distance to schizophrenia module between Q1 and median of distance values across treatment drugs.
+2. Proximal to disease module (z-score of distance less than or equal to -0.15).
+3. Targeting the potential therapeutic targets identified previously.
--- a/analysis/schizophrenia/disease module and proximity.ipynb
+++ b/analysis/schizophrenia/disease module and proximity.ipynb
--- a/analysis/schizophrenia/functions_network_medicine_schizo.py
+++ b/analysis/schizophrenia/functions_network_medicine_schizo.py
+#! /usr/bin/env python
+
+"""
+# ---------------------------------------------------------------------------
+#
+# functions_network_medicine_schizo.py
+# File with all the functions that I have used to calculate the distance
+# and the proximity between diseases and drugs. Additionally, it includes the
+# functions used to obtain the disease module and to study its
+# significance prior to determining distance and proximity.
+#
+# Exploring Drug Repurposing Opportunities for Schizophrenia: A Network Medicine Approach 
+#
+# María Marín Tercero
+# ----------------------------------------------------------------------------
+"""
+
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+import networkx as nx
+from tabulate import tabulate
+from networkx.algorithms import bipartite
+import random
+from scipy.stats import norm
+from itertools import combinations
+import re
+from itertools import product
+
+    
+
+# =================================================================================
+
+def genes_dis(enf, file):
+    """
+    This function creates a list with the genes associated with the disease "enf" in the dis_gen file   
+    
+    """
+    genes=[]
+    for i, dis in enumerate(file["dis"]): 
+        if dis == enf:
+            gen = file["gen"][i]
+            genes.append(gen) 
+    return genes
+
+# =================================================================================
+
+def pro_gen_dict(gene_list, file):
+    """
+    This function creates a dictionary from the list of genes associated with the disease with:
+     key: protein associated with each gene in the gen_pro file
+     value: gene related to the key protein in the gen_pro file
+    """
+    result_dict = {}
+    for i, gen in enumerate(file["gen"]): 
+        # Looping through gen_pro, which relates genes and proteins.
+        # I'm storing the position of the gene (i) and the gene id (gen).
+        if gen in gene_list: 
+            # Searching each gene in gen_pro within the corresponding gene list of each disease.
+            prot = file["pro"][i] 
+            # If that gene is in the gene list of each disease, I find the associated protein at the same position.
+            result_dict[prot] = gen 
+            # Adding to each disease's dictionary the protein as key and the related gene as value.
+    return result_dict
+
+# =================================================================================
+
+def gen_pro_PPI(dict1, file):
+    """
+    From a dictionary with the relationships between proteins and genes associated with each of our diseases,
+    this function retains the prot:gen relationship from the dictionary only if such prot appears in the PPI network of the pro_pro file.
+    key: proteins appearing in the PPI network
+    value: genes related to the key protein
+    """
+    result_dict = {}
+    for prot in dict1.keys(): 
+        # Iterating over all proteins in the general prot:gen dictionary.
+        if prot in file["prA"].tolist() or prot in file["prB"].tolist(): 
+            # Selecting proteins that appear in the PPI network.
+            result_dict[prot] = dict1[prot] 
+            # Adding to the PPI prot:gen dictionary only the prot:gen relationships for proteins that are in the PPI.
+    return result_dict 
+
+# =================================================================================
+
+def SG(dic, PPI):
+    """ 
+    Input data: dictionary with proteins from the PPI (keys) and associated genes (values) for a disease, PPI network
+    This function creates a subgraph only with the proteins from the PPI network associated with my disease as nodes.
+    """
+
+    # Creating a subgraph only with the proteins from the PPI network associated with my disease as nodes
+    SG = nx.subgraph(PPI, dic.keys())
+    return SG
+
+# =================================================================================
+
+def lcc(SG):
+    """
+    This function gives us the LCC of the proteins from the PPI network associated with a disease from a subgraph
+    formed only with the proteins associated with the disease.
+    """
+    lcc = max(nx.connected_components(SG), key=len) 
+    # Calculating the LCC (module comprising the largest number of proteins associated with a disease).
+    
+    # Our goal is to obtain the number of genes that are part of the LCC of the disease:
+    # The number of proteins from the disease in the LCC is the same number as the genes in the LCC
+    # (because we have extracted the list of proteins from the dictionary where they form a tuple with their associated genes).
+    
+    return lcc
+
+# =================================================================================
+
+def nodes_by_degree(G):
+    """
+    This function returns a dictionary where we will obtain the degrees as keys and, in the values, all the nodes of the network that contain that degree.
+    """
+    degree_dict = {}
+    for node in G.nodes():
+        degree = G.degree(node)
+
+        if degree not in degree_dict:
+            degree_dict[degree] = []
+
+        degree_dict[degree].append(node)
+
+    return degree_dict
+
+# =================================================================================
+
+def lcc_simulation(G, lcc, PPI, dp=False):
+    """
+    Input data: disease module subgraph (G), PPI, and disease LCC (lcc). 
+    Dp can be False to compute networks preserving the degree distribution of the disease module or True otherwise.
+    
+    This function returns the mean and standard deviation of the LCC of 1000 random networks with the same number 
+    of nodes and edges as the graph G provided as input. 
+    
+    For the creation of these networks in the case dp = False, the associations between disease proteins are randomly 
+    distributed within the network. Therefore, the networks will not have the same structure as the disease module.
+    
+    For the creation of networks in the case dp = True, nodes in the PPI network with the same degree as the nodes 
+    in the disease module are selected. Thus, networks with the same structure as the disease module are created.
+    """
+    
+    # Preliminary calculations:
+    
+    # Get the total proteins from the PPI
+    ppi_nodes = PPI.nodes()
+    
+    # Get the total proteins in the disease module
+    disease_nodes = G.nodes()
+    
+    # Get the number of proteins in the disease module
+    num_disease_nodes = len(disease_nodes & set(ppi_nodes))
+    
+    # Group nodes by their degree
+    degree_grouped_nodes = nodes_by_degree(PPI)
+
+
+    # 1000 random simulations to calculate 1000 LCC
+    random_list = []
+    for i in range(1000):
+        if dp == False: # Simulation for non-degree preserving
+            # Get a random set of proteins within the total nodes of the PPI network.
+            # This set has a number of nodes equivalent to the number of nodes in the disease module.
+            random_nodes = set(random.sample(list(ppi_nodes), num_disease_nodes))
+        if dp == True: # Simulation for degree preserving
+            random_nodes = set()
+            for node in disease_nodes: # For each node in the disease module
+                degree = PPI.degree(node) # Get its degree in the PPI network
+                available_nodes = degree_grouped_nodes[degree] # Get a group of nodes in the PPI with the same degree as the iterated node
+                control = True
+                while(control): # Loop to choose sampled nodes only once
+                    chosen_node = random.choice(available_nodes) # Choose a node from the selected PPI nodes in the previous step
+                    control = chosen_node in random_nodes # Check if that node is among the sampled nodes
+                random_nodes.add(chosen_node) # Add that node to the list of nodes that I will use to create the random network
+                # Only nodes that are not previously among the sampled nodes are added thanks to the while loop  
+        r = nx.subgraph(PPI,random_nodes) # Subgraph of the PPI with the selected random nodes
+        r = nx.Graph(r) # To remove parallel edges
+        r.remove_edges_from(nx.selfloop_edges(r)) # To remove self-loops connecting a node to itself
+        random_list.append(len(max(nx.connected_components(r), key=len)))
+    mean = np.mean(random_list) # Mean
+    std = np.std(random_list) # Standard deviation
+    zscore = (len(lcc) - mean)/std
+        
+    return mean, std, zscore, random_list
+
+# =================================================================================
+
+def degrees_list(G):
+    """
+    This function returns a list with the nodes and another list with their degrees.
+    """
+    nodes = list(G.nodes())
+    degrees = list(dict(G.degree()).values())
+    
+    return nodes, degrees
+
+# =================================================================================
+
+
+def rep(disease_name, G, PPI, ndp_list, dp_list, LCC, mean_ndp, std_ndp, zscore_ndp, mean_dp, std_dp, zscore_dp):
+    """
+    Draw two histograms side by side with the LCC sizes of the random networks,
+    one for the non-degree-preserving simulation (ndp) and one for the
+    degree-preserving simulation (dp). The observed LCC size is marked with a
+    dashed vertical line and the summary statistics are written below the plots.
+
+    Input data:
+    1. disease_name: name of the disease (used in the legends and the general title)
+    2. G: graph of the disease (not needed for the plots; kept so existing calls still work)
+    3. PPI: PPI network (not needed for the plots; kept so existing calls still work)
+    4. ndp_list: list of LCC sizes (ndp)
+    5. dp_list: list of LCC sizes (dp)
+    6. LCC: observed LCC of the disease; only len(LCC) is plotted
+    7. mean_ndp: mean of the list of LCCs (ndp)
+    8. std_ndp: standard deviation of the list of LCCs (ndp)
+    9. zscore_ndp: z-score of the list of LCCs (ndp)
+    10. mean_dp: mean of the list of LCCs (dp)
+    11. std_dp: standard deviation of the list of LCCs (dp)
+    12. zscore_dp: z-score of the list of LCCs (dp)
+
+    This function performs a representation of:
+    1. Distribution of LCCs (ndp)
+    2. Distribution of LCCs (dp)
+
+    Nothing is returned; the figure is displayed with plt.show().
+    """
+
+    # Create a figure with 2 subplots (one per simulation type)
+    fig, axs = plt.subplots(1, 2, figsize=(15, 4))
+
+    # Representation of the distribution of LCC ndp
+    axs[0].hist(ndp_list, color = '#79C4FF', bins=20, edgecolor='black')
+    axs[0].set_xlabel('Nº of nodes in LCC', fontsize=12)
+    axs[0].set_ylabel('Nº of random networks', fontsize=12)
+    axs[0].set_title('LCC distribution for 1,000 random networks (ndp)', fontsize=14)
+    axs[0].axvline(x=len(LCC), color='#FF7A7A', linestyle='--')  # Observed LCC size
+    axs[0].legend(["LCC "+str(disease_name)], loc='upper right')
+
+    # Representation of the distribution of LCC dp
+    axs[1].hist(dp_list, color = '#79C4FF', bins=20, edgecolor='black')
+    axs[1].set_xlabel('Nº of nodes in LCC', fontsize=12)
+    axs[1].set_ylabel('Nº of random networks', fontsize=12)
+    axs[1].set_title('LCC distribution for 1,000 random networks (dp)', fontsize=14)
+    axs[1].axvline(x=len(LCC), color='#FF7A7A', linestyle='--')  # Observed LCC size
+    axs[1].legend(["LCC "+str(disease_name)], loc='upper right')
+
+    # Adjust the layout and space between plots
+    plt.tight_layout()
+
+    # Legend 1: observed LCC size (mean - std, z-score) for the ndp simulation
+    legend_text_1 = r'$LCC (ndp)_{\mathrm{obs}}$' + ' (' + r'$\mathrm{mean\ LCC} - \mathrm{STD}$, z-score' + ')'
+    legend_value_1 = f'{len(LCC)} ({round(mean_ndp,2)} - {round(std_ndp,2)}, {round(zscore_ndp,2)})'
+    fig.text(0.51, -0.05, legend_text_1+str(" = ")+legend_value_1, fontsize=12, ha='center', va='center', bbox=dict(facecolor='white', alpha=0.5, boxstyle='round'))
+
+    # Legend 2: observed LCC size (mean - std, z-score) for the dp simulation
+    legend_text_2 = r'$LCC (dp)_{\mathrm{obs}}$' + ' (' + r'$\mathrm{mean\ LCC} - \mathrm{STD}$, z-score' + ')'
+    legend_value_2 = f'{len(LCC)} ({round(mean_dp,2)} - {round(std_dp,2)}, {round(zscore_dp,2)})'
+    fig.text(0.51, -0.15, legend_text_2+str(" = ")+legend_value_2, fontsize=12, ha='center', va='center', bbox=dict(facecolor='white', alpha=0.5, boxstyle='round'))
+
+    # General title above the two plots
+    plt.suptitle(str(disease_name), fontsize=20, y=1.1, fontweight='bold', style='italic')
+
+    # Show the figure
+    plt.show()
+
+
+
+# ================================================================================= 
+
+def targets(drug_list, arch):
+    """
+    Input data: list of drugs treating a disease (drug_list) and a table with drug-target
+    relationships (arch) that has a "dru" column (drug) and a "pro" column (target).
+    This function returns a DataFrame with one row per drug of drug_list that appears in
+    arch: the drug in the "Drugs" column and the list of its targets in the "Targets" column.
+    Drugs keep the order of drug_list and targets keep the row order of arch.
+    Drug identifiers are used as dictionary keys, so they must be hashable.
+    """
+    # Index the drug-target table once (drug -> list of targets) instead of scanning it for every drug.
+    # The two columns are paired by position, so the result does not depend on the index labels of arch.
+    targets_by_drug = {}
+    for drug, target in zip(arch["dru"], arch["pro"]):
+        targets_by_drug.setdefault(drug, []).append(target)
+
+    drugs = []  # Drugs of drug_list found in the 'arch' table
+    targets_total = []  # List of targets of each of those drugs
+    for drug in drug_list:
+        drug_targets = targets_by_drug.get(drug)
+        if drug_targets:  # Only drugs appearing in the 'arch' table are stored
+            drugs.append(drug)
+            targets_total.append(list(drug_targets))  # Copy, so rows never share the same list object
+
+    # Combine the data and classify them into Drugs and Targets
+    return pd.DataFrame({"Drugs": drugs, "Targets": targets_total})
+
+# =================================================================================
+
+def calculate_dc_drug(target_list, dist_matrix, disease_module_proteins, PPI):
+    """
+    Closest measure (dc) between a drug and a disease.
+
+    Input data: targets of the drug (target_list), table of distances between PPI nodes
+    (dist_matrix, indexed by node on rows and columns), proteins of the disease module
+    (disease_module_proteins) and the PPI network (PPI).
+
+    Returns NaN when every target-to-module distance is NaN, 0 when every target found in
+    the PPI also belongs to the disease module, and otherwise the mean over targets of the
+    smallest distance from each target to the module.
+    """
+    # Targets of the drug that are nodes of the PPI, and those among them in the disease module
+    ppi_targets = set(target_list) & set(PPI.nodes())
+    module_targets = set(ppi_targets) & set(disease_module_proteins)
+
+    # Distances between each PPI target (rows) and each protein of the disease module (columns)
+    spl_to_module = dist_matrix.loc[list(ppi_targets), list(disease_module_proteins)].values
+    is_missing = np.isnan(spl_to_module)
+
+    # No path between any drug target and the disease
+    if is_missing.all():
+        return np.nan
+
+    # All drug targets are part of the disease module
+    if len(module_targets) == len(ppi_targets):
+        return 0
+
+    # Drop targets without any path to the module, then average the per-target minimum
+    reachable_targets = ~is_missing.all(axis=1)
+    closest_per_target = np.nanmin(spl_to_module[reachable_targets], axis=1)
+    return np.nanmean(closest_per_target)
+
+# ================================================================================= 
+
+def proximity(df, arch_dist, prots_enf, PPI):
+    """
+    Input data: DataFrame (df) with "Drugs" and their "Targets"; file with distances between all nodes of the network (arch_dist);
+    list of proteins related to a disease (prots_enf); and PPI network (PPI).
+    Function that allows us to obtain a DataFrame with the proximity values (dc) for a list of drugs and a disease.
+    The result has a "Drugs" column and a "dc" column, one row per row of df and in the same order.
+    """
+    drugs = list(df["Drugs"])
+
+    # The "Targets" column is walked by position, so the function also works when df does not
+    # carry a default 0..n-1 index (e.g. after filtering rows)
+    dc_total = [
+        calculate_dc_drug(targets_list, arch_dist, prots_enf, PPI)  # dc of one drug
+        for targets_list in df["Targets"]
+    ]
+
+    # Save the relationship between the list of drugs and the list of dcs
+    result_table = pd.DataFrame({'Drugs': drugs, 'dc': dc_total})
+
+    return result_table
+
+
+# =================================================================================
+
+def proximity_random(df, arch_dist, prots_enf, PPI, num_iterations, df_results):
+    """
+    Input data: DataFrame (df) with "Drugs" and their "Targets", file with distances between all nodes of the network (arch_dist),
+    list of proteins related to a disease (prots_enf), PPI network (PPI), the number of iterations (num_iterations),
+    and a DataFrame (df_results) with the random target modules of each drug. df_results is read by position:
+    row j belongs to the j-th drug of df and column i to iteration i.
+
+    For every iteration a random disease module is drawn with the same number of distinct proteins and the same
+    degrees as prots_enf, and the dc between that module and the random target module of each drug is computed.
+
+    Returns a DataFrame with the columns "Drugs", "dc_mean" and "dc_std": the mean and the standard deviation of
+    the random dc values of each drug, ignoring NaN values (NaN when all the values of a drug are NaN).
+    """
+    group_nodes_degree = nodes_by_degree(PPI)  # Group nodes by their degree
+    drugs = list(df["Drugs"])
+
+    # Number of distinct disease proteins having each degree. It does not change between iterations,
+    # so it is computed once. dict.fromkeys removes repeated proteins while keeping their order.
+    proteins_per_degree = {}
+    for prot in dict.fromkeys(prots_enf):
+        degree_prot = PPI.degree(prot)
+        proteins_per_degree[degree_prot] = proteins_per_degree.get(degree_prot, 0) + 1
+
+    # Matrix of dc values: one row per drug and one column per iteration, NaN until it is filled
+    proximity_matrix = np.full((len(drugs), num_iterations), np.nan, dtype=float)
+
+    for i in range(num_iterations):  # For each iteration
+
+        # Random disease module: for each degree, draw as many different PPI proteins with that degree
+        # as the disease module has. Drawing without replacement keeps the size of the module, which
+        # independent draws added to a set cannot guarantee (repeated draws would collapse).
+        random_prots = set()
+        for degree_prot, n_prots in proteins_per_degree.items():
+            available_prots = group_nodes_degree[degree_prot]
+            random_prots.update(np.random.choice(available_prots, size=n_prots, replace=False))
+
+        # dc of each drug (row j) between its random target module of this iteration and the random disease module
+        for j in range(len(drugs)):
+            random_target = df_results.iloc[j, i]
+            proximity_matrix[j, i] = calculate_dc_drug(random_target, arch_dist, random_prots, PPI)
+
+    drug_mean_proximity = []  # Mean of dc over all the iterations for each drug
+    deviation = []  # Standard deviation of dc over all the iterations for each drug
+
+    for row in proximity_matrix:  # For each row (for each drug)
+        if np.isnan(row).all():  # No iteration produced a dc for this drug
+            drug_mean_proximity.append(np.nan)
+            deviation.append(np.nan)
+        else:
+            drug_mean_proximity.append(np.nanmean(row))
+            deviation.append(np.nanstd(row))
+
+    data = {'Drugs': drugs, 'dc_mean': drug_mean_proximity, 'dc_std' : deviation}
+    result_table = pd.DataFrame(data)
+
+    return result_table
+
+
+
+# ================================================================================= 
+
+def calculate_random_drug_target_modules(num_iterations, df, PPI):
+    """
+    Input data: number of iterations (num_iterations), DataFrame (df) with "Drugs" and their "Targets",
+    and PPI network (PPI).
+    For every iteration and every drug, each target of the drug that is in the PPI is replaced by a
+    protein drawn at random among the PPI nodes with the same degree.
+    Returns a DataFrame with one row per drug and one column per iteration (0 .. num_iterations-1);
+    each cell holds the list of random targets of that drug in that iteration.
+    """
+    # nodes_by_degree is defined in this same module, so it is called directly
+    # (as proximity_random does) rather than through the module name
+    group_nodes_by_degree = nodes_by_degree(PPI)  # Group nodes by their degree
+    ppi_nodes = set(PPI.nodes())
+
+    # For each drug, the pool of same-degree PPI proteins of each of its targets. This does not depend
+    # on the iteration, so it is prepared once. Drugs and targets are paired by position, which also
+    # works when df does not have a default 0..n-1 index.
+    pools_by_drug = []
+    for drug, target_list in zip(df["Drugs"], df["Targets"]):
+        target_list_PPI = set(target_list) & ppi_nodes  # Targets of the drug that are in the PPI
+        pools = [group_nodes_by_degree[PPI.degree(target)] for target in target_list_PPI]
+        pools_by_drug.append((drug, pools))
+
+    results = {}
+    for i in range(num_iterations):  # for each iteration
+        iteration_results = {}
+        for drug, pools in pools_by_drug:
+            # One independent draw per target; the same protein may be drawn more than once
+            iteration_results[drug] = [np.random.choice(pool) for pool in pools]
+        results[i] = iteration_results
+
+    df_results = pd.DataFrame(results)
+    return df_results
+
+# ================================================================================= 
+
+def determine_treatment(row, dis_dru_the):
+    """
+    Tell whether the drug of a row is registered as a treatment of the disease of that row.
+
+    Input data: row with the keys 'ID' (disease) and 'Drugs' (drug); table of disease-drug
+    relationships (dis_dru_the) with the columns 'dis' and 'dru'.
+    Returns 'yes' when the table has at least one row with that disease and that drug,
+    and 'unknown' otherwise.
+    """
+    same_disease = dis_dru_the['dis'] == row['ID']
+    same_drug = dis_dru_the['dru'] == row['Drugs']
+
+    # Rows of the table where both the disease and the drug match
+    matching_rows = dis_dru_the[same_disease & same_drug]
+
+    if len(matching_rows) > 0:
+        return 'yes'
+    return 'unknown'
+
+# =================================================================================
+
+def rep_prox(df_combined, disease_name):
+    """
+    This function represents in two boxplots the distribution of the closest distance to a
+    disease and the distribution of its z-score, for the group of drugs used to treat the
+    disease and the group of drugs not used for its treatment (unknown).
+
+    Input:
+    1. df_combined: DataFrame with, at least, the columns 'ID' (disease), 'Treatment'
+    ('yes' or 'unknown'), 'Closest distance' and 'Dc_zscore'.
+    2. disease_name: value of the 'ID' column that selects the disease to plot.
+
+    Nothing is returned; the figure is displayed with plt.show().
+    """
+    # Filter the data to generate two new dataframes, one with data for drugs used as treatment for the disease, and another with data for the rest of drugs
+    drugs_with_disease = df_combined[(df_combined['ID'] == disease_name) & (df_combined['Treatment'] == 'yes')]
+    drugs_without_disease = df_combined[(df_combined['ID'] == disease_name) & (df_combined['Treatment'] == 'unknown')]
+
+    # Stack both groups, relabelling 'Treatment' with the names shown on the x axis
+    combined_data = pd.concat([drugs_with_disease.assign(Treatment='Treatment'), drugs_without_disease.assign(Treatment='Unknown')])
+
+    fig, axes = plt.subplots(1, 2, figsize=(14, 6))  # Create a subplot with 1 row and 2 columns
+
+    # Column to plot and y-axis label of each panel. The labels are raw strings because '\m' is
+    # not a valid escape sequence in a normal string literal.
+    panels = [
+        ('Closest distance', r'Closest distance ($\mathregular{d_c}$)'),  # Left: closest distance
+        ('Dc_zscore', r'Proximity [z-score ($\mathregular{d_c}$)]'),  # Right: z-score of the closest distance
+    ]
+
+    for ax, (column, y_label) in zip(axes, panels):
+        # Boxplot of the column for both groups of drugs
+        sns.boxplot(x='Treatment', y=column, data=combined_data, hue='Treatment', ax=ax, palette={'Treatment': '#FF7A7A', 'Unknown': '#79C4FF'}, dodge=False, medianprops=dict(linewidth=2), legend=False)
+        ax.set_ylabel(y_label, fontsize=12)
+        ax.set_xlabel('')
+        for label in ax.get_xticklabels():
+            label.set_fontsize(12)
+
+    plt.tight_layout()  # Adjust the layout of the subplot to avoid overlap
+    plt.show()
+
+# =================================================================================
\ No newline at end of file
--- a/analysis/schizophrenia/repurposing.ipynb
+++ b/analysis/schizophrenia/repurposing.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "b987b5e5-3a96-46dc-ad64-08edeb477a6c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "import networkx as nx\n",
+    "from tabulate import tabulate\n",
+    "from networkx.algorithms import bipartite\n",
+    "import random\n",
+    "from scipy.stats import norm\n",
+    "from itertools import combinations\n",
+    "import re\n",
+    "from itertools import product\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fe4b1e17-fa87-42d9-a28d-e53aebb77943",
+   "metadata": {},
+   "source": [
+    "### Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "c5bf61fe-29af-40b6-a85e-f5453c2b4b30",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "proximity = pd.read_csv(\"../results/Proximity_results.csv\", sep = \",\")\n",
+    "dge = pd.read_csv(\"../results/Filtering_by_DGE_results.csv\", sep = \",\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9a70afe4-73a7-4436-a8d9-14deba773032",
+   "metadata": {},
+   "source": [
+    "### Filtering of drug repurposing candidates"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "06d790a3-6c11-4724-9ed3-0c764083243f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# filtering of schizophrenia data\n",
+    "proximity_schizo = proximity[proximity[\"ID\"] == \"C0036341\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "07ae6a91-08a8-4bab-97c2-b47070875c48",
+   "metadata": {},
+   "source": [
+    "#### 1) Proximal drugs to schizophrenia module"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "493872db-4d45-4ca6-a613-d0adafacd5c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "proximal_drugs_schizo = proximity_schizo[(proximity_schizo[\"Dc_zscore\"] <= -0.15) & \n",
+    "                                        (proximity_schizo[\"Treatment\"] == \"unknown\")]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4e3e28c3-f78d-4d20-9745-3d28dad6838a",
+   "metadata": {},
+   "source": [
+    "#### 2) Distance to schizophrenia module"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "1f33226f-a0b8-4d76-be20-f82ca3851d6e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Q1 and median of closest distance for treatment drugs\n",
+    "\n",
+    "treatment_schizo = proximity_schizo[proximity_schizo[\"Treatment\"] == \"yes\"]\n",
+    "Q1_treatment = treatment_schizo[\"Closest distance\"].quantile(0.25)\n",
+    "median_treatment = treatment_schizo[\"Closest distance\"].quantile(0.5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "79a8a797-2089-4c60-bcb3-690de1bdbec6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Unknown drugs with distance to schizophrenia module between Q1 and median of treatment drugs\n",
+    "\n",
+    "closest_drugs_schizo = proximal_drugs_schizo[(proximal_drugs_schizo[\"Closest distance\"] >= Q1_treatment) &\n",
+    "                                       (proximal_drugs_schizo[\"Closest distance\"] <= median_treatment)]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d6f6a702-06ee-4d94-8982-1ff4659b8dbc",
+   "metadata": {},
+   "source": [
+    "#### 3) Targets with significant DGE in schizophrenia and correlated with its co-expression module"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "0665674b-9fbd-4855-9785-7a93b1907540",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "drugs_in_dge = dge[\"Drugs\"].unique()\n",
+    "drugs_schizo_filtered =  closest_drugs_schizo[(closest_drugs_schizo[\"Drugs\"].isin(drugs_in_dge))]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "418c6fb7-4a32-407e-bb35-9c13804d1dbf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "proximity_schizo_filtered = drugs_schizo_filtered.sort_values(by=\"Dc_zscore\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "c6dbe376-fca9-43c1-9c76-cad14964f703",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "proximity_schizo_filtered = proximity_schizo_filtered.drop('ID', axis=1).drop('Treatment', axis=1).drop('Dc_std', axis=1).drop('Dc_mean', axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "05f3dce7-b0ea-4ce3-b06a-52c52f3bb7a7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "proximity_schizo_filtered.to_csv(\"../results/Repurposing candidates schizophrenia.csv\", index = False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "12a5a03c-76c7-4817-a596-2e89724302d4",
+   "metadata": {},
+   "source": [
+    "### MeSH Pharmacological Actions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "73e12501-9ea1-4d08-a87a-88a30036d1d8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "categories = pd.read_csv(\"../files/drug_categories.csv\", sep = \",\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2b7c78db-7596-4759-a5e1-50d9e121abb4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "merged_drugs = pd.merge(proximity_schizo_filtered, dge[['Drugs', 'Gene symbol']], on='Drugs')\n",
+    "merged_drugs = pd.merge(merged_drugs, categories[['drug_id', 'class_name', 'type']], left_on='Drugs', right_on='drug_id')\n",
+    "\n",
+    "drugs_classification = merged_drugs[merged_drugs['class_name'].str.contains('Agents')]\n",
+    "\n",
+    "drugs_classification = drugs_classification[['Drugs', 'Closest distance', 'Dc_zscore', 'Gene symbol', 'class_name', 'type']]\n",
+    "drugs_classification.columns = ['Drugs', 'Closest distance', 'Proximity', 'Gene target', 'Classification', 'Type']\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "21abf2e1-c9df-47ab-86a7-0bcf5523b0ba",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "            Drugs  Closest distance   Proximity Gene target  \\\n",
+      "1       CHEMBL623          0.300000 -115.579629       KCNH2   \n",
+      "8       CHEMBL113          0.363636 -103.512846      PIK3CB   \n",
+      "11      CHEMBL113          0.363636 -103.512846      PIK3CB   \n",
+      "15      CHEMBL941          0.500000  -94.879347        DDR1   \n",
+      "20  CHEMBL1628227          0.352941  -90.844253       KCNH2   \n",
+      "28       CHEMBL83          0.428571  -85.950381       KCNH2   \n",
+      "30       CHEMBL83          0.428571  -85.950381       KCNH2   \n",
+      "34    CHEMBL23588          0.500000  -77.736571       PPARA   \n",
+      "35      CHEMBL640          0.333333  -75.756252       KCNH2   \n",
+      "39      CHEMBL709          0.500000  -74.365765       KCNH2   \n",
+      "43   CHEMBL278819          0.333333  -65.014945        MAOA   \n",
+      "44      CHEMBL652          0.500000  -51.717537       KCNH2   \n",
+      "47      CHEMBL413          0.333333  -22.765103        FGF2   \n",
+      "49      CHEMBL413          0.333333  -22.765103        FGF2   \n",
+      "50      CHEMBL413          0.333333  -22.765103        FGF2   \n",
+      "53       CHEMBL43          0.500000   -8.866920       KCNH2   \n",
+      "\n",
+      "                              Classification    Type  \n",
+      "1   Antidepressive Agents, Second-Generation  MESHPA  \n",
+      "8    Anti-Inflammatory Agents, Non-Steroidal  MESHPA  \n",
+      "11                      Antimutagenic Agents  MESHPA  \n",
+      "15                     Antineoplastic Agents  MESHPA  \n",
+      "20          Antidepressive Agents, Tricyclic  MESHPA  \n",
+      "28           Antineoplastic Agents, Hormonal  MESHPA  \n",
+      "30          Bone Density Conservation Agents  MESHPA  \n",
+      "34                  Anti-Inflammatory Agents  MESHPA  \n",
+      "35                    Anti-Arrhythmia Agents  MESHPA  \n",
+      "39                         Urological Agents  MESHPA  \n",
+      "43                     Antidepressive Agents  MESHPA  \n",
+      "44                    Anti-Arrhythmia Agents  MESHPA  \n",
+      "47                     Anti-Bacterial Agents  MESHPA  \n",
+      "49                         Antifungal Agents  MESHPA  \n",
+      "50                  Immunosuppressive Agents  MESHPA  \n",
+      "53                     Antineoplastic Agents  MESHPA  \n"
+     ]
+    }
+   ],
+   "source": [
+    "merged_drugs = pd.merge(proximity_schizo_filtered, dge[['Drugs', 'Gene symbol']], on='Drugs')\n",
+    "merged_drugs = pd.merge(merged_drugs, categories[['drug_id', 'class_name', 'type']], left_on='Drugs', right_on='drug_id')\n",
+    "\n",
+    "drugs_classification = merged_drugs[merged_drugs['class_name'].str.contains('Agents')]\n",
+    "\n",
+    "drugs_classification = drugs_classification[['Drugs', 'Closest distance', 'Dc_zscore', 'Gene symbol', 'class_name', 'type']]\n",
+    "drugs_classification.columns = ['Drugs', 'Closest distance', 'Proximity', 'Gene target', 'Classification', 'Type']\n",
+    "\n",
+    "\n",
+    "print(drugs_classification)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "id": "447d822e-7e21-45fa-993e-53659fda6fc5",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e3898d83-c1ed-4c95-88d5-eca1ae6debe6",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/analysis/total diseases analysis.ipynb
+++ b/analysis/total diseases analysis.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "f28f5e21-ca09-42c9-b7c3-1868d37db298",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "import networkx as nx\n",
+    "from tabulate import tabulate\n",
+    "from networkx.algorithms import bipartite\n",
+    "import random\n",
+    "from scipy.stats import norm\n",
+    "from itertools import combinations\n",
+    "import re\n",
+    "from itertools import product\n",
+    "import sys\n",
+    "sys.path.append('schizophrenia/functions_network_medicine_schizo.py')  # Replace '/path/to/your/notebook' with the actual path to the directory that contains the functions.py file\n",
+    "import functions_network_medicine_schizo"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "99573d30-ec91-4703-8cc2-1597e6ed6fc3",
+   "metadata": {},
+   "source": [
+    "### Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "1dbaebbd-1ad3-4483-8418-7bf44a41db5e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#nodes\n",
+    "pro = pd.read_csv('../data/nodes/pro.tsv', sep=\"\\t\")\n",
+    "gen = pd.read_csv('../data/nodes/gen.tsv', sep=\"\\t\")\n",
+    "dru = pd.read_csv('../data/nodes/dru.tsv', sep=\"\\t\")\n",
+    "dis = pd.read_csv('../data/nodes/dis.tsv', sep=\"\\t\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "8db5703d-6d8e-4295-8e34-895fe8b6d1ae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#links\n",
+    "pro_pro = pd.read_csv('data/links/pro_pro.tsv', sep=\"\\t\")\n",
+    "dis_gen = pd.read_csv('data/links/dis_gen.tsv', sep=\"\\t\")\n",
+    "dse_sym = pd.read_csv('data/dse_sym_limpio.tsv', sep=\"\\t\")\n",
+    "dis_dru_the = pd.read_csv('data/links/dis_dru_the.tsv', sep=\"\\t\")\n",
+    "gen_pro = pd.read_csv('data/links/gen_pro.tsv', sep=\"\\t\")\n",
+    "dru_pro = pd.read_csv('data/links/dru_pro.tsv', sep=\"\\t\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "d33a5a56-e66c-4a5d-a695-b5ef8972f9cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#file with SPLs between all PPI nodes\n",
+    "spl = pd.read_csv('files/SPL PPI.csv', index_col='Source')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2519e85f-388b-4146-9906-aa450cfd4316",
+   "metadata": {},
+   "source": [
+    "### Interactome"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "5160b911-92a7-4687-8c8c-dfe38f1d0372",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "G_ppi = nx.from_pandas_edgelist(pro_pro,'prA','prB')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ebc315c4-08ad-4f50-849e-0e4bfac34b1f",
+   "metadata": {},
+   "source": [
+    "### Identification of disease module"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cf1179a-49fe-4202-9f60-1600bb0a5634",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Dictionaries to store the results for each disease\n",
+    "dis_gen_dict = {}  # Seed genes\n",
+    "dis_lcc = {}  # Module size\n",
+    "dis_lcc_dp = {}  # Results for statistical validation dp\n",
+    "dis_lcc_ndp = {}  # Results for statistical validation ndp\n",
+    "\n",
+    "dis_total = dis_dru_the['dis'].unique().tolist()\n",
+    "\n",
+    "for dis in dis_total:\n",
+    "    # Seed genes\n",
+    "    genes = functions_network_medicine_schizo.genes_dis(dis, dis_gen)\n",
+    "    dis_gen_dict[dis] = genes\n",
+    "    \n",
+    "    # Disease proteins \n",
+    "    prots = functions_network_medicine_schizo.pro_gen_dict(genes, gen_pro)\n",
+    "    \n",
+    "    # Disease proteins in interactome\n",
+    "    prots_interactome = functions_network_medicine_schizo.gen_pro_PPI(prots, pro_pro)\n",
+    "\n",
+    "    # Module size\n",
+    "    SG_dis = G_ppi.subgraph(prots_interactome)\n",
+    "    \n",
+    "    if SG_dis: # if disease has at least one protein in its module\n",
+    "        lcc = functions_network_medicine_schizo.lcc(SG_dis)\n",
+    "        dis_lcc[dis] = lcc\n",
+    "        \n",
+    "        # Statistical validation ndp\n",
+    "        lcc_ndp = functions_network_medicine_schizo.lcc_simulation(SG_dis, lcc, G_ppi, dp=False)\n",
+    "        dis_lcc_ndp[dis] = lcc_ndp\n",
+    "        \n",
+    "        # Statistical validation dp\n",
+    "        lcc_dp = functions_network_medicine_schizo.lcc_simulation(SG_dis, lcc, G_ppi, dp=True)\n",
+    "        dis_lcc_dp[dis] = lcc_dp   \n",
+    "        \n",
+    "    else:\n",
+    "        dis_lcc[dis] = []\n",
+    "        dis_lcc_ndp[dis] = []\n",
+    "        dis_lcc_dp[dis] = []\n",
+    "\n",
+    "\n",
+    "results = {\n",
+    "    \"ID\": dis_total,\n",
+    "    \"Seed genes\": [len(dis_gen_dict[dis]) for dis in dis_total],\n",
+    "    \"Genes in PPI\": [len(df_pro_dis_total_filt[df_pro_dis_total_filt[\"dis\"] == dis][\"pro_ppi\"].iloc[0]) for dis in dis_total],\n",
+    "    \"Genes in LCC\": [len(dis_lcc[dis]) for dis in dis_total],\n",
+    "    \"Relative LCC\": [len(dis_lcc[dis]) / len(df_pro_dis_total_filt[df_pro_dis_total_filt[\"dis\"] == dis][\"pro_ppi\"].iloc[0]) for dis in dis_total],\n",
+    "    \"Mean random LCC (ndp)\": [dis_lcc_ndp[dis][0] for dis in dis_total],\n",
+    "    \"Std random LCC (ndp)\": [dis_lcc_ndp[dis][1] for dis in dis_total],\n",
+    "    \"Z-score (ndp)\": [dis_lcc_ndp[dis][2] for dis in dis_total],\n",
+    "    \"Mean random LCC (dp)\": [dis_lcc_dp[dis][0] for dis in dis_total],\n",
+    "    \"Std random LCCs (dp)\": [dis_lcc_dp[dis][1] for dis in dis_total], \n",
+    "    \"Z-score (dp)\": [dis_lcc_dp[dis][2] for dis in dis_total]\n",
+    "}\n",
+    "\n",
+    "dis_module_df = pd.DataFrame(results)\n",
+    "dis_module_df.to_csv(\"Module.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "92f1b21e-eb2c-4a35-b9a0-1562e4751677",
+   "metadata": {},
+   "source": [
+    "### Disease filtering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "480659cf-1dd8-409c-a2d5-418a16e97bb2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep only diseases whose module size is significant: z-score > 1.65 in both the dp and ndp statistical validations\n",
+    "significative = dis_module_df[(dis_module_df['Z-score (dp)'] > 1.65) & (dis_module_df['Z-score (ndp)'] > 1.65)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "115a0101-7ed4-4ef4-959e-ab07c7681c61",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep only diseases with at least 15 pathological proteins in their module (LCC) within the interactome\n",
+    "\n",
+    "for i, lcc in significative[\"Genes in LCC\"].items():\n",
+    "    if lcc < 15: \n",
+    "        disease_no_sig = significative[\"ID\"][i]\n",
+    "        \n",
+    "        row = significative[significative['ID'] == disease_no_sig]\n",
+    "        \n",
+    "        significative = significative.drop(row.index)\n",
+    "        \n",
+    "significative.reset_index(drop=True, inplace=True)\n",
+    "significative.to_csv(\"Module signif.csv\", index = False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "194a51fd-885e-4e82-96a5-aa2a77250ab1",
+   "metadata": {},
+   "source": [
+    "### Closest distance disease-drug"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "722cf051-f5ee-4473-9674-e5519d8616ce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "significative = pd.read_csv('Module signif.csv', sep=\",\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "1553f0f2-c28e-4dd3-8643-f791be9e1dca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#DataFrame with drugs and their targets:\n",
+    "total_drug_list = set()\n",
+    "for drug in dru_pro[\"dru\"]:\n",
+    "     if drug in set(dis_dru_the[\"dru\"].values): # drugs for which we have information about what diseases they treat\n",
+    "            total_drug_list.add(drug)\n",
+    "targets_total = functions_network_medicine.targets(total_drug_list, dru_pro)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dfdd0ff9-b286-43b1-8c9a-7f6e067cf066",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "results = [] #list with the results of each disease for the observed distance\n",
+    "\n",
+    "for i, dis in enumerate(df_pro_dis_total_filt[\"dis\"]):\n",
+    "    SG_dis = G_ppi.subgraph(df_pro_dis_total_filt[\"pro_ppi\"][i]) #disease subnetwork\n",
+    "    lcc = funciones_network_medicine.lcc(SG_dis) # disease module\n",
+    "    proximity_obs = funciones_network_medicine.proximity(targets_total, spl, lcc, G_ppi) # DF with drugs and distance to disease\n",
+    "    \n",
+    "    for _, row in proximity_obs.iterrows():\n",
+    "        results.append({\"ID\": dis, \"Drugs\": row[\"Fármacos\"], \"Closest distance\": row[\"dc\"]})\n",
+    "\n",
+    "df_distance = pd.DataFrame(results)\n",
+    "\n",
+    "df_distance.to_csv(\"Closest distance.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5a496aeb-4674-417a-ac3e-5a8323e148ed",
+   "metadata": {},
+   "source": [
+    "### Proximity disease-drug"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "aaefdc73-2962-46c7-a510-8fe22adf0e70",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# random target modules\n",
+    "target_modules_list_random_drugs = functions_network_medicine_schizo.calculate_random_drug_target_modules(1000, targets_total, G_ppi)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d58deeea-711d-45d8-9ed9-91bbbeef16f0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame(columns=[\"ID\", \"Drugs\", \"Dc_mean\", \"Dc_std\"])\n",
+    "\n",
+    "# Create the output file containing only the header row; the results of each disease are appended to it below\n",
+    "with open(\"Rand_closest_distance_rand.csv\", 'w', newline='') as file:\n",
+    "    writer = csv.writer(file)\n",
+    "    writer.writerow([\"ID\", \"Drugs\", \"Dc_mean\", \"Dc_std\"])\n",
+    "\n",
+    "# determination of closest distance in random set of disease module and target module\n",
+    "for i, dis in enumerate(significative[\"ID\"]):\n",
+    "    genes = functions_network_medicine_schizo.genes_dis(dis, dis_gen)\n",
+    "    prots = functions_network_medicine_schizo.pro_gen_dict(genes, gen_pro)\n",
+    "    pro_ppi = functions_network_medicine_schizo.gen_pro_PPI(prots, pro_pro)\n",
+    "    SG_dis = G_ppi.subgraph(pro_ppi)\n",
+    "    lcc = functions_network_medicine_schizo.lcc(SG_dis)\n",
+    "    proximity_rand = functions_network_medicine_schizo.proximity_random(targets_total, spl, lcc, G_ppi, 1000, target_modules_list_random_drugs)\n",
+    "        \n",
+    "    results = []\n",
+    "    for _, row in proximity_rand.iterrows():\n",
+    "        results.append({\"ID\": dis, \"Drugs\": row[\"Drugs\"], \"Dc_mean\": row[\"dc_mean\"], \"Dc_std\": row[\"dc_std\"]})\n",
+    "    \n",
+    "    with open(\"Rand_closest_distance_rand.csv\", 'a', newline='') as file:\n",
+    "        writer = csv.writer(file)\n",
+    "        for result in results:\n",
+    "            writer.writerow([result[\"ID\"], result[\"Drugs\"], result[\"Dc_mean\"], result[\"Dc_std\"]])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "c96edf3d-41dc-4837-9893-a9ae1d5fa2f1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rand_prox = pd.read_csv(\"Rand_closest_distance_rand.csv\", sep = \",\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "id": "084b52bd-b94b-4508-b29e-f131c72837ba",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "distance=pd.read_csv(\"Closest distance.csv\", sep=\",\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "id": "672acf4e-7d8e-421b-bfcb-5d05b4be4fa9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Join the observed closest distances (distance) and the random closest distances (rand_prox) on the ID and Drugs columns\n",
+    "df_merged = pd.merge(distance, rand_prox, on=['ID', 'Drugs'])\n",
+    "\n",
+    "# dc_zscore\n",
+    "df_merged['Dc_zscore'] = (df_merged['Closest distance'] - df_merged['Dc_mean']) / df_merged['Dc_std']\n",
+    "\n",
+    "# New column indicating if each drug is used for the treatment of the disease\n",
+    "df_merged['Treatment'] = df_merged.apply(lambda row: functions_network_medicine_schizo.determine_treatment(row, dis_dru_the), axis=1)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "id": "92dc074e-4ebc-4466-92a2-9a8893e9a6f7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged.to_csv(\"results/Proximity_results.csv\", index = False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "id": "3586e5aa-4877-4d70-b986-5bb92d881bc0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "proximity = pd.read_csv(\"results/Proximity_results.csv\", sep = \",\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d0a5d2a3-b632-4874-bed6-67ed6e3e4714",
+   "metadata": {},
+   "source": [
+    "### Statistical analysis of the proximity results across all the diseases"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 117,
+   "id": "5b03a68c-307e-42b5-8209-a27c44dafb00",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Filter the data to include only relevant columns\n",
+    "df_relevant = proximity[['Closest distance', 'Dc_zscore', 'Treatment']]\n",
+    "\n",
+    "# Compute descriptive statistics for 'Closest distance' and 'Dc_zscore' grouped by 'Treatment'\n",
+    "describe_closest_distance = df_relevant.groupby('Treatment')['Closest distance'].describe()\n",
+    "describe_dc_zscore = df_relevant.groupby('Treatment')['Dc_zscore'].describe()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 118,
+   "id": "52b62828-5c27-4578-bb46-9a2b5de60258",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Describe de Closest distance:\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>count</th>\n",
+       "      <th>mean</th>\n",
+       "      <th>std</th>\n",
+       "      <th>min</th>\n",
+       "      <th>25%</th>\n",
+       "      <th>50%</th>\n",
+       "      <th>75%</th>\n",
+       "      <th>max</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Treatment</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>unknown</th>\n",
+       "      <td>516993.0</td>\n",
+       "      <td>1.741093</td>\n",
+       "      <td>0.590947</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.4</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>yes</th>\n",
+       "      <td>12162.0</td>\n",
+       "      <td>1.473756</td>\n",
+       "      <td>0.691416</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.6</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>3.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "              count      mean       std  min  25%  50%  75%  max\n",
+       "Treatment                                                       \n",
+       "unknown    516993.0  1.741093  0.590947  0.0  1.4  2.0  2.0  4.0\n",
+       "yes         12162.0  1.473756  0.691416  0.0  1.0  1.6  2.0  3.0"
+      ]
+     },
+     "execution_count": 118,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "print(\"\\nDescribe Closest distance:\")\n",
+    "describe_closest_distance.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 119,
+   "id": "099c6bab-f16f-4b16-bb17-99a86a298498",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Describe de Dc_zscore:\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>count</th>\n",
+       "      <th>mean</th>\n",
+       "      <th>std</th>\n",
+       "      <th>min</th>\n",
+       "      <th>25%</th>\n",
+       "      <th>50%</th>\n",
+       "      <th>75%</th>\n",
+       "      <th>max</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Treatment</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>unknown</th>\n",
+       "      <td>510797.0</td>\n",
+       "      <td>-0.159867</td>\n",
+       "      <td>4.989091</td>\n",
+       "      <td>-187.636997</td>\n",
+       "      <td>-0.709030</td>\n",
+       "      <td>0.123404</td>\n",
+       "      <td>0.733509</td>\n",
+       "      <td>164.760255</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>yes</th>\n",
+       "      <td>12053.0</td>\n",
+       "      <td>-1.295405</td>\n",
+       "      <td>8.074283</td>\n",
+       "      <td>-163.309391</td>\n",
+       "      <td>-1.741129</td>\n",
+       "      <td>-0.465569</td>\n",
+       "      <td>0.400364</td>\n",
+       "      <td>104.948135</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "              count      mean       std         min       25%       50%  \\\n",
+       "Treatment                                                                 \n",
+       "unknown    510797.0 -0.159867  4.989091 -187.636997 -0.709030  0.123404   \n",
+       "yes         12053.0 -1.295405  8.074283 -163.309391 -1.741129 -0.465569   \n",
+       "\n",
+       "                75%         max  \n",
+       "Treatment                        \n",
+       "unknown    0.733509  164.760255  \n",
+       "yes        0.400364  104.948135  "
+      ]
+     },
+     "execution_count": 119,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "print(\"\\nDescribe Dc_zscore:\")\n",
+    "describe_dc_zscore.head()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}