{ "cells": [
 { "cell_type": "code", "execution_count": 1, "id": "f28f5e21-ca09-42c9-b7c3-1868d37db298", "metadata": {}, "outputs": [], "source": [
  "# Standard library\n",
  "import random\n",
  "import re\n",
  "import sys\n",
  "from itertools import combinations\n",
  "from itertools import product\n",
  "\n",
  "# Third-party\n",
  "import matplotlib.pyplot as plt\n",
  "import networkx as nx\n",
  "import numpy as np\n",
  "import pandas as pd\n",
  "import seaborn as sns\n",
  "from networkx.algorithms import bipartite\n",
  "from scipy.stats import norm\n",
  "from tabulate import tabulate\n",
  "\n",
  "# Project helpers. sys.path entries must be directories (or zip archives), so the\n",
  "# folder is appended here instead of the path of the .py file itself.\n",
  "sys.path.append('schizophrenia')\n",
  "import functions_network_medicine_schizo"
 ] },
 { "cell_type": "markdown", "id": "99573d30-ec91-4703-8cc2-1597e6ed6fc3", "metadata": {}, "source": [ "### Data" ] },
 { "cell_type": "code", "execution_count": 2, "id": "1dbaebbd-1ad3-4483-8418-7bf44a41db5e", "metadata": {}, "outputs": [], "source": [
  "# nodes (tab-separated tables under data/nodes)\n",
  "pro = pd.read_csv('data/nodes/pro.tsv', sep=\"\\t\")\n",
  "gen = pd.read_csv('data/nodes/gen.tsv', sep=\"\\t\")\n",
  "dru = pd.read_csv('data/nodes/dru.tsv', sep=\"\\t\")\n",
  "dis = pd.read_csv('data/nodes/dis.tsv', sep=\"\\t\")"
 ] },
 { "cell_type": "code", "execution_count": 3, "id": "8db5703d-6d8e-4295-8e34-895fe8b6d1ae", "metadata": {}, "outputs": [], "source": [
  "# links (tab-separated edge tables)\n",
  "pro_pro = pd.read_csv('data/links/pro_pro.tsv', sep=\"\\t\")\n",
  "dis_gen = pd.read_csv('data/links/dis_gen.tsv', sep=\"\\t\")\n",
  "dse_sym = pd.read_csv('data/dse_sym_limpio.tsv', sep=\"\\t\")\n",
  "dis_dru_the = pd.read_csv('data/links/dis_dru_the.tsv', sep=\"\\t\")\n",
  "gen_pro = pd.read_csv('data/links/gen_pro.tsv', sep=\"\\t\")\n",
  "dru_pro = pd.read_csv('data/links/dru_pro.tsv', sep=\"\\t\")"
 ] },
 { "cell_type": "code", "execution_count": 4, "id": "d33a5a56-e66c-4a5d-a695-b5ef8972f9cc", "metadata": {}, "outputs": [], "source": [
  "# file with SPLs between all PPI nodes; the 'Source' column becomes the row index\n",
  "spl = pd.read_csv('files/SPL PPI.csv', index_col='Source')"
 ] },
 { "cell_type": "markdown", "id": 
"2519e85f-388b-4146-9906-aa450cfd4316", "metadata": {}, "source": [ "### Interactome" ] },
 { "cell_type": "code", "execution_count": 5, "id": "5160b911-92a7-4687-8c8c-dfe38f1d0372", "metadata": {}, "outputs": [], "source": [
  "# Interactome graph: one edge per (prA, prB) row of pro_pro\n",
  "G_ppi = nx.from_pandas_edgelist(pro_pro, 'prA', 'prB')"
 ] },
 { "cell_type": "markdown", "id": "ebc315c4-08ad-4f50-849e-0e4bfac34b1f", "metadata": {}, "source": [ "### Identification of disease module" ] },
 { "cell_type": "code", "execution_count": null, "id": "1cf1179a-49fe-4202-9f60-1600bb0a5634", "metadata": {}, "outputs": [], "source": [
  "# Dictionaries to store the results for each disease (keyed by disease ID)\n",
  "dis_gen_dict = {}  # Seed genes\n",
  "dis_prots_dict = {}  # Disease proteins obtained from the seed genes\n",
  "dis_lcc = {}  # Module (LCC) of each disease\n",
  "dis_lcc_dp = {}  # Results for statistical validation dp\n",
  "dis_lcc_ndp = {}  # Results for statistical validation ndp\n",
  "prots_interactome_dict = {}  # Disease proteins in interactome\n",
  "\n",
  "# Placeholder stored for diseases without any protein in the interactome, so that the\n",
  "# summary table below can still read positions 0, 1 and 2 (mean, std, z-score).\n",
  "NO_SIMULATION = (np.nan, np.nan, np.nan)\n",
  "\n",
  "dis_total = dis_dru_the['dis'].unique().tolist()\n",
  "\n",
  "# The loop variable is named dis_id so it does not overwrite the `dis` node table.\n",
  "for dis_id in dis_total:\n",
  "    # Seed genes\n",
  "    genes = functions_network_medicine_schizo.genes_dis(dis_id, dis_gen)\n",
  "    dis_gen_dict[dis_id] = genes\n",
  "\n",
  "    # Disease proteins\n",
  "    prots = functions_network_medicine_schizo.pro_gen_dict(genes, gen_pro)\n",
  "    dis_prots_dict[dis_id] = prots\n",
  "\n",
  "    # Disease proteins in interactome\n",
  "    prots_interactome = functions_network_medicine_schizo.gen_pro_PPI(prots, pro_pro)\n",
  "    prots_interactome_dict[dis_id] = prots_interactome\n",
  "\n",
  "    # Module size\n",
  "    SG_dis = G_ppi.subgraph(prots_interactome)\n",
  "\n",
  "    if SG_dis.number_of_nodes() > 0:  # disease has at least one protein in the interactome\n",
  "        lcc = functions_network_medicine_schizo.lcc(SG_dis)\n",
  "        dis_lcc[dis_id] = lcc\n",
  "\n",
  "        # Statistical validation ndp\n",
  "        dis_lcc_ndp[dis_id] = functions_network_medicine_schizo.lcc_simulation(SG_dis, lcc, G_ppi, dp=False)\n",
  "\n",
  "        # Statistical validation dp\n",
  "        dis_lcc_dp[dis_id] = functions_network_medicine_schizo.lcc_simulation(SG_dis, lcc, G_ppi, dp=True)\n",
  "    else:\n",
  "        dis_lcc[dis_id] = []\n",
  "        dis_lcc_ndp[dis_id] = NO_SIMULATION\n",
  "        dis_lcc_dp[dis_id] = NO_SIMULATION\n",
  "\n",
  "\n",
  "# One row per disease. The relative LCC divides by the number of disease proteins\n",
  "# cached in the loop above and is reported as 0 when a disease has none.\n",
  "results = {\n",
  "    \"ID\": dis_total,\n",
  "    \"Seed genes\": [len(dis_gen_dict[d]) for d in dis_total],\n",
  "    \"Genes in PPI\": [len(prots_interactome_dict[d]) for d in dis_total],\n",
  "    \"Genes in LCC\": [len(dis_lcc[d]) for d in dis_total],\n",
  "    \"Relative LCC\": [len(dis_lcc[d]) / len(dis_prots_dict[d]) if len(dis_prots_dict[d]) != 0 else 0 for d in dis_total],\n",
  "    \"Mean random LCC (ndp)\": [dis_lcc_ndp[d][0] for d in dis_total],\n",
  "    \"Std random LCC (ndp)\": [dis_lcc_ndp[d][1] for d in dis_total],\n",
  "    \"Z-score (ndp)\": [dis_lcc_ndp[d][2] for d in dis_total],\n",
  "    \"Mean random LCC (dp)\": [dis_lcc_dp[d][0] for d in dis_total],\n",
  "    \"Std random LCCs (dp)\": [dis_lcc_dp[d][1] for d in dis_total],\n",
  "    \"Z-score (dp)\": [dis_lcc_dp[d][2] for d in dis_total]\n",
  "}\n",
  "\n",
  "dis_module_df = pd.DataFrame(results)\n",
  "dis_module_df.to_csv(\"Module.csv\", index=False)"
 ] },
 { "cell_type": "markdown", "id": "92f1b21e-eb2c-4a35-b9a0-1562e4751677", "metadata": {}, "source": [ "### Disease filtering" ] },
 { "cell_type": "code", "execution_count": null, "id": "480659cf-1dd8-409c-a2d5-418a16e97bb2", "metadata": {}, "outputs": [], "source": [
  "# Filter of diseases with a significant module size: both z-scores must exceed the cut-off\n",
  "Z_THRESHOLD = 1.65  # NOTE(review): presumably the one-sided 5% level - confirm\n",
  "significative = dis_module_df[(dis_module_df['Z-score (dp)'] > Z_THRESHOLD) & (dis_module_df['Z-score (ndp)'] > Z_THRESHOLD)]"
 ] },
 { "cell_type": "code", "execution_count": 42, "id": "115a0101-7ed4-4ef4-959e-ab07c7681c61", "metadata": {}, "outputs": [], "source": [
  "# Keep only diseases with at least 15 pathological proteins in their module (LCC).\n",
  "# A boolean mask is used because, after the z-score filter, the index of `significative`\n",
  "# still carries the row labels of dis_module_df, so positional counters do not match it.\n",
  "MIN_LCC_SIZE = 15\n",
  "significative = significative[significative[\"Genes in LCC\"] >= MIN_LCC_SIZE].reset_index(drop=True)\n",
  "significative.to_csv(\"Module signif.csv\", index = False)"
 ] },
 { "cell_type": "markdown", "id": "194a51fd-885e-4e82-96a5-aa2a77250ab1", "metadata": {}, "source": [ "### Closest distance disease-drug" ] },
 { "cell_type": "code", "execution_count": 8, "id": "722cf051-f5ee-4473-9674-e5519d8616ce", "metadata": {}, "outputs": [], "source": [
  "# Reload the filtered diseases saved by the previous section\n",
  "significative = pd.read_csv('Module signif.csv', sep=\",\")"
 ] },
 { "cell_type": "code", "execution_count": 9, "id": "1553f0f2-c28e-4dd3-8643-f791be9e1dca", "metadata": {}, "outputs": [], "source": [
  "# DataFrame with drugs and their targets.\n",
  "# Only drugs that appear in dru_pro and also in dis_dru_the are kept, i.e. drugs for\n",
  "# which we have information about what diseases they treat.\n",
  "treated_drugs = set(dis_dru_the[\"dru\"])\n",
  "total_drug_list = set(dru_pro[\"dru\"]) & treated_drugs\n",
  "targets_total = functions_network_medicine_schizo.targets(total_drug_list, dru_pro)"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "dfdd0ff9-b286-43b1-8c9a-7f6e067cf066", "metadata": {}, "outputs": [], "source": [
  "results = []  # one record per (disease, drug) pair with the observed distance\n",
  "\n",
  "for dis_id in significative[\"ID\"]:\n",
  "    # Rebuild the disease module with the same helper calls used for module identification\n",
  "    genes = functions_network_medicine_schizo.genes_dis(dis_id, dis_gen)\n",
  "    prots = functions_network_medicine_schizo.pro_gen_dict(genes, gen_pro)\n",
  "    pro_ppi = functions_network_medicine_schizo.gen_pro_PPI(prots, pro_pro)\n",
  "    SG_dis = G_ppi.subgraph(pro_ppi)  # disease subnetwork\n",
  "    lcc = functions_network_medicine_schizo.lcc(SG_dis)  # disease module\n",
  "    proximity_obs = functions_network_medicine_schizo.proximity(targets_total, spl, lcc, G_ppi)  # DF with drugs and distance to disease\n",
  "\n",
  "    # NOTE(review): the column names returned by proximity() are not visible from this\n",
  "    # notebook; accept \"Drugs\" as well as the original \"Fármacos\" - confirm against the helper.\n",
  "    drug_col = \"Drugs\" if \"Drugs\" in proximity_obs.columns else \"Fármacos\"\n",
  "    results.extend(\n",
  "        {\"ID\": dis_id, \"Drugs\": drug, \"Closest distance\": dc}\n",
  "        for drug, dc in zip(proximity_obs[drug_col], proximity_obs[\"dc\"])\n",
  "    )\n",
  "\n",
  "# Passing the columns explicitly keeps the header even when no record was produced\n",
  "df_distance = pd.DataFrame(results, columns=[\"ID\", \"Drugs\", \"Closest distance\"])\n",
  "\n",
  "df_distance.to_csv(\"Closest distance.csv\", index=False)"
 ] },
 { "cell_type": "markdown", "id": "5a496aeb-4674-417a-ac3e-5a8323e148ed", 
"metadata": {}, "source": [ "### Proximity disease-drug" ] },
 { "cell_type": "code", "execution_count": 13, "id": "aaefdc73-2962-46c7-a510-8fe22adf0e70", "metadata": {}, "outputs": [], "source": [
  "# random target modules\n",
  "# NOTE(review): no random seed is set in this notebook; confirm whether the helper seeds\n",
  "# its own RNG, otherwise this step is not reproducible between runs.\n",
  "N_RANDOM = 1000  # value passed to both random-simulation helpers in this section\n",
  "target_modules_list_random_drugs = functions_network_medicine_schizo.calculate_random_drug_target_modules(N_RANDOM, targets_total, G_ppi)"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "d58deeea-711d-45d8-9ed9-91bbbeef16f0", "metadata": {}, "outputs": [], "source": [
  "RAND_CSV = \"Closest_distance_rand.csv\"\n",
  "RAND_COLUMNS = [\"ID\", \"Drugs\", \"Dc_mean\", \"Dc_std\"]\n",
  "\n",
  "# Start from a file holding only the header; every disease appends its rows below,\n",
  "# so the rows written so far stay on disk if the loop is interrupted.\n",
  "pd.DataFrame(columns=RAND_COLUMNS).to_csv(RAND_CSV, index=False)\n",
  "\n",
  "# determination of closest distance in random set of disease module and target module\n",
  "for dis_id in significative[\"ID\"]:\n",
  "    genes = functions_network_medicine_schizo.genes_dis(dis_id, dis_gen)\n",
  "    prots = functions_network_medicine_schizo.pro_gen_dict(genes, gen_pro)\n",
  "    pro_ppi = functions_network_medicine_schizo.gen_pro_PPI(prots, pro_pro)\n",
  "    SG_dis = G_ppi.subgraph(pro_ppi)\n",
  "    lcc = functions_network_medicine_schizo.lcc(SG_dis)\n",
  "    # The random target modules computed in the previous cell are reused for every disease\n",
  "    proximity_rand = functions_network_medicine_schizo.proximity_random(targets_total, spl, lcc, G_ppi, N_RANDOM, target_modules_list_random_drugs)\n",
  "\n",
  "    # Rename to the output column names, tag every row with the disease and append\n",
  "    rand_rows = (\n",
  "        proximity_rand\n",
  "        .rename(columns={\"dc_mean\": \"Dc_mean\", \"dc_std\": \"Dc_std\"})\n",
  "        .assign(ID=dis_id)[RAND_COLUMNS]\n",
  "    )\n",
  "    rand_rows.to_csv(RAND_CSV, mode=\"a\", header=False, index=False)\n"
 ] },
 { "cell_type": "code", "execution_count": 16, "id": "c96edf3d-41dc-4837-9893-a9ae1d5fa2f1", "metadata": {}, "outputs": [], 
"source": [
  "# Random-reference distances written by the previous cell\n",
  "rand_prox = pd.read_csv(\"Closest_distance_rand.csv\", sep = \",\")"
 ] },
 { "cell_type": "code", "execution_count": 51, "id": "084b52bd-b94b-4508-b29e-f131c72837ba", "metadata": {}, "outputs": [], "source": [
  "# Observed closest distances\n",
  "distance=pd.read_csv(\"Closest distance.csv\", sep=\",\")"
 ] },
 { "cell_type": "code", "execution_count": 57, "id": "672acf4e-7d8e-421b-bfcb-5d05b4be4fa9", "metadata": {}, "outputs": [], "source": [
  "# Join the observed distances and the random reference on the ID and Drugs columns\n",
  "df_merged = pd.merge(distance, rand_prox, on=['ID', 'Drugs'])\n",
  "\n",
  "# dc_zscore: observed distance standardised by the random mean and std\n",
  "# (rows with Dc_std == 0 yield inf or NaN here rather than raising)\n",
  "df_merged['Dc_zscore'] = (df_merged['Closest distance'] - df_merged['Dc_mean']) / df_merged['Dc_std']\n",
  "\n",
  "# New column indicating if each drug is used for the treatment of the disease\n",
  "df_merged['Treatment'] = df_merged.apply(lambda row: functions_network_medicine_schizo.determine_treatment(row, dis_dru_the), axis=1)\n"
 ] },
 { "cell_type": "code", "execution_count": 62, "id": "92dc074e-4ebc-4466-92a2-9a8893e9a6f7", "metadata": {}, "outputs": [], "source": [
  "# The results/ directory must already exist; to_csv does not create it\n",
  "df_merged.to_csv(\"results/Proximity_results.csv\", index = False)"
 ] },
 { "cell_type": "code", "execution_count": 64, "id": "3586e5aa-4877-4d70-b986-5bb92d881bc0", "metadata": {}, "outputs": [], "source": [
  "# Read back the same file that the previous cell wrote\n",
  "proximity = pd.read_csv(\"results/Proximity_results.csv\", sep = \",\")"
 ] },
 { "cell_type": "markdown", "id": "d0a5d2a3-b632-4874-bed6-67ed6e3e4714", "metadata": {}, "source": [ "### Statistical analysis of the proximity results across all the diseases" ] },
 { "cell_type": "code", "execution_count": 117, "id": "5b03a68c-307e-42b5-8209-a27c44dafb00", "metadata": {}, "outputs": [], "source": [
  "# Filter the data to include only relevant columns\n",
  "df_relevant = proximity[['Closest distance', 'Dc_zscore', 'Treatment']]\n",
  "\n",
  "# Compute descriptive statistics for 'Closest distance' and 'Dc_zscore' grouped by 'Treatment'\n",
  "by_treatment = df_relevant.groupby('Treatment')\n",
  "describe_closest_distance = by_treatment['Closest distance'].describe()\n",
  "describe_dc_zscore = by_treatment['Dc_zscore'].describe()\n"
 ] },
 { "cell_type": "code", "execution_count": 118, "id": "52b62828-5c27-4578-bb46-9a2b5de60258", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Describe de Closest distance:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countmeanstdmin25%50%75%max
Treatment
unknown516993.01.7410930.5909470.01.42.02.04.0
yes12162.01.4737560.6914160.01.01.62.03.0
\n", "
" ], "text/plain": [ " count mean std min 25% 50% 75% max\n", "Treatment \n", "unknown 516993.0 1.741093 0.590947 0.0 1.4 2.0 2.0 4.0\n", "yes 12162.0 1.473756 0.691416 0.0 1.0 1.6 2.0 3.0" ] }, "execution_count": 118, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Print a caption, then display the per-Treatment describe() table of 'Closest distance';\n", "# the bare last expression is what produces the table output of this cell.\n", "print(\"\\nDescribe Closest distance:\")\n", "describe_closest_distance.head()" ] }, { "cell_type": "code", "execution_count": 119, "id": "099c6bab-f16f-4b16-bb17-99a86a298498", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Describe de Dc_zscore:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countmeanstdmin25%50%75%max
Treatment
unknown510797.0-0.1598674.989091-187.636997-0.7090300.1234040.733509164.760255
yes12053.0-1.2954058.074283-163.309391-1.741129-0.4655690.400364104.948135
\n", "
" ], "text/plain": [ " count mean std min 25% 50% \\\n", "Treatment \n", "unknown 510797.0 -0.159867 4.989091 -187.636997 -0.709030 0.123404 \n", "yes 12053.0 -1.295405 8.074283 -163.309391 -1.741129 -0.465569 \n", "\n", " 75% max \n", "Treatment \n", "unknown 0.733509 164.760255 \n", "yes 0.400364 104.948135 " ] }, "execution_count": 119, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Print a caption, then display the per-Treatment describe() table of 'Dc_zscore';\n", "# the bare last expression is what produces the table output of this cell.\n", "print(\"\\nDescribe Dc_zscore:\")\n", "describe_dc_zscore.head()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" } }, "nbformat": 4, "nbformat_minor": 5 }