Commit 2636d29c authored by Maria Marin's avatar Maria Marin

Upload New File

parent 83affd2a
# Analysis
## Repository content
| NAME | DESCRIPTION |
|-----------------------|-----------------------------------------------------------------------------------------|
| [figures]() | Directory with the figures generated as a result of the analysis|
| [files]() | Directory with the intermediate files used during the analysis |
| [results]() | Directory with the files generated as a result of the analysis |
| [schizophrenia]() | Directory with the Jupyter Notebooks and Python scripts employed to study drug repurposing candidates for schizophrenia |
| [total diseases analysis.ipynb]() | Jupyter Notebook used to analyze the disease module size and the proximity disease-drug for 495 diseases as a previous step to obtain the proximity distribution files (in results) that will be used in the analysis carried out for schizophrenia |
<?xml version="1.0" encoding="utf-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg xmlns:xlink="http://www.w3.org/1999/xlink" width="475.2pt" height="360pt" viewBox="0 0 475.2 360" xmlns="http://www.w3.org/2000/svg" version="1.1">
<metadata>
<rdf:RDF xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://creativecommons.org/ns#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<cc:Work>
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage"/>
<dc:date>2024-03-25T13:59:19.102751</dc:date>
<dc:format>image/svg+xml</dc:format>
<dc:creator>
<cc:Agent>
<dc:title>Matplotlib v3.8.0, https://matplotlib.org/</dc:title>
</cc:Agent>
</dc:creator>
</cc:Work>
</rdf:RDF>
</metadata>
<defs>
<style type="text/css">*{stroke-linejoin: round; stroke-linecap: butt}</style>
</defs>
<g id="figure_1">
<g id="patch_1">
<path d="M 0 360
L 475.2 360
L 475.2 0
L 0 0
z
" style="fill: #ffffff"/>
</g>
</g>
</svg>
Drugs,Gene symbol,log(Fold Change),Corrected pvalue
CHEMBL413,FGF2,0.262194915301407,0.0001663143107123
CHEMBL629,NTRK2,0.215842149854464,0.0005334841403452
CHEMBL717,ABCB1,-0.19259430589577,0.0011518294091511
CHEMBL472,ABCA1,0.18596862999644,8.3409820679652e-05
CHEMBL608,ABCA1,0.18596862999644,8.3409820679652e-05
CHEMBL86304,MAOA,0.141364018163515,0.0007876002881378
CHEMBL1201168,MAOA,0.141364018163515,0.0007876002881378
CHEMBL1574,MAOA,0.141364018163515,0.0007876002881378
CHEMBL1201201,MAOA,0.141364018163515,0.0007876002881378
CHEMBL972,MAOA,0.141364018163515,0.0007876002881378
CHEMBL278819,MAOA,0.141364018163515,0.0007876002881378
CHEMBL1089,MAOA,0.141364018163515,0.0007876002881378
CHEMBL673,MAOA,0.141364018163515,0.0007876002881378
CHEMBL750,MAOA,0.141364018163515,0.0007876002881378
CHEMBL37744,MAOA,0.141364018163515,0.0007876002881378
CHEMBL941,DDR1,0.124057530881635,0.0079332374104671
CHEMBL607,GRIN2C,0.122802787266293,0.0057675281998525
CHEMBL930,GLUL,0.105690036807368,0.0101258664300799
CHEMBL90593,PPARA,0.0995849269559933,0.0094860300493365
CHEMBL264374,PPARA,0.0995849269559933,0.0094860300493365
CHEMBL457,PPARA,0.0995849269559933,0.0094860300493365
CHEMBL1297,PPARA,0.0995849269559933,0.0094860300493365
CHEMBL23588,PPARA,0.0995849269559933,0.0094860300493365
CHEMBL633,PPARA,0.0995849269559933,0.0094860300493365
CHEMBL557555,PPARA,0.0995849269559933,0.0094860300493365
CHEMBL565,PPARA,0.0995849269559933,0.0094860300493365
CHEMBL521,PPARA,0.0995849269559933,0.0094860300493365
CHEMBL6,PPARA,0.0995849269559933,0.0094860300493365
CHEMBL672,PPARA,0.0995849269559933,0.0094860300493365
CHEMBL3137309,BCL2,0.0931726845904655,0.0138753870077204
CHEMBL521,BCL2,0.0931726845904655,0.0138753870077204
CHEMBL428647,BCL2,0.0931726845904655,0.0138753870077204
CHEMBL887,BCL2,0.0931726845904655,0.0138753870077204
CHEMBL1088977,CBS,0.0802567480856852,0.0111248339342028
CHEMBL1286,SV2A,-0.0753815769457941,0.0050610575822867
CHEMBL607400,SV2A,-0.0753815769457941,0.0050610575822867
CHEMBL113,PIK3CB,-0.0654377622012529,0.0380665474160753
CHEMBL1200733,ATP2A2,-0.0573001962598098,0.0164116692300517
CHEMBL701,GABBR2,-0.055808229356042,0.0347092303857617
CHEMBL896,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL1423,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL471,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL479,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL631,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL83,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL12713,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL517,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL43,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL998,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL623,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL2,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL533,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL296419,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL1008,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL473,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL640,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL709,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL462605,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL723,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL1294,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL16,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL71,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL532,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL6966,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL652,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL184412,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL1107,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL157101,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL611,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL11,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL1628227,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL17157,KCNH2,-0.0508555360118699,0.0233144571011437
CHEMBL41,KCNH2,-0.0508555360118699,0.0233144571011437
This source diff could not be displayed because it is too large. You can view the blob instead.
Drugs,Closest distance,Dc_zscore
CHEMBL623,0.3,-115.57962872808126
CHEMBL113,0.3636363636363636,-103.5128460079722
CHEMBL941,0.5,-94.87934739957429
CHEMBL1628227,0.3529411764705882,-90.84425264943393
CHEMBL17157,0.4285714285714285,-87.50885056097165
CHEMBL83,0.4285714285714285,-85.95038070535077
CHEMBL930,0.5,-78.56768686211016
CHEMBL23588,0.5,-77.73657144185727
CHEMBL640,0.3333333333333333,-75.75625189325385
CHEMBL709,0.5,-74.36576498963095
CHEMBL278819,0.3333333333333333,-65.01494476738038
CHEMBL652,0.5,-51.71753701225944
CHEMBL413,0.3333333333333333,-22.765103095067783
CHEMBL43,0.5,-8.866920486283584
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "eed38b7b",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import pandas as pd\n",
"from Bio import Entrez\n",
"import GEOparse\n",
"import geopandas as gpd\n",
"import os\n",
"import pandas as pd\n",
"from GEOparse import GEOparse\n",
"import gzip\n",
"import shutil\n",
"import funciones_network_medicine\n",
"import networkx as nx"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "947d7a7a-7b53-4c14-8c9f-7d25ccfecdca",
"metadata": {},
"outputs": [],
"source": [
"#nodes\n",
"pro = pd.read_csv('../data/nodes/pro.tsv', sep=\"\\t\")\n",
"gen = pd.read_csv('../data/nodes/gen.tsv', sep=\"\\t\")\n",
"dru = pd.read_csv('../data/nodes/dru.tsv', sep=\"\\t\")\n",
"dis = pd.read_csv('../data/nodes/dis.tsv', sep=\"\\t\")\n",
"#links\n",
"pro_pro = pd.read_csv('../data/links/pro_pro.tsv', sep=\"\\t\")\n",
"dis_gen = pd.read_csv('../data/links/dis_gen.tsv', sep=\"\\t\")\n",
"dse_sym = pd.read_csv('data/dse_sym_limpio.tsv', sep=\"\\t\")\n",
"dis_dru_the = pd.read_csv('../data/links/dis_dru_the.tsv', sep=\"\\t\")\n",
"gen_pro = pd.read_csv('../data/links/gen_pro.tsv', sep=\"\\t\")\n",
"dru_pro = pd.read_csv('../data/links/dru_pro.tsv', sep=\"\\t\")"
]
},
{
"cell_type": "markdown",
"id": "f939592a-7c9c-4ec4-b20a-03c07439a56b",
"metadata": {
"tags": []
},
"source": [
"### Interactome"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "37a4ff6e-e3e6-4b0f-ad8d-d3aef695027d",
"metadata": {},
"outputs": [],
"source": [
"G_ppi = nx.from_pandas_edgelist(pro_pro,'prA','prB')"
]
},
{
"cell_type": "markdown",
"id": "94600860-ca3b-4f30-a2fe-530e719426d9",
"metadata": {},
"source": [
"#### List of genes in LCC"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "5f2370f1-9d66-43e2-9b25-88c32e75a3ac",
"metadata": {},
"outputs": [],
"source": [
"gen_schizo = funciones_network_medicine.genes_enf(\"C0036341\", dis_gen)\n",
"dict_schizo= funciones_network_medicine.pro_gen_dict(gen_schizo, gen_pro)\n",
"dict_schizo_PPI = funciones_network_medicine.gen_pro_PPI(dict_schizo, pro_pro)\n",
"SG_schizo= funciones_network_medicine.SG(dict_schizo_PPI, G_ppi)\n",
"lcc_schizo= funciones_network_medicine.lcc(SG_schizo)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "4c71bf7a-847f-4b63-919c-4d81207d61cd",
"metadata": {},
"outputs": [],
"source": [
"list_num = []\n",
"for pro in lcc_schizo:\n",
" for i,pro2 in enumerate(gen_pro[\"pro\"]):\n",
" if pro == pro2:\n",
" list_num.append(gen_pro[\"gen\"][i])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "8a56ffb7-76fa-451f-9849-ca96309c8fec",
"metadata": {},
"outputs": [],
"source": [
"list_genes = []\n",
"for num_gen in set(list_num):\n",
" for i,num_gen2 in enumerate(gen[\"id\"]):\n",
" if num_gen == num_gen2:\n",
" gen_names = gen[\"name\"][i]\n",
" list_genes.append(gen_names) "
]
},
{
"cell_type": "markdown",
"id": "67209750-1411-45a3-9ad9-5a92f8521c90",
"metadata": {},
"source": [
"#### Microarray GE analysis"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "dd844f76-955b-4708-9b54-a6617044cfe1",
"metadata": {},
"outputs": [],
"source": [
"dge = pd.read_csv('../files/Microarray_SCZ_metaanalysis_092017.csv', sep=\",\") #Gandal et al. 2018"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "ce169023-6da3-4e67-8c4e-a38c5e31a60b",
"metadata": {},
"outputs": [],
"source": [
"# Genes from the Gandal study that are found in the schizophrenia module\n",
"\n",
"filtered_dge = dge[dge[\"symbol\"].isin(list_genes)]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "6be40bf1-e2b7-4470-8196-34620b7147f8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Unnamed: 0 beta SE p fdr symbol\n",
"7 ENSG00000001084 0.038813 0.029858 0.195025 0.405474 GCLC\n",
"23 ENSG00000002822 -0.051322 0.019815 0.010259 0.058147 MAD1L1\n",
"24 ENSG00000002834 -0.000855 0.014219 0.952121 0.976082 LASP1\n",
"110 ENSG00000006128 -0.200785 0.047132 0.000031 0.000998 TAC1\n",
"126 ENSG00000006611 0.071682 0.030708 0.020512 0.093734 USH1C\n",
"... ... ... ... ... ... ...\n",
"12208 ENSG00000256269 -0.047590 0.016506 0.004341 0.032269 HMBS\n",
"12222 ENSG00000257017 -0.022547 0.024117 0.350880 0.577942 HP\n",
"12251 ENSG00000259207 0.002088 0.020028 0.917063 0.960690 ITGB3\n",
"12298 ENSG00000262683 -0.008338 0.019593 0.670838 0.824663 FHIT\n",
"12386 ENSG00000273079 0.019514 0.026687 0.465453 0.676388 GRIN2B\n",
"\n",
"[769 rows x 6 columns]\n"
]
}
],
"source": [
"print(filtered_dge)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "89f27194-032e-4f1e-b6d1-1f0a2b4c5abc",
"metadata": {},
"outputs": [],
"source": [
"# Genes with significantly higher or lower expression in schizophrenia patients.\n",
"\n",
"fdr_filtered_dge = filtered_dge[filtered_dge[\"fdr\"] <= 0.05] #pvalue fixed with False Discovery Rate"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "7e59d5d1-7a75-4d8a-b447-49ea639b3db3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Unnamed: 0 beta SE p fdr symbol\n",
"110 ENSG00000006128 -0.200785 0.047132 3.066855e-05 0.000998 TAC1\n",
"148 ENSG00000007168 -0.061780 0.021111 3.801441e-03 0.029186 PAFAH1B1\n",
"161 ENSG00000007372 0.174375 0.040167 2.189604e-05 0.000770 PAX6\n",
"219 ENSG00000010256 -0.096030 0.019028 9.619559e-07 0.000083 UQCRC1\n",
"260 ENSG00000011405 0.102244 0.036488 5.545087e-03 0.038023 PIK3C2A\n",
"... ... ... ... ... ... ...\n",
"11459 ENSG00000204525 -0.110543 0.034336 1.485081e-03 0.014967 HLA-C\n",
"11471 ENSG00000204580 0.124058 0.035651 6.085740e-04 0.007933 DDR1\n",
"11678 ENSG00000214113 -0.057975 0.021102 6.522134e-03 0.042600 LYRM4\n",
"11906 ENSG00000231925 -0.053500 0.016371 1.262639e-03 0.013331 TAPBP\n",
"12208 ENSG00000256269 -0.047590 0.016506 4.341083e-03 0.032269 HMBS\n",
"\n",
"[178 rows x 6 columns]\n"
]
}
],
"source": [
"print(fdr_filtered_dge)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "8bc007a4-2597-4f4a-a542-cced89d231b0",
"metadata": {},
"outputs": [],
"source": [
"#sort based on logFC (in absolute value)\n",
"dge_sorted = fdr_filtered_dge.reindex(fdr_filtered_dge['beta'].abs().sort_values(ascending=False).index)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "45f91300-2b2c-46bc-99f9-78e31ce0a5b3",
"metadata": {},
"outputs": [],
"source": [
"dge_sorted.to_csv(\"dge_sorted.csv\", index = False)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "4984712e-1730-4e6c-a063-cbdf29092f8b",
"metadata": {},
"outputs": [],
"source": [
"dge_sorted = pd.read_csv(\"../files/dge_sorted.csv\", sep=\",\")"
]
},
{
"cell_type": "markdown",
"id": "b91a30b7-0708-4a06-8c32-b3faf08046c6",
"metadata": {},
"source": [
"### Drug targets"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "f86487dd-724d-4c97-95a8-962a5236ca2e",
"metadata": {},
"outputs": [],
"source": [
"#DataFrame with drugs and their targets:\n",
"total_drug_list = set()\n",
"for drug in dru_pro[\"dru\"]:\n",
" if drug in set(dis_dru_the[\"dru\"].values): # drugs for which we have information about what diseases they treat\n",
" list_drugs_total.add(drug)\n",
"targets_total = functions_network_medicine.targets(drug_list_total, dru_pro)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "a99abc85-90d9-481f-ab68-3c7687868ecb",
"metadata": {},
"outputs": [],
"source": [
"# DataFrame with drugs that have at least one target protein in the LCC with significant differential expression.\n",
"\n",
"gen_dict_ = dict(zip(gen['name'], gen['id']))\n",
"\n",
"drugs = []\n",
"genes = []\n",
"pvalues = []\n",
"logfc = []\n",
"\n",
"for i, gene_symbol in enumerate(dge_sorted[\"symbol\"]):\n",
" gene_id = gen_dict_.get(gene_symbol)\n",
" proteins = []\n",
" for j, gene2 in enumerate(gen_pro[\"gen\"]):\n",
" if gene_id == gene2:\n",
" proteins.append(gen_pro[\"pro\"][j])\n",
" for z, drug in enumerate(targets_total[\"Fármacos\"]):\n",
" target_list = targets_total[\"Dianas\"][z]\n",
" for protein in proteins:\n",
" if protein in target_list:\n",
" drugs.append(drug)\n",
" genes.append(gene_symbol) \n",
" pvalues.append(dge_sorted[\"fdr\"][i])\n",
" logfc.append(dge_sorted[\"beta\"][i])\n",
"\n",
"results = {\n",
" \"Drugs\": drugs,\n",
" \"Gene symbol\": genes,\n",
" \"log(Fold Change)\": logfc,\n",
" \"Corrected pvalue\": pvalues,\n",
"}\n",
"\n",
"dge_drugs = pd.DataFrame(results)\n"
]
},
{
"cell_type": "markdown",
"id": "69dd8106-15d9-44d9-86f6-54511a38c8d8",
"metadata": {},
"source": [
"### Assessment of module membership of co-expressed genes"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "401ee7e6-7bf7-41a8-813a-227976418d08",
"metadata": {},
"outputs": [],
"source": [
"kme = pd.read_csv(\"../files/gandal_2018a_kMEs.csv\", sep = \",\")"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "e91afef6-7790-44a0-a71d-026f858a9604",
"metadata": {},
"outputs": [],
"source": [
"dge_genes = []\n",
"\n",
"for symbol in dge_drugs[\"Gene symbol\"]:\n",
" for i,symbol2 in enumerate(kme[\"external_gene_id\"]):\n",
" if symbol == symbol2:\n",
" module = kme[\"Module.name\"][i]\n",
" if module.startswith(\"CD\"):\n",
" j = int(module[2:]) # Extrae el número del módulo (por ejemplo, 1, 2, 3, ...)\n",
" if j != 0:\n",
" kme_value = kme.iloc[i, j + 1]\n",
" if kme_value > '0,5':\n",
" dge_genes.append(symbol)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "196811f9-dc94-4722-b67a-b261aa561c5e",
"metadata": {},
"outputs": [],
"source": [
"kme_filtrado = dge_drugs[dge_drugs['Gene symbol'].isin(set(dge_genes))]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "6cc25eae-272d-4992-a493-d97b43fdd42c",
"metadata": {},
"outputs": [],
"source": [
"kme_filtrado.to_csv(\"../results/Filtering_by_DGE_results.csv\", index = False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
# Schizophrenia
Python scripts and Jupyter Notebooks used to apply Network Medicine concepts
in order to characterize the disease module of schizophrenia and determine the proximity between the neurological condition and drugs.
## Repository content
| NAME | DESCRIPTION |
|-----------------------|-----------------------------------------------------------------------------------------|
| [DGE.ipynb]() | Jupyter Notebook used to analyze the Differential Gene Expression (DGE) data |
| [disease module and proximity.ipynb]() | Jupyter Notebook used to characterize the disease module of schizophrenia and determine the closest distance and proximity between the neurological condition and drugs |
| [repurposing.ipynb]() | Jupyter Notebook used to identify drug repurposing candidates for schizophrenia based on the results obtained in the differential gene expression, distance and proximity analyses |
| [functions_network_medicine_schizo.py]() | Python script with the functions implemented in the Jupyter Notebooks |
## Methodology of the analysis
### Characterization of the disease module
1. Generation of the **interactome**.
2. Definition of **pathological proteins**.
3. Development of the **subgraph** of the disease.
4. Identification of the **module** of the disease.
5. **Statistical validation** of the disease modules.
### Determination of disease-drug proximity
1. **Distance** between disease modules and drugs: closest distance *\(d<sub>c</sub>\)*.
2. **Proximity** between disease modules and drugs: distance z-score.
### Differential Gene Expression analysis
Identification of genes that may be potential therapeutic targets depending on whether they meet the following criteria:
1. Differentially expressed in patients with schizophrenia
2. Significantly correlated with the genes belonging to its co-expression module in different psychiatric and neurological diseases (PNDs).
3. Part of disease module.
Data regarding the DGE and the co-expression modules were both obtained from the supplementary material of Gandal et al. (2018).
## Criteria for identifying drug repurposing candidates:
1. Distance to schizophrenia module between Q1 and median of distance values across treatment drugs.
2. Proximal to disease module (z-score of distance less than or equal to -0.15).
3. Targeting the potential therapeutic targets identified previously.
This source diff could not be displayed because it is too large. You can view the blob instead.
#! /usr/bin/env python
"""
# ---------------------------------------------------------------------------
#
# functions_network_medicine_schizo.py
# File with all the functions that I have used to calculate the distance
# and the proximity between diseases and drugs. Additionally, it includes the
# functions used to obtain the disease module and to study its
# significance prior to determining distance and proximity.
#
# Exploring Drug Repurposing Opportunities for Schizophrenia: A Network Medicine Approach
#
# María Marín Tercero
# ----------------------------------------------------------------------------
"""
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
from tabulate import tabulate
from networkx.algorithms import bipartite
import random
from scipy.stats import norm
from itertools import combinations
import re
from itertools import product
# =================================================================================
def genes_dis(enf, file):
    """
    Return the genes associated with the disease "enf" in the dis_gen file.

    Parameters:
        enf: disease identifier to look for in the "dis" column.
        file: table exposing a "dis" column and a "gen" column of equal length
            (e.g. a pandas DataFrame).

    Returns:
        A list with the "gen" value of every row whose "dis" equals enf, in row order.
        The list is empty when the disease does not appear.
    """
    # The two columns are paired positionally. Looking the gene up with
    # file["gen"][i] is a label-based access on a DataFrame, so feeding it the
    # enumerate() counter fails (or picks the wrong row) whenever the index is
    # not the default 0..n-1, e.g. after filtering rows.
    return [gen for dis, gen in zip(file["dis"], file["gen"]) if dis == enf]
# =================================================================================
def pro_gen_dict(gene_list, file):
    """
    Build a protein -> gene dictionary for the genes associated with a disease.

    Parameters:
        gene_list: iterable with the (hashable) gene ids associated with the disease.
        file: gen_pro table exposing a "gen" column and a "pro" column of equal
            length (e.g. a pandas DataFrame).

    Returns:
        dict with:
            key: protein associated with a gene of gene_list in the gen_pro file
            value: gene related to the key protein in the gen_pro file
        If a protein appears in several matching rows, the last row wins.
    """
    # A set gives constant-time membership tests instead of scanning the list per row.
    wanted_genes = set(gene_list)
    # Pair the columns positionally: file["pro"][i] would be a label lookup on a
    # DataFrame and breaks when the index is not the default 0..n-1.
    return {
        prot: gen
        for gen, prot in zip(file["gen"], file["pro"])
        if gen in wanted_genes
    }
# =================================================================================
def gen_pro_PPI(dict1, file):
    """
    Keep only the prot:gen relationships whose protein appears in the PPI network.

    Parameters:
        dict1: dictionary with proteins as keys and their related genes as values.
        file: pro_pro table exposing the "prA" and "prB" columns with the two
            endpoints of every interaction (e.g. a pandas DataFrame).

    Returns:
        dict with:
            key: proteins of dict1 appearing in "prA" or "prB"
            value: genes related to the key protein
        The entries keep the order they have in dict1.
    """
    # Collect the PPI proteins once; rebuilding and scanning both column lists
    # for every protein of dict1 made the filter quadratic.
    ppi_proteins = set(file["prA"]) | set(file["prB"])
    return {prot: gen for prot, gen in dict1.items() if prot in ppi_proteins}
# =================================================================================
def SG(dic, PPI):
    """
    Build the subgraph of the PPI network induced by the proteins of a disease.

    Input data:
        dic: dictionary whose keys are the proteins of the disease (the values,
            the associated genes, are not read here).
        PPI: PPI network.

    Returns the result of nx.subgraph restricted to the keys of dic.
    """
    disease_proteins = dic.keys()
    return nx.subgraph(PPI, disease_proteins)
# =================================================================================
def lcc(SG):
    """
    Return the largest connected component (LCC) of a disease subgraph.

    SG is the subgraph formed only with the proteins associated with the disease.
    The result is the connected component with the most nodes, as yielded by
    nx.connected_components.
    """
    # The size of the LCC is what the analysis is after: every protein in it was
    # taken from the prot:gen dictionary, so counting proteins in the LCC is how
    # the number of disease genes in the module is obtained.
    components = nx.connected_components(SG)
    largest_component = max(components, key=len)
    return largest_component
# =================================================================================
def nodes_by_degree(G):
    """
    Group the nodes of a network by their degree.

    Returns a dictionary whose keys are the degrees found in G and whose values
    are the lists of nodes having that degree, in the order G.nodes() yields them.
    """
    grouped = {}
    for vertex in G.nodes():
        # setdefault creates the bucket the first time a degree is seen
        grouped.setdefault(G.degree(vertex), []).append(vertex)
    return grouped
# =================================================================================
def lcc_simulation(G, lcc, PPI, dp=False, n_sim=1000):
    """
    Compare the observed LCC of a disease with the LCCs of random node sets of the PPI.

    Input data:
        G: disease module subgraph.
        lcc: observed LCC of the disease (only its length is used).
        PPI: PPI network.
        dp: null model to use.
            - dp = False (non-degree preserving, default): every random set is a
              uniform sample of PPI nodes with as many nodes as G has in the PPI.
            - dp = True (degree preserving): for each node of G, a PPI node with
              the same degree in the PPI is drawn, never repeating a node within
              the same random set.
        n_sim: number of random node sets to generate (1000 by default).

    Returns:
        mean: mean LCC size over the random sets.
        std: standard deviation of those LCC sizes.
        zscore: (len(lcc) - mean) / std.
        random_list: list with the LCC size of every random set.
    """
    # Preliminary calculations:
    ppi_nodes = PPI.nodes()       # total proteins from the PPI
    disease_nodes = G.nodes()     # total proteins in the disease module
    # Number of proteins of the disease module that are in the PPI
    num_disease_nodes = len(disease_nodes & set(ppi_nodes))
    # PPI nodes grouped by their degree (used by the degree-preserving model)
    degree_grouped_nodes = nodes_by_degree(PPI)
    # Sampling pool for the non-degree-preserving model; it does not change
    # between iterations, so it is built once.
    ppi_node_pool = list(ppi_nodes)

    random_list = []
    for _ in range(n_sim):
        if dp:
            # Degree preserving: swap every disease node for a PPI node of equal degree
            random_nodes = set()
            for node in disease_nodes:
                available_nodes = degree_grouped_nodes[PPI.degree(node)]
                chosen_node = random.choice(available_nodes)
                # Draw again until the node has not been sampled yet in this set
                while chosen_node in random_nodes:
                    chosen_node = random.choice(available_nodes)
                random_nodes.add(chosen_node)
        else:
            # Non-degree preserving: uniform sample of PPI nodes
            random_nodes = set(random.sample(ppi_node_pool, num_disease_nodes))

        # Subgraph of the PPI with the selected random nodes, copied into a plain
        # Graph (removes parallel edges) so that self-loops can be deleted from it
        r = nx.Graph(nx.subgraph(PPI, random_nodes))
        r.remove_edges_from(nx.selfloop_edges(r))
        random_list.append(len(max(nx.connected_components(r), key=len)))

    mean = np.mean(random_list)   # Mean
    std = np.std(random_list)     # Standard deviation
    zscore = (len(lcc) - mean)/std
    return mean, std, zscore, random_list
# =================================================================================
def degrees_list(G):
    """
    Return two lists for the network G: its nodes and the degrees reported by G.degree().

    The result is the tuple (nodes, degrees).
    """
    node_names = list(G.nodes())
    degree_by_node = dict(G.degree())
    return node_names, list(degree_by_node.values())
# =================================================================================
def _draw_lcc_histogram(ax, lcc_sizes, observed_size, disease_name, title):
    """Draw one LCC-size histogram with a dashed line at the observed LCC size."""
    ax.hist(lcc_sizes, color = '#79C4FF', bins=20, edgecolor='black')
    ax.set_xlabel('Nº of nodes in LCC', fontsize=12)
    ax.set_ylabel('Nº of random networks', fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.axvline(x=observed_size, color='#FF7A7A', linestyle='--')
    ax.legend(["LCC "+str(disease_name)], loc='upper right')


def rep(disease_name, G, PPI, ndp_list, dp_list, LCC, mean_ndp, std_ndp, zscore_ndp, mean_dp, std_dp, zscore_dp):
    """
    Input data:
    1. disease_name: disease name (used in the legends and in the general title)
    2. G: subgraph of the disease (kept for backward compatibility, not used for plotting)
    3. PPI: PPI network (kept for backward compatibility, not used for plotting)
    4. ndp_list: list of LCCs (ndp)
    5. dp_list: list of LCCs (dp)
    6. LCC: Observed LCC of the disease (only its length is used)
    7. mean_ndp: mean of the list of LCCs (ndp)
    8. std_ndp: standard deviation of the list of LCCs (ndp)
    9. zscore_ndp: z-score of the list of LCCs (ndp)
    10. mean_dp: mean of the list of LCCs (dp)
    11. std_dp: standard deviation of the list of LCCs (dp)
    12. zscore_dp: z-score of the list of LCCs (dp)
    This function performs a representation of:
    1. Distribution of LCCs (ndp)
    2. Distribution of LCCs (dp)
    and shows the figure; it returns nothing.
    """
    # The degree tables of G and PPI that were built here were never plotted or
    # returned, so they are no longer computed.
    observed_size = len(LCC)
    # Create a figure with 2 subplots, one per null model
    fig, axs = plt.subplots(1, 2, figsize=(15, 4))
    # Representation of the distribution of LCC ndp
    _draw_lcc_histogram(axs[0], ndp_list, observed_size, disease_name,
                        'LCC distribution for 1,000 random networks (ndp)')
    # Representation of the distribution of LCC dp
    _draw_lcc_histogram(axs[1], dp_list, observed_size, disease_name,
                        'LCC distribution for 1,000 random networks (dp)')
    # Adjust the layout and space between plots
    plt.tight_layout()
    # Legend 1: observed LCC against the ndp statistics
    legend_text_1 = r'$LCC (ndp)_{\mathrm{obs}}$' + ' (' + r'$\mathrm{mean\ LCC} - \mathrm{STD}$, z-score' + ')'
    legend_value_1 = f'{len(LCC)} ({round(mean_ndp,2)} - {round(std_ndp,2)}, {round(zscore_ndp,2)})'
    fig.text(0.51, -0.05, legend_text_1+str(" = ")+legend_value_1, fontsize=12, ha='center', va='center', bbox=dict(facecolor='white', alpha=0.5, boxstyle='round'))
    # Legend 2: observed LCC against the dp statistics
    legend_text_2 = r'$LCC (dp)_{\mathrm{obs}}$' + ' (' + r'$\mathrm{mean\ LCC} - \mathrm{STD}$, z-score' + ')'
    legend_value_2 = f'{len(LCC)} ({round(mean_dp,2)} - {round(std_dp,2)}, {round(zscore_dp,2)})'
    fig.text(0.51, -0.15, legend_text_2+str(" = ")+legend_value_2, fontsize=12, ha='center', va='center', bbox=dict(facecolor='white', alpha=0.5, boxstyle='round'))
    # General title above the two plots
    plt.suptitle(str(disease_name), fontsize=20, y=1.1, fontweight='bold', style='italic')
    # Show the figure
    plt.show()
# =================================================================================
def targets(drug_list, arch):
    """
    Build a DataFrame with every drug of drug_list and the list of its targets.

    Input data:
        drug_list: iterable of (hashable) drug ids, e.g. the drugs treating a disease.
        arch: drug-target table exposing a "dru" column and a "pro" column of
            equal length (e.g. a pandas DataFrame).

    Returns:
        DataFrame with the drugs in the "Drugs" column and the list of their
        targets in the "Targets" column. Drugs without any row in arch are left
        out; the rows follow the iteration order of drug_list and the targets
        follow the row order of arch.
    """
    # Group the targets by drug in a single pass over arch. The columns are
    # paired positionally: arch["pro"][i] would be a label lookup on a DataFrame
    # and breaks when the index is not the default 0..n-1.
    targets_by_drug = {}
    for drug, target in zip(arch["dru"], arch["pro"]):
        targets_by_drug.setdefault(drug, []).append(target)

    drugs = []          # drugs of drug_list found in arch
    targets_total = []  # list of targets of each of those drugs
    for drug in drug_list:
        drug_targets = targets_by_drug.get(drug)
        if drug_targets:
            drugs.append(drug)
            # Copy so that a drug repeated in drug_list does not share one list object
            targets_total.append(list(drug_targets))

    return pd.DataFrame({"Drugs": drugs, "Targets": targets_total})
# =================================================================================
def calculate_dc_drug(target_list, dist_matrix, disease_module_proteins, PPI):
    """
    Closest measure (dc) between one drug and one disease module.

    Parameters
    ----------
    target_list : iterable
        Targets of the drug.
    dist_matrix : pandas.DataFrame
        Shortest path lengths between PPI nodes; rows are selected by drug
        target and columns by disease-module protein.
    disease_module_proteins : iterable
        Proteins of the disease module.
    PPI : graph-like
        Object whose `nodes()` lists the proteins of the interactome.

    Returns
    -------
    float or int
        NaN when every selected distance is NaN, 0 when every drug target that
        is in the PPI is also in the disease module, otherwise the mean over
        targets of the minimum distance to the module.
    """
    # Only targets that are nodes of the interactome take part in the measure
    ppi_targets = set(target_list) & set(PPI.nodes())
    # Subset of those targets that belong to the disease module itself
    module_targets = ppi_targets & set(disease_module_proteins)

    # Target x module-protein block of the distance matrix
    spl = dist_matrix.loc[list(ppi_targets), list(disease_module_proteins)].values
    missing = np.isnan(spl)

    if missing.all():
        # No recorded path between any target and the module
        return np.nan
    if len(module_targets) == len(ppi_targets):
        # Every target sits inside the module
        return 0

    # Drop targets without any path to the module, then average the
    # per-target minimum distance
    reachable = ~missing.all(axis=1)
    return np.nanmean(np.nanmin(spl[reachable], axis=1))
# =================================================================================
def proximity(df, arch_dist, prots_enf, PPI):
    """
    Compute the closest measure (dc) to one disease for every drug in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a "Drugs" column and a "Targets" column holding, per row,
        the list of targets of that drug.
    arch_dist : pandas.DataFrame
        Distances between the nodes of the network; forwarded to
        `calculate_dc_drug`.
    prots_enf : iterable
        Proteins related to the disease; forwarded to `calculate_dc_drug`.
    PPI : graph-like
        PPI network; forwarded to `calculate_dc_drug`.

    Returns
    -------
    pandas.DataFrame
        Columns "Drugs" (same order as `df`) and "dc" (the value returned by
        `calculate_dc_drug` for the targets in the same row).
    """
    # Iterate the target lists by position: looking them up with
    # df["Targets"][i] is a label lookup and fails on a filtered/re-indexed df.
    dc_total = [
        calculate_dc_drug(drug_targets, arch_dist, prots_enf, PPI)
        for drug_targets in df["Targets"]
    ]
    return pd.DataFrame({'Drugs': list(df["Drugs"]), 'dc': dc_total})
# =================================================================================
def proximity_random(df, arch_dist, prots_enf, PPI, num_iterations, df_results):
    """
    Mean and standard deviation of the closest measure (dc) under random
    disease modules and random target modules.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a "Drugs" column (only this column is read here).
    arch_dist : pandas.DataFrame
        Distances between the nodes of the network; forwarded to
        `calculate_dc_drug`.
    prots_enf : iterable
        Proteins of the disease module.
    PPI : graph-like
        PPI network; `PPI.degree(node)` is used to pick same-degree nodes.
    num_iterations : int
        Number of random repetitions.
    df_results : pandas.DataFrame
        Pre-computed random target modules, read positionally: row j is the
        j-th drug of `df`, column i is iteration i.

    Returns
    -------
    pandas.DataFrame
        Columns "Drugs", "dc_mean" and "dc_std" (NaN-ignoring mean and standard
        deviation over the iterations; NaN when every iteration is NaN).
    """
    drugs = list(df["Drugs"])
    # Presumably a mapping degree -> sequence of PPI nodes with that degree;
    # it is indexed by degree and the result is passed to np.random.choice.
    group_nodes_degree = nodes_by_degree(PPI)

    # The pool of same-degree candidates of each disease protein does not
    # change between iterations, so look it up once.
    candidate_pools = [group_nodes_degree[PPI.degree(prot)] for prot in prots_enf]

    # Float matrix (drugs x iterations) so that missing values are real NaNs
    # and can be detected with np.isnan instead of an identity check.
    proximity_matrix = np.full((len(drugs), num_iterations), np.nan, dtype=float)

    for i in range(num_iterations):
        # Random disease module: one same-degree node drawn per disease protein.
        # NOTE(review): draws are independent and collected in a set, so
        # repeated picks collapse and the random module can be smaller than
        # the disease module - confirm this is intended.
        random_prots = {np.random.choice(pool) for pool in candidate_pools}

        for j in range(len(drugs)):
            # Random target module prepared for drug j in iteration i
            random_target = df_results.iloc[j, i]
            proximity_matrix[j, i] = calculate_dc_drug(random_target, arch_dist, random_prots, PPI)

    drug_mean_proximity = []  # mean dc over the iterations, per drug
    deviation = []            # standard deviation of dc over the iterations, per drug
    for row in proximity_matrix:
        if np.isnan(row).all():
            # No iteration produced a value for this drug
            drug_mean_proximity.append(np.nan)
            deviation.append(np.nan)
        else:
            drug_mean_proximity.append(np.nanmean(row))
            deviation.append(np.nanstd(row))

    data = {'Drugs': drugs, 'dc_mean': drug_mean_proximity, 'dc_std': deviation}
    return pd.DataFrame(data)
# =================================================================================
def calculate_random_drug_target_modules(num_iterations, df, PPI):
    """
    Draw, for every drug, `num_iterations` random target modules made of PPI
    nodes with the same degree as the drug's real targets.

    Parameters
    ----------
    num_iterations : int
        Number of random modules to draw per drug.
    df : pandas.DataFrame
        Table with a "Drugs" column and a "Targets" column holding, per row,
        the list of targets of that drug.
    PPI : graph-like
        PPI network; `PPI.nodes()` and `PPI.degree(node)` are used.

    Returns
    -------
    pandas.DataFrame
        One column per iteration (0..num_iterations-1) and one row per drug;
        each cell is the list of randomly drawn nodes for that drug.
    """
    # Same helper the sibling proximity_random uses; it lives in this module,
    # so it is called directly and not through the module's own name.
    # Presumably a mapping degree -> sequence of PPI nodes with that degree.
    group_nodes_by_degree = nodes_by_degree(PPI)
    ppi_nodes = set(PPI.nodes())

    # For each drug, the candidate pool of every target that is in the PPI.
    # Built once (it does not depend on the iteration) and paired by position,
    # so a df with a non-default index is handled correctly.
    pools_by_drug = []
    for drug, target_list in zip(df["Drugs"], df["Targets"]):
        targets_in_ppi = set(target_list) & ppi_nodes
        pools = [group_nodes_by_degree[PPI.degree(target)] for target in targets_in_ppi]
        pools_by_drug.append((drug, pools))

    results = {}
    for i in range(num_iterations):
        # One same-degree node drawn per target, independently for each drug
        results[i] = {
            drug: [np.random.choice(pool) for pool in pools]
            for drug, pools in pools_by_drug
        }

    return pd.DataFrame(results)
# =================================================================================
def determine_treatment(row, dis_dru_the):
    """
    Tell whether the drug of `row` is listed as a treatment of the disease of `row`.

    Parameters
    ----------
    row : mapping
        Record providing the disease under key 'ID' and the drug under key 'Drugs'.
    dis_dru_the : pandas.DataFrame
        Disease-drug links with the disease in column 'dis' and the drug in
        column 'dru'.

    Returns
    -------
    str
        'yes' when at least one row of `dis_dru_the` links this disease and
        this drug, otherwise 'unknown'.
    """
    disease_id, drug_id = row['ID'], row['Drugs']
    same_disease = dis_dru_the['dis'] == disease_id
    same_drug = dis_dru_the['dru'] == drug_id
    if (same_disease & same_drug).any():
        return 'yes'
    return 'unknown'
# =================================================================================
def rep_prox(df_combined, disease_name):
    """
    Show two side-by-side boxplots for one disease: the closest distance and
    its z-score, each split into drugs used as treatment and the remaining
    (unknown) drugs.

    Parameters
    ----------
    df_combined : pandas.DataFrame
        Table with, at least, the columns 'ID' (disease), 'Treatment'
        ('yes' / 'unknown'), 'Closest distance' and 'Dc_zscore'.
    disease_name : str
        Value matched against the 'ID' column to select the rows of the disease.

    Returns
    -------
    None
        The figure is displayed with `plt.show()`.
    """
    # Rows of the requested disease, split by treatment status
    is_disease = df_combined['ID'] == disease_name
    drugs_with_disease = df_combined[is_disease & (df_combined['Treatment'] == 'yes')]
    drugs_without_disease = df_combined[is_disease & (df_combined['Treatment'] == 'unknown')]

    # Relabel both groups for display and stack them into one table
    combined_data = pd.concat([
        drugs_with_disease.assign(Treatment='Treatment'),
        drugs_without_disease.assign(Treatment='Unknown'),
    ])

    palette = {'Treatment': '#FF7A7A', 'Unknown': '#79C4FF'}
    # (column to plot, y-axis label). Raw strings keep the mathtext backslash
    # without relying on an invalid escape sequence.
    panels = [
        ('Closest distance', r'Closest distance ($\mathregular{d_c}$)'),
        ('Dc_zscore', r'Proximity [z-score ($\mathregular{d_c}$)]'),
    ]

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))  # 1 row, 2 columns
    for ax, (column, ylabel) in zip(axes, panels):
        sns.boxplot(x='Treatment', y=column, data=combined_data, hue='Treatment', ax=ax,
                    palette=palette, dodge=False, medianprops=dict(linewidth=2), legend=False)
        ax.set_ylabel(ylabel, fontsize=12)
        ax.set_xlabel('')
        for label in ax.get_xticklabels():
            label.set_fontsize(12)

    plt.tight_layout()  # Avoid overlap between the two panels
    plt.show()
# =================================================================================
\ No newline at end of file
{
"cells": [
{
"cell_type": "code",
"execution_count": 28,
"id": "b987b5e5-3a96-46dc-ad64-08edeb477a6c",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import networkx as nx\n",
"from tabulate import tabulate\n",
"from networkx.algorithms import bipartite\n",
"import random\n",
"from scipy.stats import norm\n",
"from itertools import combinations\n",
"import re\n",
"from itertools import product\n"
]
},
{
"cell_type": "markdown",
"id": "fe4b1e17-fa87-42d9-a28d-e53aebb77943",
"metadata": {},
"source": [
"### Data"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "c5bf61fe-29af-40b6-a85e-f5453c2b4b30",
"metadata": {},
"outputs": [],
"source": [
"proximity = pd.read_csv(\"../results/Proximity_results.csv\", sep = \",\")\n",
"dge = pd.read_csv(\"../results/Filtering_by_DGE_results.csv\", sep = \",\")"
]
},
{
"cell_type": "markdown",
"id": "9a70afe4-73a7-4436-a8d9-14deba773032",
"metadata": {},
"source": [
"### Filtering of drug repurposing candidates"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "06d790a3-6c11-4724-9ed3-0c764083243f",
"metadata": {},
"outputs": [],
"source": [
"# filtering of schizophrenia data\n",
"proximity_schizo = proximity[proximity[\"ID\"] == \"C0036341\"]"
]
},
{
"cell_type": "markdown",
"id": "07ae6a91-08a8-4bab-97c2-b47070875c48",
"metadata": {},
"source": [
"#### 1) Proximal drugs to schizophrenia module"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "493872db-4d45-4ca6-a613-d0adafacd5c7",
"metadata": {},
"outputs": [],
"source": [
"proximal_drugs_schizo = proximity_schizo[(proximity_schizo[\"Dc_zscore\"] <= -0.15) & \n",
" (proximity_schizo[\"Treatment\"] == \"unknown\")]"
]
},
{
"cell_type": "markdown",
"id": "4e3e28c3-f78d-4d20-9745-3d28dad6838a",
"metadata": {},
"source": [
"#### 2) Distance to schizophrenia module"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "1f33226f-a0b8-4d76-be20-f82ca3851d6e",
"metadata": {},
"outputs": [],
"source": [
"# Q1 and median of closest distance for treatment drugs\n",
"\n",
"treatment_schizo = proximity_schizo[proximity_schizo[\"Treatment\"] == \"yes\"]\n",
"Q1_treatment = treatment_schizo[\"Closest distance\"].quantile(0.25)\n",
"median_treatment = treatment_schizo[\"Closest distance\"].quantile(0.5)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "79a8a797-2089-4c60-bcb3-690de1bdbec6",
"metadata": {},
"outputs": [],
"source": [
"# Unknown drugs with distance to schizophrenia module between Q1 and median of treatment drugs\n",
"\n",
"closest_drugs_schizo = proximal_drugs_schizo[(proximal_drugs_schizo[\"Closest distance\"] >= Q1_treatment) &\n",
" (proximal_drugs_schizo[\"Closest distance\"] <= median_treatment)]"
]
},
{
"cell_type": "markdown",
"id": "d6f6a702-06ee-4d94-8982-1ff4659b8dbc",
"metadata": {},
"source": [
"#### 3) Targets with significant DGE in schizophrenia and correlated with its co-expression module"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "0665674b-9fbd-4855-9785-7a93b1907540",
"metadata": {},
"outputs": [],
"source": [
"drugs_in_dge = dge[\"Drugs\"].unique()\n",
"drugs_schizo_filtered = closest_drugs_schizo[(closest_drugs_schizo[\"Drugs\"].isin(drugs_in_dge))]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "418c6fb7-4a32-407e-bb35-9c13804d1dbf",
"metadata": {},
"outputs": [],
"source": [
"proximity_schizo_filtered = drugs_schizo_filtered.sort_values(by=\"Dc_zscore\")"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "c6dbe376-fca9-43c1-9c76-cad14964f703",
"metadata": {},
"outputs": [],
"source": [
"proximity_schizo_filtered = proximity_schizo_filtered.drop('ID', axis=1).drop('Treatment', axis=1).drop('Dc_std', axis=1).drop('Dc_mean', axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "05f3dce7-b0ea-4ce3-b06a-52c52f3bb7a7",
"metadata": {},
"outputs": [],
"source": [
"proximity_schizo_filtered.to_csv(\"../results/Repurposing candidates schizophrenia.csv\", index = False)"
]
},
{
"cell_type": "markdown",
"id": "12a5a03c-76c7-4817-a596-2e89724302d4",
"metadata": {},
"source": [
"### MeSH Pharmacological Actions"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "73e12501-9ea1-4d08-a87a-88a30036d1d8",
"metadata": {},
"outputs": [],
"source": [
"categories = pd.read_csv(\"../files/drug_categories.csv\", sep = \",\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2b7c78db-7596-4759-a5e1-50d9e121abb4",
"metadata": {},
"outputs": [],
"source": [
"merged_drugs = pd.merge(proximity_schizo_filtered, dge[['Drugs', 'Gene symbol']], on='Drugs')\n",
"merged_drugs = pd.merge(merged_drugs, categories[['drug_id', 'class_name', 'type']], left_on='Drugs', right_on='drug_id')\n",
"\n",
"drugs_classification = merged_drugs[merged_drugs['class_name'].str.contains('Agents')]\n",
"\n",
"drugs_classification = drugs_classification[['Drugs', 'Closest distance', 'Dc_zscore', 'Gene symbol', 'class_name', 'type']]\n",
"drugs_classification.columns = ['Drugs', 'Closest distance', 'Proximity', 'Gene target', 'Classification', 'Type']\n"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "21abf2e1-c9df-47ab-86a7-0bcf5523b0ba",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Drugs Closest distance Proximity Gene target \\\n",
"1 CHEMBL623 0.300000 -115.579629 KCNH2 \n",
"8 CHEMBL113 0.363636 -103.512846 PIK3CB \n",
"11 CHEMBL113 0.363636 -103.512846 PIK3CB \n",
"15 CHEMBL941 0.500000 -94.879347 DDR1 \n",
"20 CHEMBL1628227 0.352941 -90.844253 KCNH2 \n",
"28 CHEMBL83 0.428571 -85.950381 KCNH2 \n",
"30 CHEMBL83 0.428571 -85.950381 KCNH2 \n",
"34 CHEMBL23588 0.500000 -77.736571 PPARA \n",
"35 CHEMBL640 0.333333 -75.756252 KCNH2 \n",
"39 CHEMBL709 0.500000 -74.365765 KCNH2 \n",
"43 CHEMBL278819 0.333333 -65.014945 MAOA \n",
"44 CHEMBL652 0.500000 -51.717537 KCNH2 \n",
"47 CHEMBL413 0.333333 -22.765103 FGF2 \n",
"49 CHEMBL413 0.333333 -22.765103 FGF2 \n",
"50 CHEMBL413 0.333333 -22.765103 FGF2 \n",
"53 CHEMBL43 0.500000 -8.866920 KCNH2 \n",
"\n",
" Classification Type \n",
"1 Antidepressive Agents, Second-Generation MESHPA \n",
"8 Anti-Inflammatory Agents, Non-Steroidal MESHPA \n",
"11 Antimutagenic Agents MESHPA \n",
"15 Antineoplastic Agents MESHPA \n",
"20 Antidepressive Agents, Tricyclic MESHPA \n",
"28 Antineoplastic Agents, Hormonal MESHPA \n",
"30 Bone Density Conservation Agents MESHPA \n",
"34 Anti-Inflammatory Agents MESHPA \n",
"35 Anti-Arrhythmia Agents MESHPA \n",
"39 Urological Agents MESHPA \n",
"43 Antidepressive Agents MESHPA \n",
"44 Anti-Arrhythmia Agents MESHPA \n",
"47 Anti-Bacterial Agents MESHPA \n",
"49 Antifungal Agents MESHPA \n",
"50 Immunosuppressive Agents MESHPA \n",
"53 Antineoplastic Agents MESHPA \n"
]
}
],
"source": [
"merged_drugs = pd.merge(proximity_schizo_filtered, dge[['Drugs', 'Gene symbol']], on='Drugs')\n",
"merged_drugs = pd.merge(merged_drugs, categories[['drug_id', 'class_name', 'type']], left_on='Drugs', right_on='drug_id')\n",
"\n",
"drugs_classification = merged_drugs[merged_drugs['class_name'].str.contains('Agents')]\n",
"\n",
"drugs_classification = drugs_classification[['Drugs', 'Closest distance', 'Dc_zscore', 'Gene symbol', 'class_name', 'type']]\n",
"drugs_classification.columns = ['Drugs', 'Closest distance', 'Proximity', 'Gene target', 'Classification', 'Type']\n",
"\n",
"\n",
"print(drugs_classification)"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "447d822e-7e21-45fa-993e-53659fda6fc5",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "e3898d83-c1ed-4c95-88d5-eca1ae6debe6",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "f28f5e21-ca09-42c9-b7c3-1868d37db298",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import networkx as nx\n",
"from tabulate import tabulate\n",
"from networkx.algorithms import bipartite\n",
"import random\n",
"from scipy.stats import norm\n",
"from itertools import combinations\n",
"import re\n",
"from itertools import product\n",
"import sys\n",
"sys.path.append('schizophrenia/functions_network_medicine_schizo.py') # Replace with the actual path to the directory that contains functions_network_medicine_schizo.py\n",
"import functions_network_medicine_schizo"
]
},
{
"cell_type": "markdown",
"id": "99573d30-ec91-4703-8cc2-1597e6ed6fc3",
"metadata": {},
"source": [
"### Data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "1dbaebbd-1ad3-4483-8418-7bf44a41db5e",
"metadata": {},
"outputs": [],
"source": [
"#nodes\n",
"pro = pd.read_csv('../data/nodes/pro.tsv', sep=\"\\t\")\n",
"gen = pd.read_csv('../data/nodes/gen.tsv', sep=\"\\t\")\n",
"dru = pd.read_csv('../data/nodes/dru.tsv', sep=\"\\t\")\n",
"dis = pd.read_csv('../data/nodes/dis.tsv', sep=\"\\t\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8db5703d-6d8e-4295-8e34-895fe8b6d1ae",
"metadata": {},
"outputs": [],
"source": [
"#links\n",
"pro_pro = pd.read_csv('data/links/pro_pro.tsv', sep=\"\\t\")\n",
"dis_gen = pd.read_csv('data/links/dis_gen.tsv', sep=\"\\t\")\n",
"dse_sym = pd.read_csv('data/dse_sym_limpio.tsv', sep=\"\\t\")\n",
"dis_dru_the = pd.read_csv('data/links/dis_dru_the.tsv', sep=\"\\t\")\n",
"gen_pro = pd.read_csv('data/links/gen_pro.tsv', sep=\"\\t\")\n",
"dru_pro = pd.read_csv('data/links/dru_pro.tsv', sep=\"\\t\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d33a5a56-e66c-4a5d-a695-b5ef8972f9cc",
"metadata": {},
"outputs": [],
"source": [
"#file with SPLs between all PPI nodes\n",
"spl = pd.read_csv('files/SPL PPI.csv', index_col='Source')"
]
},
{
"cell_type": "markdown",
"id": "2519e85f-388b-4146-9906-aa450cfd4316",
"metadata": {},
"source": [
"### Interactome"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "5160b911-92a7-4687-8c8c-dfe38f1d0372",
"metadata": {},
"outputs": [],
"source": [
"G_ppi = nx.from_pandas_edgelist(pro_pro,'prA','prB')"
]
},
{
"cell_type": "markdown",
"id": "ebc315c4-08ad-4f50-849e-0e4bfac34b1f",
"metadata": {},
"source": [
"### Identification of disease module"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1cf1179a-49fe-4202-9f60-1600bb0a5634",
"metadata": {},
"outputs": [],
"source": [
"# Dictionaries to store the results for each disease\n",
"dis_gen_dict = {} # Seed genes\n",
"dis_lcc = {} # Module size\n",
"dis_lcc_dp = {} # Results for statistical validation dp\n",
"dis_lcc_ndp = {} # Results for statistical validation ndp\n",
"\n",
"dis_total = dis_dru_the['dis'].unique().tolist()\n",
"\n",
"for dis in dis_total:\n",
" # Seed genes\n",
" genes = functions_network_medicine_schizo.genes_dis(dis, dis_gen)\n",
" dis_gen_dict[dis] = genes\n",
" \n",
" # Disease proteins \n",
" prots = functions_network_medicine_schizo.pro_gen_dict(genes, gen_pro)\n",
" \n",
" # Disease proteins in interactome\n",
" prots_interactome = functions_network_medicine_schizo.gen_pro_PPI(prots, pro_pro)\n",
"\n",
" # Module size\n",
" SG_dis = G_ppi.subgraph(prots_interactome)\n",
" \n",
" if SG_dis: # if disease has at least one protein in its module\n",
" lcc = functions_network_medicine_schizo.lcc(SG_dis)\n",
" dis_lcc[dis] = lcc\n",
" \n",
" # Statistical validation ndp\n",
" lcc_ndp = functions_network_medicine_schizo.lcc_simulation(SG_dis, lcc, G_ppi, dp=False)\n",
" dis_lcc_ndp[dis] = lcc_ndp\n",
" \n",
" # Statistical validation dp\n",
" lcc_dp = functions_network_medicine_schizo.lcc_simulation(SG_dis, lcc, G_ppi, dp=True)\n",
" dis_lcc_dp[dis] = lcc_dp \n",
" \n",
" else:\n",
" dis_lcc[dis] = []\n",
" dis_lcc_ndp[dis] = []\n",
" dis_lcc_dp[dis] = []\n",
"\n",
"\n",
"results = {\n",
" \"ID\": dis_total,\n",
" \"Seed genes\": [len(dis_gen_dict[dis]) for dis in dis_total],\n",
" \"Genes in PPI\": [len(df_pro_dis_total_filt[df_pro_dis_total_filt[\"dis\"] == dis][\"pro_ppi\"].iloc[0]) for dis in dis_total],\n",
" \"Genes in LCC\": [len(dis_lcc[dis]) for dis in dis_total],\n",
" \"Relative LCC\": [len(dis_lcc[dis]) / len(df_pro_dis_total_filt[df_pro_dis_total_filt[\"dis\"] == dis][\"pro_ppi\"].iloc[0]) for dis in dis_total],\n",
" \"Mean random LCC (ndp)\": [dis_lcc_ndp[dis][0] for dis in dis_total],\n",
" \"Std random LCC (ndp)\": [dis_lcc_ndp[dis][1] for dis in dis_total],\n",
" \"Z-score (ndp)\": [dis_lcc_ndp[dis][2] for dis in dis_total],\n",
" \"Mean random LCC (dp)\": [dis_lcc_dp[dis][0] for dis in dis_total],\n",
" \"Std random LCCs (dp)\": [dis_lcc_dp[dis][1] for dis in dis_total], \n",
" \"Z-score (dp)\": [dis_lcc_dp[dis][2] for dis in dis_total]\n",
"}\n",
"\n",
"dis_module_df = pd.DataFrame(results)\n",
"dis_module_df.to_csv(\"Module.csv\", index=False)"
]
},
{
"cell_type": "markdown",
"id": "92f1b21e-eb2c-4a35-b9a0-1562e4751677",
"metadata": {},
"source": [
"### Disease filtering"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "480659cf-1dd8-409c-a2d5-418a16e97bb2",
"metadata": {},
"outputs": [],
"source": [
"# Filter of diseases with a significant module size\n",
"significative = dis_module_df[(dis_module_df['Z-score (dp)'] > 1.65) & (dis_module_df['Z-score (ndp)'] > 1.65)]"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "115a0101-7ed4-4ef4-959e-ab07c7681c61",
"metadata": {},
"outputs": [],
"source": [
"# Keep only diseases whose module has at least 15 pathological proteins belonging to the interactome (modules with fewer than 15 are dropped)\n",
"\n",
"for i, lcc in enumerate(significative[\"Genes in LCC\"]):\n",
" if lcc < 15: \n",
" disease_no_sig = significative[\"ID\"][i]\n",
" \n",
" row = significative[significative['ID'] == disease_no_sig]\n",
" \n",
" significative = significative.drop(row.index)\n",
" \n",
"significative.reset_index(drop=True, inplace=True)\n",
"significative.to_csv(\"Module signif.csv\", index = False)"
]
},
{
"cell_type": "markdown",
"id": "194a51fd-885e-4e82-96a5-aa2a77250ab1",
"metadata": {},
"source": [
"### Closest distance disease-drug"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "722cf051-f5ee-4473-9674-e5519d8616ce",
"metadata": {},
"outputs": [],
"source": [
"significative = pd.read_csv('Module signif.csv', sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "1553f0f2-c28e-4dd3-8643-f791be9e1dca",
"metadata": {},
"outputs": [],
"source": [
"#DataFrame with drugs and their targets:\n",
"total_drug_list = set()\n",
"for drug in dru_pro[\"dru\"]:\n",
" if drug in set(dis_dru_the[\"dru\"].values): # drugs for which we have information about what diseases they treat\n",
" list_drugs_total.add(drug)\n",
"targets_total = functions_network_medicine.targets(drug_list_total, dru_pro)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dfdd0ff9-b286-43b1-8c9a-7f6e067cf066",
"metadata": {},
"outputs": [],
"source": [
"results = [] #list with the results of each disease for the observed distance\n",
"\n",
"for i, dis in enumerate(df_pro_dis_total_filt[\"dis\"]):\n",
" SG_dis = G_ppi.subgraph(df_pro_dis_total_filt[\"pro_ppi\"][i]) #disease subnetwork\n",
" lcc = funciones_network_medicine.lcc(SG_dis) # disease module\n",
" proximity_obs = funciones_network_medicine.proximity(targets_total, spl, lcc, G_ppi) # DF with drugs and distance to disease\n",
" \n",
" for _, row in proximity_obs.iterrows():\n",
" results.append({\"ID\": dis, \"Drugs\": row[\"Fármacos\"], \"Closest distance\": row[\"dc\"]})\n",
"\n",
"df_distance = pd.DataFrame(results)\n",
"\n",
"df_distance.to_csv(\"Closest distance.csv\", index=False)"
]
},
{
"cell_type": "markdown",
"id": "5a496aeb-4674-417a-ac3e-5a8323e148ed",
"metadata": {},
"source": [
"### Proximity disease-drug"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "aaefdc73-2962-46c7-a510-8fe22adf0e70",
"metadata": {},
"outputs": [],
"source": [
"# random target modules\n",
"target_modules_list_random_drugs = functions_network_medicine_schizo.calculate_random_drug_target_modules(1000, targets_total, G_ppi)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d58deeea-711d-45d8-9ed9-91bbbeef16f0",
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(columns=[\"ID\", \"Drugs\", \"Dc_mean\", \"Dc_std\"])\n",
"\n",
"# empty file in which I will add the results\n",
"with open(\"Rand_closest_distance_rand.csv\", 'w', newline='') as file:\n",
" writer = csv.writer(file)\n",
" writer.writerow([\"ID\", \"Drugs\", \"Dc_mean\", \"Dc_std\"])\n",
"\n",
"# determination of closest distance in random set of disease module and target module\n",
"for i, dis in enumerate(significative[\"ID\"]):\n",
" genes = functions_network_medicine_schizo.genes_dis(dis, dis_gen)\n",
" prots = functions_network_medicine_schizo.pro_gen_dict(genes, gen_pro)\n",
" pro_ppi = functions_network_medicine_schizo.gen_pro_PPI(prots, pro_pro)\n",
" SG_dis = G_ppi.subgraph(pro_ppi)\n",
" lcc = functions_network_medicine_schizo.lcc(SG_dis)\n",
" proximity_rand = functions_network_medicine_schizo.proximity_random(targets_total, spl, lcc, G_ppi, 1000, lista_modulos_diana_farmacos_aleatorios)\n",
" \n",
" results = []\n",
" for _, row in proximity_rand.iterrows():\n",
" results.append({\"ID\": dis, \"Drugs\": row[\"Drugs\"], \"Dc_mean\": row[\"dc_mean\"], \"Dc_std\": row[\"dc_std\"]})\n",
" \n",
" with open(\"Rand_closest_distance_rand.csv\", 'a', newline='') as file:\n",
" writer = csv.writer(file)\n",
" for result in results:\n",
" writer.writerow([result[\"ID\"], result[\"Drugs\"], result[\"Dc_mean\"], result[\"Dc_std\"]])\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "c96edf3d-41dc-4837-9893-a9ae1d5fa2f1",
"metadata": {},
"outputs": [],
"source": [
"rand_prox = pd.read_csv(\"Rand_closest_distance_rand.csv\", sep = \",\")"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "084b52bd-b94b-4508-b29e-f131c72837ba",
"metadata": {},
"outputs": [],
"source": [
"distance=pd.read_csv(\"Closest distance.csv\", sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "672acf4e-7d8e-421b-bfcb-5d05b4be4fa9",
"metadata": {},
"outputs": [],
"source": [
"# Join df_obs and df_prox based on the ID and Drugs columns\n",
"df_merged = pd.merge(distance, rand_prox, on=['ID', 'Drugs'])\n",
"\n",
"# dc_zscore\n",
"df_merged['Dc_zscore'] = (df_merged['Closest distance'] - df_merged['Dc_mean']) / df_merged['Dc_std']\n",
"\n",
"# New column indicating if each drug is used for the treatment of the disease\n",
"df_merged['Treatment'] = df_merged.apply(lambda row: functions_network_medicine_schizo.determine_treatment(row, dis_dru_the), axis=1)\n"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "92dc074e-4ebc-4466-92a2-9a8893e9a6f7",
"metadata": {},
"outputs": [],
"source": [
"df_merged.to_csv(\"results/Proximity_results.csv\", index = False)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "3586e5aa-4877-4d70-b986-5bb92d881bc0",
"metadata": {},
"outputs": [],
"source": [
"proximity = pd.read_csv(\"Proximity_results.csv\", sep = \",\")"
]
},
{
"cell_type": "markdown",
"id": "d0a5d2a3-b632-4874-bed6-67ed6e3e4714",
"metadata": {},
"source": [
"### Statistical analysis of the proximity results across all the diseases"
]
},
{
"cell_type": "code",
"execution_count": 117,
"id": "5b03a68c-307e-42b5-8209-a27c44dafb00",
"metadata": {},
"outputs": [],
"source": [
"# Filter the data to include only relevant columns\n",
"df_relevant = proximity[['Closest distance', 'Dc_zscore', 'Treatment']]\n",
"\n",
"# Compute descriptive statistics for 'Closest distance' and 'Dc_zscore' grouped by 'Treatment'\n",
"describe_closest_distance = df_relevant.groupby('Treatment')['Closest distance'].describe()\n",
"describe_dc_zscore = df_relevant.groupby('Treatment')['Dc_zscore'].describe()\n"
]
},
{
"cell_type": "code",
"execution_count": 118,
"id": "52b62828-5c27-4578-bb46-9a2b5de60258",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Describe de Closest distance:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>min</th>\n",
" <th>25%</th>\n",
" <th>50%</th>\n",
" <th>75%</th>\n",
" <th>max</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Treatment</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>unknown</th>\n",
" <td>516993.0</td>\n",
" <td>1.741093</td>\n",
" <td>0.590947</td>\n",
" <td>0.0</td>\n",
" <td>1.4</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>yes</th>\n",
" <td>12162.0</td>\n",
" <td>1.473756</td>\n",
" <td>0.691416</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.6</td>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" count mean std min 25% 50% 75% max\n",
"Treatment \n",
"unknown 516993.0 1.741093 0.590947 0.0 1.4 2.0 2.0 4.0\n",
"yes 12162.0 1.473756 0.691416 0.0 1.0 1.6 2.0 3.0"
]
},
"execution_count": 118,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(\"\\nDescribe Closest distance:\")\n",
"describe_closest_distance.head()"
]
},
{
"cell_type": "code",
"execution_count": 119,
"id": "099c6bab-f16f-4b16-bb17-99a86a298498",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Describe de Dc_zscore:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>min</th>\n",
" <th>25%</th>\n",
" <th>50%</th>\n",
" <th>75%</th>\n",
" <th>max</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Treatment</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>unknown</th>\n",
" <td>510797.0</td>\n",
" <td>-0.159867</td>\n",
" <td>4.989091</td>\n",
" <td>-187.636997</td>\n",
" <td>-0.709030</td>\n",
" <td>0.123404</td>\n",
" <td>0.733509</td>\n",
" <td>164.760255</td>\n",
" </tr>\n",
" <tr>\n",
" <th>yes</th>\n",
" <td>12053.0</td>\n",
" <td>-1.295405</td>\n",
" <td>8.074283</td>\n",
" <td>-163.309391</td>\n",
" <td>-1.741129</td>\n",
" <td>-0.465569</td>\n",
" <td>0.400364</td>\n",
" <td>104.948135</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" count mean std min 25% 50% \\\n",
"Treatment \n",
"unknown 510797.0 -0.159867 4.989091 -187.636997 -0.709030 0.123404 \n",
"yes 12053.0 -1.295405 8.074283 -163.309391 -1.741129 -0.465569 \n",
"\n",
" 75% max \n",
"Treatment \n",
"unknown 0.733509 164.760255 \n",
"yes 0.400364 104.948135 "
]
},
"execution_count": 119,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(\"\\nDescribe Dc_zscore:\")\n",
"describe_dc_zscore.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment