{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "eed38b7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard library\n",
    "import gzip\n",
    "import os\n",
    "import shutil\n",
    "\n",
    "# Third-party\n",
    "import geopandas as gpd\n",
    "import GEOparse\n",
    "import networkx as nx\n",
    "import pandas as pd\n",
    "import requests\n",
    "from Bio import Entrez\n",
    "# NOTE(review): the next line rebinds the name `GEOparse`, shadowing the package imported above.\n",
    "from GEOparse import GEOparse\n",
    "\n",
    "# Project-local helpers\n",
    "import funciones_network_medicine\n",
    "\n",
    "# NOTE(review): requests, Entrez, GEOparse, geopandas, os, gzip and shutil are not used in this\n",
    "# notebook; they are kept untouched here and can be dropped once confirmed unnecessary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "947d7a7a-7b53-4c14-8c9f-7d25ccfecdca",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Node tables\n",
    "pro = pd.read_csv('../data/nodes/pro.tsv', sep='\\t')\n",
    "gen = pd.read_csv('../data/nodes/gen.tsv', sep='\\t')\n",
    "dru = pd.read_csv('../data/nodes/dru.tsv', sep='\\t')\n",
    "dis = pd.read_csv('../data/nodes/dis.tsv', sep='\\t')\n",
    "\n",
    "# Link (edge) tables\n",
    "pro_pro = pd.read_csv('../data/links/pro_pro.tsv', sep='\\t')\n",
    "dis_gen = pd.read_csv('../data/links/dis_gen.tsv', sep='\\t')\n",
    "# NOTE(review): unlike every other table, this path has no '../' prefix and no 'links/' folder,\n",
    "# and dse_sym is not used later in this notebook - confirm the location is intentional.\n",
    "dse_sym = pd.read_csv('data/dse_sym_limpio.tsv', sep='\\t')\n",
    "dis_dru_the = pd.read_csv('../data/links/dis_dru_the.tsv', sep='\\t')\n",
    "gen_pro = pd.read_csv('../data/links/gen_pro.tsv', sep='\\t')\n",
    "dru_pro = pd.read_csv('../data/links/dru_pro.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f939592a-7c9c-4ec4-b20a-03c07439a56b",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Interactome"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "37a4ff6e-e3e6-4b0f-ad8d-d3aef695027d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Protein-protein interaction graph: one edge per row of pro_pro, between columns 'prA' and 'prB'.\n",
    "G_ppi = nx.from_pandas_edgelist(pro_pro, 'prA', 'prB')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "94600860-ca3b-4f30-a2fe-530e719426d9",
   "metadata": {},
   "source": [
    "#### List of genes in LCC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "5f2370f1-9d66-43e2-9b25-88c32e75a3ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Disease identifier looked up in dis_gen; the variable names below indicate it denotes schizophrenia.\n",
    "SCHIZOPHRENIA_ID = 'C0036341'\n",
    "\n",
    "# NOTE(review): the helpers live in funciones_network_medicine (not shown here). Judging by their\n",
    "# names the chain is: disease genes -> gene/protein mapping -> proteins present in the PPI ->\n",
    "# induced subgraph -> largest connected component (LCC). Confirm against the module.\n",
    "gen_schizo = funciones_network_medicine.genes_enf(SCHIZOPHRENIA_ID, dis_gen)\n",
    "dict_schizo = funciones_network_medicine.pro_gen_dict(gen_schizo, gen_pro)\n",
    "dict_schizo_PPI = funciones_network_medicine.gen_pro_PPI(dict_schizo, pro_pro)\n",
    "SG_schizo = funciones_network_medicine.SG(dict_schizo_PPI, G_ppi)\n",
    "lcc_schizo = funciones_network_medicine.lcc(SG_schizo)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "4c71bf7a-847f-4b63-919c-4d81207d61cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gene ids of every gen_pro row whose protein belongs to the LCC.\n",
    "# A vectorised lookup is used instead of a nested loop; the loop variable used to be called `pro`,\n",
    "# which silently overwrote the `pro` node table loaded at the top of the notebook.\n",
    "lcc_proteins = set(lcc_schizo)\n",
    "list_num = gen_pro.loc[gen_pro['pro'].isin(lcc_proteins), 'gen'].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "8a56ffb7-76fa-451f-9849-ca96309c8fec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gene names (symbols) for those gene ids.\n",
    "list_genes = gen.loc[gen['id'].isin(set(list_num)), 'name'].tolist()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "67209750-1411-45a3-9ad9-5a92f8521c90",
   "metadata": {},
   "source": [
    "#### Microarray GE analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "dd844f76-955b-4708-9b54-a6617044cfe1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Differential gene expression table (Gandal et al. 2018).\n",
    "dge = pd.read_csv('../files/Microarray_SCZ_metaanalysis_092017.csv', sep=',')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ce169023-6da3-4e67-8c4e-a38c5e31a60b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Genes from the Gandal study that are found in the schizophrenia module\n",
    "\n",
    "filtered_dge = dge[dge['symbol'].isin(list_genes)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "6be40bf1-e2b7-4470-8196-34620b7147f8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " Unnamed: 0 beta SE p fdr symbol\n",
      "7 ENSG00000001084 0.038813 0.029858 0.195025 0.405474 GCLC\n",
      "23 ENSG00000002822 -0.051322 0.019815 0.010259 0.058147 MAD1L1\n",
      "24 ENSG00000002834 -0.000855 0.014219 0.952121 0.976082 LASP1\n",
      "110 ENSG00000006128 -0.200785 0.047132 0.000031 0.000998 TAC1\n",
      "126 ENSG00000006611 0.071682 0.030708 0.020512 0.093734 USH1C\n",
      "... ... ... ... ... ... ...\n",
      "12208 ENSG00000256269 -0.047590 0.016506 0.004341 0.032269 HMBS\n",
      "12222 ENSG00000257017 -0.022547 0.024117 0.350880 0.577942 HP\n",
      "12251 ENSG00000259207 0.002088 0.020028 0.917063 0.960690 ITGB3\n",
      "12298 ENSG00000262683 -0.008338 0.019593 0.670838 0.824663 FHIT\n",
      "12386 ENSG00000273079 0.019514 0.026687 0.465453 0.676388 GRIN2B\n",
      "\n",
      "[769 rows x 6 columns]\n"
     ]
    }
   ],
   "source": [
    "print(filtered_dge)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "89f27194-032e-4f1e-b6d1-1f0a2b4c5abc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Genes with significantly higher or lower expression in schizophrenia patients.\n",
    "FDR_THRESHOLD = 0.05  # cut-off on the 'fdr' column (p-value corrected with False Discovery Rate)\n",
    "\n",
    "fdr_filtered_dge = filtered_dge[filtered_dge['fdr'] <= FDR_THRESHOLD]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "7e59d5d1-7a75-4d8a-b447-49ea639b3db3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " Unnamed: 0 beta SE p fdr symbol\n",
      "110 ENSG00000006128 -0.200785 0.047132 3.066855e-05 0.000998 TAC1\n",
      "148 ENSG00000007168 -0.061780 0.021111 3.801441e-03 0.029186 PAFAH1B1\n",
      "161 ENSG00000007372 0.174375 0.040167 2.189604e-05 0.000770 PAX6\n",
      "219 ENSG00000010256 -0.096030 0.019028 9.619559e-07 0.000083 UQCRC1\n",
      "260 ENSG00000011405 0.102244 0.036488 5.545087e-03 0.038023 PIK3C2A\n",
      "... ... ... ... ... ... ...\n",
      "11459 ENSG00000204525 -0.110543 0.034336 1.485081e-03 0.014967 HLA-C\n",
      "11471 ENSG00000204580 0.124058 0.035651 6.085740e-04 0.007933 DDR1\n",
      "11678 ENSG00000214113 -0.057975 0.021102 6.522134e-03 0.042600 LYRM4\n",
      "11906 ENSG00000231925 -0.053500 0.016371 1.262639e-03 0.013331 TAPBP\n",
      "12208 ENSG00000256269 -0.047590 0.016506 4.341083e-03 0.032269 HMBS\n",
      "\n",
      "[178 rows x 6 columns]\n"
     ]
    }
   ],
   "source": [
    "print(fdr_filtered_dge)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "8bc007a4-2597-4f4a-a542-cced89d231b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sort by effect size, largest |beta| first ('beta' is reported as log fold change further down).\n",
    "# reset_index gives a clean 0..n-1 index so that later row lookups cannot hit stale row labels.\n",
    "dge_sorted = (\n",
    "    fdr_filtered_dge\n",
    "    .sort_values(by='beta', key=lambda beta: beta.abs(), ascending=False)\n",
    "    .reset_index(drop=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "45f91300-2b2c-46bc-99f9-78e31ce0a5b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the ranked table. It is written to ../files/ so that the next cell reloads exactly this\n",
    "# file (it used to be written to the working directory while being read back from ../files/).\n",
    "dge_sorted.to_csv('../files/dge_sorted.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "4984712e-1730-4e6c-a063-cbdf29092f8b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the ranked table saved by the previous cell (same path).\n",
    "dge_sorted = pd.read_csv('../files/dge_sorted.csv', sep=',')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b91a30b7-0708-4a06-8c32-b3faf08046c6",
   "metadata": {},
   "source": [
    "### Drug targets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "f86487dd-724d-4c97-95a8-962a5236ca2e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# DataFrame with drugs and their targets.\n",
    "# Only drugs that appear in dis_dru_the are kept, i.e. drugs for which we have information about\n",
    "# what diseases they treat. The lookup set is built once rather than on every iteration.\n",
    "drugs_with_indication = set(dis_dru_the['dru'])\n",
    "drug_list_total = set(dru_pro['dru']) & drugs_with_indication\n",
    "\n",
    "# NOTE(review): targets() is defined in funciones_network_medicine (not shown here); the next cell\n",
    "# reads the columns 'Fármacos' and 'Dianas' from its result.\n",
    "targets_total = funciones_network_medicine.targets(drug_list_total, dru_pro)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "a99abc85-90d9-481f-ab68-3c7687868ecb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# DataFrame with drugs that have at least one target protein in the LCC with significant differential expression.\n",
    "\n",
    "# Gene symbol -> gene id (if a symbol is repeated in `gen`, the last row wins).\n",
    "gen_dict_ = dict(zip(gen['name'], gen['id']))\n",
    "\n",
    "# Gene id -> list of its proteins, built once instead of rescanning gen_pro for every gene.\n",
    "proteins_by_gene = gen_pro.groupby('gen')['pro'].apply(list).to_dict()\n",
    "\n",
    "# One record per (differentially expressed gene, drug, targeted protein) match.\n",
    "# Rows are walked positionally with zip, so the result does not depend on index labels.\n",
    "records = []\n",
    "for gene_symbol, fdr, beta in zip(dge_sorted['symbol'], dge_sorted['fdr'], dge_sorted['beta']):\n",
    "    proteins = proteins_by_gene.get(gen_dict_.get(gene_symbol), [])\n",
    "    if not proteins:\n",
    "        continue\n",
    "    for drug, target_list in zip(targets_total['Fármacos'], targets_total['Dianas']):\n",
    "        for protein in proteins:\n",
    "            # NOTE(review): assumes each 'Dianas' entry is a collection of protein ids; if it were\n",
    "            # a plain string this would be a substring test - confirm against targets().\n",
    "            if protein in target_list:\n",
    "                records.append((drug, gene_symbol, beta, fdr))\n",
    "\n",
    "# Explicit columns keep the schema stable even when no match is found.\n",
    "dge_drugs = pd.DataFrame(\n",
    "    records,\n",
    "    columns=['Drugs', 'Gene symbol', 'log(Fold Change)', 'Corrected pvalue'],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "69dd8106-15d9-44d9-86f6-54511a38c8d8",
   "metadata": {},
   "source": [
    "### Assessment of module membership of co-expressed genes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "401ee7e6-7bf7-41a8-813a-227976418d08",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-gene module assignment and kME values.\n",
    "kme = pd.read_csv('../files/gandal_2018a_kMEs.csv', sep=',')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "e91afef6-7790-44a0-a71d-026f858a9604",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A gene is kept when the kME for its own module is strictly above this value.\n",
    "KME_THRESHOLD = 0.5\n",
    "\n",
    "# Only genes that made it into the drug table are relevant.\n",
    "candidate_symbols = set(dge_drugs['Gene symbol'])\n",
    "\n",
    "dge_genes = []\n",
    "for row_pos, (symbol, module) in enumerate(zip(kme['external_gene_id'], kme['Module.name'])):\n",
    "    if symbol not in candidate_symbols:\n",
    "        continue\n",
    "    # Skip rows without a usable module label (e.g. NaN) or whose label does not start with 'CD'.\n",
    "    if not isinstance(module, str) or not module.startswith('CD'):\n",
    "        continue\n",
    "    module_number = int(module[2:])  # extract the module number (e.g. 1, 2, 3, ...)\n",
    "    if module_number == 0:\n",
    "        continue  # module 0 is excluded\n",
    "    # The kME value is read from column position module_number + 1.\n",
    "    # NOTE(review): assumes the kME columns are laid out in module order - confirm against the CSV header.\n",
    "    raw_kme = kme.iloc[row_pos, module_number + 1]\n",
    "    # Compare numerically: testing against the string '0,5' is lexicographic for text values and a\n",
    "    # TypeError for floats. Decimal commas ('0,73') are accepted; anything unparsable becomes NaN\n",
    "    # and therefore fails the threshold.\n",
    "    kme_value = pd.to_numeric(str(raw_kme).replace(',', '.'), errors='coerce')\n",
    "    if kme_value > KME_THRESHOLD:\n",
    "        dge_genes.append(symbol)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "196811f9-dc94-4722-b67a-b261aa561c5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the drug/gene rows whose gene passed the kME filter.\n",
    "kme_filtrado = dge_drugs[dge_drugs['Gene symbol'].isin(set(dge_genes))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "6cc25eae-272d-4992-a493-d97b43fdd42c",
   "metadata": {},
   "outputs": [],
   "source": [
    "kme_filtrado.to_csv('../results/Filtering_by_DGE_results.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}