{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "eed38b7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import pandas as pd\n",
    "from Bio import Entrez\n",
    "import GEOparse\n",
    "import geopandas as gpd\n",
    "import os\n",
    "import pandas as pd\n",
    "from GEOparse import GEOparse\n",
    "import gzip\n",
    "import shutil\n",
    "import funciones_network_medicine\n",
    "import networkx as nx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "947d7a7a-7b53-4c14-8c9f-7d25ccfecdca",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _load_tsv(path):\n",
    "    \"\"\"Read one tab-separated table into a DataFrame.\"\"\"\n",
    "    return pd.read_csv(path, sep=\"\\t\")\n",
    "\n",
    "\n",
    "# Node tables\n",
    "pro = _load_tsv('../data/nodes/pro.tsv')\n",
    "gen = _load_tsv('../data/nodes/gen.tsv')\n",
    "dru = _load_tsv('../data/nodes/dru.tsv')\n",
    "dis = _load_tsv('../data/nodes/dis.tsv')\n",
    "\n",
    "# Link (edge) tables\n",
    "pro_pro = _load_tsv('../data/links/pro_pro.tsv')\n",
    "dis_gen = _load_tsv('../data/links/dis_gen.tsv')\n",
    "# NOTE(review): the only table read from 'data/' instead of '../data/' - confirm the path\n",
    "dse_sym = _load_tsv('data/dse_sym_limpio.tsv')\n",
    "dis_dru_the = _load_tsv('../data/links/dis_dru_the.tsv')\n",
    "gen_pro = _load_tsv('../data/links/gen_pro.tsv')\n",
    "dru_pro = _load_tsv('../data/links/dru_pro.tsv')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f939592a-7c9c-4ec4-b20a-03c07439a56b",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Interactome"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "37a4ff6e-e3e6-4b0f-ad8d-d3aef695027d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Protein-protein interaction graph: one edge per (prA, prB) row of pro_pro\n",
    "G_ppi = nx.from_pandas_edgelist(pro_pro, source='prA', target='prB')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "94600860-ca3b-4f30-a2fe-530e719426d9",
   "metadata": {},
   "source": [
    "#### List of genes in the largest connected component (LCC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "5f2370f1-9d66-43e2-9b25-88c32e75a3ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Disease identifier passed to genes_enf together with the disease-gene table\n",
    "SCHIZOPHRENIA_ID = \"C0036341\"\n",
    "fnm = funciones_network_medicine\n",
    "\n",
    "# The helpers live in funciones_network_medicine; the step descriptions below are\n",
    "# inferred from their names and arguments - verify against that module.\n",
    "gen_schizo = fnm.genes_enf(SCHIZOPHRENIA_ID, dis_gen)  # presumably the genes linked to the disease\n",
    "dict_schizo = fnm.pro_gen_dict(gen_schizo, gen_pro)  # presumably gene/protein mapping for those genes\n",
    "dict_schizo_PPI = fnm.gen_pro_PPI(dict_schizo, pro_pro)  # presumably restricted to proteins in the interactome\n",
    "SG_schizo = fnm.SG(dict_schizo_PPI, G_ppi)  # presumably the disease subgraph of G_ppi\n",
    "lcc_schizo = fnm.lcc(SG_schizo)  # presumably its largest connected component"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "4c71bf7a-847f-4b63-919c-4d81207d61cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gene ids of the proteins in lcc_schizo.\n",
    "# The loop variables are deliberately not called `pro`: that name holds the\n",
    "# protein node table loaded at the top of the notebook and must not be overwritten.\n",
    "\n",
    "# protein id -> gene ids listed for it in gen_pro, built in a single pass\n",
    "genes_by_protein = {}\n",
    "for protein_id, gene_id in zip(gen_pro[\"pro\"], gen_pro[\"gen\"]):\n",
    "    genes_by_protein.setdefault(protein_id, []).append(gene_id)\n",
    "\n",
    "# Same order as before: proteins in lcc_schizo order, genes in gen_pro row order;\n",
    "# proteins missing from gen_pro contribute nothing.\n",
    "list_num = [\n",
    "    gene_id\n",
    "    for protein_id in lcc_schizo\n",
    "    for gene_id in genes_by_protein.get(protein_id, ())\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "8a56ffb7-76fa-451f-9849-ca96309c8fec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gene names (symbols) for the distinct gene ids collected in list_num.\n",
    "\n",
    "# gene id -> names listed for it in gen, built in a single pass so the table\n",
    "# is not rescanned for every id and no index labels are involved\n",
    "names_by_gene_id = {}\n",
    "for gene_id, gene_name in zip(gen[\"id\"], gen[\"name\"]):\n",
    "    names_by_gene_id.setdefault(gene_id, []).append(gene_name)\n",
    "\n",
    "# ids absent from gen contribute nothing\n",
    "list_genes = [\n",
    "    gene_name\n",
    "    for gene_id in set(list_num)\n",
    "    for gene_name in names_by_gene_id.get(gene_id, ())\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "67209750-1411-45a3-9ad9-5a92f8521c90",
   "metadata": {},
   "source": [
    "#### Microarray gene expression (GE) analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "dd844f76-955b-4708-9b54-a6617044cfe1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Microarray schizophrenia meta-analysis table (Gandal et al. 2018).\n",
    "# The file is comma-separated, which is read_csv's default separator.\n",
    "dge = pd.read_csv('../files/Microarray_SCZ_metaanalysis_092017.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ce169023-6da3-4e67-8c4e-a38c5e31a60b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Restrict the Gandal table to genes whose symbol is in list_genes,\n",
    "# i.e. the genes of the schizophrenia module.\n",
    "in_module = dge[\"symbol\"].isin(list_genes)\n",
    "filtered_dge = dge[in_module]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "6be40bf1-e2b7-4470-8196-34620b7147f8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "            Unnamed: 0      beta        SE         p       fdr  symbol\n",
      "7      ENSG00000001084  0.038813  0.029858  0.195025  0.405474    GCLC\n",
      "23     ENSG00000002822 -0.051322  0.019815  0.010259  0.058147  MAD1L1\n",
      "24     ENSG00000002834 -0.000855  0.014219  0.952121  0.976082   LASP1\n",
      "110    ENSG00000006128 -0.200785  0.047132  0.000031  0.000998    TAC1\n",
      "126    ENSG00000006611  0.071682  0.030708  0.020512  0.093734   USH1C\n",
      "...                ...       ...       ...       ...       ...     ...\n",
      "12208  ENSG00000256269 -0.047590  0.016506  0.004341  0.032269    HMBS\n",
      "12222  ENSG00000257017 -0.022547  0.024117  0.350880  0.577942      HP\n",
      "12251  ENSG00000259207  0.002088  0.020028  0.917063  0.960690   ITGB3\n",
      "12298  ENSG00000262683 -0.008338  0.019593  0.670838  0.824663    FHIT\n",
      "12386  ENSG00000273079  0.019514  0.026687  0.465453  0.676388  GRIN2B\n",
      "\n",
      "[769 rows x 6 columns]\n"
     ]
    }
   ],
   "source": [
    "# Plain-text preview of the module-restricted table (pandas elides the middle rows of long frames)\n",
    "print(filtered_dge)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "89f27194-032e-4f1e-b6d1-1f0a2b4c5abc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Module genes with significantly higher or lower expression in schizophrenia patients:\n",
    "# keep the rows whose FDR-corrected p-value is at most 0.05.\n",
    "is_significant = filtered_dge[\"fdr\"].le(0.05)\n",
    "fdr_filtered_dge = filtered_dge[is_significant]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "7e59d5d1-7a75-4d8a-b447-49ea639b3db3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "            Unnamed: 0      beta        SE             p       fdr    symbol\n",
      "110    ENSG00000006128 -0.200785  0.047132  3.066855e-05  0.000998      TAC1\n",
      "148    ENSG00000007168 -0.061780  0.021111  3.801441e-03  0.029186  PAFAH1B1\n",
      "161    ENSG00000007372  0.174375  0.040167  2.189604e-05  0.000770      PAX6\n",
      "219    ENSG00000010256 -0.096030  0.019028  9.619559e-07  0.000083    UQCRC1\n",
      "260    ENSG00000011405  0.102244  0.036488  5.545087e-03  0.038023   PIK3C2A\n",
      "...                ...       ...       ...           ...       ...       ...\n",
      "11459  ENSG00000204525 -0.110543  0.034336  1.485081e-03  0.014967     HLA-C\n",
      "11471  ENSG00000204580  0.124058  0.035651  6.085740e-04  0.007933      DDR1\n",
      "11678  ENSG00000214113 -0.057975  0.021102  6.522134e-03  0.042600     LYRM4\n",
      "11906  ENSG00000231925 -0.053500  0.016371  1.262639e-03  0.013331     TAPBP\n",
      "12208  ENSG00000256269 -0.047590  0.016506  4.341083e-03  0.032269      HMBS\n",
      "\n",
      "[178 rows x 6 columns]\n"
     ]
    }
   ],
   "source": [
    "# Plain-text preview of the rows that passed the FDR filter\n",
    "print(fdr_filtered_dge)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "8bc007a4-2597-4f4a-a542-cced89d231b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Order the significant genes by effect size: largest |beta| (logFC) first.\n",
    "abs_effect = fdr_filtered_dge['beta'].abs()\n",
    "ranked_index = abs_effect.sort_values(ascending=False).index\n",
    "dge_sorted = fdr_filtered_dge.reindex(ranked_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "45f91300-2b2c-46bc-99f9-78e31ce0a5b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export to ../files so that the following cell, which reads ../files/dge_sorted.csv,\n",
    "# loads the table produced by this run instead of a stale or missing copy\n",
    "# (the file used to be written to the working directory).\n",
    "dge_sorted.to_csv(\"../files/dge_sorted.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "4984712e-1730-4e6c-a063-cbdf29092f8b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the sorted table from ../files (comma is read_csv's default separator);\n",
    "# read_csv gives the frame a fresh 0..n-1 index in place of the original row labels.\n",
    "dge_sorted = pd.read_csv(\"../files/dge_sorted.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b91a30b7-0708-4a06-8c32-b3faf08046c6",
   "metadata": {},
   "source": [
    "### Drug targets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "f86487dd-724d-4c97-95a8-962a5236ca2e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drugs and their targets.\n",
    "# Only drugs with target information (dru_pro) that also appear in dis_dru_the,\n",
    "# i.e. drugs for which we know what diseases they treat, are kept.\n",
    "drugs_with_indication = set(dis_dru_the[\"dru\"].values)  # built once, not on every iteration\n",
    "total_drug_list = {drug for drug in dru_pro[\"dru\"] if drug in drugs_with_indication}\n",
    "\n",
    "# The module is imported as funciones_network_medicine, and the set defined above\n",
    "# is the one passed on; the earlier mismatched names raised NameError.\n",
    "# NOTE(review): later cells index the result by 'Fármacos' and 'Dianas' columns - confirm targets() returns such a DataFrame.\n",
    "targets_total = funciones_network_medicine.targets(total_drug_list, dru_pro)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "a99abc85-90d9-481f-ab68-3c7687868ecb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# DataFrame with drugs that have at least one target protein in the LCC with\n",
    "# significant differential expression. One row is emitted per\n",
    "# (gene, drug, matching protein) hit, in dge_sorted row order.\n",
    "\n",
    "gen_dict_ = dict(zip(gen['name'], gen['id']))  # gene symbol -> gene id\n",
    "\n",
    "# gene id -> protein ids, built in one pass instead of rescanning gen_pro for every gene\n",
    "proteins_by_gene = {}\n",
    "for gene_id, protein_id in zip(gen_pro[\"gen\"], gen_pro[\"pro\"]):\n",
    "    proteins_by_gene.setdefault(gene_id, []).append(protein_id)\n",
    "\n",
    "# (drug, targets) pairs taken row by row; zip pairs values by position, so the\n",
    "# result does not depend on the index labels of targets_total\n",
    "drug_targets = list(zip(targets_total[\"Fármacos\"], targets_total[\"Dianas\"]))\n",
    "\n",
    "drugs = []\n",
    "genes = []\n",
    "pvalues = []\n",
    "logfc = []\n",
    "\n",
    "# Walk the three columns together so every gene keeps its own fdr/beta even when\n",
    "# dge_sorted does not carry a 0..n-1 index (label lookups with the enumerate\n",
    "# counter would then pick the wrong row or raise KeyError).\n",
    "for gene_symbol, fdr, beta in zip(dge_sorted[\"symbol\"], dge_sorted[\"fdr\"], dge_sorted[\"beta\"]):\n",
    "    proteins = proteins_by_gene.get(gen_dict_.get(gene_symbol), ())\n",
    "    if not proteins:\n",
    "        continue  # symbol unknown to `gen`, or gene without proteins in gen_pro\n",
    "    for drug, target_list in drug_targets:\n",
    "        for protein in proteins:\n",
    "            if protein in target_list:\n",
    "                drugs.append(drug)\n",
    "                genes.append(gene_symbol)\n",
    "                pvalues.append(fdr)\n",
    "                logfc.append(beta)\n",
    "\n",
    "results = {\n",
    "    \"Drugs\": drugs,\n",
    "    \"Gene symbol\": genes,\n",
    "    \"log(Fold Change)\": logfc,\n",
    "    \"Corrected pvalue\": pvalues,\n",
    "}\n",
    "\n",
    "dge_drugs = pd.DataFrame(results)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "69dd8106-15d9-44d9-86f6-54511a38c8d8",
   "metadata": {},
   "source": [
    "### Assessment of module membership of co-expressed genes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "401ee7e6-7bf7-41a8-813a-227976418d08",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-gene module assignment and kME table; comma is read_csv's default separator\n",
    "kme = pd.read_csv(\"../files/gandal_2018a_kMEs.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "e91afef6-7790-44a0-a71d-026f858a9604",
   "metadata": {},
   "outputs": [],
   "source": [
    "KME_THRESHOLD = 0.5  # a gene is kept when the kME for its own module exceeds this value\n",
    "\n",
    "\n",
    "def _kme_as_float(value):\n",
    "    \"\"\"Return a kME cell as a float, accepting decimal-comma text such as '0,83'.\n",
    "\n",
    "    Cells that cannot be parsed yield NaN, which never exceeds the threshold.\n",
    "    \"\"\"\n",
    "    if isinstance(value, str):\n",
    "        value = value.replace(\",\", \".\")\n",
    "    try:\n",
    "        return float(value)\n",
    "    except (TypeError, ValueError):\n",
    "        return float(\"nan\")\n",
    "\n",
    "\n",
    "# Row positions of every gene symbol in kme, so the table is scanned only once\n",
    "rows_by_symbol = {}\n",
    "for row_pos, kme_symbol in enumerate(kme[\"external_gene_id\"]):\n",
    "    rows_by_symbol.setdefault(kme_symbol, []).append(row_pos)\n",
    "\n",
    "module_names = kme[\"Module.name\"].tolist()\n",
    "\n",
    "dge_genes = []\n",
    "for symbol in dge_drugs[\"Gene symbol\"]:\n",
    "    for row_pos in rows_by_symbol.get(symbol, ()):\n",
    "        module = module_names[row_pos]\n",
    "        # A missing module name is NaN rather than text; skip it like any non-\"CD\" module\n",
    "        if not (isinstance(module, str) and module.startswith(\"CD\")):\n",
    "            continue\n",
    "        j = int(module[2:])  # module number, e.g. \"CD3\" -> 3\n",
    "        if j == 0:\n",
    "            continue  # module number 0 is excluded\n",
    "        # assumes the kME column of module CDj sits at column position j + 1 - TODO confirm against the file layout\n",
    "        kme_value = _kme_as_float(kme.iloc[row_pos, j + 1])\n",
    "        # Numeric comparison: testing against the text '0,5' ordered values character\n",
    "        # by character (e.g. '5,3E-05' > '0,5') and raised TypeError for numeric cells\n",
    "        if kme_value > KME_THRESHOLD:\n",
    "            dge_genes.append(symbol)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "196811f9-dc94-4722-b67a-b261aa561c5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the drug/gene rows whose gene symbol was collected in dge_genes\n",
    "selected_symbols = set(dge_genes)\n",
    "kme_filtrado = dge_drugs[dge_drugs['Gene symbol'].isin(selected_symbols)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "6cc25eae-272d-4992-a493-d97b43fdd42c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the final drug/gene table to the results folder, without the DataFrame index\n",
    "kme_filtrado.to_csv(path_or_buf=\"../results/Filtering_by_DGE_results.csv\", index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}