{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "drebiop-intro",
   "metadata": {},
   "source": [
    "# DREBIOP triples\n",
    "\n",
    "Builds disease-drug-disease triples from two inputs (`triples_filter_repodb_final.tsv` for repoDB and `triplets_chembl_disnet.xlsx` for CSBJ) and writes them to Excel files.\n",
    "\n",
    "The same steps are applied to each input:\n",
    "\n",
    "1. Join the triples with `disease_pathway.tsv`, `drug_gen.tsv` and `dis_gen.tsv`.\n",
    "2. Keep the rows where `gene_id_target` differs from `gene_id`.\n",
    "3. Join `drug_gen_pw.tsv` and keep the rows where `pathway_id` equals `pathway_id_drug`.\n",
    "4. Reduce to the distinct (original condition, drug, new condition) triples.\n",
    "\n",
    "Run with **Restart Kernel -> Run All**; the input files are expected in the working directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5c3adf4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from sqlalchemy import create_engine\n",
    "import mysql.connector"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "drebiop-repodb",
   "metadata": {},
   "source": [
    "# repoDB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2cff675e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The file is comma-separated despite its .tsv extension. A plain ',' keeps pandas on\n",
    "# its default C parser; the former '\\,' value was an invalid Python escape sequence\n",
    "# that pandas interpreted as a regular expression (python engine, quoting ignored).\n",
    "triples_repodb = pd.read_csv(\"triples_filter_repodb_final.tsv\", sep=',')\n",
    "# 'Unnamed: 0' is presumably an index column written when the file was saved.\n",
    "triples_repodb = triples_repodb.drop(columns=['Unnamed: 0'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba5e2928",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Disease/pathway table; the joins below rely on its `disease_id` column.\n",
    "dis_path_direct = pd.read_csv('disease_pathway.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be28a4f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach the disease/pathway rows to each triple by matching `disease2` to `disease_id`.\n",
    "od_d_nd_path = triples_repodb.merge(dis_path_direct, how = \"inner\", left_on = \"disease2\", right_on = \"disease_id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "55b5fc3b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# After the inner join `disease2` always equals `disease_id`, so keep only one of them.\n",
    "od_d_nd_path = od_d_nd_path.drop(columns=['disease2'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2d3f708",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drug/gene table; joined below on its `drug_id` column.\n",
    "drug_gen = pd.read_csv('drug_gen.tsv', sep='\\t')\n",
    "drug_gen = drug_gen.drop([\"Unnamed: 0\"],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a0dcd9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename so the drug's gene column stays distinct from the `gene_id` column it is\n",
    "# compared against later.\n",
    "drug_gen = drug_gen.rename(columns={\"gene_id\": \"gene_id_target\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dfba018e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per (triple/pathway row, drug gene) combination.\n",
    "od_d_nd_path_target = od_d_nd_path.merge(drug_gen, left_on = \"drug\",right_on = \"drug_id\" , how = \"inner\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18787654",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `drug` equals `drug_id` after the inner join; keep `drug_id` only.\n",
    "od_d_nd_path_target = od_d_nd_path_target.drop([\"drug\"],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8dd7c4bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "od_d_nd_path_target = od_d_nd_path_target.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e61d4d31",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Disease/gene table; joined below on `disease_id`.\n",
    "dis_gen = pd.read_csv('dis_gen.tsv', sep='\\t')\n",
    "dis_gen = dis_gen.drop([\"Unnamed: 0\"],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7ab095a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per combination of the rows so far and the matching dis_gen rows.\n",
    "od_d_nd_path_target_gene = od_d_nd_path_target.merge(dis_gen, on = \"disease_id\", how=\"inner\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b9218233",
   "metadata": {},
   "outputs": [],
   "source": [
    "od_d_nd_path_target_gene = od_d_nd_path_target_gene.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ceaaff21",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag rows: 1 when `gene_id_target` equals `gene_id` (presumably the disease gene\n",
    "# from dis_gen.tsv -- confirm), 0 otherwise.\n",
    "od_d_nd_path_target_gene[\"Type of triples\"] = np.where(od_d_nd_path_target_gene[\"gene_id_target\"] == od_d_nd_path_target_gene[\"gene_id\"], 1, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2eb61c5a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the rows where both gene columns match.\n",
    "od_d_nd_path_target_gene[od_d_nd_path_target_gene[\"Type of triples\"] == 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16e612a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the rows whose two gene columns differ.\n",
    "# NOTE(review): this filters individual rows, not whole triples. A (drug, disease) pair\n",
    "# that matches on one gene combination still survives through its other, non-matching\n",
    "# combinations. Confirm that row-level filtering is what is intended here.\n",
    "triples_singen = od_d_nd_path_target_gene[od_d_nd_path_target_gene[\"Type of triples\"] == 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3be6ca2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display only: the result is not assigned, so `triples_singen` itself is unchanged.\n",
    "triples_singen.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c0c5378",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drug/gene/pathway table; joined below on `drug_id`.\n",
    "drug_gen_pw = pd.read_csv(\"drug_gen_pw.tsv\", sep='\\t')\n",
    "drug_gen_pw = drug_gen_pw.drop([\"Unnamed: 0\"],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ec60bfb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename so these columns do not collide with `gene_id` / `pathway_id` already present\n",
    "# in the triples frame.\n",
    "drug_gen_pw = drug_gen_pw.rename(columns={\"gene_id\": \"gene_id_target_pw\",\"pathway_id\":\"pathway_id_drug\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "adcad27c",
   "metadata": {},
   "outputs": [],
   "source": [
    "triples_singen_pws = triples_singen.merge(drug_gen_pw, on = \"drug_id\", how=\"inner\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf4b8039",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag rows: 1 when `pathway_id` (presumably the disease pathway from\n",
    "# disease_pathway.tsv -- confirm) equals the drug's pathway, 0 otherwise.\n",
    "triples_singen_pws[\"Type of triples_pws\"] = np.where(triples_singen_pws[\"pathway_id\"] == triples_singen_pws[\"pathway_id_drug\"], 1, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50c26976",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the full flagged table (both flag values) for inspection.\n",
    "triples_singen_pws.to_excel(\"triples_singen_pws.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e295a34",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows where both pathway columns match.\n",
    "triples_singen_pws_filter = triples_singen_pws[triples_singen_pws[\"Type of triples_pws\"] == 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba3056a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reduce to the three columns that identify a triple.\n",
    "triples_singen_pws_filter_final = triples_singen_pws_filter[[\"disease1\",\"drug_id\",\"disease_id\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10cbc38c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same rows, additionally keeping the matching pathway and the drug gene from drug_gen_pw.\n",
    "triples_final_repodb_gen_pw = triples_singen_pws_filter[[\"disease1\",\"drug_id\",\"disease_id\",\"pathway_id\",\"gene_id_target_pw\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "54cc6f1c",
   "metadata": {},
   "outputs": [],
   "source": [
    "triples_final_repodb_gen_pw.to_excel(\"debriop_repodb_pws_info.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd31e3b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distinct (disease1, drug_id, disease_id) triples.\n",
    "triples_singen_pws_filter_final_repo = triples_singen_pws_filter_final.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b91f1162",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of distinct repoDB triples (the previously saved run of this notebook showed 17).\n",
    "len(triples_singen_pws_filter_final_repo)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e9f0b78",
   "metadata": {},
   "outputs": [],
   "source": [
    "triples_singen_pws_filter_final_repo.to_excel(\"triples_singen_pws_filter_final_repo.xlsx\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "301a27b4",
   "metadata": {},
   "source": [
    "# CSBJ"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fee88359",
   "metadata": {},
   "outputs": [],
   "source": [
    "# CSBJ triples; the cells below use its `Original Condition CUI`, `New Condition CUI`\n",
    "# and `drug_id` columns.\n",
    "triplets_csbj = pd.read_excel(\"triplets_chembl_disnet.xlsx\",engine='openpyxl')\n",
    "triplets_csbj = triplets_csbj.drop([\"Unnamed: 0\"],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45875ab7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same join as in the repoDB section, with `New Condition CUI` in the role of `disease2`.\n",
    "od_d_nd_path_csbj = triplets_csbj.merge(dis_path_direct, how = \"inner\", left_on = \"New Condition CUI\", right_on = \"disease_id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5ac1e72e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `New Condition CUI` equals `disease_id` after the inner join; keep `disease_id` only.\n",
    "od_d_nd_path_csbj = od_d_nd_path_csbj.drop(columns=['New Condition CUI'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e0c7a5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Uses `drug_gen` as renamed in the repoDB section (`gene_id` -> `gene_id_target`).\n",
    "od_d_nd_path_target_csbj = od_d_nd_path_csbj.merge(drug_gen,on = \"drug_id\" , how = \"inner\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b819e18",
   "metadata": {},
   "outputs": [],
   "source": [
    "od_d_nd_path_target_gene_csbj = od_d_nd_path_target_csbj.merge(dis_gen, on = \"disease_id\", how=\"inner\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c7895254",
   "metadata": {},
   "outputs": [],
   "source": [
    "od_d_nd_path_target_gene_csbj = od_d_nd_path_target_gene_csbj.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b50ab1b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag rows: 1 when `gene_id_target` equals `gene_id`, 0 otherwise.\n",
    "od_d_nd_path_target_gene_csbj[\"Type of triples\"] = np.where(od_d_nd_path_target_gene_csbj[\"gene_id_target\"] == od_d_nd_path_target_gene_csbj[\"gene_id\"], 1, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9cacc04a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the rows whose two gene columns differ.\n",
    "# NOTE(review): row-level filter, same caveat as for `triples_singen` in the repoDB section.\n",
    "triples_singen_csbj = od_d_nd_path_target_gene_csbj[od_d_nd_path_target_gene_csbj[\"Type of triples\"] == 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "04940a5d",
   "metadata": {},
   "outputs": [],
   "source": [
    "triples_singen_csbj = triples_singen_csbj.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba2d8862",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Uses `drug_gen_pw` as renamed in the repoDB section.\n",
    "triples_singen_pws_csbj = triples_singen_csbj.merge(drug_gen_pw, on = \"drug_id\", how=\"inner\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a71b115c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag rows: 1 when `pathway_id` equals `pathway_id_drug`, 0 otherwise.\n",
    "triples_singen_pws_csbj[\"Type of triples_pws\"] = np.where(triples_singen_pws_csbj[\"pathway_id\"] == triples_singen_pws_csbj[\"pathway_id_drug\"], 1, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e4287678",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows where both pathway columns match.\n",
    "triples_singen_pws_filter_csbj = triples_singen_pws_csbj[triples_singen_pws_csbj[\"Type of triples_pws\"] == 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53abc83d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reduce to the three columns that identify a triple.\n",
    "triples_singen_pws_filter_final_csbj = triples_singen_pws_filter_csbj[[\"Original Condition CUI\",\"drug_id\",\"disease_id\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23792fa8",
   "metadata": {},
   "outputs": [],
   "source": [
    "triples_singen_pws_filter_final_csbj = triples_singen_pws_filter_final_csbj.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d61a1707",
   "metadata": {},
   "outputs": [],
   "source": [
    "triples_singen_pws_filter_final_csbj.to_excel(\"triples_csbj_drebiop.xlsx\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bd946bbb",
   "metadata": {},
   "source": [
    "# FINAL TRIPLES DREBIOP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fee7ca1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Align the repoDB column name with the CSBJ frame so the two can be stacked.\n",
    "triples_singen_pws_filter_final_repo = triples_singen_pws_filter_final_repo.rename(columns={\"disease1\": \"Original Condition CUI\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d98bdb9d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the repoDB and CSBJ triples.\n",
    "# NOTE(review): each input was de-duplicated on its own, but a triple present in both\n",
    "# sources will appear twice here, and the original row labels of both frames are kept.\n",
    "# Confirm whether a final drop_duplicates() / ignore_index=True is wanted.\n",
    "triples_drebiop_final = pd.concat([triples_singen_pws_filter_final_repo,triples_singen_pws_filter_final_csbj])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "94aa1fe8",
   "metadata": {},
   "outputs": [],
   "source": [
    "triples_drebiop_final.to_excel(\"triples_drebiop_final.xlsx\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}