{ "cells": [
 { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from sqlalchemy import create_engine\n", "from sklearn import preprocessing\n", "import mysql.connector\n", "from pandas import DataFrame" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### 1. Data" ] },
 { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# repoDB table (tab-separated); later cells join it on disease_id / drug_id.\n", "duples_repodb = pd.read_csv(\n", "    \"./Data/Input/Drug Repurposing/repoDB_all_disdru.tsv\",\n", "    sep=\"\\t\",\n", ")" ] },
 { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# NOTE(review): parsed with the default comma delimiter although the file is named .tsv -- confirm.\n", "duples_csbj = pd.read_csv(\"./Data/Input/Drug Repurposing/duplas_CSBJ.tsv\")" ] },
 { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Discard the unnamed leading column.\n", "duples_csbj = duples_csbj.drop(columns=[\"Unnamed: 0\"])" ] },
 { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Use the disease key name that the joins below expect.\n", "duples_csbj = duples_csbj.rename({\"Disease CUI\": \"disease_id\"}, axis=\"columns\")" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Disease-pathway table; joined on disease_id below.\n", "dis_path_direct = pd.read_csv(\"./Data/Input/DISNET/disease_pathway.tsv\", sep=\"\\t\")" ] },
 { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Drug-gene table; joined on gene_id below.\n", "drug_gen = pd.read_csv(\"./Data/Input/DISNET/drug_gen.tsv\", sep=\"\\t\")" ] },
 { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "drug_gen = drug_gen.drop(columns=[\"Unnamed: 0\"])" ] },
 { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# Disease-gene table; joined on gene_id below.\n", "dis_gen = pd.read_csv(\"./Data/Input/DISNET/dis_genes_gda.tsv\", sep=\"\\t\")" ] },
 { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "dis_gen = dis_gen.drop(columns=[\"Unnamed: 0\"])" ] },
 { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": 
[], "source": [ "dis_gen_sinfil = pd.read_csv(\"./Data/Input/DISNET/dis_genes.tsv\", sep=\"\\t\")" ] },
 { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "dis_gen_sinfil = dis_gen_sinfil.drop(columns=[\"Unnamed: 0\"])" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "#### JOIN REPODB" ] },
 { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "# Attach pathways to every repoDB row through its disease.\n", "dis_drug_path = pd.merge(duples_repodb, dis_path_direct, how=\"inner\", on=\"disease_id\")" ] },
 { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "dis_drug_path_fil = dis_drug_path.drop(columns=[\"pathway_id\"])" ] },
 { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "dis_drug_path_fil = dis_drug_path_fil.drop_duplicates()" ] },
 { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "# Pair diseases and drugs that share a gene_id.\n", "dis_drug_gen = pd.merge(dis_gen, drug_gen, how=\"inner\", on=\"gene_id\")" ] },
 { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "dis_drug_gen = dis_drug_gen.drop_duplicates()" ] },
 { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "# Outer join: keep pairs found through pathways, through genes, or both.\n", "dis_drug_original = pd.merge(\n", "    dis_drug_path_fil,\n", "    dis_drug_gen,\n", "    how=\"outer\",\n", "    on=[\"disease_id\", \"drug_id\"],\n", ")" ] },
 { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "# Reduce to the distinct (disease_id, drug_id) pairs.\n", "dis_drug = (\n", "    dis_drug_original.drop_duplicates()\n", "    .loc[:, [\"disease_id\", \"drug_id\"]]\n", "    .drop_duplicates()\n", ")" ] },
 { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [], "source": [ "# repoDB rows whose (disease_id, drug_id) pair appears in dis_drug.\n", "cases_repodb_gda = pd.merge(\n", "    duples_repodb,\n", "    dis_drug,\n", "    how=\"inner\",\n", "    on=[\"disease_id\", \"drug_id\"],\n", ")" ] },
 { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [], "source": [ "cases_repodb_gda.to_csv(\"cases_repodb_gda.tsv\", sep=\"\\t\")" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "#### JOIN CSBJ" ] },
 { "cell_type": "code", 
"execution_count": 56, "metadata": {}, "outputs": [], "source": [ "# Attach pathways to every CSBJ row through its disease.\n", "dis_drug_path_csbj = pd.merge(duples_csbj, dis_path_direct, how=\"inner\", on=\"disease_id\")" ] },
 { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "dis_drug_path_csbj_fil = dis_drug_path_csbj.drop(columns=[\"pathway_id\"])" ] },
 { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "# Outer join: keep pairs found through pathways, through genes, or both.\n", "dis_drug_original_csbj = pd.merge(\n", "    dis_drug_path_csbj_fil,\n", "    dis_drug_gen,\n", "    how=\"outer\",\n", "    on=[\"disease_id\", \"drug_id\"],\n", ")" ] },
 { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "# Reduce to the distinct (disease_id, drug_id) pairs.\n", "dis_drug_csbj = (\n", "    dis_drug_original_csbj.drop_duplicates()\n", "    .loc[:, [\"disease_id\", \"drug_id\"]]\n", "    .drop_duplicates()\n", ")" ] },
 { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "# CSBJ rows whose (disease_id, drug_id) pair appears in dis_drug_csbj.\n", "cases_csbj = pd.merge(\n", "    duples_csbj,\n", "    dis_drug_csbj,\n", "    how=\"inner\",\n", "    on=[\"disease_id\", \"drug_id\"],\n", ")" ] },
 { "cell_type": "code", "execution_count": 115, "metadata": {}, "outputs": [], "source": [ "# NOTE(review): section 2 reads ./Data/Input/DISNET/cases_csbj_gda.tsv, not this file name -- confirm which file is intended.\n", "cases_csbj.to_csv(\"cases_csbj.tsv\", sep=\"\\t\")" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### 2. 
Join pathway-disease with pathway-drug" ] },
 { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "cases_repodb = pd.read_csv(\"./Data/Input/DISNET/cases_repodb_gda.tsv\", sep=\"\\t\")" ] },
 { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "55" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of distinct diseases (missing values, if any, count as one).\n", "cases_repodb[\"disease_id\"].nunique(dropna=False)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cases_repodb[\"disease_id\"].nunique(dropna=False)" ] },
 { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "# Second name for the repoDB disease-drug-pathway frame (same object, no copy).\n", "dis_drug_path_num = dis_drug_path" ] },
 { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "51" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dis_drug_path_num[\"disease_id\"].nunique(dropna=False)" ] },
 { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "# Drug-gene-pathway table; joined on pathway_id / drug_id below.\n", "drug_gen_pw = pd.read_csv(\"./Data/Input/DISNET/drug_gen_pw.tsv\", sep=\"\\t\")" ] },
 { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "drug_gen_pw = drug_gen_pw.drop(columns=[\"Unnamed: 0\"])" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## 1º Join pws-based DR cases with their pathways" ] },
 { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "pw_based_DR_pw = pd.merge(\n", "    cases_repodb,\n", "    dis_drug_path_num,\n", "    how=\"inner\",\n", "    on=[\"disease_id\", \"drug_id\"],\n", ")" ] },
 { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "51" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pw_based_DR_pw[\"disease_id\"].nunique(dropna=False)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, 
"outputs": [], "source": [ "## 2º Join with drug and pathway-target" ] },
 { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "# Keep only pathways that are also linked to the drug in drug_gen_pw.\n", "final = pw_based_DR_pw.merge(drug_gen_pw, how=\"inner\", on=[\"pathway_id\", \"drug_id\"])" ] },
 { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "final_cases_repodb = final.drop_duplicates()" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Per-pair row counts get their own name. Overwriting dis_drug_path_num here would replace its\n", "# pathway_id values with counts and break the pathway_id joins in the CSBJ cells below\n", "# when the notebook is run top to bottom.\n", "dis_drug_path_counts = dis_drug_path_num.groupby([\"disease_id\", \"drug_id\"]).count()" ] },
 { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "final_cases_repodb.to_csv(\"final_cases_repodb.tsv\", sep=\"\\t\")" ] },
 { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "final_cases_repodb = pd.read_csv(\"./Data/Input/DISNET/final_cases_repodb.tsv\", sep=\"\\t\")" ] },
 { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "cases_csbj = pd.read_csv(\"./Data/Input/DISNET/cases_csbj_gda.tsv\", sep=\"\\t\")" ] },
 { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "cases_csbj = cases_csbj.drop(columns=[\"Unnamed: 0\"])" ] },
 { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "# NOTE(review): dis_drug_path_num is built from the repoDB pairs, not from dis_drug_path_csbj -- confirm this is intended.\n", "pw_based_DR_pw_csbj = cases_csbj.merge(dis_drug_path_num, how=\"inner\", on=[\"disease_id\", \"drug_id\"])" ] },
 { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "# Keep only pathways that are also linked to the drug in drug_gen_pw.\n", "final_csbj = pw_based_DR_pw_csbj.merge(drug_gen_pw, how=\"inner\", on=[\"pathway_id\", \"drug_id\"])" ] },
 { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "final_csbj.to_csv(\"final_cases_csbj.tsv\", sep=\"\\t\")" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### 3. 
Number of pathways shared between disease and drug " ] },
 { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "cases_csbj = pd.read_csv(\"./Data/Input/DISNET/final_cases_csbj.tsv\", sep=\"\\t\")" ] },
 { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "cases_csbj = cases_csbj.drop(columns=[\"Unnamed: 0\"])" ] },
 { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Count pathways per (disease_id, drug_id) pair. gene_id is dropped and the rows de-duplicated first,\n", "# as in the repoDB cells below, so a pathway is not counted once per matching gene row.\n", "# NOTE(review): assumes final_cases_csbj.tsv carries a gene_id column like the repoDB file -- confirm.\n", "cases_csbj_num = (\n", "    cases_csbj.drop(columns=[\"gene_id\"])\n", "    .drop_duplicates()\n", "    .groupby([\"disease_id\", \"drug_id\"])\n", "    .count()\n", ")" ] },
 { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "cases_repodb = pd.read_csv(\"./Data/Input/DISNET/final_cases_repodb.tsv\", sep=\"\\t\")" ] },
 { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "cases_repodb = cases_repodb.drop(columns=[\"Unnamed: 0\"])" ] },
 { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Drop the gene column so the rows that remain describe pathways only.\n", "cases_repodb_pw = cases_repodb.drop(columns=[\"gene_id\"])" ] },
 { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "cases_repodb_pw = cases_repodb_pw.drop_duplicates()" ] },
 { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "# Rows per (disease_id, drug_id) pair after de-duplication.\n", "cases_repodb_pw_num = cases_repodb_pw.groupby([\"disease_id\", \"drug_id\"]).count()" ] },
 { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "cases_repodb_pw_num.to_csv(\"cases_repodb_pw_num.tsv\", sep=\"\\t\")" ] },
 { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "cases_repodb_pw_num = pd.read_csv(\"./Data/Input/DISNET/cases_repodb_pw_num.tsv\", sep=\"\\t\")" ] },
 { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.4565217391304348" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cases_repodb_pw_num[\"pathway_id\"].mean()" ] },
 { "cell_type": 
"code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pathway_id
count46.000000
mean1.456522
std0.503610
min1.000000
25%1.000000
50%1.000000
75%2.000000
max2.000000
\n", "
" ], "text/plain": [ " pathway_id\n", "count 46.000000\n", "mean 1.456522\n", "std 0.503610\n", "min 1.000000\n", "25% 1.000000\n", "50% 1.000000\n", "75% 2.000000\n", "max 2.000000" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cases_repodb_pw_num.describe()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }