{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RepoDB cases vs. RepoDB triplets (DISNET inputs)\n",
    "\n",
    "What this notebook does, top to bottom:\n",
    "\n",
    "1. Loads the RepoDB cases table and the RepoDB triplets table (`disease1`, `disease2`, `drug`).\n",
    "2. Inner-joins the cases with the triplets twice: once matching the case disease against `disease1` (`join_one`) and once against `disease2` (`join_two`).\n",
    "3. For the *other* disease of every matched triplet, looks up gene-level drug targets (section 1) and pathway associations (section 2).\n",
    "4. Writes the union of both joins to `all_triplets_repodb_union.tsv` (section 3).\n",
    "\n",
    "The notebook is meant to be run with **Restart Kernel -> Run All**. Outputs were cleared when the notebook was cleaned up; values displayed by the last saved run are quoted in the text so they can be compared after a re-run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from sqlalchemy import create_engine\n",
    "from sklearn import preprocessing\n",
    "import mysql.connector\n",
    "from pandas import DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Folder holding the DISNET input TSV files (relative to the working directory).\n",
    "DATA_DIR = './Data/Input/DISNET'\n",
    "\n",
    "# Columns on which a case is matched against a triplet.\n",
    "MATCH_KEYS = ['drug_id', 'disease_id']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The cases file carries a saved index column ('Unnamed: 0') that is not needed.\n",
    "cases_repodb = (\n",
    "    pd.read_csv(f'{DATA_DIR}/final_cases_repodb.tsv', sep='\\t')\n",
    "    .drop(columns=['Unnamed: 0'])\n",
    ")\n",
    "triplets_repodb = pd.read_csv(f'{DATA_DIR}/repodb_all_triples.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Each triplet is viewed twice, once with `disease1` and once with `disease2` renamed to `disease_id`, so that the cases can be matched against either disease of the triplet."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "triplets_repodb_one = (\n",
    "    triplets_repodb\n",
    "    .rename(columns={'disease1': 'disease_id', 'drug': 'drug_id'})\n",
    "    .drop_duplicates()\n",
    ")\n",
    "triplets_repodb_two = (\n",
    "    triplets_repodb\n",
    "    .rename(columns={'disease2': 'disease_id', 'drug': 'drug_id'})\n",
    "    .drop_duplicates()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "join_one = cases_repodb.merge(triplets_repodb_one, how='inner', on=MATCH_KEYS)\n",
    "\n",
    "# NOTE(review): only join_two is de-duplicated here; join_one is left as merged.\n",
    "# Kept as in the original notebook - confirm the asymmetry is intended.\n",
    "join_two = (\n",
    "    cases_repodb\n",
    "    .merge(triplets_repodb_two, how='inner', on=MATCH_KEYS)\n",
    "    .drop_duplicates()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Drug / other-disease pairs\n",
    "\n",
    "**NOTE(review):** the original notebook used `joinone_csbj_diseases` and `jointwo_csbj_diseases` without defining them in any cell, so it failed with `NameError` on a fresh kernel. They are rebuilt below from `join_one` / `join_two`: the case-level `disease_id`, `pathway_id` and `gene_id` columns are dropped and the remaining triplet disease column is renamed to `disease_id`, mirroring the drop/rename steps used in section 2.2. Please confirm this matches the original definition (in particular the `drop_duplicates()`); if it does not, the figures quoted in sections 1 and 2.1 will differ after a re-run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def drug_disease_pairs(joined, other_disease_col):\n",
    "    \"\"\"Return the unique rows of a case/triplet join keyed by the other disease.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    joined : pandas.DataFrame\n",
    "        `join_one` or `join_two`.\n",
    "    other_disease_col : str\n",
    "        Triplet column that was NOT used as the merge key\n",
    "        ('disease2' for `join_one`, 'disease1' for `join_two`).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        `joined` without `disease_id`, `pathway_id` and `gene_id`, with\n",
    "        `other_disease_col` renamed to `disease_id`, de-duplicated.\n",
    "    \"\"\"\n",
    "    return (\n",
    "        joined\n",
    "        .drop(columns=['disease_id', 'pathway_id', 'gene_id'])\n",
    "        .rename(columns={other_disease_col: 'disease_id'})\n",
    "        .drop_duplicates()\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "joinone_csbj_diseases = drug_disease_pairs(join_one, 'disease2')\n",
    "jointwo_csbj_diseases = drug_disease_pairs(join_two, 'disease1')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. DRUG - GENE - TARGET"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_gen = (\n",
    "    pd.read_csv(f'{DATA_DIR}/dis_genes.tsv', sep='\\t')\n",
    "    .drop(columns=['Unnamed: 0'])\n",
    ")\n",
    "drug_gen = (\n",
    "    pd.read_csv(f'{DATA_DIR}/drug_gen.tsv', sep='\\t')\n",
    "    .drop(columns=['Unnamed: 0'])\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Mean `score` over the whole disease-gene table, as a baseline. The last saved run displayed `0.13747265487982685`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_gen['score'].mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Join one"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Genes of the other disease, then keep only genes that are also linked to the drug.\n",
    "gen_dise_join = joinone_csbj_diseases.merge(dis_gen, how='inner', on='disease_id')\n",
    "gen_dise_join_dru = gen_dise_join.merge(drug_gen, how='inner', on=['gene_id', 'drug_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gen_dise_join_dru.to_csv('score_gdas_one_repodb.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Mean `score` and number of distinct diseases for join one. The last saved run displayed `0.1446285714285714` and `24`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gen_dise_join_dru['score'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(gen_dise_join_dru['disease_id'].unique())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Join two"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gen_dise_join_two = jointwo_csbj_diseases.merge(dis_gen, how='inner', on='disease_id')\n",
    "gen_dise_join_dru_two = gen_dise_join_two.merge(drug_gen, how='inner', on=['gene_id', 'drug_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gen_dise_join_dru_two.to_csv('score_gdas_two_repodb.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Mean `score` and number of distinct diseases for join two. The last saved run displayed `0.18121951219512178` and `21`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gen_dise_join_dru_two['score'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(gen_dise_join_dru_two['disease_id'].unique())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. PATHWAYS"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.1 Pathways direct"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_path_direct = pd.read_csv(f'{DATA_DIR}/disease_pathway.tsv', sep='\\t')\n",
    "\n",
    "# Cases reduced to the columns needed to match on (disease_id, pathway_id).\n",
    "cases_repodb_filter = cases_repodb.drop(columns=['drug_id', 'gene_id'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Join one"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "joinone_csbj_diseases_fil = joinone_csbj_diseases.drop(columns=['drug_id'])\n",
    "direct_dise_pw = (\n",
    "    joinone_csbj_diseases_fil\n",
    "    .merge(dis_path_direct, how='inner', on='disease_id')\n",
    "    .drop_duplicates()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Number of distinct diseases with a direct pathway entry. The last saved run displayed `8`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(direct_dise_pw['disease_id'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pws_direct_ori_new = direct_dise_pw.merge(\n",
    "    cases_repodb_filter, how='inner', on=['disease_id', 'pathway_id']\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Join two"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "jointwo_csbj_diseases_fil = jointwo_csbj_diseases.drop(columns=['drug_id'])\n",
    "direct_dise_pw_two = (\n",
    "    jointwo_csbj_diseases_fil\n",
    "    .merge(dis_path_direct, how='inner', on='disease_id')\n",
    "    .drop_duplicates()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Number of distinct diseases with a direct pathway entry. The last saved run displayed `4`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(direct_dise_pw_two['disease_id'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pws_direct_ori_new_two = direct_dise_pw_two.merge(\n",
    "    cases_repodb_filter, how='inner', on=['disease_id', 'pathway_id']\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.2 Pathways via genes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_gen_pw = (\n",
    "    pd.read_csv(f'{DATA_DIR}/dis_gen_pw.tsv', sep='\\t')\n",
    "    .drop(columns=['Unnamed: 0'])\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Join one"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the case pathway, but key the rows by the other disease of the triplet.\n",
    "join_one_filter = (\n",
    "    join_one\n",
    "    .drop(columns=['disease_id', 'drug_id', 'gene_id'])\n",
    "    .rename(columns={'disease2': 'disease_id'})\n",
    ")\n",
    "pws_via_gen = join_one_filter.merge(\n",
    "    dis_gen_pw, how='inner', on=['disease_id', 'pathway_id']\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Number of distinct diseases reached through a gene-pathway link. The last saved run displayed `29`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(pws_via_gen['disease_id'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pws_via_gen_final = pws_via_gen.merge(\n",
    "    cases_repodb, how='inner', on=['disease_id', 'pathway_id', 'gene_id']\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Join two"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "join_two_filter = (\n",
    "    join_two\n",
    "    .drop(columns=['disease_id', 'drug_id', 'gene_id'])\n",
    "    .rename(columns={'disease1': 'disease_id'})\n",
    ")\n",
    "pws_via_gen_two = join_two_filter.merge(\n",
    "    dis_gen_pw, how='inner', on=['disease_id', 'pathway_id']\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. FINAL DATASET\n",
    "\n",
    "Both joins are stacked into one table. The triplet disease that was not used as merge key becomes `disease_no_PwB`, and the matched case disease (`disease_id`) becomes `disease_PwB`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "join_one_re = join_one.rename(columns={'disease2': 'disease_no_PwB'})\n",
    "join_two_re = join_two.rename(columns={'disease1': 'disease_no_PwB'})\n",
    "\n",
    "all_triplets_repodb = (\n",
    "    pd.concat([join_one_re, join_two_re])\n",
    "    .rename(columns={'disease_id': 'disease_PwB'})\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_triplets_repodb.to_csv('all_triplets_repodb_union.tsv', sep='\\t')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}