{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): only `pd` is referenced by the cells of this notebook; the other\n",
    "# imports look unused here - confirm before pruning them.\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from sqlalchemy import create_engine\n",
    "from sklearn import preprocessing\n",
    "import mysql.connector\n",
    "from pandas import DataFrame"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. DATA\n",
    "\n",
    "Load the input tables from `./Data/Input/`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tab-separated table; later merged on the disease_id / drug_id columns.\n",
    "duples_repodb = pd.read_csv(\"./Data/Input/Drug Repurposing/repoDB_all_disdru.tsv\", sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read, drop the stray \"Unnamed: 0\" column and rename in a single chain, so that\n",
    "# re-running this cell always starts from the file and never hits a KeyError on\n",
    "# an already-cleaned frame.\n",
    "# NOTE(review): the file has a .tsv extension but is read with the default comma\n",
    "# separator (no `sep` argument) - confirm the delimiter.\n",
    "duples_csbj = (\n",
    "    pd.read_csv(\"./Data/Input/Drug Repurposing/duplas_CSBJ.tsv\")\n",
    "    .drop(columns=[\"Unnamed: 0\"])\n",
    "    .rename(columns={\"Disease CUI\": \"disease_id\"})\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
disease_idpathway_id
0C0020538WP554
1C0018799WP1544
2C0018799WP1528
3C0027947WP229
4C0013369WP229
.........
659C0268274WP4153
660C0085131WP4153
661C0036161WP4153
662C0268275WP4153
663C0162666WP4236
\n", "

664 rows × 2 columns

\n", "
" ],
      "text/plain": [
       " disease_id pathway_id\n",
       "0 C0020538 WP554\n",
       "1 C0018799 WP1544\n",
       "2 C0018799 WP1528\n",
       "3 C0027947 WP229\n",
       "4 C0013369 WP229\n",
       ".. ... ...\n",
       "659 C0268274 WP4153\n",
       "660 C0085131 WP4153\n",
       "661 C0036161 WP4153\n",
       "662 C0268275 WP4153\n",
       "663 C0162666 WP4236\n",
       "\n",
       "[664 rows x 2 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dis_path_direct = pd.read_csv('./Data/Input/DISNET/disease_pathway.tsv', sep='\\t')\n",
    "dis_path_direct"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read and drop the stray \"Unnamed: 0\" column in one expression so the cell can be\n",
    "# re-run without a KeyError on an already-cleaned frame.\n",
    "drug_gen = pd.read_csv('./Data/Input/DISNET/drug_gen.tsv', sep='\\t').drop(columns=[\"Unnamed: 0\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same read-and-drop chain as above, kept in one expression so the cell is re-runnable.\n",
    "# NOTE(review): `dis_gen` is not referenced by any other cell of this notebook;\n",
    "# the joins below use `dis_gen_sinfil` - confirm whether this load is still needed.\n",
    "dis_gen = pd.read_csv('./Data/Input/DISNET/dis_genes_gda.tsv', sep='\\t').drop(columns=[\"Unnamed: 0\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read and drop the stray \"Unnamed: 0\" column in one expression so the cell can be\n",
    "# re-run without a KeyError on an already-cleaned frame.\n",
    "dis_gen_sinfil = pd.read_csv('./Data/Input/DISNET/dis_genes.tsv', sep='\\t').drop(columns=[\"Unnamed: 0\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### JOIN REPODB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inner join on gene_id: keeps only rows whose gene appears in both tables.\n",
    "dis_drug_gen_sinfil = dis_gen_sinfil.merge(drug_gen, how=\"inner\", on=\"gene_id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "# FIXME: `dis_drug_path_fil` is never assigned in this notebook, so this cell raises\n",
    "# NameError on a fresh kernel (Restart & Run All). Load or build it before this cell.\n",
    "duplas_target_repodb = dis_drug_path_fil.merge(\n",
    "    dis_drug_gen_sinfil, how=\"inner\", on=[\"disease_id\", \"drug_id\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the (disease_id, drug_id) pairs present in both frames.\n",
    "cases_repodb_target = duples_repodb.merge(\n",
    "    duplas_target_repodb, how=\"inner\", on=[\"disease_id\", \"drug_id\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Written to the current working directory; to_csv's default index=True also\n",
    "# writes the row index as a leading unnamed column.\n",
    "cases_repodb_target.to_csv(\"cases_repodb_target.tsv\", sep=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.27445783132529894"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cases_repodb_target[\"score\"].mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### JOIN CSBJ"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "# FIXME: `dis_drug_path_csbj_fil` is never assigned in this notebook, so this cell\n",
    "# raises NameError on a fresh kernel (Restart & Run All). Load or build it before\n",
    "# this cell.\n",
    "duplas_target_csbj = dis_drug_path_csbj_fil.merge(\n",
    "    dis_drug_gen_sinfil, how=\"inner\", on=[\"disease_id\", \"drug_id\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the (disease_id, drug_id) pairs present in both frames.\n",
    "cases_csbj_target = duples_csbj.merge(\n",
    "    duplas_target_csbj, how=\"inner\", on=[\"disease_id\", \"drug_id\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Written to the current working directory; to_csv's default index=True also\n",
    "# writes the row index as a leading unnamed column.\n",
    "cases_csbj_target.to_csv(\"cases_csbj_target.tsv\", sep=\"\\t\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}