{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# repoDB target cases: drug - gene - disease score table\n",
    "\n",
    "This notebook:\n",
    "\n",
    "1. loads the case table (`cases_repodb_target.tsv`) and the repoDB triples (`repodb_all_triples.tsv`),\n",
    "2. keeps the cases whose (`drug_id`, `disease_id`) pair appears in the triples, matching the case disease against either `disease1` or `disease2`,\n",
    "3. joins the result with the disease-gene table (`dis_genes.tsv`) and the drug-gene table (`drug_gen.tsv`),\n",
    "4. writes the final table to `score_gdas_repodb_target_final.tsv`.\n",
    "\n",
    "All input files are read from the current working directory. Run with **Restart Kernel -> Run All**."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from sqlalchemy import create_engine\n",
    "from sklearn import preprocessing\n",
    "import mysql.connector\n",
    "from pandas import DataFrame"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load data\n",
    "\n",
    "Both TSV files are tab-separated. The case table carries a leftover `Unnamed: 0` index column, which is dropped right after reading.\n",
    "\n",
    "Reference values from the last saved run: 34 distinct `disease_id` values in the case table and 550 distinct `disease1` values in the triples table."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cases_repodb = (\n",
    "    pd.read_csv(\"cases_repodb_target.tsv\", sep='\\t')\n",
    "    .drop(columns=[\"Unnamed: 0\"])\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(cases_repodb[\"disease_id\"].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "triplets_repodb = pd.read_csv('repodb_all_triples.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(triplets_repodb[\"disease1\"].unique())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Match cases against the triples\n",
    "\n",
    "Each triple has two disease columns (`disease1`, `disease2`) and a `drug` column. Two views of the triples are built, one per disease column, each renamed so that it can be inner-joined with the case table on (`drug_id`, `disease_id`).\n",
    "\n",
    "Reference values from the last saved run: 27 distinct diseases after the `disease1` join and 32 after the `disease2` join."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# View 1: the case disease is matched against `disease1`.\n",
    "triplets_repodb_one = (\n",
    "    triplets_repodb\n",
    "    .rename(columns={\"disease1\": \"disease_id\", \"drug\": \"drug_id\"})\n",
    "    .drop_duplicates()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# View 2: the case disease is matched against `disease2`.\n",
    "triplets_repodb_two = (\n",
    "    triplets_repodb\n",
    "    .rename(columns={\"disease2\": \"disease_id\", \"drug\": \"drug_id\"})\n",
    "    .drop_duplicates()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "join_one = (\n",
    "    cases_repodb\n",
    "    .merge(triplets_repodb_one, how=\"inner\", on=[\"drug_id\", \"disease_id\"])\n",
    "    .drop_duplicates()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(join_one[\"disease_id\"].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "join_two = (\n",
    "    cases_repodb\n",
    "    .merge(triplets_repodb_two, how=\"inner\", on=[\"drug_id\", \"disease_id\"])\n",
    "    .drop_duplicates()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(join_two[\"disease_id\"].unique())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Combine both joins\n",
    "\n",
    "**FIXME (review):** the saved version of this notebook concatenated `joinone_csbj_diseases` and `jointwo_csbj_diseases`. Neither name is defined anywhere in the notebook (they were left over in the kernel from cells that no longer exist), so a fresh run failed with `NameError`. The cell below concatenates `join_one` and `join_two` instead.\n",
    "\n",
    "This is an assumption that must be confirmed: the removed cells may have filtered the joins or added columns (for example `disease_new`, which later merges rely on). The check after the concatenation fails early with a readable message if a required column is missing.\n",
    "\n",
    "Reference value from the last saved run (computed from the untracked variables): 32 distinct diseases in the combined table."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "triples_repo_all = pd.concat([join_one, join_two]).drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every column used as a merge key further down must already be present here.\n",
    "required_columns = [\"drug_id\", \"disease_id\", \"gene_id\", \"score\", \"disease_new\"]\n",
    "missing_columns = [col for col in required_columns if col not in triples_repo_all.columns]\n",
    "assert not missing_columns, f\"triples_repo_all is missing columns needed downstream: {missing_columns}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(triples_repo_all[\"disease_id\"].unique())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. DRUG - GENE - TARGET\n",
    "\n",
    "The combined cases are inner-joined with the disease-gene table on (`disease_id`, `gene_id`, `score`) and then with the drug-gene table on (`gene_id`, `drug_id`), so only rows present in both tables remain.\n",
    "\n",
    "Reference values from the last saved run: mean `score` of 0.13747265487982685 over the whole disease-gene table and 0.21210059171595477 over the final table."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_gen = (\n",
    "    pd.read_csv('dis_genes.tsv', sep='\\t')\n",
    "    .drop(columns=[\"Unnamed: 0\"])\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_gen[\"score\"].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gen_dise_join = triples_repo_all.merge(\n",
    "    dis_gen,\n",
    "    how=\"inner\",\n",
    "    on=[\"disease_id\", \"gene_id\", \"score\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug_gen = (\n",
    "    pd.read_csv('drug_gen.tsv', sep='\\t')\n",
    "    .drop(columns=[\"Unnamed: 0\"])\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gen_dise_join_dru = gen_dise_join.merge(\n",
    "    drug_gen,\n",
    "    how=\"inner\",\n",
    "    on=[\"gene_id\", \"drug_id\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Join back onto the combined case table; columns that are not merge keys and\n",
    "# exist on both sides come out with pandas' default _x / _y suffixes.\n",
    "score_gdas_repodb_target = (\n",
    "    gen_dise_join_dru\n",
    "    .merge(\n",
    "        triples_repo_all,\n",
    "        how=\"inner\",\n",
    "        on=[\"drug_id\", \"disease_id\", \"score\", \"gene_id\", \"disease_new\"],\n",
    "    )\n",
    "    .drop_duplicates()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "score_gdas_repodb_target[\"score\"].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The index is written as the first column, as in the saved version of this notebook.\n",
    "score_gdas_repodb_target.to_csv(\"score_gdas_repodb_target_final.tsv\", sep='\\t')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}