{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from sqlalchemy import create_engine\n",
    "from sklearn import preprocessing\n",
    "import mysql.connector\n",
    "from pandas import DataFrame"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "cases_repodb = pd.read_csv(\"cases_repodb_target.tsv\", sep='\\t')\n",
    "cases_repodb = cases_repodb.drop([\"Unnamed: 0\"],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "34"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(cases_repodb[\"disease_id\"].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "triplets_repodb = pd.read_csv('repodb_all_triples.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "550"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(triplets_repodb[\"disease1\"].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "triplets_repodb_one = triplets_repodb.rename(columns={\"disease1\": \"disease_id\",\"drug\":\"drug_id\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "triplets_repodb_one = triplets_repodb_one.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "triplets_repodb_two = triplets_repodb.rename(columns={\"disease2\": \"disease_id\",\"drug\":\"drug_id\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "triplets_repodb_two = triplets_repodb_two.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "join_one = cases_repodb.merge(triplets_repodb_one,how = \"inner\",on = [\"drug_id\",\"disease_id\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "27"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(join_one[\"disease_id\"].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "join_two = cases_repodb.merge(triplets_repodb_two,how = \"inner\",on = [\"drug_id\",\"disease_id\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "join_two = join_two.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "32"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(join_two[\"disease_id\"].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "triples_repo_all = pd.concat([joinone_csbj_diseases,jointwo_csbj_diseases])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "triples_repo_all = triples_repo_all.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "32"
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(triples_repo_all[\"disease_id\"].unique())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. DRUG - GENE - TARGET"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_gen = pd.read_csv('dis_genes.tsv', sep='\\t')\n",
    "dis_gen = dis_gen.drop([\"Unnamed: 0\"],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.13747265487982685"
      ]
     },
     "execution_count": 117,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dis_gen[\"score\"].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "gen_dise_join = triples_repo_all.merge(dis_gen,how = \"inner\",on = [\"disease_id\",\"gene_id\",\"score\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug_gen = pd.read_csv('drug_gen.tsv', sep='\\t')\n",
    "drug_gen = drug_gen.drop([\"Unnamed: 0\"],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "gen_dise_join_dru = gen_dise_join.merge(drug_gen,how = \"inner\",on = [\"gene_id\",\"drug_id\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [],
   "source": [
    "score_gdas_repodb_target = gen_dise_join_dru.merge(triples_repo_all ,how = \"inner\",on = [\"drug_id\",\"disease_id\",\"score\",\"gene_id\",\"disease_new\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [],
   "source": [
    "score_gdas_repodb_target = score_gdas_repodb_target.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.21210059171595477"
      ]
     },
     "execution_count": 134,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "score_gdas_repodb_target[\"score\"].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [],
   "source": [
    "score_gdas_repodb_target.to_csv(\"score_gdas_repodb_target_final.tsv\", sep='\\t')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}