Commit e4456ac6 authored by Belen Otero Carrasco

code

parent 50e8fe6e
This diff is collapsed.
This diff is collapsed.
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from sqlalchemy import create_engine\n",
"from sklearn import preprocessing\n",
"import mysql.connector\n",
"from pandas import DataFrame"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1. DATA "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"duples_repodb = pd.read_csv(\"./Data/Input/Drug Repurposing/repoDB_all_disdru.tsv\", sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"duples_csbj = pd.read_csv(\"./Data/Input/Drug Repurposing/duplas_CSBJ.tsv\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"duples_csbj = duples_csbj.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"duples_csbj = duples_csbj.rename(columns={\"Disease CUI\": \"disease_id\"})"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>disease_id</th>\n",
" <th>pathway_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>C0020538</td>\n",
" <td>WP554</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>C0018799</td>\n",
" <td>WP1544</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>C0018799</td>\n",
" <td>WP1528</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>C0027947</td>\n",
" <td>WP229</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>C0013369</td>\n",
" <td>WP229</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>659</th>\n",
" <td>C0268274</td>\n",
" <td>WP4153</td>\n",
" </tr>\n",
" <tr>\n",
" <th>660</th>\n",
" <td>C0085131</td>\n",
" <td>WP4153</td>\n",
" </tr>\n",
" <tr>\n",
" <th>661</th>\n",
" <td>C0036161</td>\n",
" <td>WP4153</td>\n",
" </tr>\n",
" <tr>\n",
" <th>662</th>\n",
" <td>C0268275</td>\n",
" <td>WP4153</td>\n",
" </tr>\n",
" <tr>\n",
" <th>663</th>\n",
" <td>C0162666</td>\n",
" <td>WP4236</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>664 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" disease_id pathway_id\n",
"0 C0020538 WP554\n",
"1 C0018799 WP1544\n",
"2 C0018799 WP1528\n",
"3 C0027947 WP229\n",
"4 C0013369 WP229\n",
".. ... ...\n",
"659 C0268274 WP4153\n",
"660 C0085131 WP4153\n",
"661 C0036161 WP4153\n",
"662 C0268275 WP4153\n",
"663 C0162666 WP4236\n",
"\n",
"[664 rows x 2 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dis_path_direct = pd.read_csv('./Data/Input/DISNET/disease_pathway.tsv', sep='\\t')\n",
"dis_path_direct"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"drug_gen = pd.read_csv('./Data/Input/DISNET/drug_gen.tsv', sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"drug_gen = drug_gen.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"dis_gen = pd.read_csv('./Data/Input/DISNET/dis_genes_gda.tsv', sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"dis_gen = dis_gen.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"dis_gen_sinfil = pd.read_csv('./Data/Input/DISNET/dis_genes.tsv', sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"dis_gen_sinfil = dis_gen_sinfil.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### JOIN REPODB"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"dis_drug_gen_sinfil = dis_gen_sinfil.merge(drug_gen, how =\"inner\", on = \"gene_id\")"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"duplas_target_repodb = dis_drug_path_fil.merge(dis_drug_gen_sinfil, how = \"inner\", on =[\"disease_id\",\"drug_id\"])"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"cases_repodb_target = duples_repodb.merge(duplas_target_repodb,how = \"inner\",on = [\"disease_id\",\"drug_id\"])"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"cases_repodb_target.to_csv(\"cases_repodb_target.tsv\", sep =\"\\t\")"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.27445783132529894"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cases_repodb_target[\"score\"].mean()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### JOIN CSBJ"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"duplas_target_csbj = dis_drug_path_csbj_fil.merge(dis_drug_gen_sinfil, how = \"inner\", on =[\"disease_id\",\"drug_id\"])"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"cases_csbj_target = duples_csbj.merge(duplas_target_csbj,how = \"inner\",on = [\"disease_id\",\"drug_id\"])"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"cases_csbj_target.to_csv(\"cases_csbj_target.tsv\", sep =\"\\t\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from sqlalchemy import create_engine\n",
"from sklearn import preprocessing\n",
"import mysql.connector\n",
"from pandas import DataFrame"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load data"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"cases_csbj_target = pd.read_csv(\"./Data/Input/DISNET/cases_csbj_target.tsv\", sep =\"\\t\")\n",
"cases_csbj = cases_csbj_target.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"22"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(cases_csbj[\"disease_id\"].unique())"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"triplets_csbj = pd.read_excel(\"./Data/Input/DISNET/triplets_chembl_disnet.xlsx\",engine='openpyxl')"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"triplets_csbj =triplets_csbj.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"triplets_csbj =triplets_csbj.rename(columns={\"Original Condition CUI\": \"disease_id\"})"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"join_csbj = cases_csbj.merge(triplets_csbj,how = \"inner\",on = [\"drug_id\",\"disease_id\"])"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"13"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(join_csbj[\"disease_id\"].unique())"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"join_csbj_drug = join_csbj.drop([\"gene_id\",\"score\",\"Original Condition\",\"New Condition\",\"New Condition CUI\",\"Drugs\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"join_csbj_drug =join_csbj_drug.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"join_csbj_diseases = join_csbj[\"New Condition CUI\"]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"join_csbj_diseases = pd.DataFrame(join_csbj_diseases).drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"join_csbj_diseases =join_csbj_diseases.rename(columns={\"New Condition CUI\": \"disease_id\"})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1. DRUG - GENE - TARGET"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"dis_gen = pd.read_csv('./Data/Input/DISNET/dis_genes.tsv', sep='\\t')\n",
"dis_gen = dis_gen.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"gen_dise_join = join_csbj_diseases.merge(dis_gen,how = \"inner\",on = \"disease_id\")"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"drug_gen = pd.read_csv('./Data/Input/DISNET/drug_gen.tsv', sep='\\t')\n",
"drug_gen = drug_gen.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"gen_dise_join_dru = gen_dise_join.merge(drug_gen,how = \"inner\",on = \"gene_id\")"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"score_gdas_csbj = gen_dise_join_dru.merge(join_csbj ,how = \"inner\",on = [\"drug_id\",\"disease_id\",\"score\",\"gene_id\"])"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.28196850393700795"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"score_gdas_csbj[\"score\"].mean()"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"score_gdas_csbj.to_csv(\"score_gdas_csbj_target_filtergen.tsv\", sep='\\t')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from sqlalchemy import create_engine\n",
"from sklearn import preprocessing\n",
"import mysql.connector\n",
"from pandas import DataFrame"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load data"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"cases_repodb = pd.read_csv(\"./Data/Input/DISNET/cases_repodb_target.tsv\", sep='\\t')\n",
"cases_repodb = cases_repodb.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"34"
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(cases_repodb[\"disease_id\"].unique())"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"triplets_repodb = pd.read_csv('./Data/Input/DISNET/repodb_all_triples.tsv', sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"550"
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(triplets_repodb[\"disease1\"].unique())"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"triplets_repodb_one = triplets_repodb.rename(columns={\"disease1\": \"disease_id\",\"drug\":\"drug_id\"})"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"triplets_repodb_one = triplets_repodb_one.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"triplets_repodb_two = triplets_repodb.rename(columns={\"disease2\": \"disease_id\",\"drug\":\"drug_id\"})"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"triplets_repodb_two = triplets_repodb_two.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
"join_one = cases_repodb.merge(triplets_repodb_one,how = \"inner\",on = [\"drug_id\",\"disease_id\"])"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"27"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(join_one[\"disease_id\"].unique())"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
"join_two = cases_repodb.merge(triplets_repodb_two,how = \"inner\",on = [\"drug_id\",\"disease_id\"])"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"join_two = join_two.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"32"
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(join_two[\"disease_id\"].unique())"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
"triples_repo_all = pd.concat([joinone_csbj_diseases,jointwo_csbj_diseases])"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [],
"source": [
"triples_repo_all = triples_repo_all.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"32"
]
},
"execution_count": 113,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(triples_repo_all[\"disease_id\"].unique())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1. DRUG - GENE - TARGET"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [],
"source": [
"dis_gen = pd.read_csv('./Data/Input/DISNET/dis_genes.tsv', sep='\\t')\n",
"dis_gen = dis_gen.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.13747265487982685"
]
},
"execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dis_gen[\"score\"].mean()"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
"gen_dise_join = triples_repo_all.merge(dis_gen,how = \"inner\",on = [\"disease_id\",\"gene_id\",\"score\"])"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [],
"source": [
"drug_gen = pd.read_csv('./Data/Input/DISNET/drug_gen.tsv', sep='\\t')\n",
"drug_gen = drug_gen.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [],
"source": [
"gen_dise_join_dru = gen_dise_join.merge(drug_gen,how = \"inner\",on = [\"gene_id\",\"drug_id\"])"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {},
"outputs": [],
"source": [
"score_gdas_repodb_target = gen_dise_join_dru.merge(triples_repo_all ,how = \"inner\",on = [\"drug_id\",\"disease_id\",\"score\",\"gene_id\",\"disease_new\"])"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [],
"source": [
"score_gdas_repodb_target = score_gdas_repodb_target.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 134,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.21210059171595477"
]
},
"execution_count": 134,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"score_gdas_repodb_target[\"score\"].mean()"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {},
"outputs": [],
"source": [
"score_gdas_repodb_target.to_csv(\"score_gdas_repodb_target_final.tsv\", sep='\\t')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment