{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from sqlalchemy import create_engine\n",
    "from sklearn import preprocessing\n",
    "import mysql.connector\n",
    "from pandas import DataFrame"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the triples table and discard the index column written by a previous export.\n",
    "triplets_total = pd.read_excel('./Data/Input/DISNET/triples_drebiop_final_dos.xlsx',engine='openpyxl')\n",
    "triplets_total = triplets_total.drop(columns=['Unnamed: 0'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "triplets_total = triplets_total.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Disease identifiers of the triples; this frame is the left side of every merge below.\n",
    "# NOTE(review): disease_id is not de-duplicated here, so a disease occurring in several\n",
    "# triples would contribute several rows to each merge and inflate the per-disease counts.\n",
    "# Confirm whether that is intended.\n",
    "disease_two = triplets_total[[\"disease_id\"]]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## genes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "genes = pd.read_excel((\"./Data/Input/DISNET/dis_gen.xlsx\"),engine='openpyxl')\n",
    "# Fixed: the original dropped the column from an undefined name `gen`.\n",
    "genes = genes.drop([\"Unnamed: 0\"],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_gene = disease_two.merge(genes, how= \"inner\", on= \"disease_id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows per disease after the merge (fixed: the merged frame is `dis_gene`, not `dis_gen`).\n",
    "dis_gen_num_gen = dis_gene.groupby(['disease_id']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_gen_num_gen = dis_gen_num_gen.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count 6.000000\n",
       "mean 423.666667\n",
       "std 372.215350\n",
       "min 3.000000\n",
       "25% 104.500000\n",
       "50% 428.500000\n",
       "75% 687.250000\n",
       "max 912.000000\n",
       "Name: gene_id, dtype: float64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dis_gen_num_gen[\"gene_id\"].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## pw via gene"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pw_gene = pd.read_csv('./Data/Input/DISNET/pathways_genes.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): this merge requires pathways_genes.tsv to carry a disease_id column - confirm.\n",
    "dis_pw_gen = disease_two.merge(pw_gene, how= \"inner\", on= \"disease_id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_pw_gen = dis_pw_gen.groupby(['disease_id']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fixed: the original read from an undefined name `dis_gen_pw`.\n",
    "dis_pw_gen = dis_pw_gen.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count 6.000000\n",
       "mean 1568.000000\n",
       "std 1344.291337\n",
       "min 3.000000\n",
       "25% 415.000000\n",
       "50% 1745.500000\n",
       "75% 2380.000000\n",
       "max 3351.000000\n",
       "Name: pathway_id, dtype: float64"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dis_pw_gen[\"pathway_id\"].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## pw direct"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_path_direct = pd.read_csv('./Data/Input/DISNET/disease_pathway.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "dr_pw = disease_two.merge(dis_path_direct,on=\"disease_id\",how= \"inner\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "dr_pw = dr_pw.groupby(['disease_id']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "dr_pw = dr_pw.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "25.833333333333332"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dr_pw[\"pathway_id\"].mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## drug"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug = pd.read_csv('./Data/Input/DISNET/drugs.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_drug = disease_two.merge(drug, how= \"inner\", on= \"disease_id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_drug = dis_drug.groupby(['disease_id']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_drug = dis_drug.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "657.3333333333334"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dis_drug[\"drug_id\"].mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## symptom"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sint = pd.read_csv('./Data/Input/DISNET/sint_all.tsv', sep='\\t')\n",
    "sint = sint.drop([\"Unnamed: 0\"],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_sint = disease_two.merge(sint, how= \"inner\", on= \"disease_id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_sint = dis_sint.groupby(['disease_id']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_sint = dis_sint.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "106.66666666666667"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dis_sint[\"symptom\"].mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pathways_type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "drebiop_repodb = pd.read_excel((\"./Data/Input/DISNET/debriop_repodb_pws_info.xlsx\"),engine='openpyxl')\n",
    "drebiop_repodb = drebiop_repodb.drop(columns=['Unnamed: 0'])\n",
    "drebiop_repodb = drebiop_repodb.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "drebiop_repodb_flip = pd.read_excel((\"./Data/Input/DISNET/debriop_repodb_flip_pws_info.xlsx\"),engine='openpyxl')\n",
    "drebiop_repodb_flip = drebiop_repodb_flip.drop(columns=['Unnamed: 0'])\n",
    "drebiop_repodb_flip = drebiop_repodb_flip.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Align the column name with the other tables so the frames can be concatenated below.\n",
    "drebiop_repodb_flip = drebiop_repodb_flip.rename(columns={\"disease2\": \"disease1\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "drebiop_csbj = pd.read_excel((\"./Data/Input/DISNET/debriop_csbj_pws_info.xlsx\"),engine='openpyxl')\n",
    "drebiop_csbj = drebiop_csbj.drop(columns=['Unnamed: 0'])\n",
    "drebiop_csbj = drebiop_csbj.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "drebiop_csbj = drebiop_csbj.rename(columns={\"Original Condition CUI\": \"disease1\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "pws_all_triples_drebiop = pd.concat([drebiop_repodb,drebiop_repodb_flip,drebiop_csbj])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "60"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(pws_all_triples_drebiop)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "WP554 18\n",
       "WP1531 18\n",
       "WP4756 9\n",
       "WP2371 6\n",
       "WP3303 6\n",
       "WP1533 3\n",
       "Name: pathway_id, dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pws_all_triples_drebiop[\"pathway_id\"].value_counts()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}