{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from sqlalchemy import create_engine\n",
    "from sklearn import preprocessing\n",
    "import mysql.connector\n",
    "from pandas import DataFrame"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the triples table and drop the 'Unnamed: 0' column that comes with the sheet.\n",
    "triplets_total = pd.read_excel('./Data/Input/DISNET/triples_drebiop_final_dos.xlsx',engine='openpyxl')\n",
    "triplets_total = triplets_total.drop(columns=['Unnamed: 0'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "triplets_total = triplets_total.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the original-condition CUI, renamed to `disease_id` so it can be\n",
    "# used as the merge key against the tables loaded below.\n",
    "# NOTE(review): this keeps one row per triple. If a disease occurs in several\n",
    "# triples it appears several times here, the inner merges below repeat its rows,\n",
    "# and the per-disease counts scale with that multiplicity. Confirm whether\n",
    "# disease_one should be de-duplicated before merging.\n",
    "disease_one = triplets_total[[\"Original Condition CUI\"]].rename(\n",
    "    columns={\"Original Condition CUI\": \"disease_id\"}\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Genes per disease"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "genes = pd.read_excel(\"./Data/Input/DISNET/dis_gen.xlsx\", engine='openpyxl')\n",
    "# Fixed: this line previously called `gen.drop(...)`; `gen` is never defined in\n",
    "# this notebook, so the cell raised NameError on a fresh kernel.\n",
    "genes = genes.drop([\"Unnamed: 0\"], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_gene = disease_one.merge(genes, how= \"inner\", on= \"disease_id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Non-null row count per disease_id for every remaining column; built in one\n",
    "# chained expression so that re-running the cell always gives the same frame.\n",
    "dis_gen_num_gen = (\n",
    "    dis_gene\n",
    "    .groupby(['disease_id'])\n",
    "    .count()\n",
    "    .reset_index()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count 23.000000\n",
       "mean 105.130435\n",
       "std 149.308194\n",
       "min 2.000000\n",
       "25% 17.500000\n",
       "50% 37.000000\n",
       "75% 91.500000\n",
       "max 526.000000\n",
       "Name: gene_id, dtype: float64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dis_gen_num_gen[\"gene_id\"].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pathways per disease (via genes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pw_gene = pd.read_csv('./Data/Input/DISNET/pathways_genes.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge, count rows per disease_id and restore disease_id as a column in a\n",
    "# single step, so the cell is safe to re-run.\n",
    "dis_pw_gen = (\n",
    "    disease_one.merge(pw_gene, how= \"inner\", on= \"disease_id\")\n",
    "    .groupby(['disease_id'])\n",
    "    .count()\n",
    "    .reset_index()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "391.04347826086956"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dis_pw_gen[\"pathway_id\"].mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pathways per disease (direct)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "dis_path_direct = pd.read_csv('./Data/Input/DISNET/disease_pathway.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same merge -> count -> reset_index pipeline as the other sections.\n",
    "dr_pw = (\n",
    "    disease_one.merge(dis_path_direct,on=\"disease_id\",how= \"inner\")\n",
    "    .groupby(['disease_id'])\n",
    "    .count()\n",
    "    .reset_index()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "nan"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): the stored output of this cell is nan. count() does not produce\n",
    "# NaN values, so dr_pw was presumably empty in that run, i.e. the inner merge\n",
    "# matched no disease_id. Confirm the key format used in disease_pathway.tsv.\n",
    "dr_pw[\"pathway_id\"].mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Drugs per disease"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug = pd.read_csv('./Data/Input/DISNET/drugs.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge, count rows per disease_id and restore disease_id as a column in a\n",
    "# single step, so the cell is safe to re-run.\n",
    "dis_drug = (\n",
    "    disease_one.merge(drug, how= \"inner\", on= \"disease_id\")\n",
    "    .groupby(['disease_id'])\n",
    "    .count()\n",
    "    .reset_index()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "384.6"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dis_drug[\"drug_id\"].mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Symptoms per disease"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sint = pd.read_csv('./Data/Input/DISNET/sint_all.tsv', sep='\\t')\n",
    "sint = sint.drop([\"Unnamed: 0\"],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge, count rows per disease_id and restore disease_id as a column in a\n",
    "# single step, so the cell is safe to re-run.\n",
    "dis_sint = (\n",
    "    disease_one.merge(sint, how= \"inner\", on= \"disease_id\")\n",
    "    .groupby(['disease_id'])\n",
    "    .count()\n",
    "    .reset_index()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "68.6086956521739"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dis_sint[\"symptom\"].mean()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}