{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from sqlalchemy import create_engine\n", "from sklearn import preprocessing\n", "import mysql.connector\n", "from pandas import DataFrame" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "Triples_target_final = pd.read_csv(\"Triples_target_final.tsv\", sep='\\t')\n", "Triples_target_final = Triples_target_final.drop([\"Unnamed: 0\"],axis=1)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "triplets_total = pd.read_csv('triplets_total.csv', sep=';')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "triplets_total = triplets_total.drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "disease_id = triplets_total[\"disease_PwB\"]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def convert(lista):\n", " return tuple(i for i in lista)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0030567', 'C0030567', 'C0030567', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0030567', 'C0035579', 'C0035579', 'C0020538', 'C0030567', 'C0030567', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0020538', 'C0020538', 'C0020538', 'C0020538', 'C0035579', 'C0035579', 'C0035579', 'C0030567', 'C0020538', 'C0002892', 'C0002892', 'C0002892', 'C0002892', 'C0002892', 'C0035579', 'C0030567', 'C0030567', 'C0002395', 'C0002395', 'C0002395', 'C0002395', 'C0030567', 'C0030567', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0035579', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0026769', 'C0025202', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0030567', 'C0020538')\n" ] } ], "source": [ "# Driver function\n", "lista = disease_id\n", "print(convert(lista))\n", "list_disease_id = convert(lista)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "dis_gen = f'''SELECT disease_id, gene_id FROM disnet_biolayer.disease_gene\n", "where sio_id != 'SIO_001120'\n", "and sio_id != 'NO_CURATED'\n", "and disease_id IN {list_disease_id}\n", ";'''" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "dis_gen=pd.read_sql(dis_gen, con=disnet_db_ares)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "dis_gen_num_gen = dis_gen.groupby(['disease_id']).count()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "dis_gen_num_gen = dis_gen_num_gen.reset_index()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "581.8571428571429" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dis_gen_num_gen[\"gene_id\"].mean()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 7.000000\n", "mean 581.857143\n", "std 464.301780\n", "min 3.000000\n", "25% 267.500000\n", "50% 559.000000\n", "75% 854.500000\n", "max 1267.000000\n", "Name: gene_id, dtype: float64" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dis_gen_num_gen[\"gene_id\"].describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#### pw via gene" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "dis_gen_pw = f'''\n", "SELECT dg.disease_id,gp.pathway_id\n", " FROM disnet_biolayer.disease_gene dg \n", " JOIN disnet_biolayer.disease ds ON ds.disease_id = dg.disease_id\n", " JOIN disnet_biolayer.tmp_gene_pathway gp ON gp.gene_id = dg.gene_id\n", " where dg.disease_id in {list_disease_id}\n", "'''" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "dis_gen_pw=pd.read_sql(dis_gen_pw, con=disnet_db_ares)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "dis_gen_pw = dis_gen_pw.groupby(['disease_id']).count()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "dis_gen_pw = dis_gen_pw.reset_index()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1847.142857142857" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dis_gen_pw[\"pathway_id\"].mean()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 7.000000\n", "mean 1847.142857\n", "std 1531.396251\n", "min 3.000000\n", "25% 795.000000\n", "50% 1640.000000\n", "75% 2748.500000\n", "max 4200.000000\n", "Name: pathway_id, dtype: float64" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dis_gen_pw[\"pathway_id\"].describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#### pw direct" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | disease_id | \n", "pathway_id | \n", "
---|---|---|
0 | \n", "C0020538 | \n", "WP554 | \n", "
1 | \n", "C0018799 | \n", "WP1544 | \n", "
2 | \n", "C0018799 | \n", "WP1528 | \n", "
3 | \n", "C0027947 | \n", "WP229 | \n", "
4 | \n", "C0013369 | \n", "WP229 | \n", "
... | \n", "... | \n", "... | \n", "
659 | \n", "C0268274 | \n", "WP4153 | \n", "
660 | \n", "C0085131 | \n", "WP4153 | \n", "
661 | \n", "C0036161 | \n", "WP4153 | \n", "
662 | \n", "C0268275 | \n", "WP4153 | \n", "
663 | \n", "C0162666 | \n", "WP4236 | \n", "
664 rows × 2 columns
\n", "