Commit a64aa495 authored by Belen Otero Carrasco's avatar Belen Otero Carrasco

code sex-bias

parent 5556a685
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Usuario\\anaconda3\\lib\\site-packages\\pandas\\core\\computation\\expressions.py:20: UserWarning: Pandas requires version '2.7.3' or newer of 'numexpr' (version '2.7.1' currently installed).\n",
" from pandas.core.computation.check import NUMEXPR_INSTALLED\n"
]
}
],
"source": [
"import os\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import mysql.connector\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"from cmapPy.pandasGEXpress.parse import parse\n",
"from pandas import DataFrame\n",
"from scipy.stats import hypergeom\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Connection settings are read from the environment so no secret is stored in the notebook.\n",
"# DISNET_DB_PASSWORD is required (KeyError if unset); the other settings fall back to the\n",
"# values this notebook used before.\n",
"# NOTE(review): the password that was committed here remains in the repository history\n",
"# and should be rotated.\n",
"disnet_db_ares = mysql.connector.connect(\n",
"    host=os.environ.get(\"DISNET_DB_HOST\", \"138.4.130.153\"),\n",
"    port=int(os.environ.get(\"DISNET_DB_PORT\", \"30602\")),\n",
"    user=os.environ.get(\"DISNET_DB_USER\", \"disnet_user\"),\n",
"    password=os.environ[\"DISNET_DB_PASSWORD\"],\n",
"    database=os.environ.get(\"DISNET_DB_NAME\", \"disnet_drugslayer\"),\n",
")\n",
"\n",
"# Cursor on the same connection.\n",
"disnet_mysql_cursor = disnet_db_ares.cursor()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# SQL text selecting every column of disnet_drugslayer.drug.\n",
"q1 = \"SELECT * FROM disnet_drugslayer.drug \""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<ipython-input-4-6f46a91a900f>:1: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n",
" drug_disnet =pd.read_sql(q1, con=disnet_db_ares)\n"
]
}
],
"source": [
"# Run q1 over the open MySQL connection and load the result into a DataFrame.\n",
"drug_disnet = pd.read_sql(sql=q1, con=disnet_db_ares)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Tab-separated file; every column is read as str.\n",
"pertub_id = pd.read_csv(\n",
"    \"./GSE92742_Broad_LINCS_pert_info.txt\",\n",
"    sep=\"\\t\",\n",
"    dtype=str,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"28957"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of distinct pert_iname values in the full table.\n",
"pertub_id[\"pert_iname\"].unique().size"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the rows whose is_touchstone value is the string \"1\".\n",
"pertub_id_filter = pertub_id.loc[pertub_id[\"is_touchstone\"] == \"1\"]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# From pertub_id_filter, keep the rows whose pert_type is \"trt_cp\".\n",
"pertub_id_filter_cp = pertub_id_filter.loc[pertub_id_filter[\"pert_type\"] == \"trt_cp\"]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2429"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of distinct pert_iname values left after both filters.\n",
"pertub_id_filter_cp[\"pert_iname\"].unique().size"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Inner join of the DISNET drugs with the filtered perturbagens on equal inchi_key values.\n",
"drug_common_inchi = drug_disnet.merge(right=pertub_id_filter_cp, how=\"inner\", on=\"inchi_key\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<ipython-input-21-bd692c32d11c>:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" pertub_id_filter_cp['drug_name'] = pertub_id_filter_cp['pert_iname'].str.upper()\n"
]
}
],
"source": [
"# Add an upper-cased copy of pert_iname as drug_name; it is the key of the name-based merge.\n",
"# assign() returns a new frame instead of writing into a filtered slice, which is what\n",
"# triggered SettingWithCopyWarning here before.\n",
"pertub_id_filter_cp = pertub_id_filter_cp.assign(\n",
"    drug_name=pertub_id_filter_cp[\"pert_iname\"].str.upper()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Inner join of the DISNET drugs with the filtered perturbagens on equal drug_name values.\n",
"drug_common_name = drug_disnet.merge(right=pertub_id_filter_cp, how=\"inner\", on=\"drug_name\")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the identifier and name columns of the name-based matches.\n",
"drug_common_name_fil = drug_common_name.loc[:, [\"drug_id\", \"drug_name\"]]"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the identifier and name columns of the inchi_key-based matches.\n",
"drug_common_inchi_fil = drug_common_inchi.loc[:, [\"drug_id\", \"drug_name\"]]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Stack the name-based and inchi_key-based matches into one table.\n",
"# The two inputs carry different InChIKey columns (inchi_key_x / inchi_key_y vs inchi_key),\n",
"# so a pair found by both matchers differs in those columns and a whole-row\n",
"# drop_duplicates() never collapses it. Deduplicate on the identifying columns instead;\n",
"# the first occurrence (the name-based row) is kept and all columns are preserved.\n",
"# NOTE(review): assumes (drug_id, source_id, pert_id) identifies one DISNET-CLUE match - confirm.\n",
"drugs_all_disnet_clue = pd.concat([drug_common_name, drug_common_inchi]).drop_duplicates(\n",
"    subset=[\"drug_id\", \"source_id\", \"pert_id\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>drug_id</th>\n",
" <th>source_id</th>\n",
" <th>drug_name</th>\n",
" <th>molecular_type</th>\n",
" <th>chemical_structure</th>\n",
" <th>inchi_key_x</th>\n",
" <th>pert_id</th>\n",
" <th>pert_iname</th>\n",
" <th>pert_type</th>\n",
" <th>is_touchstone</th>\n",
" <th>inchi_key_prefix</th>\n",
" <th>inchi_key_y</th>\n",
" <th>canonical_smiles</th>\n",
" <th>pubchem_cid</th>\n",
" <th>inchi_key</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>CHEMBL1000</td>\n",
" <td>1</td>\n",
" <td>CETIRIZINE</td>\n",
" <td>Small molecule</td>\n",
" <td>O=C(O)COCCN1CCN(C(c2ccccc2)c2ccc(Cl)cc2)CC1</td>\n",
" <td>ZKLPARSLTMPFCP-UHFFFAOYSA-N</td>\n",
" <td>BRD-A42571354</td>\n",
" <td>cetirizine</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>ZKLPARSLTMPFCP</td>\n",
" <td>ZKLPARSLTMPFCP-UHFFFAOYSA-N</td>\n",
" <td>OC(=O)COCCN1CCN(CC1)C(c1ccccc1)c1ccc(Cl)cc1</td>\n",
" <td>-666</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>CHEMBL1004</td>\n",
" <td>1</td>\n",
" <td>DOXYLAMINE</td>\n",
" <td>Small molecule</td>\n",
" <td>CN(C)CCOC(C)(c1ccccc1)c1ccccn1</td>\n",
" <td>HCFDWZZGGLSKEP-UHFFFAOYSA-N</td>\n",
" <td>BRD-A44008656</td>\n",
" <td>doxylamine</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>HCFDWZZGGLSKEP</td>\n",
" <td>HCFDWZZGGLSKEP-UHFFFAOYSA-N</td>\n",
" <td>CN(C)CCOC(C)(c1ccccc1)c1ccccn1</td>\n",
" <td>-666</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>CHEMBL1008</td>\n",
" <td>1</td>\n",
" <td>BEPRIDIL</td>\n",
" <td>Small molecule</td>\n",
" <td>CC(C)COCC(CN(Cc1ccccc1)c1ccccc1)N1CCCC1</td>\n",
" <td>UIEATEWHFDRYRU-UHFFFAOYSA-N</td>\n",
" <td>BRD-A91008255</td>\n",
" <td>bepridil</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>UIEATEWHFDRYRU</td>\n",
" <td>UIEATEWHFDRYRU-UHFFFAOYSA-N</td>\n",
" <td>CC(C)COCC(CN(Cc1ccccc1)c1ccccc1)N1CCCC1</td>\n",
" <td>-666</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CHEMBL101</td>\n",
" <td>1</td>\n",
" <td>PHENYLBUTAZONE</td>\n",
" <td>Small molecule</td>\n",
" <td>CCCCC1C(=O)N(c2ccccc2)N(c2ccccc2)C1=O</td>\n",
" <td>VYMDGNCVAMGZFE-UHFFFAOYSA-N</td>\n",
" <td>BRD-K10843433</td>\n",
" <td>phenylbutazone</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>VYMDGNCVAMGZFE</td>\n",
" <td>VYMDGNCVAMGZFE-UHFFFAOYSA-N</td>\n",
" <td>CCCCC1C(=O)N(N(C1=O)c1ccccc1)c1ccccc1</td>\n",
" <td>4781</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>CHEMBL1017</td>\n",
" <td>1</td>\n",
" <td>TELMISARTAN</td>\n",
" <td>Small molecule</td>\n",
" <td>CCCc1nc2c(C)cc(-c3nc4ccccc4n3C)cc2n1Cc1ccc(-c2...</td>\n",
" <td>RMMXLENWKUUMAY-UHFFFAOYSA-N</td>\n",
" <td>BRD-K73999723</td>\n",
" <td>telmisartan</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>RMMXLENWKUUMAY</td>\n",
" <td>RMMXLENWKUUMAY-UHFFFAOYSA-N</td>\n",
" <td>CCCc1nc2c(C)cc(cc2n1Cc1ccc(cc1)-c1ccccc1C(O)=O...</td>\n",
" <td>65999</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>642</th>\n",
" <td>CHEMBL989</td>\n",
" <td>1</td>\n",
" <td>FLUOCINOLONE ACETONIDE</td>\n",
" <td>Small molecule</td>\n",
" <td>CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...</td>\n",
" <td>NaN</td>\n",
" <td>BRD-K94353609</td>\n",
" <td>fluocinolone</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>FEBLZLNTKCEFIT</td>\n",
" <td>NaN</td>\n",
" <td>CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...</td>\n",
" <td>6215</td>\n",
" <td>FEBLZLNTKCEFIT-VSXGLTOVSA-N</td>\n",
" </tr>\n",
" <tr>\n",
" <th>643</th>\n",
" <td>CHEMBL991</td>\n",
" <td>1</td>\n",
" <td>STAVUDINE</td>\n",
" <td>Small molecule</td>\n",
" <td>Cc1cn([C@H]2C=C[C@@H](CO)O2)c(=O)[nH]c1=O</td>\n",
" <td>NaN</td>\n",
" <td>BRD-K93880783</td>\n",
" <td>stavudine</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>XNKLLVCARDGLGL</td>\n",
" <td>NaN</td>\n",
" <td>Cc1cn([C@@H]2O[C@H](CO)C=C2)c(=O)[nH]c1=O</td>\n",
" <td>18283</td>\n",
" <td>XNKLLVCARDGLGL-JGVFFNPUSA-N</td>\n",
" </tr>\n",
" <tr>\n",
" <th>644</th>\n",
" <td>CHEMBL996</td>\n",
" <td>1</td>\n",
" <td>CEFOXITIN</td>\n",
" <td>Small molecule</td>\n",
" <td>CO[C@@]1(NC(=O)Cc2cccs2)C(=O)N2C(C(=O)O)=C(COC...</td>\n",
" <td>NaN</td>\n",
" <td>BRD-K70976396</td>\n",
" <td>cefoxitin</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>WZOZEZRFJCJXNZ</td>\n",
" <td>NaN</td>\n",
" <td>CO[C@]1(NC(=O)Cc2cccs2)[C@H]2SCC(COC(N)=O)=C(N...</td>\n",
" <td>23667300</td>\n",
" <td>WZOZEZRFJCJXNZ-ZBFHGGJFSA-N</td>\n",
" </tr>\n",
" <tr>\n",
" <th>645</th>\n",
" <td>CHEMBL9967</td>\n",
" <td>1</td>\n",
" <td>PIRENZEPINE</td>\n",
" <td>Small molecule</td>\n",
" <td>CN1CCN(CC(=O)N2c3ccccc3C(=O)Nc3cccnc32)CC1</td>\n",
" <td>NaN</td>\n",
" <td>BRD-K89375097</td>\n",
" <td>pirenzepine</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>RMHMFHUVIITRHF</td>\n",
" <td>NaN</td>\n",
" <td>CN1CCN(CC(=O)N2c3ccccc3C(=O)Nc3cccnc23)CC1</td>\n",
" <td>185248</td>\n",
" <td>RMHMFHUVIITRHF-UHFFFAOYSA-N</td>\n",
" </tr>\n",
" <tr>\n",
" <th>646</th>\n",
" <td>CHEMBL998</td>\n",
" <td>1</td>\n",
" <td>LORATADINE</td>\n",
" <td>Small molecule</td>\n",
" <td>CCOC(=O)N1CCC(=C2c3ccc(Cl)cc3CCc3cccnc32)CC1</td>\n",
" <td>NaN</td>\n",
" <td>BRD-K82795137</td>\n",
" <td>loratadine</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>JCCNYMKQOSZNPW</td>\n",
" <td>NaN</td>\n",
" <td>CCOC(=O)N1CCC(CC1)=C1c2ccc(Cl)cc2CCc2cccnc12</td>\n",
" <td>3957</td>\n",
" <td>JCCNYMKQOSZNPW-UHFFFAOYSA-N</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1390 rows × 15 columns</p>\n",
"</div>"
],
"text/plain": [
" drug_id source_id drug_name molecular_type \\\n",
"0 CHEMBL1000 1 CETIRIZINE Small molecule \n",
"1 CHEMBL1004 1 DOXYLAMINE Small molecule \n",
"2 CHEMBL1008 1 BEPRIDIL Small molecule \n",
"3 CHEMBL101 1 PHENYLBUTAZONE Small molecule \n",
"4 CHEMBL1017 1 TELMISARTAN Small molecule \n",
".. ... ... ... ... \n",
"642 CHEMBL989 1 FLUOCINOLONE ACETONIDE Small molecule \n",
"643 CHEMBL991 1 STAVUDINE Small molecule \n",
"644 CHEMBL996 1 CEFOXITIN Small molecule \n",
"645 CHEMBL9967 1 PIRENZEPINE Small molecule \n",
"646 CHEMBL998 1 LORATADINE Small molecule \n",
"\n",
" chemical_structure \\\n",
"0 O=C(O)COCCN1CCN(C(c2ccccc2)c2ccc(Cl)cc2)CC1 \n",
"1 CN(C)CCOC(C)(c1ccccc1)c1ccccn1 \n",
"2 CC(C)COCC(CN(Cc1ccccc1)c1ccccc1)N1CCCC1 \n",
"3 CCCCC1C(=O)N(c2ccccc2)N(c2ccccc2)C1=O \n",
"4 CCCc1nc2c(C)cc(-c3nc4ccccc4n3C)cc2n1Cc1ccc(-c2... \n",
".. ... \n",
"642 CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)... \n",
"643 Cc1cn([C@H]2C=C[C@@H](CO)O2)c(=O)[nH]c1=O \n",
"644 CO[C@@]1(NC(=O)Cc2cccs2)C(=O)N2C(C(=O)O)=C(COC... \n",
"645 CN1CCN(CC(=O)N2c3ccccc3C(=O)Nc3cccnc32)CC1 \n",
"646 CCOC(=O)N1CCC(=C2c3ccc(Cl)cc3CCc3cccnc32)CC1 \n",
"\n",
" inchi_key_x pert_id pert_iname pert_type \\\n",
"0 ZKLPARSLTMPFCP-UHFFFAOYSA-N BRD-A42571354 cetirizine trt_cp \n",
"1 HCFDWZZGGLSKEP-UHFFFAOYSA-N BRD-A44008656 doxylamine trt_cp \n",
"2 UIEATEWHFDRYRU-UHFFFAOYSA-N BRD-A91008255 bepridil trt_cp \n",
"3 VYMDGNCVAMGZFE-UHFFFAOYSA-N BRD-K10843433 phenylbutazone trt_cp \n",
"4 RMMXLENWKUUMAY-UHFFFAOYSA-N BRD-K73999723 telmisartan trt_cp \n",
".. ... ... ... ... \n",
"642 NaN BRD-K94353609 fluocinolone trt_cp \n",
"643 NaN BRD-K93880783 stavudine trt_cp \n",
"644 NaN BRD-K70976396 cefoxitin trt_cp \n",
"645 NaN BRD-K89375097 pirenzepine trt_cp \n",
"646 NaN BRD-K82795137 loratadine trt_cp \n",
"\n",
" is_touchstone inchi_key_prefix inchi_key_y \\\n",
"0 1 ZKLPARSLTMPFCP ZKLPARSLTMPFCP-UHFFFAOYSA-N \n",
"1 1 HCFDWZZGGLSKEP HCFDWZZGGLSKEP-UHFFFAOYSA-N \n",
"2 1 UIEATEWHFDRYRU UIEATEWHFDRYRU-UHFFFAOYSA-N \n",
"3 1 VYMDGNCVAMGZFE VYMDGNCVAMGZFE-UHFFFAOYSA-N \n",
"4 1 RMMXLENWKUUMAY RMMXLENWKUUMAY-UHFFFAOYSA-N \n",
".. ... ... ... \n",
"642 1 FEBLZLNTKCEFIT NaN \n",
"643 1 XNKLLVCARDGLGL NaN \n",
"644 1 WZOZEZRFJCJXNZ NaN \n",
"645 1 RMHMFHUVIITRHF NaN \n",
"646 1 JCCNYMKQOSZNPW NaN \n",
"\n",
" canonical_smiles pubchem_cid \\\n",
"0 OC(=O)COCCN1CCN(CC1)C(c1ccccc1)c1ccc(Cl)cc1 -666 \n",
"1 CN(C)CCOC(C)(c1ccccc1)c1ccccn1 -666 \n",
"2 CC(C)COCC(CN(Cc1ccccc1)c1ccccc1)N1CCCC1 -666 \n",
"3 CCCCC1C(=O)N(N(C1=O)c1ccccc1)c1ccccc1 4781 \n",
"4 CCCc1nc2c(C)cc(cc2n1Cc1ccc(cc1)-c1ccccc1C(O)=O... 65999 \n",
".. ... ... \n",
"642 CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)... 6215 \n",
"643 Cc1cn([C@@H]2O[C@H](CO)C=C2)c(=O)[nH]c1=O 18283 \n",
"644 CO[C@]1(NC(=O)Cc2cccs2)[C@H]2SCC(COC(N)=O)=C(N... 23667300 \n",
"645 CN1CCN(CC(=O)N2c3ccccc3C(=O)Nc3cccnc23)CC1 185248 \n",
"646 CCOC(=O)N1CCC(CC1)=C1c2ccc(Cl)cc2CCc2cccnc12 3957 \n",
"\n",
" inchi_key \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
".. ... \n",
"642 FEBLZLNTKCEFIT-VSXGLTOVSA-N \n",
"643 XNKLLVCARDGLGL-JGVFFNPUSA-N \n",
"644 WZOZEZRFJCJXNZ-ZBFHGGJFSA-N \n",
"645 RMHMFHUVIITRHF-UHFFFAOYSA-N \n",
"646 JCCNYMKQOSZNPW-UHFFFAOYSA-N \n",
"\n",
"[1390 rows x 15 columns]"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"drugs_all_disnet_clue"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# Stack both two-column match tables and drop repeated (drug_id, drug_name) rows.\n",
"drugs_disnet_clue = pd.concat(\n",
"    [drug_common_name_fil, drug_common_inchi_fil], axis=0\n",
").drop_duplicates(keep=\"first\")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>drug_id</th>\n",
" <th>drug_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>CHEMBL1000</td>\n",
" <td>CETIRIZINE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>CHEMBL1004</td>\n",
" <td>DOXYLAMINE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>CHEMBL1008</td>\n",
" <td>BEPRIDIL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CHEMBL101</td>\n",
" <td>PHENYLBUTAZONE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>CHEMBL1017</td>\n",
" <td>TELMISARTAN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>562</th>\n",
" <td>CHEMBL776</td>\n",
" <td>METAPROTERENOL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>583</th>\n",
" <td>CHEMBL829</td>\n",
" <td>METHYLPROMAZINE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>603</th>\n",
" <td>CHEMBL867</td>\n",
" <td>IOPANOIC ACID</td>\n",
" </tr>\n",
" <tr>\n",
" <th>606</th>\n",
" <td>CHEMBL869</td>\n",
" <td>NITROFURAZONE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>642</th>\n",
" <td>CHEMBL989</td>\n",
" <td>FLUOCINOLONE ACETONIDE</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>827 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" drug_id drug_name\n",
"0 CHEMBL1000 CETIRIZINE\n",
"1 CHEMBL1004 DOXYLAMINE\n",
"2 CHEMBL1008 BEPRIDIL\n",
"3 CHEMBL101 PHENYLBUTAZONE\n",
"4 CHEMBL1017 TELMISARTAN\n",
".. ... ...\n",
"562 CHEMBL776 METAPROTERENOL\n",
"583 CHEMBL829 METHYLPROMAZINE\n",
"603 CHEMBL867 IOPANOIC ACID\n",
"606 CHEMBL869 NITROFURAZONE\n",
"642 CHEMBL989 FLUOCINOLONE ACETONIDE\n",
"\n",
"[827 rows x 2 columns]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"drugs_disnet_clue"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# SQL text selecting every column of disnet_drugslayer.ATC_code.\n",
"q2 = \"SELECT * FROM disnet_drugslayer.ATC_code\""
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<ipython-input-30-81cdba29d433>:1: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n",
" drug_atc =pd.read_sql(q2, con=disnet_db_ares)\n"
]
}
],
"source": [
"# Run q2 over the open MySQL connection and load the result into a DataFrame.\n",
"drug_atc = pd.read_sql(sql=q2, con=disnet_db_ares)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>drug_id</th>\n",
" <th>ATC_code_id</th>\n",
" <th>source_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>CHEMBL1000</td>\n",
" <td>R06AE07</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>CHEMBL100116</td>\n",
" <td>N02AD01</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>CHEMBL1004</td>\n",
" <td>R06AA09</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CHEMBL1004</td>\n",
" <td>R06AA59</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>CHEMBL1005</td>\n",
" <td>N01AH06</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3218</th>\n",
" <td>CHEMBL991</td>\n",
" <td>J05AF04</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3219</th>\n",
" <td>CHEMBL996</td>\n",
" <td>J01DC01</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3220</th>\n",
" <td>CHEMBL9967</td>\n",
" <td>A02BX03</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3221</th>\n",
" <td>CHEMBL997</td>\n",
" <td>M05BA06</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3222</th>\n",
" <td>CHEMBL998</td>\n",
" <td>R06AX13</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3223 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" drug_id ATC_code_id source_id\n",
"0 CHEMBL1000 R06AE07 1\n",
"1 CHEMBL100116 N02AD01 1\n",
"2 CHEMBL1004 R06AA09 1\n",
"3 CHEMBL1004 R06AA59 1\n",
"4 CHEMBL1005 N01AH06 1\n",
"... ... ... ...\n",
"3218 CHEMBL991 J05AF04 1\n",
"3219 CHEMBL996 J01DC01 1\n",
"3220 CHEMBL9967 A02BX03 1\n",
"3221 CHEMBL997 M05BA06 1\n",
"3222 CHEMBL998 R06AX13 1\n",
"\n",
"[3223 rows x 3 columns]"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"drug_atc"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"# Inner join of the matched drugs with their ATC codes on drug_id.\n",
"atc_class = drugs_disnet_clue.merge(right=drug_atc, how=\"inner\", on=\"drug_id\")"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>drug_id</th>\n",
" <th>drug_name</th>\n",
" <th>ATC_code_id</th>\n",
" <th>source_id</th>\n",
" <th>index</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>CHEMBL1000</td>\n",
" <td>CETIRIZINE</td>\n",
" <td>R06AE07</td>\n",
" <td>1</td>\n",
" <td>R</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>CHEMBL1004</td>\n",
" <td>DOXYLAMINE</td>\n",
" <td>R06AA09</td>\n",
" <td>1</td>\n",
" <td>R</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>CHEMBL1004</td>\n",
" <td>DOXYLAMINE</td>\n",
" <td>R06AA59</td>\n",
" <td>1</td>\n",
" <td>R</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CHEMBL1008</td>\n",
" <td>BEPRIDIL</td>\n",
" <td>C08EA02</td>\n",
" <td>1</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>CHEMBL101</td>\n",
" <td>PHENYLBUTAZONE</td>\n",
" <td>M01AA01</td>\n",
" <td>1</td>\n",
" <td>M</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1125</th>\n",
" <td>CHEMBL869</td>\n",
" <td>NITROFURAZONE</td>\n",
" <td>S02AA02</td>\n",
" <td>1</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1126</th>\n",
" <td>CHEMBL989</td>\n",
" <td>FLUOCINOLONE ACETONIDE</td>\n",
" <td>C05AA10</td>\n",
" <td>1</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1127</th>\n",
" <td>CHEMBL989</td>\n",
" <td>FLUOCINOLONE ACETONIDE</td>\n",
" <td>D07AC04</td>\n",
" <td>1</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1128</th>\n",
" <td>CHEMBL989</td>\n",
" <td>FLUOCINOLONE ACETONIDE</td>\n",
" <td>S01BA15</td>\n",
" <td>1</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1129</th>\n",
" <td>CHEMBL989</td>\n",
" <td>FLUOCINOLONE ACETONIDE</td>\n",
" <td>S02BA08</td>\n",
" <td>1</td>\n",
" <td>S</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1130 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" drug_id drug_name ATC_code_id source_id index\n",
"0 CHEMBL1000 CETIRIZINE R06AE07 1 R\n",
"1 CHEMBL1004 DOXYLAMINE R06AA09 1 R\n",
"2 CHEMBL1004 DOXYLAMINE R06AA59 1 R\n",
"3 CHEMBL1008 BEPRIDIL C08EA02 1 C\n",
"4 CHEMBL101 PHENYLBUTAZONE M01AA01 1 M\n",
"... ... ... ... ... ...\n",
"1125 CHEMBL869 NITROFURAZONE S02AA02 1 S\n",
"1126 CHEMBL989 FLUOCINOLONE ACETONIDE C05AA10 1 C\n",
"1127 CHEMBL989 FLUOCINOLONE ACETONIDE D07AC04 1 D\n",
"1128 CHEMBL989 FLUOCINOLONE ACETONIDE S01BA15 1 S\n",
"1129 CHEMBL989 FLUOCINOLONE ACETONIDE S02BA08 1 S\n",
"\n",
"[1130 rows x 5 columns]"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Store the first character of each ATC code in a column named \"index\".\n",
"# That name must be read with atc_class[\"index\"]; atc_class.index is the row index.\n",
"atc_class[\"index\"] = atc_class[\"ATC_code_id\"].astype(str).str.get(0)\n",
"atc_class"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>drug_id</th>\n",
" <th>drug_name</th>\n",
" <th>ATC_code_id</th>\n",
" <th>source_id</th>\n",
" <th>index</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>57</th>\n",
" <td>CHEMBL1117</td>\n",
" <td>IDARUBICIN</td>\n",
" <td>L01DB06</td>\n",
" <td>1</td>\n",
" <td>L</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74</th>\n",
" <td>CHEMBL1173055</td>\n",
" <td>RUCAPARIB</td>\n",
" <td>L01XX55</td>\n",
" <td>1</td>\n",
" <td>L</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75</th>\n",
" <td>CHEMBL1173655</td>\n",
" <td>AFATINIB</td>\n",
" <td>L01XE13</td>\n",
" <td>1</td>\n",
" <td>L</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79</th>\n",
" <td>CHEMBL118</td>\n",
" <td>CELECOXIB</td>\n",
" <td>L01XX33</td>\n",
" <td>1</td>\n",
" <td>L</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93</th>\n",
" <td>CHEMBL1200374</td>\n",
" <td>EXEMESTANE</td>\n",
" <td>L02BG06</td>\n",
" <td>1</td>\n",
" <td>L</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1004</th>\n",
" <td>CHEMBL98</td>\n",
" <td>VORINOSTAT</td>\n",
" <td>L01XX38</td>\n",
" <td>1</td>\n",
" <td>L</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1057</th>\n",
" <td>CHEMBL1651906</td>\n",
" <td>STREPTOZOCIN</td>\n",
" <td>L01AD04</td>\n",
" <td>1</td>\n",
" <td>L</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1081</th>\n",
" <td>CHEMBL46286</td>\n",
" <td>OMACETAXINE MEPESUCCINATE</td>\n",
" <td>L01XX40</td>\n",
" <td>1</td>\n",
" <td>L</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1103</th>\n",
" <td>CHEMBL601</td>\n",
" <td>AMINOLEVULINIC ACID</td>\n",
" <td>L01XD04</td>\n",
" <td>1</td>\n",
" <td>L</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1114</th>\n",
" <td>CHEMBL717</td>\n",
" <td>MEDROXYPROGESTERONE ACETATE</td>\n",
" <td>L02AB02</td>\n",
" <td>1</td>\n",
" <td>L</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>99 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" drug_id drug_name ATC_code_id source_id index\n",
"57 CHEMBL1117 IDARUBICIN L01DB06 1 L\n",
"74 CHEMBL1173055 RUCAPARIB L01XX55 1 L\n",
"75 CHEMBL1173655 AFATINIB L01XE13 1 L\n",
"79 CHEMBL118 CELECOXIB L01XX33 1 L\n",
"93 CHEMBL1200374 EXEMESTANE L02BG06 1 L\n",
"... ... ... ... ... ...\n",
"1004 CHEMBL98 VORINOSTAT L01XX38 1 L\n",
"1057 CHEMBL1651906 STREPTOZOCIN L01AD04 1 L\n",
"1081 CHEMBL46286 OMACETAXINE MEPESUCCINATE L01XX40 1 L\n",
"1103 CHEMBL601 AMINOLEVULINIC ACID L01XD04 1 L\n",
"1114 CHEMBL717 MEDROXYPROGESTERONE ACETATE L02AB02 1 L\n",
"\n",
"[99 rows x 5 columns]"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows whose \"index\" column (first character of the ATC code) equals \"L\".\n",
"atc_class.loc[atc_class[\"index\"] == \"L\"]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"### Repeat the process with the data that carries the keys needed to search in CLUE"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"# Inner join of the full matched table with the ATC codes on drug_id.\n",
"atc_class_all = drugs_all_disnet_clue.merge(right=drug_atc, how=\"inner\", on=\"drug_id\")"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>drug_id</th>\n",
" <th>source_id_x</th>\n",
" <th>drug_name</th>\n",
" <th>molecular_type</th>\n",
" <th>chemical_structure</th>\n",
" <th>inchi_key_x</th>\n",
" <th>pert_id</th>\n",
" <th>pert_iname</th>\n",
" <th>pert_type</th>\n",
" <th>is_touchstone</th>\n",
" <th>inchi_key_prefix</th>\n",
" <th>inchi_key_y</th>\n",
" <th>canonical_smiles</th>\n",
" <th>pubchem_cid</th>\n",
" <th>inchi_key</th>\n",
" <th>ATC_code_id</th>\n",
" <th>source_id_y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>CHEMBL1000</td>\n",
" <td>1</td>\n",
" <td>CETIRIZINE</td>\n",
" <td>Small molecule</td>\n",
" <td>O=C(O)COCCN1CCN(C(c2ccccc2)c2ccc(Cl)cc2)CC1</td>\n",
" <td>ZKLPARSLTMPFCP-UHFFFAOYSA-N</td>\n",
" <td>BRD-A42571354</td>\n",
" <td>cetirizine</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>ZKLPARSLTMPFCP</td>\n",
" <td>ZKLPARSLTMPFCP-UHFFFAOYSA-N</td>\n",
" <td>OC(=O)COCCN1CCN(CC1)C(c1ccccc1)c1ccc(Cl)cc1</td>\n",
" <td>-666</td>\n",
" <td>NaN</td>\n",
" <td>R06AE07</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>CHEMBL1000</td>\n",
" <td>1</td>\n",
" <td>CETIRIZINE</td>\n",
" <td>Small molecule</td>\n",
" <td>O=C(O)COCCN1CCN(C(c2ccccc2)c2ccc(Cl)cc2)CC1</td>\n",
" <td>NaN</td>\n",
" <td>BRD-A42571354</td>\n",
" <td>cetirizine</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>ZKLPARSLTMPFCP</td>\n",
" <td>NaN</td>\n",
" <td>OC(=O)COCCN1CCN(CC1)C(c1ccccc1)c1ccc(Cl)cc1</td>\n",
" <td>-666</td>\n",
" <td>ZKLPARSLTMPFCP-UHFFFAOYSA-N</td>\n",
" <td>R06AE07</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>CHEMBL1004</td>\n",
" <td>1</td>\n",
" <td>DOXYLAMINE</td>\n",
" <td>Small molecule</td>\n",
" <td>CN(C)CCOC(C)(c1ccccc1)c1ccccn1</td>\n",
" <td>HCFDWZZGGLSKEP-UHFFFAOYSA-N</td>\n",
" <td>BRD-A44008656</td>\n",
" <td>doxylamine</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>HCFDWZZGGLSKEP</td>\n",
" <td>HCFDWZZGGLSKEP-UHFFFAOYSA-N</td>\n",
" <td>CN(C)CCOC(C)(c1ccccc1)c1ccccn1</td>\n",
" <td>-666</td>\n",
" <td>NaN</td>\n",
" <td>R06AA09</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CHEMBL1004</td>\n",
" <td>1</td>\n",
" <td>DOXYLAMINE</td>\n",
" <td>Small molecule</td>\n",
" <td>CN(C)CCOC(C)(c1ccccc1)c1ccccn1</td>\n",
" <td>HCFDWZZGGLSKEP-UHFFFAOYSA-N</td>\n",
" <td>BRD-A44008656</td>\n",
" <td>doxylamine</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>HCFDWZZGGLSKEP</td>\n",
" <td>HCFDWZZGGLSKEP-UHFFFAOYSA-N</td>\n",
" <td>CN(C)CCOC(C)(c1ccccc1)c1ccccn1</td>\n",
" <td>-666</td>\n",
" <td>NaN</td>\n",
" <td>R06AA59</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>CHEMBL1004</td>\n",
" <td>1</td>\n",
" <td>DOXYLAMINE</td>\n",
" <td>Small molecule</td>\n",
" <td>CN(C)CCOC(C)(c1ccccc1)c1ccccn1</td>\n",
" <td>NaN</td>\n",
" <td>BRD-A44008656</td>\n",
" <td>doxylamine</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>HCFDWZZGGLSKEP</td>\n",
" <td>NaN</td>\n",
" <td>CN(C)CCOC(C)(c1ccccc1)c1ccccn1</td>\n",
" <td>-666</td>\n",
" <td>HCFDWZZGGLSKEP-UHFFFAOYSA-N</td>\n",
" <td>R06AA09</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1840</th>\n",
" <td>CHEMBL869</td>\n",
" <td>1</td>\n",
" <td>NITROFURAZONE</td>\n",
" <td>Small molecule</td>\n",
" <td>NC(=O)N/N=C/c1ccc([N+](=O)[O-])o1</td>\n",
" <td>NaN</td>\n",
" <td>BRD-K79092138</td>\n",
" <td>nitrofural</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>IAIWVQXQOWNYOU</td>\n",
" <td>NaN</td>\n",
" <td>NC(=O)N/N=C/c1ccc(o1)[N+](=O)[O-]</td>\n",
" <td>-666</td>\n",
" <td>IAIWVQXQOWNYOU-FPYGCLRLSA-N</td>\n",
" <td>S02AA02</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1841</th>\n",
" <td>CHEMBL989</td>\n",
" <td>1</td>\n",
" <td>FLUOCINOLONE ACETONIDE</td>\n",
" <td>Small molecule</td>\n",
" <td>CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...</td>\n",
" <td>NaN</td>\n",
" <td>BRD-K94353609</td>\n",
" <td>fluocinolone</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>FEBLZLNTKCEFIT</td>\n",
" <td>NaN</td>\n",
" <td>CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...</td>\n",
" <td>6215</td>\n",
" <td>FEBLZLNTKCEFIT-VSXGLTOVSA-N</td>\n",
" <td>C05AA10</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1842</th>\n",
" <td>CHEMBL989</td>\n",
" <td>1</td>\n",
" <td>FLUOCINOLONE ACETONIDE</td>\n",
" <td>Small molecule</td>\n",
" <td>CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...</td>\n",
" <td>NaN</td>\n",
" <td>BRD-K94353609</td>\n",
" <td>fluocinolone</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>FEBLZLNTKCEFIT</td>\n",
" <td>NaN</td>\n",
" <td>CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...</td>\n",
" <td>6215</td>\n",
" <td>FEBLZLNTKCEFIT-VSXGLTOVSA-N</td>\n",
" <td>D07AC04</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1843</th>\n",
" <td>CHEMBL989</td>\n",
" <td>1</td>\n",
" <td>FLUOCINOLONE ACETONIDE</td>\n",
" <td>Small molecule</td>\n",
" <td>CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...</td>\n",
" <td>NaN</td>\n",
" <td>BRD-K94353609</td>\n",
" <td>fluocinolone</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>FEBLZLNTKCEFIT</td>\n",
" <td>NaN</td>\n",
" <td>CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...</td>\n",
" <td>6215</td>\n",
" <td>FEBLZLNTKCEFIT-VSXGLTOVSA-N</td>\n",
" <td>S01BA15</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1844</th>\n",
" <td>CHEMBL989</td>\n",
" <td>1</td>\n",
" <td>FLUOCINOLONE ACETONIDE</td>\n",
" <td>Small molecule</td>\n",
" <td>CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...</td>\n",
" <td>NaN</td>\n",
" <td>BRD-K94353609</td>\n",
" <td>fluocinolone</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>FEBLZLNTKCEFIT</td>\n",
" <td>NaN</td>\n",
" <td>CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...</td>\n",
" <td>6215</td>\n",
" <td>FEBLZLNTKCEFIT-VSXGLTOVSA-N</td>\n",
" <td>S02BA08</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1845 rows × 17 columns</p>\n",
"</div>"
],
"text/plain": [
" drug_id source_id_x drug_name molecular_type \\\n",
"0 CHEMBL1000 1 CETIRIZINE Small molecule \n",
"1 CHEMBL1000 1 CETIRIZINE Small molecule \n",
"2 CHEMBL1004 1 DOXYLAMINE Small molecule \n",
"3 CHEMBL1004 1 DOXYLAMINE Small molecule \n",
"4 CHEMBL1004 1 DOXYLAMINE Small molecule \n",
"... ... ... ... ... \n",
"1840 CHEMBL869 1 NITROFURAZONE Small molecule \n",
"1841 CHEMBL989 1 FLUOCINOLONE ACETONIDE Small molecule \n",
"1842 CHEMBL989 1 FLUOCINOLONE ACETONIDE Small molecule \n",
"1843 CHEMBL989 1 FLUOCINOLONE ACETONIDE Small molecule \n",
"1844 CHEMBL989 1 FLUOCINOLONE ACETONIDE Small molecule \n",
"\n",
" chemical_structure \\\n",
"0 O=C(O)COCCN1CCN(C(c2ccccc2)c2ccc(Cl)cc2)CC1 \n",
"1 O=C(O)COCCN1CCN(C(c2ccccc2)c2ccc(Cl)cc2)CC1 \n",
"2 CN(C)CCOC(C)(c1ccccc1)c1ccccn1 \n",
"3 CN(C)CCOC(C)(c1ccccc1)c1ccccn1 \n",
"4 CN(C)CCOC(C)(c1ccccc1)c1ccccn1 \n",
"... ... \n",
"1840 NC(=O)N/N=C/c1ccc([N+](=O)[O-])o1 \n",
"1841 CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)... \n",
"1842 CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)... \n",
"1843 CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)... \n",
"1844 CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)... \n",
"\n",
" inchi_key_x pert_id pert_iname pert_type \\\n",
"0 ZKLPARSLTMPFCP-UHFFFAOYSA-N BRD-A42571354 cetirizine trt_cp \n",
"1 NaN BRD-A42571354 cetirizine trt_cp \n",
"2 HCFDWZZGGLSKEP-UHFFFAOYSA-N BRD-A44008656 doxylamine trt_cp \n",
"3 HCFDWZZGGLSKEP-UHFFFAOYSA-N BRD-A44008656 doxylamine trt_cp \n",
"4 NaN BRD-A44008656 doxylamine trt_cp \n",
"... ... ... ... ... \n",
"1840 NaN BRD-K79092138 nitrofural trt_cp \n",
"1841 NaN BRD-K94353609 fluocinolone trt_cp \n",
"1842 NaN BRD-K94353609 fluocinolone trt_cp \n",
"1843 NaN BRD-K94353609 fluocinolone trt_cp \n",
"1844 NaN BRD-K94353609 fluocinolone trt_cp \n",
"\n",
" is_touchstone inchi_key_prefix inchi_key_y \\\n",
"0 1 ZKLPARSLTMPFCP ZKLPARSLTMPFCP-UHFFFAOYSA-N \n",
"1 1 ZKLPARSLTMPFCP NaN \n",
"2 1 HCFDWZZGGLSKEP HCFDWZZGGLSKEP-UHFFFAOYSA-N \n",
"3 1 HCFDWZZGGLSKEP HCFDWZZGGLSKEP-UHFFFAOYSA-N \n",
"4 1 HCFDWZZGGLSKEP NaN \n",
"... ... ... ... \n",
"1840 1 IAIWVQXQOWNYOU NaN \n",
"1841 1 FEBLZLNTKCEFIT NaN \n",
"1842 1 FEBLZLNTKCEFIT NaN \n",
"1843 1 FEBLZLNTKCEFIT NaN \n",
"1844 1 FEBLZLNTKCEFIT NaN \n",
"\n",
" canonical_smiles pubchem_cid \\\n",
"0 OC(=O)COCCN1CCN(CC1)C(c1ccccc1)c1ccc(Cl)cc1 -666 \n",
"1 OC(=O)COCCN1CCN(CC1)C(c1ccccc1)c1ccc(Cl)cc1 -666 \n",
"2 CN(C)CCOC(C)(c1ccccc1)c1ccccn1 -666 \n",
"3 CN(C)CCOC(C)(c1ccccc1)c1ccccn1 -666 \n",
"4 CN(C)CCOC(C)(c1ccccc1)c1ccccn1 -666 \n",
"... ... ... \n",
"1840 NC(=O)N/N=C/c1ccc(o1)[N+](=O)[O-] -666 \n",
"1841 CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)... 6215 \n",
"1842 CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)... 6215 \n",
"1843 CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)... 6215 \n",
"1844 CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)... 6215 \n",
"\n",
" inchi_key ATC_code_id source_id_y \n",
"0 NaN R06AE07 1 \n",
"1 ZKLPARSLTMPFCP-UHFFFAOYSA-N R06AE07 1 \n",
"2 NaN R06AA09 1 \n",
"3 NaN R06AA59 1 \n",
"4 HCFDWZZGGLSKEP-UHFFFAOYSA-N R06AA09 1 \n",
"... ... ... ... \n",
"1840 IAIWVQXQOWNYOU-FPYGCLRLSA-N S02AA02 1 \n",
"1841 FEBLZLNTKCEFIT-VSXGLTOVSA-N C05AA10 1 \n",
"1842 FEBLZLNTKCEFIT-VSXGLTOVSA-N D07AC04 1 \n",
"1843 FEBLZLNTKCEFIT-VSXGLTOVSA-N S01BA15 1 \n",
"1844 FEBLZLNTKCEFIT-VSXGLTOVSA-N S02BA08 1 \n",
"\n",
"[1845 rows x 17 columns]"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"atc_class_all"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>drug_id</th>\n",
" <th>source_id_x</th>\n",
" <th>drug_name</th>\n",
" <th>molecular_type</th>\n",
" <th>chemical_structure</th>\n",
" <th>inchi_key_x</th>\n",
" <th>pert_id</th>\n",
" <th>pert_iname</th>\n",
" <th>pert_type</th>\n",
" <th>is_touchstone</th>\n",
" <th>inchi_key_prefix</th>\n",
" <th>inchi_key_y</th>\n",
" <th>canonical_smiles</th>\n",
" <th>pubchem_cid</th>\n",
" <th>inchi_key</th>\n",
" <th>ATC_code_id</th>\n",
" <th>source_id_y</th>\n",
" <th>index</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>CHEMBL1000</td>\n",
" <td>1</td>\n",
" <td>CETIRIZINE</td>\n",
" <td>Small molecule</td>\n",
" <td>O=C(O)COCCN1CCN(C(c2ccccc2)c2ccc(Cl)cc2)CC1</td>\n",
" <td>ZKLPARSLTMPFCP-UHFFFAOYSA-N</td>\n",
" <td>BRD-A42571354</td>\n",
" <td>cetirizine</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>ZKLPARSLTMPFCP</td>\n",
" <td>ZKLPARSLTMPFCP-UHFFFAOYSA-N</td>\n",
" <td>OC(=O)COCCN1CCN(CC1)C(c1ccccc1)c1ccc(Cl)cc1</td>\n",
" <td>-666</td>\n",
" <td>NaN</td>\n",
" <td>R06AE07</td>\n",
" <td>1</td>\n",
" <td>R</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>CHEMBL1000</td>\n",
" <td>1</td>\n",
" <td>CETIRIZINE</td>\n",
" <td>Small molecule</td>\n",
" <td>O=C(O)COCCN1CCN(C(c2ccccc2)c2ccc(Cl)cc2)CC1</td>\n",
" <td>NaN</td>\n",
" <td>BRD-A42571354</td>\n",
" <td>cetirizine</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>ZKLPARSLTMPFCP</td>\n",
" <td>NaN</td>\n",
" <td>OC(=O)COCCN1CCN(CC1)C(c1ccccc1)c1ccc(Cl)cc1</td>\n",
" <td>-666</td>\n",
" <td>ZKLPARSLTMPFCP-UHFFFAOYSA-N</td>\n",
" <td>R06AE07</td>\n",
" <td>1</td>\n",
" <td>R</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>CHEMBL1004</td>\n",
" <td>1</td>\n",
" <td>DOXYLAMINE</td>\n",
" <td>Small molecule</td>\n",
" <td>CN(C)CCOC(C)(c1ccccc1)c1ccccn1</td>\n",
" <td>HCFDWZZGGLSKEP-UHFFFAOYSA-N</td>\n",
" <td>BRD-A44008656</td>\n",
" <td>doxylamine</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>HCFDWZZGGLSKEP</td>\n",
" <td>HCFDWZZGGLSKEP-UHFFFAOYSA-N</td>\n",
" <td>CN(C)CCOC(C)(c1ccccc1)c1ccccn1</td>\n",
" <td>-666</td>\n",
" <td>NaN</td>\n",
" <td>R06AA09</td>\n",
" <td>1</td>\n",
" <td>R</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CHEMBL1004</td>\n",
" <td>1</td>\n",
" <td>DOXYLAMINE</td>\n",
" <td>Small molecule</td>\n",
" <td>CN(C)CCOC(C)(c1ccccc1)c1ccccn1</td>\n",
" <td>HCFDWZZGGLSKEP-UHFFFAOYSA-N</td>\n",
" <td>BRD-A44008656</td>\n",
" <td>doxylamine</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>HCFDWZZGGLSKEP</td>\n",
" <td>HCFDWZZGGLSKEP-UHFFFAOYSA-N</td>\n",
" <td>CN(C)CCOC(C)(c1ccccc1)c1ccccn1</td>\n",
" <td>-666</td>\n",
" <td>NaN</td>\n",
" <td>R06AA59</td>\n",
" <td>1</td>\n",
" <td>R</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>CHEMBL1004</td>\n",
" <td>1</td>\n",
" <td>DOXYLAMINE</td>\n",
" <td>Small molecule</td>\n",
" <td>CN(C)CCOC(C)(c1ccccc1)c1ccccn1</td>\n",
" <td>NaN</td>\n",
" <td>BRD-A44008656</td>\n",
" <td>doxylamine</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>HCFDWZZGGLSKEP</td>\n",
" <td>NaN</td>\n",
" <td>CN(C)CCOC(C)(c1ccccc1)c1ccccn1</td>\n",
" <td>-666</td>\n",
" <td>HCFDWZZGGLSKEP-UHFFFAOYSA-N</td>\n",
" <td>R06AA09</td>\n",
" <td>1</td>\n",
" <td>R</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1840</th>\n",
" <td>CHEMBL869</td>\n",
" <td>1</td>\n",
" <td>NITROFURAZONE</td>\n",
" <td>Small molecule</td>\n",
" <td>NC(=O)N/N=C/c1ccc([N+](=O)[O-])o1</td>\n",
" <td>NaN</td>\n",
" <td>BRD-K79092138</td>\n",
" <td>nitrofural</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>IAIWVQXQOWNYOU</td>\n",
" <td>NaN</td>\n",
" <td>NC(=O)N/N=C/c1ccc(o1)[N+](=O)[O-]</td>\n",
" <td>-666</td>\n",
" <td>IAIWVQXQOWNYOU-FPYGCLRLSA-N</td>\n",
" <td>S02AA02</td>\n",
" <td>1</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1841</th>\n",
" <td>CHEMBL989</td>\n",
" <td>1</td>\n",
" <td>FLUOCINOLONE ACETONIDE</td>\n",
" <td>Small molecule</td>\n",
" <td>CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...</td>\n",
" <td>NaN</td>\n",
" <td>BRD-K94353609</td>\n",
" <td>fluocinolone</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>FEBLZLNTKCEFIT</td>\n",
" <td>NaN</td>\n",
" <td>CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...</td>\n",
" <td>6215</td>\n",
" <td>FEBLZLNTKCEFIT-VSXGLTOVSA-N</td>\n",
" <td>C05AA10</td>\n",
" <td>1</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1842</th>\n",
" <td>CHEMBL989</td>\n",
" <td>1</td>\n",
" <td>FLUOCINOLONE ACETONIDE</td>\n",
" <td>Small molecule</td>\n",
" <td>CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...</td>\n",
" <td>NaN</td>\n",
" <td>BRD-K94353609</td>\n",
" <td>fluocinolone</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>FEBLZLNTKCEFIT</td>\n",
" <td>NaN</td>\n",
" <td>CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...</td>\n",
" <td>6215</td>\n",
" <td>FEBLZLNTKCEFIT-VSXGLTOVSA-N</td>\n",
" <td>D07AC04</td>\n",
" <td>1</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1843</th>\n",
" <td>CHEMBL989</td>\n",
" <td>1</td>\n",
" <td>FLUOCINOLONE ACETONIDE</td>\n",
" <td>Small molecule</td>\n",
" <td>CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...</td>\n",
" <td>NaN</td>\n",
" <td>BRD-K94353609</td>\n",
" <td>fluocinolone</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>FEBLZLNTKCEFIT</td>\n",
" <td>NaN</td>\n",
" <td>CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...</td>\n",
" <td>6215</td>\n",
" <td>FEBLZLNTKCEFIT-VSXGLTOVSA-N</td>\n",
" <td>S01BA15</td>\n",
" <td>1</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1844</th>\n",
" <td>CHEMBL989</td>\n",
" <td>1</td>\n",
" <td>FLUOCINOLONE ACETONIDE</td>\n",
" <td>Small molecule</td>\n",
" <td>CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...</td>\n",
" <td>NaN</td>\n",
" <td>BRD-K94353609</td>\n",
" <td>fluocinolone</td>\n",
" <td>trt_cp</td>\n",
" <td>1</td>\n",
" <td>FEBLZLNTKCEFIT</td>\n",
" <td>NaN</td>\n",
" <td>CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...</td>\n",
" <td>6215</td>\n",
" <td>FEBLZLNTKCEFIT-VSXGLTOVSA-N</td>\n",
" <td>S02BA08</td>\n",
" <td>1</td>\n",
" <td>S</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1845 rows × 18 columns</p>\n",
"</div>"
],
"text/plain": [
" drug_id source_id_x drug_name molecular_type \\\n",
"0 CHEMBL1000 1 CETIRIZINE Small molecule \n",
"1 CHEMBL1000 1 CETIRIZINE Small molecule \n",
"2 CHEMBL1004 1 DOXYLAMINE Small molecule \n",
"3 CHEMBL1004 1 DOXYLAMINE Small molecule \n",
"4 CHEMBL1004 1 DOXYLAMINE Small molecule \n",
"... ... ... ... ... \n",
"1840 CHEMBL869 1 NITROFURAZONE Small molecule \n",
"1841 CHEMBL989 1 FLUOCINOLONE ACETONIDE Small molecule \n",
"1842 CHEMBL989 1 FLUOCINOLONE ACETONIDE Small molecule \n",
"1843 CHEMBL989 1 FLUOCINOLONE ACETONIDE Small molecule \n",
"1844 CHEMBL989 1 FLUOCINOLONE ACETONIDE Small molecule \n",
"\n",
" chemical_structure \\\n",
"0 O=C(O)COCCN1CCN(C(c2ccccc2)c2ccc(Cl)cc2)CC1 \n",
"1 O=C(O)COCCN1CCN(C(c2ccccc2)c2ccc(Cl)cc2)CC1 \n",
"2 CN(C)CCOC(C)(c1ccccc1)c1ccccn1 \n",
"3 CN(C)CCOC(C)(c1ccccc1)c1ccccn1 \n",
"4 CN(C)CCOC(C)(c1ccccc1)c1ccccn1 \n",
"... ... \n",
"1840 NC(=O)N/N=C/c1ccc([N+](=O)[O-])o1 \n",
"1841 CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)... \n",
"1842 CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)... \n",
"1843 CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)... \n",
"1844 CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)... \n",
"\n",
" inchi_key_x pert_id pert_iname pert_type \\\n",
"0 ZKLPARSLTMPFCP-UHFFFAOYSA-N BRD-A42571354 cetirizine trt_cp \n",
"1 NaN BRD-A42571354 cetirizine trt_cp \n",
"2 HCFDWZZGGLSKEP-UHFFFAOYSA-N BRD-A44008656 doxylamine trt_cp \n",
"3 HCFDWZZGGLSKEP-UHFFFAOYSA-N BRD-A44008656 doxylamine trt_cp \n",
"4 NaN BRD-A44008656 doxylamine trt_cp \n",
"... ... ... ... ... \n",
"1840 NaN BRD-K79092138 nitrofural trt_cp \n",
"1841 NaN BRD-K94353609 fluocinolone trt_cp \n",
"1842 NaN BRD-K94353609 fluocinolone trt_cp \n",
"1843 NaN BRD-K94353609 fluocinolone trt_cp \n",
"1844 NaN BRD-K94353609 fluocinolone trt_cp \n",
"\n",
" is_touchstone inchi_key_prefix inchi_key_y \\\n",
"0 1 ZKLPARSLTMPFCP ZKLPARSLTMPFCP-UHFFFAOYSA-N \n",
"1 1 ZKLPARSLTMPFCP NaN \n",
"2 1 HCFDWZZGGLSKEP HCFDWZZGGLSKEP-UHFFFAOYSA-N \n",
"3 1 HCFDWZZGGLSKEP HCFDWZZGGLSKEP-UHFFFAOYSA-N \n",
"4 1 HCFDWZZGGLSKEP NaN \n",
"... ... ... ... \n",
"1840 1 IAIWVQXQOWNYOU NaN \n",
"1841 1 FEBLZLNTKCEFIT NaN \n",
"1842 1 FEBLZLNTKCEFIT NaN \n",
"1843 1 FEBLZLNTKCEFIT NaN \n",
"1844 1 FEBLZLNTKCEFIT NaN \n",
"\n",
" canonical_smiles pubchem_cid \\\n",
"0 OC(=O)COCCN1CCN(CC1)C(c1ccccc1)c1ccc(Cl)cc1 -666 \n",
"1 OC(=O)COCCN1CCN(CC1)C(c1ccccc1)c1ccc(Cl)cc1 -666 \n",
"2 CN(C)CCOC(C)(c1ccccc1)c1ccccn1 -666 \n",
"3 CN(C)CCOC(C)(c1ccccc1)c1ccccn1 -666 \n",
"4 CN(C)CCOC(C)(c1ccccc1)c1ccccn1 -666 \n",
"... ... ... \n",
"1840 NC(=O)N/N=C/c1ccc(o1)[N+](=O)[O-] -666 \n",
"1841 CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)... 6215 \n",
"1842 CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)... 6215 \n",
"1843 CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)... 6215 \n",
"1844 CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)... 6215 \n",
"\n",
" inchi_key ATC_code_id source_id_y index \n",
"0 NaN R06AE07 1 R \n",
"1 ZKLPARSLTMPFCP-UHFFFAOYSA-N R06AE07 1 R \n",
"2 NaN R06AA09 1 R \n",
"3 NaN R06AA59 1 R \n",
"4 HCFDWZZGGLSKEP-UHFFFAOYSA-N R06AA09 1 R \n",
"... ... ... ... ... \n",
"1840 IAIWVQXQOWNYOU-FPYGCLRLSA-N S02AA02 1 S \n",
"1841 FEBLZLNTKCEFIT-VSXGLTOVSA-N C05AA10 1 C \n",
"1842 FEBLZLNTKCEFIT-VSXGLTOVSA-N D07AC04 1 D \n",
"1843 FEBLZLNTKCEFIT-VSXGLTOVSA-N S01BA15 1 S \n",
"1844 FEBLZLNTKCEFIT-VSXGLTOVSA-N S02BA08 1 S \n",
"\n",
"[1845 rows x 18 columns]"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Store the first character of each ATC code in a column named 'index'.\n",
"# The column is added to atc_class_all in place, so re-displaying earlier cells shows it too.\n",
"# NOTE(review): astype(str) runs before the slice, so a missing ATC code would give the first\n",
"# character of its string form instead of NaN - confirm ATC_code_id has no missing values.\n",
"# NOTE(review): the column name 'index' is easy to confuse with the DataFrame index.\n",
"atc_class_all[\"index\"] = atc_class_all[\"ATC_code_id\"].astype(str).str.get(0)\n",
"atc_class_all"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"# Keep the rows whose first ATC character (column 'index') equals \"L\".\n",
"# NOTE(review): in the WHO ATC classification group L is 'antineoplastic and immunomodulating\n",
"# agents', so this selection is wider than cancer drugs alone - confirm that is intended.\n",
"atc_cancer = atc_class_all.loc[atc_class_all[\"index\"].eq(\"L\")]"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"# Narrow the table to the four name / identifier columns, then drop rows that repeat.\n",
"atc_cancer = atc_cancer.loc[:, [\"pert_iname\", \"drug_name\", \"drug_id\", \"pert_id\"]].drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"95"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of distinct pert_iname values; dropna=False counts a missing value (if any) as one entry.\n",
"atc_cancer[\"pert_iname\"].nunique(dropna=False)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"# Save the table as CSV under a path relative to the working directory.\n",
"# index=True is the pandas default, spelled out here because the row labels are written\n",
"# as an extra leading column of the file.\n",
"atc_cancer.to_csv(\"drugs_cancer_clue.csv\", index=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment