{ "cells": [ { "cell_type": "code", "execution_count": 6, "id": "41a9395c-0e12-4e7a-85b8-1eac4f02870d", "metadata": {}, "outputs": [], "source": [ "from Bio import Entrez\n", "import GEOparse\n", "import pandas as pd\n", "import os\n", "import preprocess_functions\n", "import insert_tables\n", "import mysql.connector\n", "from mysql.connector import errorcode" ] }, { "cell_type": "code", "execution_count": 7, "id": "34bc052f-3e52-4a26-8946-0c12a9317bde", "metadata": {}, "outputs": [], "source": [ "# Directory containing the GPL *.annot platform annotation files.\n", "# Set the GEO_GPL_DIR environment variable to run on another machine;\n", "# when it is unset, the original location is used.\n", "gpl_path = os.environ.get(\"GEO_GPL_DIR\", \"/home/lmasa/GEO_Laura/data/gpl\")" ] }, { "cell_type": "code", "execution_count": 8, "id": "4981165e-a2bf-462b-8b1d-8573872e930b", "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "06-Jul-2024 13:30:43 INFO GEOparse - Parsing /home/lmasa/GEO_Laura/data/gpl/GPL94.annot: \n", "06-Jul-2024 13:30:43 DEBUG GEOparse - ANNOTATION: \n", "06-Jul-2024 13:30:43 INFO GEOparse - Parsing /home/lmasa/GEO_Laura/data/gpl/GPL571.annot: \n", "06-Jul-2024 13:30:43 DEBUG GEOparse - ANNOTATION: \n", "06-Jul-2024 13:30:43 INFO GEOparse - Parsing /home/lmasa/GEO_Laura/data/gpl/GPL570.annot: \n", "06-Jul-2024 13:30:43 DEBUG GEOparse - ANNOTATION: \n", "/home/lmasa/miniconda3/lib/python3.12/site-packages/GEOparse/GEOparse.py:401: DtypeWarning: Columns (12) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", " return read_csv(StringIO(data), index_col=None, sep=\"\\t\")\n", "06-Jul-2024 13:30:43 INFO GEOparse - Parsing /home/lmasa/GEO_Laura/data/gpl/GPL93.annot: \n", "06-Jul-2024 13:30:43 DEBUG GEOparse - ANNOTATION: \n", "06-Jul-2024 13:30:44 INFO GEOparse - Parsing /home/lmasa/GEO_Laura/data/gpl/GPL96.annot: \n", "06-Jul-2024 13:30:44 DEBUG GEOparse - ANNOTATION: \n", "06-Jul-2024 13:30:44 INFO GEOparse - Parsing /home/lmasa/GEO_Laura/data/gpl/GPL1426.annot: \n", "06-Jul-2024 13:30:44 DEBUG GEOparse - ANNOTATION: \n", "06-Jul-2024 13:30:44 INFO GEOparse - Parsing /home/lmasa/GEO_Laura/data/gpl/GPL201.annot: \n", "06-Jul-2024 13:30:44 DEBUG GEOparse - ANNOTATION: \n", "06-Jul-2024 13:30:44 INFO GEOparse - Parsing /home/lmasa/GEO_Laura/data/gpl/GPL8300.annot: \n", "06-Jul-2024 13:30:44 DEBUG GEOparse - ANNOTATION: \n", "06-Jul-2024 13:30:44 INFO GEOparse - Parsing /home/lmasa/GEO_Laura/data/gpl/GPL4191.annot: \n", "06-Jul-2024 13:30:44 DEBUG GEOparse - ANNOTATION: \n", "06-Jul-2024 13:30:44 INFO GEOparse - Parsing /home/lmasa/GEO_Laura/data/gpl/GPL97.annot: \n", "06-Jul-2024 13:30:44 DEBUG GEOparse - ANNOTATION: \n", "06-Jul-2024 13:30:44 INFO GEOparse - Parsing /home/lmasa/GEO_Laura/data/gpl/GPL10526.annot: \n", "06-Jul-2024 13:30:44 DEBUG GEOparse - ANNOTATION: \n", "06-Jul-2024 13:30:45 INFO GEOparse - Parsing /home/lmasa/GEO_Laura/data/gpl/GPL92.annot: \n", "06-Jul-2024 13:30:45 DEBUG GEOparse - ANNOTATION: \n", "06-Jul-2024 13:30:45 INFO GEOparse - Parsing /home/lmasa/GEO_Laura/data/gpl/GPL95.annot: \n", "06-Jul-2024 13:30:45 DEBUG GEOparse - ANNOTATION: \n", "06-Jul-2024 13:30:45 INFO GEOparse - Parsing /home/lmasa/GEO_Laura/data/gpl/GPL74.annot: \n", "06-Jul-2024 13:30:45 DEBUG GEOparse - ANNOTATION: \n", "06-Jul-2024 13:30:45 INFO GEOparse - Parsing /home/lmasa/GEO_Laura/data/gpl/GPL4133.annot: \n", "06-Jul-2024 13:30:45 DEBUG GEOparse - ANNOTATION: \n", "06-Jul-2024 13:30:45 INFO GEOparse - Parsing 
/home/lmasa/GEO_Laura/data/gpl/GPL80.annot: \n", "06-Jul-2024 13:30:45 DEBUG GEOparse - ANNOTATION: \n", "06-Jul-2024 13:30:45 INFO GEOparse - Parsing /home/lmasa/GEO_Laura/data/gpl/GPL10558.annot: \n", "06-Jul-2024 13:30:46 DEBUG GEOparse - ANNOTATION: \n", "06-Jul-2024 13:30:46 INFO GEOparse - Parsing /home/lmasa/GEO_Laura/data/gpl/GPL91.annot: \n", "06-Jul-2024 13:30:46 DEBUG GEOparse - ANNOTATION: \n", "06-Jul-2024 13:30:46 INFO GEOparse - Parsing /home/lmasa/GEO_Laura/data/gpl/GPL246.annot: \n", "06-Jul-2024 13:30:46 DEBUG GEOparse - ANNOTATION: \n" ] } ], "source": [ "# Load the GPL annotations from gpl_path via the project helper.\n", "# NOTE(review): the recorded log shows GEOparse parsing one .annot file per platform;\n", "# the structure of the returned object is defined in preprocess_functions - confirm there.\n", "gpl_data = preprocess_functions.fetch_gpl_annot(gpl_path)" ] }, { "cell_type": "code", "execution_count": 4, "id": "23f1d834-caeb-4f0e-a28c-95d258f8ea53", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | gpl_id | \n", "gpl_title | \n", "
---|---|---|
0 | \n", "GPL94 | \n", "[HG_U95D] Affymetrix Human Genome U95D Array | \n", "
1 | \n", "GPL571 | \n", "[HG-U133A_2] Affymetrix Human Genome U133A 2.0... | \n", "
2 | \n", "GPL570 | \n", "[HG-U133_Plus_2] Affymetrix Human Genome U133 ... | \n", "
3 | \n", "GPL93 | \n", "[HG_U95C] Affymetrix Human Genome U95C Array | \n", "
4 | \n", "GPL96 | \n", "[HG-U133A] Affymetrix Human Genome U133A Array | \n", "