From 70a81af319c12b3ca77275bf3186bb4052584b5c Mon Sep 17 00:00:00 2001
From: Laura Masa
Date: Mon, 1 Jul 2024 19:01:50 +0200
Subject: [PATCH] Initial commit

---
 download_files.py | 302 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 302 insertions(+)
 create mode 100644 download_files.py

diff --git a/download_files.py b/download_files.py
new file mode 100644
index 0000000..873a45b
--- /dev/null
+++ b/download_files.py
@@ -0,0 +1,302 @@
+from Bio import Entrez
+import GEOparse
+import os
+import gzip
+import shutil
+import pandas as pd
+
+
+# =================================================================================
+
+def extract_tsv(tsv_file):
+    """
+    Reads a TSV file containing disease CUIs and disease names and returns a DataFrame
+    with the selected columns ('cui' and 'disease_name'), renaming 'cui' to 'disease_id'.
+
+    Input Parameters:
+    - tsv_file (str): Path to the TSV file to be read.
+
+    Returns:
+    - pd.DataFrame: A DataFrame with the columns 'disease_id' and 'disease_name'.
+    """
+    use_cols = ['cui', 'disease_name']
+    new_column_names = {'cui': 'disease_id'}
+
+    disease_df = pd.read_csv(tsv_file, delimiter='\t', usecols=use_cols)
+    disease_df.rename(columns=new_column_names, inplace=True)
+
+    return disease_df
+
+
+# =================================================================================
+
+def download_and_save_gds(email_request, disease_df, gds_path):
+    """
+    Queries GEO GDS to find datasets associated with the given disease names, downloads them
+    with GEOparse into gds_path and returns a DataFrame relating diseases to GDS identifiers.
+
+    Input Parameters:
+    - email_request (str): Email address for Entrez API usage.
+    - disease_df (pd.DataFrame): DataFrame with 'disease_id' and 'disease_name' columns containing the disease names to query.
+    - gds_path (str): Path to the directory where the downloaded GDS files are stored.
+
+    Returns:
+    - pd.DataFrame: DataFrame with columns 'disease_id' and 'gds_id' representing the relationship
+      between disease IDs and GDS identifiers.
+    """
+    Entrez.email = email_request
+
+    rows = []
+
+    # Ensure the output directory exists
+    if not os.path.exists(gds_path):
+        os.makedirs(gds_path)
+
+    unique_disease_names = set(disease_df['disease_name'])
+
+    for disease_name in unique_disease_names:
+        query = f'"Homo sapiens"[Organism] AND "disease state" AND "{disease_name}"'
+
+        # Search GEO DataSets for records matching the query
+        with Entrez.esearch(db="gds", term=query, retmax=1000) as handle:
+            record = Entrez.read(handle)
+
+        id_list = record.get("IdList", [])
+
+        if not id_list:
+            print(f"No matching records found for: {disease_name}")
+            continue
+
+        for gds_uid in id_list:
+            with Entrez.esummary(db="gds", id=gds_uid) as handle:
+                gds_summaries = Entrez.read(handle)
+
+            for summary in gds_summaries:
+                # Keep only curated datasets (GDS accessions); other GEO record types are skipped
+                if summary['Accession'].startswith("GDS"):
+                    geo_accession = summary['Accession']
+                    GEOparse.get_GEO(geo=geo_accession, destdir=gds_path, annotate_gpl=True)
+                    disease_id = disease_df[disease_df['disease_name'] == disease_name]['disease_id'].values[0]
+                    rows.append({
+                        "disease_id": disease_id,
+                        "gds_id": geo_accession
+                    })
+                    break
+
+    dis_gds_df = pd.DataFrame(rows)
+
+    return dis_gds_df
+
+
+# =================================================================================
+
+def decompress_gds_gz_files(gds_path):
+    """
+    Decompresses the GDS .gz files in a directory.
+
+    Input Parameters:
+    gds_path (str): Path to the directory containing the GDS .gz files.
+
+    Returns:
+    None
+    """
+    for filename in os.listdir(gds_path):
+        if filename.startswith("GDS") and filename.endswith(".gz"):
+            filepath = os.path.join(gds_path, filename)
+            # filename[:-3] strips the trailing '.gz' extension
+            uncompressed_filepath = os.path.join(gds_path, filename[:-3])
+
+            with gzip.open(filepath, 'rb') as compressed_file:
+                with open(uncompressed_filepath, 'wb') as uncompressed_file:
+                    shutil.copyfileobj(compressed_file, uncompressed_file)
+
+
+# =================================================================================
+
+def filter_gds(gds_path, disease_gds_df):
+    """
+    Filters GDS files based on specific criteria (the dataset 'value_type' must be "count",
+    the 'channel_count' must be 1 and the 'disease state' column must be present) and keeps
+    in the provided DataFrame only the GDS IDs that meet those criteria.
+
+    Input Parameters:
+    gds_path (str): Path to the directory containing GDS .soft files.
+    disease_gds_df (pandas.DataFrame): DataFrame containing GDS IDs and associated disease information.
+        It must have a column named 'gds_id' with the GDS IDs.
+
+    Returns:
+    pandas.DataFrame: A filtered DataFrame containing only the rows whose 'gds_id' meets the criteria
+        (count value type, single channel and a 'disease state' annotation).
+    """
+    # List of GDS IDs that pass all the filters
+    valid_gds_ids = []
+
+    # Iterate over all files in the directory specified by gds_path
+    for filename in os.listdir(gds_path):
+        # Only consider GDS .soft files
+        if filename.startswith("GDS") and filename.endswith(".soft"):
+            filepath = os.path.join(gds_path, filename)
+            gds = GEOparse.get_GEO(filepath=filepath)  # Load the GDS file using GEOparse
+            gds_id = gds.name  # Extract the GDS ID from the GDS metadata
+
+            # Get 'value_type' and 'channel_count' from the GDS metadata
+            value_type = gds.metadata.get('value_type', [None])[0]
+            channel_count = gds.metadata.get('channel_count', [None])[0]
+
+            # Skip datasets whose metadata lacks either field
+            if value_type is None or channel_count is None:
+                continue
+
+            # Keep only single-channel count datasets
+            if value_type.lower() == 'count' and int(channel_count) == 1:
+                # Get the GDS sample annotations
+                gds_annot = gds.columns.reset_index().rename(columns={'index': 'gsm_id'})
+
+                # Keep the dataset only if the 'disease state' column is present
+                if 'disease state' in gds_annot.columns:
+                    valid_gds_ids.append(gds_id)
+
+    # Keep only the rows whose 'gds_id' is in the list of valid GDS IDs
+    diseases_gds_filtered = disease_gds_df[disease_gds_df['gds_id'].isin(valid_gds_ids)]
+
+    return diseases_gds_filtered
+
+
+# =================================================================================
+
+def create_gds_gpl_mapping_and_download(gds_path, gpl_path):
+    """
+    Creates a mapping of GDS IDs to their corresponding GPL IDs and downloads the annotated GPL files.
+
+    Input Parameters:
+    gds_path (str): Path to the directory containing GDS .soft files.
+    gpl_path (str): Path to the directory where the downloaded GPL files are stored.
+
+    Returns:
+    pd.DataFrame: DataFrame with columns 'gds_id' and 'gpl_id', mapping GDS IDs to GPL IDs.
+    """
+    rows = []
+
+    for filename in os.listdir(gds_path):
+        if filename.startswith("GDS") and filename.endswith(".soft"):
+            filepath = os.path.join(gds_path, filename)
+            gds = GEOparse.get_GEO(filepath=filepath)
+            gds_id = gds.name
+            geo_platforms = gds.metadata.get('platform', [])
+
+            # Record every GDS-GPL pair and download the corresponding platform annotation
+            for gpl_id in geo_platforms:
+                rows.append({'gds_id': gds_id, 'gpl_id': gpl_id})
+                GEOparse.get_GEO(geo=gpl_id, destdir=gpl_path, annotate_gpl=True)
+                print(f"Downloaded GPL {gpl_id} for GDS {gds_id} to {os.path.join(gpl_path, gpl_id)}")
+
+    gds_gpl_df = pd.DataFrame(rows)
+
+    return gds_gpl_df
+
+
+# =================================================================================
+
+def download_gpl(gds_gpl_df, gpl_path):
+    """
+    Fetches the unique GPL platforms referenced in the provided GDS-GPL mapping and stores them
+    in the specified directory.
+
+    Input Parameters:
+    gds_gpl_df (pandas.DataFrame): DataFrame containing GDS IDs and their corresponding GPL IDs.
+    gpl_path (str): Path to the directory where the GPL files are stored.
+
+    Returns:
+    None
+    """
+    # Ensure the output directory exists
+    if not os.path.exists(gpl_path):
+        os.makedirs(gpl_path)
+
+    # Extract the unique GPL IDs
+    diff_gpl_ids = gds_gpl_df['gpl_id'].unique()
+
+    # Download the annotated GPL file for each unique GPL ID
+    for gpl_id in diff_gpl_ids:
+        GEOparse.get_GEO(geo=gpl_id, destdir=gpl_path, annotate_gpl=True)
+
+
+# =================================================================================
+
+def decompress_gpl_gz_files(gpl_path):
+    """
+    Decompresses the GPL .gz files in a directory.
+
+    Input Parameters:
+    gpl_path (str): Path to the directory containing the GPL .gz files.
+
+    Returns:
+    None
+    """
+    for filename in os.listdir(gpl_path):
+        if filename.startswith("GPL") and filename.endswith(".gz"):
+            filepath = os.path.join(gpl_path, filename)
+            # filename[:-3] strips the trailing '.gz' extension
+            uncompressed_filepath = os.path.join(gpl_path, filename[:-3])
+
+            with gzip.open(filepath, 'rb') as compressed_file:
+                with open(uncompressed_filepath, 'wb') as uncompressed_file:
+                    shutil.copyfileobj(compressed_file, uncompressed_file)
\ No newline at end of file
-- 
2.24.1
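
For reference, a minimal driver sketch showing how the functions added by this patch might be chained into one pipeline. The TSV path, output directories, contact email and output file names below are illustrative assumptions, not part of the patch:

# Hypothetical driver script placed next to download_files.py; all paths and the email are placeholders.
from download_files import (
    extract_tsv,
    download_and_save_gds,
    decompress_gds_gz_files,
    filter_gds,
    create_gds_gpl_mapping_and_download,
    decompress_gpl_gz_files,
)

if __name__ == "__main__":
    gds_path = "data/gds"   # assumed directory for downloaded GDS files
    gpl_path = "data/gpl"   # assumed directory for downloaded GPL files

    # Disease table -> GDS downloads -> decompression -> filtering -> GPL mapping and download
    disease_df = extract_tsv("diseases.tsv")  # assumed TSV with 'cui' and 'disease_name' columns
    dis_gds_df = download_and_save_gds("user@example.org", disease_df, gds_path)
    decompress_gds_gz_files(gds_path)
    dis_gds_filtered = filter_gds(gds_path, dis_gds_df)
    gds_gpl_df = create_gds_gpl_mapping_and_download(gds_path, gpl_path)
    decompress_gpl_gz_files(gpl_path)

    # Persist the two relation tables (output names are also assumptions)
    dis_gds_filtered.to_csv("disease_gds.tsv", sep="\t", index=False)
    gds_gpl_df.to_csv("gds_gpl.tsv", sep="\t", index=False)

If the platforms are not fetched during the mapping step, download_gpl could be called instead with a precomputed gds_gpl_df to retrieve the unique GPL files.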