New folder

8f0c5b2f · Laura Masa · af1b227b · 8f0c5b2f
Commit 8f0c5b2f authored Jul 01, 2024 by Laura Masa
Hide whitespace changes
Inline Side-by-side

Showing with 302 additions and 0 deletions

data_processing/download_files.py data_processing/download_files.py +302 -0

No files found.
--- a/data_processing/download_files.py
+++ b/data_processing/download_files.py
+import requests
+from Bio import Entrez
+import GEOparse
+import os
+import gzip
+import shutil
+import pandas as pd
+from os.path import join
+from collections import defaultdict
+import numpy as np
+
+
+# ================================================================================= 
+
+def extract_tsv(tsv_file):
+    """
+    Load the disease table from a tab-separated file.
+
+    Only the 'cui' and 'disease_name' columns are read from the file; the
+    'cui' column is exposed under the header 'disease_id' in the result.
+
+    Input Parameters:
+    - tsv_file (str): Path to the TSV file to be read.
+
+    Returns:
+    - pd.DataFrame: The two selected columns, with 'cui' renamed to 'disease_id'.
+    """
+    wanted_columns = ['cui', 'disease_name']
+
+    raw_table = pd.read_csv(tsv_file, delimiter='\t', usecols=wanted_columns)
+
+    # rename() without inplace hands back the relabelled frame directly
+    return raw_table.rename(columns={'cui': 'disease_id'})
+
+
+
+
+###########################
+
+
+def download_and_save_gds(email_request, disease_df, gds_path):
+    """
+    Queries GEO GDS to find datasets associated with given disease names, downloads them and
+    returns a DataFrame relating each disease to the datasets found for it.
+
+    For every distinct disease name the Entrez "gds" database is searched (up to 1000 hits).
+    The summary of each hit is fetched, and every hit whose accession starts with "GDS" is
+    downloaded into `gds_path` through GEOparse and recorded in the result.
+
+    Input Parameters:
+    - email_request (str): Email address for Entrez API usage.
+    - disease_df (pd.DataFrame): DataFrame containing 'disease_id' and 'disease_name' columns with disease names to query.
+    - gds_path (str): Directory where the GDS files are stored; created if it does not exist.
+
+    Returns:
+    - pd.DataFrame: DataFrame with columns 'disease_id' and 'gds_id' representing the relationship between
+                    disease IDs and GDS identifiers. The columns are present even when nothing was found.
+    """
+    Entrez.email = email_request
+
+    rows = []
+
+    # exist_ok avoids the check-then-create race of a separate os.path.exists() test
+    os.makedirs(gds_path, exist_ok=True)
+
+    # Map each disease name to the first disease_id listed for it, once, instead of
+    # re-filtering the whole DataFrame for every hit. Rows without a name are skipped
+    # (they cannot be queried), and the dict keeps the names in order of first appearance
+    # so repeated runs visit them in the same order.
+    named_diseases = disease_df.dropna(subset=['disease_name'])
+    disease_id_by_name = {}
+    for name, disease_id in zip(named_diseases['disease_name'], named_diseases['disease_id']):
+        disease_id_by_name.setdefault(name, disease_id)
+
+    for disease_name, disease_id in disease_id_by_name.items():
+        query = f'"Homo sapiens"[Organism] AND "disease state" AND "{disease_name}"'
+
+        with Entrez.esearch(db="gds", term=query, retmax=1000) as handle:
+            record = Entrez.read(handle)
+
+        id_list = record.get("IdList", [])
+
+        if not id_list:
+            print(f"No matching records found for: {disease_name}")
+            continue
+
+        for gds_id in id_list:
+            with Entrez.esummary(db="gds", id=gds_id) as handle:
+                gds_summaries = Entrez.read(handle)
+
+            for summary in gds_summaries:
+                geo_accession = summary['Accession']
+                if not geo_accession.startswith("GDS"):
+                    continue
+
+                # Called for its side effect only: the file is saved under gds_path
+                GEOparse.get_GEO(geo=geo_accession, destdir=gds_path, annotate_gpl=True)
+                rows.append({
+                    "disease_id": disease_id,
+                    "gds_id": geo_accession
+                })
+                # Only the first GDS summary of each search hit is used
+                break
+
+    # Explicit columns keep the schema stable when no dataset was found
+    dis_gds_df = pd.DataFrame(rows, columns=["disease_id", "gds_id"])
+
+    return dis_gds_df
+
+
+
+
+
+# ================================================================================= 
+
+
+
+            
+                
+def decompress_gds_gz_files(gds_path):
+    """
+    Unpacks every gzip-compressed GDS file found in a directory.
+
+    Each file whose name starts with "GDS" and ends with ".gz" is decompressed
+    next to the archive, under the same name without the ".gz" suffix. The
+    compressed originals are left in place.
+
+    Input Parameters:
+        gds_path (str): Path to the directory containing .gz files.
+
+    Returns:
+        None
+    """
+    archive_names = [
+        entry for entry in os.listdir(gds_path)
+        if entry.startswith("GDS") and entry.endswith(".gz")
+    ]
+
+    for archive_name in archive_names:
+        source_path = os.path.join(gds_path, archive_name)
+        # [:-3] removes the trailing ".gz" extension
+        target_path = os.path.join(gds_path, archive_name[:-3])
+
+        with gzip.open(source_path, 'rb') as packed, open(target_path, 'wb') as unpacked:
+            shutil.copyfileobj(packed, unpacked)
+               
+                
+                
+
+
+
+def filter_gds(gds_path, disease_gds_df):
+    """
+    Filters GDS files based on specific criteria and keeps only the rows of the provided
+    DataFrame whose GDS ID meets all of them.
+
+    A dataset is kept when the `value_type` in its metadata is "count" (case-insensitive),
+    its `channel_count` is 1 and its sample annotation table has a 'disease state' column.
+    Datasets whose metadata lacks `value_type` or `channel_count`, or whose `channel_count`
+    is not an integer, are treated as not meeting the criteria.
+
+    Args:
+        gds_path (str): Path to the directory containing GDS .soft files.
+        disease_gds_df (pandas.DataFrame): DataFrame containing GDS IDs and associated disease information.
+            This DataFrame must have a column named 'gds_id' which contains GDS IDs.
+
+    Returns:
+        pandas.DataFrame: A filtered DataFrame containing only the rows where 'gds_id' meets the criteria.
+
+    """
+
+    valid_gds_ids = []
+
+    for filename in os.listdir(gds_path):
+        # Only GDS .soft files are inspected
+        if not (filename.startswith("GDS") and filename.endswith(".soft")):
+            continue
+
+        gds = GEOparse.get_GEO(filepath=os.path.join(gds_path, filename))
+
+        # Metadata values are lists; `or [None]` also covers a present-but-empty list
+        value_type = (gds.metadata.get('value_type') or [None])[0]
+        raw_channel_count = (gds.metadata.get('channel_count') or [None])[0]
+
+        # A dataset without these fields cannot satisfy the criteria; skip it
+        # rather than crash on None.lower() / int(None)
+        if value_type is None or raw_channel_count is None:
+            continue
+
+        try:
+            channel_count = int(raw_channel_count)
+        except (TypeError, ValueError):
+            continue
+
+        if value_type.lower() != 'count' or channel_count != 1:
+            continue
+
+        # gds.columns is the annotation table; its column labels tell
+        # whether a 'disease state' subset is present
+        if 'disease state' in gds.columns.columns:
+            valid_gds_ids.append(gds.name)
+
+    # Keep only the rows whose 'gds_id' passed every check
+    diseases_gds_filtered = disease_gds_df[disease_gds_df['gds_id'].isin(valid_gds_ids)]
+
+    return diseases_gds_filtered
+
+
+
+
+
+
+
+
+
+
+ 
+
+            
+def create_gds_gpl_mapping_and_download(gds_path,gpl_path):
+    """
+    Creates a mapping of GDS IDs to corresponding GPL IDs and downloads the GPL files.
+
+    Every GDS .soft file in `gds_path` is parsed and each entry of its 'platform' metadata
+    is recorded against the GDS ID. Each distinct platform is fetched through GEOparse
+    into `gpl_path` the first time it is seen; platforms shared by several datasets are
+    not fetched again.
+
+    Input Parameters:
+        gds_path (str): Path to the directory containing GDS files.
+        gpl_path (str): Destination directory passed to GEOparse for the GPL files.
+
+    Returns:
+        pd.DataFrame: DataFrame with columns 'gds_id' and 'gpl_id', mapping GDS IDs to GPL IDs.
+            The columns are present even when no GDS file was found.
+    """
+    rows = []
+    fetched_gpl_ids = set()  # platforms already requested during this call
+
+    for filename in os.listdir(gds_path):
+        if not (filename.startswith("GDS") and filename.endswith(".soft")):
+            continue
+
+        gds = GEOparse.get_GEO(filepath=os.path.join(gds_path, filename))
+        gds_id = gds.name
+
+        for gpl_id in gds.metadata.get('platform', []):
+            rows.append({'gds_id': gds_id, 'gpl_id': gpl_id})
+
+            # The mapping row is always recorded, but fetching and parsing the
+            # same platform again for another dataset would be wasted work
+            if gpl_id in fetched_gpl_ids:
+                continue
+
+            # Called for its side effect only: the file is saved under gpl_path
+            GEOparse.get_GEO(geo=gpl_id, destdir=gpl_path, annotate_gpl=True)
+            fetched_gpl_ids.add(gpl_id)
+            print(f"Downloaded GPL {gpl_id} for GDS {gds_id} to {os.path.join(gpl_path, gpl_id)}")
+
+    # Explicit columns keep the schema stable when there is nothing to map
+    gds_gpl_df = pd.DataFrame(rows, columns=['gds_id', 'gpl_id'])
+
+    return gds_gpl_df
+              
+            
+# =================================================================================            
+                
+                
+        
+                
+def download_gpl(gds_gpl_df, gpl_path):
+    """
+    Fetches unique GPL data for the provided GDS-GPL mapping from a DataFrame and stores them in the specified directory.
+
+    Input Parameters:
+        gds_gpl_df (pandas.DataFrame): DataFrame containing GDS IDs and corresponding GPL IDs.
+            Only its 'gpl_id' column is read.
+        gpl_path (str): Path to the directory to store GPL files; created if it does not exist.
+
+    Returns:
+        None
+    """
+    # exist_ok avoids the check-then-create race of a separate os.path.exists() test
+    os.makedirs(gpl_path, exist_ok=True)
+
+    # Each platform is requested once, however many datasets reference it
+    for gpl_id in gds_gpl_df['gpl_id'].unique():
+        # Called for its side effect only: the file is saved under gpl_path
+        GEOparse.get_GEO(geo=gpl_id, destdir=gpl_path, annotate_gpl=True)
+                
+        
+        
+# =================================================================================        
+        
+        
+        
+def decompress_gpl_gz_files(gpl_path):
+    """
+    Unpacks every gzip-compressed GPL file found in a directory.
+
+    Each file whose name starts with "GPL" and ends with ".gz" is decompressed
+    next to the archive, under the same name without the ".gz" suffix. The
+    compressed originals are left in place.
+
+    Input Parameters:
+        gpl_path (str): Path to the directory containing .gz files.
+
+    Returns:
+        None
+    """
+    for entry in os.listdir(gpl_path):
+        is_gpl_archive = entry.startswith("GPL") and entry.endswith(".gz")
+        if not is_gpl_archive:
+            continue
+
+        archive_path = os.path.join(gpl_path, entry)
+        # [:-3] removes the trailing ".gz" extension
+        output_path = os.path.join(gpl_path, entry[:-3])
+
+        with gzip.open(archive_path, 'rb') as gz_stream, open(output_path, 'wb') as plain_stream:
+            shutil.copyfileobj(gz_stream, plain_stream)
+
+         
+            
+        
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+            
+            
+            
+            
+            
+            
+            
+            
+            
\ No newline at end of file