From 147232414f82f8e4acfe4d09bafc3b47689ceacc Mon Sep 17 00:00:00 2001
From: Laura Masa
Date: Tue, 9 Jul 2024 10:47:11 +0200
Subject: [PATCH] Correcting files

---
 ..._neuro_diseases_final_disease_selected.tsv |   0
 .../data}/raw/top10_genecount_disease.tsv     |   0
 .../results}/disease_gds.csv                  |   0
 .../results}/gds_gpl.csv                      |   0
 .../results}/prefilt_disease_gds.csv          |   0
 .../scripts}/download_files.ipynb             |   0
 .../{ => scripts}/download_files.py           |   0
 .../scripts}/gpl_main.ipynb                   |   0
 .../scripts}/insert_tables.py                 |   0
 .../scripts}/preprocess_functions.py          |   0
 .../scripts}/preprocess_upload_data.ipynb     |   0
 scripts/download_files.py                     | 302 ------------------
 12 files changed, 302 deletions(-)
 rename {data => data_processing/data}/raw/data_01_neuro_diseases_final_disease_selected.tsv (100%)
 rename {data => data_processing/data}/raw/top10_genecount_disease.tsv (100%)
 rename {results => data_processing/results}/disease_gds.csv (100%)
 rename {results => data_processing/results}/gds_gpl.csv (100%)
 rename {results => data_processing/results}/prefilt_disease_gds.csv (100%)
 rename {scripts => data_processing/scripts}/download_files.ipynb (100%)
 rename data_processing/{ => scripts}/download_files.py (100%)
 rename {scripts => data_processing/scripts}/gpl_main.ipynb (100%)
 rename {scripts => data_processing/scripts}/insert_tables.py (100%)
 rename {scripts => data_processing/scripts}/preprocess_functions.py (100%)
 rename {scripts => data_processing/scripts}/preprocess_upload_data.ipynb (100%)
 delete mode 100644 scripts/download_files.py

diff --git a/data/raw/data_01_neuro_diseases_final_disease_selected.tsv b/data_processing/data/raw/data_01_neuro_diseases_final_disease_selected.tsv
similarity index 100%
rename from data/raw/data_01_neuro_diseases_final_disease_selected.tsv
rename to data_processing/data/raw/data_01_neuro_diseases_final_disease_selected.tsv
diff --git a/data/raw/top10_genecount_disease.tsv b/data_processing/data/raw/top10_genecount_disease.tsv
similarity index 100%
rename from data/raw/top10_genecount_disease.tsv
rename to data_processing/data/raw/top10_genecount_disease.tsv
diff --git a/results/disease_gds.csv b/data_processing/results/disease_gds.csv
similarity index 100%
rename from results/disease_gds.csv
rename to data_processing/results/disease_gds.csv
diff --git a/results/gds_gpl.csv b/data_processing/results/gds_gpl.csv
similarity index 100%
rename from results/gds_gpl.csv
rename to data_processing/results/gds_gpl.csv
diff --git a/results/prefilt_disease_gds.csv b/data_processing/results/prefilt_disease_gds.csv
similarity index 100%
rename from results/prefilt_disease_gds.csv
rename to data_processing/results/prefilt_disease_gds.csv
diff --git a/scripts/download_files.ipynb b/data_processing/scripts/download_files.ipynb
similarity index 100%
rename from scripts/download_files.ipynb
rename to data_processing/scripts/download_files.ipynb
diff --git a/data_processing/download_files.py b/data_processing/scripts/download_files.py
similarity index 100%
rename from data_processing/download_files.py
rename to data_processing/scripts/download_files.py
diff --git a/scripts/gpl_main.ipynb b/data_processing/scripts/gpl_main.ipynb
similarity index 100%
rename from scripts/gpl_main.ipynb
rename to data_processing/scripts/gpl_main.ipynb
diff --git a/scripts/insert_tables.py b/data_processing/scripts/insert_tables.py
similarity index 100%
rename from scripts/insert_tables.py
rename to data_processing/scripts/insert_tables.py
diff --git a/scripts/preprocess_functions.py b/data_processing/scripts/preprocess_functions.py
similarity index 100%
rename from scripts/preprocess_functions.py
rename to data_processing/scripts/preprocess_functions.py
diff --git a/scripts/preprocess_upload_data.ipynb b/data_processing/scripts/preprocess_upload_data.ipynb
similarity index 100%
rename from scripts/preprocess_upload_data.ipynb
rename to data_processing/scripts/preprocess_upload_data.ipynb
diff --git a/scripts/download_files.py b/scripts/download_files.py
deleted file mode 100644
index 873a45b..0000000
--- a/scripts/download_files.py
+++ /dev/null
@@ -1,302 +0,0 @@
-import requests
-from Bio import Entrez
-import GEOparse
-import os
-import gzip
-import shutil
-import pandas as pd
-from os.path import join
-from collections import defaultdict
-import numpy as np
-
-
-# =================================================================================
-
-def extract_tsv(tsv_file):
-    """
-    Extracts the disease cui and disease name columns from a TSV file and returns a DataFrame
-    with the selected columns (cui and disease_name), renaming the 'cui' column to 'disease_id'.
-
-    Input Parameters:
-    - tsv_file (str): Path to the TSV file to be read.
-
-    Returns:
-    - pd.DataFrame: A DataFrame containing the selected data with the renamed column.
-    """
-
-    use_cols = ['cui', 'disease_name']
-    new_column_names = {'cui': 'disease_id'}
-
-    disease_df = pd.read_csv(tsv_file, delimiter='\t', usecols=use_cols)
-
-    disease_df.rename(columns=new_column_names, inplace=True)
-
-    return disease_df
-
-
-
-
-###########################
-
-
-def download_and_save_gds(email_request, disease_df, gds_path):
-    """
-    Queries GEO GDS to find datasets associated with given disease names and returns a DataFrame.
-
-    Input Parameters:
-    - email_request (str): Email address for Entrez API usage.
-    - disease_df (pd.DataFrame): DataFrame containing 'disease_id' and 'disease_name' columns with disease names to query.
-
-    Returns:
-    - pd.DataFrame: DataFrame with columns 'disease_id' and 'gds_id' representing the relationship between
-      disease IDs and GDS identifiers.
-    """
-    Entrez.email = email_request
-
-    rows = []
-
-    # Ensure the directory exists
-    if not os.path.exists(gds_path):
-        os.makedirs(gds_path)
-
-    unique_disease_names = set(disease_df['disease_name'])
-
-    for disease_name in unique_disease_names:
-        query = f'"Homo sapiens"[Organism] AND "disease state" AND "{disease_name}"'
-
-        with Entrez.esearch(db="gds", term=query, retmax=1000) as handle:
-            record = Entrez.read(handle)
-
-        id_list = record.get("IdList", [])
-
-        if not id_list:
-            print(f"No matching records found for: {disease_name}")
-            continue
-
-        for gds_id in id_list:
-            with Entrez.esummary(db="gds", id=gds_id) as handle:
-                gds_summaries = Entrez.read(handle)
-
-            for summary in gds_summaries:
-                if summary['Accession'].startswith("GDS"):
-                    geo_accession = summary['Accession']
-                    gds = GEOparse.get_GEO(geo=geo_accession, destdir=gds_path, annotate_gpl=True)
-                    disease_id = disease_df[disease_df['disease_name'] == disease_name]['disease_id'].values[0]
-                    rows.append({
-                        "disease_id": disease_id,
-                        "gds_id": summary['Accession']
-                    })
-                    break
-
-    dis_gds_df = pd.DataFrame(rows)
-
-    return dis_gds_df
-
-
-
-
-
-# =================================================================================
-
-
-
-
-
-def decompress_gds_gz_files(gds_path):
-    """
-    Decompresses .gz files in a directory.
-
-    Input Parameters:
-    gds_path (str): Path to the directory containing .gz files.
-
-    Returns:
-    None
-    """
-
-    for filename in os.listdir(gds_path):
-
-        if filename.startswith("GDS") and filename.endswith(".gz"):
-
-            filepath = os.path.join(gds_path, filename)
-
-            uncompressed_filepath = os.path.join(gds_path, filename[:-3])  # [:-3] removes the .gz extension
-
-            with gzip.open(filepath, 'rb') as compressed_file:
-                with open(uncompressed_filepath, 'wb') as uncompressed_file:
-
-                    shutil.copyfileobj(compressed_file, uncompressed_file)
-
-
-
-
-
-
-def filter_gds(gds_path, disease_gds_df):
-    """
-    Filters GDS files based on specific criteria (the `value_type` of the dataset must be "count", the `channel_count` must be 1, and the disease state column must be present) and updates the provided DataFrame to include only those GDS IDs that meet the criteria.
-
-    Args:
-        gds_path (str): Path to the directory containing GDS .soft files.
-        disease_gds_df (pandas.DataFrame): DataFrame containing GDS IDs and associated disease information.
-            This DataFrame must have a column named 'gds_id' which contains GDS IDs.
-
-    Returns:
-        pandas.DataFrame: A filtered DataFrame containing only the rows where 'gds_id' meets the criteria (count value type and single channel).
-
-    """
-
-    # Initialize an empty list to store valid GDS IDs
-    valid_gds_ids = []
-
-    # Iterate over all files in the directory specified by gds_path
-    for filename in os.listdir(gds_path):
-        # Check if the file is a GDS .soft file
-        if filename.startswith("GDS") and filename.endswith(".soft"):
-            filepath = os.path.join(gds_path, filename)
-            gds = GEOparse.get_GEO(filepath=filepath)  # Load the GDS file using GEOparse
-            gds_id = gds.name  # Extract the GDS ID from the GDS metadata
-
-            # Get the 'value_type' and 'channel_count' from the GDS metadata
-            value_type = gds.metadata.get('value_type', [None])[0]
-            channel_count = int(gds.metadata.get('channel_count', [None])[0])
-
-            # Check if 'value_type' is 'count' and 'channel_count' is 1
-            if value_type.lower() == 'count' and channel_count == 1:
-                gds_annot = gds.columns.reset_index().rename(columns={'index': 'gsm_id'})  # get the GDS annotations
-
-                # Check if the 'disease state' column is present
-                if 'disease state' in gds_annot.columns:
-                    valid_gds_ids.append(gds_id)  # add the GDS ID to the list of valid GDS IDs
-
-    # Filter the disease_gds_df to include only rows where 'gds_id' is in the list of valid GDS IDs
-    diseases_gds_filtered = disease_gds_df[disease_gds_df['gds_id'].isin(valid_gds_ids)]
-
-    return diseases_gds_filtered
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-def create_gds_gpl_mapping_and_download(gds_path,gpl_path):
-    """
-    Creates a mapping of GDS IDs to corresponding GPL IDs and downloads each GPL file.
-
-    Input Parameters:
-    gds_path (str): Path to the directory containing GDS files.
-    gpl_path (str): Path to the directory where GPL files are downloaded.
-
-    Returns:
-    pd.DataFrame: DataFrame with columns 'gds_id' and 'gpl_id', mapping GDS IDs to GPL IDs.
-    """
-    rows = []
-
-    for filename in os.listdir(gds_path):
-        if filename.startswith("GDS") and filename.endswith(".soft"):
-            filepath = os.path.join(gds_path, filename)
-            gds = GEOparse.get_GEO(filepath=filepath)
-            gds_id = gds.name
-            geo_platform = gds.metadata.get('platform', [])
-
-            for gpl_id in geo_platform:
-                rows.append({'gds_id': gds.name, 'gpl_id': gpl_id})
-                gpl = GEOparse.get_GEO(geo=gpl_id, destdir=gpl_path, annotate_gpl=True)
-                print(f"Downloaded GPL {gpl_id} for GDS {gds_id} to {os.path.join(gpl_path, gpl_id)}")
-
-
-    gds_gpl_df = pd.DataFrame(rows)
-
-    return gds_gpl_df
-
-
-# =================================================================================
-
-
-
-
-def download_gpl(gds_gpl_df, gpl_path):
-    """
-    Fetches unique GPL data for the provided GDS-GPL mapping from a DataFrame and stores them in the specified directory.
-
-    Input Parameters:
-    gds_gpl_df (pandas.DataFrame): DataFrame containing GDS IDs and corresponding GPL IDs.
-    gpl_path (str): Path to the directory to store GPL files.
-    """
-    # Ensure the directory exists
-    if not os.path.exists(gpl_path):
-        os.makedirs(gpl_path)
-
-    # Extract unique GPL IDs
-    diff_gpl_ids = gds_gpl_df['gpl_id'].unique()
-
-    # Iterate through the unique GPL IDs and download GPL files
-    for gpl_id in diff_gpl_ids:
-        gpl = GEOparse.get_GEO(geo=gpl_id, destdir=gpl_path, annotate_gpl=True)
-
-
-
-# =================================================================================
-
-
-
-def decompress_gpl_gz_files(gpl_path):
-    """
-    Decompresses .gz files in a directory.
-
-    Input Parameters:
-    gpl_path (str): Path to the directory containing .gz files.
-
-    Returns:
-    None
-    """
-
-    for filename in os.listdir(gpl_path):
-
-        if filename.startswith("GPL") and filename.endswith(".gz"):
-
-            filepath = os.path.join(gpl_path, filename)
-
-            uncompressed_filepath = os.path.join(gpl_path, filename[:-3])  # [:-3] removes the .gz extension
-
-            with gzip.open(filepath, 'rb') as compressed_file:
-
-                with open(uncompressed_filepath, 'wb') as uncompressed_file:
-
-                    shutil.copyfileobj(compressed_file, uncompressed_file)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
--
2.24.1