Code update

84259f21 · Andrea · cd9227e2 · 84259f21 · cd9227e2 · 84259f21
Commit 84259f21 authored Mar 28, 2025 by Andrea
3 changed files
--- a/code/functions/functions.py
+++ b/code/functions/functions.py
+#! /usr/bin/env python
+
+"""
+# ---------------------------------------------------------------------------
+# functions.py
+# ----------------------------------------------------------------------------
+"""
+
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+import networkx as nx
+from scipy.stats import norm
+from itertools import combinations
+from tqdm import tqdm
+import re
+from itertools import product
+from statannotations.Annotator import Annotator
+    
+
+# =================================================================================
+
+def genes_dis(enf, file):
+    """
+    This function creates a list with the genes associated with the disease "enf" in the dis_gen file.
+
+    Parameters
+    ----------
+    enf : disease identifier, compared with == against every value of file["cui"].
+    file : table exposing the aligned columns "cui" and "gene_id"
+        (e.g. a pandas DataFrame, or any mapping of equally long sequences).
+
+    Returns
+    -------
+    list
+        The "gene_id" value of every row whose "cui" equals enf, in row order.
+        Duplicates are kept; the list is empty when no row matches.
+    """
+    # Walk both columns in lockstep instead of re-indexing "gene_id" with the
+    # enumerate() counter: that counter is a position, while file["gene_id"][i]
+    # is a label lookup, so it fails or picks the wrong row as soon as the
+    # table does not carry the default 0..n-1 index (e.g. after filtering).
+    return [gen for dis, gen in zip(file["cui"], file["gene_id"]) if dis == enf]
+
+# =================================================================================
+
+def pro_gen_dict(gene_list, file):
+    """
+    This function creates a dictionary from the list of genes associated with the disease with:
+     key: protein associated with each gene in the gen_pro file
+     value: gene related to the key protein in the gen_pro file
+
+    Parameters
+    ----------
+    gene_list : iterable of (hashable) gene ids to keep.
+    file : table exposing the aligned columns "gene_id" and "protein_id"
+        (e.g. a pandas DataFrame).
+
+    Returns
+    -------
+    dict
+        protein_id -> gene_id for every row whose gene is in gene_list.
+        When a protein appears in several matching rows, the last row wins.
+    """
+    # Build the lookup set once: membership is then O(1) per row, and the test
+    # is made against the values of gene_list even if a pandas Series is passed
+    # (the `in` operator on a Series checks its index, not its values).
+    wanted_genes = set(gene_list)
+
+    # Iterate both columns in lockstep rather than using the enumerate()
+    # position as an index label, which breaks on non-default indexes.
+    return {
+        prot: gen
+        for gen, prot in zip(file["gene_id"], file["protein_id"])
+        if gen in wanted_genes
+    }
+
+# =================================================================================
+
+def gen_pro_PPI(dict1, file):
+    """
+    From a dictionary with the relationships between proteins and genes associated with each of our diseases,
+    this function retains the prot:gen relationship from the dictionary only if such prot appears in the PPI network of the pro_pro file.
+    key: proteins appearing in the PPI network
+    value: genes related to the key protein
+
+    Parameters
+    ----------
+    dict1 : dict mapping protein -> gene.
+    file : table exposing the interaction columns "prA" and "prB"
+        (e.g. a pandas DataFrame).
+
+    Returns
+    -------
+    dict
+        The entries of dict1 whose key occurs in "prA" or "prB",
+        in the original insertion order.
+    """
+    # Collect every protein of the PPI network once. The columns were previously
+    # converted to lists and scanned linearly for each protein of dict1.
+    ppi_proteins = set(file["prA"]) | set(file["prB"])
+
+    # Keep only the prot:gen pairs whose protein is part of the PPI network.
+    return {prot: gen for prot, gen in dict1.items() if prot in ppi_proteins}
+
+# =================================================================================
+
+def lcc(SG):
+    """
+    This function gives us the LCC (largest connected component) of the proteins from the PPI network
+    associated with a disease, from a subgraph formed only with the proteins associated with the disease.
+
+    Parameters
+    ----------
+    SG : undirected networkx graph (typically a subgraph of the PPI network).
+
+    Returns
+    -------
+    set
+        Nodes of the connected component with the most nodes.
+        An empty set is returned when SG has no nodes.
+    """
+    # Our goal is to obtain the number of genes that are part of the LCC of the disease:
+    # the number of proteins from the disease in the LCC is the same number as the genes in the LCC
+    # (the proteins come from the dictionary where each one is paired with its associated gene).
+
+    # `default` keeps max() from raising ValueError when the subgraph is empty,
+    # e.g. when none of the requested proteins is present in the interactome.
+    return max(nx.connected_components(SG), key=len, default=set())
+
+# =================================================================================
+
+def nodes_by_degree(G):
+    """
+    This function groups the nodes of the network by degree.
+
+    Returns a dictionary whose keys are the degrees found in G and whose values
+    are the lists of nodes having that degree, in the iteration order of G.nodes().
+    """
+    nodes_per_degree = {}
+    for vertex in G.nodes():
+        # Create the bucket for this degree on first sight, then file the node in it.
+        nodes_per_degree.setdefault(G.degree(vertex), []).append(vertex)
+
+    return nodes_per_degree
+
+# =================================================================================
+
+def degrees_list(G):
+    """
+    This function returns two lists: the nodes of G and the degree of each node.
+
+    The degrees are listed in the order in which G.degree() yields its
+    (node, degree) pairs.
+    """
+    node_names = list(G.nodes())
+    node_degrees = [deg for _, deg in G.degree()]
+
+    return node_names, node_degrees
+
+# =================================================================================
+
+def calculate_lcc_for_cell_type(degs_cell_type, gen_pro, pro_pro, PPI):
+    """
+    This function returns the LCC of the PPI subnetwork built from the DEGs of one cell type.
+
+    degs_cell_type: genes (DEGs) of the cell type
+    gen_pro: gene/protein table passed to pro_gen_dict
+    pro_pro: protein/protein table passed to gen_pro_PPI
+    PPI: graph from which the subgraph is taken
+    """
+    # Proteins corresponding to the DEGs, restricted to those found in the interactome.
+    deg_proteins = gen_pro_PPI(pro_gen_dict(degs_cell_type, gen_pro), pro_pro)
+
+    # Subnetwork of the cell type and its largest connected component.
+    return lcc(PPI.subgraph(deg_proteins))
+
+# =================================================================================
+
+def calculate_lcc_from_prots(prots, pro_pro, PPI):
+    """
+    This function returns the LCC of the PPI subnetwork formed by a collection of proteins.
+
+    Parameters
+    ----------
+    prots : iterable of protein identifiers.
+    pro_pro : table exposing the interaction columns "prA" and "prB"
+        (e.g. a pandas DataFrame).
+    PPI : graph from which the subgraph is taken.
+
+    Returns
+    -------
+    The result of lcc() on the subgraph of PPI induced by the proteins of
+    prots that occur in "prA" or "prB".
+    """
+    # Collect the proteins of the PPI table once instead of converting both
+    # columns to lists (and scanning them linearly) for every protein.
+    ppi_proteins = set(pro_pro["prA"]) | set(pro_pro["prB"])
+
+    # Selecting proteins that appear in the PPI network.
+    prots_interactome = [prot for prot in prots if prot in ppi_proteins]
+
+    SG = PPI.subgraph(prots_interactome)
+    return lcc(SG)
+
+# =================================================================================
+
+def generate_log_bins(graph, num_bins):
+    """
+    This function generates logarithmic bins to group nodes of a graph based on
+    the degree distribution of the nodes.
+
+    Parameters
+    ----------
+    graph : object whose degree() yields (node, degree) pairs (e.g. a networkx graph).
+    num_bins : number of bin edges to generate.
+
+    Returns
+    -------
+    numpy.ndarray
+        num_bins values spaced evenly on a log scale between the smallest
+        degree (at least 1) and the largest degree.
+
+    Raises
+    ------
+    ValueError
+        If the graph has no nodes.
+    """
+    degrees = [degree for _, degree in graph.degree()]
+    if not degrees:
+        raise ValueError("cannot generate degree bins for a graph without nodes")
+
+    min_degree = max(min(degrees), 1)  # To avoid log(0)
+    # Clamp the upper edge as well: if every node is isolated the largest
+    # degree is 0 and log10(0) = -inf would turn the bin edges into NaN.
+    max_degree = max(max(degrees), min_degree)
+
+    return np.logspace(np.log10(min_degree), np.log10(max_degree), num_bins)
+
+# =================================================================================
+
+def group_nodes_by_bins(graph, log_bins):
+    """
+    This function groups the nodes of a graph into bins according to their degree.
+
+    Returns a dictionary mapping a 0-based bin index to the list of nodes whose
+    degree falls in that bin of log_bins. The index is -1 for degrees below the
+    first edge.
+    """
+    buckets = {}
+    for vertex, vertex_degree in graph.degree():
+        # np.digitize counts bins from 1; shift to a 0-based index.
+        position = np.digitize(vertex_degree, log_bins) - 1
+        if position not in buckets:
+            buckets[position] = []
+        buckets[position].append(vertex)
+
+    return buckets
+
+# =================================================================================
+
+def random_subset_generator(proteins, graph_ppi, num_iterations, num_bins=10):
+    """
+    This function generates random protein sets in which every input protein is
+    replaced by a randomly chosen node of the same logarithmic degree bin.
+
+    Parameters
+    ----------
+    proteins : sequence of nodes of graph_ppi to randomize.
+    graph_ppi : graph used to compute the degrees and the degree bins.
+    num_iterations : number of random sets to draw.
+    num_bins : number of logarithmic bin edges (default 10, the value that was
+        previously hard-coded).
+
+    Returns
+    -------
+    list of lists
+        One list per iteration, aligned with `proteins`. Each entry is str(node)
+        of the sampled replacement, or None when the bin of the protein holds
+        no other node. Iterations without any truthy entry are dropped.
+    """
+    # Generation of logarithmic bins
+    bin_edges = generate_log_bins(graph_ppi, num_bins)
+
+    # Group nodes in logarithmic bins
+    group_nodes_bins = group_nodes_by_bins(graph_ppi, bin_edges)
+
+    # The candidates of each protein do not change between iterations, so they
+    # are computed once. The protein itself is removed from its own pool up
+    # front: re-drawing until a different node came out never terminated when
+    # the protein was the only node of its bin.
+    candidate_pools = []
+    for prot in proteins:
+        # bin of the node based on its degree
+        bin_index = np.digitize(graph_ppi.degree(prot), bin_edges) - 1
+        # nodes of the same bin, excluding the protein being replaced
+        pool = [node for node in group_nodes_bins.get(bin_index, []) if node != prot]
+        candidate_pools.append(pool)
+
+    results = []
+
+    for _ in range(num_iterations):  # For each iteration
+
+        # one randomly chosen replacement (or None) per input protein
+        iteration_results = [
+            str(np.random.choice(pool)) if pool else None
+            for pool in candidate_pools
+        ]
+
+        if any(iteration_results):
+            results.append(iteration_results)
+
+    return results
+
+# =================================================================================
+
+def load_and_process(files):
+    """
+    This function reads several CSV files and concatenates them into one DataFrame,
+    adding a 'cell_type' column taken from each file name.
+
+    Parameters
+    ----------
+    files : iterable of CSV paths (str or path-like) whose file name has the
+        cell type as its second '_'-separated field
+        (for example "degs_celltype_mapped.csv").
+
+    Returns
+    -------
+    pandas.DataFrame
+        Row-wise concatenation of all files (original row indexes are kept),
+        with the extra column 'cell_type'.
+
+    Raises
+    ------
+    IndexError
+        If a file name contains no '_'.
+    ValueError
+        Raised by pandas.concat when `files` is empty.
+    """
+    # Local import so the block does not depend on the module-level import list.
+    from pathlib import Path
+
+    df_list = []
+    for file in files:
+        df = pd.read_csv(file)
+        # Extract the cell type from the file NAME only: splitting the whole
+        # path gave a wrong value whenever a directory contained an underscore.
+        cell_type = Path(file).name.split('_')[1]
+        df['cell_type'] = cell_type  # Add the 'cell_type' column to the DataFrame
+        df_list.append(df)
+    merged_df = pd.concat(df_list)
+    return merged_df
+
+# =================================================================================
\ No newline at end of file
--- a/code/functions/functions_proximity.py
+++ b/code/functions/functions_proximity.py
--- a/code/scrna_ppi_analysis.ipynb
+++ b/code/scrna_ppi_analysis.ipynb