#! /usr/bin/env python
"""
# ---------------------------------------------------------------------------
# functions.py
# ----------------------------------------------------------------------------
"""
import os
import re
from itertools import combinations
from itertools import product

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import norm
from statannotations.Annotator import Annotator
from tqdm import tqdm


# =================================================================================
def genes_dis(enf, file):
    """
    Return the genes associated with the disease "enf" in the dis_gen table.

    Parameters
    ----------
    enf : disease identifier, compared with ``==`` against each "cui" entry.
    file : table (e.g. a pandas DataFrame) with parallel "cui" and
        "gene_id" columns.

    Returns
    -------
    list with the "gene_id" of every row whose "cui" equals ``enf``, in row
    order (duplicates are kept).
    """
    # Pair the two columns positionally instead of looking "gene_id" up by
    # label, so the result is correct even when the table does not carry a
    # default 0..n-1 index (e.g. after filtering rows).
    return [gen for dis, gen in zip(file["cui"], file["gene_id"]) if dis == enf]


# =================================================================================
def pro_gen_dict(gene_list, file):
    """
    Build the protein -> gene mapping for the genes in ``gene_list``.

    Parameters
    ----------
    gene_list : iterable of (hashable) gene ids associated with a disease.
    file : gen_pro table with parallel "gene_id" and "protein_id" columns.

    Returns
    -------
    dict with
        key: protein associated with each selected gene in the gen_pro table
        value: gene related to the key protein in the gen_pro table
    If a protein appears in several selected rows, the last row wins.
    """
    # Build the lookup once: set membership is O(1) per row.
    wanted = set(gene_list)
    # Columns are paired positionally (see genes_dis) so index labels are
    # irrelevant.
    return {
        prot: gen
        for gen, prot in zip(file["gene_id"], file["protein_id"])
        if gen in wanted
    }


# =================================================================================
def _interactome_proteins(pro_pro):
    """Return the set of proteins found in the "prA" or "prB" column of the PPI table."""
    return set(pro_pro["prA"]) | set(pro_pro["prB"])


# =================================================================================
def gen_pro_PPI(dict1, file):
    """
    Keep only the prot:gen pairs whose protein appears in the PPI network.

    Parameters
    ----------
    dict1 : dict mapping protein -> gene for a disease.
    file : pro_pro table with the interaction columns "prA" and "prB".

    Returns
    -------
    dict with
        key: proteins of ``dict1`` appearing in the PPI network
        value: genes related to the key protein
    """
    # Collect the PPI proteins once instead of rebuilding two lists per protein.
    ppi_proteins = _interactome_proteins(file)
    return {prot: gen for prot, gen in dict1.items() if prot in ppi_proteins}


# =================================================================================
def lcc(SG):
    """
    Return the largest connected component (LCC) of the subgraph ``SG``.

    ``SG`` is expected to be the subgraph of the PPI network induced by the
    proteins associated with a disease. Because those proteins come from a
    prot:gen dictionary, the number of proteins in the LCC is also the number
    of disease genes in the LCC.

    Returns
    -------
    set of nodes forming the largest component; an empty set when ``SG`` has
    no nodes (instead of failing inside ``max``).
    """
    return max(nx.connected_components(SG), key=len, default=set())


# =================================================================================
def nodes_by_degree(G):
    """
    Group the nodes of ``G`` by degree.

    Returns
    -------
    dict whose keys are degrees and whose values are the lists of nodes
    having that degree.
    """
    grouped = {}
    for node, degree in G.degree():
        grouped.setdefault(degree, []).append(node)
    return grouped


# =================================================================================
def degrees_list(G):
    """
    Return a list with the nodes of ``G`` and another list with their degrees.

    Both lists follow the node order of the graph, so position ``k`` of the
    second list is the degree of the node at position ``k`` of the first.
    """
    nodes = list(G.nodes())
    degrees = [degree for _, degree in G.degree()]
    return nodes, degrees


# =================================================================================
def calculate_lcc_for_cell_type(degs_cell_type, gen_pro, pro_pro, PPI):
    """
    Compute the LCC of the PPI subnetwork induced by a cell type's DEGs.

    Parameters
    ----------
    degs_cell_type : iterable of gene ids (the DEGs of the cell type).
    gen_pro : table relating "gene_id" and "protein_id".
    pro_pro : PPI table with columns "prA" and "prB".
    PPI : graph of the interactome (must provide ``subgraph``).

    Returns
    -------
    set of proteins in the largest connected component.
    """
    # Proteins corresponding to the DEGs.
    prots = pro_gen_dict(degs_cell_type, gen_pro)
    # Keep only the proteins present in the interactome.
    prots_interactome = gen_pro_PPI(prots, pro_pro)
    # Subnetwork of the cell type built from its DEGs.
    SG = PPI.subgraph(prots_interactome)
    return lcc(SG)


# =================================================================================
def calculate_lcc_from_prots(prots, pro_pro, PPI):
    """
    Compute the LCC of the PPI subnetwork induced by a collection of proteins.

    Parameters
    ----------
    prots : iterable of proteins (entries not found in the PPI table, such as
        ``None`` placeholders, are ignored).
    pro_pro : PPI table with columns "prA" and "prB".
    PPI : graph of the interactome (must provide ``subgraph``).

    Returns
    -------
    set of proteins in the largest connected component.
    """
    # Collect the PPI proteins once instead of rebuilding two lists per protein.
    ppi_proteins = _interactome_proteins(pro_pro)
    prots_interactome = [prot for prot in prots if prot in ppi_proteins]
    SG = PPI.subgraph(prots_interactome)
    return lcc(SG)


# =================================================================================
def generate_log_bins(graph, num_bins):
    """
    Generate logarithmically spaced bin edges from the degree distribution
    of the nodes of ``graph``.

    Parameters
    ----------
    graph : graph whose ``degree()`` yields (node, degree) pairs.
    num_bins : number of edges to generate.

    Returns
    -------
    numpy array with ``num_bins`` values, log-spaced between the minimum
    degree (at least 1) and the maximum degree.
    """
    degrees = [degree for _, degree in graph.degree()]
    min_degree = max(min(degrees), 1)  # avoid log(0)
    # Never let the upper edge fall below the lower one; this only matters
    # when every node is isolated (max degree 0), which would give log(0).
    max_degree = max(max(degrees), min_degree)
    return np.logspace(np.log10(min_degree), np.log10(max_degree), num_bins)


# =================================================================================
def group_nodes_by_bins(graph, log_bins):
    """
    Group the nodes of ``graph`` into logarithmic bins according to degree.

    Parameters
    ----------
    graph : graph whose ``degree()`` yields (node, degree) pairs.
    log_bins : bin edges, e.g. as returned by ``generate_log_bins``.

    Returns
    -------
    dict mapping bin index -> list of nodes in that bin. Indices are
    ``np.digitize(...) - 1``, so degrees below the first edge land in bin -1.
    """
    nodes_bins = {}
    for node, degree in graph.degree():
        bin_index = np.digitize(degree, log_bins) - 1  # shift to 0-based index
        nodes_bins.setdefault(bin_index, []).append(node)
    return nodes_bins


# =================================================================================
def random_subset_generator(proteins, graph_ppi, num_iterations, num_bins=10):
    """
    Generate degree-matched random protein sets.

    For every protein, a random node from the same logarithmic degree bin
    (and different from the protein itself) is drawn with ``np.random.choice``.

    Parameters
    ----------
    proteins : iterable of proteins; they are expected to be nodes of
        ``graph_ppi``.
    graph_ppi : graph of the interactome.
    num_iterations : number of random sets to attempt.
    num_bins : number of logarithmic bin edges (default 10).

    Returns
    -------
    list of lists. Each inner list has one entry per input protein: the
    string form of the sampled node, or ``None`` when the protein's bin
    offers no other node. Iterations without any truthy entry are omitted,
    so the result may hold fewer than ``num_iterations`` lists.
    """
    # Generation of logarithmic bins and grouping of the nodes into them.
    bin_edges = generate_log_bins(graph_ppi, num_bins)
    nodes_by_bin = group_nodes_by_bins(graph_ppi, bin_edges)

    # The candidate pool of a protein does not change between iterations,
    # so resolve it once up front.
    pools = []
    for prot in proteins:
        bin_index = np.digitize(graph_ppi.degree(prot), bin_edges) - 1
        bin_nodes = nodes_by_bin.get(bin_index, [])
        if any(node != prot for node in bin_nodes):
            pools.append((prot, np.asarray(bin_nodes)))
        else:
            # Empty bin, or the protein is alone in its bin (typical for the
            # top hub): there is nothing to swap it with. Without this guard
            # the rejection loop below would never terminate.
            pools.append((prot, None))

    results = []
    for _ in range(num_iterations):
        sample = []
        for prot, pool in pools:
            if pool is None:
                sample.append(None)
                continue
            # Rejection sampling: redraw until the node differs from the
            # original protein.
            pick = np.random.choice(pool)
            while pick == prot:
                pick = np.random.choice(pool)
            sample.append(str(pick))
        if any(sample):
            results.append(sample)
    return results


# =================================================================================
def load_and_process(files):
    """
    Read several CSV files and concatenate them, tagging rows with a cell type.

    The cell type is the second "_"-separated token of the file name
    (for example "celltype" in "degs_celltype_mapped.csv"); any directory
    part of the path is ignored.

    Parameters
    ----------
    files : iterable of paths (str or path-like) to CSV files.

    Returns
    -------
    pandas DataFrame with the rows of every file plus a 'cell_type' column.

    Raises
    ------
    IndexError if a file name contains no "_".
    """
    df_list = []
    for file in files:
        df = pd.read_csv(file)
        # Use only the file name so that "_" in directory names cannot shift
        # the token position.
        file_name = os.path.basename(os.fspath(file))
        df['cell_type'] = file_name.split('_')[1]
        df_list.append(df)
    merged_df = pd.concat(df_list)
    return merged_df


# =================================================================================