#! /usr/bin/env python

"""
# ---------------------------------------------------------------------------
# functions.py
# ----------------------------------------------------------------------------
"""

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
from scipy.stats import norm
from itertools import combinations
from tqdm import tqdm
import re
from itertools import product
from statannotations.Annotator import Annotator
    

# =================================================================================

def genes_dis(enf, file):
    """
    This function creates a list with the genes associated with the disease "enf" in the dis_gen file.

    Parameters:
     enf: disease identifier, compared against the values of the "cui" column.
     file: table with parallel "cui" and "gene_id" columns (a pandas DataFrame or
           any mapping of column name -> sequence).

    Returns:
     list with the "gene_id" value of every row whose "cui" equals enf, in row
     order (duplicates are kept); empty list if the disease is not present.
    """
    # Walk both columns in lockstep instead of looking "gene_id" up with the
    # enumerate counter: that lookup is label-based on a DataFrame and fails
    # (KeyError or wrong row) whenever the index is not 0..n-1, e.g. after filtering.
    return [gen for dis, gen in zip(file["cui"], file["gene_id"]) if dis == enf]

# =================================================================================

def pro_gen_dict(gene_list, file):
    """
    This function creates a dictionary from the list of genes associated with the disease with:
     key: protein associated with each gene in the gen_pro file
     value: gene related to the key protein in the gen_pro file

    Parameters:
     gene_list: iterable of (hashable) gene ids to keep.
     file: table with parallel "gene_id" and "protein_id" columns (gen_pro).

    Returns:
     dict protein_id -> gene_id for every row whose gene is in gene_list.
     If a protein appears in several matching rows, the last row wins.
    """
    # Build the lookup set once so each row costs a constant-time membership test
    # instead of a scan over the whole gene list.
    wanted_genes = set(gene_list)

    # Pair the columns positionally; indexing "protein_id" with an enumerate
    # counter is label-based on a DataFrame and breaks on a non-default index.
    return {
        prot: gen
        for gen, prot in zip(file["gene_id"], file["protein_id"])
        if gen in wanted_genes
    }

# =================================================================================

def gen_pro_PPI(dict1, file):
    """
    From a dictionary with the relationships between proteins and genes associated with each of our diseases,
    this function retains the prot:gen relationship from the dictionary only if such prot appears in the PPI network of the pro_pro file.
    key: proteins appearing in the PPI network
    value: genes related to the key protein

    Parameters:
     dict1: dict protein -> gene.
     file: PPI edge table with the interacting proteins in columns "prA" and "prB".

    Returns:
     new dict with the entries of dict1 whose protein occurs in "prA" or "prB",
     in the same order as dict1.
    """
    # Collect every protein of the PPI network once; the columns used to be
    # converted to lists and scanned again for each protein of dict1.
    ppi_proteins = set(file["prA"]) | set(file["prB"])

    return {prot: gen for prot, gen in dict1.items() if prot in ppi_proteins}

# =================================================================================

def lcc(SG):
    """
    This function gives us the LCC (largest connected component) of the proteins from the PPI network
    associated with a disease, from a subgraph formed only with the proteins associated with the disease.

    Parameters:
     SG: undirected networkx graph (typically PPI.subgraph(disease_proteins)).

    Returns:
     set with the nodes of the connected component that has the most nodes
     (the first one found in case of a tie), or an empty set when SG has no nodes.
    """
    # default=set() keeps an empty subgraph (no disease protein present in the
    # network) from failing with "max() arg is an empty sequence"; its LCC is
    # simply empty, so len() of the result is 0.
    return max(nx.connected_components(SG), key=len, default=set())

# =================================================================================

def nodes_by_degree(G):
    """
    This function groups the nodes of the network G by their degree.

    Returns a dictionary whose keys are the degrees found in G and whose values
    are lists with all the nodes having that degree, in the iteration order of G.nodes().
    """
    grouped = {}
    for vertex in G.nodes():
        # Create the bucket for this degree on first use, then add the node to it.
        grouped.setdefault(G.degree(vertex), []).append(vertex)
    return grouped

# =================================================================================

def degrees_list(G):
    """
    This function returns two lists: the nodes of G and the degree of each node.

    The first list comes from G.nodes(); the second holds the values of the
    node -> degree mapping built from G.degree().
    """
    node_names = [*G.nodes()]
    degree_by_node = dict(G.degree())
    return node_names, [*degree_by_node.values()]

# =================================================================================

def calculate_lcc_for_cell_type(degs_cell_type, gen_pro, pro_pro, PPI):
    """
    This function computes the LCC of the subnetwork formed by the proteins of the DEGs of a cell type.

    Parameters:
     degs_cell_type: genes (DEGs) of the cell type, passed to pro_gen_dict as gene list.
     gen_pro: gene/protein table with "gene_id" and "protein_id" columns.
     pro_pro: PPI edge table with "prA" and "prB" columns.
     PPI: graph of the PPI network; only its subgraph() method is used here.

    Returns:
     the result of lcc() on the subgraph induced by the DEG proteins present in the PPI network.
    """
    # Proteins corresponding to the DEGs, restricted to those that appear in the interactome.
    interactome_proteins = gen_pro_PPI(pro_gen_dict(degs_cell_type, gen_pro), pro_pro)

    # Subnetwork of the cell type (iterating the dict yields its protein keys) and its LCC.
    return lcc(PPI.subgraph(interactome_proteins))

# =================================================================================

def calculate_lcc_from_prots(prots, pro_pro, PPI):
    """
    This function computes the LCC of the subnetwork formed by a collection of proteins.

    Parameters:
     prots: iterable of (hashable) protein ids; entries that are not in the PPI
            table (including None) are ignored.
     pro_pro: PPI edge table with the interacting proteins in columns "prA" and "prB".
     PPI: graph of the PPI network; only its subgraph() method is used here.

    Returns:
     the result of lcc() on the subgraph induced by the proteins found in pro_pro.
    """
    # Build the set of interactome proteins once; both columns used to be turned
    # into lists and scanned again for every protein.
    interactome_proteins = set(pro_pro["prA"]) | set(pro_pro["prB"])

    # Keep only the proteins that appear in the PPI network, preserving their order.
    prots_interactome = [prot for prot in prots if prot in interactome_proteins]

    return lcc(PPI.subgraph(prots_interactome))

# =================================================================================

def generate_log_bins(graph, num_bins):
    """
    This function generates logarithmic bins to group nodes of a graph based on
    the degree distribution of the nodes.

    Parameters:
     graph: object whose degree() yields (node, degree) pairs (e.g. a networkx graph).
     num_bins: number of bin edges to generate.

    Returns:
     numpy array with num_bins values spaced evenly on a log10 scale between
     max(minimum degree, 1) and the maximum degree.

    Raises:
     ValueError: if the graph has no nodes.
    """
    degrees = [degree for _, degree in graph.degree()]
    if not degrees:
        raise ValueError("cannot generate degree bins for a graph without nodes")

    min_degree = max(min(degrees), 1)  # To avoid log10(0)
    # The upper edge needs the same protection: when every node is isolated the
    # maximum degree is 0 and log10(0) = -inf would turn the edges into nan/0.
    max_degree = max(max(degrees), min_degree)

    return np.logspace(np.log10(min_degree), np.log10(max_degree), num_bins)

# =================================================================================

def group_nodes_by_bins(graph, log_bins):
    """
    This function distributes the nodes of a graph over the given bins according to their degree.

    Parameters:
     graph: object whose degree() yields (node, degree) pairs.
     log_bins: bin edges, as produced by generate_log_bins.

    Returns:
     dictionary bin index -> list of nodes whose degree falls in that bin.
    """
    grouped = {}
    for vertex, vertex_degree in graph.degree():
        # np.digitize counts how many edges the degree has reached (for increasing
        # edges); subtracting 1 gives the 0-based index of the bin's left edge.
        # A degree below the first edge therefore ends up under index -1.
        position = np.digitize(vertex_degree, log_bins) - 1
        if position not in grouped:
            grouped[position] = []
        grouped[position].append(vertex)

    return grouped

# =================================================================================

def random_subset_generator(proteins, graph_ppi, num_iterations):
    """
    This function generates random protein sets in which every protein is replaced
    by a different node taken at random from the same logarithmic degree bin.

    Parameters:
     proteins: iterable of proteins; each one is assumed to be a node of graph_ppi
               (its degree is read with graph_ppi.degree(prot)).
     graph_ppi: graph of the PPI network.
     num_iterations: number of random sets to generate.

    Returns:
     list with one list per iteration. Each inner list is aligned with proteins and
     holds the chosen node converted to str, or None when the bin of the protein
     contains no other node. Iterations without any truthy entry are left out.
    """
    # Generation of logarithmic bins
    num_bins = 10
    bin_edges = generate_log_bins(graph_ppi, num_bins)

    # Group nodes in logarithmic bins
    group_nodes_bins = group_nodes_by_bins(graph_ppi, bin_edges)

    # The bin of a protein does not change between iterations, so resolve the
    # candidate nodes of every protein once instead of once per iteration.
    proteins = list(proteins)
    candidates_per_protein = []
    for prot in proteins:
        # bin of the node based on its degree
        bin_index = np.digitize(graph_ppi.degree(prot), bin_edges) - 1
        available_nodes = group_nodes_bins.get(bin_index, [])

        # The rejection loop below can only finish if the bin holds some node
        # other than the protein itself. A protein alone in its bin (typical for
        # the highest-degree hub) used to make that loop spin forever.
        has_alternative = any(node != prot for node in available_nodes)
        candidates_per_protein.append(available_nodes if has_alternative else None)

    results = []

    for _ in range(num_iterations):  # For each iteration

        iteration_results = []  # replacement chosen for each protein, in order

        for prot, available_nodes in zip(proteins, candidates_per_protein):
            if available_nodes is None:
                iteration_results.append(None)
                continue

            # choose a random node of the bin, drawing again while it is the protein itself
            random_node = np.random.choice(available_nodes)
            while random_node == prot:
                random_node = np.random.choice(available_nodes)

            iteration_results.append(str(random_node))

        if any(iteration_results):
            results.append(iteration_results)

    return results

# =================================================================================

def load_and_process(files):
    """
    This function reads several CSV files and merges them into a single DataFrame,
    adding a 'cell_type' column taken from the name of each file.

    Parameters:
     files: iterable of paths (str or path-like) to CSV files named like
            "degs_<celltype>_mapped.csv"; the cell type is the second
            '_'-separated field of the file name.

    Returns:
     DataFrame with the rows of all files concatenated (original row indices are
     kept) plus the 'cell_type' column.

    Raises:
     IndexError: if a file name contains no '_'.
     ValueError: from pandas.concat if files is empty.
    """
    import os  # local import so the function does not depend on the module header

    df_list = []
    for file in files:
        df = pd.read_csv(file)
        # Extract the cell type from the file name only. Splitting the whole path
        # picked the wrong field whenever a directory name contained '_'.
        file_name = os.path.basename(os.fspath(file))
        cell_type = file_name.split('_')[1]
        df['cell_type'] = cell_type  # Add the 'cell_type' column to the DataFrame
        df_list.append(df)
    merged_df = pd.concat(df_list)
    return merged_df

# =================================================================================