clustering.py

import pandas as pd
import Levenshtein
import time
from sklearn.cluster import OPTICS,DBSCAN,AgglomerativeClustering,BisectingKMeans,SpectralClustering
from sklearn.preprocessing import StandardScaler
import numpy as np
from matplotlib import pyplot as plt

from scipy.spatial.distance import pdist, squareform
from pyclustering.cluster.dbscan import dbscan
from pyclustering.utils import timedcall
from Levenshtein import distance
import re
from minineedle import needle, smith, core
from Bio.Blast.Applications import NcbiblastpCommandline
from io import StringIO
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import swalign
from scipy.cluster.hierarchy import dendrogram
import multiprocessing as mp
globi=0
df_b=None
def plot_dendrogram(model, **kwargs):
    """
    Plot dendrogram for hierarchical clustering model.

    Parameters:
    - model: Hierarchical clustering model
    - **kwargs: Additional keyword arguments for dendrogram
    """
    # Children of hierarchical clustering
    children = model.children_

    # Distances between each pair of children
    # Since we don't have this information, we can use a uniform one for plotting
    distance = np.arange(children.shape[0])

    # The number of observations contained in each cluster level
    no_of_observations = np.arange(2, children.shape[0]+2)

    # Create linkage matrix and then plot the dendrogram
    linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)     
def substitute_or_remove_prot_id(data,archNom,sub_rem):
    """
    Substitute or remove protein IDs in the given DataFrame based on a substitution file.

    Parameters:
    - data: DataFrame containing protein data
    - archNom: File path for the IDs to be substituted to its primary entry 
    - sub_rem: String indicating the operation type ('s' for substitution, 'r' for removal)

    This function reads a file containing information for protein ID substitution or removal and performs the
    corresponding operation on the provided DataFrame. The substitution or removal is based on the content of the file
    specified by the 'archNom' parameter.

    If 'sub_rem' is set to 's' (substitution), it replaces protein IDs in the DataFrame with the specified IDs from
    the substitution file.

    If 'sub_rem' is set to 'r' (removal), it removes rows from the DataFrame where the protein ID matches the specified
    other protein IDs or its sequences matches the other protein sequence in the dataframe once substituted. The removed rows are saved to a CSV file.

    Note: The function modifies the input DataFrame and returns the modified DataFrame.

    Example:
    >>> data = substitute_or_remove_prot_id(data, "substitution_file.txt", "s")
    >>> data = substitute_or_remove_prot_id(data, "removal_file.txt", "r")
    """
    print("inside the problem")
    with open(archNom) as prottosubs:
          index=prottosubs.readline()
          acept=index.split()
          listtosubs={}
          for i in range(0,len(acept)):
            listtosubs[acept[i]]=[]
          while line := prottosubs.readline():
              newline=line.split()
              #print(len(newline))
              for i in range(0,len(newline)):
                  
                  listtosubs[list(listtosubs.keys())[i]].append(newline[i].strip())  
    resub=1
    if re.search("Primary",list(listtosubs.keys())[0]):
           resub=0
    print((resub+1)%2)
    #print(data)
    #data2=data.copy()
    if(sub_rem == "s"):
       data["protein_id"].replace(list(listtosubs.values())[(resub+1)%2], list(listtosubs.values())[resub])
    #datacp=data.copy()
    #print(pd.concat([data2,datacp]).drop_duplicates())
    else: 
        global globi
        datas= data[data["protein_id"].isin(list(listtosubs.values())[(resub+1)%2])==True]
        data = data[data["protein_id"].isin(list(listtosubs.values())[(resub+1)%2])==False]
        
        datas.to_csv('resultados/proteinasDescartadas_'+ str(globi) +'.csv', index=False) 

        globi=globi+1 
        #data.to_excel('data_nervous_genes_xf.xlsx')
    return data                
def readData(archivoEntrada, enfermedad):
    """
    Read data from an Excel file and filter based on disease_id.

    Parameters:
    - archivoEntrada: Input Excel file path
    - enfermedad: Disease ID for filtering
    
    Returns:
    - Protein sequences data
    """
    data = pd.read_excel(archivoEntrada)
    if (enfermedad != ''):
        
        
        data = data.loc[data["disease_id"] == enfermedad]
        filt_data=len(data)
        print("proteinas descartadas post filtro, principal: " + str(filt_data-len(data)))      

    sequences = data["protein_sequence"]

    return sequences
def save_clusters(data,archivoEntrada,archNom,similarity_matrix,cluster,method,sal):
    """
    Save clustered protein data and related information into CSV files.

    Parameters:
    - data: Protein sequences data
    - archivoEntrada: Input file path for reading protein data
    - archNom: File path for substitution or removal information
    - similarity_matrix: Matrix representing the similarity between protein sequences
    - cluster: Cluster labels assigned to each protein
    - method: Clustering method used (e.g., Agglomerative, Spectral, DBScan)
    - sal: Extension to be added to the output file name

    This function processes the clustered protein data and related information, such as the centroid of each cluster,
    and saves the results into CSV files. The CSV files include details about discarded data and cluster information.

    
    """
    filtered_clusters = []
    discarded_data = []
    discarded_data2=[]
    dato=remplazar_sequence_for_ID(data,archivoEntrada,archNom)
    # Convert similarity matrix to list for easier indexing
    similarity_matrix2=similarity_matrix.values.tolist()
    # Create a dictionary to store protein indices for each cluster
    clusters={}
    for k in range(0,len(cluster)):
        if cluster[k] in clusters:
           clusters[cluster[k]].append(k)
        else:
           clusters[cluster[k]]=[k] 
    print(clusters)
    # Process each cluster to find the centroid       
    for cluster_id, cluster in clusters.items():
        filtered_cluster = []
        min_avg_distance = float('inf')
        central_point_index = None
        print(cluster)
        # Calculate the average distance for each point in the cluster
        for point_index in cluster:
            total_distance = 0
            for other_index in cluster:
                total_distance += similarity_matrix2[point_index][other_index]
            avg_distance = total_distance / len(cluster)
            # Update centroid if the average distance is smaller
            if avg_distance < min_avg_distance:
                min_avg_distance = avg_distance
                central_point_index = point_index

        # Verificar si el punto central supera el umbral
        similarity_percentage = 1 - (min_avg_distance / eps)
        # Append the central point index to the filtered cluster
        filtered_cluster.append(central_point_index)
        print(max(cluster))
        print(len(datacp))
        print(len(data))
        print(len(dato))
        discarded_data.extend([[datacp[i], cluster_id,data[central_point_index] , dato[i]]for i in cluster])
        #discarded_data2.extend([[dato[i],datacp[i]]  for i in cluster if i != central_point_index] )
        if filtered_cluster:
            filtered_clusters.append(filtered_cluster)

    data = remplazar_sequence_for_ID(data,archivoEntrada,archNom)

    # Imprimir los resultados
    #for cluster_id, cluster in enumerate(filtered_clusters):
    #    cluster_data = [data[i] for i in cluster]
    #    print(f'Cluster {cluster_id}: {", ".join(cluster_data)}')
        
    #discarded_data = remplazar_sequence_for_ID(discarded_data)
    # Save the data into a CSV file
    if discarded_data:
        df = pd.DataFrame(discarded_data, columns=['protein_sequence','cluster_id','centroid','protein_id'])
        #df2 = pd.DataFrame( discarded_data2, columns=['ProteinasDescartadas','secuencia'])
        df.to_csv('resultados/proteinasCluster'+method +sal+'.csv', index=False) 


def descarte(data, threshold,archivoEntrada,ValEnt,sal,archNom,n_clust,min_samp):
    """
    Cluster proteins based on a similarity matrix using hierarchical clustering, spectral clustering,
    and DBSCAN algorithms.

    Parameters:
    - data: Protein sequences data
    - threshold: Similarity threshold for clustering
    - archivoEntrada: Input file path for reading protein data
    - ValEnt: File path for the similarity matrix
    - sal: Placeholder variable (unused in the function)
    - archNom: File path for substitution or removal information
    - n_clust: Number of clusters for Agglomerative and Spectral clustering
    - min_samp: Minimum number of samples for DBSCAN clustering

    The function reads protein sequences from the input file and performs clustering using
    hierarchical clustering (Agglomerative), spectral clustering (Spectral), and DBSCAN algorithms.
    The results are saved in separate cluster files.

    Note: The 'sal' parameter is not used in the function and can be ignored.
    """
    # Datos de ejemplo
    data = data.tolist()
    #print(str(blast_similarity(data[0],data[1]))+"  similarity of equals")
    # Crear matriz de similitud
    #num_points = len(data)
    similarity_matrix=pd.read_csv(ValEnt,header=None,index_col=False)-1
    similarity_matrix=similarity_matrix.abs()
    #sim_matrix=[item[0] for item in similarity_matrix]
    #k=0
    #for i in range(num_points):
    #   for j in range(i,num_points):
    #       similarity_matrix[i][j]=sim_matrix[k]
    #       k=k+1
    #for i in range(num_points):
    #   for j in range(i):
    #       similarity_matrix[i][j]=similarity_matrix[j][i]       
    print("simmatrix")
    # algorithm DBSCAN parameters
    eps = threshold  # Umbral de similitud
    min_samples = min_samp  # Número mínimo de muestras para formar un cluster
    # Create a copy of the original data
    datacp=data.copy()
    # Ejecutar el algoritmo DBSCAN
    global dat
    dat=data
    #datx=np.arange(len(data)).reshape(-1, 1)
    #sim
    # Perform Agglomerative clustering
    aglom_instance=AgglomerativeClustering(n_clusters=n_clust, affinity='precomputed', linkage = 'average').fit(similarity_matrix.to_numpy())
    print(aglom_instance.labels_)
    cluster= aglom_instance.labels_
    
    # Plot dendrogram for Agglomerative clustering
    plot_dendrogram(aglom_instance, labels=aglom_instance.labels_)
    plt.show()
    # Save Agglomerative clustering results
    save_clusters(data,archivoEntrada,archNom,similarity_matrix,cluster,"Agglomerative",sal)
    # Perform Spectral clustering
    spectre=SpectralClustering(n_clusters=n_clust,affinity='precomputed_nearest_neighbors').fit(similarity_matrix.to_numpy())
    cluster= spectre.labels_
    print(spectre.labels_)
    # Save Spectral clustering results
    save_clusters(data,archivoEntrada,archNom,similarity_matrix,cluster,"Spectral",sal)
    # Perform DBSCAN clustering
    dbscan_instance = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed',algorithm='brute').fit(similarity_matrix.to_numpy())
    cluster= dbscan_instance.labels_
    print(str(len(cluster))+ " " +str(len(similarity_matrix.values.tolist())))
    print("outside the cluster madness")
    # Save DBSCAN clustering results
    save_clusters(data,archivoEntrada,archNom,similarity_matrix,cluster,"DBScan",sal)
        #df2.to_csv('resultados/proteinasClusterDBScan.csv', index=False)
    

def remplazar_sequence_for_ID(output,ids,archNom):
    """
    Replace protein sequences with protein IDs from a pre-existing DataFrame.

    Parameters:
    - output: List of protein sequences
    - ids: file that contains the equivalence between each sequence and their respective id
    
    Returns:
    - List of protein IDs
    """
    df_b = pd.read_excel(ids)
    df_b=substitute_or_remove_prot_id(df_b,archNom,"r")
    #df_b= substitute_or_remove_prot_id(df_b,"s")
    proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)

    for i in range(len(output)):
        protein_sequence = output[i]
        if protein_sequence in proteinas_dict:
            output[i] = proteinas_dict[protein_sequence]

    return output

def remplazar_ID_for_sequence(output,ids,archNom):
     """
    Replace protein IDs with protein sequences using a pre-existing DataFrame.

    Parameters:
    - output: List of protein IDs
    
    Returns:
    - List of protein sequences
    """
    global df_b
    if(df_b is None):
     
       df_b = pd.read_excel(ids)
       df_b=substitute_or_remove_prot_id(df_b,archNom,"r")
    #df_b= substitute_or_remove_prot_id(df_b,"s")
    proteinas_dict = dict(df_b[['protein_id','protein_sequence']].values)

    for i in range(len(output)):
        protein_sequence = output[i]
        if protein_sequence in proteinas_dict:
            output[i] = proteinas_dict[protein_sequence]

    return output
    
def ejecutar(archivoEntrada, enfermedad, similitud,ValEnt,sal,archNom,n_clusters,min_sample):
    """
    Execute protein clustering based on similarity matrix using hierarchical clustering, spectral clustering,
    and DBSCAN algorithms.

    Parameters:
    - archivoEntrada: Input Excel file path containing protein data
    - enfermedad: Disease ID for filtering protein data (leave empty for no filtering)
    - similitud: Similarity threshold for clustering
    - ValEnt: File path for the similarity matrix
    - sal: Extension to be added to the cluster Output File made by every algorithm
    - archNom: File path for substitution or removal information
    - n_clusters: Number of clusters to be formed by hierarchical clustering and spectral clustering
    - min_sample: Minimum number of samples required to form a cluster in DBSCAN

    This function reads protein data from an Excel file, filters it based on disease ID if provided, and then performs
    clustering using hierarchical clustering, spectral clustering, and DBSCAN algorithms. The resulting clusters and
    discarded data are saved into CSV files.

    """
    data = readData(archivoEntrada, enfermedad)
    descarte(data, similitud,archivoEntrada,ValEnt,sal,archNomn_clusters,min_sample)
if __name__=='__main__':
   ejecutar("data_nervous_genes_xf.xlsx","C0002395",0.0001¨,'resultados/matrizNeedleWunchFS_70.csv',"NW70","nombres_sust.txt",100,1)