clustering.py 14.3 KB
Newer Older
Rafael Artinano's avatar
Rafael Artinano committed
1 2 3 4 5 6
import pandas as pd
import Levenshtein
import time
from sklearn.cluster import OPTICS,DBSCAN,AgglomerativeClustering,BisectingKMeans,SpectralClustering
from sklearn.preprocessing import StandardScaler
import numpy as np
7 8
from matplotlib import pyplot as plt

Rafael Artinano's avatar
Rafael Artinano committed
9 10 11 12 13 14 15 16 17 18 19 20 21
from scipy.spatial.distance import pdist, squareform
from pyclustering.cluster.dbscan import dbscan
from pyclustering.utils import timedcall
from Levenshtein import distance
import re
from minineedle import needle, smith, core
from Bio.Blast.Applications import NcbiblastpCommandline
from io import StringIO
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import swalign
22
from scipy.cluster.hierarchy import dendrogram
Rafael Artinano's avatar
Rafael Artinano committed
23 24 25 26
import multiprocessing as mp
globi=0
df_b=None
def plot_dendrogram(model, **kwargs):
27 28
    """
    Plot dendrogram for hierarchical clustering model.
Rafael Artinano's avatar
Rafael Artinano committed
29

30 31 32 33
    Parameters:
    - model: Hierarchical clustering model
    - **kwargs: Additional keyword arguments for dendrogram
    """
Rafael Artinano's avatar
Rafael Artinano committed
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
    # Children of hierarchical clustering
    children = model.children_

    # Distances between each pair of children
    # Since we don't have this information, we can use a uniform one for plotting
    distance = np.arange(children.shape[0])

    # The number of observations contained in each cluster level
    no_of_observations = np.arange(2, children.shape[0]+2)

    # Create linkage matrix and then plot the dendrogram
    linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)     
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
def substitute_or_remove_prot_id(data,archNom,sub_rem):
    """
    Substitute or remove protein IDs in the given DataFrame based on a substitution file.

    Parameters:
    - data: DataFrame containing protein data
    - archNom: File path for the IDs to be substituted to its primary entry 
    - sub_rem: String indicating the operation type ('s' for substitution, 'r' for removal)

    This function reads a file containing information for protein ID substitution or removal and performs the
    corresponding operation on the provided DataFrame. The substitution or removal is based on the content of the file
    specified by the 'archNom' parameter.

    If 'sub_rem' is set to 's' (substitution), it replaces protein IDs in the DataFrame with the specified IDs from
    the substitution file.

    If 'sub_rem' is set to 'r' (removal), it removes rows from the DataFrame where the protein ID matches the specified
    other protein IDs or its sequences matches the other protein sequence in the dataframe once substituted. The removed rows are saved to a CSV file.

    Note: The function modifies the input DataFrame and returns the modified DataFrame.

    Example:
    >>> data = substitute_or_remove_prot_id(data, "substitution_file.txt", "s")
    >>> data = substitute_or_remove_prot_id(data, "removal_file.txt", "r")
    """
Rafael Artinano's avatar
Rafael Artinano committed
74
    print("inside the problem")
75
    with open(archNom) as prottosubs:
Rafael Artinano's avatar
Rafael Artinano committed
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107
          index=prottosubs.readline()
          acept=index.split()
          listtosubs={}
          for i in range(0,len(acept)):
            listtosubs[acept[i]]=[]
          while line := prottosubs.readline():
              newline=line.split()
              #print(len(newline))
              for i in range(0,len(newline)):
                  
                  listtosubs[list(listtosubs.keys())[i]].append(newline[i].strip())  
    resub=1
    if re.search("Primary",list(listtosubs.keys())[0]):
           resub=0
    print((resub+1)%2)
    #print(data)
    #data2=data.copy()
    if(sub_rem == "s"):
       data["protein_id"].replace(list(listtosubs.values())[(resub+1)%2], list(listtosubs.values())[resub])
    #datacp=data.copy()
    #print(pd.concat([data2,datacp]).drop_duplicates())
    else: 
        global globi
        datas= data[data["protein_id"].isin(list(listtosubs.values())[(resub+1)%2])==True]
        data = data[data["protein_id"].isin(list(listtosubs.values())[(resub+1)%2])==False]
        
        datas.to_csv('resultados/proteinasDescartadas_'+ str(globi) +'.csv', index=False) 

        globi=globi+1 
        #data.to_excel('data_nervous_genes_xf.xlsx')
    return data                
def readData(archivoEntrada, enfermedad):
108 109 110 111 112 113 114 115 116 117
    """
    Read data from an Excel file and filter based on disease_id.

    Parameters:
    - archivoEntrada: Input Excel file path
    - enfermedad: Disease ID for filtering
    
    Returns:
    - Protein sequences data
    """
Rafael Artinano's avatar
Rafael Artinano committed
118 119 120 121 122
    data = pd.read_excel(archivoEntrada)
    if (enfermedad != ''):
        
        
        
123
        data = data.loc[data["disease_id"] == enfermedad]
Rafael Artinano's avatar
Rafael Artinano committed
124 125 126 127 128 129
        filt_data=len(data)
        print("proteinas descartadas post filtro, principal: " + str(filt_data-len(data)))      

    sequences = data["protein_sequence"]

    return sequences
130 131 132
def save_clusters(data,archivoEntrada,archNom,similarity_matrix,cluster,method,sal):
    """
    Save clustered protein data and related information into CSV files.
Rafael Artinano's avatar
Rafael Artinano committed
133

134 135 136 137 138 139 140 141 142 143 144 145 146 147
    Parameters:
    - data: Protein sequences data
    - archivoEntrada: Input file path for reading protein data
    - archNom: File path for substitution or removal information
    - similarity_matrix: Matrix representing the similarity between protein sequences
    - cluster: Cluster labels assigned to each protein
    - method: Clustering method used (e.g., Agglomerative, Spectral, DBScan)
    - sal: Extension to be added to the output file name

    This function processes the clustered protein data and related information, such as the centroid of each cluster,
    and saves the results into CSV files. The CSV files include details about discarded data and cluster information.

    
    """
148 149 150
    filtered_clusters = []
    discarded_data = []
    discarded_data2=[]
151 152
    dato=remplazar_sequence_for_ID(data,archivoEntrada,archNom)
    # Convert similarity matrix to list for easier indexing
153
    similarity_matrix2=similarity_matrix.values.tolist()
154
    # Create a dictionary to store protein indices for each cluster
155 156 157 158 159 160
    clusters={}
    for k in range(0,len(cluster)):
        if cluster[k] in clusters:
           clusters[cluster[k]].append(k)
        else:
           clusters[cluster[k]]=[k] 
161 162
    print(clusters)
    # Process each cluster to find the centroid       
163 164 165 166 167
    for cluster_id, cluster in clusters.items():
        filtered_cluster = []
        min_avg_distance = float('inf')
        central_point_index = None
        print(cluster)
168
        # Calculate the average distance for each point in the cluster
169 170 171 172 173
        for point_index in cluster:
            total_distance = 0
            for other_index in cluster:
                total_distance += similarity_matrix2[point_index][other_index]
            avg_distance = total_distance / len(cluster)
174
            # Update centroid if the average distance is smaller
175 176 177 178 179 180
            if avg_distance < min_avg_distance:
                min_avg_distance = avg_distance
                central_point_index = point_index

        # Verificar si el punto central supera el umbral
        similarity_percentage = 1 - (min_avg_distance / eps)
181
        # Append the central point index to the filtered cluster
182 183 184 185 186 187 188 189 190 191
        filtered_cluster.append(central_point_index)
        print(max(cluster))
        print(len(datacp))
        print(len(data))
        print(len(dato))
        discarded_data.extend([[datacp[i], cluster_id,data[central_point_index] , dato[i]]for i in cluster])
        #discarded_data2.extend([[dato[i],datacp[i]]  for i in cluster if i != central_point_index] )
        if filtered_cluster:
            filtered_clusters.append(filtered_cluster)

192
    data = remplazar_sequence_for_ID(data,archivoEntrada,archNom)
193 194 195 196 197 198 199

    # Imprimir los resultados
    #for cluster_id, cluster in enumerate(filtered_clusters):
    #    cluster_data = [data[i] for i in cluster]
    #    print(f'Cluster {cluster_id}: {", ".join(cluster_data)}')
        
    #discarded_data = remplazar_sequence_for_ID(discarded_data)
200
    # Save the data into a CSV file
201 202 203
    if discarded_data:
        df = pd.DataFrame(discarded_data, columns=['protein_sequence','cluster_id','centroid','protein_id'])
        #df2 = pd.DataFrame( discarded_data2, columns=['ProteinasDescartadas','secuencia'])
204
        df.to_csv('resultados/proteinasCluster'+method +sal+'.csv', index=False) 
205 206


207 208 209 210
def descarte(data, threshold,archivoEntrada,ValEnt,sal,archNom,n_clust,min_samp):
    """
    Cluster proteins based on a similarity matrix using hierarchical clustering, spectral clustering,
    and DBSCAN algorithms.
211

212 213 214 215 216 217 218 219 220
    Parameters:
    - data: Protein sequences data
    - threshold: Similarity threshold for clustering
    - archivoEntrada: Input file path for reading protein data
    - ValEnt: File path for the similarity matrix
    - sal: Placeholder variable (unused in the function)
    - archNom: File path for substitution or removal information
    - n_clust: Number of clusters for Agglomerative and Spectral clustering
    - min_samp: Minimum number of samples for DBSCAN clustering
221

222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271
    The function reads protein sequences from the input file and performs clustering using
    hierarchical clustering (Agglomerative), spectral clustering (Spectral), and DBSCAN algorithms.
    The results are saved in separate cluster files.

    Note: The 'sal' parameter is not used in the function and can be ignored.
    """
    # Datos de ejemplo
    data = data.tolist()
    #print(str(blast_similarity(data[0],data[1]))+"  similarity of equals")
    # Crear matriz de similitud
    #num_points = len(data)
    similarity_matrix=pd.read_csv(ValEnt,header=None,index_col=False)-1
    similarity_matrix=similarity_matrix.abs()
    #sim_matrix=[item[0] for item in similarity_matrix]
    #k=0
    #for i in range(num_points):
    #   for j in range(i,num_points):
    #       similarity_matrix[i][j]=sim_matrix[k]
    #       k=k+1
    #for i in range(num_points):
    #   for j in range(i):
    #       similarity_matrix[i][j]=similarity_matrix[j][i]       
    print("simmatrix")
    # algorithm DBSCAN parameters
    eps = threshold  # Umbral de similitud
    min_samples = min_samp  # Número mínimo de muestras para formar un cluster
    # Create a copy of the original data
    datacp=data.copy()
    # Ejecutar el algoritmo DBSCAN
    global dat
    dat=data
    #datx=np.arange(len(data)).reshape(-1, 1)
    #sim
    # Perform Agglomerative clustering
    aglom_instance=AgglomerativeClustering(n_clusters=n_clust, affinity='precomputed', linkage = 'average').fit(similarity_matrix.to_numpy())
    print(aglom_instance.labels_)
    cluster= aglom_instance.labels_
    
    # Plot dendrogram for Agglomerative clustering
    plot_dendrogram(aglom_instance, labels=aglom_instance.labels_)
    plt.show()
    # Save Agglomerative clustering results
    save_clusters(data,archivoEntrada,archNom,similarity_matrix,cluster,"Agglomerative",sal)
    # Perform Spectral clustering
    spectre=SpectralClustering(n_clusters=n_clust,affinity='precomputed_nearest_neighbors').fit(similarity_matrix.to_numpy())
    cluster= spectre.labels_
    print(spectre.labels_)
    # Save Spectral clustering results
    save_clusters(data,archivoEntrada,archNom,similarity_matrix,cluster,"Spectral",sal)
    # Perform DBSCAN clustering
Rafael Artinano's avatar
Rafael Artinano committed
272 273 274 275
    dbscan_instance = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed',algorithm='brute').fit(similarity_matrix.to_numpy())
    cluster= dbscan_instance.labels_
    print(str(len(cluster))+ " " +str(len(similarity_matrix.values.tolist())))
    print("outside the cluster madness")
276 277
    # Save DBSCAN clustering results
    save_clusters(data,archivoEntrada,archNom,similarity_matrix,cluster,"DBScan",sal)
278
        #df2.to_csv('resultados/proteinasClusterDBScan.csv', index=False)
Rafael Artinano's avatar
Rafael Artinano committed
279 280
    

281 282 283 284 285 286 287 288 289 290 291 292 293
def remplazar_sequence_for_ID(output,ids,archNom):
    """
    Replace protein sequences with protein IDs from a pre-existing DataFrame.

    Parameters:
    - output: List of protein sequences
    - ids: file that contains the equivalence between each sequence and their respective id
    
    Returns:
    - List of protein IDs
    """
    df_b = pd.read_excel(ids)
    df_b=substitute_or_remove_prot_id(df_b,archNom,"r")
Rafael Artinano's avatar
Rafael Artinano committed
294 295 296 297 298 299 300 301 302 303
    #df_b= substitute_or_remove_prot_id(df_b,"s")
    proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)

    for i in range(len(output)):
        protein_sequence = output[i]
        if protein_sequence in proteinas_dict:
            output[i] = proteinas_dict[protein_sequence]

    return output

304 305 306 307 308 309 310 311 312 313
def remplazar_ID_for_sequence(output,ids,archNom):
     """
    Replace protein IDs with protein sequences using a pre-existing DataFrame.

    Parameters:
    - output: List of protein IDs
    
    Returns:
    - List of protein sequences
    """
Rafael Artinano's avatar
Rafael Artinano committed
314 315 316
    global df_b
    if(df_b is None):
     
317 318
       df_b = pd.read_excel(ids)
       df_b=substitute_or_remove_prot_id(df_b,archNom,"r")
Rafael Artinano's avatar
Rafael Artinano committed
319 320 321 322 323 324 325 326 327 328
    #df_b= substitute_or_remove_prot_id(df_b,"s")
    proteinas_dict = dict(df_b[['protein_id','protein_sequence']].values)

    for i in range(len(output)):
        protein_sequence = output[i]
        if protein_sequence in proteinas_dict:
            output[i] = proteinas_dict[protein_sequence]

    return output
    
329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348
def ejecutar(archivoEntrada, enfermedad, similitud,ValEnt,sal,archNom,n_clusters,min_sample):
    """
    Execute protein clustering based on similarity matrix using hierarchical clustering, spectral clustering,
    and DBSCAN algorithms.

    Parameters:
    - archivoEntrada: Input Excel file path containing protein data
    - enfermedad: Disease ID for filtering protein data (leave empty for no filtering)
    - similitud: Similarity threshold for clustering
    - ValEnt: File path for the similarity matrix
    - sal: Extension to be added to the cluster Output File made by every algorithm
    - archNom: File path for substitution or removal information
    - n_clusters: Number of clusters to be formed by hierarchical clustering and spectral clustering
    - min_sample: Minimum number of samples required to form a cluster in DBSCAN

    This function reads protein data from an Excel file, filters it based on disease ID if provided, and then performs
    clustering using hierarchical clustering, spectral clustering, and DBSCAN algorithms. The resulting clusters and
    discarded data are saved into CSV files.

    """
Rafael Artinano's avatar
Rafael Artinano committed
349
    data = readData(archivoEntrada, enfermedad)
350
    descarte(data, similitud,archivoEntrada,ValEnt,sal,archNomn_clusters,min_sample)
351
if __name__=='__main__':
352
   ejecutar("data_nervous_genes_xf.xlsx","C0002395",0.0001¨,'resultados/matrizNeedleWunchFS_70.csv',"NW70","nombres_sust.txt",100,1)