import pandas as pd import Levenshtein import time from sklearn.cluster import OPTICS,DBSCAN,AgglomerativeClustering,BisectingKMeans,SpectralClustering from sklearn.preprocessing import StandardScaler import numpy as np from matplotlib import pyplot as plt from scipy.spatial.distance import pdist, squareform from pyclustering.cluster.dbscan import dbscan from pyclustering.utils import timedcall from Levenshtein import distance import re from minineedle import needle, smith, core from Bio.Blast.Applications import NcbiblastpCommandline from io import StringIO from Bio.Blast import NCBIXML from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord from Bio import SeqIO import swalign from scipy.cluster.hierarchy import dendrogram import multiprocessing as mp globi=0 df_b=None def plot_dendrogram(model, **kwargs): """ Plot dendrogram for hierarchical clustering model. Parameters: - model: Hierarchical clustering model - **kwargs: Additional keyword arguments for dendrogram """ # Children of hierarchical clustering children = model.children_ # Distances between each pair of children # Since we don't have this information, we can use a uniform one for plotting distance = np.arange(children.shape[0]) # The number of observations contained in each cluster level no_of_observations = np.arange(2, children.shape[0]+2) # Create linkage matrix and then plot the dendrogram linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float) # Plot the corresponding dendrogram dendrogram(linkage_matrix, **kwargs) def substitute_or_remove_prot_id(data,archNom,sub_rem): """ Substitute or remove protein IDs in the given DataFrame based on a substitution file. Parameters: - data: DataFrame containing protein data - archNom: File path for the IDs to be substituted to its primary entry - sub_rem: String indicating the operation type ('s' for substitution, 'r' for removal) This function reads a file containing information for protein ID substitution or removal and performs the corresponding operation on the provided DataFrame. The substitution or removal is based on the content of the file specified by the 'archNom' parameter. If 'sub_rem' is set to 's' (substitution), it replaces protein IDs in the DataFrame with the specified IDs from the substitution file. If 'sub_rem' is set to 'r' (removal), it removes rows from the DataFrame where the protein ID matches the specified other protein IDs or its sequences matches the other protein sequence in the dataframe once substituted. The removed rows are saved to a CSV file. Note: The function modifies the input DataFrame and returns the modified DataFrame. Example: >>> data = substitute_or_remove_prot_id(data, "substitution_file.txt", "s") >>> data = substitute_or_remove_prot_id(data, "removal_file.txt", "r") """ print("inside the problem") with open(archNom) as prottosubs: index=prottosubs.readline() acept=index.split() listtosubs={} for i in range(0,len(acept)): listtosubs[acept[i]]=[] while line := prottosubs.readline(): newline=line.split() #print(len(newline)) for i in range(0,len(newline)): listtosubs[list(listtosubs.keys())[i]].append(newline[i].strip()) resub=1 if re.search("Primary",list(listtosubs.keys())[0]): resub=0 print((resub+1)%2) #print(data) #data2=data.copy() if(sub_rem == "s"): data["protein_id"].replace(list(listtosubs.values())[(resub+1)%2], list(listtosubs.values())[resub]) #datacp=data.copy() #print(pd.concat([data2,datacp]).drop_duplicates()) else: global globi datas= data[data["protein_id"].isin(list(listtosubs.values())[(resub+1)%2])==True] data = data[data["protein_id"].isin(list(listtosubs.values())[(resub+1)%2])==False] datas.to_csv('resultados/proteinasDescartadas_'+ str(globi) +'.csv', index=False) globi=globi+1 #data.to_excel('data_nervous_genes_xf.xlsx') return data def readData(archivoEntrada, enfermedad): """ Read data from an Excel file and filter based on disease_id. Parameters: - archivoEntrada: Input Excel file path - enfermedad: Disease ID for filtering Returns: - Protein sequences data """ data = pd.read_excel(archivoEntrada) if (enfermedad != ''): data = data.loc[data["disease_id"] == enfermedad] filt_data=len(data) print("proteinas descartadas post filtro, principal: " + str(filt_data-len(data))) sequences = data["protein_sequence"] return sequences def save_clusters(data,archivoEntrada,archNom,similarity_matrix,cluster,method,sal): """ Save clustered protein data and related information into CSV files. Parameters: - data: Protein sequences data - archivoEntrada: Input file path for reading protein data - archNom: File path for substitution or removal information - similarity_matrix: Matrix representing the similarity between protein sequences - cluster: Cluster labels assigned to each protein - method: Clustering method used (e.g., Agglomerative, Spectral, DBScan) - sal: Extension to be added to the output file name This function processes the clustered protein data and related information, such as the centroid of each cluster, and saves the results into CSV files. The CSV files include details about discarded data and cluster information. """ filtered_clusters = [] discarded_data = [] discarded_data2=[] dato=remplazar_sequence_for_ID(data,archivoEntrada,archNom) # Convert similarity matrix to list for easier indexing similarity_matrix2=similarity_matrix.values.tolist() # Create a dictionary to store protein indices for each cluster clusters={} for k in range(0,len(cluster)): if cluster[k] in clusters: clusters[cluster[k]].append(k) else: clusters[cluster[k]]=[k] print(clusters) # Process each cluster to find the centroid for cluster_id, cluster in clusters.items(): filtered_cluster = [] min_avg_distance = float('inf') central_point_index = None print(cluster) # Calculate the average distance for each point in the cluster for point_index in cluster: total_distance = 0 for other_index in cluster: total_distance += similarity_matrix2[point_index][other_index] avg_distance = total_distance / len(cluster) # Update centroid if the average distance is smaller if avg_distance < min_avg_distance: min_avg_distance = avg_distance central_point_index = point_index # Verificar si el punto central supera el umbral similarity_percentage = 1 - (min_avg_distance / eps) # Append the central point index to the filtered cluster filtered_cluster.append(central_point_index) print(max(cluster)) print(len(datacp)) print(len(data)) print(len(dato)) discarded_data.extend([[datacp[i], cluster_id,data[central_point_index] , dato[i]]for i in cluster]) #discarded_data2.extend([[dato[i],datacp[i]] for i in cluster if i != central_point_index] ) if filtered_cluster: filtered_clusters.append(filtered_cluster) data = remplazar_sequence_for_ID(data,archivoEntrada,archNom) # Imprimir los resultados #for cluster_id, cluster in enumerate(filtered_clusters): # cluster_data = [data[i] for i in cluster] # print(f'Cluster {cluster_id}: {", ".join(cluster_data)}') #discarded_data = remplazar_sequence_for_ID(discarded_data) # Save the data into a CSV file if discarded_data: df = pd.DataFrame(discarded_data, columns=['protein_sequence','cluster_id','centroid','protein_id']) #df2 = pd.DataFrame( discarded_data2, columns=['ProteinasDescartadas','secuencia']) df.to_csv('resultados/proteinasCluster'+method +sal+'.csv', index=False) def descarte(data, threshold,archivoEntrada,ValEnt,sal,archNom,n_clust,min_samp): """ Cluster proteins based on a similarity matrix using hierarchical clustering, spectral clustering, and DBSCAN algorithms. Parameters: - data: Protein sequences data - threshold: Similarity threshold for clustering - archivoEntrada: Input file path for reading protein data - ValEnt: File path for the similarity matrix - sal: Placeholder variable (unused in the function) - archNom: File path for substitution or removal information - n_clust: Number of clusters for Agglomerative and Spectral clustering - min_samp: Minimum number of samples for DBSCAN clustering The function reads protein sequences from the input file and performs clustering using hierarchical clustering (Agglomerative), spectral clustering (Spectral), and DBSCAN algorithms. The results are saved in separate cluster files. Note: The 'sal' parameter is not used in the function and can be ignored. """ # Datos de ejemplo data = data.tolist() #print(str(blast_similarity(data[0],data[1]))+" similarity of equals") # Crear matriz de similitud #num_points = len(data) similarity_matrix=pd.read_csv(ValEnt,header=None,index_col=False)-1 similarity_matrix=similarity_matrix.abs() #sim_matrix=[item[0] for item in similarity_matrix] #k=0 #for i in range(num_points): # for j in range(i,num_points): # similarity_matrix[i][j]=sim_matrix[k] # k=k+1 #for i in range(num_points): # for j in range(i): # similarity_matrix[i][j]=similarity_matrix[j][i] print("simmatrix") # algorithm DBSCAN parameters eps = threshold # Umbral de similitud min_samples = min_samp # Número mínimo de muestras para formar un cluster # Create a copy of the original data datacp=data.copy() # Ejecutar el algoritmo DBSCAN global dat dat=data #datx=np.arange(len(data)).reshape(-1, 1) #sim # Perform Agglomerative clustering aglom_instance=AgglomerativeClustering(n_clusters=n_clust, affinity='precomputed', linkage = 'average').fit(similarity_matrix.to_numpy()) print(aglom_instance.labels_) cluster= aglom_instance.labels_ # Plot dendrogram for Agglomerative clustering plot_dendrogram(aglom_instance, labels=aglom_instance.labels_) plt.show() # Save Agglomerative clustering results save_clusters(data,archivoEntrada,archNom,similarity_matrix,cluster,"Agglomerative",sal) # Perform Spectral clustering spectre=SpectralClustering(n_clusters=n_clust,affinity='precomputed_nearest_neighbors').fit(similarity_matrix.to_numpy()) cluster= spectre.labels_ print(spectre.labels_) # Save Spectral clustering results save_clusters(data,archivoEntrada,archNom,similarity_matrix,cluster,"Spectral",sal) # Perform DBSCAN clustering dbscan_instance = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed',algorithm='brute').fit(similarity_matrix.to_numpy()) cluster= dbscan_instance.labels_ print(str(len(cluster))+ " " +str(len(similarity_matrix.values.tolist()))) print("outside the cluster madness") # Save DBSCAN clustering results save_clusters(data,archivoEntrada,archNom,similarity_matrix,cluster,"DBScan",sal) #df2.to_csv('resultados/proteinasClusterDBScan.csv', index=False) def remplazar_sequence_for_ID(output,ids,archNom): """ Replace protein sequences with protein IDs from a pre-existing DataFrame. Parameters: - output: List of protein sequences - ids: file that contains the equivalence between each sequence and their respective id Returns: - List of protein IDs """ df_b = pd.read_excel(ids) df_b=substitute_or_remove_prot_id(df_b,archNom,"r") #df_b= substitute_or_remove_prot_id(df_b,"s") proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values) for i in range(len(output)): protein_sequence = output[i] if protein_sequence in proteinas_dict: output[i] = proteinas_dict[protein_sequence] return output def remplazar_ID_for_sequence(output,ids,archNom): """ Replace protein IDs with protein sequences using a pre-existing DataFrame. Parameters: - output: List of protein IDs Returns: - List of protein sequences """ global df_b if(df_b is None): df_b = pd.read_excel(ids) df_b=substitute_or_remove_prot_id(df_b,archNom,"r") #df_b= substitute_or_remove_prot_id(df_b,"s") proteinas_dict = dict(df_b[['protein_id','protein_sequence']].values) for i in range(len(output)): protein_sequence = output[i] if protein_sequence in proteinas_dict: output[i] = proteinas_dict[protein_sequence] return output def ejecutar(archivoEntrada, enfermedad, similitud,ValEnt,sal,archNom,n_clusters,min_sample): """ Execute protein clustering based on similarity matrix using hierarchical clustering, spectral clustering, and DBSCAN algorithms. Parameters: - archivoEntrada: Input Excel file path containing protein data - enfermedad: Disease ID for filtering protein data (leave empty for no filtering) - similitud: Similarity threshold for clustering - ValEnt: File path for the similarity matrix - sal: Extension to be added to the cluster Output File made by every algorithm - archNom: File path for substitution or removal information - n_clusters: Number of clusters to be formed by hierarchical clustering and spectral clustering - min_sample: Minimum number of samples required to form a cluster in DBSCAN This function reads protein data from an Excel file, filters it based on disease ID if provided, and then performs clustering using hierarchical clustering, spectral clustering, and DBSCAN algorithms. The resulting clusters and discarded data are saved into CSV files. """ data = readData(archivoEntrada, enfermedad) descarte(data, similitud,archivoEntrada,ValEnt,sal,archNomn_clusters,min_sample) if __name__=='__main__': ejecutar("data_nervous_genes_xf.xlsx","C0002395",0.0001¨,'resultados/matrizNeedleWunchFS_70.csv',"NW70","nombres_sust.txt",100,1)