import re

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import DBSCAN, AgglomerativeClustering, SpectralClustering

globi = 0     # counter used to number the CSV files of discarded proteins
df_b = None   # cached copy of the reference spreadsheet (see remplazar_ID_for_sequence)


def plot_dendrogram(model, **kwargs):
    # Children of the hierarchical clustering.
    children = model.children_

    # Distances between each pair of children: the model was not fitted with
    # compute_distances, so a uniform spacing is used just for plotting.
    distance = np.arange(children.shape[0])

    # Number of observations contained at each merge level.
    no_of_observations = np.arange(2, children.shape[0] + 2)

    # Build the linkage matrix and plot the corresponding dendrogram.
    linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float)
    dendrogram(linkage_matrix, **kwargs)


def substitute_or_remove_prot_id(data, sub_rem):
    # nombres_sust.txt is expected to be a whitespace-separated table whose header names
    # the ID columns (one of them containing the word "Primary") and whose rows hold the
    # equivalent protein IDs, one column per naming scheme.
    print("inside the problem")
    with open("nombres_sust.txt") as prottosubs:
        index = prottosubs.readline()
        acept = index.split()
        listtosubs = {}
        for i in range(0, len(acept)):
            listtosubs[acept[i]] = []
        while line := prottosubs.readline():
            newline = line.split()
            for i in range(0, len(newline)):
                listtosubs[list(listtosubs.keys())[i]].append(newline[i].strip())

    # Work out which column holds the primary IDs.
    resub = 1
    if re.search("Primary", list(listtosubs.keys())[0]):
        resub = 0
    print((resub + 1) % 2)

    if sub_rem == "s":
        # Substitute secondary IDs by their primary equivalents.
        data["protein_id"] = data["protein_id"].replace(
            list(listtosubs.values())[(resub + 1) % 2], list(listtosubs.values())[resub])
    else:
        # Remove rows whose ID appears in the secondary column and log them to a CSV.
        global globi
        datas = data[data["protein_id"].isin(list(listtosubs.values())[(resub + 1) % 2])]
        data = data[~data["protein_id"].isin(list(listtosubs.values())[(resub + 1) % 2])]
        datas.to_csv('resultados/proteinasDescartadas_' + str(globi) + '.csv', index=False)
        globi = globi + 1
    return data


def readData(archivoEntrada, enfermedad):
    data = pd.read_excel(archivoEntrada)
    if enfermedad != '':
        total = len(data)
        data = data.loc[data["disease_id"] == enfermedad]
        print("proteins discarded by the disease filter: " + str(total - len(data)))
    sequences = data["protein_sequence"]
    return sequences


def sim(pattern1, pattern2, j):
    # Placeholder similarity function: simply returns the precomputed value j.
    return j
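
# --- Optional sketch (assumption, not part of the original pipeline) -------------------
# descarte() below reads a precomputed matrix from 'resultados/matrizNeedleWunchFS_70.csv'
# but this script never builds it. construir_matriz_needleman is a hypothetical helper
# showing how such a matrix *could* be generated with minineedle, assuming the CSV stores
# pairwise Needleman-Wunsch identity scores normalised to [0, 1], with no header. The real
# file may have been produced by a different tool or normalisation.
def construir_matriz_needleman(seqs, ruta_salida='resultados/matrizNeedleWunchFS_70.csv'):
    from minineedle import needle  # imported here because only this sketch needs it

    n = len(seqs)
    matriz = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            # Global alignment of the two sequences; identity is a percentage in [0, 100].
            aln = needle.NeedlemanWunsch(seqs[i], seqs[j])
            aln.align()
            score = aln.get_identity() / 100.0
            matriz[i][j] = score
            matriz[j][i] = score
    pd.DataFrame(matriz).to_csv(ruta_salida, header=False, index=False)
    return matriz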
def _exportar_medoides(labels, data, datacp, dato, similarity_matrix2, eps, ruta_salida):
    # Shared export step for the three clusterings below: group the matrix row indices by
    # cluster label, pick the medoid of each cluster and dump every member to a CSV.
    clusters = {}
    for k in range(0, len(labels)):
        clusters.setdefault(labels[k], []).append(k)
    print(clusters)

    filtered_clusters = []
    discarded_data = []
    for cluster_id, cluster in clusters.items():
        # The medoid is the member with the lowest average distance to the rest.
        min_avg_distance = float('inf')
        central_point_index = None
        for point_index in cluster:
            total_distance = 0
            for other_index in cluster:
                total_distance += similarity_matrix2[point_index][other_index]
            avg_distance = total_distance / len(cluster)
            if avg_distance < min_avg_distance:
                min_avg_distance = avg_distance
                central_point_index = point_index

        # Similarity of the medoid with respect to the threshold; computed but, as in the
        # original code, never used to discard a cluster.
        similarity_percentage = 1 - (min_avg_distance / eps)
        filtered_clusters.append([central_point_index])

        # Log every member of the cluster together with its medoid.
        discarded_data.extend(
            [[datacp[i], cluster_id, data[central_point_index], dato[i]] for i in cluster])

    if discarded_data:
        df = pd.DataFrame(discarded_data,
                          columns=['protein_sequence', 'cluster_id', 'centroid', 'protein_id'])
        df.to_csv(ruta_salida, index=False)
    return filtered_clusters


def descarte(data, threshold):
    data = data.tolist()

    # Precomputed Needleman-Wunsch similarities, turned into distances in [0, 1].
    similarity_matrix = pd.read_csv('resultados/matrizNeedleWunchFS_70.csv',
                                    header=None, index_col=False) - 1
    similarity_matrix = similarity_matrix.abs()
    print("simmatrix")

    # DBSCAN parameters: eps is the similarity threshold, min_samples the minimum
    # number of samples needed to form a cluster.
    eps = threshold
    min_samples = 1

    datacp = data.copy()   # keeps the original sequences
    global dat
    dat = data

    # Replaces the sequences in `data` with their protein IDs (in place), so from here on
    # `data`/`dato` hold protein IDs while `datacp` still holds the sequences.
    dato = remplazar_sequence_for_ID(data)
    similarity_matrix2 = similarity_matrix.values.tolist()

    # Agglomerative clustering on the precomputed distance matrix.
    # Note: on scikit-learn >= 1.2 the keyword is `metric` instead of `affinity`.
    aglom_instance = AgglomerativeClustering(n_clusters=100, affinity='precomputed',
                                             linkage='average').fit(similarity_matrix.to_numpy())
    print(aglom_instance.labels_)
    plot_dendrogram(aglom_instance, labels=aglom_instance.labels_)
    plt.show()
    _exportar_medoides(aglom_instance.labels_, data, datacp, dato, similarity_matrix2, eps,
                       'resultados/proteinasClusterAglomerativeNW70.csv')

    # Spectral clustering on the same matrix.
    spectre = SpectralClustering(n_clusters=100,
                                 affinity='precomputed_nearest_neighbors').fit(similarity_matrix.to_numpy())
    print(spectre.labels_)
    _exportar_medoides(spectre.labels_, data, datacp, dato, similarity_matrix2, eps,
                       'resultados/proteinasClusterSpectralNW70.csv')

    # DBSCAN on the precomputed distance matrix.
    dbscan_instance = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed',
                             algorithm='brute').fit(similarity_matrix.to_numpy())
    print(str(len(dbscan_instance.labels_)) + " " + str(len(similarity_matrix2)))
    _exportar_medoides(dbscan_instance.labels_, data, datacp, dato, similarity_matrix2, eps,
                       'resultados/proteinasClusterDBScanNW70.csv')
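
# --- Optional sketch (assumption, not from the original code) --------------------------
# _demo_medoide is a hypothetical, self-contained illustration of the medoid rule used in
# _exportar_medoides: within a cluster, the representative is the member with the smallest
# average distance to the other members.
def _demo_medoide():
    toy_dist = np.array([[0.0, 0.2, 0.9],
                         [0.2, 0.0, 0.8],
                         [0.9, 0.8, 0.0]])
    miembros = [0, 1, 2]
    promedios = toy_dist[np.ix_(miembros, miembros)].mean(axis=1)
    # Index 1 has the lowest average distance (~0.33), so it would be the medoid.
    return miembros[int(np.argmin(promedios))]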
def remplazar_sequence_for_ID(output):
    # Replaces, in place, every known protein sequence in `output` by its protein ID,
    # using the reference spreadsheet as a lookup table.
    df_b = pd.read_excel("data_nervous_genes_xf.xlsx")
    df_b = substitute_or_remove_prot_id(df_b, "r")
    proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
    for i in range(len(output)):
        protein_sequence = output[i]
        if protein_sequence in proteinas_dict:
            output[i] = proteinas_dict[protein_sequence]
    return output


def remplazar_ID_for_sequence(output):
    # Inverse mapping: replaces protein IDs by their sequences. The spreadsheet is read
    # once and cached in the module-level df_b.
    global df_b
    if df_b is None:
        df_b = pd.read_excel("data_nervous_genes_xf.xlsx")
        df_b = substitute_or_remove_prot_id(df_b, "r")
    proteinas_dict = dict(df_b[['protein_id', 'protein_sequence']].values)
    for i in range(len(output)):
        protein_id = output[i]
        if protein_id in proteinas_dict:
            output[i] = proteinas_dict[protein_id]
    return output


def ejecutar(archivoEntrada, enfermedad, similitud):
    data = readData(archivoEntrada, enfermedad)
    descarte(data, similitud)


if __name__ == '__main__':
    ejecutar("data_nervous_genes_xf.xlsx", "C0002395", 0.0001)
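
# --- Optional sketch (assumption, not from the original code) --------------------------
# The original module imported multiprocessing, which suggests the all-vs-all alignment
# was (or was meant to be) parallelised. construir_matriz_needleman_paralelo is a
# hypothetical row-per-task variant of construir_matriz_needleman above, under the same
# minineedle scoring assumption.
def _fila_needleman(args):
    i, seqs = args
    from minineedle import needle  # only needed for this sketch
    fila = []
    for s in seqs:
        aln = needle.NeedlemanWunsch(seqs[i], s)
        aln.align()
        fila.append(aln.get_identity() / 100.0)
    return fila


def construir_matriz_needleman_paralelo(seqs, procesos=4):
    import multiprocessing as mp
    # Each worker computes one full row of the score matrix.
    with mp.Pool(procesos) as pool:
        filas = pool.map(_fila_needleman, [(i, seqs) for i in range(len(seqs))])
    return filas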