Commit bad93c31 authored by Rafael Artinano

Fix issues in the distance_matrix and cluster generator scripts

parent d5cae6ac
@@ -4,6 +4,8 @@ import time
 from sklearn.cluster import OPTICS,DBSCAN,AgglomerativeClustering,BisectingKMeans,SpectralClustering
 from sklearn.preprocessing import StandardScaler
 import numpy as np
+from matplotlib import pyplot as plt
 from scipy.spatial.distance import pdist, squareform
 from pyclustering.cluster.dbscan import dbscan
 from pyclustering.utils import timedcall
@@ -17,6 +19,7 @@ from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
 from Bio import SeqIO
 import swalign
+from scipy.cluster.hierarchy import dendrogram
 import multiprocessing as mp
 globi=0
 df_b=None
@@ -73,7 +76,7 @@ def substitute_or_remove_prot_id(data,sub_rem):
     return data
 def readData(archivoEntrada, enfermedad):
     data = pd.read_excel(archivoEntrada)
-    data=substitute_or_remove_prot_id(data,"r")
+    #data=substitute_or_remove_prot_id(data,"r")
     #data.to_excel('data_nervous_genes_xf.xlsx')
     if (enfermedad != ''):
         #datar=substitute_or_remove_prot_id(data,"r")
@@ -111,7 +114,7 @@ def descarte(data, threshold):
     #print(str(blast_similarity(data[0],data[1]))+" similarity of equals")
     # Create the similarity matrix
     #num_points = len(data)
-    similarity_matrix=pd.read_csv('resultados/matrizSmithWater.csv',header=None,index_col=False)-1
+    similarity_matrix=pd.read_csv('resultados/matrizNeedleWunchFS_70.csv',header=None,index_col=False)-1
     similarity_matrix=similarity_matrix.abs()
     #sim_matrix=[item[0] for item in similarity_matrix]
    #k=0
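For context: subtracting 1 from the CSV and taking the absolute value converts a similarity matrix with entries in [0, 1] into the distance matrix that the precomputed-metric clustering calls below expect (d = |s - 1| = 1 - s). A minimal sketch of the idiom, assuming unit-normalized similarities:

```python
# Illustration only (not part of the commit): the -1 / .abs() idiom that
# turns a similarity matrix with values in [0, 1] into a distance matrix.
import numpy as np

similarity = np.array([[1.0, 0.8],
                       [0.8, 1.0]])
distance = np.abs(similarity - 1)  # d = 1 - s when s is in [0, 1]
print(distance)  # [[0.0, 0.2], [0.2, 0.0]] -- identical sequences are 0 apart
```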
@@ -132,12 +135,117 @@ def descarte(data, threshold):
     dat=data
     #datx=np.arange(len(data)).reshape(-1, 1)
     #sim
-    aglom_instance=AgglomerativeClustering(n_clusters=500, affinity='precomputed', linkage = 'average').fit(similarity_matrix.to_numpy())
+    aglom_instance=AgglomerativeClustering(n_clusters=100, affinity='precomputed', linkage = 'average').fit(similarity_matrix.to_numpy())
     print(aglom_instance.labels_)
-    plot_dendrogram(algom_instance, labels=aglom_instance.labels_)
-    plt.show()
-    spectre=SpectralClustering(n_clusters=100,affinity='precomputed_nearest_neighbors').fit(similarity_matrix.to_numpy())
+    cluster= aglom_instance.labels_
+    plot_dendrogram(aglom_instance, labels=aglom_instance.labels_)
+    plt.show()
+    filtered_clusters = []
+    discarded_data = []
+    discarded_data2=[]
+    dato=remplazar_sequence_for_ID(data)
+    similarity_matrix2=similarity_matrix.values.tolist()
+    clusters={}
+    for k in range(0,len(cluster)):
+        if cluster[k] in clusters:
+            clusters[cluster[k]].append(k)
+        else:
+            clusters[cluster[k]]=[k]
+    print(clusters)
+    for cluster_id, cluster in clusters.items():
+        filtered_cluster = []
+        min_avg_distance = float('inf')
+        central_point_index = None
+        print(cluster)
+        # Compute the average distance for each point in the cluster
+        for point_index in cluster:
+            total_distance = 0
+            for other_index in cluster:
+                total_distance += similarity_matrix2[point_index][other_index]
+            avg_distance = total_distance / len(cluster)
+            if avg_distance < min_avg_distance:
+                min_avg_distance = avg_distance
+                central_point_index = point_index
+        # Check whether the central point exceeds the threshold
+        similarity_percentage = 1 - (min_avg_distance / eps)
+        filtered_cluster.append(central_point_index)
+        print(max(cluster))
+        print(len(datacp))
+        print(len(data))
+        print(len(dato))
+        discarded_data.extend([[datacp[i], cluster_id, data[central_point_index], dato[i]] for i in cluster])
+        #discarded_data2.extend([[dato[i],datacp[i]] for i in cluster if i != central_point_index])
+        if filtered_cluster:
+            filtered_clusters.append(filtered_cluster)
+    data = remplazar_sequence_for_ID(data)
+    # Print the results
+    #for cluster_id, cluster in enumerate(filtered_clusters):
+    #    cluster_data = [data[i] for i in cluster]
+    #    print(f'Cluster {cluster_id}: {", ".join(cluster_data)}')
+    #discarded_data = remplazar_sequence_for_ID(discarded_data)
+    # Save the discarded data to a CSV file using pandas
+    if discarded_data:
+        df = pd.DataFrame(discarded_data, columns=['protein_sequence','cluster_id','centroid','protein_id'])
+        #df2 = pd.DataFrame(discarded_data2, columns=['ProteinasDescartadas','secuencia'])
+        df.to_csv('resultados/proteinasClusterAglomerativeNW70.csv', index=False)
+    spectre=SpectralClustering(n_clusters=100,affinity='precomputed_nearest_neighbors').fit(similarity_matrix.to_numpy())
+    cluster= spectre.labels_
     print(spectre.labels_)
+    filtered_clusters = []
+    discarded_data = []
+    discarded_data2=[]
+    dato=remplazar_sequence_for_ID(data)
+    similarity_matrix2=similarity_matrix.values.tolist()
+    clusters={}
+    for k in range(0,len(cluster)):
+        if cluster[k] in clusters:
+            clusters[cluster[k]].append(k)
+        else:
+            clusters[cluster[k]]=[k]
+    print(clusters)
+    for cluster_id, cluster in clusters.items():
+        filtered_cluster = []
+        min_avg_distance = float('inf')
+        central_point_index = None
+        print(cluster)
+        # Compute the average distance for each point in the cluster
+        for point_index in cluster:
+            total_distance = 0
+            for other_index in cluster:
+                total_distance += similarity_matrix2[point_index][other_index]
+            avg_distance = total_distance / len(cluster)
+            if avg_distance < min_avg_distance:
+                min_avg_distance = avg_distance
+                central_point_index = point_index
+        # Check whether the central point exceeds the threshold
+        similarity_percentage = 1 - (min_avg_distance / eps)
+        filtered_cluster.append(central_point_index)
+        print(max(cluster))
+        discarded_data.extend([[datacp[i], cluster_id, data[central_point_index], dato[i]] for i in cluster])
+        #discarded_data2.extend([[dato[i],datacp[i]] for i in cluster if i != central_point_index])
+        if filtered_cluster:
+            filtered_clusters.append(filtered_cluster)
+    data = remplazar_sequence_for_ID(data)
+    # Print the results
+    #for cluster_id, cluster in enumerate(filtered_clusters):
+    #    cluster_data = [data[i] for i in cluster]
+    #    print(f'Cluster {cluster_id}: {", ".join(cluster_data)}')
+    #discarded_data = remplazar_sequence_for_ID(discarded_data)
+    # Save the discarded data to a CSV file using pandas
+    if discarded_data:
+        df = pd.DataFrame(discarded_data, columns=['protein_sequence','cluster_id','centroid','protein_id'])
+        #df2 = pd.DataFrame(discarded_data2, columns=['ProteinasDescartadas','secuencia'])
+        df.to_csv('resultados/proteinasClusterSpectralNW70.csv', index=False)
     dbscan_instance = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed',algorithm='brute').fit(similarity_matrix.to_numpy())
     cluster= dbscan_instance.labels_
     print(str(len(cluster))+ " " +str(len(similarity_matrix.values.tolist())))
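The added code calls `plot_dendrogram(...)`, which is not defined anywhere in this diff; presumably it is the helper from the scikit-learn AgglomerativeClustering documentation example, sketched below for reference. Note that `model.distances_` is only populated when the estimator is built with `compute_distances=True` or a `distance_threshold`, which the `AgglomerativeClustering(n_clusters=100, ...)` call above does not set, so the plotting call would fail as written.

```python
# Sketch (assumed, not part of this diff): the scikit-learn docs helper that
# turns a fitted AgglomerativeClustering model into a scipy dendrogram.
import numpy as np
from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    # Count the samples under each merge node to build a linkage matrix.
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count
    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)
    dendrogram(linkage_matrix, **kwargs)
```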
@@ -146,7 +254,7 @@ def descarte(data, threshold):
     discarded_data = []
     discarded_data2=[]
     dato=remplazar_sequence_for_ID(data)
-    similarity_matrix=similarity_matrix.values.tolist()
+    similarity_matrix2=similarity_matrix.values.tolist()
     clusters={}
     for k in range(0,len(cluster)):
         if cluster[k] in clusters:
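For reference, `DBSCAN(metric='precomputed')` consumes the distance matrix directly, so `eps` is expressed in the same units as the matrix entries. A minimal, self-contained sketch with placeholder values (not part of the commit):

```python
# Sketch: DBSCAN over a precomputed distance matrix, mirroring the call in
# descarte(). The eps/min_samples values here are placeholders.
import numpy as np
from sklearn.cluster import DBSCAN

dist = np.array([[0.0, 0.1, 0.9],
                 [0.1, 0.0, 0.8],
                 [0.9, 0.8, 0.0]])
labels = DBSCAN(eps=0.2, min_samples=1, metric='precomputed').fit(dist).labels_
print(labels)  # points 0 and 1 share a cluster; point 2 is on its own
```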
@@ -163,7 +271,7 @@ def descarte(data, threshold):
         for point_index in cluster:
             total_distance = 0
             for other_index in cluster:
-                total_distance += similarity_matrix[point_index][other_index]
+                total_distance += similarity_matrix2[point_index][other_index]
             avg_distance = total_distance / len(cluster)
             if avg_distance < min_avg_distance:
                 min_avg_distance = avg_distance
@@ -174,7 +282,7 @@ def descarte(data, threshold):
         filtered_cluster.append(central_point_index)
-        #discarded_data.extend([[datacp[i], cluster_id,data[central_point_index] , dato[i]]for i in cluster])
+        discarded_data.extend([[datacp[i], cluster_id, data[central_point_index], dato[i]] for i in cluster])
         #discarded_data2.extend([[dato[i],datacp[i]] for i in cluster if i != central_point_index])
         if filtered_cluster:
             filtered_clusters.append(filtered_cluster)
@@ -188,11 +296,11 @@ def descarte(data, threshold):
     #discarded_data = remplazar_sequence_for_ID(discarded_data)
     # Save the discarded data to a CSV file using pandas
-    #if discarded_data:
-    #df = pd.DataFrame( [], columns=['protein_sequence','cluster_id','centroid','ProteinasDescartadas'])
+    if discarded_data:
+        df = pd.DataFrame(discarded_data, columns=['protein_sequence','cluster_id','centroid','protein_id'])
         #df2 = pd.DataFrame( discarded_data2, columns=['ProteinasDescartadas','secuencia'])
-    #df.to_csv('resultados/proteinasDescartadasSmith.csv', index=False)
-    #df2.to_csv('resultados/proteinasDescartadas2.csv', index=False)
+        df.to_csv('resultados/proteinasClusterDBScanNW70.csv', index=False)
+        #df2.to_csv('resultados/proteinasClusterDBScan.csv', index=False)
 def remplazar_sequence_for_ID(output):
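The per-cluster loop above is repeated verbatim for the agglomerative, spectral, and DBSCAN results: it selects each cluster's medoid, i.e. the member with the smallest mean distance to the rest of the cluster. A sketch of the same logic factored into a helper (hypothetical name, not in the commit):

```python
# Sketch: the repeated "central point" selection as a single helper.
def cluster_medoid(members, dist):
    """members: list of row indices; dist: full 2-D distance matrix (list of lists)."""
    best_index, best_avg = None, float('inf')
    for p in members:
        avg = sum(dist[p][q] for q in members) / len(members)
        if avg < best_avg:
            best_index, best_avg = p, avg
    return best_index, best_avg

# Usage with the clusters dict built in descarte():
# for cluster_id, members in clusters.items():
#     center, avg = cluster_medoid(members, similarity_matrix2)
```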
@@ -227,3 +335,5 @@ def remplazar_ID_for_sequence(output):
 def ejecutar(archivoEntrada, enfermedad, similitud):
     data = readData(archivoEntrada, enfermedad)
     descarte(data, similitud)
+if __name__=='__main__':
+    ejecutar("data_nervous_genes_xf.xlsx","C0002395",0.0001)
@@ -143,8 +143,8 @@ def calculate_matrix_blasto(data):
     datf.to_csv('resultados/matrizBlast.csv', index=False,header=False)
 def remplazar_sequence_for_ID(output):
-    df_b = pd.read_excel("data_nervous_genes_2.xlsx")
-    df_b= substitute_or_remove_prot_id(df_b,"s")
+    df_b = pd.read_excel("data_nervous_genes_xf.xlsx")
+    #df_b= substitute_or_remove_prot_id(df_b,"s")
     proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
     for i in range(len(output)):
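For reference, `remplazar_sequence_for_ID` maps each protein sequence back to its identifier through a dict built from the two spreadsheet columns. A toy sketch of that pattern (fabricated sample rows, not project data):

```python
# Sketch: the sequence -> protein_id mapping used by remplazar_sequence_for_ID,
# with an in-memory DataFrame standing in for data_nervous_genes_xf.xlsx.
import pandas as pd

df_b = pd.DataFrame({
    'protein_sequence': ['MKTA', 'MVLG'],
    'protein_id': ['P001', 'P002'],
})
proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
output = ['MKTA', 'MVLG']
output = [proteinas_dict.get(seq, seq) for seq in output]
print(output)  # ['P001', 'P002']
```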
@@ -255,45 +255,99 @@ def readData(archivoEntrada, enfermedad):
 if __name__=="__main__":
-    data=readData("data_nervous_genes_x.xlsx","C0002395")
-    calculate_matrix_needle(data)
-    calculate_matrix_smith(data)
+    data=readData("data_nervous_genes_xf.xlsx","C0002395")
+    #calculate_matrix_needle(data)
+    #calculate_matrix_smith(data)
+    #calculate_matriz_levens(data)
     output=data.to_list()
-    #output=remplazar_sequence_for_ID(data)
+    output=list(remplazar_sequence_for_ID(data))
     similarity_matrix=pd.read_csv('resultados/matrizLevenshtein.csv',header=None,index_col=False)-1
     #similarity_matrix=similarity_matrix/2
     similarity_matrix=similarity_matrix.abs()
     similarity_matrix.to_numpy()
     sim_mat_40=similarity_matrix.copy()
+    sim_mat_40=sim_mat_40.to_numpy()
     sim_mat_20=similarity_matrix.copy()
-    sim_mat_10=similarity_matrix.copy()
-    data_40=pd.read_csv('resultados/Metrica_Coincidencia_40.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
+    sim_mat_20=sim_mat_20.to_numpy()
+    #sim_mat_10=similarity_matrix.copy()
+    data_40=pd.read_csv('resultados/Metrica_Coincidencia.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
     data_40=data_40.drop([0])
-    data_20=pd.read_csv('resultados/Metrica_Coincidencia_20.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
-    data_20=data_20.drop([0])
-    data_10=pd.read_csv('resultados/Metrica_Coincidencia_10.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
-    data_10=data_10.drop([0])
-    new_sim=np.copy(similarity_matrix)
-    print(output)
-    new_sim_mean=np.copy(similarity_matrix)
+    #data_20=pd.read_csv('resultados/Metrica_Coincidencia_20.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
+    #data_20=data_20.drop([0])
+    #data_10=pd.read_csv('resultados/Metrica_Coincidencia_10.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
+    #data_10=data_10.drop([0])
+    #new_sim=np.copy(similarity_matrix)
+    #print(output)
+    #new_sim_mean=np.copy(similarity_matrix)
     for i,ks in data_40.iterrows():
-        sim_mat_40[output.index(ks['proteina1'])][output.index(ks['proteina2'])]+=float(ks['%Coincidencia'])*0.3
+        #print(ks['proteina1'])
+        sim_mat_40[output.index(ks['proteina1'])][output.index(ks['proteina2'])]+=float(ks['%Coincidencia'])*0.3/70
+        sim_mat_20[output.index(ks['proteina1'])][output.index(ks['proteina2'])]+=float(ks['%Coincidencia'])
     #for i,kks in data_20.iterrows():
     #    sim_mat_20[output.index(kks['proteina1'])][output.index(kks['proteina2'])]+=float(kks['%Coincidencia'])*0.3
     #for i,ksk in data_10.iterrows():
     #    sim_mat_10[output.index(ksk['proteina1'])][output.index(ksk['proteina2'])]+=float(ksk['%Coincidencia'])*0.3
-    #dfx=pd.DataFrame(sim_mat_20)
-    #dfx=df/1.3
-    #dfx=df-1
-    #dfx.abs()
-    #dfx.to_csv("resultados/matrizLevenshteinFS_20.csv",header=False,index=False)
+    for i in range(0,sim_mat_20.shape[0]):
+        sim_mat_20[i][i]*=2
+    dfx=pd.DataFrame(sim_mat_20)
+    dfx=dfx.apply(lambda x: x/2)
+    dfx=dfx-1
+    dfx=dfx.abs()
+    dfx.to_csv("resultados/matrizLevenshteinFS_Mean.csv",header=False,index=False)
+    for i in range(0,sim_mat_40.shape[0]):
+        sim_mat_40[i][i]/=0.7
     dfx=pd.DataFrame(sim_mat_40)
-    dfx=dfx/1.3
+    dfx=dfx.apply(lambda x: x*0.7)
     dfx=dfx-1
-    dfx.abs()
-    dfx.to_csv("resultados/matrizLevenshteinFS_40.csv",header=False,index=False)
+    dfx=dfx.abs()
+    dfx.to_csv("resultados/matrizLevenshteinFS_70.csv",header=False,index=False)
+    similarity_matrix=pd.read_csv('resultados/matrizNeedleWunch.csv',header=None,index_col=False)+1
+    similarity_matrix=similarity_matrix/2
+    similarity_matrix=similarity_matrix.abs()
+    similarity_matrix.to_numpy()
+    sim_mat_40=similarity_matrix.copy()
+    sim_mat_40=sim_mat_40.to_numpy()
+    sim_mat_20=similarity_matrix.copy()
+    sim_mat_20=sim_mat_20.to_numpy()
+    #sim_mat_10=similarity_matrix.copy()
+    data_40=pd.read_csv('resultados/Metrica_Coincidencia.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
+    data_40=data_40.drop([0])
+    #data_20=pd.read_csv('resultados/Metrica_Coincidencia_20.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
+    #data_20=data_20.drop([0])
+    #data_10=pd.read_csv('resultados/Metrica_Coincidencia_10.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
+    #data_10=data_10.drop([0])
+    #new_sim=np.copy(similarity_matrix)
+    #print(output)
+    #new_sim_mean=np.copy(similarity_matrix)
+    indexes=[]
+    for i,ks in data_40.iterrows():
+        indexes.append((output.index(ks['proteina1']),output.index(ks['proteina2'])))
+        sim_mat_40[output.index(ks['proteina1'])][output.index(ks['proteina2'])]+=float(ks['%Coincidencia'])*0.3/70
+        sim_mat_20[output.index(ks['proteina1'])][output.index(ks['proteina2'])]+=float(ks['%Coincidencia'])/100
+    #print(indexes)
+    #for i,kks in data_20.iterrows():
+    #    sim_mat_20[output.index(kks['proteina1'])][output.index(kks['proteina2'])]+=float(kks['%Coincidencia'])*0.3
+    #for i,ksk in data_10.iterrows():
+    #    sim_mat_10[output.index(ksk['proteina1'])][output.index(ksk['proteina2'])]+=float(ksk['%Coincidencia'])*0.3
+    for i in range(0,sim_mat_20.shape[0]):
+        sim_mat_20[i][i]*=2
+    dfx=pd.DataFrame(sim_mat_20)
+    dfx=dfx.apply(lambda x: x/2)
+    dfx=dfx-1
+    dfx=dfx.abs()
+    dfx.to_csv("resultados/matrizNeedleWunchFS_Mean.csv",header=False,index=False)
+    for i in range(0,sim_mat_40.shape[0]):
+        sim_mat_40[i][i]/=0.7
+    dfx=pd.DataFrame(sim_mat_40)
+    dfx=dfx.apply(lambda x: x*0.7)
+    dfx=dfx-1
+    dfx=dfx.abs()
+    dfx.to_csv("resultados/matrizNeedleWunchFS_70.csv",header=False,index=False)
     """
     dfx=pd.DataFrame(sim_mat_10)
     dfx=df/1.3
...
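Unwinding the arithmetic in the Needleman-Wunsch block above (where the matrix holds similarities in [0, 1]): off the diagonal, the `_70` output blends 0.7 · similarity + 0.3 · (%Coincidencia / 100), and the `_Mean` output is an even 50/50 blend; the diagonal is pre-scaled (`/0.7`, `*2`) so the subsequent global rescaling leaves self-similarity intact, and the trailing `-1` / `.abs()` converts the blend back to a distance. A vectorized sketch of the same fusion (hypothetical helper, not part of the commit; assumes the blended values stay in [0, 1]):

```python
# Sketch: weighted fusion of an alignment similarity matrix with a pairwise
# coincidence metric, then conversion back to a distance matrix.
import numpy as np

def fuse(sim, pairs, coincidencia, w_sim=0.7, w_coin=0.3):
    fused = sim * w_sim                      # weight the alignment similarity
    for (i, j), c in zip(pairs, coincidencia):
        fused[i][j] += (c / 100.0) * w_coin  # weighted coincidence share
    np.fill_diagonal(fused, sim.diagonal())  # keep self-similarity at full value
    return np.abs(fused - 1)                 # back to a distance matrix

# With w_sim=w_coin=0.5 this reproduces the "_Mean" variant written above.
```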