Commit bad93c31 authored by Rafael Artinano

Fix issues in the distance_matrix and cluster generator scripts

parent d5cae6ac
@@ -4,6 +4,8 @@ import time
from sklearn.cluster import OPTICS,DBSCAN,AgglomerativeClustering,BisectingKMeans,SpectralClustering
from sklearn.preprocessing import StandardScaler
import numpy as np
from matplotlib import pyplot as plt
from scipy.spatial.distance import pdist, squareform
from pyclustering.cluster.dbscan import dbscan
from pyclustering.utils import timedcall
@@ -17,6 +19,7 @@ from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import swalign
from scipy.cluster.hierarchy import dendrogram
import multiprocessing as mp
globi=0
df_b=None
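# plot_dendrogram is called below but its definition is outside this diff. A minimal
# sketch is the helper from the scikit-learn documentation, which turns a fitted
# AgglomerativeClustering model into a SciPy linkage matrix (assumption: the model
# was fitted with compute_distances=True so that .distances_ is populated):
def plot_dendrogram(model, **kwargs):
    # Count the samples under each internal node to build the linkage matrix
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count
    linkage_matrix = np.column_stack([model.children_, model.distances_, counts]).astype(float)
    # Delegate the drawing to scipy.cluster.hierarchy.dendrogram (imported above)
    dendrogram(linkage_matrix, **kwargs)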
@@ -73,7 +76,7 @@ def substitute_or_remove_prot_id(data,sub_rem):
return data
def readData(archivoEntrada, enfermedad):
data = pd.read_excel(archivoEntrada)
data=substitute_or_remove_prot_id(data,"r")
#data=substitute_or_remove_prot_id(data,"r")
#data.to_excel('data_nervous_genes_xf.xlsx')
if (enfermedad != ''):
#datar=substitute_or_remove_prot_id(data,"r")
@@ -111,7 +114,7 @@ def descarte(data, threshold):
#print(str(blast_similarity(data[0],data[1]))+" similarity of equals")
# Create the similarity matrix
#num_points = len(data)
similarity_matrix=pd.read_csv('resultados/matrizSmithWater.csv',header=None,index_col=False)-1
similarity_matrix=pd.read_csv('resultados/matrizNeedleWunchFS_70.csv',header=None,index_col=False)-1
similarity_matrix=similarity_matrix.abs()
#sim_matrix=[item[0] for item in similarity_matrix]
#k=0
@@ -132,12 +135,117 @@ def descarte(data, threshold):
dat=data
#datx=np.arange(len(data)).reshape(-1, 1)
#sim
aglom_instance=AgglomerativeClustering(n_clusters=500, affinity='precomputed', linkage = 'average').fit(similarity_matrix.to_numpy())
aglom_instance=AgglomerativeClustering(n_clusters=100, affinity='precomputed', linkage = 'average').fit(similarity_matrix.to_numpy())
print(aglom_instance.labels_)
plot_dendrogram(algom_instance, labels=aglom_instance.labels_)
plt.show()
spectre=SpectralClustering(n_clusters=100,affinity='precomputed_nearest_neighbors').fit(similarity_matrix.to_numpy())
cluster= aglom_instance.labels_
plot_dendrogram(aglom_instance, labels=aglom_instance.labels_)
plt.show()
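# Note: with affinity='precomputed', AgglomerativeClustering expects a *distance*
# matrix; recent scikit-learn releases renamed this parameter to metric='precomputed'.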
filtered_clusters = []
discarded_data = []
discarded_data2=[]
dato=remplazar_sequence_for_ID(data)
similarity_matrix2=similarity_matrix.values.tolist()
clusters={}
for k in range(0,len(cluster)):
if cluster[k] in clusters:
clusters[cluster[k]].append(k)
else:
clusters[cluster[k]]=[k]
print(clusters)
for cluster_id, cluster in clusters.items():
filtered_cluster = []
min_avg_distance = float('inf')
central_point_index = None
print(cluster)
# Compute the average distance for each point in the cluster
for point_index in cluster:
total_distance = 0
for other_index in cluster:
total_distance += similarity_matrix2[point_index][other_index]
avg_distance = total_distance / len(cluster)
if avg_distance < min_avg_distance:
min_avg_distance = avg_distance
central_point_index = point_index
# Check whether the central point exceeds the threshold
similarity_percentage = 1 - (min_avg_distance / eps)
filtered_cluster.append(central_point_index)
print(max(cluster))
print(len(datacp))
print(len(data))
print(len(dato))
discarded_data.extend([[datacp[i], cluster_id,data[central_point_index] , dato[i]]for i in cluster])
#discarded_data2.extend([[dato[i],datacp[i]] for i in cluster if i != central_point_index] )
if filtered_cluster:
filtered_clusters.append(filtered_cluster)
data = remplazar_sequence_for_ID(data)
# Print the results
#for cluster_id, cluster in enumerate(filtered_clusters):
# cluster_data = [data[i] for i in cluster]
# print(f'Cluster {cluster_id}: {", ".join(cluster_data)}')
#discarded_data = remplazar_sequence_for_ID(discarded_data)
# Save the discarded data to a CSV file using pandas
if discarded_data:
df = pd.DataFrame(discarded_data, columns=['protein_sequence','cluster_id','centroid','protein_id'])
#df2 = pd.DataFrame( discarded_data2, columns=['ProteinasDescartadas','secuencia'])
df.to_csv('resultados/proteinasClusterAglomerativeNW70.csv', index=False)
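# The per-cluster loop above is a medoid search: it keeps the point whose average
# distance to the rest of its cluster is smallest. A vectorized sketch of the same
# idea (cluster_medoid is illustrative, not part of this commit):
def cluster_medoid(dist, members):
    sub = dist[np.ix_(members, members)]              # pairwise distances within the cluster
    return members[int(np.argmin(sub.mean(axis=1)))]  # index with the minimum average distance
# e.g. cluster_medoid(similarity_matrix.to_numpy(), clusters[0])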
spectre=SpectralClustering(n_clusters=100,affinity='precomputed_nearest_neighbors').fit(similarity_matrix.to_numpy())
cluster= spectre.labels_
print(spectre.labels_)
filtered_clusters = []
discarded_data = []
discarded_data2=[]
dato=remplazar_sequence_for_ID(data)
similarity_matrix2=similarity_matrix.values.tolist()
clusters={}
for k in range(0,len(cluster)):
if cluster[k] in clusters:
clusters[cluster[k]].append(k)
else:
clusters[cluster[k]]=[k]
print(clusters)
for cluster_id, cluster in clusters.items():
filtered_cluster = []
min_avg_distance = float('inf')
central_point_index = None
print(cluster)
# Compute the average distance for each point in the cluster
for point_index in cluster:
total_distance = 0
for other_index in cluster:
total_distance += similarity_matrix2[point_index][other_index]
avg_distance = total_distance / len(cluster)
if avg_distance < min_avg_distance:
min_avg_distance = avg_distance
central_point_index = point_index
# Check whether the central point exceeds the threshold
similarity_percentage = 1 - (min_avg_distance / eps)
filtered_cluster.append(central_point_index)
print(max(cluster))
discarded_data.extend([[datacp[i], cluster_id,data[central_point_index] , dato[i]]for i in cluster])
#discarded_data2.extend([[dato[i],datacp[i]] for i in cluster if i != central_point_index] )
if filtered_cluster:
filtered_clusters.append(filtered_cluster)
data = remplazar_sequence_for_ID(data)
# Print the results
#for cluster_id, cluster in enumerate(filtered_clusters):
# cluster_data = [data[i] for i in cluster]
# print(f'Cluster {cluster_id}: {", ".join(cluster_data)}')
#discarded_data = remplazar_sequence_for_ID(discarded_data)
# Save the discarded data to a CSV file using pandas
if discarded_data:
df = pd.DataFrame(discarded_data, columns=['protein_sequence','cluster_id','centroid','protein_id'])
#df2 = pd.DataFrame( discarded_data2, columns=['ProteinasDescartadas','secuencia'])
df.to_csv('resultados/proteinasClusterSpectralNW70.csv', index=False)
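# The agglomerative and spectral blocks above repeat the same medoid/export logic;
# a hedged refactor sketch (export_clusters is hypothetical, reusing the
# cluster_medoid helper sketched earlier):
def export_clusters(labels, dist, seqs, ids, out_csv):
    rows = []
    for cid in set(labels):
        members = [i for i, lab in enumerate(labels) if lab == cid]
        center = cluster_medoid(dist, members)  # cluster representative
        rows.extend([[seqs[i], cid, seqs[center], ids[i]] for i in members])
    pd.DataFrame(rows, columns=['protein_sequence', 'cluster_id', 'centroid', 'protein_id']).to_csv(out_csv, index=False)
# e.g. export_clusters(spectre.labels_, similarity_matrix.to_numpy(), datacp, dato,
#                      'resultados/proteinasClusterSpectralNW70.csv')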
dbscan_instance = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed',algorithm='brute').fit(similarity_matrix.to_numpy())
cluster= dbscan_instance.labels_
print(str(len(cluster))+ " " +str(len(similarity_matrix.values.tolist())))
@@ -146,7 +254,7 @@ def descarte(data, threshold):
discarded_data = []
discarded_data2=[]
dato=remplazar_sequence_for_ID(data)
similarity_matrix=similarity_matrix.values.tolist()
similarity_matrix2=similarity_matrix.values.tolist()
clusters={}
for k in range(0,len(cluster)):
if cluster[k] in clusters:
@@ -163,7 +271,7 @@ def descarte(data, threshold):
for point_index in cluster:
total_distance = 0
for other_index in cluster:
total_distance += similarity_matrix[point_index][other_index]
total_distance += similarity_matrix2[point_index][other_index]
avg_distance = total_distance / len(cluster)
if avg_distance < min_avg_distance:
min_avg_distance = avg_distance
@@ -174,7 +282,7 @@ def descarte(data, threshold):
filtered_cluster.append(central_point_index)
#discarded_data.extend([[datacp[i], cluster_id,data[central_point_index] , dato[i]]for i in cluster])
discarded_data.extend([[datacp[i], cluster_id,data[central_point_index] , dato[i]]for i in cluster])
#discarded_data2.extend([[dato[i],datacp[i]] for i in cluster if i != central_point_index] )
if filtered_cluster:
filtered_clusters.append(filtered_cluster)
@@ -188,11 +296,11 @@ def descarte(data, threshold):
#discarded_data = remplazar_sequence_for_ID(discarded_data)
# Save the discarded data to a CSV file using pandas
#if discarded_data:
#df = pd.DataFrame( [], columns=['protein_sequence','cluster_id','centroid','ProteinasDescartadas'])
if discarded_data:
df = pd.DataFrame(discarded_data, columns=['protein_sequence','cluster_id','centroid','protein_id'])
#df2 = pd.DataFrame( discarded_data2, columns=['ProteinasDescartadas','secuencia'])
#df.to_csv('resultados/proteinasDescartadasSmith.csv', index=False)
#df2.to_csv('resultados/proteinasDescartadas2.csv', index=False)
df.to_csv('resultados/proteinasClusterDBScanNW70.csv', index=False)
#df2.to_csv('resultados/proteinasClusterDBScan.csv', index=False)
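# With metric='precomputed', DBSCAN interprets its input as a *distance* matrix
# (0 means identical), which is why the similarity CSV is mapped through |x - 1| above.
# Minimal usage sketch, assuming `dist` is a symmetric (n, n) array of distances in [0, 1]:
#     labels = DBSCAN(eps=0.3, min_samples=2, metric='precomputed').fit_predict(dist)
#     # label -1 marks noise points that belong to no cluster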
def remplazar_sequence_for_ID(output):
@@ -227,3 +335,5 @@ def remplazar_ID_for_sequence(output):
def ejecutar(archivoEntrada, enfermedad, similitud):
data = readData(archivoEntrada, enfermedad)
descarte(data, similitud)
if __name__=='__main__':
ejecutar("data_nervous_genes_xf.xlsx","C0002395",0.0001)
@@ -143,8 +143,8 @@ def calculate_matrix_blasto(data):
datf.to_csv('resultados/matrizBlast.csv', index=False,header=False)
def remplazar_sequence_for_ID(output):
df_b = pd.read_excel("data_nervous_genes_2.xlsx")
df_b= substitute_or_remove_prot_id(df_b,"s")
df_b = pd.read_excel("data_nervous_genes_xf.xlsx")
#df_b= substitute_or_remove_prot_id(df_b,"s")
proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
for i in range(len(output)):
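# The loop body is truncated by the diff; one plausible (hypothetical) sketch of the
# replacement step, mirroring the sequence-to-id dict built just above, is:
#     output[i] = proteinas_dict.get(output[i], output[i])  # swap sequence for its protein_id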
@@ -255,45 +255,99 @@ def readData(archivoEntrada, enfermedad):
if __name__=="__main__":
data=readData("data_nervous_genes_x.xlsx","C0002395")
calculate_matrix_needle(data)
calculate_matrix_smith(data)
data=readData("data_nervous_genes_xf.xlsx","C0002395")
#calculate_matrix_needle(data)
#calculate_matrix_smith(data)
#calculate_matriz_levens(data)
output=data.to_list()
#output=remplazar_sequence_for_ID(data)
output=list(remplazar_sequence_for_ID(data))
similarity_matrix=pd.read_csv('resultados/matrizLevenshtein.csv',header=None,index_col=False)-1
#similarity_matrix=similarity_matrix/2
similarity_matrix=similarity_matrix.abs()
similarity_matrix.to_numpy()
sim_mat_40=similarity_matrix.copy()
sim_mat_40=sim_mat_40.to_numpy()
sim_mat_20=similarity_matrix.copy()
sim_mat_10=similarity_matrix.copy()
data_40=pd.read_csv('resultados/Metrica_Coincidencia_40.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
sim_mat_20=sim_mat_20.to_numpy()
#sim_mat_10=similarity_matrix.copy()
data_40=pd.read_csv('resultados/Metrica_Coincidencia.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
data_40=data_40.drop([0])
data_20=pd.read_csv('resultados/Metrica_Coincidencia_20.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
data_20=data_20.drop([0])
data_10=pd.read_csv('resultados/Metrica_Coincidencia_10.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
data_10=data_10.drop([0])
new_sim=np.copy(similarity_matrix)
print(output)
new_sim_mean=np.copy(similarity_matrix)
#data_20=pd.read_csv('resultados/Metrica_Coincidencia_20.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
#data_20=data_20.drop([0])
#data_10=pd.read_csv('resultados/Metrica_Coincidencia_10.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
#data_10=data_10.drop([0])
#new_sim=np.copy(similarity_matrix)
#print(output)
#new_sim_mean=np.copy(similarity_matrix)
for i,ks in data_40.iterrows():
sim_mat_40[output.index(ks['proteina1'])][output.index(ks['proteina2'])]+=float(ks['%Coincidencia'])*0.3
#print(ks['proteina1'])
sim_mat_40[output.index(ks['proteina1'])][output.index(ks['proteina2'])]+=float(ks['%Coincidencia'])*0.3/70
sim_mat_20[output.index(ks['proteina1'])][output.index(ks['proteina2'])]+=float(ks['%Coincidencia'])
#for i,kks in data_20.iterrows():
# sim_mat_20[output.index(kks['proteina1'])][output.index(kks['proteina2'])]+=float(kks['%Coincidencia'])*0.3
#for i,ksk in data_10.iterrows():
# sim_mat_10[output.index(ksk['proteina1'])][output.index(ksk['proteina2'])]+=float(ksk['%Coincidencia'])*0.3
#dfx=pd.DataFrame(sim_mat_20)
#dfx=df/1.3
#dfx=df-1
#dfx.abs()
for i in range(0,sim_mat_20.shape[0]):
sim_mat_20[i][i]*=2
dfx=pd.DataFrame(sim_mat_20)
dfx=dfx.apply(lambda x: x/2)
dfx=dfx-1
dfx=dfx.abs()
#dfx.to_csv("resultados/matrizLevenshteinFS_20.csv",header=False,index=False)
dfx.to_csv("resultados/matrizLevenshteinFS_Mean.csv",header=False,index=False)
for i in range(0,sim_mat_40.shape[0]):
sim_mat_40[i][i]/=0.7
dfx=pd.DataFrame(sim_mat_40)
dfx=dfx/1.3
dfx=dfx.apply(lambda x: x*0.7)
dfx=dfx-1
dfx.abs()
dfx=dfx.abs()
dfx.to_csv("resultados/matrizLevenshteinFS_70.csv",header=False,index=False)
similarity_matrix=pd.read_csv('resultados/matrizNeedleWunch.csv',header=None,index_col=False)+1
similarity_matrix=similarity_matrix/2
similarity_matrix=similarity_matrix.abs()
similarity_matrix.to_numpy()
sim_mat_40=similarity_matrix.copy()
sim_mat_40=sim_mat_40.to_numpy()
sim_mat_20=similarity_matrix.copy()
sim_mat_20=sim_mat_20.to_numpy()
#sim_mat_10=similarity_matrix.copy()
data_40=pd.read_csv('resultados/Metrica_Coincidencia.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
data_40=data_40.drop([0])
#data_20=pd.read_csv('resultados/Metrica_Coincidencia_20.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
#data_20=data_20.drop([0])
#data_10=pd.read_csv('resultados/Metrica_Coincidencia_10.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
#data_10=data_10.drop([0])
#new_sim=np.copy(similarity_matrix)
#print(output)
#new_sim_mean=np.copy(similarity_matrix)
indexes=[]
for i,ks in data_40.iterrows():
indexes.append((output.index(ks['proteina1']),output.index(ks['proteina2'])))
sim_mat_40[output.index(ks['proteina1'])][output.index(ks['proteina2'])]+=float(ks['%Coincidencia'])*0.3/70
sim_mat_20[output.index(ks['proteina1'])][output.index(ks['proteina2'])]+=float(ks['%Coincidencia'])/100
#print(indexes)
#for i,kks in data_20.iterrows():
# sim_mat_20[output.index(kks['proteina1'])][output.index(kks['proteina2'])]+=float(kks['%Coincidencia'])*0.3
#for i,ksk in data_10.iterrows():
# sim_mat_10[output.index(ksk['proteina1'])][output.index(ksk['proteina2'])]+=float(ksk['%Coincidencia'])*0.3
for i in range(0,sim_mat_20.shape[0]):
sim_mat_20[i][i]*=2
dfx=pd.DataFrame(sim_mat_20)
dfx=dfx.apply(lambda x: x/2)
dfx=dfx-1
dfx=dfx.abs()
dfx.to_csv("resultados/matrizNeedleWunchFS_Mean.csv",header=False,index=False)
for i in range(0,sim_mat_40.shape[0]):
sim_mat_40[i][i]/=0.7
dfx=pd.DataFrame(sim_mat_40)
dfx=dfx.apply(lambda x: x*0.7)
dfx=dfx-1
dfx=dfx.abs()
dfx.to_csv("resultados/matrizLevenshteinFS_40.csv",header=False,index=False)
dfx.to_csv("resultados/mmatrizNeedleWunchFS_70.csv",header=False,index=False)
"""
dfx=pd.DataFrame(sim_mat_10)
dfx=df/1.3
......