From d5cae6acac3f2b8f0c0d52f671c02bea961ff947 Mon Sep 17 00:00:00 2001 From: Rafael Artinano Date: Mon, 11 Dec 2023 16:54:53 +0100 Subject: [PATCH] update in similarity with aa --- TFM-main/src/compute_for_clases.py | 406 ++++++++++++++++++ ...ate_tha_excel.py => generate_the_excel.py} | 0 TFM-main/src/metricas.py | 160 +++++++ TFM-main/src/patrones_similares_aa.py | 261 ++++++----- 4 files changed, 691 insertions(+), 136 deletions(-) create mode 100644 TFM-main/src/compute_for_clases.py rename TFM-main/src/{generate_tha_excel.py => generate_the_excel.py} (100%) diff --git a/TFM-main/src/compute_for_clases.py b/TFM-main/src/compute_for_clases.py new file mode 100644 index 0000000..fd4418b --- /dev/null +++ b/TFM-main/src/compute_for_clases.py @@ -0,0 +1,406 @@ +import pandas as pd +import time +import ast +import csv +import math +from interfazGrafica import interfaz +from descarteProteinas import ejecutar,remplazar_ID_for_sequence +from generate_tha_excel import substitute_or_remove_prot_id +import metricas +from graficas import grafica +import os +import json +import ast +import re +from patrones_similares_aa import remplazar_sequence_for_ID as remplazar_s +from patrones_similares_aa import buscar_patrones_simAA +from collections import defaultdict +from pathlib import Path + + + +def substitute_or_remove_prot_id2(data,sub_rem): + print("inside the problem") + with open("nombres_sust.txt") as prottosubs: + index=prottosubs.readline() + acept=index.split() + listtosubs={} + for i in range(0,len(acept)): + listtosubs[acept[i]]=[] + while line := prottosubs.readline(): + newline=line.split() + #print(len(newline)) + for i in range(0,len(newline)): + + listtosubs[list(listtosubs.keys())[i]].append(newline[i].strip()) + resub=1 + if re.search("Primary",list(listtosubs.keys())[0]): + resub=0 + print((resub+1)%2) + #print(data) + #data2=data.copy() + if(sub_rem == "s"): + data["Proteina"].replace(list(listtosubs.values())[(resub+1)%2], list(listtosubs.values())[resub]) + #datacp=data.copy() + #print(pd.concat([data2,datacp]).drop_duplicates()) + else: + global globi + datas= data[data["Proteina"].isin(list(listtosubs.values())[(resub+1)%2])==True] + data = data[data["Proteina"].isin(list(listtosubs.values())[(resub+1)%2])==False] + + #datas.to_csv('resultados/proteinasDescartadas_'+ str(globi) +'.csv', index=False) + + globi=globi+1 + return data + +def readData(archivoEntrada, enfermedad, archivoTarget): + data = pd.read_excel(archivoEntrada) + dataC = pd.read_csv("resultados/proteinasDescartadas2.csv") + #data=substitute_or_remove_prot_id(data,"r") + #dataC=substitute_or_remove_prot_id(dataC,"r") + #Descarte de proteinas + data = data[~data['protein_id'].isin(dataC['ProteinasDescartadas'])] + print("Se ha realizado el descarte de proteínas") + + # "C0002395" + if(enfermedad != ''): + data = data.loc[data["disease_id"] == enfermedad] + #dataB = pd.read_excel("proteinas_en_comun_Alzheimer.xlsx") + #print("Se han seleccionado las proteínas de la enfermedad elegida") + #dataB=substitute_or_remove_prot_id(dataB,"r") + #if(archivoTarget != ''): + # dataB=substitute_or_remove_prot_id(dataB,"r") + #Eliminar las proteinas target + # data = data[~((data["disease_id"] == enfermedad) & + # (data["protein_id"].isin(dataB["protein_id"])))] + # print("Se han descartado las proteínas del archivo target") + + sequences = data["protein_sequence"] + print(sequences) + num_filas = sequences.shape[0] + + return sequences, num_filas + +def guardar_patrones_len1(sequences, pattern_freqMin): + all_patterns = dict() + longitud_max = 0 + # Each pattern associated to the proteins the pattern is in + pattern_proteins = {} + for protein in sequences: + longitud = len(protein) + if longitud > longitud_max: + longitud_max = longitud + + all_patterns[protein] = [] + # En cada iteración guarda los patrones que aparecen en la secuencia con sus posiciones asociadas a la proteina + posicionPatterns = dict() + for index, letter in enumerate(protein): + posicionPatterns[letter] = posicionPatterns.get(letter, []) + [index] + + all_patterns[protein] = posicionPatterns + + + for protein, patterns in all_patterns.items(): + for pattern, positions in patterns.items(): + if pattern not in pattern_proteins: + pattern_proteins[pattern] = {} + if protein not in pattern_proteins[pattern]: + pattern_proteins[pattern][protein] = [] + pattern_proteins[pattern][protein].extend(positions) + + + for pattern, proteins in pattern_proteins.items(): + if len(proteins) >= min_ocurrence: + pattern_freqMin[pattern] = proteins + + df = pd.DataFrame(pattern_freqMin.items(), columns=['pattern', 'proteins']) + df.to_csv('prueba2.csv', index=False) + return pattern_freqMin, posicionPatterns, longitud_max + +def buscar_patrones_identicos(sequences): + pattern_freqMin = {} + pattern_freqMin, posicionPatterns, longitud_max = guardar_patrones_len1(sequences, pattern_freqMin) + + if bool(pattern_freqMin): + for pattern_length in range(2, longitud_max + 1): + # Si se intenta acceder a una clave que no existe se creara una lista vacia + auxPos = {} + sub_seqs = [] + for pattern, proteins in pattern_freqMin.items(): + if len(pattern) == pattern_length - 1: + for prot, positions in proteins.items(): + protein_len = len(prot) + if protein_len < pattern_length - 1: + continue + for position in positions: + if (protein_len < position + pattern_length): + continue + sub_seq = prot[position:position + pattern_length] + if sub_seq in pattern_freqMin: + continue + # Si la ultima letra que es la nueva del patron ya esta min_freq, el patron es posible + # min freq tb + ultima_letra = sub_seq[-1] + pos_ultima_letra = position + pattern_length - 1 + if ultima_letra in pattern_freqMin and pos_ultima_letra in pattern_freqMin[ultima_letra][prot]: + if sub_seq not in auxPos: + auxPos[sub_seq] = {} + if prot not in auxPos[sub_seq]: + auxPos[sub_seq][prot] = [] + auxPos[sub_seq][prot].append(position) + if sub_seq not in sub_seqs: + sub_seqs.append(sub_seq) + print(pattern_length) + sub_seqs_copy = sub_seqs.copy() + for p in sub_seqs_copy: + if len(auxPos[p]) < min_ocurrence: + del auxPos[p] + sub_seqs.remove(p) + + # Si no se encuentra ningun patron de longitud pattern_length se sale del bucle. No hay mas patrones posible a encontrar + if not bool(auxPos): + break + + for pattern, proteins in auxPos.items(): + for prot, pos in proteins.items(): + if pattern not in pattern_freqMin: + pattern_freqMin[pattern] = {} + if prot not in pattern_freqMin[pattern]: + pattern_freqMin[pattern][prot] = [] + found=list(filter(lambda x: pos-len(pattern) <= x <= pos+len(pattern), pattern_freqMin[pattern][prot])) + print(found) + print(len(found)) + if(len(found)<=0): + pattern_freqMin[pattern][prot].extend(pos) + if len(pattern) > 2: + if pattern[:-1] in pattern_freqMin: + del pattern_freqMin[pattern[:-1]] + if pattern[1:] in pattern_freqMin: + del pattern_freqMin[pattern[1:]] + + + + # Ordenar de mayor a menor tamaño. Las subcadenas del mismo tamaño se ordenan por orden alfabetico + + dict_ordered_patterns = dict(sorted(pattern_freqMin.items(), key=lambda x: (-len(x[0]), x[0]))) + dict_ordered_patterns = {k: v for k, v in dict_ordered_patterns.items() if len(k) >= 4} + df = pd.DataFrame(dict_ordered_patterns.items(), columns=['pattern', 'proteins']) + num_patrones = df.shape[0] + pattern_freqMin = {k: v for k, v in pattern_freqMin.items() if len(k) >= 4} + return pattern_freqMin, num_patrones + +def remplazar_sequence_for_ID(pattern_freqMin,name): + df_b = pd.read_excel("data_nervous_genes_xf.xlsx") + #df_b=pd.read_excel("proteinasClase_PC00060.xlsx") + #df_b=substitute_or_remove_prot_id(df_b,'r') + cl=pd.read_excel("alzheimer_protein_class 2.xlsx") + #cl=substitute_or_remove_prot_id(cl,"r") + #data2=data.copy() + cli=cl.groupby('protein_id') + di=[] + do={} + for k,v in cli: + for index,row in v.iterrows(): + di.append(row['class_name']) + do[k]=di + di=[] + class_dict=do + output = [] + + for key, value in pattern_freqMin.items(): + for proteina, posiciones in value.items(): + output.append([key, proteina, posiciones]) + + output = [sublista for sublista in output if len(sublista[0]) != 1] + + # Ordenar de mayor a menor tamaño. Las subcadenas del mismo tamaño se ordenan por orden alfabetico + output_ordered = sorted(output, key=lambda x: (-len(x[0]), x[0])) + + + proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values) + for item in output_ordered: + protein_sequence = item[1] + if protein_sequence in proteinas_dict: + item[1] = proteinas_dict[protein_sequence] + item.append(class_dict[item[1]] if item[1] in class_dict else "N/A") + df_a = pd.DataFrame(output_ordered, columns=['Patron', 'Proteina', 'Posiciones','classesProt']) + + # Guardar el DataFrame actualizado en un archivo CSV + + df_a.to_csv('clases/'+ name +'/patronesIdenticos.csv', index=False) + print("Se ha generado el .csv con los patrones idénticos encontrados") + + +if __name__ == "__main__": + if not os.path.exists("resultados"): + # Si no existe, crearla + os.makedirs("resultados") + print(f"La carpeta resultados se ha creado correctamente.") + else: + print(f"La carpeta resultados ya existe.") + + + inicio = time.time() + jsonfile=open("param_file.conf","r") + datosInterfaz=json.load(jsonfile) + #datosInterfaz = interfaz() + print(datosInterfaz) + + #archivoEntrada = datosInterfaz["NombreArchivoEntrada"] + enfermedad = datosInterfaz["CodigoEnfermedad"] + archivoTarget = datosInterfaz["NombreArchivoTarget"] + similitud = float(datosInterfaz["Similitud"]) + cl=pd.read_excel("alzheimer_protein_class 2.xlsx") + #cl=substitute_or_remove_prot_id(cl,"r") + #data2=data.copy() + cli=cl.groupby('protein_id') + di=[] + do={} + for k,v in cli: + for index,row in v.iterrows(): + di.append(row['class_name']) + do[k]=di + di=[] + class_dict=do + + for fil in Path("clases").rglob("*.xlsx"): + if not os.path.exists("clases/"+fil.name.split('.')[0]+"/"): + # Si no existe, crearla + os.makedirs("clases/"+fil.name.split('.')[0]+"/") + print(f"La carpeta resultados se ha creado correctamente.") + else: + print(f"La carpeta resultados ya existe.") + ejecutar("clases/"+fil.name, enfermedad, similitud) + pattern_freqMin = dict() + sequences, num_filas = readData("clases/"+fil.name, enfermedad, archivoTarget) + df_b = pd.read_excel("clases/"+fil.name) + #df_b=pd.read_excel("proteinasClase_PC00060.xlsx") + proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values) + ka="" + for item in sequences: + ka=proteinas_dict[item] + min_ocurrence = math.floor(num_filas * float(datosInterfaz["OcurrenciaMin"])) + seq_len=0 + for i in sequences: + seq_len+=len(i) + print(min_ocurrence) + pattern_freqMin, num_patrones = buscar_patrones_identicos(sequences) + remplazar_sequence_for_ID(pattern_freqMin,fil.name.split('.')[0]) + + df=pd.read_csv('clases/'+fil.name.split('.')[0]+'/patronesIdenticos.csv', usecols=['Patron', 'Proteina', 'Posiciones',"classesProt"],index_col=False) + #df=substitute_or_remove_prot_id2(df,"s") + df.to_csv('clases/'+fil.name.split('.')[0]+'/patronesIdenticos.csv', index=False) + + #dfx=df.copy() + df2=df.groupby('Patron') + dicta={'Patron':[] ,'%Ocurrencia_caracter':[],'longitud_Apariciones':[],'longitud_Apariciones_Proteina':[],'%Patron':[],'%Patron_proteina':[],'total_Patrones':[],'total_Patrones_por_prot':[]} + compl=0 + comp=0 + first=True + res=set() + for k,v in df2: + res=set() + for index,row in v.iterrows(): + Posic=[oo for oo in ast.literal_eval(row['Posiciones']) if oo is not '[' and oo is not ']'] + res |= set(Posic) + compl+=1 + comp+=len(res) + for k,v in df2: + dicta={'Patron':[] ,'%Ocurrencia_caracter':[],'longitud_Apariciones':[],'longitud_Apariciones_Proteina':[],'%Patron':[],'%Patron_proteina':[],'total_Patrones':[],'total_Patrones_por_prot':[]} + dicta[k]=0 + dox=0 + dix=0 + co=0 + Posic=set() + for index,row in v.iterrows(): + Posic|=set([oo for oo in ast.literal_eval(row['Posiciones']) if oo is not '[' and oo is not ']']) + co+=1 + dix+=len(Posic) + dox+=len(Posic)*len(str(k)) + + dox/=seq_len + dicta['%Ocurrencia_caracter'].append(dox*100) + dicta['longitud_Apariciones'].append(co) + dicta['longitud_Apariciones_Proteina'].append(dix) + dicta['%Patron'].append(co/compl*100) + dicta['%Patron_proteina'].append(dix/comp*100) + dicta['Patron'].append(str(k)) + dicta['total_Patrones'].append(compl) + dicta['total_Patrones_por_prot'].append(comp) + do=pd.DataFrame(dicta) + if not first: + do.to_csv('clases/'+fil.name.split('.')[0]+'/patronesOcurrencia.csv',index=False,header=False,mode='a' ) + else: + do.to_csv('clases/'+fil.name.split('.')[0]+'/patronesOcurrencia.csv',index=False ) + first=False + del df2 + + + df3=df.groupby('Proteina') + del df + first=True + di={'proteinas':[],'maximum_ocurrence':[],'patrones':[],'global_ocurrence':[]} + df_b = pd.read_excel("data_nervous_genes_xf.xlsx") + #df_b = pd.read_excel("proteinasClase_PC00060.xlsx") + #df_b=substitute_or_remove_prot_id(df_b,"r") + proteinas_dict = dict(df_b[['protein_id','protein_sequence']].values) + positions_visited=[] + for k,v in df3: + di={'proteinas':[],'maximum_ocurrence':[],'patrones':[],'global_ocurrence':[],"classesProt":[]} + seq=proteinas_dict[k] + di['maximum_ocurrence'].append(len(seq)) + di['proteinas'].append(k) + pato=[] + glob_ocurrence=0 + Acum=[] + + + + for index,row in v.iterrows(): + print(row) + pat={} + pat['patron']=str(row['Patron']) + Posit=[oo for oo in ast.literal_eval(row['Posiciones']) if oo is not '[' and oo is not ']'] + print(Posit) + Add=[] + for i in Posit: + for kaa in range(0,len(str(row['Patron']))): + print(i) + Add.append(int(i)+kaa) + lex=len(list(set(Acum) & set(Add))) + + Posic=Posit + pat['loc_ocurren']=(len(Posic)*len(str(row['Patron'])))/len(seq) + glob_ocurrence+=len(Posic)*len(str(row['Patron']))-lex + pato.append(pat) + Acum=list(set(Acum) | set(Add)) + di['patrones'].append(pato) + di['global_ocurrence'].append(glob_ocurrence) + di['classesProt'].append(class_dict[k] if k in class_dict else "N/A") + do=pd.DataFrame(di) + if not first: + do.to_csv('clases/'+fil.name.split('.')[0]+'/proteinasOcurrencia.csv',index=False,header=False,mode='a' ) + else: + do.to_csv('clases/'+fil.name.split('.')[0]+'/proteinasOcurrencia.csv',index=False) + first=False + + #metricas.metrica_distanciaProteinas() + archivo = 'resultados/Metrica_distanciaProteinasMismoPatron.csv' + nombreOutput = 'resultados/Figura_DistanciaProteinasMismoPatron' + #grafica(archivo, nombreOutput) + + print("Se han obtenido los resultados de la métrica para la distancia entre dos proteínas que poseen el mismo patrón") + + metrica = math.floor(num_patrones * float(datosInterfaz["Metrica"])) + + metricas.patronesComunClas(metrica,fil.name.split('.')[0]) + + archivo = 'resultados/Metrica_patronesComunes.csv' + nombreOutput = 'resultados/Figura_distanciaProteinasPatronesComunes' + #grafica(archivo, nombreOutput) + print("Se han obtenido los resultados de la métrica para la distancia entre dos proteínas que poseen mas de un patrón en común") + + fin = time.time() + + tiempo_total = fin - inicio + print(tiempo_total, "segundos") diff --git a/TFM-main/src/generate_tha_excel.py b/TFM-main/src/generate_the_excel.py similarity index 100% rename from TFM-main/src/generate_tha_excel.py rename to TFM-main/src/generate_the_excel.py diff --git a/TFM-main/src/metricas.py b/TFM-main/src/metricas.py index d007217..c0d0bc6 100755 --- a/TFM-main/src/metricas.py +++ b/TFM-main/src/metricas.py @@ -212,6 +212,166 @@ def patronesComun(patronesComun): # index=False) + +def patronesComunClas(patronesComun,name): + + # Leer el archivo CSV y cargar los datos en una lista de diccionarios + registros = [] + cl=pd.read_excel("alzheimer_protein_class 2.xlsx") + #cl=substitute_or_remove_prot_id(cl,"r") + #data2=data.copy() + cli=cl.groupby('protein_id') + di=[] + do={} + for k,v in cli: + for index,row in v.iterrows(): + di.append(row['class_name']) + do[k]=di + di=[] + class_dict=do + with open("clases/"+name+"/patronesIdenticos.csv", 'r') as file: + reader = csv.DictReader(file) + for row in reader: + registros.append(row) + + # Diccionario para almacenar la cantidad de patrones únicos por proteína + patrones_por_proteina = {} + posiciones_patron={} + # Iterar sobre los registros y extraer los patrones únicos de cada proteína + for registro in registros: + proteina = registro['Proteina'] + patron = registro['Patron'] + posicion = registro['Posiciones'] + if proteina not in patrones_por_proteina: + patrones_por_proteina[proteina] = set() + patrones_por_proteina[proteina].add(patron) + pp=[oo for oo in ast.literal_eval(posicion) if oo is not '[' and oo is not ']'] + if proteina not in posiciones_patron: + posiciones_patron[proteina]={} + posiciones_patron[proteina][patron]=[] + for u in pp: + for kaa in range(0,len(patron)): + posiciones_patron[proteina][patron].append(kaa+int(u)) + + # Diccionario para almacenar las proteinas que tienen en común cada par de proteinas + proteinas_comunes = {} + rr=[] + df_p = pd.read_excel("data_nervous_genes_xf.xlsx") + #df_p = pd.read_excel("proteinasClase_PC00060.xlsx") + #df_p=substitute_or_remove_prot_id(df_p,"r") + proteinas_dict2 = dict(df_p[['protein_id','protein_sequence']].values) + pares_proteinas_procesados = set() + # Filtrar las proteínas que tienen al menos 10 patrones únicos en común + + for proteina1, patrones1 in patrones_por_proteina.items(): + for proteina2, patrones2 in patrones_por_proteina.items(): + if proteina1 != proteina2 and (proteina2, proteina1) not in pares_proteinas_procesados: + patrones_comunes = patrones1.intersection(patrones2) + if len(patrones_comunes) >= patronesComun: + par_proteinas = (proteina1, proteina2) + + + + proteinas_comunes[par_proteinas] = patrones_comunes + pares_proteinas_procesados.add(par_proteinas) + + output = [] + df_b = pd.read_csv("AllProteins_%Similitud.csv") + output2=[] + proteinas_dict = df_b.set_index(['Proteina1', 'Proteina2'])['Similaridad'].to_dict() + outbreak=[] + first=True + first2=True + for par_proteinas, patrones_comunes in proteinas_comunes.items(): + + proteina1, proteina2 = par_proteinas + pattern_lengths = {} + pattern_l={} + Antecedentes={} + + if(proteina1 == 'Q13753' and proteina2 == 'P07550'): + print(patrones_comunes) + for pattern in patrones_comunes: + length = len(pattern) + key = f'Longitud {length}' + if key in pattern_lengths: + pattern_lengths[key].append([pattern]) + Add=posiciones_patron[proteina1][pattern] + if(proteina1 == 'Q13753' and proteina2 == 'P07550'): + print(Add) + if proteina1 not in Antecedentes: + Antecedentes[proteina1]=set() + lex=len(Antecedentes[proteina1] & set(Add)) + Antecedentes[proteina1].update(Add) + pattern_l[key][0]+=len(Add)-lex + Add=posiciones_patron[proteina2][pattern] + + if proteina2 not in Antecedentes: + Antecedentes[proteina2]=set() + lex=len(Antecedentes[proteina2] & set(Add)) + Antecedentes[proteina2].update(Add) + pattern_l[key][1]+=len(Add)-lex + #sprint(length*len(Posic)) + else: + pattern_lengths[key] = [[pattern]] + Add=posiciones_patron[proteina1][pattern] + + if proteina1 not in Antecedentes: + Antecedentes[proteina1]=set() + lex=len(Antecedentes[proteina1] & set(Add)) + #print(lex) + #print(Antecedentes) + Antecedentes[proteina1].update(Add) + Add2=posiciones_patron[proteina2][pattern] + + + if proteina2 not in Antecedentes: + Antecedentes[proteina2]=set() + lex2=len(Antecedentes[proteina2] & set(Add2)) + Antecedentes[proteina2].update(Add2) + + pattern_l[key]=[len(Add)-lex,len(Add2)-lex2] + + sorted_pattern_lengths = dict(sorted(pattern_lengths.items(), key=lambda x: int(x[0][9:]), reverse=True)) + + if proteina1 != proteina2: + prot=[proteinas_dict2[proteina1],proteinas_dict2[proteina2]] + if Antecedentes != {} and(len(prot[0])>0 and len(prot[1])>0): + output.append([sorted_pattern_lengths, proteina1, proteina2,class_dict[proteina1] if proteina1 in class_dict else "N/A",class_dict[proteina2] if proteina2 in class_dict else "N/A"]) + + df = pd.DataFrame(output, columns=['Patrones', 'Proteina1', 'Proteina2',"classesProt1","classesProt2"]) + output=[] + if(first2): + df.to_csv('clases/'+name+'/Metrica_patronesComunes.csv', + index=False) + first2=False + else: + df.to_csv('clases/'+name+'/Metrica_patronesComunes.csv',index=False,header=False,mode='a') + + #else: + #output.append([sorted_pattern_lengths, proteina1, proteina2, + # 'N/A']) + + #print("prot1 : "+proteina1 + " : "+str(len(Antecedentes[proteina1]))) + #print("prot2 : "+proteina2 + " : " + str(len(Antecedentes[proteina2]) )) + if Antecedentes != {} and(len(prot[0])>0 and len(prot[1])>0): + output2.append([proteina1,proteina2, (max(len(Antecedentes[proteina1])/len(prot[0]),len(Antecedentes[proteina2])/len(prot[1]))*100),class_dict[proteina1] if proteina1 in class_dict else "N/A",class_dict[proteina2] if proteina2 in class_dict else "N/A"]) + df2=pd.DataFrame(output2,columns=['proteina1','proteina2','%Coincidencia',"classesProt1","classesProt2"]) + output2=[] + if(first): + df2.to_csv('clases/'+name+'/Metrica_Coincidencia.csv',index=False) + first=False + else: + df2.to_csv('clases/'+name+'/Metrica_Coincidencia.csv',index=False,header=False,mode='a') + + + + #output2=sorted(output2, key = lambda x: int(x[2])) + #df2=pd.DataFrame(output2,columns=['proteina1','proteina2','%Coincidencia']) + #df2.to_csv('resultados/Metrica_Coincidencia.csv', + # index=False) + + def remplazar_sequence_for_ID(output): df_b = pd.read_excel("data_nervous_genes_xf.xlsx") #df_b = pd.read_excel("proteinasClase_PC00060.xlsx") diff --git a/TFM-main/src/patrones_similares_aa.py b/TFM-main/src/patrones_similares_aa.py index 1616460..b6b1501 100644 --- a/TFM-main/src/patrones_similares_aa.py +++ b/TFM-main/src/patrones_similares_aa.py @@ -11,6 +11,8 @@ import os import json import ast import re +from collections import defaultdict + classes={} min_ocurrence=0 def swap_dict(d): @@ -57,171 +59,158 @@ def readData(archivoEntrada, enfermedad, archivoTarget): return sequences, num_filas -def guardar_patrones_len1(sequences, pattern_freqMin): - all_patterns = dict() + + +def read_aminoacidos(): + cla = {} + with open('aminoacidos.txt', 'r') as op: + lines = op.readlines() + for line in lines: + oo = line.replace('\n', '').split('\t') + key = oo.pop(0) + cla[key] = oo + return swap_dict(cla), cla + +def guardar_patrones_len1(sequences, pattern_freqMin, min_ocurrence): + all_patterns = defaultdict(list) longitud_max = 0 - global min_ocurrence - # Each pattern associated to the proteins the pattern is in - pattern_proteins = {} + classes, cla = read_aminoacidos() + for protein in sequences: longitud = len(protein) if longitud > longitud_max: longitud_max = longitud all_patterns[protein] = [] - # En cada iteración guarda los patrones que aparecen en la secuencia con sus posiciones asociadas a la proteina - posicionPatterns = dict() - cla={} - with open('aminoacidos.txt','r') as op: - lines=op.readlines() - print(lines) - for line in lines: - oo=line.replace('\n','').split('\t') - key=oo.pop(0) - print(oo) - cla[key]=oo - classes=swap_dict(cla) - clases=classes - print(clases) + posicion_patterns = defaultdict(list) + for index, letter in enumerate(protein): - posicionPatterns[letter] = posicionPatterns.get(letter, []) + [index] - if(letter in clases): - overst=set() - for EqvLetter in clases[letter]: - overst=overst | set(cla[EqvLetter]) - - for EqvLetter in overst: - if(EqvLetter) != letter: - print(EqvLetter) - posicionPatterns[EqvLetter] = posicionPatterns.get(EqvLetter, []) + [index] - all_patterns[protein] = posicionPatterns + posicion_patterns[letter].append(index) + if letter in classes: + overst = set().union(*[set(cla[eqv_letter]) for eqv_letter in classes[letter]]) + for eqv_letter in overst: + if eqv_letter != letter: + posicion_patterns[eqv_letter].append(index) + + all_patterns[protein] = posicion_patterns + pattern_proteins = defaultdict(dict) for protein, patterns in all_patterns.items(): for pattern, positions in patterns.items(): - if pattern not in pattern_proteins : + if pattern not in pattern_proteins: pattern_proteins[pattern] = {} if protein not in pattern_proteins[pattern]: pattern_proteins[pattern][protein] = [] pattern_proteins[pattern][protein].extend(positions) - for pattern, proteins in pattern_proteins.items(): if len(proteins) >= min_ocurrence: pattern_freqMin[pattern] = proteins df = pd.DataFrame(pattern_freqMin.items(), columns=['pattern', 'proteins']) df.to_csv('prueba2.csv', index=False) - return pattern_freqMin, posicionPatterns, longitud_max -def buscar_patrones_simAA(sequences,min_ocurr): - min_ocurrence=min_ocurr + return pattern_freqMin, posicion_patterns, longitud_max + +def buscar_patrones_simAA(sequences, min_ocurr): + min_ocurrence = min_ocurr pattern_freqMin = {} - pattern_freqMin, posicionPatterns, longitud_max = guardar_patrones_len1(sequences, pattern_freqMin) - cla={} - num_patrones=0 - with open('aminoacidos.txt','r') as op: - lines=op.readlines() - print(lines) - for line in lines: - oo=line.replace('\n','').split('\t') - key=oo.pop(0) - print(oo) - cla[key]=oo - classes=swap_dict(cla) - clases=classes - if bool(pattern_freqMin): - for pattern_length in range(2, longitud_max + 1): - # Si se intenta acceder a una clave que no existe se creara una lista vacia - auxPos = {} - sub_seqs = [] - for pattern, proteins in pattern_freqMin.items(): - if len(pattern) == pattern_length - 1: - for prot, positions in proteins.items(): - protein_len = len(prot) - if protein_len < pattern_length - 1: - continue - for position in positions: - pos_last_letter=position+pattern_length-1 - if pos_last_letter > len(prot)-1: - continue - last_letter = prot[pos_last_letter] - - if last_letter not in clases: - sub_seq = pattern + last_letter - - if sub_seq in pattern_freqMin: - continue - - ultima_letra = sub_seq[-1] - pos_ultima_letra = position + pattern_length - 1 - - if ultima_letra in pattern_freqMin and pos_ultima_letra in pattern_freqMin[ultima_letra][prot]: - if sub_seq not in auxPos: - auxPos[sub_seq] = {} - if prot not in auxPos[sub_seq]: - auxPos[sub_seq][prot] = [] - auxPos[sub_seq][prot].append(position) - if sub_seq not in sub_seqs: - sub_seqs.append(sub_seq) - else: - overst_set = set() - - for EqvLetter in clases[last_letter]: - overst_set |= set(cla[EqvLetter]) + pattern_freqMin, posicion_patterns, longitud_max = guardar_patrones_len1(sequences, pattern_freqMin, min_ocurrence) + classes, cla = read_aminoacidos() - for EqvLetter in overst_set: - sub_seq = pattern + EqvLetter - - if sub_seq in pattern_freqMin: - continue - - ultima_letra = sub_seq[-1] - pos_ultima_letra = position + pattern_length - 1 - - if ultima_letra in pattern_freqMin and pos_ultima_letra in pattern_freqMin[ultima_letra][prot]: - if sub_seq not in auxPos: - auxPos[sub_seq] = {} - if prot not in auxPos[sub_seq]: - auxPos[sub_seq][prot] = [] - auxPos[sub_seq][prot].append(position) - if sub_seq not in sub_seqs: - sub_seqs.append(sub_seq) - print(pattern_length) - sub_seqs_copy = sub_seqs.copy() - for p in sub_seqs_copy: - if len(auxPos[p]) < min_ocurrence: - del auxPos[p] - sub_seqs.remove(p) + if not bool(pattern_freqMin): + return pattern_freqMin, 0 - # Si no se encuentra ningun patron de longitud pattern_length se sale del bucle. No hay mas patrones posible a encontrar - if not bool(auxPos): - break + for pattern_length in range(2, longitud_max + 1): + aux_pos = defaultdict(dict) + sub_seqs = [] - for pattern, proteins in auxPos.items(): - for prot, pos in proteins.items(): - if pattern not in pattern_freqMin: - pattern_freqMin[pattern] = {} - if prot not in pattern_freqMin[pattern]: - pattern_freqMin[pattern][prot] = [] - found=list(filter(lambda x: pos-len(pattern) <= x <= pos+len(pattern), pattern_freqMin[pattern][prot])) - print(found) - print(len(found)) - if(len(found)<=0): - pattern_freqMin[pattern][prot].extend(pos) - if len(pattern) > 2: - if pattern[:-1] in pattern_freqMin: - del pattern_freqMin[pattern[:-1]] - if pattern[1:] in pattern_freqMin: - del pattern_freqMin[pattern[1:]] + for pattern, proteins in pattern_freqMin.items(): + if len(pattern) == pattern_length - 1: + + for prot, positions in proteins.items(): + protein_len = len(prot) + if protein_len < pattern_length - 1: + continue + for position in positions: + pos_last_letter = position + pattern_length - 1 + if pos_last_letter > len(prot) - 1: + continue + last_letter = prot[pos_last_letter] + pos_ultima_letra = position + pattern_length - 1 + if last_letter not in classes: + sub_seq = pattern + last_letter + + if sub_seq in pattern_freqMin: + continue + + ultima_letra = sub_seq[-1] + + + if ultima_letra in pattern_freqMin and pos_ultima_letra in pattern_freqMin[ultima_letra][prot]: + if sub_seq not in aux_pos: + aux_pos[sub_seq] = {} + if prot not in aux_pos[sub_seq]: + aux_pos[sub_seq][prot] = [] + aux_pos[sub_seq][prot].append(position) + if sub_seq not in sub_seqs: + sub_seqs.append(sub_seq) + else: + overst_set = set().union(*[set(cla[eqv_letter]) for eqv_letter in classes[last_letter]]) + broken=False + for eqv_letter in overst_set: + sub_seq = pattern + eqv_letter + if sub_seq in pattern_freqMin: + broken=True + break + if sub_seq in aux_pos: + if prot not in aux_pos[sub_seq]: + aux_pos[sub_seq][prot] = [] + aux_pos[sub_seq][prot].append(position) + broken=True + break + ultima_letra=last_letter + sub_seq = pattern + last_letter + + if not broken and ultima_letra in pattern_freqMin and pos_ultima_letra in pattern_freqMin[ultima_letra][prot]: + if sub_seq not in aux_pos: + aux_pos[sub_seq] = {} + if prot not in aux_pos[sub_seq]: + aux_pos[sub_seq][prot] = [] + aux_pos[sub_seq][prot].append(position) + if sub_seq not in sub_seqs: + sub_seqs.append(sub_seq) + sub_seqs_copy = sub_seqs.copy() + for p in sub_seqs_copy: + if len(aux_pos[p]) < min_ocurrence: + del aux_pos[p] + sub_seqs.remove(p) + + if not bool(aux_pos): + break + + for pattern, proteins in aux_pos.items(): + for prot, pos in proteins.items(): + if pattern not in pattern_freqMin: + pattern_freqMin[pattern] = {} + if prot not in pattern_freqMin[pattern]: + pattern_freqMin[pattern][prot] = [] + found = list(filter(lambda x: pos - len(pattern) <= x <= pos + len(pattern), + pattern_freqMin[pattern][prot])) + if len(found) <= 0: + pattern_freqMin[pattern][prot].extend(pos) + if len(pattern) > 2: + if pattern[:-1] in pattern_freqMin: + del pattern_freqMin[pattern[:-1]] + if pattern[1:] in pattern_freqMin: + del pattern_freqMin[pattern[1:]] - # Ordenar de mayor a menor tamaño. Las subcadenas del mismo tamaño se ordenan por orden alfabetico - - dict_ordered_patterns = dict(sorted(pattern_freqMin.items(), key=lambda x: (-len(x[0]), x[0]))) - dict_ordered_patterns = {k: v for k, v in dict_ordered_patterns.items() if len(k) >= 4} - df = pd.DataFrame(dict_ordered_patterns.items(), columns=['pattern', 'proteins']) - num_patrones = df.shape[0] - pattern_freqMin = {k: v for k, v in pattern_freqMin.items() if len(k) >= 4} + dict_ordered_patterns = dict(sorted(pattern_freqMin.items(), key=lambda x: (-len(x[0]), x[0]))) + #dict_ordered_patterns = {k: v for k, v in dict_ordered_patterns.items() if len(k) >= 4} + df = pd.DataFrame(dict_ordered_patterns.items(), columns=['pattern', 'proteins']) + num_patrones = df.shape[0] + #pattern_freqMin = {k: v for k, v in pattern_freqMin.items() if len(k) >= 4} return pattern_freqMin, num_patrones def buscar_patrones_identicos(sequences,min_ocurr): pattern_freqMin = {} -- 2.24.1