Commit d5cae6ac authored by Rafael Artinano

update in similarity with aa

parent 3c565e97
import pandas as pd
import time
import ast
import csv
import math
from interfazGrafica import interfaz
from descarteProteinas import ejecutar,remplazar_ID_for_sequence
from generate_tha_excel import substitute_or_remove_prot_id
import metricas
from graficas import grafica
import os
import json
import ast
import re
from patrones_similares_aa import remplazar_sequence_for_ID as remplazar_s
from patrones_similares_aa import buscar_patrones_simAA
from collections import defaultdict
from pathlib import Path
def substitute_or_remove_prot_id2(data,sub_rem):
    """Substitute or drop protein IDs listed in ``nombres_sust.txt``.

    The file is whitespace separated: the first line names the columns and
    every following line adds one value per column.  The column whose header
    matches ``"Primary"`` (checked on the first header only) is the
    replacement side; the other of the first two columns is the side that
    gets replaced or removed.

    Args:
        data: DataFrame with a ``"Proteina"`` column.
        sub_rem: ``"s"`` to substitute the listed IDs; any other value
            removes the rows whose ``"Proteina"`` is listed.

    Returns:
        A new DataFrame; the input frame is not modified.

    Raises:
        ValueError: if the file has no header columns.
    """
    global globi
    with open("nombres_sust.txt") as prottosubs:
        columnas = {nombre: [] for nombre in prottosubs.readline().split()}
        for line in prottosubs:
            # zip() pairs tokens with headers positionally, like the original
            # index-based loop, but ignores surplus tokens instead of failing.
            for nombre, valor in zip(columnas, line.split()):
                columnas[nombre].append(valor)
    if not columnas:
        raise ValueError("nombres_sust.txt has no header line")

    valores = list(columnas.values())
    # resub is the index of the replacement column, the other index is the
    # column being replaced/removed.
    resub = 0 if re.search("Primary", next(iter(columnas))) else 1
    origen = valores[(resub + 1) % 2]

    if sub_rem == "s":
        # Series.replace returns a new Series; the original code discarded it,
        # which made the substitution a silent no-op.
        data = data.assign(Proteina=data["Proteina"].replace(origen, valores[resub]))
    else:
        data = data[~data["Proteina"].isin(origen)]
        # Module-level counter of removal passes.  It is never initialised in
        # this file, so default it to 0 instead of raising NameError.
        globi = globals().get("globi", 0) + 1
    return data
def readData(archivoEntrada, enfermedad, archivoTarget):
    """Load the protein sequences to analyse from an Excel workbook.

    Rows whose ``protein_id`` appears in the ``ProteinasDescartadas`` column
    of ``resultados/proteinasDescartadas2.csv`` are dropped.  When
    ``enfermedad`` is not the empty string, only rows with that
    ``disease_id`` are kept.

    Args:
        archivoEntrada: path of the Excel file to read.
        enfermedad: disease code to keep, or ``''`` for no disease filter.
        archivoTarget: currently unused; the target-protein filtering that
            consumed it is disabled in this version.

    Returns:
        ``(sequences, num_filas)``: the ``protein_sequence`` column of the
        filtered table and its number of rows.
    """
    tabla = pd.read_excel(archivoEntrada)
    descartadas = pd.read_csv("resultados/proteinasDescartadas2.csv")

    # Discard the proteins that were ruled out earlier.
    conservar = ~tabla['protein_id'].isin(descartadas['ProteinasDescartadas'])
    tabla = tabla[conservar]
    print("Se ha realizado el descarte de proteínas")

    if enfermedad != '':
        tabla = tabla.loc[tabla["disease_id"] == enfermedad]

    sequences = tabla["protein_sequence"]
    print(sequences)
    return sequences, sequences.shape[0]
def guardar_patrones_len1(sequences, pattern_freqMin):
    """Collect the single-letter patterns that occur in enough sequences.

    Every letter is mapped to ``{sequence: [positions]}``.  Letters present in
    at least ``min_ocurrence`` distinct sequences (module-level global, set by
    the caller before invoking this function) are stored in
    ``pattern_freqMin``.  The resulting table is also dumped to
    ``prueba2.csv`` in the working directory.

    Args:
        sequences: iterable of sequence strings.
        pattern_freqMin: dict updated in place with the frequent letters.

    Returns:
        ``(pattern_freqMin, posicionPatterns, longitud_max)`` where
        ``posicionPatterns`` is the letter -> positions map of the last
        sequence iterated (empty dict when ``sequences`` is empty) and
        ``longitud_max`` is the length of the longest sequence.
    """
    all_patterns = {}
    longitud_max = 0
    # Bound up front so an empty input returns {} instead of raising
    # UnboundLocalError at the return statement.
    posicionPatterns = {}
    for protein in sequences:
        longitud_max = max(longitud_max, len(protein))
        # Positions of every letter in this sequence; append in place rather
        # than rebuilding the list for each occurrence.
        posicionPatterns = {}
        for index, letter in enumerate(protein):
            posicionPatterns.setdefault(letter, []).append(index)
        all_patterns[protein] = posicionPatterns

    # Invert to letter -> {sequence: positions}.  Positions are copied so the
    # per-sequence maps are not aliased by the result.
    pattern_proteins = {}
    for protein, patterns in all_patterns.items():
        for pattern, positions in patterns.items():
            pattern_proteins.setdefault(pattern, {})[protein] = list(positions)

    for pattern, proteins in pattern_proteins.items():
        if len(proteins) >= min_ocurrence:
            pattern_freqMin[pattern] = proteins

    df = pd.DataFrame(pattern_freqMin.items(), columns=['pattern', 'proteins'])
    df.to_csv('prueba2.csv', index=False)
    return pattern_freqMin, posicionPatterns, longitud_max
def buscar_patrones_identicos(sequences):
    """Grow frequent identical patterns one letter at a time.

    Starts from the frequent single letters returned by
    ``guardar_patrones_len1`` and, for each length from 2 upwards, extends
    every pattern of the previous length by the next letter of each sequence
    it occurs in.  A candidate is kept when it appears in at least
    ``min_ocurrence`` distinct sequences (module-level global).  When a
    pattern longer than 2 is accepted, its prefix and suffix of length
    ``len - 1`` are removed from the result.  The search stops at the first
    length that yields no candidate.

    Args:
        sequences: iterable of sequence strings.

    Returns:
        ``(pattern_freqMin, num_patrones)``: the patterns of length >= 4 as
        ``{pattern: {sequence: [start positions]}}`` and how many there are.
    """
    pattern_freqMin = {}
    pattern_freqMin, _, longitud_max = guardar_patrones_len1(sequences, pattern_freqMin)

    if pattern_freqMin:
        for pattern_length in range(2, longitud_max + 1):
            # Candidates of the current length: pattern -> {sequence: starts}.
            auxPos = {}
            for pattern, proteins in pattern_freqMin.items():
                if len(pattern) != pattern_length - 1:
                    continue
                for prot, positions in proteins.items():
                    protein_len = len(prot)
                    for position in positions:
                        if protein_len < position + pattern_length:
                            continue
                        sub_seq = prot[position:position + pattern_length]
                        if sub_seq in pattern_freqMin:
                            continue
                        # The extension is only viable when the newly added
                        # last letter is itself a frequent pattern.
                        ultima_letra = sub_seq[-1]
                        pos_ultima_letra = position + pattern_length - 1
                        if (ultima_letra in pattern_freqMin
                                and pos_ultima_letra in pattern_freqMin[ultima_letra][prot]):
                            auxPos.setdefault(sub_seq, {}).setdefault(prot, []).append(position)
            print(pattern_length)

            # Keep only the candidates found in enough distinct sequences.
            auxPos = {p: prots for p, prots in auxPos.items()
                      if len(prots) >= min_ocurrence}
            # No pattern of this length: longer ones cannot exist either.
            if not auxPos:
                break

            for pattern, proteins in auxPos.items():
                for prot, pos in proteins.items():
                    # The original filtered the existing positions with
                    # `pos - len(pattern)` where `pos` is a list (TypeError if
                    # ever evaluated).  The target list is always empty at this
                    # point, so the filter never ran and the positions were
                    # always appended; do that directly.
                    pattern_freqMin.setdefault(pattern, {}).setdefault(prot, []).extend(pos)
                if len(pattern) > 2:
                    # Drop the shorter patterns subsumed by this one.
                    pattern_freqMin.pop(pattern[:-1], None)
                    pattern_freqMin.pop(pattern[1:], None)

    # Only patterns with at least 4 letters are reported.
    pattern_freqMin = {k: v for k, v in pattern_freqMin.items() if len(k) >= 4}
    num_patrones = len(pattern_freqMin)
    return pattern_freqMin, num_patrones
def remplazar_sequence_for_ID(pattern_freqMin,name):
    """Write the identical patterns of one class to ``patronesIdenticos.csv``.

    Each ``(pattern, sequence, positions)`` entry becomes one row, skipping
    one-letter patterns.  Rows are ordered by decreasing pattern length and
    then alphabetically.  A sequence found in ``data_nervous_genes_xf.xlsx``
    is replaced by its ``protein_id``, and the class names listed for that
    value in ``alzheimer_protein_class 2.xlsx`` are attached (``"N/A"`` when
    there are none).

    Args:
        pattern_freqMin: ``{pattern: {sequence: positions}}``.
        name: class folder name; output goes to
            ``clases/<name>/patronesIdenticos.csv``.
    """
    df_b = pd.read_excel("data_nervous_genes_xf.xlsx")
    cl = pd.read_excel("alzheimer_protein_class 2.xlsx")

    # protein_id -> list of its class names, in file order.
    class_dict = {
        prot_id: list(grupo['class_name'])
        for prot_id, grupo in cl.groupby('protein_id')
    }

    # Longest patterns first; ties are broken alphabetically.
    filas = sorted(
        (
            [patron, proteina, posiciones]
            for patron, por_proteina in pattern_freqMin.items()
            for proteina, posiciones in por_proteina.items()
            if len(patron) != 1
        ),
        key=lambda fila: (-len(fila[0]), fila[0]),
    )

    id_por_secuencia = dict(df_b[['protein_sequence', 'protein_id']].values)
    for fila in filas:
        if fila[1] in id_por_secuencia:
            fila[1] = id_por_secuencia[fila[1]]
        fila.append(class_dict[fila[1]] if fila[1] in class_dict else "N/A")

    df_a = pd.DataFrame(filas, columns=['Patron', 'Proteina', 'Posiciones','classesProt'])
    # Save the table as CSV inside the class folder.
    df_a.to_csv('clases/'+ name +'/patronesIdenticos.csv', index=False)
    print("Se ha generado el .csv con los patrones idénticos encontrados")
if __name__ == "__main__":
    # Script entry point.  For every .xlsx workbook found under "clases/":
    #   1. run the protein discard step (ejecutar) and load its sequences,
    #   2. mine the identical patterns and write patronesIdenticos.csv,
    #   3. write per-pattern statistics (patronesOcurrencia.csv),
    #   4. write per-protein statistics (proteinasOcurrencia.csv),
    #   5. run the common-pattern metric (metricas.patronesComunClas).
    # NOTE(review): the original indentation was lost; the nesting below is
    # reconstructed from the data flow and should be confirmed.
    if not os.path.exists("resultados"):
        # Create the folder when it does not exist yet.
        os.makedirs("resultados")
        print("La carpeta resultados se ha creado correctamente.")
    else:
        print("La carpeta resultados ya existe.")
    inicio = time.time()

    # Parameters come from a JSON file; close the handle once parsed.
    with open("param_file.conf", "r") as jsonfile:
        datosInterfaz = json.load(jsonfile)
    print(datosInterfaz)
    enfermedad = datosInterfaz["CodigoEnfermedad"]
    archivoTarget = datosInterfaz["NombreArchivoTarget"]
    similitud = float(datosInterfaz["Similitud"])

    # protein_id -> list of class names.
    cl = pd.read_excel("alzheimer_protein_class 2.xlsx")
    class_dict = {k: list(v['class_name']) for k, v in cl.groupby('protein_id')}

    # protein_id -> sequence; identical for every class, so load it once.
    df_b = pd.read_excel("data_nervous_genes_xf.xlsx")
    proteinas_dict = dict(df_b[['protein_id', 'protein_sequence']].values)

    for fil in Path("clases").rglob("*.xlsx"):
        nombre_clase = fil.name.split('.')[0]
        carpeta = "clases/" + nombre_clase + "/"
        if not os.path.exists(carpeta):
            os.makedirs(carpeta)
            print("La carpeta resultados se ha creado correctamente.")
        else:
            print("La carpeta resultados ya existe.")

        ejecutar("clases/" + fil.name, enfermedad, similitud)
        sequences, num_filas = readData("clases/" + fil.name, enfermedad, archivoTarget)

        # Must stay a module-level global: the pattern search functions read it.
        min_ocurrence = math.floor(num_filas * float(datosInterfaz["OcurrenciaMin"]))
        seq_len = sum(len(secuencia) for secuencia in sequences)
        print(min_ocurrence)

        pattern_freqMin, num_patrones = buscar_patrones_identicos(sequences)
        remplazar_sequence_for_ID(pattern_freqMin, nombre_clase)

        ruta_identicos = carpeta + 'patronesIdenticos.csv'
        df = pd.read_csv(ruta_identicos,
                         usecols=['Patron', 'Proteina', 'Posiciones', "classesProt"],
                         index_col=False)
        df.to_csv(ruta_identicos, index=False)

        # ---- Per-pattern statistics -------------------------------------
        # pattern -> (rows containing it, distinct start positions).
        # 'Posiciones' holds the repr of a list of ints, so literal_eval
        # already yields the positions; the original additionally compared
        # each one to '[' / ']' with `is not`, which was always true.
        por_patron = {}
        for patron, filas in df.groupby('Patron'):
            listas = [ast.literal_eval(p) for p in filas['Posiciones']]
            por_patron[patron] = (len(listas), len(set().union(*listas)))
        compl = sum(co for co, _ in por_patron.values())
        comp = sum(dix for _, dix in por_patron.values())

        # The original also stored `dicta[k] = 0`, which added a stray column
        # named after the pattern to every row; that column is dropped here.
        filas_patron = [
            {
                'Patron': str(patron),
                '%Ocurrencia_caracter': dix * len(str(patron)) / seq_len * 100,
                'longitud_Apariciones': co,
                'longitud_Apariciones_Proteina': dix,
                '%Patron': co / compl * 100,
                '%Patron_proteina': dix / comp * 100,
                'total_Patrones': compl,
                'total_Patrones_por_prot': comp,
            }
            for patron, (co, dix) in por_patron.items()
        ]
        if filas_patron:
            pd.DataFrame(filas_patron).to_csv(carpeta + 'patronesOcurrencia.csv', index=False)

        # ---- Per-protein statistics -------------------------------------
        filas_proteina = []
        for proteina, filas in df.groupby('Proteina'):
            seq = proteinas_dict[proteina]
            patrones = []
            glob_ocurrence = 0
            cubiertas = set()  # residue indices already covered by a pattern
            for _, row in filas.iterrows():
                patron = str(row['Patron'])
                inicios = ast.literal_eval(row['Posiciones'])
                ocupadas = {int(i) + k for i in inicios for k in range(len(patron))}
                # Residues shared with earlier patterns are counted once.
                solapadas = len(cubiertas & ocupadas)
                patrones.append({
                    'patron': patron,
                    'loc_ocurren': (len(inicios) * len(patron)) / len(seq),
                })
                glob_ocurrence += len(inicios) * len(patron) - solapadas
                cubiertas |= ocupadas
            filas_proteina.append({
                'proteinas': proteina,
                'maximum_ocurrence': len(seq),
                'patrones': patrones,
                'global_ocurrence': glob_ocurrence,
                'classesProt': class_dict[proteina] if proteina in class_dict else "N/A",
            })
        if filas_proteina:
            pd.DataFrame(filas_proteina).to_csv(carpeta + 'proteinasOcurrencia.csv', index=False)
        del df

        # ---- Metrics ----------------------------------------------------
        print("Se han obtenido los resultados de la métrica para la distancia entre dos proteínas que poseen el mismo patrón")
        metrica = math.floor(num_patrones * float(datosInterfaz["Metrica"]))
        metricas.patronesComunClas(metrica, nombre_clase)
        print("Se han obtenido los resultados de la métrica para la distancia entre dos proteínas que poseen mas de un patrón en común")

    fin = time.time()
    tiempo_total = fin - inicio
    print(tiempo_total, "segundos")
......@@ -212,6 +212,166 @@ def patronesComun(patronesComun):
# index=False)
def patronesComunClas(patronesComun,name):
    """Compare every pair of proteins of a class by their shared patterns.

    Reads ``clases/<name>/patronesIdenticos.csv`` and, for each unordered pair
    of proteins sharing at least ``patronesComun`` patterns (and at least
    one), appends one row to each of:

    * ``clases/<name>/Metrica_patronesComunes.csv`` - the shared patterns
      grouped by length (longest first) plus both proteins and their classes;
    * ``clases/<name>/Metrica_Coincidencia.csv`` - ``%Coincidencia``, the
      larger of the two fractions "residues covered by shared patterns /
      sequence length", times 100.

    Sequences come from ``data_nervous_genes_xf.xlsx`` and class names from
    ``alzheimer_protein_class 2.xlsx``.  Neither output file is created when
    no pair qualifies.

    Args:
        patronesComun: minimum number of shared patterns for a pair.
        name: class folder name under ``clases/``.

    Raises:
        KeyError: if a protein of a qualifying pair has no sequence entry.
    """
    from itertools import combinations

    cl = pd.read_excel("alzheimer_protein_class 2.xlsx")
    class_dict = {k: list(v['class_name']) for k, v in cl.groupby('protein_id')}

    # protein -> set of its patterns, and
    # protein -> pattern -> every residue index covered by that pattern.
    patrones_por_proteina = {}
    posiciones_patron = {}
    with open("clases/"+name+"/patronesIdenticos.csv", 'r') as file:
        for registro in csv.DictReader(file):
            proteina = registro['Proteina']
            patron = registro['Patron']
            patrones_por_proteina.setdefault(proteina, set()).add(patron)
            # 'Posiciones' is the repr of a list of start indices.
            posiciones_patron.setdefault(proteina, {})[patron] = [
                int(inicio) + desplazamiento
                for inicio in ast.literal_eval(registro['Posiciones'])
                for desplazamiento in range(len(patron))
            ]

    df_p = pd.read_excel("data_nervous_genes_xf.xlsx")
    proteinas_dict2 = dict(df_p[['protein_id','protein_sequence']].values)

    ruta_comunes = 'clases/'+name+'/Metrica_patronesComunes.csv'
    ruta_coincidencia = 'clases/'+name+'/Metrica_Coincidencia.csv'
    first = True
    # combinations() yields each unordered pair once, in the same order the
    # original nested loop with its "already processed" set produced.
    for (proteina1, patrones1), (proteina2, patrones2) in combinations(
            patrones_por_proteina.items(), 2):
        patrones_comunes = patrones1 & patrones2
        if len(patrones_comunes) < patronesComun:
            continue

        pattern_lengths = {}
        cubiertas1, cubiertas2 = set(), set()
        # sorted(): iteration order over a set of strings varies between
        # runs and would otherwise leak into the written CSV.
        for pattern in sorted(patrones_comunes):
            pattern_lengths.setdefault(f'Longitud {len(pattern)}', []).append([pattern])
            cubiertas1.update(posiciones_patron[proteina1][pattern])
            cubiertas2.update(posiciones_patron[proteina2][pattern])
        # Keys look like "Longitud <n>"; order by <n>, longest first.
        sorted_pattern_lengths = dict(sorted(pattern_lengths.items(),
                                             key=lambda x: int(x[0][9:]), reverse=True))

        seq1 = proteinas_dict2[proteina1]
        seq2 = proteinas_dict2[proteina2]
        if not patrones_comunes or not (len(seq1) > 0 and len(seq2) > 0):
            continue

        clases1 = class_dict[proteina1] if proteina1 in class_dict else "N/A"
        clases2 = class_dict[proteina2] if proteina2 in class_dict else "N/A"
        coincidencia = max(len(cubiertas1) / len(seq1), len(cubiertas2) / len(seq2)) * 100

        # Rows are appended one by one so the full pair list never has to be
        # held in memory; only the first write emits the header.
        modo = 'w' if first else 'a'
        pd.DataFrame(
            [[sorted_pattern_lengths, proteina1, proteina2, clases1, clases2]],
            columns=['Patrones', 'Proteina1', 'Proteina2', "classesProt1", "classesProt2"],
        ).to_csv(ruta_comunes, index=False, header=first, mode=modo)
        pd.DataFrame(
            [[proteina1, proteina2, coincidencia, clases1, clases2]],
            columns=['proteina1', 'proteina2', '%Coincidencia', "classesProt1", "classesProt2"],
        ).to_csv(ruta_coincidencia, index=False, header=first, mode=modo)
        first = False
def remplazar_sequence_for_ID(output):
df_b = pd.read_excel("data_nervous_genes_xf.xlsx")
#df_b = pd.read_excel("proteinasClase_PC00060.xlsx")
......
......@@ -11,6 +11,8 @@ import os
import json
import ast
import re
from collections import defaultdict
classes={}
min_ocurrence=0
def swap_dict(d):
......@@ -57,155 +59,146 @@ def readData(archivoEntrada, enfermedad, archivoTarget):
return sequences, num_filas
def read_aminoacidos():
    """Read the amino-acid groups from ``aminoacidos.txt``.

    Each line is tab separated: the first field is the group key and the
    remaining fields are its members.

    Returns:
        ``(swap_dict(cla), cla)`` where ``cla`` maps group key -> members.
        NOTE(review): ``swap_dict`` is defined elsewhere in this module; it is
        presumably the inverse map (member -> group keys) - confirm.
    """
    cla = {}
    with open('aminoacidos.txt', 'r') as op:
        for line in op:
            oo = line.replace('\n', '').split('\t')
            key = oo.pop(0)
            cla[key] = oo
    return swap_dict(cla), cla


def guardar_patrones_len1(sequences, pattern_freqMin, min_ocurrence):
    """Collect frequent single-letter patterns, counting equivalent letters.

    Every position of a sequence is recorded under its own letter and, when
    the letter belongs to an amino-acid group, under every other member of
    the groups it belongs to.  Letters present in at least ``min_ocurrence``
    distinct sequences are stored in ``pattern_freqMin``; that table is also
    dumped to ``prueba2.csv``.

    This is the post-commit version of the function; the pre-commit lines
    interleaved with it by the diff view are dropped.

    Args:
        sequences: iterable of sequence strings.
        pattern_freqMin: dict updated in place with the frequent letters.
        min_ocurrence: minimum number of distinct sequences.

    Returns:
        ``(pattern_freqMin, posicion_patterns, longitud_max)`` where
        ``posicion_patterns`` is the letter -> positions map of the last
        sequence iterated (empty when ``sequences`` is empty).
    """
    classes, cla = read_aminoacidos()
    all_patterns = {}
    longitud_max = 0
    # Bound up front so an empty input does not fail at the return statement.
    posicion_patterns = defaultdict(list)
    for protein in sequences:
        longitud_max = max(longitud_max, len(protein))
        # Letter -> positions for this sequence only.
        posicion_patterns = defaultdict(list)
        for index, letter in enumerate(protein):
            posicion_patterns[letter].append(index)
            if letter in classes:
                overst = set().union(*[set(cla[eqv_letter]) for eqv_letter in classes[letter]])
                # sorted(): keeps key insertion order reproducible, since set
                # order of strings changes between interpreter runs.
                for eqv_letter in sorted(overst):
                    if eqv_letter != letter:
                        posicion_patterns[eqv_letter].append(index)
        all_patterns[protein] = posicion_patterns

    # Invert to letter -> {sequence: positions}.
    pattern_proteins = {}
    for protein, patterns in all_patterns.items():
        for pattern, positions in patterns.items():
            pattern_proteins.setdefault(pattern, {})[protein] = list(positions)

    for pattern, proteins in pattern_proteins.items():
        if len(proteins) >= min_ocurrence:
            pattern_freqMin[pattern] = proteins

    df = pd.DataFrame(pattern_freqMin.items(), columns=['pattern', 'proteins'])
    df.to_csv('prueba2.csv', index=False)
    return pattern_freqMin, posicion_patterns, longitud_max
def buscar_patrones_simAA(sequences, min_ocurr):
    """Grow frequent patterns letter by letter, merging equivalent letters.

    Works like the identical-pattern search, except that when the letter
    extending a pattern belongs to an amino-acid group, a candidate ending in
    any equivalent letter absorbs the occurrence instead of a new candidate
    being opened.  A candidate is kept when found in at least ``min_ocurr``
    distinct sequences.  Accepting a pattern longer than 2 removes its prefix
    and suffix of length ``len - 1``.  The search stops at the first length
    with no candidate.

    This is the post-commit version of the function; the pre-commit lines
    interleaved with it by the diff view are dropped.  The length >= 4 filter
    is disabled in this version.

    Args:
        sequences: iterable of sequence strings.
        min_ocurr: minimum number of distinct sequences for a pattern.

    Returns:
        ``(pattern_freqMin, num_patrones)``:
        ``{pattern: {sequence: [start positions]}}`` and its size.
    """
    min_ocurrence = min_ocurr
    pattern_freqMin = {}
    pattern_freqMin, _, longitud_max = guardar_patrones_len1(sequences, pattern_freqMin, min_ocurrence)
    classes, cla = read_aminoacidos()
    if not pattern_freqMin:
        return pattern_freqMin, 0

    for pattern_length in range(2, longitud_max + 1):
        # Candidates of the current length: pattern -> {sequence: starts}.
        aux_pos = {}
        for pattern, proteins in pattern_freqMin.items():
            if len(pattern) != pattern_length - 1:
                continue
            for prot, positions in proteins.items():
                for position in positions:
                    pos_ultima_letra = position + pattern_length - 1
                    if pos_ultima_letra > len(prot) - 1:
                        continue
                    last_letter = prot[pos_ultima_letra]
                    sub_seq = pattern + last_letter
                    if last_letter not in classes:
                        if sub_seq in pattern_freqMin:
                            continue
                    else:
                        overst_set = set().union(
                            *[set(cla[eqv_letter]) for eqv_letter in classes[last_letter]])
                        absorbido = False
                        # sorted(): the first matching equivalent wins, and
                        # set order of strings varies between runs.
                        for eqv_letter in sorted(overst_set):
                            variante = pattern + eqv_letter
                            if variante in pattern_freqMin:
                                absorbido = True
                                break
                            if variante in aux_pos:
                                # An equivalent candidate already exists:
                                # record the occurrence under it.
                                aux_pos[variante].setdefault(prot, []).append(position)
                                absorbido = True
                                break
                        if absorbido:
                            continue
                    # Viable only when the added letter is itself frequent.
                    if (last_letter in pattern_freqMin
                            and pos_ultima_letra in pattern_freqMin[last_letter][prot]):
                        aux_pos.setdefault(sub_seq, {}).setdefault(prot, []).append(position)
        print(pattern_length)

        # Keep only the candidates found in enough distinct sequences.
        aux_pos = {p: prots for p, prots in aux_pos.items()
                   if len(prots) >= min_ocurrence}
        # No pattern of this length: longer ones cannot exist either.
        if not aux_pos:
            break

        for pattern, proteins in aux_pos.items():
            for prot, pos in proteins.items():
                # The original filtered existing positions with
                # `pos - len(pattern)` where `pos` is a list (TypeError if
                # ever evaluated); the target list is always empty here, so
                # the positions were always appended.
                pattern_freqMin.setdefault(pattern, {}).setdefault(prot, []).extend(pos)
            if len(pattern) > 2:
                # Drop the shorter patterns subsumed by this one.
                pattern_freqMin.pop(pattern[:-1], None)
                pattern_freqMin.pop(pattern[1:], None)

    num_patrones = len(pattern_freqMin)
    return pattern_freqMin, num_patrones
def buscar_patrones_identicos(sequences,min_ocurr):
pattern_freqMin = {}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment