"""Search protein sequences for patterns shared by a minimum number of proteins.

Amino-acid equivalence classes are read from ``aminoacidos.txt`` (one class
per line: class name followed by its member letters, tab-separated).
"""

import ast
import csv
import json
import math
import os
import re
import time
from collections import defaultdict

import pandas as pd

from interfazGrafica import interfaz
from descarteProteinas import ejecutar, substitute_or_remove_prot_id, remplazar_ID_for_sequence
import metricas
from graficas import grafica

# Letter -> list of class names; refreshed by readData and
# remplazar_sequence_for_ID.
classes = {}
min_ocurrence = 0


def swap_dict(d):
    """Invert a ``key -> list of values`` mapping into ``value -> list of keys``.

    Keys are appended in the iteration order of ``d``; a key is appended once
    for every time the value appears in its list.
    """
    new_dict = {}
    for key, values in d.items():
        for value in values:
            new_dict.setdefault(value, []).append(key)
    return new_dict


def readData(archivoEntrada, enfermedad, archivoTarget):
    """Load the input spreadsheet and return the protein sequences to analyse.

    Proteins listed in ``resultados/proteinasDescartadas2.csv`` are dropped,
    and when ``enfermedad`` is not the empty string only rows whose
    ``disease_id`` equals it are kept (the original comment gives
    ``"C0002395"`` as an example identifier).

    Side effect: refreshes the module-level ``classes`` mapping from
    ``aminoacidos.txt``. Every class line is loaded, consistently with the
    other loaders in this module (previously only the first line was read).

    Args:
        archivoEntrada: Path of the Excel file; it must provide the columns
            ``protein_id``, ``disease_id`` and ``protein_sequence``.
        enfermedad: Disease identifier to filter on, or ``''`` for no filter.
        archivoTarget: Currently unused; filtering of target proteins is
            disabled in this version.

    Returns:
        Tuple ``(sequences, num_filas)``: the ``protein_sequence`` column and
        its number of rows.
    """
    global classes

    data = pd.read_excel(archivoEntrada)
    dataC = pd.read_csv("resultados/proteinasDescartadas2.csv")

    # Discard the proteins listed in the discard file.
    data = data[~data['protein_id'].isin(dataC['ProteinasDescartadas'])]
    print("Se ha realizado el descarte de proteínas")

    classes, _ = read_aminoacidos()

    if enfermedad != '':
        data = data.loc[data["disease_id"] == enfermedad]

    sequences = data["protein_sequence"]
    print(sequences)
    num_filas = sequences.shape[0]
    return sequences, num_filas


def read_aminoacidos():
    """Parse ``aminoacidos.txt`` into the two class lookup tables.

    Each line is split on tabs: the first field is the class name and the
    remaining fields are its member letters.

    Returns:
        Tuple ``(classes, cla)`` where ``cla`` maps class name -> list of
        letters and ``classes`` is its inverse (letter -> list of class
        names), as built by ``swap_dict``.
    """
    cla = {}
    with open('aminoacidos.txt', 'r') as op:
        for line in op:
            oo = line.replace('\n', '').split('\t')
            key = oo.pop(0)
            cla[key] = oo
    return swap_dict(cla), cla


def _equivalence_table(classes, cla):
    """Map each classified letter to the sorted letters sharing a class with it.

    Sorting makes every later iteration deterministic (the raw union is a
    ``set`` whose order depends on string hashing).
    """
    return {
        letter: sorted(set().union(*(cla[name] for name in names)))
        for letter, names in classes.items()
    }


def _add_candidate(candidates, sub_seq, prot, position):
    """Record that ``sub_seq`` starts at ``position`` in ``prot``."""
    candidates.setdefault(sub_seq, {}).setdefault(prot, []).append(position)


def _letter_at(pattern_freqMin, letter, prot, position):
    """Tell whether the frequent single-letter pattern ``letter`` covers ``position`` in ``prot``."""
    occurrences = pattern_freqMin.get(letter)
    return occurrences is not None and position in occurrences.get(prot, ())


def _collect_candidates(pattern_freqMin, pattern_length, equivalents, fold_equivalents):
    """Extend every pattern of length ``pattern_length - 1`` by one letter.

    Args:
        pattern_freqMin: Current ``pattern -> {protein: [start positions]}``.
        pattern_length: Length of the candidates to build.
        equivalents: Output of ``_equivalence_table``.
        fold_equivalents: When true, an occurrence whose last letter is
            classified is attached to an already-collected candidate that ends
            in an equivalent letter, if there is one; otherwise a candidate
            ending in the actual letter is created. When false, one candidate
            is produced for every equivalent letter.

    Returns:
        ``candidate -> {protein: [start positions]}`` in insertion order.
    """
    candidates = {}
    for pattern, proteins in pattern_freqMin.items():
        if len(pattern) != pattern_length - 1:
            continue
        for prot, positions in proteins.items():
            for position in positions:
                pos_last = position + pattern_length - 1
                if pos_last >= len(prot):
                    # The pattern sits at the very end of the protein.
                    continue
                last_letter = prot[pos_last]
                eqv_letters = equivalents.get(last_letter)

                if eqv_letters is None or not fold_equivalents:
                    tails = (last_letter,) if eqv_letters is None else eqv_letters
                    for letter in tails:
                        sub_seq = pattern + letter
                        if sub_seq in pattern_freqMin:
                            continue
                        if _letter_at(pattern_freqMin, letter, prot, pos_last):
                            _add_candidate(candidates, sub_seq, prot, position)
                    continue

                folded = False
                for letter in eqv_letters:
                    alias = pattern + letter
                    if alias in pattern_freqMin:
                        folded = True
                        break
                    if alias in candidates:
                        _add_candidate(candidates, alias, prot, position)
                        folded = True
                        break
                if not folded and _letter_at(pattern_freqMin, last_letter, prot, pos_last):
                    _add_candidate(candidates, pattern + last_letter, prot, position)
    return candidates


def _merge_frequent_candidates(pattern_freqMin, candidates, min_ocurrence):
    """Merge the candidates found in enough proteins into ``pattern_freqMin``.

    For merged patterns longer than two letters, their prefix and suffix of
    length ``len - 1`` are removed from ``pattern_freqMin``. Single letters
    are never removed; they are needed for later position lookups.

    Returns:
        ``False`` when no candidate reaches ``min_ocurrence`` proteins (the
        search can stop), ``True`` otherwise.
    """
    frequent = {
        pattern: proteins
        for pattern, proteins in candidates.items()
        if len(proteins) >= min_ocurrence
    }
    if not frequent:
        return False
    for pattern, proteins in frequent.items():
        merged = pattern_freqMin.setdefault(pattern, {})
        for prot, positions in proteins.items():
            merged.setdefault(prot, []).extend(positions)
        if len(pattern) > 2:
            pattern_freqMin.pop(pattern[:-1], None)
            pattern_freqMin.pop(pattern[1:], None)
    return True


def _buscar_patrones_frecuentes(sequences, min_ocurrence, fold_equivalents, *, progress=False):
    """Shared driver of both pattern searches; see the public wrappers."""
    pattern_freqMin, _, longitud_max = guardar_patrones_len1(sequences, {}, min_ocurrence)
    if not pattern_freqMin:
        return pattern_freqMin, 0

    classes, cla = read_aminoacidos()
    equivalents = _equivalence_table(classes, cla)

    for pattern_length in range(2, longitud_max + 1):
        candidates = _collect_candidates(
            pattern_freqMin, pattern_length, equivalents, fold_equivalents
        )
        if progress:
            print(pattern_length)
        # No frequent pattern of this length means no longer one can exist.
        if not _merge_frequent_candidates(pattern_freqMin, candidates, min_ocurrence):
            break

    return pattern_freqMin, len(pattern_freqMin)


def guardar_patrones_len1(sequences, pattern_freqMin, min_ocurrence):
    """Index every single-letter pattern and keep the frequent ones.

    For each protein, every position is recorded under its own letter and
    under every letter that shares a class with it. Letters present in at
    least ``min_ocurrence`` proteins are stored into ``pattern_freqMin``
    (mutated in place). The result is also dumped to ``prueba2.csv``.

    Args:
        sequences: Iterable of protein sequences (strings).
        pattern_freqMin: Dict receiving ``letter -> {protein: [positions]}``.
        min_ocurrence: Minimum number of proteins a letter must appear in.

    Returns:
        Tuple ``(pattern_freqMin, posicion_patterns, longitud_max)`` where
        ``posicion_patterns`` is the position index of the last protein
        processed (empty when ``sequences`` is empty) and ``longitud_max`` is
        the length of the longest sequence.
    """
    classes, cla = read_aminoacidos()
    equivalents = _equivalence_table(classes, cla)

    longitud_max = 0
    # Pre-bound so that an empty input no longer raises UnboundLocalError.
    posicion_patterns = defaultdict(list)
    all_patterns = {}
    for protein in sequences:
        longitud_max = max(longitud_max, len(protein))
        posicion_patterns = defaultdict(list)
        for index, letter in enumerate(protein):
            posicion_patterns[letter].append(index)
            for eqv_letter in equivalents.get(letter, ()):
                if eqv_letter != letter:
                    posicion_patterns[eqv_letter].append(index)
        all_patterns[protein] = posicion_patterns

    pattern_proteins = defaultdict(dict)
    for protein, patterns in all_patterns.items():
        for pattern, positions in patterns.items():
            pattern_proteins[pattern][protein] = list(positions)

    for pattern, proteins in pattern_proteins.items():
        if len(proteins) >= min_ocurrence:
            pattern_freqMin[pattern] = proteins

    df = pd.DataFrame(list(pattern_freqMin.items()), columns=['pattern', 'proteins'])
    df.to_csv('prueba2.csv', index=False)
    return pattern_freqMin, posicion_patterns, longitud_max


def buscar_patrones_simAA(sequences, min_ocurr):
    """Find frequent patterns, folding equivalent amino acids together.

    Patterns are grown one letter at a time. An occurrence whose next letter
    is classified is attached to an existing candidate ending in an
    equivalent letter when there is one, so equivalent variants share a
    single pattern entry.

    Args:
        sequences: Iterable of protein sequences (strings).
        min_ocurr: Minimum number of proteins a pattern must appear in.

    Returns:
        Tuple ``(pattern_freqMin, num_patrones)``: the mapping
        ``pattern -> {protein: [start positions]}`` and its number of entries.
    """
    return _buscar_patrones_frecuentes(sequences, min_ocurr, fold_equivalents=True)


def buscar_patrones_identicos(sequences, min_ocurr):
    """Find frequent patterns, generating one candidate per equivalent letter.

    Same growth procedure as ``buscar_patrones_simAA`` but without folding:
    when the next letter is classified, a separate candidate is produced for
    each letter sharing a class with it. The current pattern length is
    printed on every iteration.

    Args:
        sequences: Iterable of protein sequences (strings).
        min_ocurr: Minimum number of proteins a pattern must appear in.

    Returns:
        Tuple ``(pattern_freqMin, num_patrones)``: the mapping
        ``pattern -> {protein: [start positions]}`` and its number of entries.
    """
    return _buscar_patrones_frecuentes(
        sequences, min_ocurr, fold_equivalents=False, progress=True
    )


def _similarity_score(pattern, fragment, classes):
    """Score how closely ``fragment`` matches ``pattern`` letter by letter.

    An identical letter scores 1. A different letter scores 0.9 times the
    fraction of the pattern letter's classes that the fragment letter also
    belongs to; letters without any class score 0.
    """
    score = 0
    for expected, actual in zip(pattern, fragment):
        if expected == actual:
            score += 1
            continue
        own_classes = classes.get(expected)
        if own_classes:
            shared = set(own_classes) & set(classes.get(actual, ()))
            score += 0.9 * len(shared) / len(own_classes)
    return score


def remplazar_sequence_for_ID(pattern_freqMin):
    """Write the found patterns to CSV, replacing sequences by protein IDs.

    For every pattern longer than one letter and every protein it occurs in,
    each occurrence is reported as ``[position, matched fragment, score]``
    (see ``_similarity_score``). Rows are sorted by decreasing pattern length
    and then alphabetically. Sequences found in
    ``data_nervous_genes_xf.xlsx`` are replaced by their ``protein_id``. The
    result is saved to ``resultados/patronesSimilaresAA.csv``.

    Side effect: refreshes the module-level ``classes`` mapping.

    Args:
        pattern_freqMin: Mapping ``pattern -> {protein: [start positions]}``.
    """
    global classes

    df_b = pd.read_excel("data_nervous_genes_xf.xlsx")
    classes, _ = read_aminoacidos()

    output = []
    for key, value in pattern_freqMin.items():
        if len(key) == 1:
            # Single-letter patterns are not reported.
            continue
        for proteina, posiciones in value.items():
            posiciones_sim = []
            for y in posiciones:
                fragment = proteina[y:y + len(key)]
                posiciones_sim.append([y, fragment, _similarity_score(key, fragment, classes)])
            output.append([key, proteina, posiciones_sim])

    # Longest patterns first; ties are sorted alphabetically.
    output_ordered = sorted(output, key=lambda x: (-len(x[0]), x[0]))

    proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
    for item in output_ordered:
        protein_sequence = item[1]
        if protein_sequence in proteinas_dict:
            item[1] = proteinas_dict[protein_sequence]

    df_a = pd.DataFrame(output_ordered, columns=['Patron', 'Proteina', 'Posiciones'])
    # Save the updated DataFrame to a CSV file.
    df_a.to_csv('resultados/patronesSimilaresAA.csv', index=False)
    print("Se ha generado el .csv con los patrones idénticos encontrados")