import ast
import csv
import json
import math
import os
import re
import time
from collections import defaultdict

import pandas as pd

from interfazGrafica import interfaz
from descarteProteinas import ejecutar, substitute_or_remove_prot_id, remplazar_ID_for_sequence
import metricas
from graficas import grafica

# Module-level state kept for backward compatibility: `classes` is refreshed
# by remplazar_sequence_for_ID every time it runs.
classes = {}
min_ocurrence = 0


def swap_dict(d):
    """
    Swaps keys and values in a dictionary of lists.

    Parameters:
    - d: dict, maps each key to an iterable of values.

    Returns:
    - new_dict: dict, maps every value to the list of keys it appeared under
      (in the iteration order of `d`).
    """
    new_dict = {}
    for key, values in d.items():
        for value in values:
            new_dict.setdefault(value, []).append(key)
    return new_dict


def read_aminoacidos(archivoAA):
    """
    Reads a tab-separated amino acid class file.

    The first field of every line is used as the class key and the remaining
    fields as the members of that class. Blank lines and empty fields (for
    example those produced by a trailing tab) are ignored, because an empty
    member would later behave like an "empty letter" and corrupt the pattern
    extension step.

    Parameters:
    - archivoAA: str, path to the amino acid information file.

    Returns:
    - classes: dict, member -> list of class keys it belongs to.
    - cla: dict, class key -> list of members.
    """
    cla = {}
    with open(archivoAA, 'r') as op:
        for line in op:
            fields = line.replace('\n', '').split('\t')
            if not any(fields):
                continue
            key = fields.pop(0)
            cla[key] = [field for field in fields if field]
    return swap_dict(cla), cla


def _equivalence_table(classes, cla):
    """
    Builds, for every letter that belongs to at least one class, the sorted
    list of all letters sharing a class with it (the letter itself included).

    Sorting makes every later iteration deterministic; iterating the raw set
    would depend on string hash randomisation.
    """
    return {
        letter: sorted(set().union(*(cla[group] for group in groups)))
        for letter, groups in classes.items()
    }


def guardar_patrones_len1(sequences, pattern_freqMin, min_ocurrence, archivoAA):
    """
    Finds every pattern of length 1 and its positions in each sequence, keeps
    the ones present in at least `min_ocurrence` sequences and dumps them to
    'prueba2.csv' in the current working directory.

    A position is recorded both under the letter actually found there and
    under every letter that shares an amino acid class with it.

    Parameters:
    - sequences: iterable of str (e.g. pandas Series), protein sequences.
    - pattern_freqMin: dict, updated in place with pattern -> {sequence: [positions]}.
    - min_ocurrence: number, minimum count of sequences that must contain a pattern.
    - archivoAA: str, path to the amino acid information file.

    Returns:
    - pattern_freqMin: dict, the updated dictionary of patterns.
    - posicion_patterns: dict, letter -> positions for the LAST sequence
      processed (empty when `sequences` is empty).
    - longitud_max: int, length of the longest sequence (0 when empty).
    """
    classes, cla = read_aminoacidos(archivoAA)
    equivalents = _equivalence_table(classes, cla)

    longitud_max = 0
    # Pre-bound so that an empty `sequences` does not raise UnboundLocalError
    # at the return statement.
    posicion_patterns = defaultdict(list)
    all_patterns = {}
    for protein in sequences:
        longitud_max = max(longitud_max, len(protein))
        posicion_patterns = defaultdict(list)
        for index, letter in enumerate(protein):
            posicion_patterns[letter].append(index)
            for eqv_letter in equivalents.get(letter, ()):
                if eqv_letter != letter:
                    posicion_patterns[eqv_letter].append(index)
        all_patterns[protein] = posicion_patterns

    # Regroup from sequence -> letter -> positions to letter -> sequence -> positions.
    pattern_proteins = {}
    for protein, patterns in all_patterns.items():
        for pattern, positions in patterns.items():
            pattern_proteins.setdefault(pattern, {}).setdefault(protein, []).extend(positions)

    for pattern, proteins in pattern_proteins.items():
        if len(proteins) >= min_ocurrence:
            pattern_freqMin[pattern] = proteins

    df = pd.DataFrame(list(pattern_freqMin.items()), columns=['pattern', 'proteins'])
    df.to_csv('prueba2.csv', index=False)
    return pattern_freqMin, posicion_patterns, longitud_max


def _letter_occurs_at(pattern_freqMin, letter, prot, index):
    """Tells whether the frequent single-letter pattern `letter` is recorded at `index` of `prot`."""
    return index in pattern_freqMin.get(letter, {}).get(prot, ())


def _add_candidate(candidates, sub_seq, prot, position):
    """Records that `sub_seq` starts at `position` in `prot`."""
    candidates.setdefault(sub_seq, {}).setdefault(prot, []).append(position)


def _extend_similar(pattern_freqMin, candidates, equivalents, pattern, prot, position, pos_last_letter):
    """
    Extends `pattern` by one letter, folding equivalent spellings together.

    When the next letter belongs to a class and an equivalent spelling of the
    extended pattern is already known, the occurrence is credited to that
    spelling instead of creating a new pattern.
    """
    last_letter = prot[pos_last_letter]
    sub_seq = pattern + last_letter
    if last_letter in equivalents:
        for eqv_letter in equivalents[last_letter]:
            rival = pattern + eqv_letter
            if rival in pattern_freqMin:
                return
            if rival in candidates:
                candidates[rival].setdefault(prot, []).append(position)
                return
    elif sub_seq in pattern_freqMin:
        return
    if _letter_occurs_at(pattern_freqMin, last_letter, prot, pos_last_letter):
        _add_candidate(candidates, sub_seq, prot, position)


def _extend_identical(pattern_freqMin, candidates, equivalents, pattern, prot, position, pos_last_letter):
    """
    Extends `pattern` by one letter, registering one candidate for the letter
    found in the sequence and one for each letter sharing a class with it.
    """
    last_letter = prot[pos_last_letter]
    for letter in equivalents.get(last_letter, (last_letter,)):
        sub_seq = pattern + letter
        if sub_seq in pattern_freqMin:
            continue
        if _letter_occurs_at(pattern_freqMin, sub_seq[-1], prot, pos_last_letter):
            _add_candidate(candidates, sub_seq, prot, position)


def _collect_candidates(pattern_freqMin, pattern_length, equivalents, merge_equivalent):
    """
    Returns candidate patterns of `pattern_length`, obtained by extending
    every known pattern of length `pattern_length - 1` by one letter.
    """
    extend = _extend_similar if merge_equivalent else _extend_identical
    candidates = {}
    for pattern, proteins in pattern_freqMin.items():
        if len(pattern) != pattern_length - 1:
            continue
        for prot, positions in proteins.items():
            for position in positions:
                pos_last_letter = position + pattern_length - 1
                if pos_last_letter >= len(prot):
                    continue  # the extension would run past the end of the sequence
                extend(pattern_freqMin, candidates, equivalents, pattern, prot, position, pos_last_letter)
    return candidates


def _promote_candidates(pattern_freqMin, candidates, min_ocurrence):
    """
    Moves sufficiently frequent candidates into `pattern_freqMin` and drops
    the shorter patterns they subsume. Returns False when nothing survived.
    """
    survivors = {
        pattern: proteins
        for pattern, proteins in candidates.items()
        if len(proteins) >= min_ocurrence
    }
    if not survivors:
        return False
    for pattern, proteins in survivors.items():
        for prot, positions in proteins.items():
            pattern_freqMin.setdefault(pattern, {}).setdefault(prot, []).extend(positions)
        # Single-letter patterns are never removed: they are still needed to
        # validate the last letter of longer candidates.
        if len(pattern) > 2:
            pattern_freqMin.pop(pattern[:-1], None)
            pattern_freqMin.pop(pattern[1:], None)
    return True


def _grow_patterns(pattern_freqMin, longitud_max, min_ocurrence, archivoAA, merge_equivalent):
    """
    Grows the length-1 patterns in `pattern_freqMin` one letter at a time
    until no pattern of the next length is frequent enough.
    """
    classes, cla = read_aminoacidos(archivoAA)
    equivalents = _equivalence_table(classes, cla)
    for pattern_length in range(2, longitud_max + 1):
        candidates = _collect_candidates(pattern_freqMin, pattern_length, equivalents, merge_equivalent)
        # No pattern of this length means no longer pattern can exist either.
        if not _promote_candidates(pattern_freqMin, candidates, min_ocurrence):
            break
    return pattern_freqMin


def buscar_patrones_simAA(sequences, min_ocurr, archivoAA):
    """
    Searches for patterns shared by the sequences, treating amino acids of the
    same class as interchangeable and keeping a single spelling for each group
    of equivalent patterns.

    Parameters:
    - sequences: iterable of str (e.g. pandas Series), protein sequences.
    - min_ocurr: number, minimum count of sequences that must contain a pattern.
    - archivoAA: str, path to the amino acid information file.

    Returns:
    - pattern_freqMin: dict, pattern -> {sequence: [start positions]}.
    - num_patrones: int, number of patterns in `pattern_freqMin`.
    """
    pattern_freqMin, _, longitud_max = guardar_patrones_len1(sequences, {}, min_ocurr, archivoAA)
    if not pattern_freqMin:
        return pattern_freqMin, 0
    _grow_patterns(pattern_freqMin, longitud_max, min_ocurr, archivoAA, merge_equivalent=True)
    return pattern_freqMin, len(pattern_freqMin)


def buscar_patrones_identicos(sequences, min_ocurr, archivoAA):
    """
    Searches for patterns shared by the sequences, registering every
    equivalent spelling of a pattern as a pattern of its own.

    Parameters:
    - sequences: iterable of str (e.g. pandas Series), protein sequences.
    - min_ocurr: number, minimum count of sequences that must contain a pattern.
    - archivoAA: str, path to the amino acid information file.

    Returns:
    - pattern_freqMin: dict, pattern -> {sequence: [start positions]}.
    - num_patrones: int, number of patterns in `pattern_freqMin`.
    """
    # The minimum occurrence must be forwarded; omitting it made this call
    # raise TypeError (missing positional argument).
    pattern_freqMin, _, longitud_max = guardar_patrones_len1(sequences, {}, min_ocurr, archivoAA)
    if not pattern_freqMin:
        return pattern_freqMin, 0
    _grow_patterns(pattern_freqMin, longitud_max, min_ocurr, archivoAA, merge_equivalent=False)
    return pattern_freqMin, len(pattern_freqMin)


def _similarity_score(pattern, fragment, classes):
    """
    Scores how closely `fragment` matches `pattern`: 1 per identical letter,
    and for differing letters 0.9 times the fraction of the pattern letter's
    classes that the fragment letter also belongs to. Letters that belong to
    no class contribute 0 when they differ.
    """
    score = 0
    for expected, actual in zip(pattern, fragment):
        if expected == actual:
            score += 1
            continue
        expected_classes = classes.get(expected, ())
        if expected_classes:
            shared = set(expected_classes) & set(classes.get(actual, ()))
            score += 0.9 * len(shared) / len(expected_classes)
    return score


def remplazar_sequence_for_ID(pattern_freqMin, archivoEntrada, ArchivoAA, ocurrencia, sal):
    """
    Scores every occurrence of every pattern longer than one letter, replaces
    the protein sequences by their IDs and saves the result to
    'resultados/patronesSimilaresAA<NN><sal>.csv'.

    Side effects: rebinds the module-level `classes`, creates the
    'resultados' directory if needed, writes the CSV and prints a message.

    Parameters:
    - pattern_freqMin: dict, pattern -> {sequence: [start positions]}.
    - archivoEntrada: str, path to the input Excel file; it must provide the
      columns 'protein_sequence' and 'protein_id'.
    - ArchivoAA: str, path to the amino acid information file.
    - ocurrencia: number or numeric str, its fractional part (as a percentage)
      becomes part of the output file name.
    - sal: str, suffix appended to the output file name.
    """
    global classes
    df_b = pd.read_excel(archivoEntrada)
    classes, _ = read_aminoacidos(ArchivoAA)

    output = []
    for key, value in pattern_freqMin.items():
        if len(key) == 1:
            continue  # single-letter patterns are not reported
        for proteina, posiciones in value.items():
            posiciones_sim = []
            for y in posiciones:
                fragment = proteina[y:y + len(key)]
                posiciones_sim.append([y, fragment, _similarity_score(key, fragment, classes)])
            output.append([key, proteina, posiciones_sim])

    # Longest patterns first; patterns of equal length in alphabetical order.
    output_ordered = sorted(output, key=lambda x: (-len(x[0]), x[0]))

    proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
    for item in output_ordered:
        protein_sequence = item[1]
        if protein_sequence in proteinas_dict:
            item[1] = proteinas_dict[protein_sequence]

    df_a = pd.DataFrame(output_ordered, columns=['Patron', 'Proteina', 'Posiciones'])

    # to_csv does not create missing directories.
    os.makedirs('resultados', exist_ok=True)
    # NOTE(review): int() truncates, so binary float error can shift the
    # number (e.g. 0.29 -> 28). Left unchanged because other modules may
    # build the same file name with the same formula -- confirm before fixing.
    df_a.to_csv('resultados/patronesSimilaresAA'+str(int((float(ocurrencia)%1)*100))+sal+'.csv', index=False)
    print("Se ha generado el .csv con los patrones idénticos encontrados")