import pandas as pd
import time
import ast
import csv
import math
from interfazGrafica import interfaz
from descarteProteinas import ejecutar, remplazar_ID_for_sequence
from generate_tha_excel import substitute_or_remove_prot_id
import metricas
from graficas import grafica
import os
import json
import re
from patrones_similares_aa import remplazar_sequence_for_ID as remplazar_s
from patrones_similares_aa import buscar_patrones_simAA
from collections import defaultdict
from pathlib import Path


def substitute_or_remove_prot_id2(data, sub_rem):
    print("inside the problem")
    # "nombres_sust.txt" holds whitespace-separated columns of protein IDs
    # (primary and substitute names), one row per line after the header.
    with open("nombres_sust.txt") as prottosubs:
        index = prottosubs.readline()
        acept = index.split()
        listtosubs = {}
        for i in range(0, len(acept)):
            listtosubs[acept[i]] = []
        while line := prottosubs.readline():
            newline = line.split()
            # print(len(newline))
            for i in range(0, len(newline)):
                listtosubs[list(listtosubs.keys())[i]].append(newline[i].strip())
    resub = 1
    if re.search("Primary", list(listtosubs.keys())[0]):
        resub = 0
    print((resub + 1) % 2)
    # print(data)
    # data2 = data.copy()
    if sub_rem == "s":
        # Substitute: map the secondary IDs onto the primary ones
        data["Proteina"] = data["Proteina"].replace(
            list(listtosubs.values())[(resub + 1) % 2],
            list(listtosubs.values())[resub])
        # datacp = data.copy()
        # print(pd.concat([data2, datacp]).drop_duplicates())
    else:
        # Remove: drop the rows whose protein ID appears in the substitution list
        global globi  # module-level counter expected to be initialized elsewhere
        datas = data[data["Proteina"].isin(list(listtosubs.values())[(resub + 1) % 2])]
        data = data[~data["Proteina"].isin(list(listtosubs.values())[(resub + 1) % 2])]
        # datas.to_csv('resultados/proteinasDescartadas_' + str(globi) + '.csv', index=False)
        globi = globi + 1
    return data


def readData(archivoEntrada, enfermedad, archivoTarget):
    """
    Reads data from an Excel file, filters it based on disease (if specified),
    and returns protein sequences along with the number of rows.

    Parameters:
    - archivoEntrada: str, path to the input Excel file.
    - enfermedad: str, disease ID for filtering (empty string for no filtering).
    - archivoTarget: str, path to the target Excel file (not currently in use).

    Returns:
    - sequences: pandas Series, protein sequences column.
    - num_filas: int, number of rows in the filtered data.
    """
    data = pd.read_excel(archivoEntrada)
    dataC = pd.read_csv("resultados/proteinasDescartadas2.csv")
    # data = substitute_or_remove_prot_id(data, "r")
    # dataC = substitute_or_remove_prot_id(dataC, "r")

    # Discard the proteins listed in the discard file
    data = data[~data['protein_id'].isin(dataC['ProteinasDescartadas'])]
    print("Se ha realizado el descarte de proteínas")

    # e.g. "C0002395"
    if enfermedad != '':
        data = data.loc[data["disease_id"] == enfermedad]
        # dataB = pd.read_excel("proteinas_en_comun_Alzheimer.xlsx")
        # print("Se han seleccionado las proteínas de la enfermedad elegida")
        # dataB = substitute_or_remove_prot_id(dataB, "r")
        # if archivoTarget != '':
        #     dataB = substitute_or_remove_prot_id(dataB, "r")
        #     # Remove the target proteins
        #     data = data[~((data["disease_id"] == enfermedad) &
        #                   (data["protein_id"].isin(dataB["protein_id"])))]
        #     print("Se han descartado las proteínas del archivo target")

    sequences = data["protein_sequence"]
    print(sequences)
    num_filas = sequences.shape[0]
    return sequences, num_filas
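# Usage sketch for readData (hedged example: the input file name is hypothetical,
# while the columns 'protein_id', 'disease_id', 'protein_sequence' and the disease
# id "C0002395" come from this module; 'resultados/proteinasDescartadas2.csv' must
# already exist):
#
#     sequences, num_filas = readData("proteinas.xlsx", "C0002395", "")
#     print(num_filas, "sequences selected")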
def guardar_patrones_len1(sequences, pattern_freqMin):
    """
    Processes protein sequences to find patterns of length 1 and their positions,
    filters patterns based on minimum occurrence, and saves results to a CSV file.

    Parameters:
    - sequences: pandas Series, protein sequences.
    - pattern_freqMin: dict, dictionary to store patterns and their occurrences.

    Returns:
    - pattern_freqMin: dict, updated dictionary of patterns.
    - posicionPatterns: dict, positions of each character in the sequences.
    - longitud_max: int, maximum length of protein sequences.
    """
    all_patterns = dict()
    longitud_max = 0
    # Each pattern is associated with the proteins it appears in
    pattern_proteins = {}
    for protein in sequences:
        longitud = len(protein)
        if longitud > longitud_max:
            longitud_max = longitud
        all_patterns[protein] = []
        # On each iteration, store the patterns that appear in the sequence
        # together with their positions, keyed by the protein
        posicionPatterns = dict()
        for index, letter in enumerate(protein):
            posicionPatterns[letter] = posicionPatterns.get(letter, []) + [index]
        all_patterns[protein] = posicionPatterns

    for protein, patterns in all_patterns.items():
        for pattern, positions in patterns.items():
            if pattern not in pattern_proteins:
                pattern_proteins[pattern] = {}
            if protein not in pattern_proteins[pattern]:
                pattern_proteins[pattern][protein] = []
            pattern_proteins[pattern][protein].extend(positions)

    # Keep only the patterns present in at least `min_ocurrence` proteins
    # (`min_ocurrence` is a module-level threshold expected to be defined elsewhere)
    for pattern, proteins in pattern_proteins.items():
        if len(proteins) >= min_ocurrence:
            pattern_freqMin[pattern] = proteins

    df = pd.DataFrame(pattern_freqMin.items(), columns=['pattern', 'proteins'])
    df.to_csv('prueba2.csv', index=False)
    return pattern_freqMin, posicionPatterns, longitud_max
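# Minimal sketch of the length-1 step in isolation (assumptions: `min_ocurrence`
# has been set at module level and the two sequences are made-up examples):
#
#     min_ocurrence = 2
#     seqs = pd.Series(["MKVA", "MKA"])
#     freq, posiciones, longitud_max = guardar_patrones_len1(seqs, {})
#     # freq keeps 'M', 'K' and 'A' (each value maps protein sequence -> positions);
#     # 'V' is dropped because it appears in only one protein, and longitud_max == 4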
def buscar_patrones_identicos(sequences):
    """
    Searches for identical patterns of different lengths in protein sequences and
    stores them along with their positions in a dictionary.

    Parameters:
    - sequences: pandas Series, protein sequences.

    Returns:
    - pattern_freqMin: dict, dictionary of patterns and their positions.
    - num_patrones: int, number of unique patterns found.
    """
    pattern_freqMin = {}
    pattern_freqMin, posicionPatterns, longitud_max = guardar_patrones_len1(sequences, pattern_freqMin)
    if bool(pattern_freqMin):
        for pattern_length in range(2, longitud_max + 1):
            # Candidate patterns of the current length and their positions per protein
            auxPos = {}
            sub_seqs = []
            for pattern, proteins in pattern_freqMin.items():
                if len(pattern) == pattern_length - 1:
                    for prot, positions in proteins.items():
                        protein_len = len(prot)
                        if protein_len < pattern_length - 1:
                            continue
                        for position in positions:
                            if protein_len < position + pattern_length:
                                continue
                            sub_seq = prot[position:position + pattern_length]
                            if sub_seq in pattern_freqMin:
                                continue
                            # If the last letter (the one newly added to the pattern)
                            # already meets the minimum frequency, the extended pattern
                            # is a candidate
                            ultima_letra = sub_seq[-1]
                            pos_ultima_letra = position + pattern_length - 1
                            if ultima_letra in pattern_freqMin and pos_ultima_letra in pattern_freqMin[ultima_letra][prot]:
                                if sub_seq not in auxPos:
                                    auxPos[sub_seq] = {}
                                if prot not in auxPos[sub_seq]:
                                    auxPos[sub_seq][prot] = []
                                auxPos[sub_seq][prot].append(position)
                                if sub_seq not in sub_seqs:
                                    sub_seqs.append(sub_seq)
            print(pattern_length)
            sub_seqs_copy = sub_seqs.copy()
            for p in sub_seqs_copy:
                if len(auxPos[p]) < min_ocurrence:
                    del auxPos[p]
                    sub_seqs.remove(p)
            # If no pattern of length pattern_length was found, exit the loop:
            # there are no more patterns left to find
            if not bool(auxPos):
                break
            for pattern, proteins in auxPos.items():
                for prot, pos in proteins.items():
                    if pattern not in pattern_freqMin:
                        pattern_freqMin[pattern] = {}
                    if prot not in pattern_freqMin[pattern]:
                        pattern_freqMin[pattern][prot] = []
                    # Only record positions that do not overlap an occurrence already stored
                    for posicion in pos:
                        found = list(filter(lambda x: posicion - len(pattern) <= x <= posicion + len(pattern),
                                            pattern_freqMin[pattern][prot]))
                        print(found)
                        print(len(found))
                        if len(found) <= 0:
                            pattern_freqMin[pattern][prot].append(posicion)
                    if len(pattern) > 2:
                        # Drop the shorter patterns subsumed by this one
                        if pattern[:-1] in pattern_freqMin:
                            del pattern_freqMin[pattern[:-1]]
                        if pattern[1:] in pattern_freqMin:
                            del pattern_freqMin[pattern[1:]]

    # Sort from longest to shortest; substrings of the same length are sorted alphabetically
    dict_ordered_patterns = dict(sorted(pattern_freqMin.items(), key=lambda x: (-len(x[0]), x[0])))
    dict_ordered_patterns = {k: v for k, v in dict_ordered_patterns.items() if len(k) >= 4}
    df = pd.DataFrame(dict_ordered_patterns.items(), columns=['pattern', 'proteins'])
    num_patrones = df.shape[0]
    pattern_freqMin = {k: v for k, v in pattern_freqMin.items() if len(k) >= 4}
    return pattern_freqMin, num_patrones


def remplazar_sequence_for_ID(pattern_freqMin, name, archivoEntrada, ocurrencia, archivoClases=None):
    """
    Replaces identified patterns in the original data with their corresponding IDs,
    saves the results to a CSV file, and prints a success message.

    Parameters:
    - pattern_freqMin: dict, dictionary of patterns and their positions.
    - name: str, name of the class.
    - archivoEntrada: str, path to the input Excel file.
    - ocurrencia: float, occurrence parameter.
    - archivoClases (optional): str, path to the classes Excel file.
    """
    df_b = pd.read_excel(archivoEntrada)
    # df_b = pd.read_excel("proteinasClase_PC00060.xlsx")
    # df_b = substitute_or_remove_prot_id(df_b, 'r')
    cl = pd.read_excel(archivoClases)
    # cl = substitute_or_remove_prot_id(cl, "r")
    # data2 = data.copy()

    # Build a protein_id -> [class_name, ...] mapping
    cli = cl.groupby('protein_id')
    di = []
    do = {}
    for k, v in cli:
        for index, row in v.iterrows():
            di.append(row['class_name'])
        do[k] = di
        di = []
    class_dict = do

    output = []
    for key, value in pattern_freqMin.items():
        for proteina, posiciones in value.items():
            output.append([key, proteina, posiciones])
    output = [sublista for sublista in output if len(sublista[0]) != 1]

    # Sort from longest to shortest; substrings of the same length are sorted alphabetically
    output_ordered = sorted(output, key=lambda x: (-len(x[0]), x[0]))

    proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
    for item in output_ordered:
        protein_sequence = item[1]
        if protein_sequence in proteinas_dict:
            item[1] = proteinas_dict[protein_sequence]
        item.append(class_dict[item[1]] if item[1] in class_dict else "N/A")

    df_a = pd.DataFrame(output_ordered, columns=['Patron', 'Proteina', 'Posiciones', 'classesProt'])
    # Save the updated DataFrame to a CSV file
    df_a.to_csv('clases/' + name + '/patronesIdenticos' + str(int((ocurrencia % 1) * 100)) + '.csv', index=False)
    print("Se ha generado el .csv con los patrones idénticos encontrados")
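# Usage sketch combining both steps (hedged example: the file names are hypothetical
# and "PC00060" is taken from the commented example above; archivoEntrada needs
# 'protein_sequence' and 'protein_id' columns, archivoClases needs 'protein_id' and
# 'class_name', and the folder 'clases/PC00060/' must already exist):
#
#     pattern_freqMin, num_patrones = buscar_patrones_identicos(sequences)
#     remplazar_sequence_for_ID(pattern_freqMin, "PC00060", "proteinas.xlsx", 0.8,
#                               archivoClases="clases_proteinas.xlsx")
#     # -> writes clases/PC00060/patronesIdenticos80.csv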
""" df2=df.groupby('Patron') compl=0 comp=0 first=True res=set() for k,v in df2: res=set() for index,row in v.iterrows(): Posic=[oo for oo in ast.literal_eval(row['Posiciones']) if oo is not '[' and oo is not ']'] rem=[] if(len(Posic)>2): u=0 while u+12): u=0 while u+1