import ast
import csv
import json
import math
import os
import re
import time
from collections import defaultdict

import pandas as pd

import metricas


def readData(archivoEntrada, enfermedad, archivoTarget):
    """
    Reads the input Excel file, optionally keeps only the rows of one disease,
    and returns the protein sequences together with how many there are.

    Parameters:
    - archivoEntrada: str, path to the input Excel file. The "protein_sequence"
      column is read, and "disease_id" is read when filtering is requested.
    - enfermedad: str, disease ID to keep (empty string for no filtering),
      e.g. "C0002395".
    - archivoTarget: str, path to the target Excel file (accepted but not
      used by the current implementation).

    Returns:
    - sequences: pandas Series, the "protein_sequence" column.
    - num_filas: int, number of rows left after filtering.
    """
    data = pd.read_excel(archivoEntrada)

    # NOTE: the protein-discard step and the removal of target proteins are
    # currently disabled; the message below is still printed unconditionally.
    print("Se ha realizado el descarte de proteínas")

    if enfermedad != '':
        data = data.loc[data["disease_id"] == enfermedad]

    sequences = data["protein_sequence"]
    print(sequences)
    num_filas = sequences.shape[0]
    return sequences, num_filas


def guardar_patrones_len1(sequences, pattern_freqMin):
    """
    Finds every pattern of length 1 (single character) in the sequences, keeps
    those present in at least `min_ocurrence` distinct sequences, and dumps
    the kept patterns to 'prueba2.csv' in the current working directory.

    Parameters:
    - sequences: iterable of str (e.g. pandas Series), protein sequences.
    - pattern_freqMin: dict, updated in place with the kept patterns as
      {pattern: {sequence: [positions]}}.

    Returns:
    - pattern_freqMin: dict, the same dictionary that was passed in.
    - posicionPatterns: dict, {character: [positions]} for the LAST sequence
      processed (empty dict when `sequences` is empty).
    - longitud_max: int, length of the longest sequence (0 when empty).

    Note: `min_ocurrence` is read from module scope. It is not defined in the
    part of the file reviewed here - presumably it is assigned elsewhere
    before this function is called (TODO confirm).
    """
    longitud_max = 0
    # Initialised up front so an empty `sequences` no longer raises
    # UnboundLocalError at the return statement.
    posicionPatterns = {}
    # pattern -> {sequence -> [positions of the pattern in that sequence]}
    pattern_proteins = {}

    for protein in sequences:
        longitud_max = max(longitud_max, len(protein))

        # Positions of every character of the current sequence.
        posicionPatterns = {}
        for index, letter in enumerate(protein):
            posicionPatterns.setdefault(letter, []).append(index)

        # Store a copy so the returned `posicionPatterns` never aliases the
        # lists kept inside `pattern_freqMin`.
        for letter, positions in posicionPatterns.items():
            pattern_proteins.setdefault(letter, {})[protein] = list(positions)

    for pattern, proteins in pattern_proteins.items():
        if len(proteins) >= min_ocurrence:
            pattern_freqMin[pattern] = proteins

    df = pd.DataFrame(list(pattern_freqMin.items()), columns=['pattern', 'proteins'])
    df.to_csv('prueba2.csv', index=False)
    return pattern_freqMin, posicionPatterns, longitud_max


def buscar_patrones_identicos(sequences):
    """
    Searches for identical patterns (substrings) of increasing length that are
    shared by at least `min_ocurrence` sequences, and stores them with their
    start positions.

    Patterns of length L are grown from the surviving patterns of length L-1
    by appending the next character of the sequence. When a pattern longer
    than 2 is accepted, its prefix and suffix of length L-1 are removed from
    the result. The search stops at the first length with no surviving
    pattern.

    Parameters:
    - sequences: iterable of str (e.g. pandas Series), protein sequences.

    Returns:
    - pattern_freqMin: dict, {pattern: {sequence: [start positions]}}.
    - num_patrones: int, number of patterns in `pattern_freqMin`.

    Note: `min_ocurrence` is read from module scope (not defined in the part
    of the file reviewed here). The current pattern length is printed once per
    iteration as a progress indicator.
    """
    pattern_freqMin = {}
    pattern_freqMin, _, longitud_max = guardar_patrones_len1(sequences, pattern_freqMin)

    if pattern_freqMin:
        for pattern_length in range(2, longitud_max + 1):
            # Candidates of the current length: pattern -> {sequence -> [positions]}
            auxPos = {}
            for pattern, proteins in pattern_freqMin.items():
                if len(pattern) != pattern_length - 1:
                    continue
                for prot, positions in proteins.items():
                    protein_len = len(prot)
                    for position in positions:
                        # The extended pattern would run past the sequence end.
                        if protein_len < position + pattern_length:
                            continue
                        sub_seq = prot[position:position + pattern_length]
                        if sub_seq in pattern_freqMin:
                            continue
                        # The extension is only viable when the newly added
                        # last character is itself a frequent pattern. Every
                        # occurrence of a frequent character is recorded by
                        # guardar_patrones_len1, so a key lookup is enough; no
                        # scan of its position list is needed.
                        if sub_seq[-1] in pattern_freqMin:
                            auxPos.setdefault(sub_seq, {}).setdefault(prot, []).append(position)

            print(pattern_length)

            # Keep only the candidates found in enough distinct sequences.
            auxPos = {
                candidate: prots
                for candidate, prots in auxPos.items()
                if len(prots) >= min_ocurrence
            }

            # No pattern of this length survived, so no longer one can exist.
            if not auxPos:
                break

            for pattern, proteins in auxPos.items():
                for prot, pos in proteins.items():
                    # `pattern` is new at this length, so its position list
                    # starts empty; the former overlap filter never had
                    # anything to inspect and has been dropped.
                    pattern_freqMin.setdefault(pattern, {}).setdefault(prot, []).extend(pos)
                if len(pattern) > 2:
                    # The shorter patterns contained in this one are discarded.
                    pattern_freqMin.pop(pattern[:-1], None)
                    pattern_freqMin.pop(pattern[1:], None)

    num_patrones = len(pattern_freqMin)
    return pattern_freqMin, num_patrones


def remplazar_sequence_for_ID(pattern_freqMin, archivoEntrada, ocurrencia, Sal, archivoClases=None):
    """
    Replaces the protein sequences of the found patterns with their protein
    IDs, attaches the classes of each protein and saves the result as
    'resultados/patronesIdenticos<NN><Sal>.csv'.

    Parameters:
    - pattern_freqMin: dict, {pattern: {sequence: [positions]}}.
    - archivoEntrada: str, path to the input Excel file; its
      "protein_sequence" and "protein_id" columns are used for the mapping.
    - ocurrencia: float, occurrence parameter; NN in the file name is
      int((ocurrencia % 1) * 100).
    - Sal: str, suffix appended to the output file name.
    - archivoClases (Optional): str, path to the classes Excel file with
      "protein_id" and "class_name" columns. When omitted, every row gets
      "N/A" as its classes.

    Side effects: creates the 'resultados' directory if it does not exist,
    writes the CSV file and prints a confirmation message.
    """
    df_b = pd.read_excel(archivoEntrada)

    # Default to "no classes known" so the lookup below also works when no
    # classes file is supplied.
    class_dict = {}
    if archivoClases is not None:
        class_dict = group_classes_by_protein(pd.read_excel(archivoClases))

    # One row per (pattern, sequence); single-character patterns are skipped.
    output = []
    for pattern, proteins in pattern_freqMin.items():
        if len(pattern) == 1:
            continue
        for proteina, posiciones in proteins.items():
            output.append([pattern, proteina, posiciones])

    # Longest patterns first; patterns of equal length in alphabetical order.
    output_ordered = sorted(output, key=lambda x: (-len(x[0]), x[0]))

    proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
    for item in output_ordered:
        # Swap the sequence for its ID when known, otherwise keep the sequence.
        item[1] = proteinas_dict.get(item[1], item[1])
        item.append(class_dict.get(item[1], "N/A"))

    df_a = pd.DataFrame(output_ordered, columns=['Patron', 'Proteina', 'Posiciones', 'classesProt'])

    # NOTE(review): int() truncates, so float rounding can shift the number in
    # the file name (e.g. 0.29 -> "28"). Left unchanged because other code may
    # rebuild this file name with the same expression - confirm before fixing.
    os.makedirs('resultados', exist_ok=True)
    df_a.to_csv('resultados/patronesIdenticos'+str(int((ocurrencia%1)*100))+Sal+'.csv', index=False)
    print("Se ha generado el .csv con los patrones idénticos encontrados")


def calculate_sequence_length(sequences):
    """
    Calculates the total length of protein sequences.

    Parameters:
    - sequences: iterable of str (e.g. pandas Series), protein sequences.

    Returns:
    - seq_len: int, sum of the lengths of all sequences (0 when empty).
    """
    return sum(len(seq) for seq in sequences)


def group_classes_by_protein(cl):
    """
    Groups classes by protein ID.

    Parameters:
    - cl: pandas DataFrame with "protein_id" and "class_name" columns.

    Returns:
    - class_dict: dict, {protein_id: [class_name, ...]} with the class names
      in the order they appear in `cl`.
    """
    return {
        protein_id: group['class_name'].tolist()
        for protein_id, group in cl.groupby('protein_id')
    }


# NOTE(review): `compute_pattern_ocurrence(df, sal)` is truncated and garbled
# in the reviewed text (it looks as if everything between a "<" and the next
# ">" was stripped), so its body cannot be recovered without guessing. The
# visible fragment is preserved verbatim below; restore the function from
# version control. Also note that the fragment compares with string literals
# using `is not`, which should be `!=`.
#
# def compute_pattern_ocurrence(df,sal): """ Computes the occurrence of
# patterns in the data and saves the results to a CSV file. Parameters: - df:
# pandas DataFrame, DataFrame containing pattern information. Note: saves the
# patterns, the amount of times a pattern appears in proteins of the dataset
# and the number of proteins that have that pattern. """
# df2=df.groupby('Patron') compl=0 comp=0 first=True res=set() for k,v in df2:
# res=set() for index,row in v.iterrows(): Posic=[oo for oo in
# ast.literal_eval(row['Posiciones']) if oo is not '[' and oo is not ']']
# rem=[] if(len(Posic)>2): u=0 while u+12): u=0 while u+1