"""Locate sequence patterns in protein data sets and annotate the hits with protein names."""

import os
import re
import time
from ast import literal_eval

import numpy as np
import pandas as pd

from find_patterns import substitute_or_remove_prot_id


def _group_diseases_by_hit(protein_ids, positions, disease_ids):
    """Collapse parallel hit lists into unique hits plus their disease ids.

    Parameters:
    - protein_ids (list): Protein id of every hit.
    - positions (list): Offset of the pattern inside the sequence, per hit.
    - disease_ids (list): Disease id of every hit.

    Returns:
    - (hits, diseases): ``hits`` is a list of ``[protein_id, position]`` pairs
      in first-seen order; ``diseases[i]`` lists every disease id recorded for
      ``hits[i]``.
    """
    grouped = {}
    for protein_id, position, disease_id in zip(protein_ids, positions, disease_ids):
        # dicts keep insertion order, so the first occurrence fixes the output order.
        grouped.setdefault((protein_id, position), []).append(disease_id)
    hits = [[protein_id, position] for protein_id, position in grouped]
    return hits, list(grouped.values())


def readData(archivoEntrada: str, enfermedad: str, patrones_file: str, Sal: str) -> pd.DataFrame:
    """
    Reads protein data from an Excel file, optionally filters it by disease, and
    writes a per-pattern summary of the proteins whose sequence contains each pattern.

    Parameters:
    - archivoEntrada (str): Path to the Excel file. It must provide the columns
      'protein_id', 'protein_sequence' and 'disease_id'.
    - enfermedad (str): Disease ID for filtering. A falsy value (e.g. "") keeps every row.
    - patrones_file (str): Path to the CSV file containing patterns (column 'Patron').
    - Sal (str): Suffix inserted in the output file name: "ProtByPattern" + Sal + ".xlsx".

    Returns:
    - data (pd.DataFrame): The (optionally disease-filtered) input data.

    Side effects:
    - Writes "ProtByPattern<Sal>.xlsx" in the current working directory with the
      columns 'pattern', 'proteins' and 'desease_id'.
    - Prints progress counters to stdout.
    """
    data = pd.read_excel(archivoEntrada)
    # Count before filtering so the "discarded" figure below is meaningful.
    total_rows = len(data)
    if enfermedad:
        data = data.loc[data["disease_id"] == enfermedad]
    patterns_table = pd.read_csv(patrones_file)

    print(len(data))
    print("Proteins discarded after the main filter: " + str(total_rows - len(data)))
    # NOTE(review): no filter is applied to the pattern table in this function,
    # so this figure is always 0; the line is kept to preserve the console output.
    print("Proteins discarded after the common Alzheimer's filter: 0")

    proteins_by_pattern = {}
    diseases_by_pattern = {}
    for pattern in patterns_table["Patron"].dropna().unique():
        # Patterns of three characters or fewer are ignored.
        if len(pattern) <= 3:
            continue
        # Literal matching keeps this consistent with str.find() below;
        # na=False treats rows without a sequence as "no match".
        has_pattern = data["protein_sequence"].str.contains(pattern, regex=False, na=False)
        hits = data[has_pattern]
        protein_ids = hits["protein_id"].to_list()
        positions = hits["protein_sequence"].str.find(pattern).to_list()
        disease_ids = hits["disease_id"].to_list()
        print(len(positions))
        print(len(protein_ids))
        proteins_by_pattern[pattern], diseases_by_pattern[pattern] = _group_diseases_by_hit(
            protein_ids, positions, disease_ids
        )

    summary = pd.DataFrame(
        {
            "pattern": list(proteins_by_pattern),
            "proteins": list(proteins_by_pattern.values()),
            "desease_id": list(diseases_by_pattern.values()),
        }
    )
    summary.to_excel("ProtByPattern" + Sal + ".xlsx")
    return data


def _entry_names_for(protein_ids, names_by_entry):
    """Return, for every protein id, its list of entry names (["N/A"] when unknown)."""
    return [list(names_by_entry.get(protein_id, ("N/A",))) for protein_id in protein_ids]


def add_protein_info_to_data(main_data_path: str, patterns_info_path: str, protein_names_path: str) -> None:
    """
    Add protein names and protein information from the original pattern file and
    the names data set to the per-pattern summary, matching rows by pattern.

    Parameters:
    - main_data_path (str): The path to the Excel file containing the main data.
    - patterns_info_path (str): The path to the CSV file containing patterns and protein information.
    - protein_names_path (str): The path to the CSV file containing protein names.

    Returns:
    None: The result is written to "<main_data_path without extension>_summary.xlsx";
    the input file itself is not modified.

    Example:
        add_protein_info_to_data("main_data.xlsx", "patterns_info.csv", "protein_names.csv")

    Note:
    - The Excel file ('main_data_path') must contain the columns 'pattern' and
      'proteins'; 'proteins' holds the text form of a list of [protein_id, position] pairs.
    - The 'patterns_info_path' CSV file is expected to have columns 'Patron', 'Proteina',
      and 'Posiciones' ('Posiciones' holds a Python literal as text).
    - The 'protein_names_path' CSV file is expected to have columns 'Entry' and 'Entry_Name'.
    - Three columns are added: 'proteins_treat' (text form of a dict
      {protein_id: positions}), 'names_Treat' and 'protein_names' (lists of
      entry-name lists, ["N/A"] for ids without a name).
    """
    # Read data from files
    main_data = pd.read_excel(main_data_path)
    patterns_info = pd.read_csv(patterns_info_path)
    protein_names = pd.read_csv(protein_names_path)

    # Index the names once instead of filtering the whole table for every protein id.
    names_by_entry = {}
    for entry, entry_name in zip(protein_names["Entry"], protein_names["Entry_Name"]):
        names_by_entry.setdefault(entry, []).append(entry_name)

    # Initialize columns in 'main_data'. The two name columns later hold Python
    # lists, hence the explicit object dtype.
    main_data["protein_names"] = pd.Series("", index=main_data.index, dtype=object)
    main_data["proteins_treat"] = "{}"
    main_data["names_Treat"] = pd.Series("", index=main_data.index, dtype=object)

    for pattern, group_data in patterns_info.groupby("Patron"):
        is_pattern = main_data["pattern"] == pattern

        if is_pattern.any():
            # Every matching row receives the same mapping, so build it once.
            # A later row for the same protein id overrides an earlier one.
            treat_positions = {}
            for protein_id, positions in zip(group_data["Proteina"], group_data["Posiciones"]):
                treat_positions[protein_id] = literal_eval(positions)
            main_data.loc[is_pattern, "proteins_treat"] = str(treat_positions)

        matching_rows = main_data[is_pattern]
        print(matching_rows["proteins_treat"].apply(lambda text: list(literal_eval(text))))

        main_data.loc[is_pattern, "names_Treat"] = matching_rows["proteins_treat"].apply(
            lambda text: _entry_names_for(literal_eval(text), names_by_entry)
        )
        main_data.loc[is_pattern, "protein_names"] = matching_rows["proteins"].apply(
            lambda text: _entry_names_for(
                [protein_id for protein_id, _ in literal_eval(text)], names_by_entry
            )
        )

    # Save the updated data. splitext only strips the final extension, so dots
    # elsewhere in the path (e.g. "./out/file.xlsx") are left intact.
    main_data_base_name = os.path.splitext(main_data_path)[0]
    main_data.to_excel(f"{main_data_base_name}_summary.xlsx", index=False)


def add_entry_name(archivoEntrada: str, protein_name_file: str, archNom: str) -> None:
    """
    Splits the proteins of an Excel data set into those that have an entry in the
    protein-name CSV file and those that do not, and writes both results to disk.

    Parameters:
    - archivoEntrada (str): Path to the Excel file (must contain 'protein_id').
    - protein_name_file (str): Path to the protein name CSV file (columns 'Entry',
      'Entry_Name', 'Protein_names' and 'Length' are read).
    - archNom (str): Path to the id substitution file, forwarded to
      substitute_or_remove_prot_id together with the literal "na".

    Returns:
    - None

    Side effects:
    - Writes "<archivoEntrada>_nombre.csv" with the name rows whose 'Entry' occurs
      in the data set.
    - Writes "Proteinas_sin_nombre" with the data rows whose 'protein_id' has no
      matching 'Entry'.
    """
    data = pd.read_excel(archivoEntrada)
    name_table = pd.read_csv(protein_name_file, usecols=['Entry', "Entry_Name", "Protein_names", "Length"])
    name_table = substitute_or_remove_prot_id(name_table, archNom, "na")
    print("PASA")
    name_table = name_table.reindex(columns=['Entry', "Entry_Name", "Length", "Protein_names"])

    named = name_table[name_table["Entry"].isin(data["protein_id"])]
    named.to_csv(archivoEntrada + "_nombre.csv")

    unnamed = data[~data["protein_id"].isin(name_table["Entry"])]
    unnamed.to_csv("Proteinas_sin_nombre")


if __name__ == "__main__":
    # Previous invocations, left commented out:
    # data = add_entry_name("Data/data_cancers_desease.xlsx", "Data/protein_name.csv", "Data/nombres_sust.txt")
    # data = pd.read_excel("Data/data_lung_cancer_desease.xlsx")
    # dd = pd.read_excel("Data/data_lung_cancer_treatment.xlsx")
    # dds = pd.concat([data, dd])
    # dds.to_excel("Data/data_lung_cancer_desease_full.xlsx")
    data = readData("Data/data_immune_desease.xlsx", "", "patronesIdenticos10Treat.csv", "Immun01")
    add_protein_info_to_data("ProtByPatternImmun01.xlsx", "patronesIdenticos10Treat.csv", "Data/protein_name.csv")