"""Helpers that filter a protein table by disease, list the proteins that
contain each known pattern, and annotate those results with protein names."""

import os
import re
import time
from ast import literal_eval

import numpy as np
import pandas as pd

from find_patterns import substitute_or_remove_prot_id


def readData(archivoEntrada, enfermedad, patrones_file, Sal):
    """
    Read the protein table, optionally keep a single disease, and export the
    proteins that contain each pattern.

    For every distinct pattern longer than three characters in the pattern
    file, the proteins whose sequence contains it are collected together with
    the position of the first occurrence, and the result is written to
    ``ProtByPattern<Sal>.xlsx`` (columns ``pattern`` and ``proteins``).

    Parameters:
    - archivoEntrada (str): Path to the Excel file. It must provide the
      columns 'protein_id' and 'protein_sequence' (and 'disease_id' when
      ``enfermedad`` is given).
    - enfermedad (str): Disease ID used for filtering; a falsy value keeps
      every row.
    - patrones_file (str): Path to the CSV file containing the patterns
      (column 'Patron').
    - Sal (str): Suffix inserted in the output file name, between
      'ProtByPattern' and '.xlsx'.

    Returns:
    - data (pd.DataFrame): The (optionally disease-filtered) protein table.
    """
    data = pd.read_excel(archivoEntrada)
    # Count rows BEFORE filtering; measuring afterwards always reported 0.
    total_proteins = len(data)
    if enfermedad:
        data = data.loc[data["disease_id"] == enfermedad]
    dataB = pd.read_csv(patrones_file)

    print(len(data))
    # No filter is applied to the pattern table in this function, so the
    # second figure is always 0; the line is kept so the output is unchanged.
    total_patterns = len(dataB)
    print("Proteins discarded after the main filter: " + str(total_proteins - len(data)))
    print("Proteins discarded after the common Alzheimer's filter: " + str(total_patterns - len(dataB)))

    proteins_by_pattern = {}
    for pattern in dataB["Patron"].unique():
        # Skip missing values (NaN is a float) and patterns that are too short.
        if not isinstance(pattern, str) or len(pattern) <= 3:
            continue
        # Literal matching, consistent with str.find below; na=False keeps the
        # mask boolean when a sequence is missing.
        hits = data["protein_sequence"].str.contains(pattern, regex=False, na=False)
        matched = data[hits]
        protein_ids = matched["protein_id"].to_list()
        positions = matched["protein_sequence"].str.find(pattern).to_list()
        print(len(positions))
        print(len(protein_ids))
        proteins_by_pattern[pattern] = [
            [protein_id, position]
            for protein_id, position in zip(protein_ids, positions)
        ]

    dataG = pd.DataFrame(list(proteins_by_pattern.items()), columns=["pattern", "proteins"])
    dataG.to_excel(f"ProtByPattern{Sal}.xlsx")
    return data


def add_protein_info_to_data(main_data_path, patterns_info_path, protein_names_path):
    """
    Add protein names and protein information from the original pattern file and the names
    Dataset to a DataFrame based on matching patterns.

    For every row of the main data whose pattern appears in the pattern file:
    - 'proteins_treat' becomes the row's own ``[protein_id, position]`` pairs
      followed by ALL ``[Proteina, Posiciones]`` pairs listed for that pattern
      in the pattern file (previously only the last listed pair was kept).
    - 'protein_names' becomes, for each protein of the row, the list of
      matching 'Entry_Name' values ("N/A" when the protein id is empty).
    Rows whose pattern is not in the pattern file keep empty strings.

    Parameters:
    - main_data_path (str): The path to the Excel file containing the main data.
    - patterns_info_path (str): The path to the CSV file containing patterns and protein information.
    - protein_names_path (str): The path to the CSV file containing protein names.

    Returns:
    None: The result is written next to the input file as
    ``<main_data_path without extension>_summary.xlsx``.

    Example:
    ```python
    add_protein_info_to_data("main_data.xlsx", "patterns_info.csv", "protein_names.csv")
    ```

    Note:
    - The function assumes that the provided Excel file ('main_data_path') contains a 'pattern'
      column and a 'proteins' column holding the text form of a list of
      ``[protein_id, position]`` pairs.
    - The 'patterns_info_path' CSV file is expected to have columns 'Patron', 'Proteina', and 'Posiciones'.
    - The 'protein_names_path' CSV file is expected to have columns 'Entry' and 'Entry_Name'.
    """
    # Read data from files
    main_data = pd.read_excel(main_data_path)
    patterns_info = pd.read_csv(patterns_info_path)
    protein_names = pd.read_csv(protein_names_path)

    # Collect every [protein, positions] pair listed for each pattern.
    # .tolist() yields plain Python scalars, so the cells keep a repr that
    # literal_eval can read back after the Excel round trip.
    treat_by_pattern = {}
    for pattern, protein_id, positions in zip(
        patterns_info["Patron"].tolist(),
        patterns_info["Proteina"].tolist(),
        patterns_info["Posiciones"].tolist(),
    ):
        if pd.isna(pattern):
            continue  # a missing pattern can never match a row
        treat_by_pattern.setdefault(pattern, []).append([protein_id, positions])

    # Entry -> list of entry names, built once instead of scanning the whole
    # table for every protein.
    names_by_entry = {}
    for entry, entry_name in zip(
        protein_names["Entry"].tolist(), protein_names["Entry_Name"].tolist()
    ):
        if pd.isna(entry):
            continue
        names_by_entry.setdefault(entry, []).append(entry_name)

    treat_column = []
    names_column = []
    for pattern, cell in zip(main_data["pattern"].tolist(), main_data["proteins"].tolist()):
        if pd.isna(pattern) or pattern not in treat_by_pattern:
            # Pattern unknown to the pattern file: leave both cells empty.
            treat_column.append("")
            names_column.append("")
            continue
        # The cell holds the text form of a list; a missing cell means "no proteins".
        found = literal_eval(cell) if pd.notna(cell) else []
        treat_column.append(found + treat_by_pattern[pattern])
        names_column.append(
            [
                list(names_by_entry.get(found_id, ())) if found_id else "N/A"
                for found_id, _ in found
            ]
        )

    # Explicit object Series so every cell holds one list.
    main_data["protein_names"] = pd.Series(names_column, index=main_data.index, dtype=object)
    main_data["proteins_treat"] = pd.Series(treat_column, index=main_data.index, dtype=object)

    # Save the updated data. splitext only strips the final extension, so
    # paths such as "./out/data.xlsx" keep their directory part.
    main_data_base_name, _ = os.path.splitext(main_data_path)
    main_data.to_excel(f"{main_data_base_name}_summary.xlsx", index=False)


def add_entry_name(archivoEntrada, protein_name_file, archNom):
    """
    Split the protein-name table according to the proteins present in the input data.

    Writes two files:
    - ``<archivoEntrada>_nombre.csv``: rows of the name table whose 'Entry'
      appears as a 'protein_id' in the input data.
    - ``Proteinas_sin_nombre``: rows of the input data whose 'protein_id' has
      no 'Entry' in the name table.

    Parameters:
    - archivoEntrada (str): Path to the Excel file (needs a 'protein_id' column).
    - protein_name_file (str): Path to the protein name CSV file (columns
      'Entry', 'Entry_Name', 'Protein_names', 'Length').
    - archNom (str): Path to the id substitution file passed to
      ``substitute_or_remove_prot_id``.

    Returns:
    - None
    """
    data = pd.read_excel(archivoEntrada)
    dataB = pd.read_csv(protein_name_file, usecols=['Entry', "Entry_Name", "Protein_names", "Length"])
    # NOTE(review): presumably rewrites or drops ids listed in archNom; the
    # helper lives in find_patterns, so confirm its exact behaviour there.
    dataB = substitute_or_remove_prot_id(dataB, archNom, "na")
    print("PASA")
    dataB = dataB.reindex(columns=['Entry', "Entry_Name", "Length", "Protein_names"])

    named = dataB[dataB["Entry"].isin(data["protein_id"])]
    named.to_csv(archivoEntrada + "_nombre.csv")

    unnamed = data[~(data["protein_id"].isin(dataB["Entry"]))]
    unnamed.to_csv("Proteinas_sin_nombre")


if __name__ == "__main__":
    # add_entry_name returns None, so its result is not kept.
    add_entry_name("Data/data_lung_cancer_desease.xlsx", "Data/protein_name.csv", "Data/nombres_sust.txt")
    data = pd.read_excel("Data/data_lung_cancer_desease.xlsx")
    dd = pd.read_excel("Data/data_lung_cancer_treatment.xlsx")
    dds = pd.concat([data, dd])
    dds.to_excel("Data/data_lung_cancer_desease_full.xlsx")
    # Other pipeline steps, kept for reference:
    #data=readData("Data/data_lung_cancer_desease.xlsx","","patronesIdenticos10Treat.csv","Lung01")
    #add_names_prot("ProtByPatternLung01.xlsx","patronesIdenticos10Treat.csv","protein_name.csv")