import re
import time
from pathlib import Path

import numpy as np
import pandas as pd

# Counter appended to the names of the discard CSV files; the 'c', 'na' and
# default modes of substitute_or_remove_prot_id bump it after every call.
globi = 0
df_b = None

# Per-mode settings for the "replace, then de-duplicate" operations:
# (column holding the ids, column subsets used for the successive
#  drop_duplicates passes, prefix of the discard CSV path).
_DEDUP_RULES = {
    "c": (
        "protein_id",
        (["protein_id", "class_id"], ["protein_sequence", "class_id"]),
        "resultados/clasesDescartadasc_",
    ),
    "na": (
        "Entry",
        (["Entry"],),
        "resultados/proteinasDescartadasna_",
    ),
}
_DEFAULT_DEDUP_RULE = (
    "protein_id",
    (["disease_id", "protein_id"], ["disease_id", "protein_sequence"]),
    "resultados/proteinasDescartadas_",
)


def _load_substitutions(archSubs):
    """Parse the whitespace-separated substitution file.

    The first line is a header with at least two column names. If the first
    column name contains "Primary" that column holds the primary ids and the
    second one the ids to be replaced; otherwise the roles are swapped.
    Lines with fewer than two fields (e.g. blank lines) are skipped so both
    id lists always stay aligned.

    Returns a tuple (secondary_col, secondary_ids, primary_ids).
    """
    with open(archSubs) as handle:
        header = handle.readline().split()
        if len(header) < 2:
            raise ValueError(
                "substitution file needs a header with two columns: "
                + str(archSubs)
            )
        primary_col = 0 if re.search("Primary", header[0]) else 1
        secondary_col = 1 - primary_col
        secondary_ids = []
        primary_ids = []
        for line in handle:
            fields = line.split()
            if len(fields) < 2:
                continue
            secondary_ids.append(fields[secondary_col])
            primary_ids.append(fields[primary_col])
    return secondary_col, secondary_ids, primary_ids


def _replace_ids(data, column, secondary_ids, primary_ids):
    """Return a copy of *data* with secondary ids in *column* made primary."""
    mapping = dict(zip(secondary_ids, primary_ids))
    updated = data.copy()
    # Series.replace/map return a new Series, so the result has to be
    # assigned back; working on a copy keeps the caller's frame untouched.
    updated[column] = updated[column].map(lambda value: mapping.get(value, value))
    return updated


def _write_table(frame, archivSal, columns):
    """Write *frame* to *archivSal* (CSV if it ends in .csv, else Excel)."""
    if archivSal is None:
        return
    if str(archivSal).lower().endswith(".csv"):
        # to_excel cannot pick an engine for a ".csv" file name.
        frame.to_csv(archivSal, index=False, columns=columns)
    else:
        frame.to_excel(archivSal, index=False, columns=columns)


def _save_discarded(frame, path):
    """Write the discarded rows to *path*, creating its folder if needed."""
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    frame.to_csv(path, index=False)


def substitute_or_remove_prot_id(data, archSubs, sub_rem, archivSal=None):
    """
    Substitute or remove protein identifiers based on a substitution file.

    Parameters:
    - data: DataFrame containing protein data (never modified in place)
    - archSubs: Input file listing the protein ids that must be substituted
      by their primary entry (two whitespace-separated columns with a header)
    - sub_rem: Operation type
        's'  : replace secondary ids in "protein_id" and return the result
        'p'  : keep only rows whose "protein_id" is a primary id, drop exact
               duplicates and duplicates on (disease_id, protein_sequence);
               every removed row goes to the discard CSV
        'c'  : replace ids in "protein_id", then de-duplicate per class_id
        'na' : replace ids in "Entry", then de-duplicate on "Entry"
        anything else: replace ids in "protein_id", then de-duplicate per
               disease_id
    - archivSal: (Optional) output file for the kept rows; written as CSV
      when the name ends in ".csv", as Excel otherwise. Ignored for 's'.

    Returns:
    - DataFrame with the rows kept after the requested operation

    Side effects: every mode except 's' writes a discard CSV under
    "resultados/" whose name ends in the current value of the module
    counter `globi`.
    """
    global globi
    print("inside the problem")
    secondary_col, secondary_ids, primary_ids = _load_substitutions(archSubs)
    print(secondary_col)

    if sub_rem == "s":
        return _replace_ids(data, "protein_id", secondary_ids, primary_ids)

    if sub_rem == "p":
        is_primary = data["protein_id"].isin(primary_ids)
        datas = data[~is_primary]
        data = data[is_primary].drop_duplicates(keep="first")
        # Rows that repeat an earlier (disease_id, protein_sequence) pair are
        # the ones the second de-duplication removes.
        repeated = data.duplicated(
            subset=["disease_id", "protein_sequence"], keep="first"
        )
        did = data[repeated]
        data = data[~repeated]
        print(did)
        datas = pd.concat([datas, did], ignore_index=True)
        _write_table(data, archivSal, data.columns)
        # NOTE(review): unlike the other modes this one does not increment
        # `globi`, so consecutive 'p' calls overwrite the same discard file.
        _save_discarded(
            datas, "resultados/proteinasDescartadassp_" + str(globi) + ".csv"
        )
        return data

    column, subsets, discard_prefix = _DEDUP_RULES.get(
        sub_rem, _DEFAULT_DEDUP_RULE
    )
    # Rows carrying a secondary id, captured before the replacement.
    datas = data[data[column].isin(secondary_ids)]
    data = _replace_ids(data, column, secondary_ids, primary_ids)
    print("tamaño original: " + str(len(data)))
    dats = data.drop_duplicates(subset=subsets[0], keep="first")
    print("Despues de tirar duplicados en id: " + str(len(dats)))
    for subset in subsets[1:]:
        dats = dats.drop_duplicates(subset=subset, keep="first")
    print("Despues de tirar duplicados en secuencia: " + str(len(dats)))
    _write_table(dats, archivSal, data.columns)
    _save_discarded(datas, discard_prefix + str(globi) + ".csv")
    globi = globi + 1
    return dats


def readData(archivoEntrada, archivoEnt2, enfermedad, archivoSal):
    """
    Read protein data from an Excel file, find every entry whose sequence
    matches a pattern from the CSV input file, and save one row per pattern
    (with the list of matching protein ids) to a new Excel file.

    Parameters:
    - archivoEntrada: Input Excel file path; needs the columns "protein_id"
      and "protein_sequence" (and "disease_id" when filtering)
    - archivoEnt2: Input CSV file whose "Patron" column holds the patterns
      to be searched in the sequences of archivoEntrada
    - enfermedad: Disease id used to filter the rows; '' (or None) disables
      the filter
    - archivoSal: Output Excel file path
    """
    data = pd.read_excel(archivoEntrada)
    if enfermedad not in ("", None):
        data = data.loc[data["disease_id"] == enfermedad]
    dataB = pd.read_csv(archivoEnt2)
    print(len(data))
    # No filtering step runs between taking these sizes and printing the
    # differences, so both counters currently always report 0.
    filt_data = len(data)
    alz_filt_data = len(dataB)
    print("proteinas descartadas post filtro, principal: " + str(filt_data - len(data)))
    print("proteinas descartadas post filtro, comun alz: " + str(alz_filt_data - len(dataB)))
    dataC = {}
    for pattern in dataB["Patron"].unique():
        # Missing patterns arrive as NaN (a float); only strings longer than
        # three characters are searched.
        if isinstance(pattern, str) and len(pattern) > 3:
            # The pattern is interpreted as a regular expression (pandas
            # default). na=False treats missing sequences as "no match"
            # instead of breaking the boolean mask.
            hits = data["protein_sequence"].str.contains(pattern, na=False)
            dataC[pattern] = data.loc[hits, "protein_id"].to_list()
    dataG = pd.DataFrame(list(dataC.items()), columns=["pattern", "proteins"])
    dataG.to_excel(archivoSal)


def add_name_patterns(archivoEntrada, archivNom, EqvData, OutName):
    """
    Attach protein names to the proteins listed in an Excel file.

    Parameters:
    - archivoEntrada: Input Excel file with a "protein_id" column
    - archivNom: Input CSV with the columns "Entry", "Entry_Name",
      "Protein_names" and "Length"
    - EqvData: Substitution file used to replace the ids in "Entry" by
      their primary id (see substitute_or_remove_prot_id, mode 'na')
    - OutName: Output file for the cleaned name table (CSV when the name
      ends in ".csv", Excel otherwise)

    Writes "<archivoEntrada>_nombre.csv" with the name rows whose "Entry"
    appears among the protein ids, and "Proteinas_sin_nombre.csv" with the
    input rows that have no such name row.
    """
    data = pd.read_excel(archivoEntrada)
    dataB = pd.read_csv(
        archivNom, usecols=["Entry", "Entry_Name", "Protein_names", "Length"]
    )
    dataB = substitute_or_remove_prot_id(dataB, EqvData, "na", OutName)
    dataB = dataB.reindex(columns=["Entry", "Entry_Name", "Length", "Protein_names"])
    datas = dataB[dataB["Entry"].isin(data["protein_id"])]
    datas.to_csv(archivoEntrada + "_nombre.csv")
    # NOTE(review): the original looked the ids up in a non-existent
    # "Entry_name" column (KeyError). "Entry" is used here because it is the
    # column matched against protein_id above - confirm this is the intent.
    doo = data[~(data["protein_id"].isin(dataB["Entry"]))]
    doo.to_csv("Proteinas_sin_nombre.csv")


if __name__ == "__main__":
    #readData("data_nervous_genes_xf.xlsx","resultados/patronesIdenticosTreat_005.csv","C0007131","ProtByPatternLung005.xlsx")
    add_name_patterns(
        "data_nervous_genes_xf.xlsx",
        "protein_name.csv",
        "nombres_sust.txt",
        "protein_name_clean.csv",
    )