import multiprocessing as mp
import os
import re
import time

import numpy as np
import pandas as pd

# Counter appended to the names of the "discarded rows" CSV files written by
# substitute_or_remove_prot_id, so successive calls do not overwrite each other.
globi = 0
df_b = None


def _load_substitution_table(archivoNombre):
    """Parse a whitespace-separated ID table.

    The first line holds the column headers; every following line holds one
    whitespace-separated value per column.

    Returns:
        (columns, primary_idx): ``columns`` is a list with one list of values
        per header; ``primary_idx`` is 0 when the first header contains
        "Primary" and 1 otherwise.

    Raises:
        ValueError: if the header line has fewer than two columns.
        IndexError: if a data line has more values than there are headers.
    """
    with open(archivoNombre) as prottosubs:
        headers = prottosubs.readline().split()
        if len(headers) < 2:
            raise ValueError(
                f"{archivoNombre}: expected at least two header columns, "
                f"found {len(headers)}"
            )
        columns = [[] for _ in headers]
        for line in prottosubs:
            for i, token in enumerate(line.split()):
                columns[i].append(token)
    primary_idx = 0 if "Primary" in headers[0] else 1
    return columns, primary_idx


def _replace_protein_ids(data, old_ids, new_ids):
    """Return a copy of *data* whose ``protein_id`` values in *old_ids* are
    mapped to the value at the same position in *new_ids*."""
    # Series.replace returns a new Series; the result has to be assigned,
    # otherwise the substitution is silently lost.
    return data.assign(protein_id=data["protein_id"].replace(old_ids, new_ids))


def _filter_by_disease(data, enfermedad):
    """Keep only the rows whose ``disease_id`` equals *enfermedad*.

    An empty string disables the filter and returns *data* unchanged.
    """
    if enfermedad != '':
        data = data.loc[data["disease_id"] == enfermedad]
    return data


def substitute_or_remove_prot_id(data, archivoNombre, sub_rem):
    """Substitute or purge protein IDs in *data* using the table in *archivoNombre*.

    The table is read with ``_load_substitution_table``; one column is taken as
    the "primary" IDs and the other as the IDs to be replaced.

    Args:
        data: DataFrame with a ``protein_id`` column (plus ``protein_sequence``
            and ``disease_id`` or ``class_id`` depending on the mode).
        archivoNombre: path of the whitespace-separated ID table.
        sub_rem: mode selector.
            "s": only substitute non-primary IDs by their primary ID.
            "p": keep only rows whose ID is a primary ID, de-duplicate them,
                 write the kept rows to ``data_principalpurge.xlsx`` and the
                 removed rows to ``resultados/proteinasDescartadassp_<n>.csv``.
            "c": substitute, then de-duplicate per ``class_id``; writes
                 ``clases.xlsx`` and ``resultados/clasesDescartadas_<n>.csv``.
            anything else: substitute, then de-duplicate per ``disease_id``;
                 writes ``data_x.xlsx`` and
                 ``resultados/proteinasDescartadas_<n>.csv``.

    Returns:
        The resulting DataFrame. The input frame is not modified.
    """
    global globi
    print("inside the problem")
    columns, primary_idx = _load_substitution_table(archivoNombre)
    secondary_idx = 1 - primary_idx
    primary_ids = columns[primary_idx]
    secondary_ids = columns[secondary_idx]
    print(secondary_idx)

    if sub_rem == "s":
        return _replace_protein_ids(data, secondary_ids, primary_ids)

    if sub_rem == "p":
        is_primary = data["protein_id"].isin(primary_ids)
        datas = data[~is_primary]
        data = data[is_primary].drop_duplicates(keep="first")
        did = data.copy()
        data = data.drop_duplicates(
            subset=['disease_id', 'protein_sequence'], keep="first"
        )
        # Rows removed by the second de-duplication are those whose index label
        # did not survive. Comparing labels (rather than cell values) keeps rows
        # containing NaN from being reported as discarded.
        did = did[~did.index.isin(data.index)]
        print(did)
        datas = pd.concat([datas, did], ignore_index=True)
        data.to_excel('data_principalpurge.xlsx', index=False, columns=data.columns)
        csv_path = 'resultados/proteinasDescartadassp_' + str(globi) + '.csv'
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        datas.to_csv(csv_path, index=False)
        # NOTE(review): unlike the other writing modes, this one does not
        # increment globi - confirm whether that is intended.
        return data

    if sub_rem == "c":
        group_col = 'class_id'
        excel_path = 'clases.xlsx'
        csv_prefix = 'resultados/clasesDescartadas_'
    else:
        group_col = 'disease_id'
        excel_path = 'data_x.xlsx'
        csv_prefix = 'resultados/proteinasDescartadas_'

    # Rows that carry a non-primary ID, captured before the substitution.
    datas = data[data["protein_id"].isin(secondary_ids)]
    data = _replace_protein_ids(data, secondary_ids, primary_ids)
    print("tamaño original: " + str(len(data)))
    dats = data.drop_duplicates(subset=['protein_id', group_col], keep='first')
    print("Despues de tirar duplicados en id: " + str(len(dats)))
    dats = dats.drop_duplicates(subset=['protein_sequence', group_col], keep='first')
    print("Despues de tirar duplicados en secuencia: " + str(len(dats)))
    dats.to_excel(excel_path, index=False, columns=data.columns)
    csv_path = csv_prefix + str(globi) + '.csv'
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    datas.to_csv(csv_path, index=False)
    globi = globi + 1
    return dats


def divide_by_class(data, archNom):
    """Write one Excel file per protein class found in
    ``lung_cancer_protein_class.xlsx``.

    The class table is normalised with ``substitute_or_remove_prot_id`` in "c"
    mode and saved as ``lung_cancer_protein_class_2.xlsx``. Rows of *data* whose
    ``protein_id`` has no class go to ``proteinas_sin_clase.xlsx``; for every
    ``class_id`` the matching rows go to ``proteinasClase_<class_id>.xlsx``.

    Returns:
        *data*, unchanged.
    """
    print("inside the problem")
    cl = pd.read_excel("lung_cancer_protein_class.xlsx")
    cl = substitute_or_remove_prot_id(cl, archNom, "c")
    cl.to_excel("lung_cancer_protein_class_2.xlsx")

    dd = data[~data['protein_id'].isin(cl['protein_id'])]
    dd.to_excel("proteinas_sin_clase.xlsx")

    for k, v in cl.groupby('class_id'):
        do = data[data["protein_id"].isin(v['protein_id'])]
        # f-string so that non-string class ids (e.g. integers) also work.
        do.to_excel(f'proteinasClase_{k}.xlsx', index=False, columns=data.columns)
    return data


def readData(archivoEntrada, enfermedad, Sal, archNom, archivoDescarte=None):
    """Load the main protein table, normalise its IDs and optionally remove the
    proteins that also appear in a discard table.

    Args:
        archivoEntrada: input path; read as CSV when it ends in ".csv",
            otherwise as Excel.
        enfermedad: when empty, the discard step is skipped. The value is not
            used to filter rows in this function.
        Sal: prefix for the output files (``<Sal>_postUniprot.xlsx``,
            ``<Sal>.xlsx``, ``<Sal>_full.xlsx``).
        archNom: path of the ID substitution table.
        archivoDescarte: optional Excel file whose ``protein_id`` values are
            removed from the main table.

    Returns:
        The ``protein_sequence`` column of the resulting table.
    """
    if archivoEntrada.lower().endswith(".csv"):
        data = pd.read_csv(archivoEntrada)
    else:
        data = pd.read_excel(archivoEntrada)

    # Row counts are taken before filtering so the reports below are meaningful.
    filas_principal = len(data)
    data = substitute_or_remove_prot_id(data, archNom, "r")
    data.to_excel(Sal + "_postUniprot.xlsx")

    if enfermedad != '' and archivoDescarte is not None:
        dataB = pd.read_excel(archivoDescarte)
        print(len(data))
        filas_descarte = len(dataB)
        dataB = substitute_or_remove_prot_id(dataB, archNom, "r")
        print("proteinas descartadas post filtro, principal: "
              + str(filas_principal - len(data)))
        print("proteinas descartadas post filtro, comun alz: "
              + str(filas_descarte - len(dataB)))

        en_descarte = data["protein_id"].isin(dataB["protein_id"])
        datad = data[en_descarte]
        print("tamaño del descarte: " + str(datad.shape[0]))
        datad.to_excel("drop_data.xlsx")

        data = data[~en_descarte]
        data.to_excel(Sal + ".xlsx")
        pd.concat([data, dataB]).to_excel(Sal + "_full.xlsx")

    return data["protein_sequence"]


def readOData(archivoEntrada, enfermedad):
    """Read an Excel table and return the ``protein_sequence`` column of the
    rows whose ``disease_id`` equals *enfermedad* (all rows when it is '')."""
    data = pd.read_excel(archivoEntrada)
    data = _filter_by_disease(data, enfermedad)
    return data["protein_sequence"]


def readDataClassDiv(archivoEntrada, archNom, enfermedad):
    """Read an Excel table, filter it by disease, write the per-class Excel
    files via ``divide_by_class`` and return the ``protein_sequence`` column."""
    data = pd.read_excel(archivoEntrada)
    data = _filter_by_disease(data, enfermedad)
    data = divide_by_class(data, archNom)
    return data["protein_sequence"]


def restructure_class(data, ArchivoSalida):
    """Print *data*, write it to the Excel file *ArchivoSalida* and return it."""
    print(data)
    data.to_excel(ArchivoSalida)
    return data


def readDataRestructure(archivoEntrada, archivoNombre, enfermedad, archivoSalida):
    """Read an Excel table, collapse it to one row per
    (protein_id, protein_sequence, disease_id), normalise the IDs, filter by
    disease and write the result to *archivoSalida*.

    The remaining columns are aggregated into sets by the group-by.

    Returns:
        The ``protein_sequence`` column of the resulting table.
    """
    data = pd.read_excel(archivoEntrada)
    print(len(data["protein_id"].unique()))
    data = (
        data.groupby(['protein_id', 'protein_sequence', 'disease_id'])
        .agg(set)
        .reset_index()
    )
    print(data.columns)
    data = substitute_or_remove_prot_id(data, archivoNombre, "r")
    print(len(data["protein_id"].unique()))
    data = _filter_by_disease(data, enfermedad)
    data = restructure_class(data, archivoSalida)
    return data["protein_sequence"]


def main():
    """Run the pipeline on the project's data files.

    Returns the list built by looking up, in the substitution table, every
    sequence of the main table that is absent from the treatment table.
    """
    data = readData(
        'Data/autoinmune_proteins_ids.xlsx',
        'C0007131',
        "Data/data_autoinmume_desease",
        "Data/nombres_sust.txt",
        "Data/data_lung_cancer_treatment.xlsx",
    )
    # NOTE(review): the definition of data2 had been commented out while the
    # code below still used it, so the script stopped with a NameError. The
    # original statements are restored here; they rewrite
    # Data/data_lung_cancer_treatment.xlsx - confirm this is wanted.
    data2 = readDataRestructure(
        'Data/treatment_lung_cancer.xlsx',
        "nombres_sust.txt",
        'C0007131',
        'Data/data_lung_cancer_treatment.xlsx',
    )
    en_tratamiento = set(data2.to_list())
    du = [seq for seq in data.to_list() if seq not in en_tratamiento]

    # NOTE(review): this path differs from the "Data/nombres_sust.txt" passed
    # to readData above - confirm which location is the right one.
    columns, primary_idx = _load_substitution_table("nombres_sust.txt")
    primary_ids = columns[primary_idx]
    secondary_ids = columns[1 - primary_idx]
    # NOTE(review): du holds protein sequences while the table is used as an ID
    # table elsewhere; list.index raises ValueError for values not present.
    dia = [secondary_ids[primary_ids.index(y)] for y in du]
    return dia


if __name__ == '__main__':
    main()