import pandas as pd import Levenshtein import time from sklearn.cluster import OPTICS,DBSCAN,AgglomerativeClustering,BisectingKMeans,SpectralClustering from sklearn.preprocessing import StandardScaler import numpy as np from scipy.spatial.distance import pdist, squareform from pyclustering.cluster.dbscan import dbscan from pyclustering.utils import timedcall from Levenshtein import distance import re from minineedle import needle, smith, core from Bio.Blast.Applications import NcbiblastpCommandline from io import StringIO from Bio.Blast import NCBIXML from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord from Bio import SeqIO import swalign import multiprocessing as mp globi=0 df_b=None def substitute_or_remove_prot_id(data,sub_rem): print("inside the problem") with open("nombres_sust.txt") as prottosubs: index=prottosubs.readline() acept=index.split() listtosubs={} for i in range(0,len(acept)): listtosubs[acept[i]]=[] while line := prottosubs.readline(): newline=line.split() #print(len(newline)) for i in range(0,len(newline)): listtosubs[list(listtosubs.keys())[i]].append(newline[i].strip()) resub=1 if re.search("Primary",list(listtosubs.keys())[0]): resub=0 print((resub+1)%2) #print(data) #data2=data.copy() global globi if(sub_rem == "s"): data["protein_id"].replace(list(listtosubs.values())[(resub+1)%2], list(listtosubs.values())[resub]) #datacp=data.copy() #print(pd.concat([data2,datacp]).drop_duplicates()) elif(sub_rem == "p"): datas= data[data["protein_id"].isin(list(listtosubs.values())[(resub)])==False] data= data[data["protein_id"].isin(list(listtosubs.values())[(resub)])==True] #print(data[data["protein_id"].isin(list(listtosubs.values())[(resub)])==True]) #print(datas) #data.drop_duplicates(subset=['disease_id','protein_sequence'],keep='first',inplace=True) data=data.drop_duplicates(keep="first", inplace=False) did=data.copy() data = data.drop_duplicates(subset=['disease_id', 'protein_sequence'], keep="first", inplace=False) did=did[~did.isin(data).all(axis=1)] did=did.drop_duplicates() #print(pd.concat([did,did2]).drop_duplicates(keep=False)) print(did) datas=pd.concat([datas, did], ignore_index=True) data.to_excel('data_nervous_genes_principalpurge.xlsx',index=False,columns=data.columns) datas.to_csv('resultados/proteinasDescartadassp_'+ str(globi) +'.csv', index=False) elif(sub_rem == "c"): datas= data[data["protein_id"].isin(list(listtosubs.values())[(resub+1)%2])==True] data["protein_id"].replace(list(listtosubs.values())[(resub+1)%2], list(listtosubs.values())[resub]) print("tamaño original: "+str(len(data))) dats=data.drop_duplicates(subset=['protein_id','class_id'],keep='first',inplace=False) print("Despues de tirar duplicados en id: "+str(len(dats))) dats=dats.drop_duplicates(subset=['protein_sequence','class_id'],keep='first',inplace=False) print("Despues de tirar duplicados en secuencia: "+str(len(dats))) dats.to_excel('clases.xlsx',index=False,columns=data.columns) datas.to_csv('resultados/clasesDescartadas_'+ str(globi) +'.csv', index=False) #pd_diff=pd.concat([data,dats]).drop_duplicates(keep=False) #pd_diff.to_excel('data_not_valid.xlsx') globi=globi+1 data=dats else: datas= data[data["protein_id"].isin(list(listtosubs.values())[(resub+1)%2])==True] data["protein_id"].replace(list(listtosubs.values())[(resub+1)%2], list(listtosubs.values())[resub]) print("tamaño original: "+str(len(data))) dats=data.drop_duplicates(subset=['disease_id','protein_id'],keep='first',inplace=False) print("Despues de tirar duplicados en id: "+str(len(dats))) dats=dats.drop_duplicates(subset=['disease_id','protein_sequence'],keep='first',inplace=False) print("Despues de tirar duplicados en secuencia: "+str(len(dats))) dats.to_excel('data_nervous_genes_x.xlsx',index=False,columns=data.columns) datas.to_csv('resultados/proteinasDescartadas_'+ str(globi) +'.csv', index=False) #pd_diff=pd.concat([data,dats]).drop_duplicates(keep=False) #pd_diff.to_excel('data_not_valid.xlsx') globi=globi+1 data=dats #data.to_excel('data_nervous_genes_2.xlsx') return data def divide_by_class(data): print("inside the problem") cl=pd.read_excel("alzheimer_protein_class 1.xlsx") cl=substitute_or_remove_prot_id(cl,"c") cl.to_excel("alzheimer_protein_class 2.xlsx") #data2=data.copy() cli=cl.groupby('class_id') di=[] for k,v in cli: for index,row in v.iterrows(): di.append(row['protein_id']) do=data[data["protein_id"].isin(di)] do.to_excel('proteinasClase_'+k+'.xlsx',index=False,columns=data.columns ) di=[] #datacp=data.copy() #print(pd.concat([data2,datacp]).drop_duplicates()) return data def readData(archivoEntrada, enfermedad): data = pd.read_excel(archivoEntrada) #data.to_excel('data_nervous_genes_2.xlsx') if (enfermedad != ''): #datar=substitute_or_remove_prot_id(data,"r") #sprint("numero de filas de proteinas descartadas totales principal : "+ str(len(data)-len(datar))) data = data.loc[data["disease_id"] == enfermedad] #dataB = pd.read_excel("proteinas_en_comun_Alzheimer.xlsx") print(len(data)) #data=substitute_or_remove_prot_id(data,"r") #dataB=substitute_or_remove_prot_id(dataB,"r") #dataB.to_excel("proteinas_en_comun_Alzeheimer2.xlsx") #data.to_excel('data_nervous_genes_2.xlsx') #filt_data=len(data) #alz_filt_data=len(dataB) #print("proteinas descartadas post filtro, principal: " + str(filt_data-len(data))) #print("proteinas descartadas post filtro, comun alz: " + str(alz_filt_data-len(dataB))) #data = data[~((data["disease_id"] == enfermedad) & # (data["protein_id"].isin(dataB["protein_id"])) & # (data["gene_id"].isin(dataB["gene_id"])))] data=substitute_or_remove_prot_id(data,"r") sequences = data["protein_sequence"] return sequences def readOData(archivoEntrada, enfermedad): data = pd.read_excel(archivoEntrada) #data=substitute_or_remove_prot_id(data,"r") if (enfermedad != ''): #datar=substitute_or_remove_prot_id(data,"r") #sprint("numero de filas de proteinas descartadas totales principal : "+ str(len(data)-len(datar))) data = data.loc[data["disease_id"] == enfermedad] #dataB = pd.read_excel("proteinas_en_comun_Alzheimer.xlsx") #data=substitute_or_remove_prot_id(data,"r") #dataB=substitute_or_remove_prot_id(dataB,"r") #dataB.to_excel("proteinas_en_comun_Alzeheimer2.xlsx") #data.to_excel('data_nervous_genes_2.xlsx') #filt_data=len(data) #alz_filt_data=len(dataB) #print("proteinas descartadas post filtro, principal: " + str(filt_data-len(data))) #print("proteinas descartadas post filtro, comun alz: " + str(alz_filt_data-len(dataB))) #data = data[~((data["disease_id"] == enfermedad) & # (data["protein_id"].isin(dataB["protein_id"])) & # (data["gene_id"].isin(dataB["gene_id"])))] sequences = data["protein_sequence"] return sequences def readCData(archivoEntrada, enfermedad): data = pd.read_excel(archivoEntrada) #data=substitute_or_remove_prot_id(data,"r") if (enfermedad != ''): #datar=substitute_or_remove_prot_id(data,"r") #sprint("numero de filas de proteinas descartadas totales principal : "+ str(len(data)-len(datar))) data = data.loc[data["disease_id"] == enfermedad] #dataB = pd.read_excel("proteinas_en_comun_Alzheimer.xlsx") #data=substitute_or_remove_prot_id(data,"r") #dataB=substitute_or_remove_prot_id(dataB,"r") #dataB.to_excel("proteinas_en_comun_Alzeheimer2.xlsx") #data.to_excel('data_nervous_genes_2.xlsx') #filt_data=len(data) #alz_filt_data=len(dataB) #print("proteinas descartadas post filtro, principal: " + str(filt_data-len(data))) #print("proteinas descartadas post filtro, comun alz: " + str(alz_filt_data-len(dataB))) #data = data[~((data["disease_id"] == enfermedad) & # (data["protein_id"].isin(dataB["protein_id"])) & # (data["gene_id"].isin(dataB["gene_id"])))] data=divide_by_class(data) sequences = data["protein_sequence"] return sequences if __name__=='__main__': #data=readData('data_nervous_genes_1.xlsx','C0002395') data2 = readCData('data_nervous_genes_xf.xlsx','C0002395') data2=data2.to_list() datl=data.to_list() #print(len(datl)) du=[] #print(set(data2) - set(datl)) get_index_to_delete=[] for u in range(0,len(datl)): if datl[u] not in data2: du.append(datl[u]) else: get_index_to_delete.append(u) #print(str(u)+" Este no deberia estar: "+str(datl[u])) with open("nombres_sust.txt") as prottosubs: index=prottosubs.readline() acept=index.split() listtosubs={} for i in range(0,len(acept)): listtosubs[acept[i]]=[] while line := prottosubs.readline(): newline=line.split() #print(len(newline)) for i in range(0,len(newline)): listtosubs[list(listtosubs.keys())[i]].append(newline[i].strip()) resub=1 if re.search("Primary",list(listtosubs.keys())[0]): resub=0 dia=[] for y in du: dia.append(list(listtosubs.values())[(resub+1)%2][list(listtosubs.values())[resub].index(y)]) #print(dia)