Commit ce11ea4d authored by Rafael Artinano's avatar Rafael Artinano

removed incorrect file

parent b4953dc3
import pandas as pd
import time
import numpy as np
import re
import multiprocessing as mp
# Running counter embedded in the names of the discarded-rows CSV files that
# substitute_or_remove_prot_id writes under resultados/.
globi=0
# NOTE(review): not referenced anywhere in the visible source - presumably
# leftover scratch state; confirm before removing.
df_b=None
def _read_substitution_columns(path):
    """Parse a whitespace-separated table into one list of values per column.

    The first line of the file supplies the column names; every later line
    contributes one value per column, matched by position.

    Raises:
        IndexError: if a row holds more fields than the header line.
    """
    with open(path) as handle:
        columns = {name: [] for name in handle.readline().split()}
        # Duplicate header names collapse into a single key, so positions are
        # resolved against the dict keys rather than the raw header tokens.
        keys = list(columns)
        for line in handle:
            for position, value in enumerate(line.split()):
                columns[keys[position]].append(value)
    return columns


def _replace_protein_ids(data, old_ids, new_ids):
    """Return a copy of *data* with each protein_id in old_ids mapped to new_ids."""
    # Series.replace returns a new Series; the result has to be stored back.
    # Working on a copy keeps the caller's frame untouched.
    replaced = data.copy()
    replaced["protein_id"] = replaced["protein_id"].replace(old_ids, new_ids)
    return replaced


def _substitute_and_dedupe(data, old_ids, new_ids, group_column, excel_path, csv_path):
    """Substitute ids, drop duplicates per *group_column*, and export both results."""
    # Rows that carry an id about to be substituted are logged to the CSV.
    datas = data[data["protein_id"].isin(old_ids)]
    data = _replace_protein_ids(data, old_ids, new_ids)
    print("tamaño original: " + str(len(data)))
    dats = data.drop_duplicates(subset=[group_column, 'protein_id'], keep='first')
    print("Despues de tirar duplicados en id: " + str(len(dats)))
    dats = dats.drop_duplicates(subset=[group_column, 'protein_sequence'], keep='first')
    print("Despues de tirar duplicados en secuencia: " + str(len(dats)))
    dats.to_excel(excel_path, index=False, columns=data.columns)
    datas.to_csv(csv_path, index=False)
    return dats


def substitute_or_remove_prot_id(data, sub_rem):
    """Normalise protein ids using the table in ``nombres_sust.txt``.

    The table is read from the current working directory and is expected to
    have two columns. If the first header contains "Primary", the first column
    holds the ids to keep and the second the ids to replace; otherwise the
    roles are swapped.

    Args:
        data: DataFrame with a ``protein_id`` column. It is not modified.
        sub_rem: Mode selector.
            "s" - only substitute ids.
            "p" - keep rows whose id is already in the kept column, drop
                duplicates (whole row, then disease_id + protein_sequence),
                write ``data_principalpurge.xlsx`` and a CSV of dropped rows.
            "c" - substitute ids, drop duplicates per ``class_id``, write
                ``clases.xlsx`` and a CSV of rows that carried a replaced id.
            anything else (callers pass "r") - same as "c" but per
                ``disease_id``, writing ``data_x.xlsx``.

    Returns:
        The processed DataFrame.

    Side effects:
        Writes the files listed above. The CSV files go under ``resultados/``,
        which must already exist. The "c" and default modes increment the
        module-level ``globi`` counter used in the CSV names.
    """
    global globi
    print("inside the problem")
    columns = _read_substitution_columns("nombres_sust.txt")
    tables = list(columns.values())
    resub = 0 if re.search("Primary", list(columns)[0]) else 1
    keep_ids = tables[resub]
    replaced_ids = tables[(resub + 1) % 2]
    print((resub + 1) % 2)

    if sub_rem == "s":
        data = _replace_protein_ids(data, replaced_ids, keep_ids)
    elif sub_rem == "p":
        has_kept_id = data["protein_id"].isin(keep_ids)
        datas = data[~has_kept_id]
        data = data[has_kept_id].drop_duplicates(keep="first")
        did = data.copy()
        data = data.drop_duplicates(subset=['disease_id', 'protein_sequence'], keep="first")
        # Rows removed by the second de-duplication are added to the discard log.
        did = did[~did.isin(data).all(axis=1)]
        did = did.drop_duplicates()
        print(did)
        datas = pd.concat([datas, did], ignore_index=True)
        data.to_excel('data_principalpurge.xlsx', index=False, columns=data.columns)
        datas.to_csv('resultados/proteinasDescartadassp_' + str(globi) + '.csv', index=False)
    elif sub_rem == "c":
        data = _substitute_and_dedupe(
            data, replaced_ids, keep_ids, 'class_id', 'clases.xlsx',
            'resultados/clasesDescartadas_' + str(globi) + '.csv',
        )
        globi = globi + 1
    else:
        data = _substitute_and_dedupe(
            data, replaced_ids, keep_ids, 'disease_id', 'data_x.xlsx',
            'resultados/proteinasDescartadas_' + str(globi) + '.csv',
        )
        globi = globi + 1
    return data
def divide_by_class(data):
    """Split *data* into one Excel workbook per protein class.

    Reads the class table from ``lung_cancer_protein_class.xlsx``, normalises
    its protein ids with ``substitute_or_remove_prot_id(..., "c")`` and saves
    the result as ``lung_cancer_protein_class_2.xlsx``. Rows of *data* whose
    protein_id has no class go to ``proteinas_sin_clase.xlsx``; for each
    class_id the matching rows go to ``proteinasClase_<class_id>.xlsx``.

    Args:
        data: DataFrame with a ``protein_id`` column.

    Returns:
        *data*, unchanged.
    """
    print("inside the problem")
    cl = pd.read_excel("lung_cancer_protein_class.xlsx")
    cl = substitute_or_remove_prot_id(cl, "c")
    cl.to_excel("lung_cancer_protein_class_2.xlsx")

    unclassified = data[~data['protein_id'].isin(cl['protein_id'])]
    unclassified.to_excel("proteinas_sin_clase.xlsx")

    for class_id, members in cl.groupby('class_id'):
        in_class = data[data["protein_id"].isin(members['protein_id'])]
        # Formatting the key keeps non-string class ids (e.g. integers) from
        # raising TypeError while building the file name.
        in_class.to_excel(f'proteinasClase_{class_id}.xlsx', index=False, columns=data.columns)
    return data
def readData(archivoEntrada, enfermedad, archivoDescarte=None):
    """Load protein rows from a CSV, clean them, and return their sequences.

    The CSV is passed through ``substitute_or_remove_prot_id(..., "r")``. When
    *enfermedad* is non-empty and *archivoDescarte* is given, the discard
    workbook is cleaned the same way and every row whose protein_id also
    appears in it is removed. The removed rows are saved to ``drop_data.xlsx``
    and the remaining rows to ``<archivoEntrada>_PostDrop.xlsx``.

    Args:
        archivoEntrada: Path of the input CSV file.
        enfermedad: Disease id; an empty string skips the discard step. No
            filtering by this value is performed here.
        archivoDescarte: Optional path of an Excel file listing proteins to
            discard.

    Returns:
        The ``protein_sequence`` column of the remaining rows.
    """
    data = pd.read_csv(archivoEntrada)
    # Row counts are taken before cleaning so the reported differences are
    # meaningful (they were previously measured afterwards and always 0).
    raw_rows = len(data)
    data = substitute_or_remove_prot_id(data, "r")

    if enfermedad != '' and archivoDescarte is not None:
        dataB = pd.read_excel(archivoDescarte)
        print(len(data))
        raw_discard_rows = len(dataB)
        dataB = substitute_or_remove_prot_id(dataB, "r")
        print("proteinas descartadas post filtro, principal: " + str(raw_rows - len(data)))
        print("proteinas descartadas post filtro, comun alz: " + str(raw_discard_rows - len(dataB)))

        in_discard = data["protein_id"].isin(dataB["protein_id"])
        print("tamaño del descarte: " + str(data[in_discard].shape[0]))
        data[in_discard].to_excel("drop_data.xlsx")
        data = data[~in_discard]
        data.to_excel(archivoEntrada + "_PostDrop.xlsx")

    sequences = data["protein_sequence"]
    return sequences
def readOData(archivoEntrada, enfermedad):
    """Return the protein sequences stored in an Excel workbook.

    Args:
        archivoEntrada: Path of the Excel file to read.
        enfermedad: Disease id to keep; an empty string keeps every row.

    Returns:
        The ``protein_sequence`` column, restricted to rows whose
        ``disease_id`` equals *enfermedad* when one is given.
    """
    frame = pd.read_excel(archivoEntrada)
    if enfermedad == '':
        return frame["protein_sequence"]
    matches_disease = frame["disease_id"] == enfermedad
    selected = frame.loc[matches_disease]
    return selected["protein_sequence"]
def readDataClassDiv(archivoEntrada, enfermedad):
    """Read an Excel workbook, export its rows per class, and return the sequences.

    Args:
        archivoEntrada: Path of the Excel file to read.
        enfermedad: Disease id to keep; an empty string keeps every row.

    Returns:
        The ``protein_sequence`` column of the frame returned by
        ``divide_by_class``.
    """
    frame = pd.read_excel(archivoEntrada)
    if enfermedad != '':
        same_disease = frame["disease_id"] == enfermedad
        frame = frame.loc[same_disease]
    # divide_by_class writes the per-class workbooks as a side effect.
    frame = divide_by_class(frame)
    return frame["protein_sequence"]
def restructure_class(data, ArchivoSalida):
    """Collapse rows sharing protein_id, protein_sequence and disease_id.

    Every remaining column is aggregated into a Python list per group. The
    grouped frame is printed and written to *ArchivoSalida*.

    Args:
        data: DataFrame containing the three grouping columns.
        ArchivoSalida: Path of the Excel file to write.

    Returns:
        The grouped DataFrame. The three grouping columns form its index and
        the remaining cells hold lists.
    """
    grouped = data.groupby(['protein_id', 'protein_sequence', 'disease_id']).agg(list)
    print(grouped)
    # Export only: list cells are turned into their text form because the
    # openpyxl writer rejects list values. The returned frame keeps the lists.
    grouped.astype(str).to_excel(ArchivoSalida)
    return grouped
def readDataRestructure(archivoEntrada, enfermedad, archivoSalida):
    """Read an Excel workbook, clean and regroup it, and return the sequences.

    The rows are passed through ``substitute_or_remove_prot_id(..., "r")``,
    optionally filtered by disease, and grouped by ``restructure_class``,
    which also writes *archivoSalida*. The number of distinct protein ids is
    printed before and after the cleaning step.

    Args:
        archivoEntrada: Path of the Excel file to read.
        enfermedad: Disease id to keep; an empty string keeps every row.
        archivoSalida: Path of the Excel file written by ``restructure_class``.

    Returns:
        A Series with one ``protein_sequence`` value per group.
    """
    data = pd.read_excel(archivoEntrada)
    print(len(data["protein_id"].unique()))
    data = substitute_or_remove_prot_id(data, "r")
    print(len(data["protein_id"].unique()))
    if enfermedad != '':
        data = data.loc[data["disease_id"] == enfermedad]
    data = restructure_class(data, archivoSalida)
    # The grouping keys, protein_sequence included, sit in the index of the
    # grouped frame; bring them back as columns before selecting, otherwise
    # the lookup raises KeyError.
    sequences = data.reset_index()["protein_sequence"]
    return sequences
if __name__ == '__main__':
    # NOTE(review): the assignment of `data` was commented out with invalid
    # syntax, which left `data` undefined below. It is restored here with the
    # two required readData arguments - confirm this is the intended input.
    data = readData('protein_lung_cancer_C0007131.csv', 'C0007131')
    data2 = readDataRestructure('treatment_lung_cancer.xlsx', 'C0007131', 'data_lung_cancer_treatment.xlsx')
    datl = data.to_list()
    # `in` on a Series tests index labels, so compare against the values.
    treatment_sequences = set(data2)

    # du: entries of datl absent from data2.
    # get_index_to_delete: positions in datl of the entries present in data2.
    du = []
    get_index_to_delete = []
    for u, sequence in enumerate(datl):
        if sequence not in treatment_sequences:
            du.append(sequence)
        else:
            get_index_to_delete.append(u)

    # Load the two-column substitution table, one list per header name.
    with open("nombres_sust.txt") as prottosubs:
        accept = prottosubs.readline().split()
        listtosubs = {name: [] for name in accept}
        column_names = list(listtosubs)
        for line in prottosubs:
            for i, value in enumerate(line.split()):
                listtosubs[column_names[i]].append(value)

    # resub selects the lookup column: the first one when its header contains
    # "Primary", otherwise the second.
    resub = 0 if re.search("Primary", column_names[0]) else 1
    lookup_column = list(listtosubs.values())[resub]
    partner_column = list(listtosubs.values())[(resub + 1) % 2]

    # NOTE(review): list.index raises ValueError for any entry of du missing
    # from the lookup column. du is built from protein sequences while the same
    # table is matched against protein_id elsewhere in this file - confirm the
    # intended lookup key.
    dia = []
    for y in du:
        dia.append(partner_column[lookup_column.index(y)])
import pandas as pd import pandas as pd
import Levenshtein
import time import time
from sklearn.cluster import OPTICS,DBSCAN,AgglomerativeClustering,BisectingKMeans,SpectralClustering
from sklearn.preprocessing import StandardScaler
import numpy as np import numpy as np
from scipy.spatial.distance import pdist, squareform
from pyclustering.cluster.dbscan import dbscan
from pyclustering.utils import timedcall
from Levenshtein import distance
import re import re
from minineedle import needle, smith, core
from Bio.Blast.Applications import NcbiblastpCommandline
from io import StringIO
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import swalign
import multiprocessing as mp import multiprocessing as mp
globi=0 globi=0
df_b=None df_b=None
...@@ -61,7 +46,7 @@ def substitute_or_remove_prot_id(data,sub_rem): ...@@ -61,7 +46,7 @@ def substitute_or_remove_prot_id(data,sub_rem):
#print(pd.concat([did,did2]).drop_duplicates(keep=False)) #print(pd.concat([did,did2]).drop_duplicates(keep=False))
print(did) print(did)
datas=pd.concat([datas, did], ignore_index=True) datas=pd.concat([datas, did], ignore_index=True)
data.to_excel('data_nervous_genes_principalpurge.xlsx',index=False,columns=data.columns) data.to_excel('data_principalpurge.xlsx',index=False,columns=data.columns)
datas.to_csv('resultados/proteinasDescartadassp_'+ str(globi) +'.csv', index=False) datas.to_csv('resultados/proteinasDescartadassp_'+ str(globi) +'.csv', index=False)
elif(sub_rem == "c"): elif(sub_rem == "c"):
datas= data[data["protein_id"].isin(list(listtosubs.values())[(resub+1)%2])==True] datas= data[data["protein_id"].isin(list(listtosubs.values())[(resub+1)%2])==True]
...@@ -86,7 +71,7 @@ def substitute_or_remove_prot_id(data,sub_rem): ...@@ -86,7 +71,7 @@ def substitute_or_remove_prot_id(data,sub_rem):
print("Despues de tirar duplicados en id: "+str(len(dats))) print("Despues de tirar duplicados en id: "+str(len(dats)))
dats=dats.drop_duplicates(subset=['disease_id','protein_sequence'],keep='first',inplace=False) dats=dats.drop_duplicates(subset=['disease_id','protein_sequence'],keep='first',inplace=False)
print("Despues de tirar duplicados en secuencia: "+str(len(dats))) print("Despues de tirar duplicados en secuencia: "+str(len(dats)))
dats.to_excel('data_nervous_genes_x.xlsx',index=False,columns=data.columns) dats.to_excel('data_x.xlsx',index=False,columns=data.columns)
datas.to_csv('resultados/proteinasDescartadas_'+ str(globi) +'.csv', index=False) datas.to_csv('resultados/proteinasDescartadas_'+ str(globi) +'.csv', index=False)
#pd_diff=pd.concat([data,dats]).drop_duplicates(keep=False) #pd_diff=pd.concat([data,dats]).drop_duplicates(keep=False)
#pd_diff.to_excel('data_not_valid.xlsx') #pd_diff.to_excel('data_not_valid.xlsx')
...@@ -98,12 +83,14 @@ def substitute_or_remove_prot_id(data,sub_rem): ...@@ -98,12 +83,14 @@ def substitute_or_remove_prot_id(data,sub_rem):
def divide_by_class(data): def divide_by_class(data):
print("inside the problem") print("inside the problem")
cl=pd.read_excel("alzheimer_protein_class 1.xlsx") cl=pd.read_excel("lung_cancer_protein_class.xlsx")
cl=substitute_or_remove_prot_id(cl,"c") cl=substitute_or_remove_prot_id(cl,"c")
cl.to_excel("alzheimer_protein_class 2.xlsx") cl.to_excel("lung_cancer_protein_class_2.xlsx")
#data2=data.copy() #data2=data.copy()
cli=cl.groupby('class_id') cli=cl.groupby('class_id')
di=[] di=[]
dd=data[~(data['protein_id'].isin(cl['protein_id']))]
dd.to_excel("proteinas_sin_clase.xlsx")
for k,v in cli: for k,v in cli:
for index,row in v.iterrows(): for index,row in v.iterrows():
...@@ -116,10 +103,44 @@ def divide_by_class(data): ...@@ -116,10 +103,44 @@ def divide_by_class(data):
return data return data
def readData(archivoEntrada, enfermedad): def readData(archivoEntrada, enfermedad,archivoDescarte=None):
data = pd.read_excel(archivoEntrada) data = pd.read_csv(archivoEntrada)
dataor=data.copy()
#data.to_excel('data_nervous_genes_2.xlsx')
data=substitute_or_remove_prot_id(data,"r")
#data.to_excel("data_nervous_genes_x.xlsx")
if (enfermedad != ''):
#datar=substitute_or_remove_prot_id(data,"r")
#sprint("numero de filas de proteinas descartadas totales principal : "+ str(len(data)-len(datar)))
#data = data.loc[data["disease_id"] == enfermedad]
if(archivoDescarte != None):
dataB = pd.read_excel(archivoDescarte)
print(len(data))
#data=substitute_or_remove_prot_id(data,"r")
dataB=substitute_or_remove_prot_id(dataB,"r")
#dataB.to_excel("data_nervous_genes_xf2.xlsx")
#data.to_excel('data_nervous_genes_2.xlsx') #data.to_excel('data_nervous_genes_2.xlsx')
filt_data=len(data)
alz_filt_data=len(dataB)
print("proteinas descartadas post filtro, principal: " + str(filt_data-len(data)))
print("proteinas descartadas post filtro, comun alz: " + str(alz_filt_data-len(dataB)))
print("tamaño del descarte: "+ str(data[data["protein_id"].isin(dataB["protein_id"])].shape[0]))
datad=data[(data['protein_id'].isin(dataB['protein_id']))]
datad.to_excel("drop_data.xlsx")
data.drop(data[data["protein_id"].isin(dataB["protein_id"])].index,inplace = True)
data.to_excel(archivoEntrada+"_PostDrop.xlsx")
#data=substitute_or_remove_prot_id(data,"r")
sequences = data["protein_sequence"]
return sequences
def readOData(archivoEntrada, enfermedad):
data = pd.read_excel(archivoEntrada)
#data=substitute_or_remove_prot_id(data,"r")
if (enfermedad != ''): if (enfermedad != ''):
#datar=substitute_or_remove_prot_id(data,"r") #datar=substitute_or_remove_prot_id(data,"r")
...@@ -131,7 +152,7 @@ def readData(archivoEntrada, enfermedad): ...@@ -131,7 +152,7 @@ def readData(archivoEntrada, enfermedad):
#dataB = pd.read_excel("proteinas_en_comun_Alzheimer.xlsx") #dataB = pd.read_excel("proteinas_en_comun_Alzheimer.xlsx")
print(len(data))
#data=substitute_or_remove_prot_id(data,"r") #data=substitute_or_remove_prot_id(data,"r")
#dataB=substitute_or_remove_prot_id(dataB,"r") #dataB=substitute_or_remove_prot_id(dataB,"r")
#dataB.to_excel("proteinas_en_comun_Alzeheimer2.xlsx") #dataB.to_excel("proteinas_en_comun_Alzeheimer2.xlsx")
...@@ -143,11 +164,12 @@ def readData(archivoEntrada, enfermedad): ...@@ -143,11 +164,12 @@ def readData(archivoEntrada, enfermedad):
#data = data[~((data["disease_id"] == enfermedad) & #data = data[~((data["disease_id"] == enfermedad) &
# (data["protein_id"].isin(dataB["protein_id"])) & # (data["protein_id"].isin(dataB["protein_id"])) &
# (data["gene_id"].isin(dataB["gene_id"])))] # (data["gene_id"].isin(dataB["gene_id"])))]
data=substitute_or_remove_prot_id(data,"r")
sequences = data["protein_sequence"] sequences = data["protein_sequence"]
return sequences return sequences
def readOData(archivoEntrada, enfermedad):
def readDataClassDiv(archivoEntrada, enfermedad):
data = pd.read_excel(archivoEntrada) data = pd.read_excel(archivoEntrada)
#data=substitute_or_remove_prot_id(data,"r") #data=substitute_or_remove_prot_id(data,"r")
...@@ -174,14 +196,21 @@ def readOData(archivoEntrada, enfermedad): ...@@ -174,14 +196,21 @@ def readOData(archivoEntrada, enfermedad):
# (data["protein_id"].isin(dataB["protein_id"])) & # (data["protein_id"].isin(dataB["protein_id"])) &
# (data["gene_id"].isin(dataB["gene_id"])))] # (data["gene_id"].isin(dataB["gene_id"])))]
data=divide_by_class(data)
sequences = data["protein_sequence"] sequences = data["protein_sequence"]
return sequences return sequences
def restructure_class(data,ArchivoSalida):
def readCData(archivoEntrada, enfermedad): data=data.groupby(['protein_id','protein_sequence','disease_id']).agg(list)
print(data)
#data.drop_duplicates(subset=['protein_id','protein_sequence'],keep='first',inplace=True)
data.to_excel(ArchivoSalida)
return data
def readDataRestructure(archivoEntrada, enfermedad,archivoSalida):
data = pd.read_excel(archivoEntrada) data = pd.read_excel(archivoEntrada)
#data=substitute_or_remove_prot_id(data,"r") print(len(data["protein_id"].unique()))
data=substitute_or_remove_prot_id(data,"r")
print(len(data["protein_id"].unique()))
if (enfermedad != ''): if (enfermedad != ''):
#datar=substitute_or_remove_prot_id(data,"r") #datar=substitute_or_remove_prot_id(data,"r")
#sprint("numero de filas de proteinas descartadas totales principal : "+ str(len(data)-len(datar))) #sprint("numero de filas de proteinas descartadas totales principal : "+ str(len(data)-len(datar)))
...@@ -205,16 +234,15 @@ def readCData(archivoEntrada, enfermedad): ...@@ -205,16 +234,15 @@ def readCData(archivoEntrada, enfermedad):
# (data["protein_id"].isin(dataB["protein_id"])) & # (data["protein_id"].isin(dataB["protein_id"])) &
# (data["gene_id"].isin(dataB["gene_id"])))] # (data["gene_id"].isin(dataB["gene_id"])))]
data=divide_by_class(data) data=restructure_class(data,archivoSalida)
sequences = data["protein_sequence"] sequences = data["protein_sequence"]
return sequences return sequences
if __name__=='__main__': if __name__=='__main__':
#data=readData('data_nervous_genes_1.xlsx','C0002395') #data=readData('protein_lung_cancer_C0007131.csv',,'C0007131',)
data2 = readCData('data_nervous_genes_xf.xlsx','C0002395') data2 = readDataRestructure('treatment_lung_cancer.xlsx','C0007131','data_lung_cancer_treatment.xlsx')
data2=data2.to_list() #data2=data2.to_list()
datl=data.to_list() datl=data.to_list()
#print(len(datl)) #print(len(datl))
du=[] du=[]
...@@ -228,9 +256,9 @@ if __name__=='__main__': ...@@ -228,9 +256,9 @@ if __name__=='__main__':
#print(str(u)+" Este no deberia estar: "+str(datl[u])) #print(str(u)+" Este no deberia estar: "+str(datl[u]))
with open("nombres_sust.txt") as prottosubs: with open("nombres_sust.txt") as prottosubs:
index=prottosubs.readline() index=prottosubs.readline()
acept=index.split() accept=index.split()
listtosubs={} listtosubs={}
for i in range(0,len(acept)): for i in range(0,len(accept)):
listtosubs[acept[i]]=[] listtosubs[acept[i]]=[]
while line := prottosubs.readline(): while line := prottosubs.readline():
newline=line.split() newline=line.split()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment