Commit ce11ea4d authored by Rafael Artinano

removed incorrect file

parent b4953dc3
This diff is collapsed.
import pandas as pd
import Levenshtein
import time
from sklearn.cluster import OPTICS,DBSCAN,AgglomerativeClustering,BisectingKMeans,SpectralClustering
from sklearn.preprocessing import StandardScaler
import numpy as np
from scipy.spatial.distance import pdist, squareform
from pyclustering.cluster.dbscan import dbscan
from pyclustering.utils import timedcall
from Levenshtein import distance
import re
from minineedle import needle, smith, core
from Bio.Blast.Applications import NcbiblastpCommandline
from io import StringIO
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import swalign
import multiprocessing as mp
globi=0
df_b=None
......@@ -61,7 +46,7 @@ def substitute_or_remove_prot_id(data,sub_rem):
#print(pd.concat([did,did2]).drop_duplicates(keep=False))
print(did)
datas=pd.concat([datas, did], ignore_index=True)
data.to_excel('data_nervous_genes_principalpurge.xlsx',index=False,columns=data.columns)
data.to_excel('data_principalpurge.xlsx',index=False,columns=data.columns)
datas.to_csv('resultados/proteinasDescartadassp_'+ str(globi) +'.csv', index=False)
elif(sub_rem == "c"):
datas= data[data["protein_id"].isin(list(listtosubs.values())[(resub+1)%2])==True]
......@@ -86,7 +71,7 @@ def substitute_or_remove_prot_id(data,sub_rem):
print("Despues de tirar duplicados en id: "+str(len(dats)))
dats=dats.drop_duplicates(subset=['disease_id','protein_sequence'],keep='first',inplace=False)
print("Despues de tirar duplicados en secuencia: "+str(len(dats)))
dats.to_excel('data_nervous_genes_x.xlsx',index=False,columns=data.columns)
dats.to_excel('data_x.xlsx',index=False,columns=data.columns)
datas.to_csv('resultados/proteinasDescartadas_'+ str(globi) +'.csv', index=False)
#pd_diff=pd.concat([data,dats]).drop_duplicates(keep=False)
#pd_diff.to_excel('data_not_valid.xlsx')
......@@ -98,12 +83,14 @@ def substitute_or_remove_prot_id(data,sub_rem):
def divide_by_class(data):
print("inside the problem")
cl=pd.read_excel("alzheimer_protein_class 1.xlsx")
cl=pd.read_excel("lung_cancer_protein_class.xlsx")
cl=substitute_or_remove_prot_id(cl,"c")
cl.to_excel("alzheimer_protein_class 2.xlsx")
cl.to_excel("lung_cancer_protein_class_2.xlsx")
#data2=data.copy()
cli=cl.groupby('class_id')
di=[]
dd=data[~(data['protein_id'].isin(cl['protein_id']))]
dd.to_excel("proteinas_sin_clase.xlsx")
for k,v in cli:
for index,row in v.iterrows():
......@@ -116,10 +103,44 @@ def divide_by_class(data):
return data
def readData(archivoEntrada, enfermedad):
data = pd.read_excel(archivoEntrada)
def readData(archivoEntrada, enfermedad,archivoDescarte=None):
    """Load protein records from a CSV file and return their sequences.

    The table is passed through ``substitute_or_remove_prot_id(data, "r")``.
    When ``enfermedad`` is non-empty and ``archivoDescarte`` is given, every
    row whose ``protein_id`` also appears in the (equally filtered) discard
    workbook is removed; the removed rows are exported to ``drop_data.xlsx``
    and the remaining table to ``<archivoEntrada>_PostDrop.xlsx``.

    Args:
        archivoEntrada: Path of the CSV file with the main protein table.
        enfermedad: Disease identifier. It is only checked for being
            non-empty; no filtering by disease is applied here.
        archivoDescarte: Optional path of an Excel workbook listing the
            proteins to discard.

    Returns:
        The ``protein_sequence`` column of the resulting table.
    """
    data = pd.read_csv(archivoEntrada)
    # Remember the unfiltered size so the "discarded" report below is
    # meaningful (measuring after the filter always yields 0).
    raw_rows = len(data)
    data = substitute_or_remove_prot_id(data, "r")

    if enfermedad != '' and archivoDescarte is not None:
        dataB = pd.read_excel(archivoDescarte)
        print(len(data))
        raw_rows_b = len(dataB)
        dataB = substitute_or_remove_prot_id(dataB, "r")
        print("proteinas descartadas post filtro, principal: " + str(raw_rows - len(data)))
        print("proteinas descartadas post filtro, comun alz: " + str(raw_rows_b - len(dataB)))

        # Rows of the main table whose protein also appears in the discard list.
        in_discard = data["protein_id"].isin(dataB["protein_id"])
        print("tamaño del descarte: " + str(int(in_discard.sum())))
        data[in_discard].to_excel("drop_data.xlsx")

        # Keep the complement through a mask instead of an in-place drop.
        data = data[~in_discard]
        # NOTE(review): the indentation of this export was lost in the
        # source; it is assumed to belong to the discard branch - confirm.
        data.to_excel(archivoEntrada + "_PostDrop.xlsx")

    return data["protein_sequence"]
def readOData(archivoEntrada, enfermedad):
data = pd.read_excel(archivoEntrada)
#data=substitute_or_remove_prot_id(data,"r")
if (enfermedad != ''):
#datar=substitute_or_remove_prot_id(data,"r")
......@@ -131,7 +152,7 @@ def readData(archivoEntrada, enfermedad):
#dataB = pd.read_excel("proteinas_en_comun_Alzheimer.xlsx")
print(len(data))
#data=substitute_or_remove_prot_id(data,"r")
#dataB=substitute_or_remove_prot_id(dataB,"r")
#dataB.to_excel("proteinas_en_comun_Alzeheimer2.xlsx")
......@@ -143,11 +164,12 @@ def readData(archivoEntrada, enfermedad):
#data = data[~((data["disease_id"] == enfermedad) &
# (data["protein_id"].isin(dataB["protein_id"])) &
# (data["gene_id"].isin(dataB["gene_id"])))]
data=substitute_or_remove_prot_id(data,"r")
sequences = data["protein_sequence"]
return sequences
def readOData(archivoEntrada, enfermedad):
def readDataClassDiv(archivoEntrada, enfermedad):
data = pd.read_excel(archivoEntrada)
#data=substitute_or_remove_prot_id(data,"r")
......@@ -174,14 +196,21 @@ def readOData(archivoEntrada, enfermedad):
# (data["protein_id"].isin(dataB["protein_id"])) &
# (data["gene_id"].isin(dataB["gene_id"])))]
data=divide_by_class(data)
sequences = data["protein_sequence"]
return sequences
def readCData(archivoEntrada, enfermedad):
def restructure_class(data,ArchivoSalida):
    """Collapse rows that share protein, sequence and disease into one row.

    Rows are grouped by ``protein_id``, ``protein_sequence`` and
    ``disease_id``; every remaining column is aggregated into a list per
    group. The grouped table is printed and written to ``ArchivoSalida``
    with the three grouping keys as its row index.

    Args:
        data: DataFrame containing at least the three grouping columns.
        ArchivoSalida: Path of the Excel workbook to write.

    Returns:
        The grouped DataFrame with the grouping keys restored as ordinary
        columns, so that ``result["protein_sequence"]`` is available.
    """
    grouped = data.groupby(['protein_id', 'protein_sequence', 'disease_id']).agg(list)
    print(grouped)
    grouped.to_excel(ArchivoSalida)
    # groupby() moves the keys into the index; callers select
    # "protein_sequence" as a column, which would raise KeyError otherwise.
    return grouped.reset_index()
def readDataRestructure(archivoEntrada, enfermedad,archivoSalida):
data = pd.read_excel(archivoEntrada)
#data=substitute_or_remove_prot_id(data,"r")
print(len(data["protein_id"].unique()))
data=substitute_or_remove_prot_id(data,"r")
print(len(data["protein_id"].unique()))
if (enfermedad != ''):
#datar=substitute_or_remove_prot_id(data,"r")
#sprint("numero de filas de proteinas descartadas totales principal : "+ str(len(data)-len(datar)))
......@@ -205,16 +234,15 @@ def readCData(archivoEntrada, enfermedad):
# (data["protein_id"].isin(dataB["protein_id"])) &
# (data["gene_id"].isin(dataB["gene_id"])))]
data=divide_by_class(data)
data=restructure_class(data,archivoSalida)
sequences = data["protein_sequence"]
return sequences
if __name__=='__main__':
#data=readData('data_nervous_genes_1.xlsx','C0002395')
data2 = readCData('data_nervous_genes_xf.xlsx','C0002395')
data2=data2.to_list()
#data=readData('protein_lung_cancer_C0007131.csv',,'C0007131',)
data2 = readDataRestructure('treatment_lung_cancer.xlsx','C0007131','data_lung_cancer_treatment.xlsx')
#data2=data2.to_list()
datl=data.to_list()
#print(len(datl))
du=[]
......@@ -228,9 +256,9 @@ if __name__=='__main__':
#print(str(u)+" Este no deberia estar: "+str(datl[u]))
with open("nombres_sust.txt") as prottosubs:
index=prottosubs.readline()
acept=index.split()
accept=index.split()
listtosubs={}
for i in range(0,len(acept)):
for i in range(0,len(accept)):
listtosubs[acept[i]]=[]
while line := prottosubs.readline():
newline=line.split()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment