Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Sign in
Toggle navigation
P
ProteinsPatterns
Project overview
Project overview
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Rafael Artinano
ProteinsPatterns
Commits
ce11ea4d
Commit
ce11ea4d
authored
Jan 23, 2024
by
Rafael Artinano
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
removed incorrect file
parent
b4953dc3
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
64 additions
and
312 deletions
+64
-312
TFM-main/src/generate_tha_excel.py
TFM-main/src/generate_tha_excel.py
+0
-276
TFM-main/src/generate_the_excel.py
TFM-main/src/generate_the_excel.py
+64
-36
No files found.
TFM-main/src/generate_tha_excel.py
deleted
100644 → 0
View file @
b4953dc3
import multiprocessing as mp
import os
import re
import time

import numpy as np
import pandas as pd
# Running counter appended to the file names of the "discarded rows" CSV
# reports written by substitute_or_remove_prot_id, so successive calls in one
# run do not overwrite each other's output.
globi = 0
# Module-level placeholder; not referenced by the functions shown in this file.
df_b = None
def _load_substitution_table(path="nombres_sust.txt"):
    """Read the whitespace-separated ID table into ``{header: [values, ...]}``.

    The first line supplies the column headers; every following line
    contributes one token per column, in header order.
    """
    with open(path) as handle:
        table = {name: [] for name in handle.readline().split()}
        columns = list(table)
        for line in handle:
            for position, token in enumerate(line.split()):
                table[columns[position]].append(token.strip())
    return table


def _replace_ids(data, old_ids, new_ids):
    """Return a copy of ``data`` whose ``protein_id`` maps old_ids -> new_ids."""
    return data.assign(protein_id=data["protein_id"].replace(old_ids, new_ids))


def _substitute_and_dedupe(data, old_ids, new_ids, group_column,
                           excel_path, csv_prefix):
    """Substitute IDs, drop duplicates per ``group_column`` and write reports."""
    global globi
    # Rows that carry an ID about to be substituted are logged to the CSV.
    datas = data[data["protein_id"].isin(old_ids)]
    data = _replace_ids(data, old_ids, new_ids)
    print("tamaño original: " + str(len(data)))
    dats = data.drop_duplicates(subset=[group_column, 'protein_id'],
                                keep='first')
    print("Despues de tirar duplicados en id: " + str(len(dats)))
    dats = dats.drop_duplicates(subset=[group_column, 'protein_sequence'],
                                keep='first')
    print("Despues de tirar duplicados en secuencia: " + str(len(dats)))
    dats.to_excel(excel_path, index=False, columns=data.columns)
    os.makedirs("resultados", exist_ok=True)
    datas.to_csv(csv_prefix + str(globi) + '.csv', index=False)
    globi = globi + 1
    return dats


def substitute_or_remove_prot_id(data, sub_rem):
    """Normalise the ``protein_id`` column of ``data`` using nombres_sust.txt.

    The table file has two columns of IDs. If the first header contains
    "Primary", the first column holds the IDs that are kept and the second the
    IDs that get replaced; otherwise the roles are swapped.

    Args:
        data: DataFrame with a ``protein_id`` column (other columns are needed
            depending on the mode, see below). It is not modified.
        sub_rem: Mode selector.
            "s" - only substitute IDs.
            "p" - keep only rows whose ID is in the kept column, drop
                  duplicates (needs ``disease_id``/``protein_sequence``) and
                  write data_principalpurge.xlsx plus a CSV of discarded rows.
            "c" - substitute, then de-duplicate per ``class_id``; writes
                  clases.xlsx and a CSV report.
            anything else - substitute, then de-duplicate per ``disease_id``;
                  writes data_x.xlsx and a CSV report.

    Returns:
        The processed DataFrame.
    """
    global globi
    print("inside the problem")
    table = _load_substitution_table()
    columns = list(table.values())
    resub = 0 if re.search("Primary", list(table)[0]) else 1
    print((resub + 1) % 2)
    kept_ids = columns[resub]
    replaced_ids = columns[(resub + 1) % 2]

    if sub_rem == "s":
        # Series.replace returns a new Series; the result has to be stored,
        # otherwise the substitution is silently lost.
        return _replace_ids(data, replaced_ids, kept_ids)

    if sub_rem == "p":
        in_table = data["protein_id"].isin(kept_ids)
        datas = data[~in_table]
        data = data[in_table].drop_duplicates(keep="first")
        did = data.copy()
        data = data.drop_duplicates(subset=['disease_id', 'protein_sequence'],
                                    keep="first")
        # Rows removed by the second de-duplication (matched by index label).
        did = did[~did.isin(data).all(axis=1)]
        did = did.drop_duplicates()
        print(did)
        datas = pd.concat([datas, did], ignore_index=True)
        data.to_excel('data_principalpurge.xlsx', index=False,
                      columns=data.columns)
        os.makedirs("resultados", exist_ok=True)
        datas.to_csv('resultados/proteinasDescartadassp_' + str(globi) + '.csv',
                     index=False)
        return data

    if sub_rem == "c":
        return _substitute_and_dedupe(
            data, replaced_ids, kept_ids, 'class_id',
            'clases.xlsx', 'resultados/clasesDescartadas_')

    return _substitute_and_dedupe(
        data, replaced_ids, kept_ids, 'disease_id',
        'data_x.xlsx', 'resultados/proteinasDescartadas_')
def divide_by_class(data):
    """Split ``data`` into one Excel workbook per protein class.

    Reads lung_cancer_protein_class.xlsx, normalises it with
    ``substitute_or_remove_prot_id(..., "c")`` and writes:
      * lung_cancer_protein_class_2.xlsx - the normalised class table,
      * proteinas_sin_clase.xlsx - rows of ``data`` with no class entry,
      * proteinasClase_<class_id>.xlsx - rows of ``data`` for each class.

    Args:
        data: DataFrame with a ``protein_id`` column.

    Returns:
        ``data`` itself, unchanged.
    """
    print("inside the problem")
    cl = pd.read_excel("lung_cancer_protein_class.xlsx")
    cl = substitute_or_remove_prot_id(cl, "c")
    cl.to_excel("lung_cancer_protein_class_2.xlsx")

    unclassified = data[~data['protein_id'].isin(cl['protein_id'])]
    unclassified.to_excel("proteinas_sin_clase.xlsx")

    for class_id, members in cl.groupby('class_id'):
        in_class = data[data["protein_id"].isin(members['protein_id'])]
        # str() so numeric class ids do not raise TypeError on concatenation.
        in_class.to_excel('proteinasClase_' + str(class_id) + '.xlsx',
                          index=False, columns=data.columns)
    return data
def readData(archivoEntrada, enfermedad, archivoDescarte=None):
    """Load protein rows from a CSV and return their sequences.

    The rows are normalised with ``substitute_or_remove_prot_id(..., "r")``.
    When ``enfermedad`` is non-empty and ``archivoDescarte`` is given, every
    row whose ``protein_id`` also appears in that Excel file is removed; the
    removed rows go to drop_data.xlsx and the remainder to
    ``<archivoEntrada>_PostDrop.xlsx``.

    Args:
        archivoEntrada: Path of the input CSV.
        enfermedad: Disease id; only tested for being non-empty here.
        archivoDescarte: Optional path of an Excel file listing proteins to
            discard.

    Returns:
        The ``protein_sequence`` column of the remaining rows.
    """
    data = pd.read_csv(archivoEntrada)
    # Row counts must be captured BEFORE filtering, otherwise the
    # "descartadas" figures printed below are always zero.
    rows_before = len(data)
    data = substitute_or_remove_prot_id(data, "r")

    if enfermedad != '' and archivoDescarte is not None:
        dataB = pd.read_excel(archivoDescarte)
        print(len(data))
        discard_rows_before = len(dataB)
        dataB = substitute_or_remove_prot_id(dataB, "r")
        print("proteinas descartadas post filtro, principal: "
              + str(rows_before - len(data)))
        print("proteinas descartadas post filtro, comun alz: "
              + str(discard_rows_before - len(dataB)))

        shared = data["protein_id"].isin(dataB["protein_id"])
        print("tamaño del descarte: " + str(data[shared].shape[0]))
        data[shared].to_excel("drop_data.xlsx")
        data = data[~shared]
        # NOTE(review): the scraped source lost its indentation; this write is
        # assumed to belong to the discard branch - confirm against the repo.
        data.to_excel(archivoEntrada + "_PostDrop.xlsx")

    return data["protein_sequence"]
def readOData(archivoEntrada, enfermedad):
    """Return the protein sequences stored in an Excel file.

    Args:
        archivoEntrada: Path of the input Excel workbook.
        enfermedad: Disease id to keep (matched against ``disease_id``);
            an empty string disables the filter.

    Returns:
        The ``protein_sequence`` column of the selected rows.
    """
    data = pd.read_excel(archivoEntrada)
    if enfermedad != '':
        data = data.loc[data["disease_id"] == enfermedad]
    return data["protein_sequence"]
def readDataClassDiv(archivoEntrada, enfermedad):
    """Load an Excel file, write per-class workbooks and return the sequences.

    Args:
        archivoEntrada: Path of the input Excel workbook.
        enfermedad: Disease id to keep (matched against ``disease_id``);
            an empty string disables the filter.

    Returns:
        The ``protein_sequence`` column of the selected rows.
    """
    data = pd.read_excel(archivoEntrada)
    if enfermedad != '':
        data = data.loc[data["disease_id"] == enfermedad]
    # divide_by_class only writes files; it hands the frame back unchanged.
    data = divide_by_class(data)
    return data["protein_sequence"]
def restructure_class(data, ArchivoSalida):
    """Collapse rows to one per (protein_id, protein_sequence, disease_id).

    Every remaining column is aggregated into a list of its values per group.
    The grouped frame is printed and written to ``ArchivoSalida``.

    Args:
        data: DataFrame containing the three grouping columns.
        ArchivoSalida: Path of the Excel file to write.

    Returns:
        The grouped DataFrame. The grouping columns form its index, so they
        are no longer available as regular columns.
    """
    grouped = data.groupby(
        ['protein_id', 'protein_sequence', 'disease_id']).agg(list)
    print(grouped)
    grouped.to_excel(ArchivoSalida)
    return grouped
def readDataRestructure(archivoEntrada, enfermedad, archivoSalida):
    """Load an Excel file, normalise IDs, regroup it and return the sequences.

    Prints the number of distinct ``protein_id`` values before and after
    ``substitute_or_remove_prot_id(..., "r")``, optionally filters by disease
    and then writes the regrouped table through ``restructure_class``.

    Args:
        archivoEntrada: Path of the input Excel workbook.
        enfermedad: Disease id to keep (matched against ``disease_id``);
            an empty string disables the filter.
        archivoSalida: Path of the Excel file written by restructure_class.

    Returns:
        The ``protein_sequence`` values of the regrouped rows.
    """
    data = pd.read_excel(archivoEntrada)
    print(len(data["protein_id"].unique()))
    data = substitute_or_remove_prot_id(data, "r")
    print(len(data["protein_id"].unique()))
    if enfermedad != '':
        data = data.loc[data["disease_id"] == enfermedad]
    data = restructure_class(data, archivoSalida)
    # restructure_class moves protein_sequence into the index; reset_index
    # turns it back into a column so the lookup does not raise KeyError.
    return data.reset_index()["protein_sequence"]
if __name__ == '__main__':
    # NOTE(review): the original readData call was commented out (and
    # malformed), leaving `data` undefined below. The arguments here are
    # reconstructed from that comment - confirm they are the intended ones.
    data = readData('protein_lung_cancer_C0007131.csv', 'C0007131')
    data2 = readDataRestructure('treatment_lung_cancer.xlsx', 'C0007131',
                                'data_lung_cancer_treatment.xlsx')
    datl = data.to_list()
    # `x in Series` tests index labels, so compare against the values instead.
    known_sequences = set(data2.to_list())

    # du: sequences missing from data2; get_index_to_delete: positions of
    # the sequences that are present in both.
    du = []
    get_index_to_delete = []
    for u, sequence in enumerate(datl):
        if sequence in known_sequences:
            get_index_to_delete.append(u)
        else:
            du.append(sequence)

    # Rebuild the ID substitution table: header line, then one token per
    # column on each following line.
    with open("nombres_sust.txt") as prottosubs:
        accept = prottosubs.readline().split()
        listtosubs = {name: [] for name in accept}
        headers = list(listtosubs)
        for line in prottosubs:
            for i, token in enumerate(line.split()):
                listtosubs[headers[i]].append(token.strip())

    resub = 1
    if re.search("Primary", headers[0]):
        resub = 0

    id_columns = list(listtosubs.values())
    # NOTE(review): `du` holds protein sequences while the table holds ids, so
    # .index(y) raises ValueError when y is absent - verify the intended lookup.
    dia = []
    for y in du:
        dia.append(id_columns[(resub + 1) % 2][id_columns[resub].index(y)])
TFM-main/src/generate_the_excel.py
View file @
ce11ea4d
import
pandas
as
pd
import
pandas
as
pd
import
Levenshtein
import
time
import
time
from
sklearn.cluster
import
OPTICS
,
DBSCAN
,
AgglomerativeClustering
,
BisectingKMeans
,
SpectralClustering
from
sklearn.preprocessing
import
StandardScaler
import
numpy
as
np
import
numpy
as
np
from
scipy.spatial.distance
import
pdist
,
squareform
from
pyclustering.cluster.dbscan
import
dbscan
from
pyclustering.utils
import
timedcall
from
Levenshtein
import
distance
import
re
import
re
from
minineedle
import
needle
,
smith
,
core
from
Bio.Blast.Applications
import
NcbiblastpCommandline
from
io
import
StringIO
from
Bio.Blast
import
NCBIXML
from
Bio.Seq
import
Seq
from
Bio.SeqRecord
import
SeqRecord
from
Bio
import
SeqIO
import
swalign
import
multiprocessing
as
mp
import
multiprocessing
as
mp
globi
=
0
globi
=
0
df_b
=
None
df_b
=
None
...
@@ -61,7 +46,7 @@ def substitute_or_remove_prot_id(data,sub_rem):
...
@@ -61,7 +46,7 @@ def substitute_or_remove_prot_id(data,sub_rem):
#print(pd.concat([did,did2]).drop_duplicates(keep=False))
#print(pd.concat([did,did2]).drop_duplicates(keep=False))
print
(
did
)
print
(
did
)
datas
=
pd
.
concat
([
datas
,
did
],
ignore_index
=
True
)
datas
=
pd
.
concat
([
datas
,
did
],
ignore_index
=
True
)
data
.
to_excel
(
'data_
nervous_genes_
principalpurge.xlsx'
,
index
=
False
,
columns
=
data
.
columns
)
data
.
to_excel
(
'data_principalpurge.xlsx'
,
index
=
False
,
columns
=
data
.
columns
)
datas
.
to_csv
(
'resultados/proteinasDescartadassp_'
+
str
(
globi
)
+
'.csv'
,
index
=
False
)
datas
.
to_csv
(
'resultados/proteinasDescartadassp_'
+
str
(
globi
)
+
'.csv'
,
index
=
False
)
elif
(
sub_rem
==
"c"
):
elif
(
sub_rem
==
"c"
):
datas
=
data
[
data
[
"protein_id"
]
.
isin
(
list
(
listtosubs
.
values
())[(
resub
+
1
)
%
2
])
==
True
]
datas
=
data
[
data
[
"protein_id"
]
.
isin
(
list
(
listtosubs
.
values
())[(
resub
+
1
)
%
2
])
==
True
]
...
@@ -86,7 +71,7 @@ def substitute_or_remove_prot_id(data,sub_rem):
...
@@ -86,7 +71,7 @@ def substitute_or_remove_prot_id(data,sub_rem):
print
(
"Despues de tirar duplicados en id: "
+
str
(
len
(
dats
)))
print
(
"Despues de tirar duplicados en id: "
+
str
(
len
(
dats
)))
dats
=
dats
.
drop_duplicates
(
subset
=
[
'disease_id'
,
'protein_sequence'
],
keep
=
'first'
,
inplace
=
False
)
dats
=
dats
.
drop_duplicates
(
subset
=
[
'disease_id'
,
'protein_sequence'
],
keep
=
'first'
,
inplace
=
False
)
print
(
"Despues de tirar duplicados en secuencia: "
+
str
(
len
(
dats
)))
print
(
"Despues de tirar duplicados en secuencia: "
+
str
(
len
(
dats
)))
dats
.
to_excel
(
'data_
nervous_genes_
x.xlsx'
,
index
=
False
,
columns
=
data
.
columns
)
dats
.
to_excel
(
'data_x.xlsx'
,
index
=
False
,
columns
=
data
.
columns
)
datas
.
to_csv
(
'resultados/proteinasDescartadas_'
+
str
(
globi
)
+
'.csv'
,
index
=
False
)
datas
.
to_csv
(
'resultados/proteinasDescartadas_'
+
str
(
globi
)
+
'.csv'
,
index
=
False
)
#pd_diff=pd.concat([data,dats]).drop_duplicates(keep=False)
#pd_diff=pd.concat([data,dats]).drop_duplicates(keep=False)
#pd_diff.to_excel('data_not_valid.xlsx')
#pd_diff.to_excel('data_not_valid.xlsx')
...
@@ -98,12 +83,14 @@ def substitute_or_remove_prot_id(data,sub_rem):
...
@@ -98,12 +83,14 @@ def substitute_or_remove_prot_id(data,sub_rem):
def
divide_by_class
(
data
):
def
divide_by_class
(
data
):
print
(
"inside the problem"
)
print
(
"inside the problem"
)
cl
=
pd
.
read_excel
(
"
alzheimer_protein_class 1
.xlsx"
)
cl
=
pd
.
read_excel
(
"
lung_cancer_protein_class
.xlsx"
)
cl
=
substitute_or_remove_prot_id
(
cl
,
"c"
)
cl
=
substitute_or_remove_prot_id
(
cl
,
"c"
)
cl
.
to_excel
(
"
alzheimer_protein_class
2.xlsx"
)
cl
.
to_excel
(
"
lung_cancer_protein_class_
2.xlsx"
)
#data2=data.copy()
#data2=data.copy()
cli
=
cl
.
groupby
(
'class_id'
)
cli
=
cl
.
groupby
(
'class_id'
)
di
=
[]
di
=
[]
dd
=
data
[
~
(
data
[
'protein_id'
]
.
isin
(
cl
[
'protein_id'
]))]
dd
.
to_excel
(
"proteinas_sin_clase.xlsx"
)
for
k
,
v
in
cli
:
for
k
,
v
in
cli
:
for
index
,
row
in
v
.
iterrows
():
for
index
,
row
in
v
.
iterrows
():
...
@@ -116,10 +103,44 @@ def divide_by_class(data):
...
@@ -116,10 +103,44 @@ def divide_by_class(data):
return
data
return
data
def
readData
(
archivoEntrada
,
enfermedad
):
def
readData
(
archivoEntrada
,
enfermedad
,
archivoDescarte
=
None
):
data
=
pd
.
read_
excel
(
archivoEntrada
)
data
=
pd
.
read_
csv
(
archivoEntrada
)
dataor
=
data
.
copy
()
#data.to_excel('data_nervous_genes_2.xlsx')
#data.to_excel('data_nervous_genes_2.xlsx')
data
=
substitute_or_remove_prot_id
(
data
,
"r"
)
#data.to_excel("data_nervous_genes_x.xlsx")
if
(
enfermedad
!=
''
):
#datar=substitute_or_remove_prot_id(data,"r")
#sprint("numero de filas de proteinas descartadas totales principal : "+ str(len(data)-len(datar)))
#data = data.loc[data["disease_id"] == enfermedad]
if
(
archivoDescarte
!=
None
):
dataB
=
pd
.
read_excel
(
archivoDescarte
)
print
(
len
(
data
))
#data=substitute_or_remove_prot_id(data,"r")
dataB
=
substitute_or_remove_prot_id
(
dataB
,
"r"
)
#dataB.to_excel("data_nervous_genes_xf2.xlsx")
#data.to_excel('data_nervous_genes_2.xlsx')
filt_data
=
len
(
data
)
alz_filt_data
=
len
(
dataB
)
print
(
"proteinas descartadas post filtro, principal: "
+
str
(
filt_data
-
len
(
data
)))
print
(
"proteinas descartadas post filtro, comun alz: "
+
str
(
alz_filt_data
-
len
(
dataB
)))
print
(
"tamaño del descarte: "
+
str
(
data
[
data
[
"protein_id"
]
.
isin
(
dataB
[
"protein_id"
])]
.
shape
[
0
]))
datad
=
data
[(
data
[
'protein_id'
]
.
isin
(
dataB
[
'protein_id'
]))]
datad
.
to_excel
(
"drop_data.xlsx"
)
data
.
drop
(
data
[
data
[
"protein_id"
]
.
isin
(
dataB
[
"protein_id"
])]
.
index
,
inplace
=
True
)
data
.
to_excel
(
archivoEntrada
+
"_PostDrop.xlsx"
)
#data=substitute_or_remove_prot_id(data,"r")
sequences
=
data
[
"protein_sequence"
]
return
sequences
def
readOData
(
archivoEntrada
,
enfermedad
):
data
=
pd
.
read_excel
(
archivoEntrada
)
#data=substitute_or_remove_prot_id(data,"r")
if
(
enfermedad
!=
''
):
if
(
enfermedad
!=
''
):
#datar=substitute_or_remove_prot_id(data,"r")
#datar=substitute_or_remove_prot_id(data,"r")
...
@@ -131,7 +152,7 @@ def readData(archivoEntrada, enfermedad):
...
@@ -131,7 +152,7 @@ def readData(archivoEntrada, enfermedad):
#dataB = pd.read_excel("proteinas_en_comun_Alzheimer.xlsx")
#dataB = pd.read_excel("proteinas_en_comun_Alzheimer.xlsx")
print
(
len
(
data
))
#data=substitute_or_remove_prot_id(data,"r")
#data=substitute_or_remove_prot_id(data,"r")
#dataB=substitute_or_remove_prot_id(dataB,"r")
#dataB=substitute_or_remove_prot_id(dataB,"r")
#dataB.to_excel("proteinas_en_comun_Alzeheimer2.xlsx")
#dataB.to_excel("proteinas_en_comun_Alzeheimer2.xlsx")
...
@@ -143,11 +164,12 @@ def readData(archivoEntrada, enfermedad):
...
@@ -143,11 +164,12 @@ def readData(archivoEntrada, enfermedad):
#data = data[~((data["disease_id"] == enfermedad) &
#data = data[~((data["disease_id"] == enfermedad) &
# (data["protein_id"].isin(dataB["protein_id"])) &
# (data["protein_id"].isin(dataB["protein_id"])) &
# (data["gene_id"].isin(dataB["gene_id"])))]
# (data["gene_id"].isin(dataB["gene_id"])))]
data
=
substitute_or_remove_prot_id
(
data
,
"r"
)
sequences
=
data
[
"protein_sequence"
]
sequences
=
data
[
"protein_sequence"
]
return
sequences
return
sequences
def
readOData
(
archivoEntrada
,
enfermedad
):
def
readDataClassDiv
(
archivoEntrada
,
enfermedad
):
data
=
pd
.
read_excel
(
archivoEntrada
)
data
=
pd
.
read_excel
(
archivoEntrada
)
#data=substitute_or_remove_prot_id(data,"r")
#data=substitute_or_remove_prot_id(data,"r")
...
@@ -174,14 +196,21 @@ def readOData(archivoEntrada, enfermedad):
...
@@ -174,14 +196,21 @@ def readOData(archivoEntrada, enfermedad):
# (data["protein_id"].isin(dataB["protein_id"])) &
# (data["protein_id"].isin(dataB["protein_id"])) &
# (data["gene_id"].isin(dataB["gene_id"])))]
# (data["gene_id"].isin(dataB["gene_id"])))]
data
=
divide_by_class
(
data
)
sequences
=
data
[
"protein_sequence"
]
sequences
=
data
[
"protein_sequence"
]
return
sequences
return
sequences
def
restructure_class
(
data
,
ArchivoSalida
):
def
readCData
(
archivoEntrada
,
enfermedad
):
data
=
data
.
groupby
([
'protein_id'
,
'protein_sequence'
,
'disease_id'
])
.
agg
(
list
)
print
(
data
)
#data.drop_duplicates(subset=['protein_id','protein_sequence'],keep='first',inplace=True)
data
.
to_excel
(
ArchivoSalida
)
return
data
def
readDataRestructure
(
archivoEntrada
,
enfermedad
,
archivoSalida
):
data
=
pd
.
read_excel
(
archivoEntrada
)
data
=
pd
.
read_excel
(
archivoEntrada
)
#data=substitute_or_remove_prot_id(data,"r")
print
(
len
(
data
[
"protein_id"
]
.
unique
()))
data
=
substitute_or_remove_prot_id
(
data
,
"r"
)
print
(
len
(
data
[
"protein_id"
]
.
unique
()))
if
(
enfermedad
!=
''
):
if
(
enfermedad
!=
''
):
#datar=substitute_or_remove_prot_id(data,"r")
#datar=substitute_or_remove_prot_id(data,"r")
#sprint("numero de filas de proteinas descartadas totales principal : "+ str(len(data)-len(datar)))
#sprint("numero de filas de proteinas descartadas totales principal : "+ str(len(data)-len(datar)))
...
@@ -205,16 +234,15 @@ def readCData(archivoEntrada, enfermedad):
...
@@ -205,16 +234,15 @@ def readCData(archivoEntrada, enfermedad):
# (data["protein_id"].isin(dataB["protein_id"])) &
# (data["protein_id"].isin(dataB["protein_id"])) &
# (data["gene_id"].isin(dataB["gene_id"])))]
# (data["gene_id"].isin(dataB["gene_id"])))]
data
=
divide_by_class
(
dat
a
)
data
=
restructure_class
(
data
,
archivoSalid
a
)
sequences
=
data
[
"protein_sequence"
]
sequences
=
data
[
"protein_sequence"
]
return
sequences
return
sequences
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
#data=readData('
data_nervous_genes_1.xlsx','C0002395'
)
#data=readData('
protein_lung_cancer_C0007131.csv',,'C0007131',
)
data2
=
read
CData
(
'data_nervous_genes_xf.xlsx'
,
'C0002395
'
)
data2
=
read
DataRestructure
(
'treatment_lung_cancer.xlsx'
,
'C0007131'
,
'data_lung_cancer_treatment.xlsx
'
)
data2
=
data2
.
to_list
()
#
data2=data2.to_list()
datl
=
data
.
to_list
()
datl
=
data
.
to_list
()
#print(len(datl))
#print(len(datl))
du
=
[]
du
=
[]
...
@@ -228,9 +256,9 @@ if __name__=='__main__':
...
@@ -228,9 +256,9 @@ if __name__=='__main__':
#print(str(u)+" Este no deberia estar: "+str(datl[u]))
#print(str(u)+" Este no deberia estar: "+str(datl[u]))
with
open
(
"nombres_sust.txt"
)
as
prottosubs
:
with
open
(
"nombres_sust.txt"
)
as
prottosubs
:
index
=
prottosubs
.
readline
()
index
=
prottosubs
.
readline
()
acept
=
index
.
split
()
ac
c
ept
=
index
.
split
()
listtosubs
=
{}
listtosubs
=
{}
for
i
in
range
(
0
,
len
(
acept
)):
for
i
in
range
(
0
,
len
(
ac
c
ept
)):
listtosubs
[
acept
[
i
]]
=
[]
listtosubs
[
acept
[
i
]]
=
[]
while
line
:
=
prottosubs
.
readline
():
while
line
:
=
prottosubs
.
readline
():
newline
=
line
.
split
()
newline
=
line
.
split
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment