Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Sign in
Toggle navigation
P
ProteinsPatterns
Project overview
Project overview
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Rafael Artinano
ProteinsPatterns
Commits
ce11ea4d
Commit
ce11ea4d
authored
Jan 23, 2024
by
Rafael Artinano
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
removed incorrect file
parent
b4953dc3
Changes
2
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
64 additions
and
312 deletions
+64
-312
TFM-main/src/generate_tha_excel.py
TFM-main/src/generate_tha_excel.py
+0
-276
TFM-main/src/generate_the_excel.py
TFM-main/src/generate_the_excel.py
+64
-36
No files found.
TFM-main/src/generate_tha_excel.py
deleted
100644 → 0
View file @
b4953dc3
This diff is collapsed.
Click to expand it.
TFM-main/src/generate_the_excel.py
View file @
ce11ea4d
import
pandas
as
pd
import
Levenshtein
import
time
from
sklearn.cluster
import
OPTICS
,
DBSCAN
,
AgglomerativeClustering
,
BisectingKMeans
,
SpectralClustering
from
sklearn.preprocessing
import
StandardScaler
import
numpy
as
np
from
scipy.spatial.distance
import
pdist
,
squareform
from
pyclustering.cluster.dbscan
import
dbscan
from
pyclustering.utils
import
timedcall
from
Levenshtein
import
distance
import
re
from
minineedle
import
needle
,
smith
,
core
from
Bio.Blast.Applications
import
NcbiblastpCommandline
from
io
import
StringIO
from
Bio.Blast
import
NCBIXML
from
Bio.Seq
import
Seq
from
Bio.SeqRecord
import
SeqRecord
from
Bio
import
SeqIO
import
swalign
import
multiprocessing
as
mp
globi
=
0
df_b
=
None
...
...
@@ -61,7 +46,7 @@ def substitute_or_remove_prot_id(data,sub_rem):
#print(pd.concat([did,did2]).drop_duplicates(keep=False))
print
(
did
)
datas
=
pd
.
concat
([
datas
,
did
],
ignore_index
=
True
)
data
.
to_excel
(
'data_
nervous_genes_
principalpurge.xlsx'
,
index
=
False
,
columns
=
data
.
columns
)
data
.
to_excel
(
'data_principalpurge.xlsx'
,
index
=
False
,
columns
=
data
.
columns
)
datas
.
to_csv
(
'resultados/proteinasDescartadassp_'
+
str
(
globi
)
+
'.csv'
,
index
=
False
)
elif
(
sub_rem
==
"c"
):
datas
=
data
[
data
[
"protein_id"
]
.
isin
(
list
(
listtosubs
.
values
())[(
resub
+
1
)
%
2
])
==
True
]
...
...
@@ -86,7 +71,7 @@ def substitute_or_remove_prot_id(data,sub_rem):
print
(
"Despues de tirar duplicados en id: "
+
str
(
len
(
dats
)))
dats
=
dats
.
drop_duplicates
(
subset
=
[
'disease_id'
,
'protein_sequence'
],
keep
=
'first'
,
inplace
=
False
)
print
(
"Despues de tirar duplicados en secuencia: "
+
str
(
len
(
dats
)))
dats
.
to_excel
(
'data_
nervous_genes_
x.xlsx'
,
index
=
False
,
columns
=
data
.
columns
)
dats
.
to_excel
(
'data_x.xlsx'
,
index
=
False
,
columns
=
data
.
columns
)
datas
.
to_csv
(
'resultados/proteinasDescartadas_'
+
str
(
globi
)
+
'.csv'
,
index
=
False
)
#pd_diff=pd.concat([data,dats]).drop_duplicates(keep=False)
#pd_diff.to_excel('data_not_valid.xlsx')
...
...
@@ -98,12 +83,14 @@ def substitute_or_remove_prot_id(data,sub_rem):
def
divide_by_class
(
data
):
print
(
"inside the problem"
)
cl
=
pd
.
read_excel
(
"
alzheimer_protein_class 1
.xlsx"
)
cl
=
pd
.
read_excel
(
"
lung_cancer_protein_class
.xlsx"
)
cl
=
substitute_or_remove_prot_id
(
cl
,
"c"
)
cl
.
to_excel
(
"
alzheimer_protein_class
2.xlsx"
)
cl
.
to_excel
(
"
lung_cancer_protein_class_
2.xlsx"
)
#data2=data.copy()
cli
=
cl
.
groupby
(
'class_id'
)
di
=
[]
dd
=
data
[
~
(
data
[
'protein_id'
]
.
isin
(
cl
[
'protein_id'
]))]
dd
.
to_excel
(
"proteinas_sin_clase.xlsx"
)
for
k
,
v
in
cli
:
for
index
,
row
in
v
.
iterrows
():
...
...
@@ -116,10 +103,44 @@ def divide_by_class(data):
return
data
def
readData
(
archivoEntrada
,
enfermedad
):
data
=
pd
.
read_
excel
(
archivoEntrada
)
def
readData
(
archivoEntrada
,
enfermedad
,
archivoDescarte
=
None
):
data
=
pd
.
read_
csv
(
archivoEntrada
)
dataor
=
data
.
copy
()
#data.to_excel('data_nervous_genes_2.xlsx')
data
=
substitute_or_remove_prot_id
(
data
,
"r"
)
#data.to_excel("data_nervous_genes_x.xlsx")
if
(
enfermedad
!=
''
):
#datar=substitute_or_remove_prot_id(data,"r")
#sprint("numero de filas de proteinas descartadas totales principal : "+ str(len(data)-len(datar)))
#data = data.loc[data["disease_id"] == enfermedad]
if
(
archivoDescarte
!=
None
):
dataB
=
pd
.
read_excel
(
archivoDescarte
)
print
(
len
(
data
))
#data=substitute_or_remove_prot_id(data,"r")
dataB
=
substitute_or_remove_prot_id
(
dataB
,
"r"
)
#dataB.to_excel("data_nervous_genes_xf2.xlsx")
#data.to_excel('data_nervous_genes_2.xlsx')
filt_data
=
len
(
data
)
alz_filt_data
=
len
(
dataB
)
print
(
"proteinas descartadas post filtro, principal: "
+
str
(
filt_data
-
len
(
data
)))
print
(
"proteinas descartadas post filtro, comun alz: "
+
str
(
alz_filt_data
-
len
(
dataB
)))
print
(
"tamaño del descarte: "
+
str
(
data
[
data
[
"protein_id"
]
.
isin
(
dataB
[
"protein_id"
])]
.
shape
[
0
]))
datad
=
data
[(
data
[
'protein_id'
]
.
isin
(
dataB
[
'protein_id'
]))]
datad
.
to_excel
(
"drop_data.xlsx"
)
data
.
drop
(
data
[
data
[
"protein_id"
]
.
isin
(
dataB
[
"protein_id"
])]
.
index
,
inplace
=
True
)
data
.
to_excel
(
archivoEntrada
+
"_PostDrop.xlsx"
)
#data=substitute_or_remove_prot_id(data,"r")
sequences
=
data
[
"protein_sequence"
]
return
sequences
def
readOData
(
archivoEntrada
,
enfermedad
):
data
=
pd
.
read_excel
(
archivoEntrada
)
#data=substitute_or_remove_prot_id(data,"r")
if
(
enfermedad
!=
''
):
#datar=substitute_or_remove_prot_id(data,"r")
...
...
@@ -131,7 +152,7 @@ def readData(archivoEntrada, enfermedad):
#dataB = pd.read_excel("proteinas_en_comun_Alzheimer.xlsx")
print
(
len
(
data
))
#data=substitute_or_remove_prot_id(data,"r")
#dataB=substitute_or_remove_prot_id(dataB,"r")
#dataB.to_excel("proteinas_en_comun_Alzeheimer2.xlsx")
...
...
@@ -143,11 +164,12 @@ def readData(archivoEntrada, enfermedad):
#data = data[~((data["disease_id"] == enfermedad) &
# (data["protein_id"].isin(dataB["protein_id"])) &
# (data["gene_id"].isin(dataB["gene_id"])))]
data
=
substitute_or_remove_prot_id
(
data
,
"r"
)
sequences
=
data
[
"protein_sequence"
]
return
sequences
def
readOData
(
archivoEntrada
,
enfermedad
):
def
readDataClassDiv
(
archivoEntrada
,
enfermedad
):
data
=
pd
.
read_excel
(
archivoEntrada
)
#data=substitute_or_remove_prot_id(data,"r")
...
...
@@ -174,14 +196,21 @@ def readOData(archivoEntrada, enfermedad):
# (data["protein_id"].isin(dataB["protein_id"])) &
# (data["gene_id"].isin(dataB["gene_id"])))]
data
=
divide_by_class
(
data
)
sequences
=
data
[
"protein_sequence"
]
return
sequences
def
readCData
(
archivoEntrada
,
enfermedad
):
def
restructure_class
(
data
,
ArchivoSalida
):
data
=
data
.
groupby
([
'protein_id'
,
'protein_sequence'
,
'disease_id'
])
.
agg
(
list
)
print
(
data
)
#data.drop_duplicates(subset=['protein_id','protein_sequence'],keep='first',inplace=True)
data
.
to_excel
(
ArchivoSalida
)
return
data
def
readDataRestructure
(
archivoEntrada
,
enfermedad
,
archivoSalida
):
data
=
pd
.
read_excel
(
archivoEntrada
)
#data=substitute_or_remove_prot_id(data,"r")
print
(
len
(
data
[
"protein_id"
]
.
unique
()))
data
=
substitute_or_remove_prot_id
(
data
,
"r"
)
print
(
len
(
data
[
"protein_id"
]
.
unique
()))
if
(
enfermedad
!=
''
):
#datar=substitute_or_remove_prot_id(data,"r")
#sprint("numero de filas de proteinas descartadas totales principal : "+ str(len(data)-len(datar)))
...
...
@@ -205,16 +234,15 @@ def readCData(archivoEntrada, enfermedad):
# (data["protein_id"].isin(dataB["protein_id"])) &
# (data["gene_id"].isin(dataB["gene_id"])))]
data
=
divide_by_class
(
dat
a
)
data
=
restructure_class
(
data
,
archivoSalid
a
)
sequences
=
data
[
"protein_sequence"
]
return
sequences
if
__name__
==
'__main__'
:
#data=readData('
data_nervous_genes_1.xlsx','C0002395'
)
data2
=
read
CData
(
'data_nervous_genes_xf.xlsx'
,
'C0002395
'
)
data2
=
data2
.
to_list
()
#data=readData('
protein_lung_cancer_C0007131.csv',,'C0007131',
)
data2
=
read
DataRestructure
(
'treatment_lung_cancer.xlsx'
,
'C0007131'
,
'data_lung_cancer_treatment.xlsx
'
)
#
data2=data2.to_list()
datl
=
data
.
to_list
()
#print(len(datl))
du
=
[]
...
...
@@ -228,9 +256,9 @@ if __name__=='__main__':
#print(str(u)+" Este no deberia estar: "+str(datl[u]))
with
open
(
"nombres_sust.txt"
)
as
prottosubs
:
index
=
prottosubs
.
readline
()
acept
=
index
.
split
()
ac
c
ept
=
index
.
split
()
listtosubs
=
{}
for
i
in
range
(
0
,
len
(
acept
)):
for
i
in
range
(
0
,
len
(
ac
c
ept
)):
listtosubs
[
acept
[
i
]]
=
[]
while
line
:
=
prottosubs
.
readline
():
newline
=
line
.
split
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment