Rafael Artinano / ProteinsPatterns / Commits / d5cae6ac

Commit d5cae6ac, authored Dec 11, 2023 by Rafael Artinano
Parent: 3c565e97

    update in similarity with aa

Showing 4 changed files with 691 additions and 136 deletions (+691 / -136)
    TFM-main/src/compute_for_clases.py       +406 / -0
    TFM-main/src/generate_the_excel.py         +0 / -0
    TFM-main/src/metricas.py                 +160 / -0
    TFM-main/src/patrones_similares_aa.py    +125 / -136
TFM-main/src/compute_for_clases.py  (new file, mode 0 → 100644)
import pandas as pd
import time
import ast
import csv
import math
from interfazGrafica import interfaz
from descarteProteinas import ejecutar, remplazar_ID_for_sequence
from generate_the_excel import substitute_or_remove_prot_id
import metricas
from graficas import grafica
import os
import json
import ast
import re
from patrones_similares_aa import remplazar_sequence_for_ID as remplazar_s
from patrones_similares_aa import buscar_patrones_simAA
from collections import defaultdict
from pathlib import Path


def substitute_or_remove_prot_id2(data, sub_rem):
    print("inside the problem")
    with open("nombres_sust.txt") as prottosubs:
        index = prottosubs.readline()
        acept = index.split()
        listtosubs = {}
        for i in range(0, len(acept)):
            listtosubs[acept[i]] = []
        while line := prottosubs.readline():
            newline = line.split()
            # print(len(newline))
            for i in range(0, len(newline)):
                listtosubs[list(listtosubs.keys())[i]].append(newline[i].strip())
    resub = 1
    if re.search("Primary", list(listtosubs.keys())[0]):
        resub = 0
    print((resub + 1) % 2)
    # print(data)
    # data2=data.copy()
    if (sub_rem == "s"):
        data["Proteina"].replace(list(listtosubs.values())[(resub + 1) % 2],
                                 list(listtosubs.values())[resub])
        # datacp=data.copy()
        # print(pd.concat([data2,datacp]).drop_duplicates())
    else:
        global globi
        datas = data[data["Proteina"].isin(list(listtosubs.values())[(resub + 1) % 2]) == True]
        data = data[data["Proteina"].isin(list(listtosubs.values())[(resub + 1) % 2]) == False]
        # datas.to_csv('resultados/proteinasDescartadas_'+ str(globi) +'.csv', index=False)
        globi = globi + 1
    return data


def readData(archivoEntrada, enfermedad, archivoTarget):
    data = pd.read_excel(archivoEntrada)
    dataC = pd.read_csv("resultados/proteinasDescartadas2.csv")
    # data=substitute_or_remove_prot_id(data,"r")
    # dataC=substitute_or_remove_prot_id(dataC,"r")
    # Discard proteins
    data = data[~data['protein_id'].isin(dataC['ProteinasDescartadas'])]
    print("Se ha realizado el descarte de proteínas")
    # "C0002395"
    if (enfermedad != ''):
        data = data.loc[data["disease_id"] == enfermedad]
    # dataB = pd.read_excel("proteinas_en_comun_Alzheimer.xlsx")
    # print("Se han seleccionado las proteínas de la enfermedad elegida")
    # dataB=substitute_or_remove_prot_id(dataB,"r")
    # if(archivoTarget != ''):
    #     dataB=substitute_or_remove_prot_id(dataB,"r")
    # Remove the target proteins
    #     data = data[~((data["disease_id"] == enfermedad) &
    #                   (data["protein_id"].isin(dataB["protein_id"])))]
    #     print("Se han descartado las proteínas del archivo target")
    sequences = data["protein_sequence"]
    print(sequences)
    num_filas = sequences.shape[0]
    return sequences, num_filas


def guardar_patrones_len1(sequences, pattern_freqMin):
    all_patterns = dict()
    longitud_max = 0
    # Each pattern associated to the proteins the pattern is in
    pattern_proteins = {}
    for protein in sequences:
        longitud = len(protein)
        if longitud > longitud_max:
            longitud_max = longitud
        all_patterns[protein] = []
        # On each iteration, store the patterns found in the sequence together with
        # their positions, keyed by protein
        posicionPatterns = dict()
        for index, letter in enumerate(protein):
            posicionPatterns[letter] = posicionPatterns.get(letter, []) + [index]
        all_patterns[protein] = posicionPatterns
    for protein, patterns in all_patterns.items():
        for pattern, positions in patterns.items():
            if pattern not in pattern_proteins:
                pattern_proteins[pattern] = {}
            if protein not in pattern_proteins[pattern]:
                pattern_proteins[pattern][protein] = []
            pattern_proteins[pattern][protein].extend(positions)
    for pattern, proteins in pattern_proteins.items():
        if len(proteins) >= min_ocurrence:
            pattern_freqMin[pattern] = proteins
    df = pd.DataFrame(pattern_freqMin.items(), columns=['pattern', 'proteins'])
    df.to_csv('prueba2.csv', index=False)
    return pattern_freqMin, posicionPatterns, longitud_max


def buscar_patrones_identicos(sequences):
    pattern_freqMin = {}
    pattern_freqMin, posicionPatterns, longitud_max = guardar_patrones_len1(sequences, pattern_freqMin)
    if bool(pattern_freqMin):
        for pattern_length in range(2, longitud_max + 1):
            # Accessing a missing key creates an empty list
            auxPos = {}
            sub_seqs = []
            for pattern, proteins in pattern_freqMin.items():
                if len(pattern) == pattern_length - 1:
                    for prot, positions in proteins.items():
                        protein_len = len(prot)
                        if protein_len < pattern_length - 1:
                            continue
                        for position in positions:
                            if (protein_len < position + pattern_length):
                                continue
                            sub_seq = prot[position:position + pattern_length]
                            if sub_seq in pattern_freqMin:
                                continue
                            # If the last letter (the one newly added to the pattern) already
                            # meets min_freq, the extended pattern may meet min_freq as well
                            ultima_letra = sub_seq[-1]
                            pos_ultima_letra = position + pattern_length - 1
                            if ultima_letra in pattern_freqMin and pos_ultima_letra in pattern_freqMin[ultima_letra][prot]:
                                if sub_seq not in auxPos:
                                    auxPos[sub_seq] = {}
                                if prot not in auxPos[sub_seq]:
                                    auxPos[sub_seq][prot] = []
                                auxPos[sub_seq][prot].append(position)
                                if sub_seq not in sub_seqs:
                                    sub_seqs.append(sub_seq)
            print(pattern_length)
            sub_seqs_copy = sub_seqs.copy()
            for p in sub_seqs_copy:
                if len(auxPos[p]) < min_ocurrence:
                    del auxPos[p]
                    sub_seqs.remove(p)
            # If no pattern of length pattern_length was found, leave the loop:
            # there are no further patterns to find
            if not bool(auxPos):
                break
            for pattern, proteins in auxPos.items():
                for prot, pos in proteins.items():
                    if pattern not in pattern_freqMin:
                        pattern_freqMin[pattern] = {}
                    if prot not in pattern_freqMin[pattern]:
                        pattern_freqMin[pattern][prot] = []
                    found = list(filter(lambda x: pos - len(pattern) <= x <= pos + len(pattern),
                                        pattern_freqMin[pattern][prot]))
                    print(found)
                    print(len(found))
                    if (len(found) <= 0):
                        pattern_freqMin[pattern][prot].extend(pos)
                    if len(pattern) > 2:
                        if pattern[:-1] in pattern_freqMin:
                            del pattern_freqMin[pattern[:-1]]
                        if pattern[1:] in pattern_freqMin:
                            del pattern_freqMin[pattern[1:]]
    # Sort from longest to shortest; substrings of the same length are sorted alphabetically
    dict_ordered_patterns = dict(sorted(pattern_freqMin.items(), key=lambda x: (-len(x[0]), x[0])))
    dict_ordered_patterns = {k: v for k, v in dict_ordered_patterns.items() if len(k) >= 4}
    df = pd.DataFrame(dict_ordered_patterns.items(), columns=['pattern', 'proteins'])
    num_patrones = df.shape[0]
    pattern_freqMin = {k: v for k, v in pattern_freqMin.items() if len(k) >= 4}
    return pattern_freqMin, num_patrones


def remplazar_sequence_for_ID(pattern_freqMin, name):
    df_b = pd.read_excel("data_nervous_genes_xf.xlsx")
    # df_b=pd.read_excel("proteinasClase_PC00060.xlsx")
    # df_b=substitute_or_remove_prot_id(df_b,'r')
    cl = pd.read_excel("alzheimer_protein_class 2.xlsx")
    # cl=substitute_or_remove_prot_id(cl,"r")
    # data2=data.copy()
    cli = cl.groupby('protein_id')
    di = []
    do = {}
    for k, v in cli:
        for index, row in v.iterrows():
            di.append(row['class_name'])
        do[k] = di
        di = []
    class_dict = do
    output = []
    for key, value in pattern_freqMin.items():
        for proteina, posiciones in value.items():
            output.append([key, proteina, posiciones])
    output = [sublista for sublista in output if len(sublista[0]) != 1]
    # Sort from longest to shortest; substrings of the same length are sorted alphabetically
    output_ordered = sorted(output, key=lambda x: (-len(x[0]), x[0]))
    proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
    for item in output_ordered:
        protein_sequence = item[1]
        if protein_sequence in proteinas_dict:
            item[1] = proteinas_dict[protein_sequence]
        item.append(class_dict[item[1]] if item[1] in class_dict else "N/A")
    df_a = pd.DataFrame(output_ordered, columns=['Patron', 'Proteina', 'Posiciones', 'classesProt'])
    # Save the updated DataFrame to a CSV file
    df_a.to_csv('clases/' + name + '/patronesIdenticos.csv', index=False)
    print("Se ha generado el .csv con los patrones idénticos encontrados")


if __name__ == "__main__":
    if not os.path.exists("resultados"):
        # Create the folder if it does not exist
        os.makedirs("resultados")
        print(f"La carpeta resultados se ha creado correctamente.")
    else:
        print(f"La carpeta resultados ya existe.")
    inicio = time.time()
    jsonfile = open("param_file.conf", "r")
    datosInterfaz = json.load(jsonfile)
    # datosInterfaz = interfaz()
    print(datosInterfaz)
    # archivoEntrada = datosInterfaz["NombreArchivoEntrada"]
    enfermedad = datosInterfaz["CodigoEnfermedad"]
    archivoTarget = datosInterfaz["NombreArchivoTarget"]
    similitud = float(datosInterfaz["Similitud"])
    cl = pd.read_excel("alzheimer_protein_class 2.xlsx")
    # cl=substitute_or_remove_prot_id(cl,"r")
    # data2=data.copy()
    cli = cl.groupby('protein_id')
    di = []
    do = {}
    for k, v in cli:
        for index, row in v.iterrows():
            di.append(row['class_name'])
        do[k] = di
        di = []
    class_dict = do
    for fil in Path("clases").rglob("*.xlsx"):
        if not os.path.exists("clases/" + fil.name.split('.')[0] + "/"):
            # Create the folder if it does not exist
            os.makedirs("clases/" + fil.name.split('.')[0] + "/")
            print(f"La carpeta resultados se ha creado correctamente.")
        else:
            print(f"La carpeta resultados ya existe.")
        ejecutar("clases/" + fil.name, enfermedad, similitud)
        pattern_freqMin = dict()
        sequences, num_filas = readData("clases/" + fil.name, enfermedad, archivoTarget)
        df_b = pd.read_excel("clases/" + fil.name)
        # df_b=pd.read_excel("proteinasClase_PC00060.xlsx")
        proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
        ka = ""
        for item in sequences:
            ka = proteinas_dict[item]
        min_ocurrence = math.floor(num_filas * float(datosInterfaz["OcurrenciaMin"]))
        seq_len = 0
        for i in sequences:
            seq_len += len(i)
        print(min_ocurrence)
        pattern_freqMin, num_patrones = buscar_patrones_identicos(sequences)
        remplazar_sequence_for_ID(pattern_freqMin, fil.name.split('.')[0])
        df = pd.read_csv('clases/' + fil.name.split('.')[0] + '/patronesIdenticos.csv',
                         usecols=['Patron', 'Proteina', 'Posiciones', "classesProt"],
                         index_col=False)
        # df=substitute_or_remove_prot_id2(df,"s")
        df.to_csv('clases/' + fil.name.split('.')[0] + '/patronesIdenticos.csv', index=False)
        # dfx=df.copy()
        df2 = df.groupby('Patron')
        dicta = {'Patron': [], '%Ocurrencia_caracter': [], 'longitud_Apariciones': [],
                 'longitud_Apariciones_Proteina': [], '%Patron': [], '%Patron_proteina': [],
                 'total_Patrones': [], 'total_Patrones_por_prot': []}
        compl = 0
        comp = 0
        first = True
        res = set()
        for k, v in df2:
            res = set()
            for index, row in v.iterrows():
                Posic = [oo for oo in ast.literal_eval(row['Posiciones']) if oo != '[' and oo != ']']
                res |= set(Posic)
            compl += 1
            comp += len(res)
        for k, v in df2:
            dicta = {'Patron': [], '%Ocurrencia_caracter': [], 'longitud_Apariciones': [],
                     'longitud_Apariciones_Proteina': [], '%Patron': [], '%Patron_proteina': [],
                     'total_Patrones': [], 'total_Patrones_por_prot': []}
            dicta[k] = 0
            dox = 0
            dix = 0
            co = 0
            Posic = set()
            for index, row in v.iterrows():
                Posic |= set([oo for oo in ast.literal_eval(row['Posiciones']) if oo != '[' and oo != ']'])
                co += 1
            dix += len(Posic)
            dox += len(Posic) * len(str(k))
            dox /= seq_len
            dicta['%Ocurrencia_caracter'].append(dox * 100)
            dicta['longitud_Apariciones'].append(co)
            dicta['longitud_Apariciones_Proteina'].append(dix)
            dicta['%Patron'].append(co / compl * 100)
            dicta['%Patron_proteina'].append(dix / comp * 100)
            dicta['Patron'].append(str(k))
            dicta['total_Patrones'].append(compl)
            dicta['total_Patrones_por_prot'].append(comp)
            do = pd.DataFrame(dicta)
            if not first:
                do.to_csv('clases/' + fil.name.split('.')[0] + '/patronesOcurrencia.csv',
                          index=False, header=False, mode='a')
            else:
                do.to_csv('clases/' + fil.name.split('.')[0] + '/patronesOcurrencia.csv',
                          index=False)
            first = False
        del df2
        df3 = df.groupby('Proteina')
        del df
        first = True
        di = {'proteinas': [], 'maximum_ocurrence': [], 'patrones': [], 'global_ocurrence': []}
        df_b = pd.read_excel("data_nervous_genes_xf.xlsx")
        # df_b = pd.read_excel("proteinasClase_PC00060.xlsx")
        # df_b=substitute_or_remove_prot_id(df_b,"r")
        proteinas_dict = dict(df_b[['protein_id', 'protein_sequence']].values)
        positions_visited = []
        for k, v in df3:
            di = {'proteinas': [], 'maximum_ocurrence': [], 'patrones': [],
                  'global_ocurrence': [], "classesProt": []}
            seq = proteinas_dict[k]
            di['maximum_ocurrence'].append(len(seq))
            di['proteinas'].append(k)
            pato = []
            glob_ocurrence = 0
            Acum = []
            for index, row in v.iterrows():
                print(row)
                pat = {}
                pat['patron'] = str(row['Patron'])
                Posit = [oo for oo in ast.literal_eval(row['Posiciones']) if oo != '[' and oo != ']']
                print(Posit)
                Add = []
                for i in Posit:
                    for kaa in range(0, len(str(row['Patron']))):
                        print(i)
                        Add.append(int(i) + kaa)
                lex = len(list(set(Acum) & set(Add)))
                Posic = Posit
                pat['loc_ocurren'] = (len(Posic) * len(str(row['Patron']))) / len(seq)
                glob_ocurrence += len(Posic) * len(str(row['Patron'])) - lex
                pato.append(pat)
                Acum = list(set(Acum) | set(Add))
            di['patrones'].append(pato)
            di['global_ocurrence'].append(glob_ocurrence)
            di['classesProt'].append(class_dict[k] if k in class_dict else "N/A")
            do = pd.DataFrame(di)
            if not first:
                do.to_csv('clases/' + fil.name.split('.')[0] + '/proteinasOcurrencia.csv',
                          index=False, header=False, mode='a')
            else:
                do.to_csv('clases/' + fil.name.split('.')[0] + '/proteinasOcurrencia.csv',
                          index=False)
            first = False
        # metricas.metrica_distanciaProteinas()
        archivo = 'resultados/Metrica_distanciaProteinasMismoPatron.csv'
        nombreOutput = 'resultados/Figura_DistanciaProteinasMismoPatron'
        # grafica(archivo, nombreOutput)
        print("Se han obtenido los resultados de la métrica para la distancia entre dos proteínas que poseen el mismo patrón")
        metrica = math.floor(num_patrones * float(datosInterfaz["Metrica"]))
        metricas.patronesComunClas(metrica, fil.name.split('.')[0])
        archivo = 'resultados/Metrica_patronesComunes.csv'
        nombreOutput = 'resultados/Figura_distanciaProteinasPatronesComunes'
        # grafica(archivo, nombreOutput)
        print("Se han obtenido los resultados de la métrica para la distancia entre dos proteínas que poseen mas de un patrón en común")
    fin = time.time()
    tiempo_total = fin - inicio
    print(tiempo_total, "segundos")
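The __main__ block above drives everything from a small JSON configuration read out of param_file.conf. As a rough illustration only (the key names come from the code above; the values below are placeholders, not the author's settings, and "C0002395" is just the disease code quoted in a comment in readData), such a file could be produced and consumed like this:

    import json

    # Placeholder values; only the key names are taken from compute_for_clases.py.
    params = {
        "CodigoEnfermedad": "C0002395",   # disease_id filter; "" disables the filter
        "NombreArchivoTarget": "",        # target file name (read but only stored above)
        "Similitud": 0.8,                 # similarity threshold passed to ejecutar()
        "OcurrenciaMin": 0.1,             # fraction of rows used to derive min_ocurrence
        "Metrica": 0.05,                  # fraction of patterns used to derive metrica
    }

    with open("param_file.conf", "w") as fh:
        json.dump(params, fh, indent=2)

    with open("param_file.conf") as fh:
        datosInterfaz = json.load(fh)
    print(float(datosInterfaz["Similitud"]))  # 0.8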
TFM-main/src/generate_tha_excel.py → TFM-main/src/generate_the_excel.py

File moved
TFM-main/src/metricas.py

@@ -212,6 +212,166 @@ def patronesComun(patronesComun):
    #               index=False)
def patronesComunClas(patronesComun, name):
    # Read the CSV file and load the data into a list of dictionaries
    registros = []
    cl = pd.read_excel("alzheimer_protein_class 2.xlsx")
    # cl=substitute_or_remove_prot_id(cl,"r")
    # data2=data.copy()
    cli = cl.groupby('protein_id')
    di = []
    do = {}
    for k, v in cli:
        for index, row in v.iterrows():
            di.append(row['class_name'])
        do[k] = di
        di = []
    class_dict = do
    with open("clases/" + name + "/patronesIdenticos.csv", 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            registros.append(row)
    # Dictionary storing the unique patterns of each protein
    patrones_por_proteina = {}
    posiciones_patron = {}
    # Iterate over the records and collect each protein's unique patterns
    for registro in registros:
        proteina = registro['Proteina']
        patron = registro['Patron']
        posicion = registro['Posiciones']
        if proteina not in patrones_por_proteina:
            patrones_por_proteina[proteina] = set()
        patrones_por_proteina[proteina].add(patron)
        pp = [oo for oo in ast.literal_eval(posicion) if oo != '[' and oo != ']']
        if proteina not in posiciones_patron:
            posiciones_patron[proteina] = {}
        posiciones_patron[proteina][patron] = []
        for u in pp:
            for kaa in range(0, len(patron)):
                posiciones_patron[proteina][patron].append(kaa + int(u))
    # Dictionary storing the patterns that each pair of proteins has in common
    proteinas_comunes = {}
    rr = []
    df_p = pd.read_excel("data_nervous_genes_xf.xlsx")
    # df_p = pd.read_excel("proteinasClase_PC00060.xlsx")
    # df_p=substitute_or_remove_prot_id(df_p,"r")
    proteinas_dict2 = dict(df_p[['protein_id', 'protein_sequence']].values)
    pares_proteinas_procesados = set()
    # Keep the protein pairs that share at least patronesComun unique patterns
    for proteina1, patrones1 in patrones_por_proteina.items():
        for proteina2, patrones2 in patrones_por_proteina.items():
            if proteina1 != proteina2 and (proteina2, proteina1) not in pares_proteinas_procesados:
                patrones_comunes = patrones1.intersection(patrones2)
                if len(patrones_comunes) >= patronesComun:
                    par_proteinas = (proteina1, proteina2)
                    proteinas_comunes[par_proteinas] = patrones_comunes
                    pares_proteinas_procesados.add(par_proteinas)
    output = []
    df_b = pd.read_csv("AllProteins_%Similitud.csv")
    output2 = []
    proteinas_dict = df_b.set_index(['Proteina1', 'Proteina2'])['Similaridad'].to_dict()
    outbreak = []
    first = True
    first2 = True
    for par_proteinas, patrones_comunes in proteinas_comunes.items():
        proteina1, proteina2 = par_proteinas
        pattern_lengths = {}
        pattern_l = {}
        Antecedentes = {}
        if (proteina1 == 'Q13753' and proteina2 == 'P07550'):
            print(patrones_comunes)
        for pattern in patrones_comunes:
            length = len(pattern)
            key = f'Longitud {length}'
            if key in pattern_lengths:
                pattern_lengths[key].append([pattern])
                Add = posiciones_patron[proteina1][pattern]
                if (proteina1 == 'Q13753' and proteina2 == 'P07550'):
                    print(Add)
                if proteina1 not in Antecedentes:
                    Antecedentes[proteina1] = set()
                lex = len(Antecedentes[proteina1] & set(Add))
                Antecedentes[proteina1].update(Add)
                pattern_l[key][0] += len(Add) - lex
                Add = posiciones_patron[proteina2][pattern]
                if proteina2 not in Antecedentes:
                    Antecedentes[proteina2] = set()
                lex = len(Antecedentes[proteina2] & set(Add))
                Antecedentes[proteina2].update(Add)
                pattern_l[key][1] += len(Add) - lex
                # sprint(length*len(Posic))
            else:
                pattern_lengths[key] = [[pattern]]
                Add = posiciones_patron[proteina1][pattern]
                if proteina1 not in Antecedentes:
                    Antecedentes[proteina1] = set()
                lex = len(Antecedentes[proteina1] & set(Add))
                # print(lex)
                # print(Antecedentes)
                Antecedentes[proteina1].update(Add)
                Add2 = posiciones_patron[proteina2][pattern]
                if proteina2 not in Antecedentes:
                    Antecedentes[proteina2] = set()
                lex2 = len(Antecedentes[proteina2] & set(Add2))
                Antecedentes[proteina2].update(Add2)
                pattern_l[key] = [len(Add) - lex, len(Add2) - lex2]
        sorted_pattern_lengths = dict(sorted(pattern_lengths.items(),
                                             key=lambda x: int(x[0][9:]), reverse=True))
        if proteina1 != proteina2:
            prot = [proteinas_dict2[proteina1], proteinas_dict2[proteina2]]
            if Antecedentes != {} and (len(prot[0]) > 0 and len(prot[1]) > 0):
                output.append([sorted_pattern_lengths, proteina1, proteina2,
                               class_dict[proteina1] if proteina1 in class_dict else "N/A",
                               class_dict[proteina2] if proteina2 in class_dict else "N/A"])
            df = pd.DataFrame(output, columns=['Patrones', 'Proteina1', 'Proteina2',
                                               "classesProt1", "classesProt2"])
            output = []
            if (first2):
                df.to_csv('clases/' + name + '/Metrica_patronesComunes.csv', index=False)
                first2 = False
            else:
                df.to_csv('clases/' + name + '/Metrica_patronesComunes.csv',
                          index=False, header=False, mode='a')
            # else:
            #     output.append([sorted_pattern_lengths, proteina1, proteina2,
            #                    'N/A'])
            # print("prot1 : "+proteina1 + " : "+str(len(Antecedentes[proteina1])))
            # print("prot2 : "+proteina2 + " : " + str(len(Antecedentes[proteina2]) ))
            if Antecedentes != {} and (len(prot[0]) > 0 and len(prot[1]) > 0):
                output2.append([proteina1, proteina2,
                                (max(len(Antecedentes[proteina1]) / len(prot[0]),
                                     len(Antecedentes[proteina2]) / len(prot[1])) * 100),
                                class_dict[proteina1] if proteina1 in class_dict else "N/A",
                                class_dict[proteina2] if proteina2 in class_dict else "N/A"])
            df2 = pd.DataFrame(output2, columns=['proteina1', 'proteina2', '%Coincidencia',
                                                 "classesProt1", "classesProt2"])
            output2 = []
            if (first):
                df2.to_csv('clases/' + name + '/Metrica_Coincidencia.csv', index=False)
                first = False
            else:
                df2.to_csv('clases/' + name + '/Metrica_Coincidencia.csv',
                           index=False, header=False, mode='a')
    # output2=sorted(output2, key = lambda x: int(x[2]))
    # df2=pd.DataFrame(output2,columns=['proteina1','proteina2','%Coincidencia'])
    # df2.to_csv('resultados/Metrica_Coincidencia.csv',
    #            index=False)


def remplazar_sequence_for_ID(output):
    df_b = pd.read_excel("data_nervous_genes_xf.xlsx")
    # df_b = pd.read_excel("proteinasClase_PC00060.xlsx")
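The heart of patronesComunClas is the pair-selection step: each protein is mapped to the set of identical patterns it contains, and an unordered pair of proteins is kept when the two sets share at least patronesComun patterns. A toy, self-contained sketch of just that rule, using made-up protein IDs and patterns rather than project data:

    # Made-up protein IDs and patterns; illustrates only the pair-selection rule.
    patrones_por_proteina = {
        "PROT1": {"ABCD", "EFGH", "IJKL"},
        "PROT2": {"ABCD", "EFGH", "MNOP"},
        "PROT3": {"QRST"},
    }
    patronesComun = 2  # threshold passed in by compute_for_clases.py as `metrica`

    proteinas_comunes = {}
    pares_procesados = set()
    for proteina1, patrones1 in patrones_por_proteina.items():
        for proteina2, patrones2 in patrones_por_proteina.items():
            # Skip self-pairs and pairs already seen in the opposite order.
            if proteina1 != proteina2 and (proteina2, proteina1) not in pares_procesados:
                comunes = patrones1.intersection(patrones2)
                if len(comunes) >= patronesComun:
                    proteinas_comunes[(proteina1, proteina2)] = comunes
                    pares_procesados.add((proteina1, proteina2))

    print(proteinas_comunes)  # {('PROT1', 'PROT2'): {'ABCD', 'EFGH'}} (set order may vary)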
TFM-main/src/patrones_similares_aa.py

@@ -11,6 +11,8 @@ import os
import json
import ast
import re
from collections import defaultdict

classes = {}
min_ocurrence = 0

def swap_dict(d):

@@ -57,155 +59,146 @@ def readData(archivoEntrada, enfermedad, archivoTarget):
    return sequences, num_filas

def guardar_patrones_len1(sequences, pattern_freqMin):
    all_patterns = dict()

def read_aminoacidos():
    cla = {}
    with open('aminoacidos.txt', 'r') as op:
        lines = op.readlines()
    for line in lines:
        oo = line.replace('\n', '').split('\t')
        key = oo.pop(0)
        cla[key] = oo
    return swap_dict(cla), cla

def guardar_patrones_len1(sequences, pattern_freqMin, min_ocurrence):
    all_patterns = defaultdict(list)
    longitud_max = 0
    global min_ocurrence
    # Each pattern associated to the proteins the pattern is in
    pattern_proteins = {}
    classes, cla = read_aminoacidos()
    for protein in sequences:
        longitud = len(protein)
        if longitud > longitud_max:
            longitud_max = longitud
        all_patterns[protein] = []
        # On each iteration, store the patterns found in the sequence together with
        # their positions, keyed by protein
        posicionPatterns = dict()
        cla = {}
        with open('aminoacidos.txt', 'r') as op:
            lines = op.readlines()
        print(lines)
        for line in lines:
            oo = line.replace('\n', '').split('\t')
            key = oo.pop(0)
            print(oo)
            cla[key] = oo
        classes = swap_dict(cla)
        clases = classes
        print(clases)
        posicion_patterns = defaultdict(list)
        for index, letter in enumerate(protein):
            posicionPatterns[letter] = posicionPatterns.get(letter, []) + [index]
            if (letter in clases):
                overst = set()
                for EqvLetter in clases[letter]:
                    overst = overst | set(cla[EqvLetter])
                for EqvLetter in overst:
                    if (EqvLetter) != letter:
                        print(EqvLetter)
                        posicionPatterns[EqvLetter] = posicionPatterns.get(EqvLetter, []) + [index]
        all_patterns[protein] = posicionPatterns
            posicion_patterns[letter].append(index)
            if letter in classes:
                overst = set().union(*[set(cla[eqv_letter]) for eqv_letter in classes[letter]])
                for eqv_letter in overst:
                    if eqv_letter != letter:
                        posicion_patterns[eqv_letter].append(index)
        all_patterns[protein] = posicion_patterns
    pattern_proteins = defaultdict(dict)
    for protein, patterns in all_patterns.items():
        for pattern, positions in patterns.items():
            if pattern not in pattern_proteins:
            if pattern not in pattern_proteins:
                pattern_proteins[pattern] = {}
            if protein not in pattern_proteins[pattern]:
                pattern_proteins[pattern][protein] = []
            pattern_proteins[pattern][protein].extend(positions)
    for pattern, proteins in pattern_proteins.items():
        if len(proteins) >= min_ocurrence:
            pattern_freqMin[pattern] = proteins
    df = pd.DataFrame(pattern_freqMin.items(), columns=['pattern', 'proteins'])
    df.to_csv('prueba2.csv', index=False)
    return pattern_freqMin, posicionPatterns, longitud_max

def buscar_patrones_simAA(sequences, min_ocurr):
    min_ocurrence = min_ocurr
    return pattern_freqMin, posicion_patterns, longitud_max

def buscar_patrones_simAA(sequences, min_ocurr):
    min_ocurrence = min_ocurr
    pattern_freqMin = {}
    pattern_freqMin, posicionPatterns, longitud_max = guardar_patrones_len1(sequences, pattern_freqMin)
    cla = {}
    num_patrones = 0
    with open('aminoacidos.txt', 'r') as op:
        lines = op.readlines()
    print(lines)
    for line in lines:
        oo = line.replace('\n', '').split('\t')
        key = oo.pop(0)
        print(oo)
        cla[key] = oo
    classes = swap_dict(cla)
    clases = classes
    if bool(pattern_freqMin):
    pattern_freqMin, posicion_patterns, longitud_max = guardar_patrones_len1(sequences, pattern_freqMin, min_ocurrence)
    classes, cla = read_aminoacidos()
    if not bool(pattern_freqMin):
        return pattern_freqMin, 0
    for pattern_length in range(2, longitud_max + 1):
        # Accessing a missing key creates an empty list
        auxPos = {}
        aux_pos = defaultdict(dict)
        sub_seqs = []
        for pattern, proteins in pattern_freqMin.items():
            if len(pattern) == pattern_length - 1:
                for prot, positions in proteins.items():
                    protein_len = len(prot)
                    if protein_len < pattern_length - 1:
                        continue
                    for position in positions:
                        pos_last_letter = position + pattern_length - 1
                        if pos_last_letter > len(prot) - 1:
                        pos_last_letter = position + pattern_length - 1
                        if pos_last_letter > len(prot) - 1:
                            continue
                        last_letter = prot[pos_last_letter]
                        if last_letter not in clases:
                            pos_ultima_letra = position + pattern_length - 1
                        if last_letter not in classes:
                            sub_seq = pattern + last_letter
                            if sub_seq in pattern_freqMin:
                                continue
                            ultima_letra = sub_seq[-1]
                            pos_ultima_letra = position + pattern_length - 1
                            if ultima_letra in pattern_freqMin and pos_ultima_letra in pattern_freqMin[ultima_letra][prot]:
                                if sub_seq not in auxPos:
                                    auxPos[sub_seq] = {}
                                if prot not in auxPos[sub_seq]:
                                    auxPos[sub_seq][prot] = []
                                auxPos[sub_seq][prot].append(position)
                                if sub_seq not in aux_pos:
                                    aux_pos[sub_seq] = {}
                                if prot not in aux_pos[sub_seq]:
                                    aux_pos[sub_seq][prot] = []
                                aux_pos[sub_seq][prot].append(position)
                                if sub_seq not in sub_seqs:
                                    sub_seqs.append(sub_seq)
                        else:
                            overst_set = set()
                            for EqvLetter in clases[last_letter]:
                                overst_set |= set(cla[EqvLetter])
                            for EqvLetter in overst_set:
                                sub_seq = pattern + EqvLetter
                            overst_set = set().union(*[set(cla[eqv_letter]) for eqv_letter in classes[last_letter]])
                            broken = False
                            for eqv_letter in overst_set:
                                sub_seq = pattern + eqv_letter
                                if sub_seq in pattern_freqMin:
                                    continue
                                ultima_letra = sub_seq[-1]
                                pos_ultima_letra = position + pattern_length - 1
                                    broken = True
                                    break
                                if sub_seq in aux_pos:
                                    if prot not in aux_pos[sub_seq]:
                                        aux_pos[sub_seq][prot] = []
                                    aux_pos[sub_seq][prot].append(position)
                                    broken = True
                                    break
                            ultima_letra = last_letter
                            sub_seq = pattern + last_letter
                            if ultima_letra in pattern_freqMin and pos_ultima_letra in pattern_freqMin[ultima_letra][prot]:
                                if sub_seq not in auxPos:
                                    auxPos[sub_seq] = {}
                                if prot not in auxPos[sub_seq]:
                                    auxPos[sub_seq][prot] = []
                                auxPos[sub_seq][prot].append(position)
                            if not broken and ultima_letra in pattern_freqMin and pos_ultima_letra in pattern_freqMin[ultima_letra][prot]:
                                if sub_seq not in aux_pos:
                                    aux_pos[sub_seq] = {}
                                if prot not in aux_pos[sub_seq]:
                                    aux_pos[sub_seq][prot] = []
                                aux_pos[sub_seq][prot].append(position)
                                if sub_seq not in sub_seqs:
                                    sub_seqs.append(sub_seq)
        print(pattern_length)
        sub_seqs_copy = sub_seqs.copy()
        for p in sub_seqs_copy:
            if len(auxPos[p]) < min_ocurrence:
                del auxPos[p]
            if len(aux_pos[p]) < min_ocurrence:
                del aux_pos[p]
                sub_seqs.remove(p)
        # If no pattern of length pattern_length was found, leave the loop:
        # there are no further patterns to find
        if not bool(auxPos):
        if not bool(aux_pos):
            break
        for pattern, proteins in auxPos.items():
        for pattern, proteins in aux_pos.items():
            for prot, pos in proteins.items():
                if pattern not in pattern_freqMin:
                    pattern_freqMin[pattern] = {}
                if prot not in pattern_freqMin[pattern]:
                    pattern_freqMin[pattern][prot] = []
                found = list(filter(lambda x: pos - len(pattern) <= x <= pos + len(pattern), pattern_freqMin[pattern][prot]))
                print(found)
                print(len(found))
                if (len(found) <= 0):
                found = list(filter(lambda x: pos - len(pattern) <= x <= pos + len(pattern), pattern_freqMin[pattern][prot]))
                if len(found) <= 0:
                    pattern_freqMin[pattern][prot].extend(pos)
                if len(pattern) > 2:
                    if pattern[:-1] in pattern_freqMin:

@@ -213,15 +206,11 @@ def buscar_patrones_simAA(sequences,min_ocurr):
                    if pattern[1:] in pattern_freqMin:
                        del pattern_freqMin[pattern[1:]]
    # Sort from longest to shortest; substrings of the same length are sorted alphabetically
    dict_ordered_patterns = dict(sorted(pattern_freqMin.items(), key=lambda x: (-len(x[0]), x[0])))
    dict_ordered_patterns = {k: v for k, v in dict_ordered_patterns.items() if len(k) >= 4}
    # dict_ordered_patterns = {k: v for k, v in dict_ordered_patterns.items() if len(k) >= 4}
    df = pd.DataFrame(dict_ordered_patterns.items(), columns=['pattern', 'proteins'])
    num_patrones = df.shape[0]
    pattern_freqMin = {k: v for k, v in pattern_freqMin.items() if len(k) >= 4}
    # pattern_freqMin = {k: v for k, v in pattern_freqMin.items() if len(k) >= 4}
    return pattern_freqMin, num_patrones

def buscar_patrones_identicos(sequences, min_ocurr):
    pattern_freqMin = {}
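The new code in this file leans on read_aminoacidos() and swap_dict() (the latter is defined near the top of the file and its body is not shown in this diff). From the parsing above, aminoacidos.txt appears to be a tab-separated file whose first column is an equivalence-group name followed by the letters in that group, and swap_dict is assumed here to invert that mapping so each letter points back to its groups. A hedged, self-contained sketch under those assumptions:

    from collections import defaultdict

    def swap_dict(d):
        # Assumed behaviour (its body sits outside this diff): invert
        # {group: [letters]} into {letter: [groups]}.
        inverted = defaultdict(list)
        for group, letters in d.items():
            for letter in letters:
                inverted[letter].append(group)
        return dict(inverted)

    def read_aminoacidos(path="aminoacidos.txt"):
        # Mirrors the parsing in the diff: tab-separated lines, first field is the
        # group name, remaining fields are the amino-acid letters in that group.
        cla = {}
        with open(path) as op:
            for line in op:
                fields = line.replace("\n", "").split("\t")
                key = fields.pop(0)
                cla[key] = fields
        return swap_dict(cla), cla

    # With an illustrative (made-up) aminoacidos.txt such as:
    #     hydrophobic<TAB>A<TAB>V<TAB>L<TAB>I
    #     aromatic<TAB>F<TAB>W<TAB>Y
    # read_aminoacidos() would give classes["V"] == ["hydrophobic"] and
    # cla["aromatic"] == ["F", "W", "Y"], which is how buscar_patrones_simAA
    # expands a letter to its equivalent letters when growing a pattern.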