Commit bad93c31 authored Dec 13, 2023 by Rafael Artinano
Change issue in distance_matrix and cluster generator scripts
parent d5cae6ac
Showing 2 changed files with 201 additions and 37 deletions (+201, -37)

TFM-main/src/clustering.py (+123, -13)
TFM-main/src/compute_distance_mat.py (+78, -24)
TFM-main/src/clustering.py
@@ -4,6 +4,8 @@ import time
from sklearn.cluster import OPTICS, DBSCAN, AgglomerativeClustering, BisectingKMeans, SpectralClustering
from sklearn.preprocessing import StandardScaler
import numpy as np
from matplotlib import pyplot as plt
from scipy.spatial.distance import pdist, squareform
from pyclustering.cluster.dbscan import dbscan
from pyclustering.utils import timedcall
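For reference, the two pyclustering imports follow this usage pattern (a minimal sketch with made-up 2-D points, not data from this repo):

# Hedged example: illustrates the pyclustering API only; `points` is hypothetical.
from pyclustering.cluster.dbscan import dbscan
from pyclustering.utils import timedcall

points = [[0.0, 0.0], [0.1, 0.1], [5.0, 5.0]]
instance = dbscan(points, 0.5, 2)            # eps=0.5, min neighbours=2
(elapsed, _) = timedcall(instance.process)   # time the clustering run
print(instance.get_clusters(), instance.get_noise(), elapsed)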
@@ -17,6 +19,7 @@ from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import swalign
from scipy.cluster.hierarchy import dendrogram
import multiprocessing as mp
globi = 0
df_b = None
@@ -73,7 +76,7 @@ def substitute_or_remove_prot_id(data,sub_rem):
    return data
def readData(archivoEntrada, enfermedad):
    data = pd.read_excel(archivoEntrada)
    data = substitute_or_remove_prot_id(data, "r")
    # data=substitute_or_remove_prot_id(data,"r")
    #data.to_excel('data_nervous_genes_xf.xlsx')
    if (enfermedad != ''):
        #datar=substitute_or_remove_prot_id(data,"r")
@@ -111,7 +114,7 @@ def descarte(data, threshold):
    #print(str(blast_similarity(data[0],data[1]))+" similarity of equals")
    # Build the similarity matrix
    #num_points = len(data)
    similarity_matrix = pd.read_csv('resultados/matrizSmithWater.csv', header=None, index_col=False) - 1
    similarity_matrix = pd.read_csv('resultados/matrizNeedleWunchFS_70.csv', header=None, index_col=False) - 1
    similarity_matrix = similarity_matrix.abs()
    #sim_matrix=[item[0] for item in similarity_matrix]
    #k=0
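Both CSVs hold similarity scores, so `(s - 1).abs()` converts them to distances: a similarity of 1.0 maps to distance 0.0. A sketch with made-up values:

import pandas as pd
# Hypothetical 3x3 similarity matrix, 1.0 = identical sequences.
sim = pd.DataFrame([[1.0, 0.8, 0.2],
                    [0.8, 1.0, 0.5],
                    [0.2, 0.5, 1.0]])
dist = (sim - 1).abs()   # 0.0 on the diagonal, 1 - similarity elsewhere
print(dist)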
@@ -132,12 +135,117 @@ def descarte(data, threshold):
    dat = data
    #datx=np.arange(len(data)).reshape(-1, 1)
    #sim
    aglom_instance = AgglomerativeClustering(n_clusters=500, affinity='precomputed', linkage='average').fit(similarity_matrix.to_numpy())
    aglom_instance = AgglomerativeClustering(n_clusters=100, affinity='precomputed', linkage='average').fit(similarity_matrix.to_numpy())
    print(aglom_instance.labels_)
    plot_dendrogram(aglom_instance, labels=aglom_instance.labels_)
    plt.show()
    spectre = SpectralClustering(n_clusters=100, affinity='precomputed_nearest_neighbors').fit(similarity_matrix.to_numpy())
    cluster = aglom_instance.labels_
    plot_dendrogram(aglom_instance, labels=aglom_instance.labels_)
    plt.show()
    filtered_clusters = []
    discarded_data = []
    discarded_data2 = []
    dato = remplazar_sequence_for_ID(data)
    similarity_matrix2 = similarity_matrix.values.tolist()
    clusters = {}
    for k in range(0, len(cluster)):
        if cluster[k] in clusters:
            clusters[cluster[k]].append(k)
        else:
            clusters[cluster[k]] = [k]
    print(clusters)
    for cluster_id, cluster in clusters.items():
        filtered_cluster = []
        min_avg_distance = float('inf')
        central_point_index = None
        print(cluster)
        # Compute the average distance for every point in the cluster
        for point_index in cluster:
            total_distance = 0
            for other_index in cluster:
                total_distance += similarity_matrix2[point_index][other_index]
            avg_distance = total_distance / len(cluster)
            if avg_distance < min_avg_distance:
                min_avg_distance = avg_distance
                central_point_index = point_index
        # Check whether the central point exceeds the threshold
        similarity_percentage = 1 - (min_avg_distance / eps)
        filtered_cluster.append(central_point_index)
        print(max(cluster))
        print(len(datacp))
        print(len(data))
        print(len(dato))
        discarded_data.extend([[datacp[i], cluster_id, data[central_point_index], dato[i]] for i in cluster])
        #discarded_data2.extend([[dato[i],datacp[i]] for i in cluster if i != central_point_index] )
        if filtered_cluster:
            filtered_clusters.append(filtered_cluster)
    data = remplazar_sequence_for_ID(data)
    # Print the results
    #for cluster_id, cluster in enumerate(filtered_clusters):
    #    cluster_data = [data[i] for i in cluster]
    #    print(f'Cluster {cluster_id}: {", ".join(cluster_data)}')
    #discarded_data = remplazar_sequence_for_ID(discarded_data)
    # Save the discarded data to a CSV file using pandas
    if discarded_data:
        df = pd.DataFrame(discarded_data, columns=['protein_sequence', 'cluster_id', 'centroid', 'protein_id'])
        #df2 = pd.DataFrame( discarded_data2, columns=['ProteinasDescartadas','secuencia'])
        df.to_csv('resultados/proteinasClusterAglomerativeNW70.csv', index=False)
    spectre = SpectralClustering(n_clusters=100, affinity='precomputed_nearest_neighbors').fit(similarity_matrix.to_numpy())
    cluster = spectre.labels_
    print(spectre.labels_)
    filtered_clusters = []
    discarded_data = []
    discarded_data2 = []
    dato = remplazar_sequence_for_ID(data)
    similarity_matrix2 = similarity_matrix.values.tolist()
    clusters = {}
    for k in range(0, len(cluster)):
        if cluster[k] in clusters:
            clusters[cluster[k]].append(k)
        else:
            clusters[cluster[k]] = [k]
    print(clusters)
    for cluster_id, cluster in clusters.items():
        filtered_cluster = []
        min_avg_distance = float('inf')
        central_point_index = None
        print(cluster)
        # Compute the average distance for every point in the cluster
        for point_index in cluster:
            total_distance = 0
            for other_index in cluster:
                total_distance += similarity_matrix2[point_index][other_index]
            avg_distance = total_distance / len(cluster)
            if avg_distance < min_avg_distance:
                min_avg_distance = avg_distance
                central_point_index = point_index
        # Check whether the central point exceeds the threshold
        similarity_percentage = 1 - (min_avg_distance / eps)
        filtered_cluster.append(central_point_index)
        print(max(cluster))
        discarded_data.extend([[datacp[i], cluster_id, data[central_point_index], dato[i]] for i in cluster])
        #discarded_data2.extend([[dato[i],datacp[i]] for i in cluster if i != central_point_index] )
        if filtered_cluster:
            filtered_clusters.append(filtered_cluster)
    data = remplazar_sequence_for_ID(data)
    # Print the results
    #for cluster_id, cluster in enumerate(filtered_clusters):
    #    cluster_data = [data[i] for i in cluster]
    #    print(f'Cluster {cluster_id}: {", ".join(cluster_data)}')
    #discarded_data = remplazar_sequence_for_ID(discarded_data)
    # Save the discarded data to a CSV file using pandas
    if discarded_data:
        df = pd.DataFrame(discarded_data, columns=['protein_sequence', 'cluster_id', 'centroid', 'protein_id'])
        #df2 = pd.DataFrame( discarded_data2, columns=['ProteinasDescartadas','secuencia'])
        df.to_csv('resultados/proteinasClusterSpectralNW70.csv', index=False)
    dbscan_instance = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed', algorithm='brute').fit(similarity_matrix.to_numpy())
    cluster = dbscan_instance.labels_
    print(str(len(cluster)) + " " + str(len(similarity_matrix.values.tolist())))
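The nested loops above select each cluster's medoid, i.e. the point with the lowest average distance to its cluster mates. An equivalent vectorized sketch (assuming `dist` is the full distance matrix as a numpy array and `labels` a fitted labels_ vector; the names are illustrative only):

import numpy as np

def medoids(dist, labels):
    # Return {cluster_id: index of the point with minimal mean intra-cluster distance}.
    out = {}
    for cid in np.unique(labels):
        idx = np.where(labels == cid)[0]
        sub = dist[np.ix_(idx, idx)]              # intra-cluster distance block
        out[cid] = int(idx[sub.mean(axis=1).argmin()])
    return out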
@@ -146,7 +254,7 @@ def descarte(data, threshold):
    discarded_data = []
    discarded_data2 = []
    dato = remplazar_sequence_for_ID(data)
    similarity_matrix = similarity_matrix.values.tolist()
    similarity_matrix2 = similarity_matrix.values.tolist()
    clusters = {}
    for k in range(0, len(cluster)):
        if cluster[k] in clusters:
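The if/else accumulation into `clusters` is a plain group-by-label; `collections.defaultdict` expresses the same thing more compactly (equivalent sketch):

from collections import defaultdict

clusters = defaultdict(list)
for k, label in enumerate(cluster):
    clusters[label].append(k)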
@@ -163,7 +271,7 @@ def descarte(data, threshold):
        for point_index in cluster:
            total_distance = 0
            for other_index in cluster:
                total_distance += similarity_matrix[point_index][other_index]
                total_distance += similarity_matrix2[point_index][other_index]
            avg_distance = total_distance / len(cluster)
            if avg_distance < min_avg_distance:
                min_avg_distance = avg_distance
@@ -174,7 +282,7 @@ def descarte(data, threshold):
        filtered_cluster.append(central_point_index)
        # discarded_data.extend([[datacp[i], cluster_id,data[central_point_index] , dato[i]]for i in cluster])
        discarded_data.extend([[datacp[i], cluster_id, data[central_point_index], dato[i]] for i in cluster])
        #discarded_data2.extend([[dato[i],datacp[i]] for i in cluster if i != central_point_index] )
        if filtered_cluster:
            filtered_clusters.append(filtered_cluster)
@@ -188,11 +296,11 @@ def descarte(data, threshold):
    #discarded_data = remplazar_sequence_for_ID(discarded_data)
    # Save the discarded data to a CSV file using pandas
    # if discarded_data:
    #df = pd.DataFrame( [], columns=['protein_sequence','cluster_id','centroid','ProteinasDescartadas'])
    if discarded_data:
        df = pd.DataFrame(discarded_data, columns=['protein_sequence', 'cluster_id', 'centroid', 'protein_id'])
        #df2 = pd.DataFrame( discarded_data2, columns=['ProteinasDescartadas','secuencia'])
        #df.to_csv('resultados/proteinasDescartadasSmith.csv', index=False)
        #df2.to_csv('resultados/proteinasDescartadas2.csv', index=False)
        df.to_csv('resultados/proteinasClusterDBScanNW70.csv', index=False)
        #df2.to_csv('resultados/proteinasClusterDBScan.csv', index=False)
def remplazar_sequence_for_ID(output):
@@ -227,3 +335,5 @@ def remplazar_ID_for_sequence(output):
def ejecutar(archivoEntrada, enfermedad, similitud):
    data = readData(archivoEntrada, enfermedad)
    descarte(data, similitud)
if __name__ == '__main__':
    ejecutar("data_nervous_genes_xf.xlsx", "C0002395", 0.0001)
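plot_dendrogram itself is not part of this diff. The scikit-learn documentation ships a helper of the same name that converts a fitted AgglomerativeClustering model into a scipy linkage matrix; a sketch along those lines (it requires the model to expose distances_, e.g. via compute_distances=True, and may differ from this repo's actual helper):

import numpy as np
from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    # Count the samples under each node of the merge tree.
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1                      # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count
    linkage_matrix = np.column_stack([model.children_, model.distances_, counts]).astype(float)
    dendrogram(linkage_matrix, **kwargs)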
TFM-main/src/compute_distance_mat.py
@@ -143,8 +143,8 @@ def calculate_matrix_blasto(data):
    datf.to_csv('resultados/matrizBlast.csv', index=False, header=False)
def remplazar_sequence_for_ID(output):
    df_b = pd.read_excel("data_nervous_genes_2.xlsx")
    df_b = substitute_or_remove_prot_id(df_b, "s")
    df_b = pd.read_excel("data_nervous_genes_xf.xlsx")
    # df_b= substitute_or_remove_prot_id(df_b,"s")
    proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
    for i in range(len(output)):
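The loop that follows (cut off by the diff view) swaps each sequence in output for its ID via proteinas_dict; the same mapping in one pass (a sketch with hypothetical rows, unknown sequences kept as-is):

import pandas as pd

df_b = pd.DataFrame({'protein_sequence': ['MKVL...', 'GHTA...'],
                     'protein_id': ['P12345', 'Q67890']})
proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
output = ['GHTA...', 'MKVL...', 'UNSEEN']
output = [proteinas_dict.get(seq, seq) for seq in output]
print(output)   # ['Q67890', 'P12345', 'UNSEEN']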
@@ -255,45 +255,99 @@ def readData(archivoEntrada, enfermedad):
if __name__ == "__main__":
    data = readData("data_nervous_genes_x.xlsx", "C0002395")
    calculate_matrix_needle(data)
    calculate_matrix_smith(data)
    data = readData("data_nervous_genes_xf.xlsx", "C0002395")
    #calculate_matrix_needle(data)
    #calculate_matrix_smith(data)
    #calculate_matriz_levens(data)
    output = data.to_list()
    #output=remplazar_sequence_for_ID(data)
    output = list(remplazar_sequence_for_ID(data))
    similarity_matrix = pd.read_csv('resultados/matrizLevenshtein.csv', header=None, index_col=False) - 1
    #similarity_matrix=similarity_matrix/2
    similarity_matrix = similarity_matrix.abs()
    similarity_matrix.to_numpy()
    sim_mat_40 = similarity_matrix.copy()
    sim_mat_40 = sim_mat_40.to_numpy()
    sim_mat_20 = similarity_matrix.copy()
    sim_mat_10 = similarity_matrix.copy()
    data_40 = pd.read_csv('resultados/Metrica_Coincidencia_40.csv', names=['proteina1', 'proteina2', '%Coincidencia'], index_col=False)
    sim_mat_20 = sim_mat_20.to_numpy()
    #sim_mat_10=similarity_matrix.copy()
    data_40 = pd.read_csv('resultados/Metrica_Coincidencia.csv', names=['proteina1', 'proteina2', '%Coincidencia'], index_col=False)
    data_40 = data_40.drop([0])
    data_20 = pd.read_csv('resultados/Metrica_Coincidencia_20.csv', names=['proteina1', 'proteina2', '%Coincidencia'], index_col=False)
    data_20 = data_20.drop([0])
    data_10 = pd.read_csv('resultados/Metrica_Coincidencia_10.csv', names=['proteina1', 'proteina2', '%Coincidencia'], index_col=False)
    data_10 = data_10.drop([0])
    new_sim = np.copy(similarity_matrix)
    print(output)
    new_sim_mean = np.copy(similarity_matrix)
    # data_20=pd.read_csv('resultados/Metrica_Coincidencia_20.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
    # data_20=data_20.drop([0])
    # data_10=pd.read_csv('resultados/Metrica_Coincidencia_10.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
    # data_10=data_10.drop([0])
    # new_sim=np.copy(similarity_matrix)
    # print(output)
    # new_sim_mean=np.copy(similarity_matrix)
    for i, ks in data_40.iterrows():
        sim_mat_40[output.index(ks['proteina1'])][output.index(ks['proteina2'])] += float(ks['%Coincidencia']) * 0.3
        #print(ks['proteina1'])
        sim_mat_40[output.index(ks['proteina1'])][output.index(ks['proteina2'])] += float(ks['%Coincidencia']) * 0.3 / 70
        sim_mat_20[output.index(ks['proteina1'])][output.index(ks['proteina2'])] += float(ks['%Coincidencia'])
    #for i,kks in data_20.iterrows():
    #    sim_mat_20[output.index(kks['proteina1'])][output.index(kks['proteina2'])]+=float(kks['%Coincidencia'])*0.3
    #for i,ksk in data_10.iterrows():
    #    sim_mat_10[output.index(ksk['proteina1'])][output.index(ksk['proteina2'])]+=float(ksk['%Coincidencia'])*0.3
    #dfx=pd.DataFrame(sim_mat_20)
    #dfx=df/1.3
    #dfx=df-1
    #dfx.abs()
    for i in range(0, sim_mat_20.shape[0]):
        sim_mat_20[i][i] *= 2
    dfx = pd.DataFrame(sim_mat_20)
    dfx = dfx.apply(lambda x: x / 2)
    dfx = dfx - 1
    dfx = dfx.abs()
    #dfx.to_csv("resultados/matrizLevenshteinFS_20.csv",header=False,index=False)
    dfx.to_csv("resultados/matrizLevenshteinFS_Mean.csv", header=False, index=False)
    for i in range(0, sim_mat_40.shape[0]):
        sim_mat_40[i][i] /= 0.7
    dfx = pd.DataFrame(sim_mat_40)
    dfx = dfx / 1.3
    dfx = dfx.apply(lambda x: x * 0.7)
    dfx = dfx - 1
    dfx.abs()
    dfx = dfx.abs()
    dfx.to_csv("resultados/matrizLevenshteinFS_70.csv", header=False, index=False)
    similarity_matrix = pd.read_csv('resultados/matrizNeedleWunch.csv', header=None, index_col=False) + 1
    similarity_matrix = similarity_matrix / 2
    similarity_matrix = similarity_matrix.abs()
    similarity_matrix.to_numpy()
    sim_mat_40 = similarity_matrix.copy()
    sim_mat_40 = sim_mat_40.to_numpy()
    sim_mat_20 = similarity_matrix.copy()
    sim_mat_20 = sim_mat_20.to_numpy()
    #sim_mat_10=similarity_matrix.copy()
    data_40 = pd.read_csv('resultados/Metrica_Coincidencia.csv', names=['proteina1', 'proteina2', '%Coincidencia'], index_col=False)
    data_40 = data_40.drop([0])
    #data_20=pd.read_csv('resultados/Metrica_Coincidencia_20.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
    #data_20=data_20.drop([0])
    #data_10=pd.read_csv('resultados/Metrica_Coincidencia_10.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
    #data_10=data_10.drop([0])
    #new_sim=np.copy(similarity_matrix)
    #print(output)
    #new_sim_mean=np.copy(similarity_matrix)
    indexes = []
    for i, ks in data_40.iterrows():
        indexes.append((output.index(ks['proteina1']), output.index(ks['proteina2'])))
        sim_mat_40[output.index(ks['proteina1'])][output.index(ks['proteina2'])] += float(ks['%Coincidencia']) * 0.3 / 70
        sim_mat_20[output.index(ks['proteina1'])][output.index(ks['proteina2'])] += float(ks['%Coincidencia']) / 100
    #print(indexes)
    #for i,kks in data_20.iterrows():
    #    sim_mat_20[output.index(kks['proteina1'])][output.index(kks['proteina2'])]+=float(kks['%Coincidencia'])*0.3
    #for i,ksk in data_10.iterrows():
    #    sim_mat_10[output.index(ksk['proteina1'])][output.index(ksk['proteina2'])]+=float(ksk['%Coincidencia'])*0.3
    for i in range(0, sim_mat_20.shape[0]):
        sim_mat_20[i][i] *= 2
    dfx = pd.DataFrame(sim_mat_20)
    dfx = dfx.apply(lambda x: x / 2)
    dfx = dfx - 1
    dfx = dfx.abs()
    dfx.to_csv("resultados/matrizNeedleWunchFS_Mean.csv", header=False, index=False)
    for i in range(0, sim_mat_40.shape[0]):
        sim_mat_40[i][i] /= 0.7
    dfx = pd.DataFrame(sim_mat_40)
    dfx = dfx.apply(lambda x: x * 0.7)
    dfx = dfx - 1
    dfx = dfx.abs()
    dfx.to_csv("resultados/matrizLevenshteinFS_40.csv", header=False, index=False)
    dfx.to_csv("resultados/matrizNeedleWunchFS_70.csv", header=False, index=False)
    """
    dfx=pd.DataFrame(sim_mat_10)
    dfx=df/1.3
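The per-row iterrows() updates above pay an O(n) output.index() lookup on every access; an equivalent vectorized sketch of the %Coincidencia bonus (assuming output, data_40, sim_mat_40 and sim_mat_20 as defined above; names unchanged):

import numpy as np

pos = {prot: i for i, prot in enumerate(output)}    # ID -> matrix position, built once
rows = data_40['proteina1'].map(pos).to_numpy()
cols = data_40['proteina2'].map(pos).to_numpy()
bonus = data_40['%Coincidencia'].astype(float).to_numpy()
np.add.at(sim_mat_40, (rows, cols), bonus * 0.3 / 70)   # np.add.at handles repeated pairs
np.add.at(sim_mat_20, (rows, cols), bonus / 100)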