GRENADA / DRIVE ML Metamodel

Commit 607f73b7, authored Jun 03, 2025 by albasanzbus
Commit message: Cambios1
Parent: 58abbb6b
Pipeline #81 failed with stages
Showing 7 changed files with 75 additions and 63 deletions (+75 / -63)
code/metrics.py                              +15  -6
code/models.py                               +15  -14
code/pipelines/downsampling.py                +3  -10
code/pipelines/oversampling.py                +4  -9
code/pipelines/oversampling_gridsearch.py     +1  -1
code/preprocessing.py                        +12  -7
code/run.py                                  +25  -16
code/metrics.py
...
@@ -4,8 +4,10 @@ from sklearn.metrics import (f1_score, precision_score, roc_auc_score,
                             recall_score, confusion_matrix,
                             classification_report, precision_recall_curve)

# ----------------- @K -----------------
def recall_precision_at_k(y_true, y_proba, step=50, name_model=""):
    """Compute recall and precision for the top-k positive pairs."""
    idx = np.argsort(y_proba)[::-1]
    y_true = np.asarray(y_true)[idx]
    out = []
...
@@ -18,9 +20,10 @@ def recall_precision_at_k(y_true, y_proba, step=50, name_model=""):
            modelo=name_model))
    return pd.DataFrame(out)

# ----------------- metrics summary -----------------
def summary_metrics(y_test, y_pred, y_proba):
    """Return a summary of metrics (per-class F1, precision, recall, AUC, confusion matrix)."""
    # Indices of the 100 highest-probability predictions
    top_100_idx = np.argsort(y_proba)[::-1][:100]
    top_100_true = np.array(y_test)[top_100_idx].sum()
    recall_at_100 = top_100_true / np.array(y_test).sum()
...
@@ -47,16 +50,22 @@ def summary_metrics(y_test, y_pred, y_proba):
        FN=fn)

# ----------------- confusion matrix -----------------
def conf_mat(y_test, y_pred):
    """Return the confusion matrix (TN, FP, FN, TP)."""
    return confusion_matrix(y_test, y_pred)

# ----------------- classification report -----------------
def clasif_report(y_test, y_pred):
    """Return a detailed classification report."""
    return classification_report(y_test, y_pred, zero_division=0)

# ----------------- threshold -----------------
def find_best_threshold(y_true, y_proba, metric=f1_score):
    """Find the decision threshold that maximises the F1-score."""
    prec, rec, thr = precision_recall_curve(y_true, y_proba)
    f1 = 2 * prec * rec / (prec + rec + 1e-9)
    best_idx = np.nanargmax(f1)
...
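For orientation, a minimal usage sketch of these helpers on synthetic scores. It assumes the project is run from its root with the code directory importable as a package (as the package="code" import in run.py suggests); the synthetic labels and probabilities are purely illustrative.

import numpy as np
from code.metrics import recall_precision_at_k, summary_metrics, find_best_threshold

# Synthetic labels and probabilities, only to illustrate the call pattern.
rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, size=500)
y_proba = np.clip(0.6 * y_true + rng.normal(0.3, 0.2, size=500), 0, 1)

thr, _ = find_best_threshold(y_true, y_proba)    # threshold that maximises F1
y_pred = (y_proba >= thr).astype(int)

print(summary_metrics(y_true, y_pred, y_proba))                  # metrics summary
print(recall_precision_at_k(y_true, y_proba, step=100).head())   # recall/precision @K as a DataFrame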
code/models.py
...
@@ -9,9 +9,10 @@ from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# ----------------- global configuration -----------------
# Stratified cross-validation with 5 folds
CV = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Dictionary of available methods
MODELS = {
    "knn": KNeighborsClassifier(),
    "random_forest": RandomForestClassifier(random_state=42),
...
@@ -20,6 +21,7 @@ MODELS = {
    "xgboost": XGBClassifier(eval_metric="logloss", random_state=42)
}

# Hyper-parameters for GridSearchCV
PARAMS = {
    "knn": {
        "n_neighbors": [3, 5, 7, 9],
...
@@ -48,60 +50,59 @@ PARAMS = {
    }
}

# ----------------- GridSearch --------------------
def _grid_with_smote(estimator, params):
    """GridSearchCV with SMOTE applied inside the pipeline."""
    pipe = Pipeline([
        ("smote", SMOTE(random_state=42)),
        ("clf", estimator)
    ])
    # estimator parameters get the "clf__" prefix
    params = {f"clf__{k}": v for k, v in params.items()}
    return GridSearchCV(pipe, params, scoring="f1", cv=CV, verbose=0)

def grid_search(estimator, params):
    """GridSearchCV without any balancing technique."""
    return GridSearchCV(estimator, params, scoring="f1", cv=CV, verbose=0)

# ------------- BASE FOR THE MODELS -----------------------
def _make(name, *, variant="normal", pos_weight=None):
    """
    variant ∈ {"normal", "smote", "cs"}
    Build the GridSearchCV (plain or with SMOTE) for the given model,
    with or without cost-sensitive weighting.
    """
    est = clone(MODELS[name])
    grid = deepcopy(PARAMS[name])

    # --- cost-sensitive -------------------------------------
    if variant.startswith("cs"):
        if name == "xgboost":
            est.set_params(scale_pos_weight=pos_weight)
        else:
            est.set_params(class_weight="balanced")

    # --- grid builder ---------------------------------------
    if "smote" in variant:
        return _grid_with_smote(est, grid)
    else:
        return grid_search(est, grid)

# ------------- FACTORY FUNCTIONS -----------------------
# plain models (no balancing)
def knn():
    return _make("knn")
def random_forest():
    return _make("random_forest")
def decision_tree():
    return _make("decision_tree")
def logistic_regression():
    return _make("logistic_regression")
def xgboost():
    return _make("xgboost")

# models with SMOTE inside the grid search
def knn_smote():
    return _make("knn", variant="smote")
def random_forest_smote():
    return _make("random_forest", variant="smote")
def decision_tree_smote():
    return _make("decision_tree", variant="smote")
def logistic_regression_smote():
    return _make("logistic_regression", variant="smote")
def xgboost_smote():
    return _make("xgboost", variant="smote")

# cost-sensitive models
def random_forest_cs():
    return _make("random_forest", variant="cs")
def decision_tree_cs():
    return _make("decision_tree", variant="cs")
def logistic_regression_cs():
    return _make("logistic_regression", variant="cs")
...
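A minimal sketch of how these factory functions are presumably consumed: each one returns an unfitted GridSearchCV (with SMOTE folded into an imblearn Pipeline for the _smote variants), so the caller simply fits it and reads the standard attributes. The toy dataset below is an assumption for illustration only.

from sklearn.datasets import make_classification
from code import models

# Imbalanced toy data standing in for the real feature table.
X, y = make_classification(n_samples=400, weights=[0.9, 0.1], random_state=0)

grid = models.random_forest_smote()   # GridSearchCV over a SMOTE -> RandomForest pipeline
grid.fit(X, y)

print(grid.best_params_)              # best hyper-parameters, keyed with the "clf__" prefix
best = grid.best_estimator_           # refitted pipeline, ready for predict_proba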
code/pipelines/downsampling.py
# pipelines/downsampling.py
from ..preprocessing import split_holdout, scale, cluster_downsample
import pandas as pd
import os
from pathlib import Path

ROOT = Path(__file__).resolve().parents[2]
...
@@ -11,30 +8,26 @@ CACHED_DIR = ROOT / "data" / "downsampling"
CACHED_DIR.mkdir(parents=True, exist_ok=True)

def prepare(df, model_name):
    # Split the data
    (X_train_raw, y_train_raw,
     X_test_alz, y_test_alz, df_test_alz,
     X_test_esq, y_test_esq, df_test_esq) = split_holdout(df)

    # Scaling (when required): the scaler is fitted on the raw training set
    if model_name in {"knn", "logistic_regression"}:
        X_train_scaled, X_test_alz = scale(X_train_raw, X_test_alz)
        _, X_test_esq = scale(X_train_raw, X_test_esq)
    else:
        # No scaling: keep the data unchanged
        X_train_scaled = X_train_raw.copy()

    # Cache file names
    x_path = CACHED_DIR / "X_train_downsampled.parquet"
    y_path = CACHED_DIR / "y_train_downsampled.parquet"

    if x_path.exists() and y_path.exists():
        X_train = pd.read_parquet(x_path)
        y_train = pd.read_parquet(y_path).squeeze()  # squeeze to a Series
    else:
        # Cluster-based downsampling on the already scaled data
        X_train, y_train = cluster_downsample(X_train_scaled, y_train_raw)
        # Save to cache
        X_train.to_parquet(x_path, index=False)
        pd.DataFrame(y_train, columns=["class"]).to_parquet(y_path, index=False)
...
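Both resampling pipelines expose the same prepare(df, model_name) contract consumed by run.py: an 8-tuple of training data plus the two hold-out test sets, with the resampled training set cached as parquet so repeated runs skip the resampling step. A hedged sketch of that call, assuming the unified table referenced in run.py and a working directory at the project root:

import pandas as pd
from code.pipelines import downsampling

df = pd.read_parquet("data/unified/table_completed_preprocessed.parquet")

(X_train, y_train,
 X_test_alz, y_test_alz, df_test_alz,
 X_test_esq, y_test_esq, df_test_esq) = downsampling.prepare(df, "knn")

# First call: the KMeans-based downsampling runs and the result is written to
# data/downsampling/*.parquet; later calls read the cached files instead.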
code/pipelines/oversampling.py
# pipelines/oversampling.py
from ..preprocessing import split_holdout, scale, oversample
import pandas as pd
import os
from pathlib import Path

# points at the project root folder (TFG/)
ROOT = Path(__file__).resolve().parents[2]
CACHED_DIR = ROOT / "data" / "oversampling"
CACHED_DIR.mkdir(parents=True, exist_ok=True)

def prepare(df, model_name):
    # Split the data
    (X_train_raw, y_train_raw,
     X_test_alz, y_test_alz, df_test_alz,
     X_test_esq, y_test_esq, df_test_esq) = split_holdout(df)

    # Scaling (when required): the scaler is fitted on the raw training set
    if model_name in {"knn", "logistic_regression"}:
        X_train_scaled, X_test_alz = scale(X_train_raw, X_test_alz)
        _, X_test_esq = scale(X_train_raw, X_test_esq)
    else:
        # No scaling: keep the data unchanged
        X_train_scaled = X_train_raw.copy()

    # Cache file names
    x_path = CACHED_DIR / "X_train_oversampled.parquet"
    y_path = CACHED_DIR / "y_train_oversampled.parquet"

    if x_path.exists() and y_path.exists():
        X_train = pd.read_parquet(x_path)
        y_train = pd.read_parquet(y_path).squeeze()  # squeeze to a Series
    else:
        # SMOTE on the already scaled data
        X_train, y_train = oversample(X_train_scaled, y_train_raw)
        # Save to cache
        X_train.to_parquet(x_path, index=False)
        pd.DataFrame(y_train, columns=["class"]).to_parquet(y_path, index=False)
...
code/pipelines/oversampling_gridsearch.py
# pipelines/oversampling_gridsearch.py
from ..preprocessing import split_holdout, scale, oversample

def prepare(df, model_name):
...
code/preprocessing.py
...
@@ -5,9 +5,10 @@ from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import StandardScaler

### ----------------- data split -----------------
def split_holdout(df: pd.DataFrame, alzheimer_id="C0002395", esquizofrenia_id="C0036341"):
    """
    Split the dataset into training and test sets for two specific diseases
    (Alzheimer's disease and schizophrenia).
    """
    # Build the subsets
    train = df[(df.disease_id != alzheimer_id) & (df.disease_id != esquizofrenia_id)]
    test_alzheimer = df[df.disease_id == alzheimer_id]
...
@@ -29,22 +30,26 @@ def split_holdout(df: pd.DataFrame, alzheimer_id="C0002395", esquizofrenia_id="C
    return X_train, y_train, X_test_alz, y_test_alz, df_test_alz, X_test_esq, y_test_esq, df_test_esq

### ----------------- scaler -----------------
def scale(X_train, X_test):
    """Apply standard scaling (mean=0, var=1) to the training and test data."""
    scaler = StandardScaler()
    # Keep X_train as a DataFrame with its column names and index
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train),
                                  columns=X_train.columns, index=X_train.index)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test),
                                 columns=X_test.columns, index=X_test.index)
    return X_train_scaled, X_test_scaled

### ----------------- SMOTE -----------------
def oversample(X, y):
    """Apply SMOTE to balance the classes in the training data."""
    smote = SMOTE(random_state=42)
    return smote.fit_resample(X, y)

### ----------------- cluster-based down-sampling -----------------
def cluster_downsample(X: pd.DataFrame, y: pd.Series):
    """Downsample the majority class using KMeans."""
    # Make sure X is a DataFrame and y is a Series
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X)
...
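To make the division of labour concrete, a small sketch of the three helpers on toy data; the toy DataFrame and its shapes are assumptions, only the function signatures come from the file above.

import numpy as np
import pandas as pd
from code.preprocessing import scale, oversample, cluster_downsample

# Toy imbalanced data (30 positives out of 300 rows), split into train and test.
rng = np.random.default_rng(42)
X = pd.DataFrame(rng.normal(size=(300, 4)), columns=["f1", "f2", "f3", "f4"])
y = pd.Series([1] * 30 + [0] * 270)

X_tr, X_te = X.iloc[:200], X.iloc[200:]
y_tr = y.iloc[:200]

X_tr_s, X_te_s = scale(X_tr, X_te)                 # StandardScaler fitted on train only
X_over, y_over = oversample(X_tr_s, y_tr)          # SMOTE balances the classes upwards
X_down, y_down = cluster_downsample(X_tr_s, y_tr)  # KMeans shrinks the majority class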
code/run.py
...
@@ -10,14 +10,19 @@ from .metrics import recall_precision_at_k, summary_metrics, clasif_report, find
from . import models

matplotlib.use("Agg")

# ───────────────────────── global paths ────────────────────────────────────
ROOT = pathlib.Path(__file__).resolve().parents[1]
DATA = ROOT / "data" / "unified"
RESULTS = ROOT / "results"
FIGURES = ROOT / "figures"

def _nested_cv_train(X, y, model_fn):
    """
    Run nested cross-validation:
    - Inner loop: GridSearchCV to tune the hyper-parameters
    - Outer loop: evaluate the tuned model (with the best inner-loop
      hyper-parameters) across multiple folds
    Returns: best model, optimal hyper-parameters, threshold and aggregated metrics.
    """
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    if not isinstance(y, pd.Series):
        y = pd.Series(y)
...
@@ -39,10 +44,12 @@ def _nested_cv_train(X, y, model_fn):
        best_params = grid.best_params_
        y_val_proba = best_model.predict_proba(X_val)[:, 1]

        # Best threshold
        thr, _ = find_best_threshold(y_val, y_val_proba)
        y_val_pred = (y_val_proba >= thr).astype(int)
        metrics_fold = summary_metrics(y_val, y_val_pred, y_val_proba)

        # Store fold results
        metrics_per_fold.append(metrics_fold)
        thresholds.append(thr)
        models.append(best_model)
...
@@ -65,6 +72,10 @@ def _nested_cv_train(X, y, model_fn):
    return best_model, best_params, best_thr, metrics_cv

def _evaluate_test_set(model, threshold, X_test, y_test, df_test, tag):
    """
    Evaluate the trained model on the test sets.
    Returns predictions, metrics and the precision-recall@K curve.
    """
    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)
...
@@ -78,9 +89,14 @@ def _evaluate_test_set(model, threshold, X_test, y_test, df_test, tag):
    return df_k, summ, report, df_preds

# ───────────────────────── run the pipeline ───────────────────────────────
def run_pipeline(df, model_name: str, pipeline_name: str):
    """
    Run the full classification pipeline:
    - Prepare the data
    - Train with nested cross-validation
    - Evaluate on the test sets (Alzheimer's disease and schizophrenia)
    - Return the predictions, the pickled model and the metrics
    """
    # Prepare the data according to the chosen pipeline
    prep = importlib.import_module(f".pipelines.{pipeline_name}", package="code")
    X_train, y_train, X_test_alz, y_test_alz, df_test_alz, X_test_esq, y_test_esq, df_test_esq = prep.prepare(df, model_name)
...
@@ -98,14 +114,13 @@ def run_pipeline(df, model_name:str, pipeline_name:str):
    model_fn = getattr(models, model_name)
    tag = f"{pipeline_name}-{model_name}"

    # Training with nested cross-validation
    best_model, best_params, best_thr, metrics_cv = _nested_cv_train(X_train, y_train, model_fn)

    # Evaluate on Alzheimer's disease
    df_k_alz, summ_alz, report_alz, df_preds_alz = _evaluate_test_set(
        best_model, best_thr, X_test_alz, y_test_alz, df_test_alz, tag + "-alz")
    # Evaluate on schizophrenia
    df_k_esq, summ_esq, report_esq, df_preds_esq = _evaluate_test_set(
        best_model, best_thr, X_test_esq, y_test_esq, df_test_esq, tag + "-esq"
...
@@ -120,7 +135,6 @@ def run_pipeline(df, model_name:str, pipeline_name:str):
        "tag": tag,
    }

# ──────────────────────────── main ──────────────────────────────────────────
if __name__ == "__main__":
    df = pd.read_parquet(DATA / "table_completed_preprocessed.parquet")
...
@@ -134,7 +148,6 @@ if __name__ == "__main__":
    results = run_pipeline(df, model_name, pipeline_name)
    tag = f"{pipeline_name}-{model_name}"

    # Create the output directories
    path_models = RESULTS / "models" / model_name / pipeline_name
    path_alz = RESULTS / "alzheimer" / model_name / pipeline_name
    path_esq = RESULTS / "esquizofrenia" / model_name / pipeline_name
...
@@ -142,7 +155,7 @@ if __name__ == "__main__":
    for path in [path_models, path_alz, path_esq]:
        path.mkdir(parents=True, exist_ok=True)

    # Save the trained model and the CV metrics
    with open(path_models / "model.pkl", "wb") as f:
        pickle.dump(results["best_model"], f)
...
@@ -151,26 +164,22 @@ if __name__ == "__main__":
    metrics_cv_df["best_params"] = str(results["params"])
    metrics_cv_df.to_csv(path_models / "metrics.csv", index=False)

    # Save the Alzheimer's disease results
    df_k_alz, summ_alz, report_alz, df_preds_alz = results["alz"]
    df_preds_alz["pipeline"] = tag
    df_preds_alz.to_csv(path_alz / "predictions.csv", index=False)
    df_k_alz["pipeline"] = tag
    df_k_alz.to_csv(path_alz / "recall_precision_at_k.csv", index=False)
    df_metrics_alz = pd.DataFrame([summ_alz])
    df_metrics_alz["pipeline"] = tag
    df_metrics_alz.to_csv(path_alz / "metrics.csv", index=False)

    # Save the schizophrenia results
    df_k_esq, summ_esq, report_esq, df_preds_esq = results["esq"]
    df_preds_esq["pipeline"] = tag
    df_preds_esq.to_csv(path_esq / "predictions.csv", index=False)
    df_k_esq["pipeline"] = tag
    df_k_esq.to_csv(path_esq / "recall_precision_at_k.csv", index=False)
    df_metrics_esq = pd.DataFrame([summ_esq])
    df_metrics_esq["pipeline"] = tag
    df_metrics_esq.to_csv(path_esq / "metrics.csv", index=False)
...
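Since the argument handling of the __main__ block is collapsed in this diff, here is a hedged sketch of driving the pipeline directly from Python. The model and pipeline names come from models.py and the code/pipelines/ modules above; everything else mirrors what the __main__ block does with the returned dictionary.

import pandas as pd
from code.run import run_pipeline, DATA

df = pd.read_parquet(DATA / "table_completed_preprocessed.parquet")

results = run_pipeline(df, model_name="xgboost", pipeline_name="oversampling")

print(results["params"])                                       # best hyper-parameters from the nested CV
df_k_alz, summ_alz, report_alz, df_preds_alz = results["alz"]  # Alzheimer's hold-out results
df_k_esq, summ_esq, report_esq, df_preds_esq = results["esq"]  # schizophrenia hold-out results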