Commit 607f73b7 authored by albasanzbus

Cambios1

parent 58abbb6b
Pipeline #81 failed with stages
......@@ -4,8 +4,10 @@ from sklearn.metrics import (f1_score, precision_score, roc_auc_score,
recall_score, confusion_matrix, classification_report,
precision_recall_curve)
# ----------------- @K -----------------
def recall_precision_at_k(y_true, y_proba, step=50, name_model=""):
"""
Computes recall and precision for the top-k positive pairs.
"""
idx = np.argsort(y_proba)[::-1]
y_true = np.asarray(y_true)[idx]
out = []
......@@ -18,9 +20,10 @@ def recall_precision_at_k(y_true, y_proba, step=50, name_model=""):
modelo=name_model))
return pd.DataFrame(out)
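For orientation, a minimal usage sketch (the module path code.metrics follows the imports in run.py further down; the exact columns of the returned DataFrame are an assumption based on the keys built in the truncated body):
import numpy as np
from code.metrics import recall_precision_at_k  # assumed module path

y_true = np.array([1, 0, 1, 0, 0, 1, 0, 0, 1, 0])
y_proba = np.array([0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05])
df_k = recall_precision_at_k(y_true, y_proba, step=5, name_model="demo")
print(df_k)  # one row per cutoff k, tagged with the model name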
# ---------------- metrics summary -----------------
def summary_metrics(y_test, y_pred, y_proba):
"""
Returns a metrics summary (per-class F1, precision, recall, AUC, confusion matrix).
"""
# Get the indices of the 100 highest-probability predictions
top_100_idx = np.argsort(y_proba)[::-1][:100]
top_100_true = np.array(y_test)[top_100_idx].sum()
recall_at_100 = top_100_true / np.array(y_test).sum()
......@@ -47,16 +50,22 @@ def summary_metrics(y_test, y_pred, y_proba):
FN=fn
)
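A hedged usage sketch; the visible fragment computes recall over the 100 highest-probability predictions (recall@100) and the return carries the confusion-matrix counts, so any keys beyond those are assumptions:
y_test = [0, 1, 0, 1, 1, 0, 0, 1]
y_proba = [0.2, 0.9, 0.4, 0.7, 0.6, 0.1, 0.3, 0.8]
y_pred = [1 if p >= 0.5 else 0 for p in y_proba]
summ = summary_metrics(y_test, y_pred, y_proba)
print(summ)  # includes per-class F1, AUC, recall@100 and TN/FP/FN/TP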
# ----------------- confusion matrix -----------------
def conf_mat(y_test, y_pred):
"""
Returns the confusion matrix (TN, FP, FN, TP).
"""
return confusion_matrix(y_test, y_pred)
# ----------------- classification report -----------------
def clasif_report(y_test, y_pred):
"""
Returns a detailed per-class classification report.
"""
return classification_report(y_test, y_pred, zero_division=0)
# ----------------- threshold -----------------
def find_best_threshold(y_true, y_proba, metric=f1_score):
"""
Finds the decision threshold that maximizes the F1-score.
"""
prec, rec, thr = precision_recall_curve(y_true, y_proba)
f1 = 2 * prec * rec / (prec + rec + 1e-9)
best_idx = np.nanargmax(f1)
......
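A hedged usage sketch of find_best_threshold; the unpacking below mirrors the call `thr, _ = find_best_threshold(...)` in run.py, so the function is assumed to return (threshold, score):
import numpy as np

y_true = np.array([0, 0, 1, 1, 1, 0, 1, 0])
y_proba = np.array([0.10, 0.40, 0.35, 0.80, 0.65, 0.20, 0.90, 0.55])
thr, best_score = find_best_threshold(y_true, y_proba)
y_pred = (y_proba >= thr).astype(int)  # binarize with the tuned threshold instead of 0.5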
......@@ -9,9 +9,10 @@ from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
# ----------------- global settings -----------------
# Stratified 5-fold cross-validation
CV = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Dictionary of available models
MODELS = {
"knn": KNeighborsClassifier(),
"random_forest": RandomForestClassifier(random_state=42),
......@@ -20,6 +21,7 @@ MODELS = {
"xgboost": XGBClassifier(eval_metric="logloss", random_state=42)
}
# Hyperparameter grids for GridSearchCV
PARAMS = {
"knn": {
"n_neighbors": [3, 5, 7, 9],
......@@ -48,60 +50,59 @@ PARAMS = {
}
}
# ----------------- GridSearch --------------------
def _grid_with_smote(estimator, params):
"""
GridSearchCV with SMOTE applied inside the pipeline.
"""
pipe = Pipeline([
("smote", SMOTE(random_state=42)),
("clf", estimator)
])
# estimator parameters → "clf__" prefix
params = {f"clf__{k}": v for k, v in params.items()}
return GridSearchCV(pipe, params, scoring="f1",
cv=CV, verbose=0)
def grid_search(estimator, params):
"""
GridSearchCV without any class-balancing technique.
"""
return GridSearchCV(estimator, params, scoring='f1', cv=CV, verbose=0)
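Either builder can be used on its own; a minimal sketch with the knn entries defined above (make_classification is only a stand-in dataset):
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, weights=[0.9, 0.1], random_state=0)
grid = _grid_with_smote(MODELS["knn"], PARAMS["knn"])
grid.fit(X, y)            # SMOTE is applied only to the training split of each CV fold
print(grid.best_params_)  # parameter names carry the "clf__" prefix added above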
# ------------- BASE FOR THE MODELS -----------------------
def _make(name, *, variant="normal", pos_weight=None):
"""
variant ∈ {"normal", "smote", "cs"}
Builds the GridSearchCV (SMOTE or plain) for the given model, with or without cost-sensitive weighting.
"""
est = clone(MODELS[name])
grid = deepcopy(PARAMS[name])
# cost-sensitive
if variant.startswith("cs"):
if name == "xgboost":
est.set_params(scale_pos_weight=pos_weight)
else:
est.set_params(class_weight="balanced")
# GridSearchCV
if "smote" in variant:
return _grid_with_smote(est, grid)
else:
return grid_search(est, grid)
# ------------- FUNCTIONS -----------------------
# Plain models (no balancing)
def knn(): return _make("knn")
def random_forest(): return _make("random_forest")
def decision_tree(): return _make("decision_tree")
def logistic_regression(): return _make("logistic_regression")
def xgboost(): return _make("xgboost")
# Models with SMOTE
def knn_smote(): return _make("knn", variant="smote")
def random_forest_smote(): return _make("random_forest", variant="smote")
def decision_tree_smote(): return _make("decision_tree", variant="smote")
def logistic_regression_smote(): return _make("logistic_regression", variant="smote")
def xgboost_smote(): return _make("xgboost", variant="smote")
# Cost-sensitive models
def random_forest_cs(): return _make("random_forest", variant="cs")
def decision_tree_cs(): return _make("decision_tree", variant="cs")
def logistic_regression_cs(): return _make("logistic_regression", variant="cs")
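A hedged sketch of how these factories are consumed (X_train, y_train and X_val are placeholders; run.py below does the same inside its nested-CV loop):
grid = xgboost_smote()                 # GridSearchCV over a SMOTE + XGBClassifier pipeline
grid.fit(X_train, y_train)             # inner loop: hyperparameter search
best_model = grid.best_estimator_
val_proba = best_model.predict_proba(X_val)[:, 1]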
......
# pipelines/downsampling.py
from ..preprocessing import split_holdout, scale, cluster_downsample
import pandas as pd
import os
from pathlib import Path
ROOT = Path(__file__).resolve().parents[2]
......@@ -11,30 +8,26 @@ CACHED_DIR = ROOT / "data" / "downsampling"
CACHED_DIR.mkdir(parents=True, exist_ok=True)
def prepare(df, model_name):
# Split the data
(X_train_raw, y_train_raw,
X_test_alz, y_test_alz, df_test_alz,
X_test_esq, y_test_esq, df_test_esq) = split_holdout(df)
# Scaling (fit the scaler on the raw training set)
if model_name in {"knn", "logistic_regression"}:
X_train_scaled, X_test_alz = scale(X_train_raw, X_test_alz)
_, X_test_esq = scale(X_train_raw, X_test_esq)
else:
# If no scaling is needed, leave the data unchanged
X_train_scaled = X_train_raw.copy()
# Cache file names
x_path = CACHED_DIR / "X_train_downsampled.parquet"
y_path = CACHED_DIR / "y_train_downsampled.parquet"
if x_path.exists() and y_path.exists():
X_train = pd.read_parquet(x_path)
y_train = pd.read_parquet(y_path).squeeze()  # convert to a Series
else:
# Cluster-based downsampling on the already-scaled data
X_train, y_train = cluster_downsample(X_train_scaled, y_train_raw)
# Save to cache
X_train.to_parquet(x_path, index=False)
pd.DataFrame(y_train, columns=["class"]).to_parquet(y_path, index=False)
......
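A hedged usage sketch of the caching behaviour in downsampling.prepare (the 8-value return mirrors the unpacking in run.py): the first call computes the cluster-downsampled training set and writes it to data/downsampling/, later calls just read the parquet files back. Note that the cache file names do not encode whether the features were scaled, so switching model_name between cached runs can serve a training set built with a different scaling choice.
X_tr, y_tr, X_alz, y_alz, df_alz, X_esq, y_esq, df_esq = prepare(df, "random_forest")
# a second call skips cluster_downsample and loads data/downsampling/*.parquet instead
X_tr_cached, y_tr_cached, *rest = prepare(df, "random_forest")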
# pipelines/oversampling.py
from ..preprocessing import split_holdout, scale, oversample
import pandas as pd
import os
from pathlib import Path
ROOT = Path(__file__).resolve().parents[2]  # points to the project root (TFG/)
CACHED_DIR = ROOT / "data" / "oversampling"
CACHED_DIR.mkdir(parents=True, exist_ok=True)
def prepare(df, model_name):
# Split the data
(X_train_raw, y_train_raw,
X_test_alz, y_test_alz, df_test_alz,
X_test_esq, y_test_esq, df_test_esq) = split_holdout(df)
# Scaling (fit the scaler on the raw training set)
if model_name in {"knn", "logistic_regression"}:
X_train_scaled, X_test_alz = scale(X_train_raw, X_test_alz)
_, X_test_esq = scale(X_train_raw, X_test_esq)
else:
# If no scaling is needed, leave the data unchanged
X_train_scaled = X_train_raw.copy()
# Cache file names
x_path = CACHED_DIR / "X_train_oversampled.parquet"
y_path = CACHED_DIR / "y_train_oversampled.parquet"
if x_path.exists() and y_path.exists():
X_train = pd.read_parquet(x_path)
y_train = pd.read_parquet(y_path).squeeze()  # convert to a Series
else:
# SMOTE on the already-scaled data
X_train, y_train = oversample(X_train_scaled, y_train_raw)
# Save to cache
X_train.to_parquet(x_path, index=False)
pd.DataFrame(y_train, columns=["class"]).to_parquet(y_path, index=False)
......
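The oversampling variant is resolved dynamically by run_pipeline; a minimal sketch of that lookup (df is a placeholder DataFrame with the expected columns):
import importlib

prep = importlib.import_module(".pipelines.oversampling", package="code")
X_train, y_train, *test_sets = prep.prepare(df, "logistic_regression")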
# pipelines/oversampling_gridsearch.py
from ..preprocessing import split_holdout, scale, oversample
def prepare(df, model_name):
......
......@@ -5,9 +5,10 @@ from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import StandardScaler
### ----------------- split data -----------------
def split_holdout(df: pd.DataFrame, alzheimer_id="C0002395", esquizofrenia_id="C0036341"):
"""
Splits the dataset into training and test sets for two specific diseases (Alzheimer's and schizophrenia).
"""
# Separate the subsets
train = df[(df.disease_id != alzheimer_id) & (df.disease_id != esquizofrenia_id)]
test_alzheimer = df[df.disease_id == alzheimer_id]
......@@ -29,22 +30,26 @@ def split_holdout(df: pd.DataFrame, alzheimer_id="C0002395", esquizofrenia_id="C
return X_train, y_train, X_test_alz, y_test_alz, df_test_alz, X_test_esq, y_test_esq, df_test_esq
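A sketch of the resulting split (the CUI defaults are the ones in the signature above):
(X_train, y_train,
 X_test_alz, y_test_alz, df_test_alz,
 X_test_esq, y_test_esq, df_test_esq) = split_holdout(df)
# train: every disease except C0002395 (Alzheimer's) and C0036341 (schizophrenia)
# each test set: only the rows of the corresponding held-out disease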
### ----------------- scaler -----------------
def scale(X_train, X_test):
"""
Applies standard scaling (mean=0, variance=1) to the training and test data.
"""
scaler = StandardScaler()
# Make sure X_train is a DataFrame with named columns
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
return X_train_scaled, X_test_scaled
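Usage note: the scaler is fit on the training data only and then applied to the test data, so no test statistics leak into training. A minimal sketch:
X_train_scaled, X_test_scaled = scale(X_train, X_test)
print(X_train_scaled.mean().round(2).head())  # ≈ 0 per column on the training set only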
### ----------------- SMOTE -----------------
def oversample(X, y):
"""
Applies SMOTE to balance the classes in the training data.
"""
smote = SMOTE(random_state=42)
return smote.fit_resample(X, y)
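A minimal sketch; with SMOTE's default sampling_strategy the minority class is synthesized up to parity with the majority class:
from collections import Counter

X_res, y_res = oversample(X_train_scaled, y_train)
print(Counter(y_train), Counter(y_res))  # class counts before and after resampling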
### ----------------- clustering-based down-sampling -----------------
def cluster_downsample(X: pd.DataFrame, y: pd.Series):
"""
Downsamples the majority class using KMeans.
"""
# Make sure X is a DataFrame and y is a Series
if isinstance(X, np.ndarray):
X = pd.DataFrame(X)
......
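The body of cluster_downsample is truncated above. As context, a hedged illustration of the centroid-based idea suggested by the KMeans and pairwise_distances_argmin_min imports: cluster the majority class and keep only the sample nearest each centroid. This is an illustrative sketch, not the repository's exact implementation:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

def centroid_downsample(X: pd.DataFrame, y: pd.Series, random_state=42):
    # keep as many majority samples as there are minority samples, chosen near KMeans centroids
    counts = y.value_counts()
    maj, mino = counts.idxmax(), counts.idxmin()
    X_maj, X_min = X[y == maj], X[y == mino]
    km = KMeans(n_clusters=len(X_min), random_state=random_state, n_init=10).fit(X_maj)
    keep, _ = pairwise_distances_argmin_min(km.cluster_centers_, X_maj)
    X_out = pd.concat([X_maj.iloc[keep], X_min])
    return X_out, y.loc[X_out.index]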
......@@ -10,14 +10,19 @@ from .metrics import recall_precision_at_k, summary_metrics, clasif_report, find
from . import models
matplotlib.use("Agg")
# Global paths
ROOT = pathlib.Path(__file__).resolve().parents[1]
DATA = ROOT / "data"/ "unified"
RESULTS = ROOT / "results"
FIGURES = ROOT / "figures"
def _nested_cv_train(X, y, model_fn):
"""
Runs nested cross-validation:
- Inner loop: GridSearchCV to search for hyperparameters
- Outer loop: evaluates the tuned model (with the best hyperparameters from the inner loop) across multiple folds
Returns: the best model, the optimal hyperparameters, the threshold, and aggregated metrics
"""
if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X)
if not isinstance(y, pd.Series): y = pd.Series(y)
......@@ -39,10 +44,12 @@ def _nested_cv_train(X, y, model_fn):
best_params = grid.best_params_
y_val_proba = best_model.predict_proba(X_val)[:, 1]
# Best threshold
thr, _ = find_best_threshold(y_val, y_val_proba)
y_val_pred = (y_val_proba >= thr).astype(int)
metrics_fold = summary_metrics(y_val, y_val_pred, y_val_proba)
# Save fold results
metrics_per_fold.append(metrics_fold)
thresholds.append(thr)
models.append(best_model)
......@@ -65,6 +72,10 @@ def _nested_cv_train(X, y, model_fn):
return best_model, best_params, best_thr, metrics_cv
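A hedged usage sketch; this is essentially how run_pipeline invokes it further down, with model_fn being one of the factories from models.py:
model_fn = getattr(models, "random_forest_cs")  # any factory name from models.py
best_model, best_params, best_thr, metrics_cv = _nested_cv_train(X_train, y_train, model_fn)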
def _evaluate_test_set(model, threshold, X_test, y_test, df_test, tag):
"""
Evaluates the trained model on a test set.
Returns predictions, metrics and the precision-recall@K curve.
"""
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= threshold).astype(int)
......@@ -78,9 +89,14 @@ def _evaluate_test_set(model, threshold, X_test, y_test, df_test, tag):
return df_k, summ, report, df_preds
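A hedged usage sketch mirroring the Alzheimer's call in run_pipeline below:
df_k, summ, report, df_preds = _evaluate_test_set(
    best_model, best_thr, X_test_alz, y_test_alz, df_test_alz, "oversampling-xgboost-alz"
)
# df_k: precision/recall@K table, summ: metrics dict, report: classification report, df_preds: per-pair predictions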
# ───────────────────────── Run pipeline ───────────────────────────────
def run_pipeline(df, model_name:str, pipeline_name:str):
"""
Runs the full classification pipeline:
- Prepares the data
- Trains with nested cross-validation
- Evaluates on the test sets (Alzheimer's and schizophrenia)
- Returns the predictions, the pickled model and the metrics
"""
# Prepare the data according to the pipeline
prep = importlib.import_module(f".pipelines.{pipeline_name}", package="code")
X_train, y_train, X_test_alz, y_test_alz, df_test_alz, X_test_esq, y_test_esq, df_test_esq = prep.prepare(df, model_name)
......@@ -98,14 +114,13 @@ def run_pipeline(df, model_name:str, pipeline_name:str):
model_fn = getattr(models, model_name)
tag = f"{pipeline_name}-{model_name}"
# Training with nested cross-validation
best_model, best_params, best_thr, metrics_cv = _nested_cv_train(X_train, y_train, model_fn)
# Evaluate on Alzheimer's
df_k_alz, summ_alz, report_alz, df_preds_alz = _evaluate_test_set(
best_model, best_thr, X_test_alz, y_test_alz, df_test_alz, tag + "-alz"
)
# Evaluate on schizophrenia
df_k_esq, summ_esq, report_esq, df_preds_esq = _evaluate_test_set(
best_model, best_thr, X_test_esq, y_test_esq, df_test_esq, tag + "-esq"
......@@ -120,7 +135,6 @@ def run_pipeline(df, model_name:str, pipeline_name:str):
"tag": tag,
}
# ──────────────────────────── main ──────────────────────────────────────────
if __name__ == "__main__":
df = pd.read_parquet(DATA / "table_completed_preprocessed.parquet")
......@@ -134,7 +148,6 @@ if __name__ == "__main__":
results = run_pipeline(df, model_name, pipeline_name)
tag = f"{pipeline_name}-{model_name}"
# ─────── Create storage paths ───────
path_models = RESULTS / "models" / model_name / pipeline_name
path_alz = RESULTS / "alzheimer" / model_name / pipeline_name
path_esq = RESULTS / "esquizofrenia" / model_name / pipeline_name
......@@ -142,7 +155,7 @@ if __name__ == "__main__":
for path in [path_models, path_alz, path_esq]:
path.mkdir(parents=True, exist_ok=True)
# Save the trained model and CV metrics
with open(path_models / "model.pkl", "wb") as f:
pickle.dump(results["best_model"], f)
......@@ -151,26 +164,22 @@ if __name__ == "__main__":
metrics_cv_df["best_params"] = str(results["params"])
metrics_cv_df.to_csv(path_models / "metrics.csv", index=False)
# Save Alzheimer's results
df_k_alz, summ_alz, report_alz, df_preds_alz = results["alz"]
df_preds_alz["pipeline"] = tag
df_preds_alz.to_csv(path_alz / "predictions.csv", index=False)
df_k_alz["pipeline"] = tag
df_k_alz.to_csv(path_alz / "recall_precision_at_k.csv", index=False)
df_metrics_alz = pd.DataFrame([summ_alz])
df_metrics_alz["pipeline"] = tag
df_metrics_alz.to_csv(path_alz / "metrics.csv", index=False)
# Save schizophrenia results
df_k_esq, summ_esq, report_esq, df_preds_esq = results["esq"]
df_preds_esq["pipeline"] = tag
df_preds_esq.to_csv(path_esq / "predictions.csv", index=False)
df_k_esq["pipeline"] = tag
df_k_esq.to_csv(path_esq / "recall_precision_at_k.csv", index=False)
df_metrics_esq = pd.DataFrame([summ_esq])
df_metrics_esq["pipeline"] = tag
df_metrics_esq.to_csv(path_esq / "metrics.csv", index=False)
......