Commit 607f73b7 authored by albasanzbus

Cambios1

parent 58abbb6b
Pipeline #81 failed with stages
......@@ -4,8 +4,10 @@ from sklearn.metrics import (f1_score, precision_score, roc_auc_score,
recall_score, confusion_matrix, classification_report,
precision_recall_curve)
# ----------------- @K -----------------
def recall_precision_at_k(y_true, y_proba, step=50, name_model=""):
"""
Computes recall and precision for the top-k positive pairs.
"""
idx = np.argsort(y_proba)[::-1]
y_true = np.asarray(y_true)[idx]
out = []
......@@ -18,9 +20,10 @@ def recall_precision_at_k(y_true, y_proba, step=50, name_model=""):
modelo=name_model))
return pd.DataFrame(out)
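For orientation, a minimal usage sketch (the module path code.metrics follows the imports in run.py further down; the exact columns of the returned DataFrame are an assumption based on the keys built in the truncated body):
import numpy as np
from code.metrics import recall_precision_at_k  # assumed module path

y_true = np.array([1, 0, 1, 0, 0, 1, 0, 0, 1, 0])
y_proba = np.array([0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05])
df_k = recall_precision_at_k(y_true, y_proba, step=5, name_model="demo")
print(df_k)  # one row per cutoff k, tagged with the model name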
# ---------------- metrics summary -----------------
def summary_metrics(y_test, y_pred, y_proba):
"""
Returns a metrics summary (per-class F1, precision, recall, AUC, confusion matrix).
"""
# Get the indices of the 100 highest-probability predictions
top_100_idx = np.argsort(y_proba)[::-1][:100]
top_100_true = np.array(y_test)[top_100_idx].sum()
recall_at_100 = top_100_true / np.array(y_test).sum()
......@@ -47,16 +50,22 @@ def summary_metrics(y_test, y_pred, y_proba):
FN=fn
)
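A hedged usage sketch; the visible fragment computes recall over the 100 highest-probability predictions (recall@100) and the return carries the confusion-matrix counts, so any keys beyond those are assumptions:
y_test = [0, 1, 0, 1, 1, 0, 0, 1]
y_proba = [0.2, 0.9, 0.4, 0.7, 0.6, 0.1, 0.3, 0.8]
y_pred = [1 if p >= 0.5 else 0 for p in y_proba]
summ = summary_metrics(y_test, y_pred, y_proba)
print(summ)  # includes per-class F1, AUC, recall@100 and TN/FP/FN/TP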
# ----------------- confusion matrix -----------------
def conf_mat(y_test, y_pred):
"""
Returns the confusion matrix (TN, FP, FN, TP).
"""
return confusion_matrix(y_test, y_pred)
# ----------------- classification report -----------------
def clasif_report(y_test, y_pred):
"""
Returns a detailed per-class classification report.
"""
return classification_report(y_test, y_pred, zero_division=0)
# ----------------- threshold -----------------
def find_best_threshold(y_true, y_proba, metric=f1_score):
"""
Finds the decision threshold that maximizes the F1-score.
"""
prec, rec, thr = precision_recall_curve(y_true, y_proba)
f1 = 2 * prec * rec / (prec + rec + 1e-9)
best_idx = np.nanargmax(f1)
......
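A hedged usage sketch of find_best_threshold; the unpacking below mirrors the call `thr, _ = find_best_threshold(...)` in run.py, so the function is assumed to return (threshold, score):
import numpy as np

y_true = np.array([0, 0, 1, 1, 1, 0, 1, 0])
y_proba = np.array([0.10, 0.40, 0.35, 0.80, 0.65, 0.20, 0.90, 0.55])
thr, best_score = find_best_threshold(y_true, y_proba)
y_pred = (y_proba >= thr).astype(int)  # binarize with the tuned threshold instead of 0.5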
......@@ -9,9 +9,10 @@ from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
# ----------------- global settings -----------------
# Stratified 5-fold cross-validation
CV = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Dictionary of available models
MODELS = {
"knn": KNeighborsClassifier(),
"random_forest": RandomForestClassifier(random_state=42),
......@@ -20,6 +21,7 @@ MODELS = {
"xgboost": XGBClassifier(eval_metric="logloss", random_state=42)
}
# Hyperparameter grids for GridSearchCV
PARAMS = {
"knn": {
"n_neighbors": [3, 5, 7, 9],
......@@ -48,60 +50,59 @@ PARAMS = {
}
}
# ----------------- GridSearch --------------------
def _grid_with_smote(estimator, params):
"""
GridSearchCV with SMOTE applied inside the pipeline.
"""
pipe = Pipeline([
("smote", SMOTE(random_state=42)),
("clf", estimator)
])
# estimator parameters → "clf__" prefix
params = {f"clf__{k}": v for k, v in params.items()}
return GridSearchCV(pipe, params, scoring="f1",
cv=CV, verbose=0)
def grid_search(estimator, params):
"""
GridSearchCV without any class-balancing technique.
"""
return GridSearchCV(estimator, params, scoring='f1', cv=CV, verbose=0)
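Either builder can be used on its own; a minimal sketch with the knn entries defined above (make_classification is only a stand-in dataset):
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, weights=[0.9, 0.1], random_state=0)
grid = _grid_with_smote(MODELS["knn"], PARAMS["knn"])
grid.fit(X, y)            # SMOTE is applied only to the training split of each CV fold
print(grid.best_params_)  # parameter names carry the "clf__" prefix added above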
# ------------- BASE FOR THE MODELS -----------------------
def _make(name, *, variant="normal", pos_weight=None):
"""
variant ∈ {"normal", "smote", "cs"}
Builds the GridSearchCV (SMOTE or plain) for the given model, with or without cost-sensitive weighting.
"""
est = clone(MODELS[name])
grid = deepcopy(PARAMS[name])
# cost-sensitive
if variant.startswith("cs"):
if name == "xgboost":
est.set_params(scale_pos_weight=pos_weight)
else:
est.set_params(class_weight="balanced")
# GridSearchCV
if "smote" in variant:
return _grid_with_smote(est, grid)
else:
return grid_search(est, grid)
# ------------- FUNCTIONS -----------------------
# Plain models (no balancing)
def knn(): return _make("knn")
def random_forest(): return _make("random_forest")
def decision_tree(): return _make("decision_tree")
def logistic_regression(): return _make("logistic_regression")
def xgboost(): return _make("xgboost")
# Models with SMOTE
def knn_smote(): return _make("knn", variant="smote")
def random_forest_smote(): return _make("random_forest", variant="smote")
def decision_tree_smote(): return _make("decision_tree", variant="smote")
def logistic_regression_smote(): return _make("logistic_regression", variant="smote")
def xgboost_smote(): return _make("xgboost", variant="smote")
# Cost-sensitive models
def random_forest_cs(): return _make("random_forest", variant="cs")
def decision_tree_cs(): return _make("decision_tree", variant="cs")
def logistic_regression_cs(): return _make("logistic_regression", variant="cs")
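A hedged sketch of how these factories are consumed (X_train, y_train and X_val are placeholders; run.py below does the same inside its nested-CV loop):
grid = xgboost_smote()                 # GridSearchCV over a SMOTE + XGBClassifier pipeline
grid.fit(X_train, y_train)             # inner loop: hyperparameter search
best_model = grid.best_estimator_
val_proba = best_model.predict_proba(X_val)[:, 1]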
......
# pipelines/downsampling.py
from ..preprocessing import split_holdout, scale, cluster_downsample
import pandas as pd
import os
from pathlib import Path
ROOT = Path(__file__).resolve().parents[2]
......@@ -11,30 +8,26 @@ CACHED_DIR = ROOT / "data" / "downsampling"
CACHED_DIR.mkdir(parents=True, exist_ok=True)
def prepare(df, model_name):
# Split the data
(X_train_raw, y_train_raw,
X_test_alz, y_test_alz, df_test_alz,
X_test_esq, y_test_esq, df_test_esq) = split_holdout(df)
# Scaling (fit the scaler on the raw training set)
if model_name in {"knn", "logistic_regression"}:
X_train_scaled, X_test_alz = scale(X_train_raw, X_test_alz)
_, X_test_esq = scale(X_train_raw, X_test_esq)
else:
# If no scaling is needed, leave the data unchanged
X_train_scaled = X_train_raw.copy()
# Cache file names
x_path = CACHED_DIR / "X_train_downsampled.parquet"
y_path = CACHED_DIR / "y_train_downsampled.parquet"
if x_path.exists() and y_path.exists():
X_train = pd.read_parquet(x_path)
y_train = pd.read_parquet(y_path).squeeze()  # convert to a Series
else:
# Cluster-based downsampling on the already-scaled data
X_train, y_train = cluster_downsample(X_train_scaled, y_train_raw)
# Save to cache
X_train.to_parquet(x_path, index=False)
pd.DataFrame(y_train, columns=["class"]).to_parquet(y_path, index=False)
......
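A hedged usage sketch of the caching behaviour in downsampling.prepare (the 8-value return mirrors the unpacking in run.py): the first call computes the cluster-downsampled training set and writes it to data/downsampling/, later calls just read the parquet files back. Note that the cache file names do not encode whether the features were scaled, so switching model_name between cached runs can serve a training set built with a different scaling choice.
X_tr, y_tr, X_alz, y_alz, df_alz, X_esq, y_esq, df_esq = prepare(df, "random_forest")
# a second call skips cluster_downsample and loads data/downsampling/*.parquet instead
X_tr_cached, y_tr_cached, *rest = prepare(df, "random_forest")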
# pipelines/oversampling.py
from ..preprocessing import split_holdout, scale, oversample
import pandas as pd
import os
from pathlib import Path
ROOT = Path(__file__).resolve().parents[2]  # points to the project root (TFG/)
CACHED_DIR = ROOT / "data" / "oversampling"
CACHED_DIR.mkdir(parents=True, exist_ok=True)
def prepare(df, model_name):
# Split the data
(X_train_raw, y_train_raw,
X_test_alz, y_test_alz, df_test_alz,
X_test_esq, y_test_esq, df_test_esq) = split_holdout(df)
# Scaling (fit the scaler on the raw training set)
if model_name in {"knn", "logistic_regression"}:
X_train_scaled, X_test_alz = scale(X_train_raw, X_test_alz)
_, X_test_esq = scale(X_train_raw, X_test_esq)
else:
# If no scaling is needed, leave the data unchanged
X_train_scaled = X_train_raw.copy()
# Cache file names
x_path = CACHED_DIR / "X_train_oversampled.parquet"
y_path = CACHED_DIR / "y_train_oversampled.parquet"
if x_path.exists() and y_path.exists():
X_train = pd.read_parquet(x_path)
y_train = pd.read_parquet(y_path).squeeze()  # convert to a Series
else:
# SMOTE on the already-scaled data
X_train, y_train = oversample(X_train_scaled, y_train_raw)
# Save to cache
X_train.to_parquet(x_path, index=False)
pd.DataFrame(y_train, columns=["class"]).to_parquet(y_path, index=False)
......
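The oversampling variant is resolved dynamically by run_pipeline; a minimal sketch of that lookup (df is a placeholder DataFrame with the expected columns):
import importlib

prep = importlib.import_module(".pipelines.oversampling", package="code")
X_train, y_train, *test_sets = prep.prepare(df, "logistic_regression")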
# pipelines/oversampling_gridsearch.py
from ..preprocessing import split_holdout, scale, oversample
def prepare(df, model_name):
......
......@@ -5,9 +5,10 @@ from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import StandardScaler
### ----------------- split data -----------------
def split_holdout(df: pd.DataFrame, alzheimer_id="C0002395", esquizofrenia_id="C0036341"):
"""
Splits the dataset into training and test sets for two specific diseases (Alzheimer's and schizophrenia).
"""
# Separate the subsets
train = df[(df.disease_id != alzheimer_id) & (df.disease_id != esquizofrenia_id)]
test_alzheimer = df[df.disease_id == alzheimer_id]
......@@ -29,22 +30,26 @@ def split_holdout(df: pd.DataFrame, alzheimer_id="C0002395", esquizofrenia_id="C
return X_train, y_train, X_test_alz, y_test_alz, df_test_alz, X_test_esq, y_test_esq, df_test_esq
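A sketch of the resulting split (the CUI defaults are the ones in the signature above):
(X_train, y_train,
 X_test_alz, y_test_alz, df_test_alz,
 X_test_esq, y_test_esq, df_test_esq) = split_holdout(df)
# train: every disease except C0002395 (Alzheimer's) and C0036341 (schizophrenia)
# each test set: only the rows of the corresponding held-out disease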
### ----------------- scaler -----------------
def scale(X_train, X_test):
"""
Applies standard scaling (mean=0, variance=1) to the training and test data.
"""
scaler = StandardScaler()
# Make sure X_train is a DataFrame with named columns
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
return X_train_scaled, X_test_scaled
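Usage note: the scaler is fit on the training data only and then applied to the test data, so no test statistics leak into training. A minimal sketch:
X_train_scaled, X_test_scaled = scale(X_train, X_test)
print(X_train_scaled.mean().round(2).head())  # ≈ 0 per column on the training set only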
### ----------------- SMOTE -----------------
def oversample(X, y):
"""
Applies SMOTE to balance the classes in the training data.
"""
smote = SMOTE(random_state=42)
return smote.fit_resample(X, y)
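A minimal sketch; with SMOTE's default sampling_strategy the minority class is synthesized up to parity with the majority class:
from collections import Counter

X_res, y_res = oversample(X_train_scaled, y_train)
print(Counter(y_train), Counter(y_res))  # class counts before and after resampling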
### ----------------- clustering-based down-sampling -----------------
def cluster_downsample(X: pd.DataFrame, y: pd.Series):
"""
Downsamples the majority class using KMeans.
"""
# Make sure X is a DataFrame and y is a Series
if isinstance(X, np.ndarray):
X = pd.DataFrame(X)
......
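The body of cluster_downsample is truncated above. As context, a hedged illustration of the centroid-based idea suggested by the KMeans and pairwise_distances_argmin_min imports: cluster the majority class and keep only the sample nearest each centroid. This is an illustrative sketch, not the repository's exact implementation:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

def centroid_downsample(X: pd.DataFrame, y: pd.Series, random_state=42):
    # keep as many majority samples as there are minority samples, chosen near KMeans centroids
    counts = y.value_counts()
    maj, mino = counts.idxmax(), counts.idxmin()
    X_maj, X_min = X[y == maj], X[y == mino]
    km = KMeans(n_clusters=len(X_min), random_state=random_state, n_init=10).fit(X_maj)
    keep, _ = pairwise_distances_argmin_min(km.cluster_centers_, X_maj)
    X_out = pd.concat([X_maj.iloc[keep], X_min])
    return X_out, y.loc[X_out.index]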
......@@ -10,14 +10,19 @@ from .metrics import recall_precision_at_k, summary_metrics, clasif_report, find
from . import models
matplotlib.use("Agg")
# Global paths
ROOT = pathlib.Path(__file__).resolve().parents[1]
DATA = ROOT / "data"/ "unified"
RESULTS = ROOT / "results"
FIGURES = ROOT / "figures"
def _nested_cv_train(X, y, model_fn):
"""
Runs nested cross-validation:
- Inner loop: GridSearchCV to search for hyperparameters
- Outer loop: evaluates the tuned model (with the best hyperparameters from the inner loop) across multiple folds
Returns: the best model, the optimal hyperparameters, the threshold, and aggregated metrics
"""
if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X)
if not isinstance(y, pd.Series): y = pd.Series(y)
......@@ -39,10 +44,12 @@ def _nested_cv_train(X, y, model_fn):
best_params = grid.best_params_
y_val_proba = best_model.predict_proba(X_val)[:, 1]
# Best threshold
thr, _ = find_best_threshold(y_val, y_val_proba)
y_val_pred = (y_val_proba >= thr).astype(int)
metrics_fold = summary_metrics(y_val, y_val_pred, y_val_proba)
# Save fold results
metrics_per_fold.append(metrics_fold)
thresholds.append(thr)
models.append(best_model)
......@@ -65,6 +72,10 @@ def _nested_cv_train(X, y, model_fn):
return best_model, best_params, best_thr, metrics_cv
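A hedged usage sketch; this is essentially how run_pipeline invokes it further down, with model_fn being one of the factories from models.py:
model_fn = getattr(models, "random_forest_cs")  # any factory name from models.py
best_model, best_params, best_thr, metrics_cv = _nested_cv_train(X_train, y_train, model_fn)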
def _evaluate_test_set(model, threshold, X_test, y_test, df_test, tag):
"""
Evaluates the trained model on a test set.
Returns predictions, metrics and the precision-recall@K curve.
"""
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= threshold).astype(int)
......@@ -78,9 +89,14 @@ def _evaluate_test_set(model, threshold, X_test, y_test, df_test, tag):
return df_k, summ, report, df_preds
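A hedged usage sketch mirroring the Alzheimer's call in run_pipeline below:
df_k, summ, report, df_preds = _evaluate_test_set(
    best_model, best_thr, X_test_alz, y_test_alz, df_test_alz, "oversampling-xgboost-alz"
)
# df_k: precision/recall@K table, summ: metrics dict, report: classification report, df_preds: per-pair predictions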
# ───────────────────────── Run pipeline ───────────────────────────────
def run_pipeline(df, model_name:str, pipeline_name:str):
"""
Runs the full classification pipeline:
- Prepares the data
- Trains with nested cross-validation
- Evaluates on the test sets (Alzheimer's and schizophrenia)
- Returns the predictions, the pickled model and the metrics
"""
# Prepare the data according to the pipeline
prep = importlib.import_module(f".pipelines.{pipeline_name}", package="code")
X_train, y_train, X_test_alz, y_test_alz, df_test_alz, X_test_esq, y_test_esq, df_test_esq = prep.prepare(df, model_name)
......@@ -98,14 +114,13 @@ def run_pipeline(df, model_name:str, pipeline_name:str):
model_fn = getattr(models, model_name)
tag = f"{pipeline_name}-{model_name}"
# Training with nested cross-validation
best_model, best_params, best_thr, metrics_cv = _nested_cv_train(X_train, y_train, model_fn)
# Evaluate on Alzheimer's
df_k_alz, summ_alz, report_alz, df_preds_alz = _evaluate_test_set(
best_model, best_thr, X_test_alz, y_test_alz, df_test_alz, tag + "-alz"
)
# Evaluate on schizophrenia
df_k_esq, summ_esq, report_esq, df_preds_esq = _evaluate_test_set(
best_model, best_thr, X_test_esq, y_test_esq, df_test_esq, tag + "-esq"
......@@ -120,7 +135,6 @@ def run_pipeline(df, model_name:str, pipeline_name:str):
"tag": tag,
}
# ──────────────────────────── main ──────────────────────────────────────────
if __name__ == "__main__":
df = pd.read_parquet(DATA / "table_completed_preprocessed.parquet")
......@@ -134,7 +148,6 @@ if __name__ == "__main__":
results = run_pipeline(df, model_name, pipeline_name)
tag = f"{pipeline_name}-{model_name}"
# ─────── Create storage paths ───────
path_models = RESULTS / "models" / model_name / pipeline_name
path_alz = RESULTS / "alzheimer" / model_name / pipeline_name
path_esq = RESULTS / "esquizofrenia" / model_name / pipeline_name
......@@ -142,7 +155,7 @@ if __name__ == "__main__":
for path in [path_models, path_alz, path_esq]:
path.mkdir(parents=True, exist_ok=True)
# Save the trained model and CV metrics
with open(path_models / "model.pkl", "wb") as f:
pickle.dump(results["best_model"], f)
......@@ -151,26 +164,22 @@ if __name__ == "__main__":
metrics_cv_df["best_params"] = str(results["params"])
metrics_cv_df.to_csv(path_models / "metrics.csv", index=False)
# Save Alzheimer's results
df_k_alz, summ_alz, report_alz, df_preds_alz = results["alz"]
df_preds_alz["pipeline"] = tag
df_preds_alz.to_csv(path_alz / "predictions.csv", index=False)
df_k_alz["pipeline"] = tag
df_k_alz.to_csv(path_alz / "recall_precision_at_k.csv", index=False)
df_metrics_alz = pd.DataFrame([summ_alz])
df_metrics_alz["pipeline"] = tag
df_metrics_alz.to_csv(path_alz / "metrics.csv", index=False)
# Save schizophrenia results
df_k_esq, summ_esq, report_esq, df_preds_esq = results["esq"]
df_preds_esq["pipeline"] = tag
df_preds_esq.to_csv(path_esq / "predictions.csv", index=False)
df_k_esq["pipeline"] = tag
df_k_esq.to_csv(path_esq / "recall_precision_at_k.csv", index=False)
df_metrics_esq = pd.DataFrame([summ_esq])
df_metrics_esq["pipeline"] = tag
df_metrics_esq.to_csv(path_esq / "metrics.csv", index=False)
......