{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training Data Generation\n",
    "By Joaquín Torres, May 2024"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set-up"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Libraries\n",
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from imblearn.combine import SMOTETomek\n",
    "from imblearn.under_sampling import TomekLinks\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: every value a reader might want to tune lives here\n",
    "INPUT_DIR = Path('./data/input')\n",
    "OUTPUT_DIR = Path('./data/output')\n",
    "TARGET = 'Situacion_tratamiento_REDEF'  # label column, present in both datasets\n",
    "TEST_SIZE = 0.1  # 90-10 train/test split\n",
    "SEED = 42  # shared seed for the split and for the SMOTE resampling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load clean datasets\n",
    "df_pre = pd.read_csv(INPUT_DIR / 'pre_dataset.csv')\n",
    "df_post = pd.read_csv(INPUT_DIR / 'post_dataset.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(22861, 39)\n",
      "(10677, 39)\n",
      "(22861,)\n",
      "(10677,)\n",
      "39\n"
     ]
    }
   ],
   "source": [
    "# Creating a numpy matrix (X) without the target variable and a Series with the target variable (y)\n",
    "is_feature = df_pre.columns != TARGET\n",
    "X_pre, y_pre = df_pre.loc[:, is_feature].to_numpy(), df_pre[TARGET]\n",
    "X_post, y_post = df_post.loc[:, df_post.columns != TARGET].to_numpy(), df_post[TARGET]\n",
    "\n",
    "# Feature labels are taken with the same mask as X_pre, so they stay aligned with its\n",
    "# columns wherever the target column sits in the file (dropping the last label was only\n",
    "# correct when the target happened to be the last column)\n",
    "feat = df_pre.columns[is_feature].to_numpy()\n",
    "\n",
    "print(X_pre.shape)\n",
    "print(X_post.shape)\n",
    "print(y_pre.shape)\n",
    "print(y_post.shape)\n",
    "print(len(feat))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Training-Test Split & Sampling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_arrays(cohort, suffix, X, y):\n",
    "    \"\"\"Save X and y as OUTPUT_DIR/<cohort>/X_<suffix>_<cohort>.npy and y_<suffix>_<cohort>.npy.\n",
    "\n",
    "    cohort: sub-folder and file-name tag ('pre' or 'post' in this notebook).\n",
    "    suffix: which split/sampling the arrays belong to, e.g. 'test' or 'train_over'.\n",
    "    The cohort folder is created first, so the notebook also runs on a fresh checkout.\n",
    "    \"\"\"\n",
    "    out_dir = OUTPUT_DIR / cohort\n",
    "    out_dir.mkdir(parents=True, exist_ok=True)\n",
    "    np.save(out_dir / f'X_{suffix}_{cohort}.npy', X)\n",
    "    np.save(out_dir / f'y_{suffix}_{cohort}.npy', y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ORIGINAL: 90-10 split\n",
    "# NOTE(review): the split is not stratified; consider stratify=y_pre / stratify=y_post if the\n",
    "# test sets must keep the class ratio. Left as is so the already-saved test sets do not move.\n",
    "X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(\n",
    "    X_pre, y_pre, test_size=TEST_SIZE, random_state=SEED\n",
    ")\n",
    "X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(\n",
    "    X_post, y_post, test_size=TEST_SIZE, random_state=SEED\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save test data\n",
    "save_arrays('pre', 'test', X_test_pre, y_test_pre)\n",
    "save_arrays('post', 'test', X_test_post, y_test_post)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save ORIGINAL training data\n",
    "save_arrays('pre', 'train', X_train_pre, y_train_pre)\n",
    "save_arrays('post', 'train', X_train_post, y_train_post)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# OVERSAMPLED training data (SMOTE followed by Tomek-links cleaning)\n",
    "# SMOTE draws its synthetic samples at random, so it is seeded to make the saved files reproducible\n",
    "smote_tomek = SMOTETomek(random_state=SEED)\n",
    "X_train_over_pre, y_train_over_pre = smote_tomek.fit_resample(X_train_pre, y_train_pre)\n",
    "X_train_over_post, y_train_over_post = smote_tomek.fit_resample(X_train_post, y_train_post)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save oversampled training data\n",
    "save_arrays('pre', 'train_over', X_train_over_pre, y_train_over_pre)\n",
    "save_arrays('post', 'train_over', X_train_over_post, y_train_over_post)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# UNDERSAMPLING: TOMEK-LINKS\n",
    "tomek = TomekLinks()\n",
    "X_train_under_pre, y_train_under_pre = tomek.fit_resample(X_train_pre, y_train_pre)\n",
    "X_train_under_post, y_train_under_post = tomek.fit_resample(X_train_post, y_train_post)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save undersampled training data\n",
    "save_arrays('pre', 'train_under', X_train_under_pre, y_train_under_pre)\n",
    "save_arrays('post', 'train_under', X_train_under_post, y_train_under_post)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}