diff --git a/gen_train_data/.gitignore b/gen_train_data/.gitignore
index 695af1c930ad033b8ad05a640c7ab74d378d582f..6320cd248dd8aeaab759d5871f8781b5c0505172 100644
--- a/gen_train_data/.gitignore
+++ b/gen_train_data/.gitignore
@@ -1,2 +1 @@
-post_dataset.csv
-pre_dataset.csv
\ No newline at end of file
+data
\ No newline at end of file
diff --git a/gen_train_data/gen_train_data.ipynb b/gen_train_data/gen_train_data.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..f7061aed333dabaadbbe09cd89ddd44bad015352
--- /dev/null
+++ b/gen_train_data/gen_train_data.ipynb
@@ -0,0 +1,211 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Training Data Generation\n",
+    "By Joaquín Torres, May 2024"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Set-up"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Libraries\n",
+    "from pathlib import Path\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from imblearn.over_sampling import SMOTE\n",
+    "from imblearn.under_sampling import TomekLinks\n",
+    "from sklearn.model_selection import train_test_split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Configuration\n",
+    "TARGET = \"Situacion_tratamiento_REDEF\"  # target column name\n",
+    "SEED = 42  # shared seed for the split and for SMOTE\n",
+    "TEST_SIZE = 0.1  # 90-10 split\n",
+    "INPUT_DIR = Path('./data/input')\n",
+    "OUTPUT_DIR = Path('./data/output')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def split_features_target(df, target=TARGET):\n",
+    "    \"\"\"Return (X, y, feature_names) with the target column removed from X.\n",
+    "\n",
+    "    The same boolean mask selects the feature matrix and the feature names,\n",
+    "    so they stay aligned wherever the target column sits in the frame.\n",
+    "    \"\"\"\n",
+    "    is_feature = df.columns != target\n",
+    "    X = df.loc[:, is_feature].to_numpy()\n",
+    "    y = df[target].to_numpy()\n",
+    "    return X, y, df.columns[is_feature].to_numpy()\n",
+    "\n",
+    "\n",
+    "def save_xy(group, tag, X, y, out_dir=OUTPUT_DIR):\n",
+    "    \"\"\"Save X and y as <out_dir>/<group>/{X,y}_<tag>_<group>.npy.\"\"\"\n",
+    "    group_dir = Path(out_dir) / group\n",
+    "    group_dir.mkdir(parents=True, exist_ok=True)  # np.save does not create folders\n",
+    "    np.save(group_dir / f'X_{tag}_{group}.npy', X)\n",
+    "    np.save(group_dir / f'y_{tag}_{group}.npy', y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load clean datasets\n",
+    "df_pre = pd.read_csv(INPUT_DIR / 'pre_dataset.csv')\n",
+    "df_post = pd.read_csv(INPUT_DIR / 'post_dataset.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Feature matrix (X) and target vector (y) per period; feat holds the feature names of df_pre\n",
+    "X_pre, y_pre, feat = split_features_target(df_pre)\n",
+    "X_post, y_post, _ = split_features_target(df_post)\n",
+    "\n",
+    "print(X_pre.shape)\n",
+    "print(X_post.shape)\n",
+    "print(y_pre.shape)\n",
+    "print(y_post.shape)\n",
+    "print(len(feat))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Training-Test Split & Sampling"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ORIGINAL: 90-10 train/test split\n",
+    "# NOTE(review): consider stratify=y so the held-out set keeps the class ratio\n",
+    "X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, test_size=TEST_SIZE, random_state=SEED)\n",
+    "X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(X_post, y_post, test_size=TEST_SIZE, random_state=SEED)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save test data\n",
+    "save_xy('pre', 'test', X_test_pre, y_test_pre)\n",
+    "save_xy('post', 'test', X_test_post, y_test_post)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save ORIGINAL training data\n",
+    "save_xy('pre', 'train', X_train_pre, y_train_pre)\n",
+    "save_xy('post', 'train', X_train_post, y_train_post)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# OVERSAMPLING: SMOTE (seeded so the synthetic samples are reproducible)\n",
+    "smote = SMOTE(random_state=SEED)\n",
+    "X_train_over_pre, y_train_over_pre = smote.fit_resample(X_train_pre, y_train_pre)\n",
+    "X_train_over_post, y_train_over_post = smote.fit_resample(X_train_post, y_train_post)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save oversampled training data\n",
+    "save_xy('pre', 'train_over', X_train_over_pre, y_train_over_pre)\n",
+    "save_xy('post', 'train_over', X_train_over_post, y_train_over_post)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# UNDERSAMPLING: TOMEK-LINKS\n",
+    "tomek = TomekLinks()\n",
+    "X_train_under_pre, y_train_under_pre = tomek.fit_resample(X_train_pre, y_train_pre)\n",
+    "X_train_under_post, y_train_under_post = tomek.fit_resample(X_train_post, y_train_post)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save undersampled training data\n",
+    "save_xy('pre', 'train_under', X_train_under_pre, y_train_under_pre)\n",
+    "save_xy('post', 'train_under', X_train_under_post, y_train_under_post)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}