diff --git a/gen_train_data/.gitignore b/gen_train_data/.gitignore
index 695af1c930ad033b8ad05a640c7ab74d378d582f..6320cd248dd8aeaab759d5871f8781b5c0505172 100644
--- a/gen_train_data/.gitignore
+++ b/gen_train_data/.gitignore
@@ -1,2 +1 @@
-post_dataset.csv
-pre_dataset.csv
\ No newline at end of file
+data
\ No newline at end of file
diff --git a/gen_train_data/gen_train_data.ipynb b/gen_train_data/gen_train_data.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..f7061aed333dabaadbbe09cd89ddd44bad015352
--- /dev/null
+++ b/gen_train_data/gen_train_data.ipynb
@@ -0,0 +1,189 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Training Data Generation\n",
+    "By Joaquín Torres, May 2024"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Set-up"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Libraries\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from imblearn.over_sampling import SMOTE\n",
+    "from imblearn.under_sampling import TomekLinks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the clean pre and post datasets from the input folder\n",
+    "df_pre = pd.read_csv(filepath_or_buffer='./data/input/pre_dataset.csv')\n",
+    "df_post = pd.read_csv(filepath_or_buffer='./data/input/post_dataset.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(22861, 39)\n",
+      "(10677, 39)\n",
+      "(22861,)\n",
+      "(10677,)\n",
+      "39\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Split each dataset into a feature matrix X (numpy array, target column excluded) and the target y (pandas Series)\n",
+    "X_pre, y_pre = df_pre.loc[:, df_pre.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), df_pre.Situacion_tratamiento_REDEF\n",
+    "X_post, y_post = df_post.loc[:, df_post.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), df_post.Situacion_tratamiento_REDEF\n",
+    "feat = df_pre.columns[df_pre.columns != \"Situacion_tratamiento_REDEF\"].to_numpy() # Feature labels, selected by name (same mask as X) so they match X's columns wherever the target sits\n",
+    "\n",
+    "print(X_pre.shape)\n",
+    "print(X_post.shape)\n",
+    "print(y_pre.shape)\n",
+    "print(y_post.shape)\n",
+    "print(len(feat))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Training-Test Split & Sampling"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ORIGINAL (non-resampled) data: 90-10 training/test split with a fixed seed, done separately for pre and post\n",
+    "X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, random_state=42, test_size=0.1)\n",
+    "X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(X_post, y_post, random_state=42, test_size=0.1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write the held-out test features and targets of both datasets to .npy files\n",
+    "np.save(file='./data/output/pre/X_test_pre.npy', arr=X_test_pre)\n",
+    "np.save(file='./data/output/pre/y_test_pre.npy', arr=y_test_pre)\n",
+    "np.save(file='./data/output/post/X_test_post.npy', arr=X_test_post)\n",
+    "np.save(file='./data/output/post/y_test_post.npy', arr=y_test_post)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write the ORIGINAL (non-resampled) training features and targets of both datasets to .npy files\n",
+    "np.save(file='./data/output/pre/X_train_pre.npy', arr=X_train_pre)\n",
+    "np.save(file='./data/output/pre/y_train_pre.npy', arr=y_train_pre)\n",
+    "np.save(file='./data/output/post/X_train_post.npy', arr=X_train_post)\n",
+    "np.save(file='./data/output/post/y_train_post.npy', arr=y_train_post)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# OVERSAMPLING: SMOTE, applied to the training split only (the test split is left untouched)\n",
+    "smote = SMOTE(random_state=42) # Seeded like the train/test split so the synthetic samples are the same on every run\n",
+    "X_train_over_pre, y_train_over_pre = smote.fit_resample(X_train_pre, y_train_pre)\n",
+    "X_train_over_post, y_train_over_post = smote.fit_resample(X_train_post, y_train_post)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write the SMOTE-oversampled training features and targets of both datasets to .npy files\n",
+    "np.save(file='./data/output/pre/X_train_over_pre.npy', arr=X_train_over_pre)\n",
+    "np.save(file='./data/output/pre/y_train_over_pre.npy', arr=y_train_over_pre)\n",
+    "np.save(file='./data/output/post/X_train_over_post.npy', arr=X_train_over_post)\n",
+    "np.save(file='./data/output/post/y_train_over_post.npy', arr=y_train_over_post)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# UNDERSAMPLING: Tomek links, applied to the training split of each dataset\n",
+    "tomek = TomekLinks()\n",
+    "X_train_under_pre, y_train_under_pre = tomek.fit_resample(X=X_train_pre, y=y_train_pre)\n",
+    "X_train_under_post, y_train_under_post = tomek.fit_resample(X=X_train_post, y=y_train_post)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write the Tomek-links-undersampled training features and targets of both datasets to .npy files\n",
+    "np.save(file='./data/output/pre/X_train_under_pre.npy', arr=X_train_under_pre)\n",
+    "np.save(file='./data/output/pre/y_train_under_pre.npy', arr=y_train_under_pre)\n",
+    "np.save(file='./data/output/post/X_train_under_post.npy', arr=X_train_under_post)\n",
+    "np.save(file='./data/output/post/y_train_under_post.npy', arr=y_train_under_post)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}