diff --git a/gen_train_data/gen_train_data.ipynb b/gen_train_data/gen_train_data.ipynb
index eac682dc1aad218732cb9f2c3fb3869235df0dd3..309411a3e6112d20571f3c15edf57ca55cb27a6c 100644
--- a/gen_train_data/gen_train_data.ipynb
+++ b/gen_train_data/gen_train_data.ipynb
@@ -17,20 +17,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
     "import pandas as pd\n",
     "import numpy as np\n",
     "from sklearn.model_selection import train_test_split\n",
-    "from imblearn.combine import SMOTETomek\n",
+    "# Over/under sampling methods\n",
+    "from imblearn.combine import SMOTETomek \n",
     "from imblearn.under_sampling import TomekLinks"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -41,9 +42,30 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(22861, 39)\n",
+      "(10677, 39)\n",
+      "(22861,)\n",
+      "(10677,)\n",
+      "['Ed_Not_Complete_Primary' 'Ed_Primary' 'Ed_Secondary'\n",
+      " 'Ed_Secondary_Technical' 'Ed_Tertiary' 'Social_Protection'\n",
+      " 'JobIn_Unstable' 'JobIn_Stable' 'JobIn_Unemployed' 'Hous_Institutional'\n",
+      " 'Hous_Stable' 'Hous_Unstable' 'Early_Alterations' 'SocInc_Family_Friends'\n",
+      " 'SocInc_Alone' 'SocInc_Instit' 'Risk_Stigma' 'Structural_Conflict' 'Age'\n",
+      " 'Sex' 'Num_Children' 'Smoking' 'Bio_Vulner' 'Opiods_DXCIE'\n",
+      " 'Cannabis_DXCIE' 'BZD_DXCIE' 'Cocaine_DXCIE' 'Hallucin_DXCIE'\n",
+      " 'Tobacco_DXCIE' 'Freq_1dpw' 'Freq_2-3dpw' 'Freq_4-6dpw' 'Freq_l1dpw'\n",
+      " 'Freq_None' 'Freq_Everyday' 'Years_Drug_Use' 'Other_Psychiatric_DX'\n",
+      " 'Previous_Treatments' 'Treatment_Adherence']\n"
+     ]
+    }
+   ],
    "source": [
     "# Creating a numpy matrix (X) without the target variable and a list with the target variable (y) \n",
     "X_pre, y_pre = df_pre.loc[:, df_pre.columns != \"Treatment_Outcome\"].to_numpy(), df_pre.Treatment_Outcome\n",
@@ -64,14 +86,21 @@
     "### Training-Test Split & Sampling"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Pipelines 1 and 2: ORIG and ORIG_CW"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# 1. ORIGINAL\n",
-    "X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, test_size=0.1, random_state=42) #90-10 split\n",
+    "# 90-10 train/test split (test_size=0.1); the fixed random_state=42 makes the partition reproducible across runs\n",
+    "X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, test_size=0.1, random_state=42) \n",
     "X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(X_post, y_post, test_size=0.1, random_state=42)"
    ]
   },
@@ -101,13 +130,20 @@
     "np.save('./output/post/y_train_post.npy', y_train_post)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Pipeline 3: OVER"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# 2. OVERSAMPLED training data\n",
-    "smote_tomek = SMOTETomek()\n",
+    "# OVERSAMPLED training data: SMOTETomek resampling, seeded (random_state=42, same seed as the train/test split) so the resampled sets are reproducible\n",
+    "smote_tomek = SMOTETomek(random_state=42)\n",
     "X_train_over_pre, y_train_over_pre = smote_tomek.fit_resample(X_train_pre, y_train_pre)\n",
     "X_train_over_post, y_train_over_post = smote_tomek.fit_resample(X_train_post, y_train_post)"