gen_train_data.ipynb 4.89 KB
Newer Older
1 2 3 4 5 6
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
7 8
    "**Training Data Generation** \\\n",
    "_Author: Joaquín Torres Bravo_"
9 10 11 12 13 14 15 16 17 18 19
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set-up"
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
20
   "execution_count": null,
21 22 23 24 25 26
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
Joaquin Torres's avatar
Joaquin Torres committed
27
    "from imblearn.combine import SMOTETomek\n",
28 29 30 31 32
    "from imblearn.under_sampling import TomekLinks"
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
33
   "execution_count": null,
34 35 36 37
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load clean datasets\n",
Joaquin Torres's avatar
Joaquin Torres committed
38 39
    "# NOTE(review): the paths are relative, so this presumably expects the notebook's own folder as working directory - confirm\n",
    "df_pre = pd.read_csv('../EDA/output/datasets/pre_dataset.csv')\n",
    "df_post = pd.read_csv('../EDA/output/datasets/post_dataset.csv')"
40 41 42 43
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
44
   "execution_count": null,
45
   "metadata": {},
Joaquin Torres's avatar
Joaquin Torres committed
46
   "outputs": [],
47 48
   "source": [
    "# Build a NumPy feature matrix (X) without the target column and a pandas Series holding the target variable (y)\n",
Joaquin Torres's avatar
Joaquin Torres committed
49 50
    "# Features: every column except the target, converted to a NumPy array\n",
    "X_pre = df_pre.loc[:, df_pre.columns != \"Treatment_Outcome\"].to_numpy()\n",
    "# Target: the Treatment_Outcome column\n",
    "y_pre = df_pre.Treatment_Outcome\n",
    "X_post = df_post.loc[:, df_post.columns != \"Treatment_Outcome\"].to_numpy()\n",
    "y_post = df_post.Treatment_Outcome\n",
51 52 53 54 55 56
    "# Feature labels: select by name (not by position) so they always line up with the columns of X,\n",
    "# even if Treatment_Outcome is not the last column of the dataframe\n",
    "feat = df_pre.columns[df_pre.columns != \"Treatment_Outcome\"].to_numpy()\n",
    "\n",
    "print(X_pre.shape)\n",
    "print(X_post.shape)\n",
    "print(y_pre.shape)\n",
    "print(y_post.shape)\n",
Joaquin Torres's avatar
Joaquin Torres committed
57
    "print(feat)"
58 59 60 61 62 63 64 65 66 67 68
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Training-Test Split & Sampling"
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
69
   "execution_count": null,
70 71 72
   "metadata": {},
   "outputs": [],
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
73
    "# 1. ORIGINAL\n",
74 75 76 77 78 79
    "# 90-10 split with a fixed seed for reproducibility.\n",
    "# stratify keeps the class proportions of the target equal in train and test: the resampling steps further down\n",
    "# indicate imbalanced classes, so a plain random 10% test set could under-represent one of them.\n",
    "X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(\n",
    "    X_pre, y_pre, test_size=0.1, random_state=42, stratify=y_pre\n",
    ")\n",
    "X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(\n",
    "    X_post, y_post, test_size=0.1, random_state=42, stratify=y_post\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
80
   "execution_count": null,
81 82 83 84
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save test data\n",
Joaquin Torres's avatar
Joaquin Torres committed
85 86 87 88
    "# Persist the held-out test split; pre and post arrays are written to separate folders\n",
    "# NOTE(review): np.save does not create missing directories - ./output/pre and ./output/post must already exist\n",
    "np.save('./output/pre/X_test_pre.npy', X_test_pre)\n",
    "np.save('./output/pre/y_test_pre.npy', y_test_pre)\n",
    "np.save('./output/post/X_test_post.npy', X_test_post)\n",
    "np.save('./output/post/y_test_post.npy', y_test_post)"
89 90 91 92
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
93
   "execution_count": null,
94 95 96 97
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save ORIGINAL training data\n",
Joaquin Torres's avatar
Joaquin Torres committed
98 99 100 101
    "# Training split exactly as produced by train_test_split (no resampling applied)\n",
    "np.save('./output/pre/X_train_pre.npy', X_train_pre)\n",
    "np.save('./output/pre/y_train_pre.npy', y_train_pre)\n",
    "np.save('./output/post/X_train_post.npy', X_train_post)\n",
    "np.save('./output/post/y_train_post.npy', y_train_post)"
102 103 104 105
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
106
   "execution_count": null,
107 108 109
   "metadata": {},
   "outputs": [],
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
110
    "# 2. OVERSAMPLED training data\n",
Joaquin Torres's avatar
Joaquin Torres committed
111 112 113
    "# Fixed seed: SMOTE draws synthetic samples at random, so without random_state the oversampled\n",
    "# training sets would differ on every run even though the train/test split itself is seeded\n",
    "smote_tomek = SMOTETomek(random_state=42)\n",
    "# Resample the training split only; the test split is never passed to the sampler\n",
    "X_train_over_pre, y_train_over_pre = smote_tomek.fit_resample(X_train_pre, y_train_pre)\n",
    "X_train_over_post, y_train_over_post = smote_tomek.fit_resample(X_train_post, y_train_post)"
114 115 116 117
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
118
   "execution_count": null,
119 120 121 122
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save oversampled training data\n",
Joaquin Torres's avatar
Joaquin Torres committed
123 124 125 126
    "# Arrays returned by SMOTETomek.fit_resample on the training split\n",
    "np.save('./output/pre/X_train_over_pre.npy', X_train_over_pre)\n",
    "np.save('./output/pre/y_train_over_pre.npy', y_train_over_pre)\n",
    "np.save('./output/post/X_train_over_post.npy', X_train_over_post)\n",
    "np.save('./output/post/y_train_over_post.npy', y_train_over_post)"
127 128 129 130
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
131
   "execution_count": null,
132 133 134
   "metadata": {},
   "outputs": [],
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
135
    "# 3. UNDERSAMPLING: TOMEK-LINKS \n",
136 137 138 139 140 141 142
    "# Under-sample the training split only (the test split is never passed to the sampler);\n",
    "# the same TomekLinks instance is reused for the pre and post datasets\n",
    "tomek = TomekLinks()\n",
    "X_train_under_pre, y_train_under_pre = tomek.fit_resample(X_train_pre, y_train_pre)\n",
    "X_train_under_post, y_train_under_post = tomek.fit_resample(X_train_post, y_train_post)"
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
143
   "execution_count": null,
144 145 146 147
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save undersampled training data\n",
Joaquin Torres's avatar
Joaquin Torres committed
148 149 150 151
    "# Arrays returned by TomekLinks.fit_resample on the training split\n",
    "np.save('./output/pre/X_train_under_pre.npy', X_train_under_pre)\n",
    "np.save('./output/pre/y_train_under_pre.npy', y_train_under_pre)\n",
    "np.save('./output/post/X_train_under_post.npy', X_train_under_post)\n",
    "np.save('./output/post/y_train_under_post.npy', y_train_under_post)"
152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}