gen_train_data.ipynb 6.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training Data Generation\n",
    "By Joaquín Torres, May 2024"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set-up"
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
20
   "execution_count": 1,
21 22 23 24 25 26 27
   "metadata": {},
   "outputs": [],
   "source": [
    "# Libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
Joaquin Torres's avatar
Joaquin Torres committed
28
    "from imblearn.combine import SMOTETomek\n",
29 30 31 32 33
    "from imblearn.under_sampling import TomekLinks"
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
34
   "execution_count": 2,
35 36 37 38
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load clean datasets\n",
Joaquin Torres's avatar
Joaquin Torres committed
39 40
    "df_pre, df_post = map(\n",
    "    pd.read_csv,\n",
    "    ('./input/pre_dataset.csv', './input/post_dataset.csv'),\n",
    ")"
41 42 43 44
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
45
   "execution_count": 3,
46 47 48 49 50 51 52 53 54 55
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(22861, 39)\n",
      "(10677, 39)\n",
      "(22861,)\n",
      "(10677,)\n",
Joaquin Torres's avatar
Joaquin Torres committed
56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
      "['Ed_Not Complete primary school' 'Ed_Primary education'\n",
      " 'Ed_Secondary Education' 'Ed_Secondary more technical education'\n",
      " 'Ed_Tertiary' 'Social_protection_REDEF' 'JobIn_Non-stable' 'JobIn_Stable'\n",
      " 'JobIn_Unemployed' 'Hous_Institutional' 'Hous_Stable' 'Hous_Unstable'\n",
      " 'Alterations_early_childhood_develop_REDEF'\n",
      " 'SocInc_Live with families or friends' 'SocInc_live alone'\n",
      " 'SocInc_live in institutions' 'Risk_stigma_REDEF' 'Structural_conflic'\n",
      " 'Age' 'Sex_REDEF' 'NumHijos' 'Smoking_REDEF'\n",
      " 'Biological_vulnerability_REDEF' 'Opiaceos_DxCIE_REDEF'\n",
      " 'Cannabis_DXCIE_REDEF' 'BZD_DxCIE_REDEF' 'Cocaina_DxCIE_REDEF'\n",
      " 'Alucinogenos_DXCIE_REDEF' 'Tabaco_DXCIE_REDEF' 'Frec30_1 día/semana'\n",
      " 'Frec30_2-3 días\\u200e/semana' 'Frec30_4-6 días/semana'\n",
      " 'Frec30_Menos de 1 día\\u200e/semana' 'Frec30_No consumio'\n",
      " 'Frec30_Todos los días' 'Años_consumo_droga' 'OtrosDx_Psiquiatrico_REDEF'\n",
      " 'Tx_previos_REDEF' 'Adherencia_tto_recalc']\n"
71 72 73 74 75 76 77 78 79 80 81 82 83
     ]
    }
   ],
   "source": [
    "# Build the feature matrix X (NumPy array without the target column) and the target y (pandas Series)\n",
    "X_pre, y_pre = df_pre.loc[:, df_pre.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), df_pre.Situacion_tratamiento_REDEF\n",
    "X_post, y_post = df_post.loc[:, df_post.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), df_post.Situacion_tratamiento_REDEF\n",
    "# Feature labels come from the same column mask that builds X, so they stay aligned\n",
    "# with X's columns wherever the target sits in the CSV (not only when it is last)\n",
    "feat = df_pre.columns[df_pre.columns != \"Situacion_tratamiento_REDEF\"].to_numpy()\n",
    "\n",
    "print(X_pre.shape)\n",
    "print(X_post.shape)\n",
    "print(y_pre.shape)\n",
    "print(y_post.shape)\n",
Joaquin Torres's avatar
Joaquin Torres committed
84
    "print(feat)"
85 86 87 88 89 90 91 92 93 94 95
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Training-Test Split & Sampling"
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
96
   "execution_count": 4,
97 98 99 100 101 102 103 104 105 106
   "metadata": {},
   "outputs": [],
   "source": [
    "# ORIGINAL (not resampled) data: 90-10 train/test split, same options for both datasets\n",
    "split_opts = {'test_size': 0.1, 'random_state': 42}\n",
    "X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, **split_opts)\n",
    "X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(X_post, y_post, **split_opts)"
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
107
   "execution_count": 5,
108 109 110 111
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save test data\n",
Joaquin Torres's avatar
Joaquin Torres committed
112 113 114 115
    "test_arrays = {\n",
    "    './output/pre/X_test_pre.npy': X_test_pre,\n",
    "    './output/pre/y_test_pre.npy': y_test_pre,\n",
    "    './output/post/X_test_post.npy': X_test_post,\n",
    "    './output/post/y_test_post.npy': y_test_post,\n",
    "}\n",
    "for out_path, array in test_arrays.items():\n",
    "    np.save(out_path, array)"
116 117 118 119
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
120
   "execution_count": 6,
121 122 123 124
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save ORIGINAL training data\n",
Joaquin Torres's avatar
Joaquin Torres committed
125 126 127 128
    "train_arrays = {\n",
    "    './output/pre/X_train_pre.npy': X_train_pre,\n",
    "    './output/pre/y_train_pre.npy': y_train_pre,\n",
    "    './output/post/X_train_post.npy': X_train_post,\n",
    "    './output/post/y_train_post.npy': y_train_post,\n",
    "}\n",
    "for out_path, array in train_arrays.items():\n",
    "    np.save(out_path, array)"
129 130 131 132
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
133
   "execution_count": 7,
134 135 136
   "metadata": {},
   "outputs": [],
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
137 138 139 140
    "# OVERSAMPLED training data\n",
    "# SMOTE draws its synthetic samples at random, so the sampler is seeded (same seed as the\n",
    "# train/test split) to regenerate identical training sets on every Restart & Run All\n",
    "smote_tomek = SMOTETomek(random_state=42)\n",
    "X_train_over_pre, y_train_over_pre = smote_tomek.fit_resample(X_train_pre, y_train_pre)\n",
    "X_train_over_post, y_train_over_post = smote_tomek.fit_resample(X_train_post, y_train_post)"
141 142 143 144
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
145
   "execution_count": 9,
146 147 148 149
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save oversampled training data\n",
Joaquin Torres's avatar
Joaquin Torres committed
150 151 152 153
    "over_arrays = {\n",
    "    './output/pre/X_train_over_pre.npy': X_train_over_pre,\n",
    "    './output/pre/y_train_over_pre.npy': y_train_over_pre,\n",
    "    './output/post/X_train_over_post.npy': X_train_over_post,\n",
    "    './output/post/y_train_over_post.npy': y_train_over_post,\n",
    "}\n",
    "for out_path, array in over_arrays.items():\n",
    "    np.save(out_path, array)"
154 155 156 157
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
158
   "execution_count": 10,
159 160 161 162 163 164 165 166 167 168 169
   "metadata": {},
   "outputs": [],
   "source": [
    "# Undersample both training sets with Tomek links\n",
    "tomek = TomekLinks()\n",
    "under_pre = tomek.fit_resample(X_train_pre, y_train_pre)\n",
    "under_post = tomek.fit_resample(X_train_post, y_train_post)\n",
    "X_train_under_pre, y_train_under_pre = under_pre\n",
    "X_train_under_post, y_train_under_post = under_post"
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
170
   "execution_count": 14,
171 172 173 174
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save undersampled training data\n",
Joaquin Torres's avatar
Joaquin Torres committed
175 176 177 178
    "under_arrays = {\n",
    "    './output/pre/X_train_under_pre.npy': X_train_under_pre,\n",
    "    './output/pre/y_train_under_pre.npy': y_train_under_pre,\n",
    "    './output/post/X_train_under_post.npy': X_train_under_post,\n",
    "    './output/post/y_train_under_post.npy': y_train_under_post,\n",
    "}\n",
    "for out_path, array in under_arrays.items():\n",
    "    np.save(out_path, array)"
179
   ]
180 181 182
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
183
   "execution_count": 5,
184 185 186 187
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save features\n",
Joaquin Torres's avatar
Joaquin Torres committed
188
    "# The column labels form an object-dtype array, which np.save can only store via pickle.\n",
    "# Casting to a unicode array keeps the file pickle-free, so a plain np.load can read it.\n",
    "np.save('./output/attributes.npy', feat.astype(str))"
189
   ]
190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}