## Training Data Generation
By Joaquín Torres, May 2024

### Set-up

In [1]:
# Libraries
import os

import numpy as np
import pandas as pd
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from sklearn.model_selection import train_test_split

In [2]:
# Read the two cleaned datasets ("pre" and "post") from ./input into DataFrames.
df_pre, df_post = (
    pd.read_csv(csv_path)
    for csv_path in ('./input/pre_dataset.csv', './input/post_dataset.csv')
)

In [3]:
# Split each dataset into a NumPy feature matrix (X) and a pandas Series holding the target variable (y).
target_col = "Situacion_tratamiento_REDEF"
is_feature_pre = df_pre.columns != target_col    # boolean mask over df_pre's columns
is_feature_post = df_post.columns != target_col  # boolean mask over df_post's columns
X_pre, y_pre = df_pre.loc[:, is_feature_pre].to_numpy(), df_pre[target_col]
X_post, y_post = df_post.loc[:, is_feature_post].to_numpy(), df_post[target_col]
# Feature labels are taken with the same mask that builds X_pre, so feat[i] always names
# column i of X_pre, wherever the target column sits in the CSV (dropping the last column
# by position was only correct when the target happened to be the last column).
# NOTE(review): feat is derived from df_pre only; this assumes df_post has the same
# feature columns in the same order -- confirm.
feat = df_pre.columns[is_feature_pre].to_numpy()

print(X_pre.shape)
print(X_post.shape)
print(y_pre.shape)
print(y_post.shape)
print(feat)

(22861, 39)
(10677, 39)
(22861,)
(10677,)
['Ed_Not Complete primary school' 'Ed_Primary education'
 'Ed_Secondary Education' 'Ed_Secondary more technical education'
 'Ed_Tertiary' 'Social_protection_REDEF' 'JobIn_Non-stable' 'JobIn_Stable'
 'JobIn_Unemployed' 'Hous_Institutional' 'Hous_Stable' 'Hous_Unstable'
 'Alterations_early_childhood_develop_REDEF'
 'SocInc_Live with families or friends' 'SocInc_live alone'
 'SocInc_live in institutions' 'Risk_stigma_REDEF' 'Structural_conflic'
 'Age' 'Sex_REDEF' 'NumHijos' 'Smoking_REDEF'
 'Biological_vulnerability_REDEF' 'Opiaceos_DxCIE_REDEF'
 'Cannabis_DXCIE_REDEF' 'BZD_DxCIE_REDEF' 'Cocaina_DxCIE_REDEF'
 'Alucinogenos_DXCIE_REDEF' 'Tabaco_DXCIE_REDEF' 'Frec30_1 día/semana'
 'Frec30_2-3 días\u200e/semana' 'Frec30_4-6 días/semana'
 'Frec30_Menos de 1 día\u200e/semana' 'Frec30_No consumio'
 'Frec30_Todos los días' 'Años_consumo_droga' 'OtrosDx_Psiquiatrico_REDEF'
 'Tx_previos_REDEF' 'Adherencia_tto_recalc']


### Training-Test Split & Sampling

In [4]:
# ORIGINAL (not resampled) data: hold out 10% of each dataset as the test set.
# The fixed random_state makes the 90-10 partition repeatable across runs.
# NOTE(review): no `stratify` argument is passed, so class proportions are not
# forced to match between the training and test parts.
X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(
    X_pre,
    y_pre,
    test_size=0.1,
    random_state=42,
)
X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(
    X_post,
    y_post,
    test_size=0.1,
    random_state=42,
)

In [5]:
# Save test data
# Create the output folders first: np.save does not create missing parent
# directories, so on a fresh checkout this cell would otherwise fail.
os.makedirs('./output/pre', exist_ok=True)
os.makedirs('./output/post', exist_ok=True)
np.save('./output/pre/X_test_pre.npy', X_test_pre)
np.save('./output/pre/y_test_pre.npy', y_test_pre)
np.save('./output/post/X_test_post.npy', X_test_post)
np.save('./output/post/y_test_post.npy', y_test_post)

In [6]:
# Persist the ORIGINAL (not resampled) training features and targets as .npy files.
for _npy_path, _array in (
    ('./output/pre/X_train_pre.npy', X_train_pre),
    ('./output/pre/y_train_pre.npy', y_train_pre),
    ('./output/post/X_train_post.npy', X_train_post),
    ('./output/post/y_train_post.npy', y_train_post),
):
    np.save(_npy_path, _array)

In [7]:
# OVERSAMPLED training data
# SMOTETomek generates synthetic samples at random, so the seed is fixed (same value
# as the train/test split) to make the saved oversampled sets reproducible on re-run.
smote_tomek = SMOTETomek(random_state=42)
# Only the training partitions are resampled; the test sets are left untouched.
X_train_over_pre, y_train_over_pre = smote_tomek.fit_resample(X_train_pre, y_train_pre)
X_train_over_post, y_train_over_post = smote_tomek.fit_resample(X_train_post, y_train_post)

In [9]:
# Persist the oversampled training features and targets as .npy files.
for _npy_path, _array in (
    ('./output/pre/X_train_over_pre.npy', X_train_over_pre),
    ('./output/pre/y_train_over_pre.npy', y_train_over_pre),
    ('./output/post/X_train_over_post.npy', X_train_over_post),
    ('./output/post/y_train_over_post.npy', y_train_over_post),
):
    np.save(_npy_path, _array)

In [10]:
# UNDERSAMPLING: TOMEK-LINKS
# One TomekLinks resampler is applied to the "pre" training partition and then to
# the "post" one; each call returns a resampled (X, y) pair.
tomek = TomekLinks()
(X_train_under_pre, y_train_under_pre), (X_train_under_post, y_train_under_post) = (
    tomek.fit_resample(X_part, y_part)
    for X_part, y_part in ((X_train_pre, y_train_pre), (X_train_post, y_train_post))
)

In [14]:
# Persist the undersampled training features and targets as .npy files.
for _npy_path, _array in (
    ('./output/pre/X_train_under_pre.npy', X_train_under_pre),
    ('./output/pre/y_train_under_pre.npy', y_train_under_pre),
    ('./output/post/X_train_under_post.npy', X_train_under_post),
    ('./output/post/y_train_under_post.npy', y_train_under_post),
):
    np.save(_npy_path, _array)

In [5]:
# Persist the feature (column) names next to the datasets.
# NOTE(review): feat is presumably an object-dtype array of strings, in which case
# it is stored pickled and must be read back with np.load(..., allow_pickle=True) -- confirm.
np.save(file='./output/attributes.npy', arr=feat)