**Training Data Generation** \
_Author: Joaquín Torres Bravo_

### Set-up

In [None]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Over/under sampling methods
from imblearn.combine import SMOTETomek 
from imblearn.under_sampling import TomekLinks

In [None]:
# Read the two cleaned datasets (pre and post) into DataFrames
pre_csv = '../EDA/output/datasets/pre_dataset.csv'
post_csv = '../EDA/output/datasets/post_dataset.csv'
df_pre, df_post = pd.read_csv(pre_csv), pd.read_csv(post_csv)

In [None]:
# Split each dataset into a numpy feature matrix (X), holding every column
# except the target, and a pandas Series with the target variable (y).
target = "Treatment_Outcome"
is_feature_pre = df_pre.columns != target
is_feature_post = df_post.columns != target

X_pre, y_pre = df_pre.loc[:, is_feature_pre].to_numpy(), df_pre[target]
X_post, y_post = df_post.loc[:, is_feature_post].to_numpy(), df_post[target]

# Feature labels are selected with the same name-based mask used for X_pre,
# so they stay aligned with its columns wherever the target column sits
# (dropping "the last column" is only correct when the target is last).
feat = df_pre.columns[is_feature_pre].to_numpy()

# Sanity check: matrix/vector shapes and the feature labels
print(X_pre.shape)
print(X_post.shape)
print(y_pre.shape)
print(y_post.shape)
print(feat)

### Training-Test Split & Sampling

#### Pipelines 1 and 2: ORIG and ORIG_CW

In [None]:
# Hold out 10% of each dataset as test data; the fixed seed keeps the split repeatable
split_kwargs = {"test_size": 0.1, "random_state": 42}
X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, **split_kwargs)
X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(X_post, y_post, **split_kwargs)

In [None]:
# Save test data
# np.save does not create missing folders, so make sure both output
# directories exist before the first write (no-op when already present).
for out_dir in ('./output/pre', './output/post'):
    os.makedirs(out_dir, exist_ok=True)

np.save('./output/pre/X_test_pre.npy', X_test_pre)
np.save('./output/pre/y_test_pre.npy', y_test_pre)
np.save('./output/post/X_test_post.npy', X_test_post)
np.save('./output/post/y_test_post.npy', y_test_post)

In [None]:
# Persist the ORIGINAL (not resampled) training split of both datasets
original_train_arrays = {
    './output/pre/X_train_pre.npy': X_train_pre,
    './output/pre/y_train_pre.npy': y_train_pre,
    './output/post/X_train_post.npy': X_train_post,
    './output/post/y_train_post.npy': y_train_post,
}
for npy_path, array in original_train_arrays.items():
    np.save(npy_path, array)

#### Pipeline 3: OVER

In [None]:
# OVERSAMPLED training data
# SMOTETomek draws random synthetic samples, so it is seeded with the same
# value used for the train/test split; re-running the notebook then yields
# the same resampled training sets instead of a different one each time.
smote_tomek = SMOTETomek(random_state=42)
# Only the training split is resampled; the test split is left untouched.
X_train_over_pre, y_train_over_pre = smote_tomek.fit_resample(X_train_pre, y_train_pre)
X_train_over_post, y_train_over_post = smote_tomek.fit_resample(X_train_post, y_train_post)

In [None]:
# Write the oversampled training arrays to disk, one .npy file per array
oversampled_outputs = [
    ('./output/pre/X_train_over_pre.npy', X_train_over_pre),
    ('./output/pre/y_train_over_pre.npy', y_train_over_pre),
    ('./output/post/X_train_over_post.npy', X_train_over_post),
    ('./output/post/y_train_over_post.npy', y_train_over_post),
]
for npy_path, array in oversampled_outputs:
    np.save(npy_path, array)

#### Pipeline 4: UNDER

In [None]:
# Pipeline 4 - UNDERSAMPLING with Tomek links, applied to the training split only
tomek = TomekLinks()
under_pre = tomek.fit_resample(X_train_pre, y_train_pre)
under_post = tomek.fit_resample(X_train_post, y_train_post)
X_train_under_pre, y_train_under_pre = under_pre
X_train_under_post, y_train_under_post = under_post

In [None]:
# Store the undersampled training arrays next to the other training sets
undersampled_paths = (
    './output/pre/X_train_under_pre.npy',
    './output/pre/y_train_under_pre.npy',
    './output/post/X_train_under_post.npy',
    './output/post/y_train_under_post.npy',
)
undersampled_arrays = (
    X_train_under_pre,
    y_train_under_pre,
    X_train_under_post,
    y_train_under_post,
)
for npy_path, array in zip(undersampled_paths, undersampled_arrays):
    np.save(npy_path, array)