Commit cc7ff8ef authored by Joaquin Torres's avatar Joaquin Torres

generated training data with sampling techniques

parent 9e5d470a
post_dataset.csv
pre_dataset.csv
\ No newline at end of file
data
\ No newline at end of file
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training Data Generation\n",
"By Joaquín Torres, May 2024"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set-up"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Libraries\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"from imblearn.over_sampling import SMOTE\n",
"from imblearn.under_sampling import TomekLinks"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Load clean datasets\n",
"df_pre = pd.read_csv('./data/input/pre_dataset.csv')\n",
"df_post = pd.read_csv('./data/input/post_dataset.csv')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(22861, 39)\n",
"(10677, 39)\n",
"(22861,)\n",
"(10677,)\n",
"39\n"
]
}
],
"source": [
"# Creating a numpy matrix (X) without the target variable and a list with the target variable (y) \n",
"X_pre, y_pre = df_pre.loc[:, df_pre.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), df_pre.Situacion_tratamiento_REDEF\n",
"X_post, y_post = df_post.loc[:, df_post.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), df_post.Situacion_tratamiento_REDEF\n",
"feat = np.delete(df_pre.columns.to_numpy(),-1) # Get labels and remove target \n",
"\n",
"print(X_pre.shape)\n",
"print(X_post.shape)\n",
"print(y_pre.shape)\n",
"print(y_post.shape)\n",
"print(len(feat))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Training-Test Split & Sampling"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# ORIGINAL\n",
"X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, test_size=0.1, random_state=42) #90-10 split\n",
"X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(X_post, y_post, test_size=0.1, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Save test data\n",
"np.save('./data/output/pre/X_test_pre.npy', X_test_pre)\n",
"np.save('./data/output/pre/y_test_pre.npy', y_test_pre)\n",
"np.save('./data/output/post/X_test_post.npy', X_test_post)\n",
"np.save('./data/output/post/y_test_post.npy', y_test_post)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Save ORIGINAL training data\n",
"np.save('./data/output/pre/X_train_pre.npy', X_train_pre)\n",
"np.save('./data/output/pre/y_train_pre.npy', y_train_pre)\n",
"np.save('./data/output/post/X_train_post.npy', X_train_post)\n",
"np.save('./data/output/post/y_train_post.npy', y_train_post)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# OVERSAMPLING: SMOTE\n",
"smote = SMOTE()\n",
"X_train_over_pre, y_train_over_pre = smote.fit_resample(X_train_pre, y_train_pre)\n",
"X_train_over_post, y_train_over_post = smote.fit_resample(X_train_post, y_train_post)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Save oversampled training data\n",
"np.save('./data/output/pre/X_train_over_pre.npy', X_train_over_pre)\n",
"np.save('./data/output/pre/y_train_over_pre.npy', y_train_over_pre)\n",
"np.save('./data/output/post/X_train_over_post.npy', X_train_over_post)\n",
"np.save('./data/output/post/y_train_over_post.npy', y_train_over_post)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# UNDERSAMPLING: TOMEK-LINKS \n",
"tomek = TomekLinks()\n",
"X_train_under_pre, y_train_under_pre = tomek.fit_resample(X_train_pre, y_train_pre)\n",
"X_train_under_post, y_train_under_post = tomek.fit_resample(X_train_post, y_train_post)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# Save undersampled training data\n",
"np.save('./data/output/pre/X_train_under_pre.npy', X_train_under_pre)\n",
"np.save('./data/output/pre/y_train_under_pre.npy', y_train_under_pre)\n",
"np.save('./data/output/post/X_train_under_post.npy', X_train_under_post)\n",
"np.save('./data/output/post/y_train_under_post.npy', y_train_under_post)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment