diff --git a/gen_train_data/gen_train_data.ipynb b/gen_train_data/gen_train_data.ipynb index eac682dc1aad218732cb9f2c3fb3869235df0dd3..309411a3e6112d20571f3c15edf57ca55cb27a6c 100644 --- a/gen_train_data/gen_train_data.ipynb +++ b/gen_train_data/gen_train_data.ipynb @@ -17,20 +17,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", - "from imblearn.combine import SMOTETomek\n", + "# Over/under sampling methods\n", + "from imblearn.combine import SMOTETomek \n", "from imblearn.under_sampling import TomekLinks" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -41,9 +42,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(22861, 39)\n", + "(10677, 39)\n", + "(22861,)\n", + "(10677,)\n", + "['Ed_Not_Complete_Primary' 'Ed_Primary' 'Ed_Secondary'\n", + " 'Ed_Secondary_Technical' 'Ed_Tertiary' 'Social_Protection'\n", + " 'JobIn_Unstable' 'JobIn_Stable' 'JobIn_Unemployed' 'Hous_Institutional'\n", + " 'Hous_Stable' 'Hous_Unstable' 'Early_Alterations' 'SocInc_Family_Friends'\n", + " 'SocInc_Alone' 'SocInc_Instit' 'Risk_Stigma' 'Structural_Conflict' 'Age'\n", + " 'Sex' 'Num_Children' 'Smoking' 'Bio_Vulner' 'Opiods_DXCIE'\n", + " 'Cannabis_DXCIE' 'BZD_DXCIE' 'Cocaine_DXCIE' 'Hallucin_DXCIE'\n", + " 'Tobacco_DXCIE' 'Freq_1dpw' 'Freq_2-3dpw' 'Freq_4-6dpw' 'Freq_l1dpw'\n", + " 'Freq_None' 'Freq_Everyday' 'Years_Drug_Use' 'Other_Psychiatric_DX'\n", + " 'Previous_Treatments' 'Treatment_Adherence']\n" + ] + } + ], "source": [ "# Creating a numpy matrix (X) without the target variable and a list with the target variable (y) \n", "X_pre, y_pre = df_pre.loc[:, df_pre.columns != \"Treatment_Outcome\"].to_numpy(), df_pre.Treatment_Outcome\n", @@ -64,14 +86,21 @@ "### Training-Test Split & Sampling" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Pipelines 1 and 2: ORIG and ORIG_CW" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# 1. ORIGINAL\n", - "X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, test_size=0.1, random_state=42) #90-10 split\n", + "# 90-10 split\n", + "X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, test_size=0.1, random_state=42) \n", "X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(X_post, y_post, test_size=0.1, random_state=42)" ] }, @@ -101,13 +130,20 @@ "np.save('./output/post/y_train_post.npy', y_train_post)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Pipeline 3: OVER" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# 2. OVERSAMPLED training data\n", + "# OVERSAMPLED training data\n", "smote_tomek = SMOTETomek()\n", "X_train_over_pre, y_train_over_pre = smote_tomek.fit_resample(X_train_pre, y_train_pre)\n", "X_train_over_post, y_train_over_post = smote_tomek.fit_resample(X_train_post, y_train_post)"