Completed comments

62af7a7b · Joaquin Torres · 4946dc0b · 62af7a7b
Commit 62af7a7b authored Jul 08, 2024 by Joaquin Torres
Hide whitespace changes
Inline Side-by-side

Showing with 44 additions and 8 deletions

gen_train_data/gen_train_data.ipynb gen_train_data/gen_train_data.ipynb +44 -8

No files found.
--- a/gen_train_data/gen_train_data.ipynb
+++ b/gen_train_data/gen_train_data.ipynb
@@ -17,20 +17,21 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
-    "from imblearn.combine import SMOTETomek\n",
+    "# Resampling methods: SMOTETomek (combined over- and under-sampling) and TomekLinks (under-sampling)\n",
+    "from imblearn.combine import SMOTETomek \n",
    "from imblearn.under_sampling import TomekLinks"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -41,9 +42,30 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(22861, 39)\n",
+      "(10677, 39)\n",
+      "(22861,)\n",
+      "(10677,)\n",
+      "['Ed_Not_Complete_Primary' 'Ed_Primary' 'Ed_Secondary'\n",
+      " 'Ed_Secondary_Technical' 'Ed_Tertiary' 'Social_Protection'\n",
+      " 'JobIn_Unstable' 'JobIn_Stable' 'JobIn_Unemployed' 'Hous_Institutional'\n",
+      " 'Hous_Stable' 'Hous_Unstable' 'Early_Alterations' 'SocInc_Family_Friends'\n",
+      " 'SocInc_Alone' 'SocInc_Instit' 'Risk_Stigma' 'Structural_Conflict' 'Age'\n",
+      " 'Sex' 'Num_Children' 'Smoking' 'Bio_Vulner' 'Opiods_DXCIE'\n",
+      " 'Cannabis_DXCIE' 'BZD_DXCIE' 'Cocaine_DXCIE' 'Hallucin_DXCIE'\n",
+      " 'Tobacco_DXCIE' 'Freq_1dpw' 'Freq_2-3dpw' 'Freq_4-6dpw' 'Freq_l1dpw'\n",
+      " 'Freq_None' 'Freq_Everyday' 'Years_Drug_Use' 'Other_Psychiatric_DX'\n",
+      " 'Previous_Treatments' 'Treatment_Adherence']\n"
+     ]
+    }
+   ],
   "source": [
    "# Build a NumPy feature matrix (X) that excludes the target column, and take the target column (y) as a pandas Series\n",
    "X_pre, y_pre = df_pre.loc[:, df_pre.columns != \"Treatment_Outcome\"].to_numpy(), df_pre.Treatment_Outcome\n",
@@ -64,14 +86,21 @@
    "### Training-Test Split & Sampling"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Pipelines 1 and 2: ORIG and ORIG_CW"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
-    "# 1. ORIGINAL\n",
+    "# 90-10 train/test split (test_size=0.1, fixed seed), applied separately to the pre and post datasets\n",
-    "X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, test_size=0.1, random_state=42) #90-10 split\n",
+    "X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, test_size=0.1, random_state=42) \n",
    "X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(X_post, y_post, test_size=0.1, random_state=42)"
   ]
  },
@@ -101,13 +130,20 @@
    "np.save('./output/post/y_train_post.npy', y_train_post)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Pipeline 3: OVER"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
-    "# 2. OVERSAMPLED training data\n",
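+    "# OVERSAMPLED training data. NOTE: SMOTETomek() gets no random_state here, so the resampled sets can differ between runs unless a global NumPy seed is set elsewhere\n",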
    "smote_tomek = SMOTETomek()\n",
    "X_train_over_pre, y_train_over_pre = smote_tomek.fit_resample(X_train_pre, y_train_pre)\n",
    "X_train_over_post, y_train_over_post = smote_tomek.fit_resample(X_train_post, y_train_post)"