Commit 62af7a7b authored by Joaquin Torres

Completed comments

parent 4946dc0b
...@@ -17,20 +17,21 @@ ...@@ -17,20 +17,21 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 1,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import pandas as pd\n", "import pandas as pd\n",
"import numpy as np\n", "import numpy as np\n",
"from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import train_test_split\n",
"from imblearn.combine import SMOTETomek\n", "# Over/under sampling methods\n",
"from imblearn.combine import SMOTETomek \n",
"from imblearn.under_sampling import TomekLinks" "from imblearn.under_sampling import TomekLinks"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
...@@ -41,9 +42,30 @@ ...@@ -41,9 +42,30 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 3,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(22861, 39)\n",
"(10677, 39)\n",
"(22861,)\n",
"(10677,)\n",
"['Ed_Not_Complete_Primary' 'Ed_Primary' 'Ed_Secondary'\n",
" 'Ed_Secondary_Technical' 'Ed_Tertiary' 'Social_Protection'\n",
" 'JobIn_Unstable' 'JobIn_Stable' 'JobIn_Unemployed' 'Hous_Institutional'\n",
" 'Hous_Stable' 'Hous_Unstable' 'Early_Alterations' 'SocInc_Family_Friends'\n",
" 'SocInc_Alone' 'SocInc_Instit' 'Risk_Stigma' 'Structural_Conflict' 'Age'\n",
" 'Sex' 'Num_Children' 'Smoking' 'Bio_Vulner' 'Opiods_DXCIE'\n",
" 'Cannabis_DXCIE' 'BZD_DXCIE' 'Cocaine_DXCIE' 'Hallucin_DXCIE'\n",
" 'Tobacco_DXCIE' 'Freq_1dpw' 'Freq_2-3dpw' 'Freq_4-6dpw' 'Freq_l1dpw'\n",
" 'Freq_None' 'Freq_Everyday' 'Years_Drug_Use' 'Other_Psychiatric_DX'\n",
" 'Previous_Treatments' 'Treatment_Adherence']\n"
]
}
],
"source": [ "source": [
"# Creating a numpy matrix (X) without the target variable and a list with the target variable (y) \n", "# Creating a numpy matrix (X) without the target variable and a list with the target variable (y) \n",
"X_pre, y_pre = df_pre.loc[:, df_pre.columns != \"Treatment_Outcome\"].to_numpy(), df_pre.Treatment_Outcome\n", "X_pre, y_pre = df_pre.loc[:, df_pre.columns != \"Treatment_Outcome\"].to_numpy(), df_pre.Treatment_Outcome\n",
...@@ -64,14 +86,21 @@ ...@@ -64,14 +86,21 @@
"### Training-Test Split & Sampling" "### Training-Test Split & Sampling"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Pipelines 1 and 2: ORIG and ORIG_CW"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# 1. ORIGINAL\n", "# 90-10 split\n",
"X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, test_size=0.1, random_state=42) #90-10 split\n", "X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, test_size=0.1, random_state=42) \n",
"X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(X_post, y_post, test_size=0.1, random_state=42)" "X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(X_post, y_post, test_size=0.1, random_state=42)"
] ]
}, },
...@@ -101,13 +130,20 @@ ...@@ -101,13 +130,20 @@
"np.save('./output/post/y_train_post.npy', y_train_post)" "np.save('./output/post/y_train_post.npy', y_train_post)"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Pipeline 3: OVER"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# 2. OVERSAMPLED training data\n", "# OVERSAMPLED training data\n",
"smote_tomek = SMOTETomek()\n", "smote_tomek = SMOTETomek()\n",
"X_train_over_pre, y_train_over_pre = smote_tomek.fit_resample(X_train_pre, y_train_pre)\n", "X_train_over_pre, y_train_over_pre = smote_tomek.fit_resample(X_train_pre, y_train_pre)\n",
"X_train_over_post, y_train_over_post = smote_tomek.fit_resample(X_train_post, y_train_post)" "X_train_over_post, y_train_over_post = smote_tomek.fit_resample(X_train_post, y_train_post)"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment