adapting code to new dataset (waiting to know what to do with unknown values)

a0bfde29 · Joaquin Torres · a74eb5ce · a0bfde29
Commit a0bfde29 authored Apr 18, 2024 by Joaquin Torres
Show whitespace changes
Inline Side-by-side

Showing with 38 additions and 58 deletions

EDA.ipynb EDA.ipynb +38 -58

No files found.
--- a/EDA.ipynb
+++ b/EDA.ipynb
@@ -302,9 +302,30 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 64,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total missing values Age: 10\n",
+      "Total missing values Años_consumo_droga: 718\n",
+      "Total missing values Risk_stigma: 1847\n",
+      "Total missing values NumHijos: 1788\n",
+      "\tCONJUNTO PREPANDEMIA\n",
+      "\t\tMissing values Age: 9\n",
+      "\t\tMissing values Años_consumo_droga: 519\n",
+      "\t\tMissing values Risk_stigma: 1255\n",
+      "\t\tMissing values NumHijos: 1214\n",
+      "\tCONJUNTO POSTPANDEMIA\n",
+      "\t\tMissing values Age: 1\n",
+      "\t\tMissing values Años_consumo_droga: 199\n",
+      "\t\tMissing values Risk_stigma: 592\n",
+      "\t\tMissing values NumHijos: 574\n"
+     ]
+    }
+   ],
   "source": [
    "print(f\"Total missing values Age: {bd['Age'].isnull().sum()}\")\n",
    "print(f\"Total missing values Años_consumo_droga: {bd['Años_consumo_droga'].isnull().sum()}\")\n",
@@ -608,7 +629,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 56,
+   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -664,7 +685,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 57,
+   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -680,7 +701,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 58,
+   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -702,7 +723,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "##### One-hot encode all categorical variables"
+    "##### One-hot encode categorical variables"
   ]
  },
  {
@@ -723,34 +744,22 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 68,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
-   "source": [
-    "# Original approach\n",
-    "one_hot_vars = ['Education', 'Social_protection', 'Job_insecurity', 'Housing', 'Alterations_early_childhood_develop']\n",
-    "\n",
-    "social_vars = ['Education', 'Social_protection', 'Job_insecurity', 'Housing', 'Alterations_early_childhood_develop', \n",
-    "            'Social_inclusion', 'Risk_stigma', 'Structural_conflic']\n",
-    "ind_vars = ['Age', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability', 'Opiaceos_DxCIE', \n",
-    "            'Cannabis_DXCIE', 'BZD_DxCIE', 'Cocaina_DxCIE', 'Alucinogenos_DXCIE', 'Tabaco_DXCIE', \n",
-    "            'FrecuenciaConsumo30Dias', 'Años_consumo_droga','OtrosDx_Psiquiatrico', 'Tx_previos', 'Adherencia_tto_recalc'] \n",
-    "target_var = 'Situacion_tratamiento'"
-   ]
-  },
    {
-   "cell_type": "code",
+     "name": "stdout",
-   "execution_count": null,
+     "output_type": "stream",
-   "metadata": {},
+     "text": [
-   "outputs": [],
+      "['Frec30_1 día/semana', 'Frec30_2-3 días\\u200e/semana', 'Frec30_4-6 días/semana', 'Frec30_Desconocido', 'Frec30_Menos de 1 día\\u200e/semana', 'Frec30_No consumio', 'Frec30_Todos los días']\n"
+     ]
+    }
+   ],
   "source": [
    "# Specify columns to one hot encode; empty list otherwise\n",
-    "one_hot_vars = ['Droga_Ppal_REC', 'Sexo_x_Hijos', 'Education',\n",
+    "one_hot_vars = ['Education', 'Job_insecurity', 'Housing', 'Social_inclusion', 'FrecuenciaConsumo30Dias']\n",
-    "                'Job_insecurity', 'Housing', 'Social_inclusion', 'FrecuenciaConsumo30Dias'] \n",
    "\n",
    "one_hots_vars_prefix = {\n",
-    "    'Droga_Ppal_REC': 'DrogP',\n",
-    "    'Sexo_x_Hijos': 'SexHij',\n",
    "    'Education': 'Ed',\n",
    "    'Job_insecurity': 'JobIn',\n",
    "    'Housing': 'Hous', \n",
@@ -939,35 +948,6 @@
    "    plt.title(\"\\n\\n\" + plot_title, fontdict={'fontsize': 30, 'fontweight': 'bold'})"
   ]
  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "###### Original approach (all categorical mapped to integers)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fig, axs = plt.subplots(3, 3, figsize=(50, 50))\n",
-    "plt.subplots_adjust(hspace=0.75, wspace=2)\n",
-    "\n",
-    "# Go through possible values for 'Situacion_tratamiento' and 'Group'\n",
-    "for sit_tto in range(1,4):\n",
-    "    for group in range(1,4):\n",
-    "        plt.subplot(3, 3, 3*(sit_tto-1) + group)  # Calculate the subplot position dynamically\n",
-    "        plot_heatmap(sit_tto, group)\n",
-    "        \n",
-    "# Adjust layout to prevent overlapping titles\n",
-    "plt.tight_layout()\n",
-    "\n",
-    "# Save the figure in SVG format in the \"./EDA_plots\" folder\n",
-    "plt.savefig('./EDA_plots/heatmaps_original.svg', dpi=550, bbox_inches='tight')"
-   ]
-  },
  {
   "cell_type": "markdown",
   "metadata": {},