From a0bfde29b5d9d43efc1f82418985d49d5de7168b Mon Sep 17 00:00:00 2001 From: joaquintb Date: Thu, 18 Apr 2024 12:53:48 +0200 Subject: [PATCH] adapting code to new dataset (waiting to know what to do with unknown values) --- EDA.ipynb | 96 ++++++++++++++++++++++--------------------------------- 1 file changed, 38 insertions(+), 58 deletions(-) diff --git a/EDA.ipynb b/EDA.ipynb index 1b82c56..fe3222d 100644 --- a/EDA.ipynb +++ b/EDA.ipynb @@ -302,9 +302,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 64, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total missing values Age: 10\n", + "Total missing values Años_consumo_droga: 718\n", + "Total missing values Risk_stigma: 1847\n", + "Total missing values NumHijos: 1788\n", + "\tCONJUNTO PREPANDEMIA\n", + "\t\tMissing values Age: 9\n", + "\t\tMissing values Años_consumo_droga: 519\n", + "\t\tMissing values Risk_stigma: 1255\n", + "\t\tMissing values NumHijos: 1214\n", + "\tCONJUNTO POSTPANDEMIA\n", + "\t\tMissing values Age: 1\n", + "\t\tMissing values Años_consumo_droga: 199\n", + "\t\tMissing values Risk_stigma: 592\n", + "\t\tMissing values NumHijos: 574\n" + ] + } + ], "source": [ "print(f\"Total missing values Age: {bd['Age'].isnull().sum()}\")\n", "print(f\"Total missing values Años_consumo_droga: {bd['Años_consumo_droga'].isnull().sum()}\")\n", @@ -608,7 +629,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -664,7 +685,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -680,7 +701,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ @@ -702,7 +723,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "##### One-hot encode all categorical variables" + "##### One-hot encode categorical
variables" ] }, { @@ -723,34 +744,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 68, "metadata": {}, - "outputs": [], - "source": [ - "# Original approach\n", - "one_hot_vars = ['Education', 'Social_protection', 'Job_insecurity', 'Housing', 'Alterations_early_childhood_develop']\n", - "\n", - "social_vars = ['Education', 'Social_protection', 'Job_insecurity', 'Housing', 'Alterations_early_childhood_develop', \n", - " 'Social_inclusion', 'Risk_stigma', 'Structural_conflic']\n", - "ind_vars = ['Age', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability', 'Opiaceos_DxCIE', \n", - " 'Cannabis_DXCIE', 'BZD_DxCIE', 'Cocaina_DxCIE', 'Alucinogenos_DXCIE', 'Tabaco_DXCIE', \n", - " 'FrecuenciaConsumo30Dias', 'Años_consumo_droga','OtrosDx_Psiquiatrico', 'Tx_previos', 'Adherencia_tto_recalc'] \n", - "target_var = 'Situacion_tratamiento'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Frec30_1 día/semana', 'Frec30_2-3 días\\u200e/semana', 'Frec30_4-6 días/semana', 'Frec30_Desconocido', 'Frec30_Menos de 1 día\\u200e/semana', 'Frec30_No consumio', 'Frec30_Todos los días']\n" + ] + } + ], "source": [ "# Specify columns to one hot encode; empty list otherwise\n", - "one_hot_vars = ['Droga_Ppal_REC', 'Sexo_x_Hijos', 'Education',\n", - " 'Job_insecurity', 'Housing', 'Social_inclusion', 'FrecuenciaConsumo30Dias'] \n", + "one_hot_vars = ['Education', 'Job_insecurity', 'Housing', 'Social_inclusion', 'FrecuenciaConsumo30Dias']\n", "\n", "one_hots_vars_prefix = {\n", - " 'Droga_Ppal_REC': 'DrogP',\n", - " 'Sexo_x_Hijos': 'SexHij',\n", " 'Education': 'Ed',\n", " 'Job_insecurity': 'JobIn',\n", " 'Housing': 'Hous', \n", @@ -939,35 +948,6 @@ " plt.title(\"\\n\\n\" + plot_title, fontdict={'fontsize': 30, 'fontweight': 'bold'})" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "###### Original approach (all 
categorical mapped to integers)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig, axs = plt.subplots(3, 3, figsize=(50, 50))\n", - "plt.subplots_adjust(hspace=0.75, wspace=2)\n", - "\n", - "# Go through possible values for 'Situacion_tratamiento' and 'Group'\n", - "for sit_tto in range(1,4):\n", - " for group in range(1,4):\n", - " plt.subplot(3, 3, 3*(sit_tto-1) + group) # Calculate the subplot position dynamically\n", - " plot_heatmap(sit_tto, group)\n", - " \n", - "# Adjust layout to prevent overlapping titles\n", - "plt.tight_layout()\n", - "\n", - "# Save the figure in SVG format in the \"./EDA_plots\" folder\n", - "plt.savefig('./EDA_plots/heatmaps_original.svg', dpi=550, bbox_inches='tight')" - ] - }, { "cell_type": "markdown", "metadata": {}, -- 2.24.1