Completed comments

4946dc0b · Joaquin Torres · bb509e14 · 4946dc0b
Commit 4946dc0b authored Jul 08, 2024 by Joaquin Torres
Hide whitespace changes
Inline Side-by-side

Showing with 41 additions and 36 deletions

EDA/EDA.ipynb EDA/EDA.ipynb +41 -36

No files found.
--- a/EDA/EDA.ipynb
+++ b/EDA/EDA.ipynb
@@ -25,7 +25,8 @@
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
-    "from pypair.association import binary_binary, continuous_continuous, binary_continuous\n",
+    "from pypair.association import binary_binary, continuous_continuous, binary_continuous # Correlations\n",
+    "# Feature Importance\n",
    "from sklearn.feature_selection import VarianceThreshold\n",
    "from sklearn.feature_selection import SelectKBest\n",
    "from sklearn.feature_selection import f_classif\n",
@@ -50,7 +51,7 @@
    "# Filter the dataset to work only with alcohol patients\n",
    "bd = bd_all[bd_all['Alcohol_DxCIE'] == 'Sí']\n",
    "\n",
-    "# Filter the dataset to work only with 'Situacion_tratamiento' == 'Abandono' or 'Alta'\n",
+    "# Filter the dataset to work only with 'Abandono' or 'Alta' patients\n",
    "bd = bd[(bd['Situacion_tratamiento'] == 'Abandono') | (bd['Situacion_tratamiento'] == 'Alta terapéutica')]"
   ]
  },
@@ -60,15 +61,15 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# Pre-pandemic\n",
+    "# Pre-pandemic group\n",
    "conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n",
    "# Pre-pandemic abandono\n",
    "pre_abandono = conj_pre[conj_pre['Situacion_tratamiento'] == 'Abandono']\n",
    "# Pre-pandemic alta\n",
    "pre_alta = conj_pre[conj_pre['Situacion_tratamiento'] == 'Alta terapéutica']\n",
    "\n",
-    "# Post-pandemic\n",
-    "# Merging last two classes to balance sets\n",
+    "# Post-pandemic group\n",
+    "# Merging the last two classes (both end treatment during the pandemic)\n",
    "conj_post = bd[(bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio prepandemia y fin en pandemia') | \n",
    "               (bd['Pandemia_inicio_fin_tratamiento'] == 'inicio y fin en pandemia')]\n",
    "# Post-pandemic abandono\n",
@@ -142,7 +143,9 @@
   "source": [
    "# 9.0 represents unknown according to Variables.docx \n",
    "print(bd['Social_inclusion'].unique())\n",
+    "# Obtain mode for this feature\n",
    "mode_soc_inc = bd['Social_inclusion'].mode()[0]\n",
+    "# Replace unknown value by the mode\n",
    "bd['Social_inclusion'] = bd['Social_inclusion'].replace('9.0', mode_soc_inc)\n",
    "print(bd['Social_inclusion'].unique())\n",
    "\n",
@@ -192,6 +195,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "# Replace NaN values by the mode\n",
    "age_mode = bd['Age'].mode()[0]\n",
    "bd['Age'].fillna(age_mode, inplace=True)\n",
    "\n",
@@ -218,6 +222,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "# Discrete attributes\n",
    "disc_atts = ['Education', 'Social_protection', 'Job_insecurity', 'Housing',\n",
    "        'Alterations_early_childhood_develop', 'Social_inclusion',\n",
    "        'Risk_stigma', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability',\n",
@@ -226,6 +231,7 @@
    "        'OtrosDx_Psiquiatrico', 'Tx_previos', 'Readmisiones_estudios', 'Nreadmision'\n",
    "        ]\n",
    "\n",
+    "# Numerical attributes\n",
    "num_atts = ['Structural_conflic', 'Adherencia_tto_recalc', 'Age', 'Años_consumo_droga', 'Tiempo_tx']"
   ]
  },
@@ -252,7 +258,9 @@
    "fig, axs = plt.subplots(len(disc_atts), 1, figsize=(15, 5*len(disc_atts)))\n",
    "plt.subplots_adjust(hspace=0.75, wspace=1.25)\n",
    "\n",
+    "# Generate countplot for each attribute\n",
    "for i, disc_att in enumerate(disc_atts):\n",
+    "    # For each possible value of the attribute, consider the PRE-POST and ALTA-ABANDONO combinations in the same subplot\n",
    "    ax = sns.countplot(x=disc_att, data=combined_pre_post, hue=combined_pre_post[['Situacion_tratamiento', 'Group']].apply(tuple, axis=1),\n",
    "                       hue_order=[('Abandono', 'Pre'),('Alta terapéutica', 'Pre'), ('Abandono', 'Post'), ('Alta terapéutica', 'Post')],\n",
    "                       ax=axs[i])\n",
@@ -268,9 +276,7 @@
    "                        xytext = (0, 9), \n",
    "                        textcoords = 'offset points')\n",
    "\n",
-    "# Adjust layout to prevent overlapping titles\n",
    "plt.tight_layout()\n",
-    "\n",
    "plt.savefig('./output/plots/distributions/countplots.svg', dpi=600, bbox_inches='tight')"
   ]
  },
@@ -287,12 +293,12 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# Function to plot countplot \n",
+    "# Function to plot normalized countplot \n",
    "def plot_count_perc_norm(i: int, group:int, disc_att:str) -> None:\n",
    "    \"\"\"\n",
    "        group: 1 (all), 2 (pre), 3 (post) \n",
    "    \"\"\"\n",
-    "\n",
+    "    \n",
    "    # Define data to work with based on group\n",
    "    if group == 1:\n",
    "        df = bd \n",
@@ -340,7 +346,6 @@
    "plt.subplots_adjust(hspace=0.75, wspace=1.5)\n",
    "\n",
    "for i, disc_att in enumerate(disc_atts):\n",
-    "\n",
    "    # # 1: ALL    \n",
    "    # plot_count_perc_norm(i, 1, disc_att)\n",
    "    # axs[i, 0].set_title(\"\\nALL\")\n",
@@ -362,11 +367,7 @@
    "    axs[i, 1].set_ylabel(\"% of total within its Sit_tto group\")\n",
    "    axs[i, 1].tick_params(axis='x', rotation=90)\n",
    "\n",
-    "    \n",
-    "# Adjust layout to prevent overlapping titles\n",
    "plt.tight_layout()\n",
-    "\n",
-    "# Save the figure in SVG format with DPI=600 in the \"._plots\" folder\n",
    "plt.savefig('./output/plots/distributions/norm_countplots.svg', dpi=600, bbox_inches='tight')"
   ]
  },
@@ -413,15 +414,12 @@
    "    plt.subplot(len(num_atts), 1, i+1)\n",
    "    sns.boxplot(\n",
    "        data=combined_pre_post,\n",
-    "        x = num_att,\n",
-    "        y = 'Group',\n",
-    "        hue='Situacion_tratamiento',\n",
+    "        x = num_att, # attribute value in the x axis\n",
+    "        y = 'Group', # pre and post in y axis\n",
+    "        hue='Situacion_tratamiento', # side by side abandono vs alta\n",
    "    )\n",
    "\n",
-    "# Adjust layout to prevent overlapping titles\n",
    "plt.tight_layout()\n",
-    "\n",
-    "# Save the figure in SVG format with DPI=600 in the \"./EDA_plots\" folder\n",
    "plt.savefig('./output/plots/distributions/boxplots.svg', dpi=600, bbox_inches='tight')"
   ]
  },
@@ -458,10 +456,7 @@
    "                 line_kws={'lw': 5}, alpha = 0.4, ax=axs[i, 2])\n",
    "    axs[i, 2].set_title(f\"\\nDistr. of {num_att}  - POST\")\n",
    "\n",
-    "# Adjust layout to prevent overlapping titles\n",
    "plt.tight_layout()\n",
-    "\n",
-    "# Save the figure in SVG format with DPI=600 in the \"./EDA_plots\" folder\n",
    "plt.savefig('./output/plots/distributions/histograms.svg', dpi=600, bbox_inches='tight')"
   ]
  },
@@ -492,7 +487,7 @@
    "            'FrecuenciaConsumo30Dias', 'Años_consumo_droga','OtrosDx_Psiquiatrico', 'Tx_previos', 'Adherencia_tto_recalc'] \n",
    "target_var = 'Situacion_tratamiento'\n",
    "\n",
-    "# Columns that are already numeric and we don't need to redefine \n",
+    "# Columns that are already numeric and we do not need to redefine \n",
    "no_redef_cols = ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']"
   ]
  },
@@ -607,7 +602,7 @@
    "one_hot_cols_dic = {}\n",
    "\n",
    "for one_hot_var in one_hot_vars:\n",
-    "    # Create one hot encoding version of attribute and concatenate new columns to main df\n",
+    "    # Create one hot encoding version of attribute and concatenate new columns to main df using specified prefix\n",
    "    encoded_var = pd.get_dummies(bd[one_hot_var], prefix=one_hots_vars_prefix[one_hot_var])\n",
    "    bd = pd.concat([bd, encoded_var], axis=1)\n",
    "    one_hot_cols_dic[one_hot_var] = encoded_var.columns.tolist()"
@@ -685,6 +680,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "# Keep the correlation columns, both target columns and Pandemia_inicio_fin_tratamiento (used to split the df into PRE and POST again)\n",
    "columns_to_keep = corr_cols + ['Situacion_tratamiento','Situacion_tratamiento_REDEF', 'Pandemia_inicio_fin_tratamiento']\n",
    "bd = bd[columns_to_keep]"
   ]
@@ -695,6 +691,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "# Cleaning attribute names\n",
    "name_mapping = {\n",
    "    'Ed_Not Complete primary school': 'Ed_Not_Complete_Primary',\n",
    "    'Ed_Primary education': 'Ed_Primary',\n",
@@ -764,8 +761,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "bd = bd.rename(columns=name_mapping)[list(name_mapping.values())]\n",
-    "#print(bd.columns)"
+    "# Renaming columns\n",
+    "bd = bd.rename(columns=name_mapping)[list(name_mapping.values())]"
   ]
  },
  {
@@ -805,7 +802,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + [name_mapping['Situacion_tratamiento_REDEF'], name_mapping['Risk_stigma_REDEF']]\n",
+    "binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + [name_mapping['Situacion_tratamiento_REDEF']] #, name_mapping['Risk_stigma_REDEF']]\n",
    "cont_vars = [name_mapping[col] for col in ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']]"
   ]
  },
@@ -825,16 +822,16 @@
    "        for j, var_j in enumerate(cols):\n",
    "            # Fill lower triangle of matrix\n",
    "            if i > j:\n",
-    "                # Binary with binary correlation: tetrachoric\n",
+    "                # Binary with binary correlation -> tetrachoric\n",
    "                if var_i in binary_vars and var_j in binary_vars:\n",
    "                    corr = binary_binary(df[var_i], df[var_j], measure='tetrachoric')\n",
-    "                # Continuous with continuous correlation: \n",
+    "                # Continuous with continuous correlation -> Spearman\n",
    "                elif var_i in cont_vars and var_j in cont_vars:\n",
    "                    # pypair's continuous_continuous sometimes returns NaN here, so pandas' Spearman is used instead:\n",
    "                    # corr_tuple = continuous_continuous(df[var_i], df[var_j], measure = 'spearman')\n",
    "                    # corr = corr_tuple[0]\n",
    "                    corr = df[var_i].corr(df[var_j], method='spearman')\n",
-    "                # Binary vs Continuous correlation:\n",
+    "                # Binary vs Continuous correlation -> Point Biserial\n",
    "                else:\n",
    "                    if var_i in binary_vars:\n",
    "                        bin_var = var_i\n",
@@ -868,7 +865,7 @@
    "    else:\n",
    "        cols = corr_cols\n",
    "        \n",
-    "    # Title plot and select datat based on group and sit_tto\n",
+    "    # Title plot and select data based on group and sit_tto\n",
    "    if group == 1:\n",
    "        plot_title = \"Correl Matrix - ALL\"\n",
    "        if sit_tto == 1:\n",
@@ -947,7 +944,7 @@
    "plt.subplots_adjust(hspace=0.75, wspace=2)\n",
    "corr_mats = [] # List of tuples (m1, m2) to store the 3 pairs of matrices to compare (pre vs post)\n",
    "\n",
-    "# Go through possible values for 'Situacion_tratamiento' and 'Group'\n",
+    "# Go through possible values for 'Situacion_tratamiento': 1 (include it as another var), 2 (only abandono), 3 (only alta)\n",
    "for sit_tto in range(1,4):\n",
    "    # ALL\n",
    "    plt.subplot(3, 3, 3*(sit_tto-1) + 1)  # Calculate the subplot position dynamically\n",
@@ -990,7 +987,6 @@
    "    # Go through matrices\n",
    "    for i, var_i in enumerate(cols):\n",
    "        for j, var_j in enumerate(cols):\n",
-    "            # If difference greater than certain threshold, print variables \n",
    "            val_pre = m_pre[i][j]\n",
    "            val_post = m_post[i][j]\n",
    "            diff = abs(val_pre - val_post)\n",
@@ -1079,6 +1075,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "# Drop target\n",
    "bd = bd.drop(columns=['Situacion_tratamiento'])\n",
    "# print(len(bd.columns))\n",
    "\n",
@@ -1101,10 +1098,18 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "# Numpy matrices for features and target\n",
    "X_pre, y_pre = conj_pre.loc[:, conj_pre.columns != \"Treatment_Outcome\"].to_numpy(), conj_pre.Treatment_Outcome\n",
    "X_post, y_post = conj_post.loc[:, conj_post.columns != \"Treatment_Outcome\"].to_numpy(), conj_post.Treatment_Outcome\n",
-    "feat = np.delete(conj_pre.columns.to_numpy(),-1) # Get labels and remove target \n",
-    "\n",
+    "feat = np.delete(conj_pre.columns.to_numpy(),-1) # Feature labels: drops the last column, assumed to be the target (TODO confirm column order)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
    "# Export datasets\n",
    "conj_pre.to_csv('./output/datasets/pre_dataset.csv', index=False)\n",
    "conj_post.to_csv('./output/datasets/post_dataset.csv', index=False)"