Commit 4946dc0b authored by Joaquin Torres's avatar Joaquin Torres

Completed comments

parent bb509e14
......@@ -25,7 +25,8 @@
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"from pypair.association import binary_binary, continuous_continuous, binary_continuous\n",
"from pypair.association import binary_binary, continuous_continuous, binary_continuous # Correlations\n",
"# Feature Importance\n",
"from sklearn.feature_selection import VarianceThreshold\n",
"from sklearn.feature_selection import SelectKBest\n",
"from sklearn.feature_selection import f_classif\n",
......@@ -50,7 +51,7 @@
"# Filter the dataset to work only with alcohol patients\n",
"bd = bd_all[bd_all['Alcohol_DxCIE'] == 'Sí']\n",
"\n",
"# Filter the dataset to work only with 'Situacion_tratamiento' == 'Abandono' or 'Alta'\n",
"# Filter the dataset to work only with 'Abandono' or 'Alta' patients\n",
"bd = bd[(bd['Situacion_tratamiento'] == 'Abandono') | (bd['Situacion_tratamiento'] == 'Alta terapéutica')]"
]
},
......@@ -60,15 +61,15 @@
"metadata": {},
"outputs": [],
"source": [
"# Pre-pandemic\n",
"# Pre-pandemic group\n",
"conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n",
"# Pre-pandemic abandono\n",
"pre_abandono = conj_pre[conj_pre['Situacion_tratamiento'] == 'Abandono']\n",
"# Pre-pandemic alta\n",
"pre_alta = conj_pre[conj_pre['Situacion_tratamiento'] == 'Alta terapéutica']\n",
"\n",
"# Post-pandemic\n",
"# Merging last two classes to balance sets\n",
"# Post-pandemic group\n",
"# Merging last two classes\n",
"conj_post = bd[(bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio prepandemia y fin en pandemia') | \n",
" (bd['Pandemia_inicio_fin_tratamiento'] == 'inicio y fin en pandemia')]\n",
"# Post-pandemic abandono\n",
......@@ -142,7 +143,9 @@
"source": [
"# 9.0 represents unknown according to Variables.docx\n",
"print(bd['Social_inclusion'].unique())\n",
"# Obtain the mode among known values only, so the unknown code '9.0' can never be\n",
"# selected as its own replacement (which would turn the replace below into a silent no-op)\n",
"known_modes = bd.loc[bd['Social_inclusion'] != '9.0', 'Social_inclusion'].mode()\n",
"# If there is no known value at all, keep '9.0' so the column is left untouched\n",
"mode_soc_inc = known_modes[0] if not known_modes.empty else '9.0'\n",
"# Replace unknown value by the mode\n",
"bd['Social_inclusion'] = bd['Social_inclusion'].replace('9.0', mode_soc_inc)\n",
"print(bd['Social_inclusion'].unique())\n",
"\n",
......@@ -192,6 +195,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Replace NaN values by the mode\n",
"# The result is assigned back instead of calling fillna(inplace=True) on bd['Age']:\n",
"# an in-place call on a column selection is chained assignment, which is deprecated in\n",
"# pandas 2.x and does not modify bd when Copy-on-Write is enabled\n",
"age_mode = bd['Age'].mode()[0]\n",
"bd['Age'] = bd['Age'].fillna(age_mode)\n",
"\n",
......@@ -218,6 +222,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Discrete attributes\n",
"disc_atts = ['Education', 'Social_protection', 'Job_insecurity', 'Housing',\n",
" 'Alterations_early_childhood_develop', 'Social_inclusion',\n",
" 'Risk_stigma', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability',\n",
......@@ -226,6 +231,7 @@
" 'OtrosDx_Psiquiatrico', 'Tx_previos', 'Readmisiones_estudios', 'Nreadmision'\n",
" ]\n",
"\n",
"# Numerical attributes\n",
"num_atts = ['Structural_conflic', 'Adherencia_tto_recalc', 'Age', 'Años_consumo_droga', 'Tiempo_tx']"
]
},
......@@ -252,7 +258,9 @@
"fig, axs = plt.subplots(len(disc_atts), 1, figsize=(15, 5*len(disc_atts)))\n",
"plt.subplots_adjust(hspace=0.75, wspace=1.25)\n",
"\n",
"# Generate countplot for each attribute\n",
"for i, disc_att in enumerate(disc_atts):\n",
" # For each possible value of the attribute, consider the PRE-POST and ALTA-ABANDONO combinations in the same subplot\n",
" ax = sns.countplot(x=disc_att, data=combined_pre_post, hue=combined_pre_post[['Situacion_tratamiento', 'Group']].apply(tuple, axis=1),\n",
" hue_order=[('Abandono', 'Pre'),('Alta terapéutica', 'Pre'), ('Abandono', 'Post'), ('Alta terapéutica', 'Post')],\n",
" ax=axs[i])\n",
......@@ -268,9 +276,7 @@
" xytext = (0, 9), \n",
" textcoords = 'offset points')\n",
"\n",
"# Adjust layout to prevent overlapping titles\n",
"plt.tight_layout()\n",
"\n",
"plt.savefig('./output/plots/distributions/countplots.svg', dpi=600, bbox_inches='tight')"
]
},
......@@ -287,12 +293,12 @@
"metadata": {},
"outputs": [],
"source": [
"# Function to plot countplot \n",
"# Function to plot normalized countplot \n",
"def plot_count_perc_norm(i: int, group:int, disc_att:str) -> None:\n",
" \"\"\"\n",
" group: 1 (all), 2 (pre), 3 (post) \n",
" \"\"\"\n",
"\n",
" \n",
" # Define data to work with based on group\n",
" if group == 1:\n",
" df = bd \n",
......@@ -340,7 +346,6 @@
"plt.subplots_adjust(hspace=0.75, wspace=1.5)\n",
"\n",
"for i, disc_att in enumerate(disc_atts):\n",
"\n",
" # # 1: ALL \n",
" # plot_count_perc_norm(i, 1, disc_att)\n",
" # axs[i, 0].set_title(\"\\nALL\")\n",
......@@ -362,11 +367,7 @@
" axs[i, 1].set_ylabel(\"% of total within its Sit_tto group\")\n",
" axs[i, 1].tick_params(axis='x', rotation=90)\n",
"\n",
" \n",
"# Adjust layout to prevent overlapping titles\n",
"plt.tight_layout()\n",
"\n",
"# Save the figure in SVG format with DPI=600 in the \"._plots\" folder\n",
"plt.savefig('./output/plots/distributions/norm_countplots.svg', dpi=600, bbox_inches='tight')"
]
},
......@@ -413,15 +414,12 @@
" plt.subplot(len(num_atts), 1, i+1)\n",
" sns.boxplot(\n",
" data=combined_pre_post,\n",
" x = num_att,\n",
" y = 'Group',\n",
" hue='Situacion_tratamiento',\n",
" x = num_att, # attribute value in the x axis\n",
" y = 'Group', # pre and post in y axis\n",
" hue='Situacion_tratamiento', # side by side abandono vs alta\n",
" )\n",
"\n",
"# Adjust layout to prevent overlapping titles\n",
"plt.tight_layout()\n",
"\n",
"# Save the figure in SVG format with DPI=600 in the \"./EDA_plots\" folder\n",
"plt.savefig('./output/plots/distributions/boxplots.svg', dpi=600, bbox_inches='tight')"
]
},
......@@ -458,10 +456,7 @@
" line_kws={'lw': 5}, alpha = 0.4, ax=axs[i, 2])\n",
" axs[i, 2].set_title(f\"\\nDistr. of {num_att} - POST\")\n",
"\n",
"# Adjust layout to prevent overlapping titles\n",
"plt.tight_layout()\n",
"\n",
"# Save the figure in SVG format with DPI=600 in the \"./EDA_plots\" folder\n",
"plt.savefig('./output/plots/distributions/histograms.svg', dpi=600, bbox_inches='tight')"
]
},
......@@ -492,7 +487,7 @@
" 'FrecuenciaConsumo30Dias', 'Años_consumo_droga','OtrosDx_Psiquiatrico', 'Tx_previos', 'Adherencia_tto_recalc'] \n",
"target_var = 'Situacion_tratamiento'\n",
"\n",
"# Columns that are already numeric and we don't need to redefine \n",
"# Columns that are already numeric and we do not need to redefine \n",
"no_redef_cols = ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']"
]
},
......@@ -607,7 +602,7 @@
"one_hot_cols_dic = {}\n",
"\n",
"for one_hot_var in one_hot_vars:\n",
" # Create one hot encoding version of attribute and concatenate new columns to main df\n",
" # Create one hot encoding version of attribute and concatenate new columns to main df using specified prefix\n",
" encoded_var = pd.get_dummies(bd[one_hot_var], prefix=one_hots_vars_prefix[one_hot_var])\n",
" bd = pd.concat([bd, encoded_var], axis=1)\n",
" one_hot_cols_dic[one_hot_var] = encoded_var.columns.tolist()"
......@@ -685,6 +680,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Keep target column and Pandemia_inicio_fin_tratamiento to update df and split again into PRE and POST\n",
"columns_to_keep = corr_cols + ['Situacion_tratamiento','Situacion_tratamiento_REDEF', 'Pandemia_inicio_fin_tratamiento']\n",
"bd = bd[columns_to_keep]"
]
......@@ -695,6 +691,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Cleaning attribute names\n",
"name_mapping = {\n",
" 'Ed_Not Complete primary school': 'Ed_Not_Complete_Primary',\n",
" 'Ed_Primary education': 'Ed_Primary',\n",
......@@ -764,8 +761,8 @@
"metadata": {},
"outputs": [],
"source": [
"bd = bd.rename(columns=name_mapping)[list(name_mapping.values())]\n",
"#print(bd.columns)"
"# Renaming columns\n",
"bd = bd.rename(columns=name_mapping)[list(name_mapping.values())]"
]
},
{
......@@ -805,7 +802,7 @@
"metadata": {},
"outputs": [],
"source": [
"binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + [name_mapping['Situacion_tratamiento_REDEF'], name_mapping['Risk_stigma_REDEF']]\n",
"binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + [name_mapping['Situacion_tratamiento_REDEF']] #, name_mapping['Risk_stigma_REDEF']]\n",
"cont_vars = [name_mapping[col] for col in ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']]"
]
},
......@@ -825,16 +822,16 @@
" for j, var_j in enumerate(cols):\n",
" # Fill lower triangle of matrix\n",
" if i > j:\n",
" # Binary with binary correlation: tetrachoric\n",
" # Binary with binary correlation -> tetrachoric\n",
" if var_i in binary_vars and var_j in binary_vars:\n",
" corr = binary_binary(df[var_i], df[var_j], measure='tetrachoric')\n",
" # Continuous with continuous correlation: \n",
" # Continuous with continuous correlation -> Spearman\n",
" elif var_i in cont_vars and var_j in cont_vars:\n",
" # Returning nan sometimes:\n",
" # corr_tuple = continuous_continuous(df[var_i], df[var_j], measure = 'spearman')\n",
" # corr = corr_tuple[0]\n",
" corr = df[var_i].corr(df[var_j], method='spearman')\n",
" # Binary vs Continuous correlation:\n",
" # Binary vs Continuous correlation -> Point Biserial\n",
" else:\n",
" if var_i in binary_vars:\n",
" bin_var = var_i\n",
......@@ -868,7 +865,7 @@
" else:\n",
" cols = corr_cols\n",
" \n",
" # Title plot and select datat based on group and sit_tto\n",
" # Title plot and select data based on group and sit_tto\n",
" if group == 1:\n",
" plot_title = \"Correl Matrix - ALL\"\n",
" if sit_tto == 1:\n",
......@@ -947,7 +944,7 @@
"plt.subplots_adjust(hspace=0.75, wspace=2)\n",
"corr_mats = [] # List of tuples (m1, m2) to store the 3 pairs of matrices to compare (pre vs post)\n",
"\n",
"# Go through possible values for 'Situacion_tratamiento' and 'Group'\n",
"# Go through possible values for 'Situacion_tratamiento': 1 (include it as another var), 2 (only abandono), 3 (only alta)\n",
"for sit_tto in range(1,4):\n",
" # ALL\n",
" plt.subplot(3, 3, 3*(sit_tto-1) + 1) # Calculate the subplot position dynamically\n",
......@@ -990,7 +987,6 @@
" # Go through matrices\n",
" for i, var_i in enumerate(cols):\n",
" for j, var_j in enumerate(cols):\n",
" # If difference greater than certain threshold, print variables \n",
" val_pre = m_pre[i][j]\n",
" val_post = m_post[i][j]\n",
" diff = abs(val_pre - val_post)\n",
......@@ -1079,6 +1075,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Drop target\n",
"bd = bd.drop(columns=['Situacion_tratamiento'])\n",
"# print(len(bd.columns))\n",
"\n",
......@@ -1101,10 +1098,18 @@
"metadata": {},
"outputs": [],
"source": [
"# Numpy matrices for features and target\n",
"# The feature labels are derived with the same column mask used to build X, so labels and\n",
"# matrix columns stay aligned even if the target is not the last column of the dataframe\n",
"X_pre, y_pre = conj_pre.loc[:, conj_pre.columns != \"Treatment_Outcome\"].to_numpy(), conj_pre.Treatment_Outcome\n",
"X_post, y_post = conj_post.loc[:, conj_post.columns != \"Treatment_Outcome\"].to_numpy(), conj_post.Treatment_Outcome\n",
"feat = conj_pre.columns[conj_pre.columns != \"Treatment_Outcome\"].to_numpy() # Get labels and remove target \n",
"\n",
"feat = conj_pre.columns[conj_pre.columns != \"Treatment_Outcome\"].to_numpy() # Get labels and remove target "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Export datasets\n",
"conj_pre.to_csv('./output/datasets/pre_dataset.csv', index=False)\n",
"conj_post.to_csv('./output/datasets/post_dataset.csv', index=False)"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment