Commit 0bbb8d6a authored by Joaquin Torres's avatar Joaquin Torres

EDA cleaned

parent d545dd10
gen_train_data/input/
gen_train_data/output/
EDA/input/
\ No newline at end of file
EDA/input/17_abril.sav
EDA/output/datasets
\ No newline at end of file
......@@ -4,14 +4,15 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### EDA"
"_Exploratory Data Analysis_ \\\n",
"_Author: Joaquín Torres Bravo_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Libraries"
"### Libraries"
]
},
{
......@@ -25,7 +26,6 @@
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"from pypair.association import binary_binary, continuous_continuous, binary_continuous\n",
"\n",
"from sklearn.feature_selection import VarianceThreshold\n",
"from sklearn.feature_selection import SelectKBest\n",
"from sklearn.feature_selection import f_classif\n",
......@@ -36,19 +36,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Preparing Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Reading and filtering"
"### First Steps"
]
},
{
"cell_type": "code",
"execution_count": 139,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
......@@ -61,37 +54,11 @@
"bd = bd[(bd['Situacion_tratamiento'] == 'Abandono') | (bd['Situacion_tratamiento'] == 'Alta terapéutica')]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Defining sets of patients"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\2495984927.py:18: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" conj_post['Group'] = 'Post'\n",
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\2495984927.py:19: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" conj_pre['Group'] = 'Pre'\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Pre-pandemic\n",
"conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n",
......@@ -117,22 +84,9 @@
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PRE: 22861\n",
"\tALTA: 2792\n",
"\tABANDONO: 20069\n",
"POST: 10677\n",
"\tALTA: 1882\n",
"\tABANDONO: 8795\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Printing size of different datasets\n",
"print(f\"PRE: {len(conj_pre)}\")\n",
......@@ -144,20 +98,6 @@
"print(f\"\\tABANDONO: {len(post_abandono)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### First Steps"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Inspecting the dataframes"
]
},
{
"cell_type": "code",
"execution_count": null,
......@@ -191,114 +131,37 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Replacing unknown values with the mode"
"### Missing and Unknown Values"
]
},
{
"cell_type": "code",
"execution_count": 141,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Live with families or friends' 'live alone' 'live in institutions' '9.0']\n",
"['Live with families or friends' 'live alone' 'live in institutions']\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 9.0 represents unknown according to Variables.docx \n",
"print(bd['Social_inclusion'].unique())\n",
"mode_soc_inc = bd['Social_inclusion'].mode()[0]\n",
"# print(mode_soc_inc)\n",
"bd['Social_inclusion'] = bd['Social_inclusion'].replace('9.0', mode_soc_inc)\n",
"print(bd['Social_inclusion'].unique())"
]
},
{
"cell_type": "code",
"execution_count": 142,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['No alterations (first exposure at 11 or more years)'\n",
" 'Alterations (first exposure before 11 years old)' '9']\n",
"['No alterations (first exposure at 11 or more years)'\n",
" 'Alterations (first exposure before 11 years old)']\n"
]
}
],
"source": [
"print(bd['Social_inclusion'].unique())\n",
"\n",
"print(bd['Alterations_early_childhood_develop'].unique())\n",
"mode_alt = bd['Alterations_early_childhood_develop'].mode()[0]\n",
"bd['Alterations_early_childhood_develop'] = bd['Alterations_early_childhood_develop'].replace('9', mode_alt)\n",
"print(bd['Alterations_early_childhood_develop'].unique())"
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[NaN, 'Yes', 'No']\n",
"Categories (3, object): [99.0, 'No', 'Yes']\n",
"[NaN, 'Yes', 'No']\n",
"Categories (2, object): ['No', 'Yes']\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\1073322024.py:3: FutureWarning: The behavior of Series.replace (and DataFrame.replace) with CategoricalDtype is deprecated. In a future version, replace will only be used for cases that preserve the categories. To change the categories, use ser.cat.rename_categories instead.\n",
" bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n"
]
}
],
"source": [
"print(bd['Alterations_early_childhood_develop'].unique())\n",
"\n",
"print(bd['Risk_stigma'].unique())\n",
"mode_stigma = bd['Risk_stigma'].mode()[0]\n",
"bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n",
"print(bd['Risk_stigma'].unique())"
]
},
{
"cell_type": "code",
"execution_count": 144,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[nan 0. 1. 2. 3. 4. 5. 8. 10. 6. 11. 12. 9. 7. 99. 14. 15.]\n",
"[nan 0. 1. 2. 3. 4. 5. 8. 10. 6. 11. 12. 9. 7. 14. 15.]\n"
]
}
],
"source": [
"print(bd['Risk_stigma'].unique())\n",
"\n",
"print(bd['NumHijos'].unique())\n",
"mode_hijos = bd['NumHijos'].mode()[0]\n",
"bd['NumHijos'] = bd['NumHijos'].replace(99.0, mode_hijos)\n",
"print(bd['NumHijos'].unique())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Quantifying Null Values"
]
},
{
"cell_type": "code",
"execution_count": null,
......@@ -323,53 +186,11 @@
"print(f\"\\t\\tMissing values NumHijos: {conj_post['NumHijos'].isnull().sum()}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Replacing missing values with mode"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\3303146707.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
"\n",
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
"\n",
"\n",
" bd['Age'].fillna(age_mode, inplace=True)\n",
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\3303146707.py:5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
"\n",
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
"\n",
"\n",
" bd['Años_consumo_droga'].fillna(años_consumo_mode, inplace=True)\n",
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\3303146707.py:8: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
"\n",
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
"\n",
"\n",
" bd['Risk_stigma'].fillna(risk_stigma_mode, inplace=True)\n",
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\3303146707.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
"\n",
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
"\n",
"\n",
" bd['NumHijos'].fillna(num_hijos_mode, inplace=True)\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"age_mode = bd['Age'].mode()[0]\n",
"bd['Age'].fillna(age_mode, inplace=True)\n",
......@@ -388,14 +209,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Distribution of variables"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Classifying variables into numerical and discrete/categorical "
"### Distribution of Variables"
]
},
{
......@@ -419,14 +233,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Distribution of discrete attributes"
"#### Discrete"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Count plots"
"##### Countplots"
]
},
{
......@@ -464,7 +278,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Normalized count plots"
"##### Normalized Countplots"
]
},
{
......@@ -560,14 +374,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Distribution of numeric attributes"
"#### Numerical"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Summary statistics"
"##### Summary Stats"
]
},
{
......@@ -583,7 +397,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Boxplots"
"##### Boxplots"
]
},
{
......@@ -615,7 +429,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Histograms"
"##### Histograms"
]
},
{
......@@ -655,19 +469,50 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Correlation Analysis"
"### Correlation Analysis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Turning binary variables into 0/1 values"
"#### Groups of Variables"
]
},
{
"cell_type": "code",
"execution_count": 146,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"social_vars = ['Education', 'Social_protection', 'Job_insecurity', 'Housing', 'Alterations_early_childhood_develop', \n",
" 'Social_inclusion', 'Risk_stigma', 'Structural_conflic']\n",
"ind_vars = ['Age', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability', 'Opiaceos_DxCIE', \n",
" 'Cannabis_DXCIE', 'BZD_DxCIE', 'Cocaina_DxCIE', 'Alucinogenos_DXCIE', 'Tabaco_DXCIE', \n",
" 'FrecuenciaConsumo30Dias', 'Años_consumo_droga','OtrosDx_Psiquiatrico', 'Tx_previos', 'Adherencia_tto_recalc'] \n",
"target_var = 'Situacion_tratamiento'\n",
"\n",
"# Columns that are already numeric and we don't need to redefine \n",
"no_redef_cols = ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### One-hot Encoding"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Binary"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
......@@ -739,43 +584,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Defining groups of variables"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {},
"outputs": [],
"source": [
"social_vars = ['Education', 'Social_protection', 'Job_insecurity', 'Housing', 'Alterations_early_childhood_develop', \n",
" 'Social_inclusion', 'Risk_stigma', 'Structural_conflic']\n",
"ind_vars = ['Age', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability', 'Opiaceos_DxCIE', \n",
" 'Cannabis_DXCIE', 'BZD_DxCIE', 'Cocaina_DxCIE', 'Alucinogenos_DXCIE', 'Tabaco_DXCIE', \n",
" 'FrecuenciaConsumo30Dias', 'Años_consumo_droga','OtrosDx_Psiquiatrico', 'Tx_previos', 'Adherencia_tto_recalc'] \n",
"target_var = 'Situacion_tratamiento'"
"##### Categorical"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {},
"outputs": [],
"source": [
"# Columns that are already numeric and we don't need to redefine \n",
"no_redef_cols = ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### One-hot encode categorical variables"
]
},
{
"cell_type": "code",
"execution_count": 147,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
......@@ -796,21 +610,19 @@
" # Create one hot encoding version of attribute and concatenate new columns to main df\n",
" encoded_var = pd.get_dummies(bd[one_hot_var], prefix=one_hots_vars_prefix[one_hot_var])\n",
" bd = pd.concat([bd, encoded_var], axis=1)\n",
" one_hot_cols_dic[one_hot_var] = encoded_var.columns.tolist()\n",
"\n",
"# print(one_hot_cols_dic['FrecuenciaConsumo30Dias'])"
" one_hot_cols_dic[one_hot_var] = encoded_var.columns.tolist()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Defining final version of columns of interest"
"#### Final Columns"
]
},
{
"cell_type": "code",
"execution_count": 148,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
......@@ -848,38 +660,40 @@
"corr_cols = soc_vars_enc + ind_vars_enc"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Columns for the 'unknown' categories, removed from every list of feature names\n",
"unknown_cols = ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']\n",
"\n",
"corr_cols = [name for name in corr_cols if name not in unknown_cols]\n",
"soc_vars_enc = [name for name in soc_vars_enc if name not in unknown_cols]\n",
"ind_vars_enc = [name for name in ind_vars_enc if name not in unknown_cols]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Excluding unknown columns and renaming"
"##### Renaming and Filtering"
]
},
{
"cell_type": "code",
"execution_count": 149,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Drop unknown columns\n",
"unknown_cols = ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']\n",
"corr_cols = [col for col in corr_cols if col not in unknown_cols]\n",
"soc_vars_enc = [col for col in soc_vars_enc if col not in unknown_cols]\n",
"# Filter ind_vars_enc over its own elements: the previous version looped over soc_vars_enc and\n",
"# tested the whole ind_vars_enc list, so ind_vars_enc ended up as a copy of soc_vars_enc\n",
"ind_vars_enc = [col for col in ind_vars_enc if col not in unknown_cols]"
"columns_to_keep = corr_cols + ['Situacion_tratamiento','Situacion_tratamiento_REDEF', 'Pandemia_inicio_fin_tratamiento']\n",
"bd = bd[columns_to_keep]"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Ed_Not_Complete_Primary', 'Ed_Primary', 'Ed_Secondary', 'Ed_Secondary_Technical', 'Ed_Tertiary', 'Social_Protection', 'JobIn_Unstable', 'JobIn_Stable', 'JobIn_Unemployed', 'Hous_Institutional', 'Hous_Stable', 'Hous_Unstable', 'Early_Alterations', 'SocInc_Family_Friends', 'SocInc_Alone', 'SocInc_Instit', 'Risk_Stigma', 'Structural_Conflict', 'age', 'Sex', 'Num_Children', 'Smoking', 'Bio_Vulner', 'Opiods_DXCIE', 'Cannabis_DXCIE', 'BZD_DXCIE', 'Cocaine_DXCIE', 'Hallucin_DXCIE', 'Tobacco_DXCIE', 'Freq_1dpw', 'Freq_2-3dpw', 'Freq_4-6dpw', 'Freq_l1dpw', 'Freq_None', 'Freq_Everyday', 'Years_Drug_Use', 'Other_Psychiatric_DX', 'Previous_Treatments', 'Treatment_Adherence']\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"name_mapping = {\n",
" 'Ed_Not Complete primary school': 'Ed_Not_Complete_Primary',\n",
......@@ -900,7 +714,7 @@
" 'SocInc_live in institutions': 'SocInc_Instit',\n",
" 'Risk_stigma_REDEF': 'Risk_Stigma',\n",
" 'Structural_conflic': 'Structural_Conflict',\n",
" # 'Age': 'Age',\n",
" 'Age': 'Age',\n",
" 'Sex_REDEF': 'Sex',\n",
" 'NumHijos': 'Num_Children',\n",
" 'Smoking_REDEF': 'Smoking',\n",
......@@ -920,26 +734,28 @@
" 'Años_consumo_droga': 'Years_Drug_Use',\n",
" 'OtrosDx_Psiquiatrico_REDEF': 'Other_Psychiatric_DX',\n",
" 'Tx_previos_REDEF': 'Previous_Treatments',\n",
" 'Adherencia_tto_recalc': 'Treatment_Adherence'\n",
" 'Adherencia_tto_recalc': 'Treatment_Adherence',\n",
" 'Situacion_tratamiento_REDEF': 'Treatment_Outcome',\n",
" 'Situacion_tratamiento': 'Situacion_tratamiento',\n",
" 'Pandemia_inicio_fin_tratamiento': 'Pandemia_inicio_fin_tratamiento'\n",
"}\n",
"\n",
"# Update lists of feature names\n",
"corr_cols = [name_mapping[corr_col] for corr_col in corr_cols]\n",
"print(corr_cols)\n",
"soc_vars_enc = [name_mapping[col] for col in soc_vars_enc]\n",
"ind_vars_enc = [name_mapping[col] for col in ind_vars_enc]\n",
"\n",
"bd = bd.rename(columns=name_mapping)"
"ind_vars_enc = [name_mapping[col] for col in ind_vars_enc]"
]
},
{
"cell_type": "code",
"execution_count": 133,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create bd with just corr_cols and target\n",
"bd = bd[corr_cols + ['Situacion_tratamiento','Situacion_tratamiento_REDEF', 'Pandemia_inicio_fin_tratamiento']]"
"# Export feature names\n",
"np.save('./output/feature_names/feature_names.npy', corr_cols)\n",
"np.save('./output/feature_names/soc_vars_names.npy', soc_vars_enc)\n",
"np.save('./output/feature_names/ind_vars_names.npy', ind_vars_enc)"
]
},
{
......@@ -948,25 +764,17 @@
"metadata": {},
"outputs": [],
"source": [
"# Export feature names\n",
"np.save('./output/feature_names.npy', corr_cols)\n",
"np.save('./output/soc_vars_names.npy', soc_vars_enc)\n",
"np.save('./output/ind_vars_names.npy', ind_vars_enc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Update main data frames"
"bd = bd.rename(columns=name_mapping)[list(name_mapping.values())]\n",
"#print(bd.columns)"
]
},
{
"cell_type": "code",
"execution_count": 134,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Update main dfs\n",
"# Pre-pandemic\n",
"conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n",
"# Pre-pandemic abandono\n",
......@@ -988,110 +796,17 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Building correlation matrix"
"#### Plotting Correlation Matrices"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Ed_Not_Complete_Primary', 'Ed_Primary', 'Ed_Secondary',\n",
" 'Ed_Secondary_Technical', 'Ed_Tertiary', 'Social_Protection',\n",
" 'JobIn_Unstable', 'JobIn_Stable', 'JobIn_Unemployed',\n",
" 'Hous_Institutional', 'Hous_Stable', 'Hous_Unstable',\n",
" 'Early_Alterations', 'SocInc_Family_Friends', 'SocInc_Alone',\n",
" 'SocInc_Instit', 'Risk_Stigma', 'Structural_Conflict', 'age', 'Sex',\n",
" 'Sex', 'Num_Children', 'Smoking', 'Smoking', 'Bio_Vulner',\n",
" 'Opiods_DXCIE', 'Cannabis_DXCIE', 'Cannabis_DXCIE', 'BZD_DXCIE',\n",
" 'Cocaine_DXCIE', 'Hallucin_DXCIE', 'Tobacco_DXCIE', 'Freq_1dpw',\n",
" 'Freq_2-3dpw', 'Freq_4-6dpw', 'Freq_l1dpw', 'Freq_None',\n",
" 'Freq_Everyday', 'Years_Drug_Use', 'Other_Psychiatric_DX',\n",
" 'Previous_Treatments', 'Treatment_Adherence', 'Situacion_tratamiento',\n",
" 'Situacion_tratamiento_REDEF', 'Pandemia_inicio_fin_tratamiento'],\n",
" dtype='object')\n"
]
}
],
"source": [
"print(bd.columns)"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ed_Not_Complete_Primary\n",
"2\n",
"Ed_Primary\n",
"2\n",
"Ed_Secondary\n",
"2\n",
"Ed_Secondary_Technical\n",
"2\n",
"Ed_Tertiary\n",
"2\n",
"Social_Protection\n",
"2\n",
"JobIn_Unstable\n",
"2\n",
"JobIn_Stable\n",
"2\n",
"JobIn_Unemployed\n",
"2\n",
"Hous_Institutional\n",
"2\n",
"Hous_Stable\n",
"2\n",
"Hous_Unstable\n",
"2\n",
"Early_Alterations\n",
"2\n",
"SocInc_Family_Friends\n",
"2\n",
"SocInc_Alone\n",
"2\n",
"SocInc_Instit\n",
"2\n",
"Risk_Stigma\n",
"2\n",
"Structural_Conflict\n",
"107\n",
"age\n",
"74\n",
"Sex\n"
]
},
{
"ename": "AttributeError",
"evalue": "'DataFrame' object has no attribute 'unique'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_19584\\340002156.py\u001b[0m in \u001b[0;36m?\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# print(len(bd['Cocaine_DXCIE'].unique()) == 2)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mcol\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mcorr_cols\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcol\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbd\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0munique\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 6\u001b[0m \u001b[1;31m#binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + ['Situacion_tratamiento_REDEF', name_mapping['Risk_stigma_REDEF']]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;31m#cont_vars = [name_mapping[col] for col in ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\Users\\Joaquín Torres\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, name)\u001b[0m\n\u001b[0;32m 6292\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mname\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_accessors\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6293\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6294\u001b[0m \u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6295\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 6296\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;31mAttributeError\u001b[0m: 'DataFrame' object has no attribute 'unique'"
]
}
],
"source": [
"# print(len(bd['Cocaine_DXCIE'].unique()) == 2)\n",
"\n",
"for col in corr_cols:\n",
" print(col)\n",
" print(len(bd[col].unique()))\n",
"#binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + ['Situacion_tratamiento_REDEF', name_mapping['Risk_stigma_REDEF']]\n",
"#cont_vars = [name_mapping[col] for col in ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']]"
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Features taking exactly two distinct values, followed by the renamed outcome and risk-stigma columns\n",
"two_valued_cols = [col for col in corr_cols if len(bd[col].unique()) == 2]\n",
"binary_vars = two_valued_cols + [name_mapping['Situacion_tratamiento_REDEF'], name_mapping['Risk_stigma_REDEF']]\n",
"\n",
"# Continuous features, looked up under their renamed labels\n",
"cont_source_cols = ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']\n",
"cont_vars = [name_mapping[col] for col in cont_source_cols]"
]
},
{
......@@ -1149,7 +864,7 @@
" # Define columns based on sit_tto arg\n",
" if sit_tto == 1:\n",
" # Include target as another variable\n",
" cols = [target_var + '_REDEF'] + corr_cols\n",
" cols = ['Treatment_Outcome'] + corr_cols\n",
" else:\n",
" cols = corr_cols\n",
" \n",
......@@ -1246,10 +961,8 @@
"\n",
" corr_mats.append((corr_matrix_pre, corr_matrix_post))\n",
" \n",
"# Adjust layout to prevent overlapping titles\n",
"plt.tight_layout()\n",
"\n",
"# Save the figure in SVG format in the \"./EDA_plots\" folder\n",
"plt.savefig('./output/plots/correlations/heatmaps_one_hot.svg', dpi=550, bbox_inches='tight')"
]
},
......@@ -1257,7 +970,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Finding significative differences between PRE and POST"
"#### Finding Differences PRE vs POST"
]
},
{
......@@ -1326,11 +1039,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"keep"
]
},
"metadata": {},
"outputs": [],
"source": [
"print(\"------SIT_TTO 1: NO FILTERING------\")\n",
......@@ -1340,11 +1049,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"keep"
]
},
"metadata": {},
"outputs": [],
"source": [
"print(\"------SIT_TTO 2: ABANDONO-----\")\n",
......@@ -1354,11 +1059,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"keep"
]
},
"metadata": {},
"outputs": [],
"source": [
"print(\"------SIT_TTO 3: ALTA-----\")\n",
......@@ -1369,46 +1070,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Feature Analysis and Selection"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Building final datasets to work with"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Work with columns of interest\n",
"cols_of_interest = corr_cols + ['Pandemia_inicio_fin_tratamiento'] + [target_var + \"_REDEF\"]\n",
"temp_bd = bd[cols_of_interest]\n",
"print(temp_bd.info()) # NaN values already dealt with (replaced by mode - this okay?)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Dropping unknown columns/categories for analysis purposes\n",
"unknown_cols = ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']\n",
"temp_bd = temp_bd.drop(columns=unknown_cols)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(temp_bd.info())"
"### Final Datasets"
]
},
{
......@@ -1417,53 +1079,20 @@
"metadata": {},
"outputs": [],
"source": [
"bd = bd.drop(columns=['Situacion_tratamiento'])\n",
"# print(len(bd.columns))\n",
"\n",
"# For conj_pre dataframe\n",
"conj_pre = temp_bd[temp_bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n",
"conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n",
"conj_pre = conj_pre.drop(columns=['Pandemia_inicio_fin_tratamiento'])\n",
"\n",
"# For conj_post dataframe\n",
"conj_post = temp_bd[(temp_bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio prepandemia y fin en pandemia') | \n",
" (temp_bd['Pandemia_inicio_fin_tratamiento'] == 'inicio y fin en pandemia')]\n",
"conj_post = conj_post.drop(columns=['Pandemia_inicio_fin_tratamiento'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(conj_pre.info())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(conj_post.info())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Creating a numpy matrix without the target variable (X) and a list with the target variable (y) \n",
"X_pre, y_pre = conj_pre.loc[:, conj_pre.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), conj_pre.Situacion_tratamiento_REDEF\n",
"X_post, y_post = conj_post.loc[:, conj_post.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), conj_post.Situacion_tratamiento_REDEF\n",
"feat = np.delete(conj_pre.columns.to_numpy(),-1) # Get labels and remove target "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(feat)"
"conj_post = bd[(bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio prepandemia y fin en pandemia') | \n",
" (bd['Pandemia_inicio_fin_tratamiento'] == 'inicio y fin en pandemia')]\n",
"conj_post = conj_post.drop(columns=['Pandemia_inicio_fin_tratamiento'])\n",
"\n",
"# print(conj_post.columns)\n",
"# print(conj_pre.columns)"
]
},
{
......@@ -1472,25 +1101,27 @@
"metadata": {},
"outputs": [],
"source": [
"print(X_pre.shape)\n",
"print(X_post.shape)\n",
"print(y_pre.shape)\n",
"print(y_post.shape)\n",
"print(len(feat))"
"target_col = 'Treatment_Outcome'\n",
"\n",
"# Feature matrix (X, numpy array without the target) and target vector (y) for each cohort\n",
"X_pre, y_pre = conj_pre.loc[:, conj_pre.columns != target_col].to_numpy(), conj_pre[target_col]\n",
"X_post, y_post = conj_post.loc[:, conj_post.columns != target_col].to_numpy(), conj_post[target_col]\n",
"\n",
"# Feature labels, selected with the same column mask as X_pre so they stay aligned with the\n",
"# columns of X wherever the target sits (np.delete(..., -1) assumed it was the last column)\n",
"feat = conj_pre.columns[conj_pre.columns != target_col].to_numpy()\n",
"\n",
"# Export datasets\n",
"conj_pre.to_csv('./output/datasets/pre_dataset.csv', index=False)\n",
"conj_post.to_csv('./output/datasets/post_dataset.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### FSS Filter methods"
"### Feature Analysis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Mutual Info"
"#### Mutual Info"
]
},
{
......@@ -1527,7 +1158,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"###### ANOVA"
"#### ANOVA"
]
},
{
......@@ -1562,6 +1193,13 @@
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Variance Threshold"
]
},
{
"cell_type": "code",
"execution_count": null,
......@@ -1593,23 +1231,6 @@
"plt.savefig('./output/plots/feature_importance/var_threshold.svg', format='svg', dpi=1200)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Export PRE and POST datasets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"conj_pre.to_csv('pre_dataset.csv', index=False)\n",
"conj_post.to_csv('post_dataset.csv', index=False)"
]
}
],
"metadata": {
......
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Libraries"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"from pypair.association import binary_binary, continuous_continuous, binary_continuous\n",
"from sklearn.feature_selection import VarianceThreshold\n",
"from sklearn.feature_selection import SelectKBest\n",
"from sklearn.feature_selection import f_classif\n",
"from sklearn.feature_selection import mutual_info_classif"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### First Steps"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {},
"outputs": [],
"source": [
"# Load the full SPSS export\n",
"bd_all = pd.read_spss('./input/17_abril.sav')\n",
"\n",
"# Keep only the patients with an alcohol diagnosis\n",
"alcohol_mask = bd_all['Alcohol_DxCIE'] == 'Sí'\n",
"bd = bd_all[alcohol_mask]\n",
"\n",
"# Keep only the rows whose 'Situacion_tratamiento' is 'Abandono' or 'Alta terapéutica'\n",
"outcome_mask = bd['Situacion_tratamiento'].isin(['Abandono', 'Alta terapéutica'])\n",
"bd = bd[outcome_mask]"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\2495984927.py:18: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" conj_post['Group'] = 'Post'\n",
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\2495984927.py:19: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" conj_pre['Group'] = 'Pre'\n"
]
}
],
"source": [
"# Pre-pandemic\n",
"# .copy() makes conj_pre an independent frame, so adding the 'Group' column below\n",
"# no longer raises SettingWithCopyWarning\n",
"conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia'].copy()\n",
"# Pre-pandemic abandono\n",
"pre_abandono = conj_pre[conj_pre['Situacion_tratamiento'] == 'Abandono']\n",
"# Pre-pandemic alta\n",
"pre_alta = conj_pre[conj_pre['Situacion_tratamiento'] == 'Alta terapéutica']\n",
"\n",
"# Post-pandemic\n",
"# Merging last two classes to balance sets\n",
"conj_post = bd[(bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio prepandemia y fin en pandemia') | \n",
"               (bd['Pandemia_inicio_fin_tratamiento'] == 'inicio y fin en pandemia')].copy()\n",
"# Post-pandemic abandono\n",
"post_abandono = conj_post[conj_post['Situacion_tratamiento'] == 'Abandono']\n",
"# Post-pandemic alta\n",
"post_alta = conj_post[conj_post['Situacion_tratamiento'] == 'Alta terapéutica']\n",
"\n",
"# Concatenate the two data frames and add a new column to distinguish between them. Useful for plots.\n",
"# The four outcome subsets above were taken before this point, so they do not carry 'Group'.\n",
"conj_post['Group'] = 'Post'\n",
"conj_pre['Group'] = 'Pre'\n",
"combined_pre_post = pd.concat([conj_post, conj_pre])"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PRE: 22861\n",
"\tALTA: 2792\n",
"\tABANDONO: 20069\n",
"POST: 10677\n",
"\tALTA: 1882\n",
"\tABANDONO: 8795\n"
]
}
],
"source": [
"# Report the number of rows of each period and, within it, of each treatment outcome\n",
"for period_name, period_df, alta_df, abandono_df in (\n",
"    ('PRE', conj_pre, pre_alta, pre_abandono),\n",
"    ('POST', conj_post, post_alta, post_abandono),\n",
"):\n",
"    print(f\"{period_name}: {len(period_df)}\")\n",
"    print(f\"\\tALTA: {len(alta_df)}\")\n",
"    print(f\"\\tABANDONO: {len(abandono_df)}\")"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PRE\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 22861 entries, 0 to 85164\n",
"Data columns (total 35 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 CODPROYECTO 22861 non-null float64 \n",
" 1 Education 22861 non-null object \n",
" 2 Social_protection 22861 non-null object \n",
" 3 Job_insecurity 22861 non-null object \n",
" 4 Housing 22861 non-null object \n",
" 5 Alterations_early_childhood_develop 22861 non-null object \n",
" 6 Social_inclusion 22861 non-null object \n",
" 7 Risk_stigma 21606 non-null category\n",
" 8 Structural_conflic 22861 non-null float64 \n",
" 9 Age 22852 non-null float64 \n",
" 10 Sex 22861 non-null object \n",
" 11 NumHijos 21647 non-null float64 \n",
" 12 Smoking 22861 non-null object \n",
" 13 Biological_vulnerability 22861 non-null object \n",
" 14 Alcohol_DxCIE 22861 non-null object \n",
" 15 Opiaceos_DxCIE 22861 non-null object \n",
" 16 Cannabis_DXCIE 22861 non-null object \n",
" 17 BZD_DxCIE 22861 non-null object \n",
" 18 Cocaina_DxCIE 22861 non-null object \n",
" 19 Alucinogenos_DXCIE 22861 non-null object \n",
" 20 Tabaco_DXCIE 22861 non-null object \n",
" 21 FrecuenciaConsumo30Dias 22861 non-null object \n",
" 22 Años_consumo_droga 22342 non-null float64 \n",
" 23 OtrosDx_Psiquiatrico 22861 non-null object \n",
" 24 Tx_previos 22861 non-null object \n",
" 25 Adherencia_tto_recalc 22861 non-null float64 \n",
" 26 Tiempo_tx 22861 non-null float64 \n",
" 27 Readmisiones_estudios 22861 non-null object \n",
" 28 Situacion_tratamiento 22861 non-null object \n",
" 29 Periodos_COVID 22861 non-null object \n",
" 30 Pandemia_inicio_fin_tratamiento 22861 non-null object \n",
" 31 Nreadmision 22861 non-null float64 \n",
" 32 Readmisiones_PRECOVID 22861 non-null float64 \n",
" 33 Readmisiones_COVID 22861 non-null float64 \n",
" 34 Group 22861 non-null object \n",
"dtypes: category(1), float64(10), object(24)\n",
"memory usage: 6.1+ MB\n",
"None\n",
"-------------------------------\n",
"PRE-ABANDONO\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 20069 entries, 0 to 85164\n",
"Data columns (total 34 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 CODPROYECTO 20069 non-null float64 \n",
" 1 Education 20069 non-null object \n",
" 2 Social_protection 20069 non-null object \n",
" 3 Job_insecurity 20069 non-null object \n",
" 4 Housing 20069 non-null object \n",
" 5 Alterations_early_childhood_develop 20069 non-null object \n",
" 6 Social_inclusion 20069 non-null object \n",
" 7 Risk_stigma 18919 non-null category\n",
" 8 Structural_conflic 20069 non-null float64 \n",
" 9 Age 20061 non-null float64 \n",
" 10 Sex 20069 non-null object \n",
" 11 NumHijos 18958 non-null float64 \n",
" 12 Smoking 20069 non-null object \n",
" 13 Biological_vulnerability 20069 non-null object \n",
" 14 Alcohol_DxCIE 20069 non-null object \n",
" 15 Opiaceos_DxCIE 20069 non-null object \n",
" 16 Cannabis_DXCIE 20069 non-null object \n",
" 17 BZD_DxCIE 20069 non-null object \n",
" 18 Cocaina_DxCIE 20069 non-null object \n",
" 19 Alucinogenos_DXCIE 20069 non-null object \n",
" 20 Tabaco_DXCIE 20069 non-null object \n",
" 21 FrecuenciaConsumo30Dias 20069 non-null object \n",
" 22 Años_consumo_droga 19609 non-null float64 \n",
" 23 OtrosDx_Psiquiatrico 20069 non-null object \n",
" 24 Tx_previos 20069 non-null object \n",
" 25 Adherencia_tto_recalc 20069 non-null float64 \n",
" 26 Tiempo_tx 20069 non-null float64 \n",
" 27 Readmisiones_estudios 20069 non-null object \n",
" 28 Situacion_tratamiento 20069 non-null object \n",
" 29 Periodos_COVID 20069 non-null object \n",
" 30 Pandemia_inicio_fin_tratamiento 20069 non-null object \n",
" 31 Nreadmision 20069 non-null float64 \n",
" 32 Readmisiones_PRECOVID 20069 non-null float64 \n",
" 33 Readmisiones_COVID 20069 non-null float64 \n",
"dtypes: category(1), float64(10), object(23)\n",
"memory usage: 5.2+ MB\n",
"None\n",
"-------------------------------\n",
"PRE-ALTA\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 2792 entries, 23 to 85159\n",
"Data columns (total 34 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 CODPROYECTO 2792 non-null float64 \n",
" 1 Education 2792 non-null object \n",
" 2 Social_protection 2792 non-null object \n",
" 3 Job_insecurity 2792 non-null object \n",
" 4 Housing 2792 non-null object \n",
" 5 Alterations_early_childhood_develop 2792 non-null object \n",
" 6 Social_inclusion 2792 non-null object \n",
" 7 Risk_stigma 2687 non-null category\n",
" 8 Structural_conflic 2792 non-null float64 \n",
" 9 Age 2791 non-null float64 \n",
" 10 Sex 2792 non-null object \n",
" 11 NumHijos 2689 non-null float64 \n",
" 12 Smoking 2792 non-null object \n",
" 13 Biological_vulnerability 2792 non-null object \n",
" 14 Alcohol_DxCIE 2792 non-null object \n",
" 15 Opiaceos_DxCIE 2792 non-null object \n",
" 16 Cannabis_DXCIE 2792 non-null object \n",
" 17 BZD_DxCIE 2792 non-null object \n",
" 18 Cocaina_DxCIE 2792 non-null object \n",
" 19 Alucinogenos_DXCIE 2792 non-null object \n",
" 20 Tabaco_DXCIE 2792 non-null object \n",
" 21 FrecuenciaConsumo30Dias 2792 non-null object \n",
" 22 Años_consumo_droga 2733 non-null float64 \n",
" 23 OtrosDx_Psiquiatrico 2792 non-null object \n",
" 24 Tx_previos 2792 non-null object \n",
" 25 Adherencia_tto_recalc 2792 non-null float64 \n",
" 26 Tiempo_tx 2792 non-null float64 \n",
" 27 Readmisiones_estudios 2792 non-null object \n",
" 28 Situacion_tratamiento 2792 non-null object \n",
" 29 Periodos_COVID 2792 non-null object \n",
" 30 Pandemia_inicio_fin_tratamiento 2792 non-null object \n",
" 31 Nreadmision 2792 non-null float64 \n",
" 32 Readmisiones_PRECOVID 2792 non-null float64 \n",
" 33 Readmisiones_COVID 2792 non-null float64 \n",
"dtypes: category(1), float64(10), object(23)\n",
"memory usage: 744.5+ KB\n",
"None\n",
"-------------------------------\n",
"\n",
"\n",
"\n",
"\n",
"POST\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 10677 entries, 11 to 85156\n",
"Data columns (total 35 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 CODPROYECTO 10677 non-null float64 \n",
" 1 Education 10677 non-null object \n",
" 2 Social_protection 10677 non-null object \n",
" 3 Job_insecurity 10677 non-null object \n",
" 4 Housing 10677 non-null object \n",
" 5 Alterations_early_childhood_develop 10677 non-null object \n",
" 6 Social_inclusion 10677 non-null object \n",
" 7 Risk_stigma 10085 non-null category\n",
" 8 Structural_conflic 10677 non-null float64 \n",
" 9 Age 10676 non-null float64 \n",
" 10 Sex 10677 non-null object \n",
" 11 NumHijos 10103 non-null float64 \n",
" 12 Smoking 10677 non-null object \n",
" 13 Biological_vulnerability 10677 non-null object \n",
" 14 Alcohol_DxCIE 10677 non-null object \n",
" 15 Opiaceos_DxCIE 10677 non-null object \n",
" 16 Cannabis_DXCIE 10677 non-null object \n",
" 17 BZD_DxCIE 10677 non-null object \n",
" 18 Cocaina_DxCIE 10677 non-null object \n",
" 19 Alucinogenos_DXCIE 10677 non-null object \n",
" 20 Tabaco_DXCIE 10677 non-null object \n",
" 21 FrecuenciaConsumo30Dias 10677 non-null object \n",
" 22 Años_consumo_droga 10478 non-null float64 \n",
" 23 OtrosDx_Psiquiatrico 10677 non-null object \n",
" 24 Tx_previos 10677 non-null object \n",
" 25 Adherencia_tto_recalc 10677 non-null float64 \n",
" 26 Tiempo_tx 10677 non-null float64 \n",
" 27 Readmisiones_estudios 10677 non-null object \n",
" 28 Situacion_tratamiento 10677 non-null object \n",
" 29 Periodos_COVID 10677 non-null object \n",
" 30 Pandemia_inicio_fin_tratamiento 10677 non-null object \n",
" 31 Nreadmision 10677 non-null float64 \n",
" 32 Readmisiones_PRECOVID 10677 non-null float64 \n",
" 33 Readmisiones_COVID 10677 non-null float64 \n",
" 34 Group 10677 non-null object \n",
"dtypes: category(1), float64(10), object(24)\n",
"memory usage: 2.9+ MB\n",
"None\n",
"-------------------------------\n",
"POST-ABANDONO\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 8795 entries, 11 to 85156\n",
"Data columns (total 34 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 CODPROYECTO 8795 non-null float64 \n",
" 1 Education 8795 non-null object \n",
" 2 Social_protection 8795 non-null object \n",
" 3 Job_insecurity 8795 non-null object \n",
" 4 Housing 8795 non-null object \n",
" 5 Alterations_early_childhood_develop 8795 non-null object \n",
" 6 Social_inclusion 8795 non-null object \n",
" 7 Risk_stigma 8308 non-null category\n",
" 8 Structural_conflic 8795 non-null float64 \n",
" 9 Age 8794 non-null float64 \n",
" 10 Sex 8795 non-null object \n",
" 11 NumHijos 8325 non-null float64 \n",
" 12 Smoking 8795 non-null object \n",
" 13 Biological_vulnerability 8795 non-null object \n",
" 14 Alcohol_DxCIE 8795 non-null object \n",
" 15 Opiaceos_DxCIE 8795 non-null object \n",
" 16 Cannabis_DXCIE 8795 non-null object \n",
" 17 BZD_DxCIE 8795 non-null object \n",
" 18 Cocaina_DxCIE 8795 non-null object \n",
" 19 Alucinogenos_DXCIE 8795 non-null object \n",
" 20 Tabaco_DXCIE 8795 non-null object \n",
" 21 FrecuenciaConsumo30Dias 8795 non-null object \n",
" 22 Años_consumo_droga 8627 non-null float64 \n",
" 23 OtrosDx_Psiquiatrico 8795 non-null object \n",
" 24 Tx_previos 8795 non-null object \n",
" 25 Adherencia_tto_recalc 8795 non-null float64 \n",
" 26 Tiempo_tx 8795 non-null float64 \n",
" 27 Readmisiones_estudios 8795 non-null object \n",
" 28 Situacion_tratamiento 8795 non-null object \n",
" 29 Periodos_COVID 8795 non-null object \n",
" 30 Pandemia_inicio_fin_tratamiento 8795 non-null object \n",
" 31 Nreadmision 8795 non-null float64 \n",
" 32 Readmisiones_PRECOVID 8795 non-null float64 \n",
" 33 Readmisiones_COVID 8795 non-null float64 \n",
"dtypes: category(1), float64(10), object(23)\n",
"memory usage: 2.3+ MB\n",
"None\n",
"-------------------------------\n",
"POST-ALTA\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 1882 entries, 258 to 85149\n",
"Data columns (total 34 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 CODPROYECTO 1882 non-null float64 \n",
" 1 Education 1882 non-null object \n",
" 2 Social_protection 1882 non-null object \n",
" 3 Job_insecurity 1882 non-null object \n",
" 4 Housing 1882 non-null object \n",
" 5 Alterations_early_childhood_develop 1882 non-null object \n",
" 6 Social_inclusion 1882 non-null object \n",
" 7 Risk_stigma 1777 non-null category\n",
" 8 Structural_conflic 1882 non-null float64 \n",
" 9 Age 1882 non-null float64 \n",
" 10 Sex 1882 non-null object \n",
" 11 NumHijos 1778 non-null float64 \n",
" 12 Smoking 1882 non-null object \n",
" 13 Biological_vulnerability 1882 non-null object \n",
" 14 Alcohol_DxCIE 1882 non-null object \n",
" 15 Opiaceos_DxCIE 1882 non-null object \n",
" 16 Cannabis_DXCIE 1882 non-null object \n",
" 17 BZD_DxCIE 1882 non-null object \n",
" 18 Cocaina_DxCIE 1882 non-null object \n",
" 19 Alucinogenos_DXCIE 1882 non-null object \n",
" 20 Tabaco_DXCIE 1882 non-null object \n",
" 21 FrecuenciaConsumo30Dias 1882 non-null object \n",
" 22 Años_consumo_droga 1851 non-null float64 \n",
" 23 OtrosDx_Psiquiatrico 1882 non-null object \n",
" 24 Tx_previos 1882 non-null object \n",
" 25 Adherencia_tto_recalc 1882 non-null float64 \n",
" 26 Tiempo_tx 1882 non-null float64 \n",
" 27 Readmisiones_estudios 1882 non-null object \n",
" 28 Situacion_tratamiento 1882 non-null object \n",
" 29 Periodos_COVID 1882 non-null object \n",
" 30 Pandemia_inicio_fin_tratamiento 1882 non-null object \n",
" 31 Nreadmision 1882 non-null float64 \n",
" 32 Readmisiones_PRECOVID 1882 non-null float64 \n",
" 33 Readmisiones_COVID 1882 non-null float64 \n",
"dtypes: category(1), float64(10), object(23)\n",
"memory usage: 501.9+ KB\n",
"None\n",
"-------------------------------\n"
]
}
],
"source": [
"# DataFrame.info() writes its report to stdout itself and returns None, so it is called directly:\n",
"# wrapping it in print() appended a stray \"None\" line after every report.\n",
"separator = \"-------------------------------\"\n",
"\n",
"for subset_title, subset_df in ((\"PRE\", conj_pre), (\"PRE-ABANDONO\", pre_abandono), (\"PRE-ALTA\", pre_alta)):\n",
"    print(subset_title)\n",
"    subset_df.info()\n",
"    print(separator)\n",
"\n",
"# Blank lines between the PRE and POST reports\n",
"print(\"\\n\\n\\n\")\n",
"\n",
"for subset_title, subset_df in ((\"POST\", conj_post), (\"POST-ABANDONO\", post_abandono), (\"POST-ALTA\", post_alta)):\n",
"    print(subset_title)\n",
"    subset_df.info()\n",
"    print(separator)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Missing and Unknown Values"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Live with families or friends' 'live alone' 'live in institutions' '9.0']\n",
"['Live with families or friends' 'live alone' 'live in institutions']\n",
"['No alterations (first exposure at 11 or more years)'\n",
" 'Alterations (first exposure before 11 years old)' '9']\n",
"['No alterations (first exposure at 11 or more years)'\n",
" 'Alterations (first exposure before 11 years old)']\n",
"[NaN, 'Yes', 'No']\n",
"Categories (3, object): [99.0, 'No', 'Yes']\n",
"[NaN, 'Yes', 'No']\n",
"Categories (2, object): ['No', 'Yes']\n",
"[nan 0. 1. 2. 3. 4. 5. 8. 10. 6. 11. 12. 9. 7. 99. 14. 15.]\n",
"[nan 0. 1. 2. 3. 4. 5. 8. 10. 6. 11. 12. 9. 7. 14. 15.]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\1003504044.py:14: FutureWarning: The behavior of Series.replace (and DataFrame.replace) with CategoricalDtype is deprecated. In a future version, replace will only be used for cases that preserve the categories. To change the categories, use ser.cat.rename_categories instead.\n",
" bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n"
]
}
],
"source": [
"# 9.0 represents unknown according to Variables.docx \n",
"# Each sentinel code handled below ('9.0', '9', 99.0) is replaced by the mode of its column;\n",
"# the unique values are printed before and after to check the replacement.\n",
"# NOTE(review): conj_pre / conj_post were sliced from bd in an earlier cell, so these\n",
"# replacements only reach bd, not those frames - confirm they are re-derived afterwards.\n",
"print(bd['Social_inclusion'].unique())\n",
"mode_soc_inc = bd['Social_inclusion'].mode()[0]\n",
"bd['Social_inclusion'] = bd['Social_inclusion'].replace('9.0', mode_soc_inc)\n",
"print(bd['Social_inclusion'].unique())\n",
"\n",
"print(bd['Alterations_early_childhood_develop'].unique())\n",
"mode_alt = bd['Alterations_early_childhood_develop'].mode()[0]\n",
"bd['Alterations_early_childhood_develop'] = bd['Alterations_early_childhood_develop'].replace('9', mode_alt)\n",
"print(bd['Alterations_early_childhood_develop'].unique())\n",
"\n",
"print(bd['Risk_stigma'].unique())\n",
"mode_stigma = bd['Risk_stigma'].mode()[0]\n",
"# Risk_stigma is categorical and Series.replace on a categorical is deprecated (FutureWarning),\n",
"# so the 99.0 code is masked explicitly and its now-unused category is dropped afterwards\n",
"stigma = bd['Risk_stigma']\n",
"bd['Risk_stigma'] = stigma.where(stigma != 99.0, mode_stigma).cat.remove_unused_categories()\n",
"print(bd['Risk_stigma'].unique())\n",
"\n",
"print(bd['NumHijos'].unique())\n",
"mode_hijos = bd['NumHijos'].mode()[0]\n",
"bd['NumHijos'] = bd['NumHijos'].replace(99.0, mode_hijos)\n",
"print(bd['NumHijos'].unique())"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total missing values Age: 10\n",
"Total missing values Años_consumo_droga: 718\n",
"Total missing values Risk_stigma: 1847\n",
"Total missing values NumHijos: 1788\n",
"\tCONJUNTO PREPANDEMIA\n",
"\t\tMissing values Age: 9\n",
"\t\tMissing values Años_consumo_droga: 519\n",
"\t\tMissing values Risk_stigma: 1255\n",
"\t\tMissing values NumHijos: 1214\n",
"\tCONJUNTO POSTPANDEMIA\n",
"\t\tMissing values Age: 1\n",
"\t\tMissing values Años_consumo_droga: 199\n",
"\t\tMissing values Risk_stigma: 592\n",
"\t\tMissing values NumHijos: 574\n"
]
}
],
"source": [
"# Columns inspected for missing values\n",
"nan_cols = ['Age', 'Años_consumo_droga', 'Risk_stigma', 'NumHijos']\n",
"\n",
"# Counts over the whole filtered dataset\n",
"for nan_col in nan_cols:\n",
"    print(f\"Total missing values {nan_col}: {bd[nan_col].isnull().sum()}\")\n",
"\n",
"# Same counts within the pre- and post-pandemic sets\n",
"for set_header, set_df in ((\"\\tCONJUNTO PREPANDEMIA\", conj_pre), (\"\\tCONJUNTO POSTPANDEMIA\", conj_post)):\n",
"    print(set_header)\n",
"    for nan_col in nan_cols:\n",
"        print(f\"\\t\\tMissing values {nan_col}: {set_df[nan_col].isnull().sum()}\")"
]
},
{
"cell_type": "code",
"execution_count": 134,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\3303146707.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
"\n",
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
"\n",
"\n",
" bd['Age'].fillna(age_mode, inplace=True)\n",
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\3303146707.py:5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
"\n",
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
"\n",
"\n",
" bd['Años_consumo_droga'].fillna(años_consumo_mode, inplace=True)\n",
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\3303146707.py:8: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
"\n",
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
"\n",
"\n",
" bd['Risk_stigma'].fillna(risk_stigma_mode, inplace=True)\n",
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\3303146707.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
"\n",
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
"\n",
"\n",
" bd['NumHijos'].fillna(num_hijos_mode, inplace=True)\n"
]
}
],
"source": [
"# Fill the missing values of each column with that column's mode.\n",
"# The filled Series is assigned back to bd: bd[col].fillna(..., inplace=True) is a chained\n",
"# assignment that pandas warns will no longer modify bd in pandas 3.0.\n",
"age_mode = bd['Age'].mode()[0]\n",
"bd['Age'] = bd['Age'].fillna(age_mode)\n",
"\n",
"años_consumo_mode = bd['Años_consumo_droga'].mode()[0]\n",
"bd['Años_consumo_droga'] = bd['Años_consumo_droga'].fillna(años_consumo_mode)\n",
"\n",
"risk_stigma_mode = bd['Risk_stigma'].mode()[0]\n",
"bd['Risk_stigma'] = bd['Risk_stigma'].fillna(risk_stigma_mode)\n",
"\n",
"num_hijos_mode = bd['NumHijos'].mode()[0]\n",
"bd['NumHijos'] = bd['NumHijos'].fillna(num_hijos_mode)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Distribution of Variables"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"disc_atts = ['Education', 'Social_protection', 'Job_insecurity', 'Housing',\n",
" 'Alterations_early_childhood_develop', 'Social_inclusion',\n",
" 'Risk_stigma', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability',\n",
" 'Opiaceos_DxCIE', 'Cannabis_DXCIE', 'BZD_DxCIE', 'Cocaina_DxCIE',\n",
" 'Alucinogenos_DXCIE', 'Tabaco_DXCIE', 'FrecuenciaConsumo30Dias',\n",
" 'OtrosDx_Psiquiatrico', 'Tx_previos', 'Readmisiones_estudios', 'Nreadmision'\n",
" ]\n",
"\n",
"num_atts = ['Structural_conflic', 'Adherencia_tto_recalc', 'Age', 'Años_consumo_droga', 'Tiempo_tx']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Discrete"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Countplots"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, axs = plt.subplots(len(disc_atts), 1, figsize=(15, 5*len(disc_atts)))\n",
"plt.subplots_adjust(hspace=0.75, wspace=1.25)\n",
"\n",
"# (Situacion_tratamiento, Group) pair of every row, used as hue. It does not depend on the\n",
"# attribute being plotted, so it is built once here instead of on every loop iteration.\n",
"outcome_group = combined_pre_post[['Situacion_tratamiento', 'Group']].apply(tuple, axis=1)\n",
"outcome_group_order = [('Abandono', 'Pre'),('Alta terapéutica', 'Pre'), ('Abandono', 'Post'), ('Alta terapéutica', 'Post')]\n",
"\n",
"# One countplot per discrete attribute, one subplot row each\n",
"for i, disc_att in enumerate(disc_atts):\n",
"    ax = sns.countplot(x=disc_att, data=combined_pre_post, hue=outcome_group,\n",
"                       hue_order=outcome_group_order,\n",
"                       ax=axs[i])\n",
"    ax.set_title(disc_att, fontsize=16, fontweight='bold')\n",
"    ax.get_legend().set_title(\"Groups\")\n",
"    \n",
"    # Adding count annotations\n",
"    for p in ax.patches:\n",
"        # Only annotate patches labelled '_nolegend_'\n",
"        if p.get_label() == '_nolegend_':\n",
"            ax.annotate(format(p.get_height(), '.0f'), \n",
"                        (p.get_x() + p.get_width() / 2., p.get_height()), \n",
"                        ha = 'center', va = 'center', \n",
"                        xytext = (0, 9), \n",
"                        textcoords = 'offset points')\n",
"\n",
"# Adjust layout to prevent overlapping titles\n",
"plt.tight_layout()\n",
"\n",
"plt.savefig('./output/plots/distributions/countplots.svg', dpi=600, bbox_inches='tight')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Normalized Countplots"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Function to plot countplot \n",
"def plot_count_perc_norm(i: int, group:int, disc_att:str) -> None:\n",
"    \"\"\"\n",
"    Draw on the notebook-level `axs` grid a bar plot of `disc_att` split by 'Situacion_tratamiento',\n",
"    where each bar is rescaled to the percentage of its category within its 'Situacion_tratamiento' group.\n",
"\n",
"    i: row of `axs` to draw on\n",
"    group: 1 (all), 2 (pre), 3 (post); selects `bd`, `conj_pre` or `conj_post`.\n",
"           The plot goes to column `group - 2` of `axs`.\n",
"    disc_att: name of the discrete column to plot\n",
"\n",
"    Raises ValueError if `group` is not 1, 2 or 3.\n",
"    \"\"\"\n",
"\n",
"    # Define data to work with based on group\n",
"    if group == 1:\n",
"        df = bd \n",
"    elif group == 2:\n",
"        df = conj_pre\n",
"    elif group == 3:\n",
"        df = conj_post\n",
"    else:\n",
"        # Fail early with a clear message instead of hitting an unbound `df` further down\n",
"        raise ValueError(f\"group must be 1 (all), 2 (pre) or 3 (post), got {group!r}\")\n",
"\n",
"    # GOAL: find percentage of each possible category within the total of its situacion_tto subset\n",
"    # Count occurrences of every ('Situacion_tratamiento', disc_att) pair\n",
"    grouped_counts = df.groupby(['Situacion_tratamiento', disc_att]).size().reset_index(name='count')\n",
"    # Number of non-null disc_att values in each 'Situacion_tratamiento' group\n",
"    total_counts = df.groupby('Situacion_tratamiento')[disc_att].count()\n",
"    # Divide each count by the total of its group and express it as a percentage\n",
"    grouped_counts['percentage'] = grouped_counts['count'] / grouped_counts['Situacion_tratamiento'].map(total_counts) * 100\n",
"    \n",
"    # Follow the same order in plot as in computations\n",
"    col_order = grouped_counts[grouped_counts['Situacion_tratamiento'] == 'Abandono'][disc_att].tolist()\n",
"\n",
"    # Create countplot and split each bar into two based on the value of sit_tto\n",
"    # NOTE(review): the column is group-2, so group 1 maps to column -1 (the last column of axs) - confirm intended\n",
"    ax = sns.countplot(x=disc_att, hue='Situacion_tratamiento', data=df, order=col_order, ax=axs[i, group-2])\n",
"\n",
"    # The bar heights are overwritten with percentages below, so fix the y-axis to 0-100\n",
"    ax.set_ylim(0, 100)\n",
"\n",
"    percentages = grouped_counts['percentage']\n",
"    # The loop index is named patch_idx so that it does not shadow the row parameter `i`.\n",
"    # NOTE(review): assumes ax.patches come in the same order as the rows of grouped_counts - confirm\n",
"    for patch_idx, p in enumerate(ax.patches):\n",
"        # Skip going over the legend values\n",
"        if p.get_label() == \"_nolegend_\":\n",
"            # Set height to corresponding percentage and annotate result\n",
"            height = percentages[patch_idx]\n",
"            p.set_height(height)\n",
"            ax.annotate(f'{height:.2f}%', (p.get_x() + p.get_width() / 2., height),\n",
"                    ha='center', va='bottom', fontsize=6, color='black', xytext=(0, 5),\n",
"                    textcoords='offset points')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, axs = plt.subplots(len(disc_atts), 2, figsize=(15, 7*len(disc_atts)))\n",
"plt.subplots_adjust(hspace=0.75, wspace=1.5)\n",
"\n",
"# One row per attribute: PRE (group 2) in the left column, POST (group 3) in the right one.\n",
"# Group 1 (ALL) is not plotted here.\n",
"panels = ((2, \"\\nPRE\"), (3, \"\\nPOST\"))\n",
"\n",
"for i, disc_att in enumerate(disc_atts):\n",
"    for panel_col, (panel_group, panel_title) in enumerate(panels):\n",
"        # Draw the normalized countplot, then decorate the Axes it was drawn on\n",
"        plot_count_perc_norm(i, panel_group, disc_att)\n",
"        panel_ax = axs[i, panel_col]\n",
"        panel_ax.set_title(panel_title)\n",
"        panel_ax.set_xlabel(disc_att, fontweight='bold')\n",
"        panel_ax.set_ylabel(\"% of total within its Sit_tto group\")\n",
"        panel_ax.tick_params(axis='x', rotation=90)\n",
"\n",
"# Adjust layout to prevent overlapping titles\n",
"plt.tight_layout()\n",
"\n",
"# Save the figure in SVG format with DPI=600\n",
"plt.savefig('./output/plots/distributions/norm_countplots.svg', dpi=600, bbox_inches='tight')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Numerical"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Summary Stats"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Left as the cell's last expression so Jupyter renders the summary as a table instead of printed text\n",
"bd[num_atts].describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Boxplots"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, axs = plt.subplots(len(num_atts), 1, figsize=(12, 5*len(num_atts)))\n",
"plt.subplots_adjust(hspace=0.75, wspace=1.5)\n",
"\n",
"# One boxplot row per numerical attribute, split by Group (y) and treatment outcome (hue).\n",
"# Each plot is drawn on the Axes created above, passed explicitly through ax=,\n",
"# instead of re-selecting it through plt.subplot.\n",
"for i, num_att in enumerate(num_atts):\n",
"    sns.boxplot(\n",
"        data=combined_pre_post,\n",
"        x = num_att,\n",
"        y = 'Group',\n",
"        hue='Situacion_tratamiento',\n",
"        ax=axs[i],\n",
"    )\n",
"\n",
"# Adjust layout to prevent overlapping titles\n",
"plt.tight_layout()\n",
"\n",
"# Save the figure in SVG format with DPI=600\n",
"plt.savefig('./output/plots/distributions/boxplots.svg', dpi=600, bbox_inches='tight')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Histograms"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, axs = plt.subplots(len(num_atts), 3, figsize=(15, 6*len(num_atts)))\n",
"plt.subplots_adjust(hspace=0.75, wspace=1.5)\n",
"\n",
"# Column 0: all alcohol patients, column 1: PRE, column 2: POST\n",
"hist_sources = ((bd, 'ALL'), (conj_pre, 'PRE'), (conj_post, 'POST'))\n",
"\n",
"for i, num_att in enumerate(num_atts):\n",
"    for hist_col, (hist_df, hist_tag) in enumerate(hist_sources):\n",
"        hist_ax = axs[i, hist_col]\n",
"        # Per-outcome histogram (each outcome normalized on its own) with a KDE curve on top\n",
"        sns.histplot(data=hist_df, x=num_att, bins=15, hue='Situacion_tratamiento', stat='probability',\n",
"                     common_norm=False, kde=True, line_kws={'lw': 5}, alpha=0.4, ax=hist_ax)\n",
"        hist_ax.set_title(f\"\\nDistr. of {num_att} - {hist_tag}\")\n",
"\n",
"# Adjust layout to prevent overlapping titles\n",
"plt.tight_layout()\n",
"\n",
"# Save the figure in SVG format with DPI=600\n",
"plt.savefig('./output/plots/distributions/histograms.svg', dpi=600, bbox_inches='tight')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Correlation Analysis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Groups of Variables"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {},
"outputs": [],
"source": [
"# Social-context variables (column names as used in bd)\n",
"social_vars = [\n",
"    'Education', 'Social_protection', 'Job_insecurity', 'Housing',\n",
"    'Alterations_early_childhood_develop', 'Social_inclusion', 'Risk_stigma', 'Structural_conflic',\n",
"]\n",
"\n",
"# Individual-level variables\n",
"ind_vars = [\n",
"    'Age', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability',\n",
"    'Opiaceos_DxCIE', 'Cannabis_DXCIE', 'BZD_DxCIE', 'Cocaina_DxCIE', 'Alucinogenos_DXCIE', 'Tabaco_DXCIE',\n",
"    'FrecuenciaConsumo30Dias', 'Años_consumo_droga', 'OtrosDx_Psiquiatrico', 'Tx_previos', 'Adherencia_tto_recalc',\n",
"]\n",
"\n",
"# Target variable\n",
"target_var = 'Situacion_tratamiento'\n",
"\n",
"# Columns that are already numeric: used as-is, no redefinition needed\n",
"no_redef_cols = ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### One-hot Encoding"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Binary"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [],
"source": [
"# 0/1 encoding of the binary attributes: each '<col>_REDEF' column is bd[col] passed through its mapping.\n",
"# Values that are not a key of the mapping become NaN (pandas Series.map behaviour).\n",
"\n",
"# 'Alterations_early_childhood_develop'\n",
"alterations_mapping = {\n",
"    'No alterations (first exposure at 11 or more years)': 0,\n",
"    'Alterations (first exposure before 11 years old)': 1,\n",
"}\n",
"\n",
"# Column -> mapping, listed in the order in which the '_REDEF' columns are created\n",
"binary_mappings = {\n",
"    'Alterations_early_childhood_develop': alterations_mapping,\n",
"    'Social_protection': {'No': 0, 'Sí': 1},\n",
"    'Risk_stigma': {'No': 0, 'Yes': 1},\n",
"    'Sex': {'Hombre': 0, 'Mujer': 1},\n",
"    'Smoking': {'No': 0, 'Sí': 1},\n",
"    'Biological_vulnerability': {'No': 0, 'Sí': 1},\n",
"    # 'Droga_DxCIE'\n",
"    'Opiaceos_DxCIE': {'No': 0, 'Sí': 1},\n",
"    'Cannabis_DXCIE': {'No': 0, 'Sí': 1},\n",
"    'BZD_DxCIE': {'No': 0, 'Sí': 1},\n",
"    'Cocaina_DxCIE': {'No': 0, 'Sí': 1},\n",
"    'Alucinogenos_DXCIE': {'No': 0, 'Sí': 1},\n",
"    'Tabaco_DXCIE': {'No': 0, 'Sí': 1},\n",
"    'OtrosDx_Psiquiatrico': {'No': 0, 'Sí': 1},\n",
"    'Tx_previos': {'No': 0, 'Sí': 1},\n",
"    # 'Situacion_tratamiento' (!!!!!) - important to define properly: 'Abandono' is the positive class (1)\n",
"    'Situacion_tratamiento': {'Abandono': 1, 'Alta terapéutica': 0},\n",
"}\n",
"\n",
"for bin_col, bin_mapping in binary_mappings.items():\n",
"    bd[bin_col + '_REDEF'] = bd[bin_col].map(bin_mapping)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Categorical"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {},
"outputs": [],
"source": [
"# Categorical columns to one-hot encode; leave the list empty to skip encoding\n",
"one_hot_vars = ['Education', 'Job_insecurity', 'Housing', 'Social_inclusion', 'FrecuenciaConsumo30Dias']\n",
"\n",
"# Prefix of the dummy columns generated for each variable\n",
"one_hots_vars_prefix = {\n",
"    'Education': 'Ed',\n",
"    'Job_insecurity': 'JobIn',\n",
"    'Housing': 'Hous',\n",
"    'Social_inclusion': 'SocInc',\n",
"    'FrecuenciaConsumo30Dias': 'Frec30',\n",
"}\n",
"\n",
"# Build the dummy columns of every variable first ...\n",
"encoded_frames = {\n",
"    var: pd.get_dummies(bd[var], prefix=one_hots_vars_prefix[var])\n",
"    for var in one_hot_vars\n",
"}\n",
"\n",
"# ... remember which dummy columns belong to which original variable ...\n",
"one_hot_cols_dic = {var: frame.columns.tolist() for var, frame in encoded_frames.items()}\n",
"\n",
"# ... and append them all to the main df in a single concat\n",
"bd = pd.concat([bd, *encoded_frames.values()], axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Final Columns"
]
},
{
"cell_type": "code",
"execution_count": 142,
"metadata": {},
"outputs": [],
"source": [
"def encoded_column_names(variables):\n",
"    \"\"\"\n",
"    Return the names of the encoded columns that correspond to the given original variables.\n",
"\n",
"    Each variable expands to: itself if it is in no_redef_cols, all of its one-hot columns\n",
"    (from one_hot_cols_dic) if it is in one_hot_vars, or '<name>_REDEF' otherwise.\n",
"    The order of `variables` is preserved.\n",
"    \"\"\"\n",
"    encoded = []\n",
"    for var in variables:\n",
"        if var in no_redef_cols:\n",
"            # No need to redefine: use the column directly\n",
"            encoded.append(var)\n",
"        elif var in one_hot_vars:\n",
"            # One-hot encoded: use all of its dummy columns\n",
"            encoded.extend(one_hot_cols_dic[var])\n",
"        else:\n",
"            # Redefined through a mapping\n",
"            encoded.append(var + '_REDEF')\n",
"    return encoded\n",
"\n",
"soc_vars_enc = encoded_column_names(social_vars)\n",
"ind_vars_enc = encoded_column_names(ind_vars)\n",
"\n",
"# Final version of columns we need to use for correlation analysis\n",
"corr_cols = soc_vars_enc + ind_vars_enc"
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {},
"outputs": [],
"source": [
"# Drop unknown columns from every list of feature names\n",
"unknown_cols = ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']\n",
"\n",
"corr_cols = [col for col in corr_cols if col not in unknown_cols]\n",
"soc_vars_enc = [col for col in soc_vars_enc if col not in unknown_cols]\n",
"ind_vars_enc = [col for col in ind_vars_enc if col not in unknown_cols]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Renaming and Filtering"
]
},
{
"cell_type": "code",
"execution_count": 144,
"metadata": {},
"outputs": [],
"source": [
"# Keep the correlation features plus the target (original and encoded) and the pandemic-period column\n",
"extra_cols = ['Situacion_tratamiento', 'Situacion_tratamiento_REDEF', 'Pandemia_inicio_fin_tratamiento']\n",
"columns_to_keep = corr_cols + extra_cols\n",
"bd = bd.loc[:, columns_to_keep]"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [],
"source": [
"# Current column name -> final column name (every kept column must appear as a key)\n",
"name_mapping = {\n",
"    # --- Social factors ---\n",
"    'Ed_Not Complete primary school': 'Ed_Not_Complete_Primary',\n",
"    'Ed_Primary education': 'Ed_Primary',\n",
"    'Ed_Secondary Education': 'Ed_Secondary',\n",
"    'Ed_Secondary more technical education': 'Ed_Secondary_Technical',\n",
"    'Ed_Tertiary': 'Ed_Tertiary',\n",
"    'Social_protection_REDEF': 'Social_Protection',\n",
"    'JobIn_Non-stable': 'JobIn_Unstable',\n",
"    'JobIn_Stable': 'JobIn_Stable',\n",
"    'JobIn_Unemployed': 'JobIn_Unemployed',\n",
"    'Hous_Institutional': 'Hous_Institutional',\n",
"    'Hous_Stable': 'Hous_Stable',\n",
"    'Hous_Unstable': 'Hous_Unstable',\n",
"    'Alterations_early_childhood_develop_REDEF': 'Early_Alterations',\n",
"    'SocInc_Live with families or friends': 'SocInc_Family_Friends',\n",
"    'SocInc_live alone': 'SocInc_Alone',\n",
"    'SocInc_live in institutions': 'SocInc_Instit',\n",
"    'Risk_stigma_REDEF': 'Risk_Stigma',\n",
"    'Structural_conflic': 'Structural_Conflict',\n",
"    # --- Individual factors ---\n",
"    'Age': 'Age',\n",
"    'Sex_REDEF': 'Sex',\n",
"    'NumHijos': 'Num_Children',\n",
"    'Smoking_REDEF': 'Smoking',\n",
"    'Biological_vulnerability_REDEF': 'Bio_Vulner',\n",
"    'Opiaceos_DxCIE_REDEF': 'Opiods_DXCIE',\n",
"    'Cannabis_DXCIE_REDEF': 'Cannabis_DXCIE',\n",
"    'BZD_DxCIE_REDEF': 'BZD_DXCIE',\n",
"    'Cocaina_DxCIE_REDEF': 'Cocaine_DXCIE',\n",
"    'Alucinogenos_DXCIE_REDEF': 'Hallucin_DXCIE',\n",
"    'Tabaco_DXCIE_REDEF': 'Tobacco_DXCIE',\n",
"    'Frec30_1 día/semana': 'Freq_1dpw',\n",
"    'Frec30_2-3 días\u200e/semana': 'Freq_2-3dpw',\n",
"    'Frec30_4-6 días/semana': 'Freq_4-6dpw',\n",
"    'Frec30_Menos de 1 día\u200e/semana': 'Freq_l1dpw',\n",
"    'Frec30_No consumio': 'Freq_None',\n",
"    'Frec30_Todos los días': 'Freq_Everyday',\n",
"    'Años_consumo_droga': 'Years_Drug_Use',\n",
"    'OtrosDx_Psiquiatrico_REDEF': 'Other_Psychiatric_DX',\n",
"    'Tx_previos_REDEF': 'Previous_Treatments',\n",
"    'Adherencia_tto_recalc': 'Treatment_Adherence',\n",
"    # --- Target and grouping columns ---\n",
"    'Situacion_tratamiento_REDEF': 'Treatment_Outcome',\n",
"    'Situacion_tratamiento': 'Situacion_tratamiento',\n",
"    'Pandemia_inicio_fin_tratamiento': 'Pandemia_inicio_fin_tratamiento'\n",
"}\n",
"\n",
"# Update lists of feature names (a name missing from name_mapping raises KeyError)\n",
"corr_cols, soc_vars_enc, ind_vars_enc = [\n",
"    [name_mapping[old_name] for old_name in names]\n",
"    for names in (corr_cols, soc_vars_enc, ind_vars_enc)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {},
"outputs": [],
"source": [
"# Export feature names (one .npy file per list)\n",
"names_to_export = {\n",
"    './output/feature_names.npy': corr_cols,\n",
"    './output/soc_vars_names.npy': soc_vars_enc,\n",
"    './output/ind_vars_names.npy': ind_vars_enc,\n",
"}\n",
"for out_path, feature_names in names_to_export.items():\n",
"    np.save(out_path, feature_names)"
]
},
{
"cell_type": "code",
"execution_count": 147,
"metadata": {},
"outputs": [],
"source": [
"# Rename the columns and order them as they are listed in name_mapping\n",
"final_cols = list(name_mapping.values())\n",
"bd = bd.rename(columns=name_mapping).loc[:, final_cols]"
]
},
{
"cell_type": "code",
"execution_count": 148,
"metadata": {},
"outputs": [],
"source": [
"# Update main dfs so they carry the renamed / encoded columns\n",
"period = bd['Pandemia_inicio_fin_tratamiento']\n",
"\n",
"# Pre-pandemic\n",
"conj_pre = bd[period == 'Inicio y fin prepandemia']\n",
"# Post-pandemic: merging last two classes to balance sets\n",
"conj_post = bd[period.isin(['Inicio prepandemia y fin en pandemia', 'inicio y fin en pandemia'])]\n",
"\n",
"# Pre-pandemic, split by treatment outcome (abandono / alta)\n",
"pre_outcome = conj_pre['Situacion_tratamiento']\n",
"pre_abandono = conj_pre[pre_outcome == 'Abandono']\n",
"pre_alta = conj_pre[pre_outcome == 'Alta terapéutica']\n",
"\n",
"# Post-pandemic, split by treatment outcome (abandono / alta)\n",
"post_outcome = conj_post['Situacion_tratamiento']\n",
"post_abandono = conj_post[post_outcome == 'Abandono']\n",
"post_alta = conj_post[post_outcome == 'Alta terapéutica']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Correlation Matrices"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {},
"outputs": [],
"source": [
"# Always treated as binary: the encoded target (it is not part of corr_cols) and Risk_Stigma\n",
"forced_binary = [name_mapping['Situacion_tratamiento_REDEF'], name_mapping['Risk_stigma_REDEF']]\n",
"\n",
"# Binary variables: exactly two distinct non-missing values.\n",
"# nunique() ignores NaN (unlike len(unique())), so a 0/1 column with missing entries is still\n",
"# recognised as binary instead of ending up in neither binary_vars nor cont_vars.\n",
"binary_vars = [\n",
"    col for col in corr_cols\n",
"    if col not in forced_binary and bd[col].nunique() == 2\n",
"] + forced_binary\n",
"\n",
"# Continuous variables, under their final names\n",
"cont_vars = [name_mapping[col] for col in ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']]"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {},
"outputs": [],
"source": [
"def get_corr_matrix(df, cols):\n",
"    \"\"\"\n",
"    Build the mixed-type correlation matrix of the columns `cols` of `df`.\n",
"\n",
"    Only the strict lower triangle is computed; the diagonal and the upper triangle stay at zero.\n",
"    The measure used for a pair depends on the notebook-level lists binary_vars and cont_vars:\n",
"      - both in binary_vars: tetrachoric (pypair binary_binary)\n",
"      - both in cont_vars: Spearman (pandas Series.corr)\n",
"      - any other pair: point-biserial (pypair binary_continuous)\n",
"\n",
"    Returns an (n, n) numpy array, with n = len(cols).\n",
"    \"\"\"\n",
"    names = list(cols)\n",
"    n = len(names)\n",
"    corr_matrix = np.zeros((n, n))\n",
"\n",
"    # Walk the lower triangle only: row index i, column index j < i\n",
"    for i in range(1, n):\n",
"        var_i = names[i]\n",
"        for j in range(i):\n",
"            var_j = names[j]\n",
"            if var_i in binary_vars and var_j in binary_vars:\n",
"                corr = binary_binary(df[var_i], df[var_j], measure='tetrachoric')\n",
"            elif var_i in cont_vars and var_j in cont_vars:\n",
"                # pandas is used because pypair's continuous_continuous was returning nan sometimes\n",
"                corr = df[var_i].corr(df[var_j], method='spearman')\n",
"            else:\n",
"                # The variable listed in binary_vars is passed first; if var_i is not, var_j takes that role\n",
"                bin_var, cont_var = (var_i, var_j) if var_i in binary_vars else (var_j, var_i)\n",
"                corr = binary_continuous(df[bin_var], df[cont_var], measure='point_biserial')\n",
"            corr_matrix[i, j] = corr\n",
"\n",
"    return corr_matrix"
]
},
{
"cell_type": "code",
"execution_count": 151,
"metadata": {},
"outputs": [],
"source": [
"def plot_heatmap(sit_tto: int, group: int) -> np.ndarray:\n",
"    \"\"\"\n",
"    Draw the correlation heatmap of one subset of patients on the current axes.\n",
"\n",
"    sit_tto: 1 (include it as another var), 2 (only abandono), 3 (only alta)\n",
"    group: 1 (all alcohol patients), 2 (pre), 3 (post)\n",
"\n",
"    Returns the matrix computed by get_corr_matrix (only its lower triangle is filled).\n",
"    Raises ValueError if sit_tto or group is not 1, 2 or 3.\n",
"    \"\"\"\n",
"\n",
"    # Title label and source df for each group\n",
"    group_options = {1: (\"ALL\", bd), 2: (\"PRE\", conj_pre), 3: (\"POST\", conj_post)}\n",
"    # Outcome to filter on (None = keep both outcomes) and title suffix for each sit_tto\n",
"    outcome_options = {1: (None, \"\"), 2: ('Abandono', \" - ABANDONO\"), 3: ('Alta terapéutica', \" - ALTA\")}\n",
"\n",
"    # Fail early with a clear message instead of reaching an unassigned variable below\n",
"    if sit_tto not in outcome_options:\n",
"        raise ValueError(f\"sit_tto must be 1, 2 or 3, got {sit_tto!r}\")\n",
"    if group not in group_options:\n",
"        raise ValueError(f\"group must be 1, 2 or 3, got {group!r}\")\n",
"\n",
"    group_label, group_df = group_options[group]\n",
"    outcome, title_suffix = outcome_options[sit_tto]\n",
"    plot_title = \"Correl Matrix - \" + group_label + title_suffix\n",
"\n",
"    # Define columns and select data based on sit_tto\n",
"    if outcome is None:\n",
"        # Include target as another variable\n",
"        cols = ['Treatment_Outcome'] + corr_cols\n",
"        bd_ca = group_df[cols]\n",
"    else:\n",
"        cols = corr_cols\n",
"        bd_ca = group_df[group_df['Situacion_tratamiento'] == outcome][cols]\n",
"\n",
"    corr_matrix = get_corr_matrix(bd_ca, cols)\n",
"\n",
"    # Create a mask for the upper triangle (diagonal included)\n",
"    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))\n",
"\n",
"    # Create heatmap correlation matrix\n",
"    dataplot = sns.heatmap(corr_matrix, mask=mask, xticklabels=cols, yticklabels=cols, cmap=\"coolwarm\", vmin=-1, vmax=1, annot=True, fmt=\".2f\", annot_kws={\"size\": 4})\n",
"\n",
"    # Group ind vs social vars by tick-label color, on both axes\n",
"    for axis in (dataplot.axes.xaxis, dataplot.axes.yaxis):\n",
"        for tick_label in axis.get_ticklabels():\n",
"            if tick_label.get_text() in ind_vars_enc:\n",
"                tick_label.set_color('green')\n",
"            elif tick_label.get_text() in soc_vars_enc:\n",
"                tick_label.set_color('purple')\n",
"\n",
"    # Add legend and place it in lower left\n",
"    plt.legend(handles=[\n",
"        plt.Line2D([0], [0], marker='o', color='w', label='Social Factors', markerfacecolor='purple', markersize=10),\n",
"        plt.Line2D([0], [0], marker='o', color='w', label='Individual Factors', markerfacecolor='green', markersize=10)\n",
"        ], bbox_to_anchor=(-0.1, -0.1), fontsize = 20)\n",
"\n",
"    plt.title(\"\\n\\n\" + plot_title, fontdict={'fontsize': 30, 'fontweight': 'bold'})\n",
"\n",
"    return corr_matrix"
]
},
{
"cell_type": "code",
"execution_count": 152,
"metadata": {},
"outputs": [],
"source": [
"# 3x3 grid: one row per 'Situacion_tratamiento' option, one column per group (ALL, PRE, POST)\n",
"fig, axs = plt.subplots(3, 3, figsize=(50, 50))\n",
"plt.subplots_adjust(hspace=0.75, wspace=2)\n",
"corr_mats = [] # List of tuples (m1, m2) to store the 3 pairs of matrices to compare (pre vs post)\n",
"\n",
"for sit_tto in range(1,4):\n",
"    matrices = {}\n",
"    for group in range(1,4):\n",
"        # Select the subplot by its 1-based position; plot_heatmap draws on the current axes\n",
"        plt.subplot(3, 3, 3*(sit_tto-1) + group)\n",
"        matrices[group] = plot_heatmap(sit_tto, group)\n",
"\n",
"    # Keep only the PRE (group 2) and POST (group 3) matrices for comparison\n",
"    corr_mats.append((matrices[2], matrices[3]))\n",
"\n",
"plt.tight_layout()\n",
"\n",
"plt.savefig('./output/plots/correlations/heatmaps_one_hot.svg', dpi=550, bbox_inches='tight')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment