diff --git a/EDA/EDA.ipynb b/EDA/EDA.ipynb index c65d87c3cff70b74de5da705f861e82ea909aeee..bcc6f79f5f7d61584456866ce74564eb6e0b7ab8 100644 --- a/EDA/EDA.ipynb +++ b/EDA/EDA.ipynb @@ -552,7 +552,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -629,7 +629,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -643,7 +643,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -651,16 +651,6 @@ "no_redef_cols = ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# res_vars = ['Tiempo_tx', 'Readmisiones_estudios', 'Periodos_COVID', 'Pandemia_inicio_fin_tratamiento', \n", - "# 'Nreadmision', 'Readmisiones_PRECOVID', 'Readmisiones_COVID']" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -670,7 +660,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -705,7 +695,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -743,15 +733,89 @@ "corr_cols = soc_vars_enc + ind_vars_enc" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###### Excluding unknown columns and renaming" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop the 'unknown' columns from each of the three feature lists (each list is filtered against itself)\n", + "corr_cols = [corr_col for corr_col in corr_cols if corr_col not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]\n", + "soc_vars_enc = [corr_col for corr_col in soc_vars_enc if corr_col not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]\n", + 
"ind_vars_enc = [corr_col for corr_col in ind_vars_enc if corr_col not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Export column names for future programs\n", + "name_mapping = {\n", + " 'Ed_Not Complete primary school': 'Ed_Not_Complete_Primary',\n", + " 'Ed_Primary education': 'Ed_Primary',\n", + " 'Ed_Secondary Education': 'Ed_Secondary',\n", + " 'Ed_Secondary more technical education': 'Ed_Secondary_Technical',\n", + " 'Ed_Tertiary': 'Ed_Tertiary',\n", + " 'Social_protection_REDEF': 'Social_Protection',\n", + " 'JobIn_Non-stable': 'JobIn_Unstable',\n", + " 'JobIn_Stable': 'JobIn_Stable',\n", + " 'JobIn_Unemployed': 'JobIn_Unemployed',\n", + " 'Hous_Institutional': 'Hous_Institutional',\n", + " 'Hous_Stable': 'Hous_Stable',\n", + " 'Hous_Unstable': 'Hous_Unstable',\n", + " 'Alterations_early_childhood_develop_REDEF': 'Early_Alterations',\n", + " 'SocInc_Live with families or friends': 'SocInc_Family_Friends',\n", + " 'SocInc_live alone': 'SocInc_Alone',\n", + " 'SocInc_live in institutions': 'SocInc_Instit',\n", + " 'Risk_stigma_REDEF': 'Risk_Stigma',\n", + " 'Structural_conflic': 'Structural_Conflict',\n", + " 'Age': 'Age',\n", + " 'Sex_REDEF': 'Sex',\n", + " 'NumHijos': 'Num_Children',\n", + " 'Smoking_REDEF': 'Smoking',\n", + " 'Biological_vulnerability_REDEF': 'Bio_Vulner',\n", + " 'Opiaceos_DxCIE_REDEF': 'Opiods_DXCIE',\n", + " 'Cannabis_DXCIE_REDEF': 'Cannabis_DXCIE',\n", + " 'BZD_DxCIE_REDEF': 'BZD_DXCIE',\n", + " 'Cocaina_DxCIE_REDEF': 'Cocaine_DXCIE',\n", + " 'Alucinogenos_DXCIE_REDEF': 'Hallucin_DXCIE',\n", + " 'Tabaco_DXCIE_REDEF': 'Tobacco_DXCIE',\n", + " 'Frec30_1 día/semana': 'Freq_1dpw',\n", + " 'Frec30_2-3 días\u200e/semana': 'Freq_2-3dpw',\n", + " 'Frec30_4-6 días/semana': 'Freq_4-6dpw',\n", + " 'Frec30_Menos de 1 día\u200e/semana': 'Freq_l1dpw',\n", + " 'Frec30_No consumio': 'Freq_None',\n", 
+ " 'Frec30_Todos los días': 'Freq_Everyday',\n", + " 'Años_consumo_droga': 'Years_Drug_Use',\n", + " 'OtrosDx_Psiquiatrico_REDEF': 'Other_Psychiatric_DX',\n", + " 'Tx_previos_REDEF': 'Previous_Treatments',\n", + " 'Adherencia_tto_recalc': 'Treatment_Adherence'\n", + "}\n", + "\n", + "# Update lists of feature names\n", + "corr_cols = [name_mapping[corr_col] for corr_col in corr_cols]\n", + "soc_vars_enc = [name_mapping[col] for col in soc_vars_enc]\n", + "ind_vars_enc = [name_mapping[col] for col in ind_vars_enc]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Export feature names\n", + "np.save('./output/feature_names.npy', corr_cols)\n", "np.save('./output/soc_vars_names.npy', soc_vars_enc)\n", - "np.save('./output/ind_vars_names.npy', soc_vars_enc)" + "np.save('./output/ind_vars_names.npy', ind_vars_enc)" ] }, { diff --git a/EDA/output/feature_names.npy b/EDA/output/feature_names.npy new file mode 100644 index 0000000000000000000000000000000000000000..dde21314476ea31ab662cc606627d0c1fcb59a9f Binary files /dev/null and b/EDA/output/feature_names.npy differ diff --git a/EDA/output/ind_vars_names.npy b/EDA/output/ind_vars_names.npy index 42fd230fd886993d69ea0d48313202779250def3..11b839a2e8c6272897971b2e6574032b4a05d76f 100644 Binary files a/EDA/output/ind_vars_names.npy and b/EDA/output/ind_vars_names.npy differ diff --git a/EDA/output/soc_vars_names.npy b/EDA/output/soc_vars_names.npy index 42fd230fd886993d69ea0d48313202779250def3..11b839a2e8c6272897971b2e6574032b4a05d76f 100644 Binary files a/EDA/output/soc_vars_names.npy and b/EDA/output/soc_vars_names.npy differ diff --git a/gen_train_data/gen_train_data.ipynb b/gen_train_data/gen_train_data.ipynb index afeceb9c7b04b1aa8ec20ca0442a4fdc756a7dba..2d6f59fb321720158f7e9676aab80e5235d87fe6 100644 --- a/gen_train_data/gen_train_data.ipynb +++ b/gen_train_data/gen_train_data.ipynb @@ -53,7 +53,21 @@ "(10677, 39)\n", "(22861,)\n", "(10677,)\n", - 
"39\n" + "['Ed_Not Complete primary school' 'Ed_Primary education'\n", + " 'Ed_Secondary Education' 'Ed_Secondary more technical education'\n", + " 'Ed_Tertiary' 'Social_protection_REDEF' 'JobIn_Non-stable' 'JobIn_Stable'\n", + " 'JobIn_Unemployed' 'Hous_Institutional' 'Hous_Stable' 'Hous_Unstable'\n", + " 'Alterations_early_childhood_develop_REDEF'\n", + " 'SocInc_Live with families or friends' 'SocInc_live alone'\n", + " 'SocInc_live in institutions' 'Risk_stigma_REDEF' 'Structural_conflic'\n", + " 'Age' 'Sex_REDEF' 'NumHijos' 'Smoking_REDEF'\n", + " 'Biological_vulnerability_REDEF' 'Opiaceos_DxCIE_REDEF'\n", + " 'Cannabis_DXCIE_REDEF' 'BZD_DxCIE_REDEF' 'Cocaina_DxCIE_REDEF'\n", + " 'Alucinogenos_DXCIE_REDEF' 'Tabaco_DXCIE_REDEF' 'Frec30_1 día/semana'\n", + " 'Frec30_2-3 días\\u200e/semana' 'Frec30_4-6 días/semana'\n", + " 'Frec30_Menos de 1 día\\u200e/semana' 'Frec30_No consumio'\n", + " 'Frec30_Todos los días' 'Años_consumo_droga' 'OtrosDx_Psiquiatrico_REDEF'\n", + " 'Tx_previos_REDEF' 'Adherencia_tto_recalc']\n" ] } ], @@ -67,7 +81,7 @@ "print(X_post.shape)\n", "print(y_pre.shape)\n", "print(y_post.shape)\n", - "print(len(feat))" + "print(feat)" ] }, {