From 7a2bcb50c86d5492e282b4762d94349986fdd9ef Mon Sep 17 00:00:00 2001 From: joaquintb Date: Thu, 27 Jun 2024 13:01:04 +0200 Subject: [PATCH] Features renamed --- EDA/EDA.ipynb | 98 +++++++++++++++++++++++----- EDA/output/feature_names.npy | Bin 0 -> 3716 bytes EDA/output/ind_vars_names.npy | Bin 3572 -> 1784 bytes EDA/output/soc_vars_names.npy | Bin 3572 -> 1784 bytes gen_train_data/gen_train_data.ipynb | 18 ++++- 5 files changed, 97 insertions(+), 19 deletions(-) create mode 100644 EDA/output/feature_names.npy diff --git a/EDA/EDA.ipynb b/EDA/EDA.ipynb index c65d87c..bcc6f79 100644 --- a/EDA/EDA.ipynb +++ b/EDA/EDA.ipynb @@ -552,7 +552,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -629,7 +629,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -643,7 +643,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -651,16 +651,6 @@ "no_redef_cols = ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# res_vars = ['Tiempo_tx', 'Readmisiones_estudios', 'Periodos_COVID', 'Pandemia_inicio_fin_tratamiento', \n", - "# 'Nreadmision', 'Readmisiones_PRECOVID', 'Readmisiones_COVID']" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -670,7 +660,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -705,7 +695,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -743,15 +733,89 @@ "corr_cols = soc_vars_enc + ind_vars_enc" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###### Excluding unknown columns and renaming" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop unknown columns\n", + "corr_cols = [corr_col for corr_col in corr_cols if corr_col not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]\n", + "soc_vars_enc = [corr_col for corr_col in soc_vars_enc if corr_col not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]\n", + "ind_vars_enc = [corr_col for corr_col in soc_vars_enc if ind_vars_enc not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Export column names for future programs\n", + "name_mapping = {\n", + " 'Ed_Not Complete primary school': 'Ed_Not_Complete_Primary',\n", + " 'Ed_Primary education': 'Ed_Primary',\n", + " 'Ed_Secondary Education': 'Ed_Secondary',\n", + " 'Ed_Secondary more technical education': 'Ed_Secondary_Technical',\n", + " 'Ed_Tertiary': 'Ed_Tertiary',\n", + " 'Social_protection_REDEF': 'Social_Protection',\n", + " 'JobIn_Non-stable': 'JobIn_Unstable',\n", + " 'JobIn_Stable': 'JobIn_Stable',\n", + " 'JobIn_Unemployed': 'JobIn_Unemployed',\n", + " 'Hous_Institutional': 'Hous_Institutional',\n", + " 'Hous_Stable': 'Hous_Stable',\n", + " 'Hous_Unstable': 'Hous_Unstable',\n", + " 'Alterations_early_childhood_develop_REDEF': 'Early_Alterations',\n", + " 'SocInc_Live with families or friends': 'SocInc_Family_Friends',\n", + " 'SocInc_live alone': 'SocInc_Alone',\n", + " 'SocInc_live in institutions': 'SocInc_Instit',\n", + " 'Risk_stigma_REDEF': 'Risk_Stigma',\n", + " 'Structural_conflic': 'Structural_Conflict',\n", + " 'Age': 'Age',\n", + " 'Sex_REDEF': 'Sex',\n", + " 'NumHijos': 'Num_Children',\n", + " 'Smoking_REDEF': 'Smoking',\n", + " 'Biological_vulnerability_REDEF': 'Bio_Vulner',\n", + " 'Opiaceos_DxCIE_REDEF': 'Opiods_DXCIE',\n", + " 'Cannabis_DXCIE_REDEF': 'Cannabis_DXCIE',\n", + " 'BZD_DxCIE_REDEF': 'BZD_DXCIE',\n", + " 'Cocaina_DxCIE_REDEF': 'Cocaine_DXCIE',\n", + " 'Alucinogenos_DXCIE_REDEF': 'Hallucin_DXCIE',\n", + " 'Tabaco_DXCIE_REDEF': 'Tobacco_DXCIE',\n", + " 'Frec30_1 día/semana': 'Freq_1dpw',\n", + " 'Frec30_2-3 días\\u200e/semana': 'Freq_2-3dpw',\n", + " 'Frec30_4-6 días/semana': 'Freq_4-6dpw',\n", + " 'Frec30_Menos de 1 día\\u200e/semana': 'Freq_l1dpw',\n", + " 'Frec30_No consumio': 'Freq_None',\n", + " 'Frec30_Todos los días': 'Freq_Everyday',\n", + " 'Años_consumo_droga': 'Years_Drug_Use',\n", + " 'OtrosDx_Psiquiatrico_REDEF': 'Other_Psychiatric_DX',\n", + " 'Tx_previos_REDEF': 'Previous_Treatments',\n", + " 'Adherencia_tto_recalc': 'Treatment_Adherence'\n", + "}\n", + "\n", + "# Update lists of feature names\n", + "corr_cols = [name_mapping[corr_col] for corr_col in corr_cols]\n", + "soc_vars_enc = [name_mapping[col] for col in soc_vars_enc]\n", + "ind_vars_enc = [name_mapping[col] for col in ind_vars_enc]" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "# Export feature names\n", + "np.save('./output/feature_names.npy', corr_cols)\n", "np.save('./output/soc_vars_names.npy', soc_vars_enc)\n", - "np.save('./output/ind_vars_names.npy', soc_vars_enc)" + "np.save('./output/ind_vars_names.npy', ind_vars_enc)" ] }, { diff --git a/EDA/output/feature_names.npy b/EDA/output/feature_names.npy new file mode 100644 index 0000000000000000000000000000000000000000..dde21314476ea31ab662cc606627d0c1fcb59a9f GIT binary patch literal 3716 zcmcJSTW`}q6olPZens94Qg~>Ckf?a+RTPO((F9RmqR_Y(QJP|x0;>8e`iDE8#X0Q4~#tkrYJ?B0pPc3*Z@5q1Uu$(<`=;TA@lkIX^FufgSRcH@yq)O@7y23FT2b@K zDC1^5O*x&kjN7{LA*^rKNbT~T8d??~^pWv@1lErq_Qrm&53GSPi(ZFyq0e(s#JvhX zNK(XYKdigSKEh3XyN0O&yu_C+fj%DKEj7=$MNi0I7OWq?sc-vC8~%L(KK2sN!`yuH z9Yt>64`Hrl+}2KR=E6}YmT`L>Xt5VOtHu`%WWCJ)Dc}Y9%yp)=?`1V>g?aFf`o(K_ zX?y?hnVdU<{W(>e_vrE*HRl3P(N@>XIg*kNy5ap-^1aRLz9zGNJoUP{?_G1;8F=G5 zQ?qiQ8(w^5FX5!#@TqNVivT{j(fw6m9qf~98r@CevnKKPf%)+u6UehGd=sdT17T07 z6U^gWqY=>N_k~<-bKuE$0gbEBDwG;Kc{V<28D{2F@2;ULV}t`SjTn`ixzSD)Z2_ PCE$6uK69g;_F_B&hMMCZ literal 0 HcmV?d00001 diff --git a/EDA/output/ind_vars_names.npy b/EDA/output/ind_vars_names.npy index 42fd230fd886993d69ea0d48313202779250def3..11b839a2e8c6272897971b2e6574032b4a05d76f 100644 GIT binary patch delta 266 zcmew&{eyRcIFpg_L`hX9LyL(H5sdK@XDYMCGcYg&Ogtz(S%pz!vJJDzWC3QL$rVg0 zK(-Ey4OSE~c`K9hD-@KXBLp z*)dSQ%VYx%k;x0#J%Dr!nEiso186SGu= z#5q7*3B({rx=l9aR-VkkBg5+mmB|Ovd6OAAjbZwM_DD=F0s3PNj|gM%?o+=vfUw3KQWu1amA6(5z<2O|Co|FGwS92iPct(eve zhndXW%suxt$!D{3+PYXuUXu58kd6C!x)svx&gOdB2X4Y@nGC=K`?d+E$=LXvd2zKp za{W_|IX(r!`FCE4De-l@$!`$QpEVaT$MfP0Y&(Z>47I!vi8*$}0RL}`kyyP7 z?iKvM7Kp_>h5W*WoRAyh1}Adwo@GDIrDM!GKB3zd@OQ5m4>bOj9P1!#T?$8nZ8nQ_ zp4SBK5V+w3x{j@;wM!L8;=3qLlTKH0bZt_5@f}Xgxu035KIb7-9J%|*GyOx~Foz$! zoacG<&|zU4PShOr@49pi(o5tWoymT2k7=tmDdYH`Y7xiFIJy=c$3^sVoRl(-aV@$I z7kQWOh%h}sozbg&2Z`Hc-MLoLVIMu-qsOUXf2XNk*6F9AK%cqS&|{r5UWyT2_W52^ zeG8~PbgaIx>44Uf8IR*rCnTM#_Qf7b?M*&S6CBdLEYZE0YOGq|tGacqD-@KXBLp z*)dSQ%VYx%k;x0#J%Dr!nEiso186SGu= z#5q7*3B({rx=l9aR-VkkBg5+mmB|Ovd6OAAjbZwM_DD=F0s3PNj|gM%?o+=vfUw3KQWu1amA6(5z<2O|Co|FGwS92iPct(eve zhndXW%suxt$!D{3+PYXuUXu58kd6C!x)svx&gOdB2X4Y@nGC=K`?d+E$=LXvd2zKp za{W_|IX(r!`FCE4De-l@$!`$QpEVaT$MfP0Y&(Z>47I!vi8*$}0RL}`kyyP7 z?iKvM7Kp_>h5W*WoRAyh1}Adwo@GDIrDM!GKB3zd@OQ5m4>bOj9P1!#T?$8nZ8nQ_ zp4SBK5V+w3x{j@;wM!L8;=3qLlTKH0bZt_5@f}Xgxu035KIb7-9J%|*GyOx~Foz$! zoacG<&|zU4PShOr@49pi(o5tWoymT2k7=tmDdYH`Y7xiFIJy=c$3^sVoRl(-aV@$I z7kQWOh%h}sozbg&2Z`Hc-MLoLVIMu-qsOUXf2XNk*6F9AK%cqS&|{r5UWyT2_W52^ zeG8~PbgaIx>44Uf8IR*rCnTM#_Qf7b?M*&S6CBdLEYZE0YOGq|tGacq