Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Sign in
Toggle navigation
V
Variable derivation
Project overview
Project overview
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Uncover
Variable derivation
Commits
b5ea0ecf
Commit
b5ea0ecf
authored
Nov 15, 2021
by
Alberto Blázquez Herranz
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Added categorical verification + dependency between scripts.
parent
cd5bcc30
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
42 additions
and
24 deletions
+42
-24
__pycache__/numeric_converter.cpython-37.pyc
__pycache__/numeric_converter.cpython-37.pyc
+0
-0
derived_variables_generator.py
derived_variables_generator.py
+11
-3
numeric_converter.py
numeric_converter.py
+31
-21
No files found.
__pycache__/numeric_converter.cpython-37.pyc
0 → 100644
View file @
b5ea0ecf
File added
derived_variables_generator.py
View file @
b5ea0ecf
...
...
@@ -8,7 +8,9 @@ Created on Wed Nov 10 12:41:12 2021
import
datetime
import
pandas
as
pd
import
sys
import
math
import
numeric_converter
import
zipfile
import
csv
datafile_path
=
sys
.
argv
[
1
]
csv_separator
=
","
...
...
@@ -19,6 +21,9 @@ if len(sys.argv) == 3:
datafile
=
pd
.
read_csv
(
datafile_path
,
csv_separator
)
datafile
=
numeric_converter
.
numeric_conversion
(
datafile
)
age_ranges
=
list
(
range
(
30
,
100
,
10
))
...
...
@@ -83,5 +88,8 @@ if "DATAD" in datafile.columns:
new_datafile_path
=
datafile_path
.
replace
(
".csv"
,
"_derived.csv"
)
datafile
.
to_csv
(
new_datafile_path
,
index
=
False
)
\ No newline at end of file
new_datafile_path
=
datafile_path
.
replace
(
".csv"
,
"_numeric_derived"
)
datafile
.
to_csv
(
new_datafile_path
+
".csv"
,
index
=
False
,
quoting
=
csv
.
QUOTE_NONNUMERIC
)
with
zipfile
.
ZipFile
(
new_datafile_path
+
".zip"
,
'w'
)
as
myzip
:
myzip
.
write
(
new_datafile_path
+
".csv"
)
numeric_converter.py
View file @
b5ea0ecf
...
...
@@ -7,13 +7,13 @@ Created on Wed Nov 10 11:52:54 2021
import
pandas
as
pd
import
sys
'''
datafile_path = sys.argv[1]
csv_separator = ","
if len(sys.argv) == 3:
csv_separator = sys.argv[2]
'''
categorical_variables
=
[
"DMRGENDR"
,
"DMRBORN"
,
"DMRRETH1"
,
"DMROCCU"
,
"DMRHREDU"
,
"DSXOS"
,
"DSXHO"
,
"DSXIC"
,
"TRXAV"
,
"TRXRIB"
,
"TRXLR"
,
"TRXRM"
,
"TRXIA"
,
"TRXIB"
,
"TRXCH"
,
"TRXAB"
,
"TRXCS"
,
"TRXHEP"
,
"TRXAF"
,
"TRXCP"
,
"TRXOT"
,
"TRXECM"
,
"TRXIV"
,
"TRXNIV"
,
"TRXNO"
,
"TRXOX"
,
"TRXRR"
,
"TRXTR"
,
"TRXVA"
,
"TRXPE"
,
"TRXPV"
,
"TRXIT"
,
"TRXNMB"
,
"TRXAC"
,
"TRXINA"
,
"TRXIS"
,
"TRXIM"
,
"TRXVC"
,
"TRXVD"
,
"TRXZN"
,
"CSXCOT"
,
"CSXCTR"
,
"SMXASAH"
,
"SMXFEA"
,
"SMXCOA"
,
"SMXSTA"
,
"SMXSBA"
,
"SMXRNA"
,
"SMXMYA"
,
"SMXARA"
,
"SMXCPA"
,
"SMXAPA"
,
"SMXINA"
,
"SMXNAA"
,
"SMXDIA"
,
"SMXFAA"
,
"SMXHEA"
,
"SMXCNA"
,
"SMXACA"
,
"SMXSLA"
,
"SMXTLA"
,
"SMXSYA"
,
"SMXWHA"
,
"SMXLYA"
,
"SMXANA"
,
"SMXIWA"
,
"SMXSRA"
,
"SMXBLA"
,
"CMXPRG"
,
"CMXCVD"
,
"CMXCMP"
,
"CMXHT"
,
"CMXDI"
,
"CMXCKD"
,
"CMXCLD"
,
"CMXCPD"
,
"CMXASM"
,
"CMXCND"
,
"CMXRHE"
,
"CMXCCI"
,
"CMXCBD"
,
"CMXDE"
,
"CMXPU"
,
"CMXST"
,
"CMXLY"
,
"CMXAP"
,
"RFXSM"
,
"RFXFSM"
,
"RFXOB"
,
"RFXTB"
,
"RFXIMD"
,
"RFXHIV"
,
"RFXAIDS"
,
"RFXUI"
,
"RFXHC"
,
"RFXONC"
,
"RFXMN"
,
"HMRACI"
,
"HMRARB"
,
"HMRAHO"
,
"HMRNS"
,
"HMROS"
,
"HMRCS"
,
"HMRIS"
,
"HMRAV"
,
"HMRAB"
,
"HMRCOV"
,
"IMDXCT"
,
"IMDXCTCR"
,
"IMDXCTTE"
,
"IMDXCTAB"
,
"IMDXXR"
,
"IMDXPN"
,
...
...
@@ -21,27 +21,37 @@ categorical_variables = ["DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU
numeric_variables
=
[
"DMRAGEYR"
,
"DMXHT"
,
"DMXWT"
,
"DMXBMI"
,
"DATLGT"
,
"DATLGTI"
,
"DATSSDHn"
,
"CSXBTPA"
,
"CSXBTPHn"
,
"CSXOSTA"
,
"CSXOSTHn"
,
"CSXCHRA"
,
"CSXCHRHn"
,
"CSXRRA"
,
"CSXRRHn"
,
"CSXRRI"
,
"CSXSYA"
,
"CSXSYHn"
,
"CSXDIA"
,
"CSXDIHn"
,
"SMTFE"
,
"SMTCO"
,
"SMTST"
,
"SMTSB"
,
"SMXSEA"
,
"DATIMD"
,
"IMDXCTLD"
,
"IMDXEQ"
,
"DATLBDHn"
,
"LBXHGBA"
,
"LBXHGBHn"
,
"LBXESRA"
,
"LBXESRHn"
,
"LBXWBCSIA"
,
"LBXWBCSIHn"
,
"LBXLYMNOA"
,
"LBXLYMNOHn"
,
"LBXNENOA"
,
"LBXNENOHn"
,
"LBXHCTA"
,
"LBXHCTHn"
,
"LBXPLTSIA"
,
"LBXPLTSIHn"
,
"LBXGHA"
,
"LBXGHHn"
,
"LBXAPTTA"
,
"LBXAPTTHn"
,
"LBXAPTRA"
,
"LBXAPTRHn"
,
"LBXPTA"
,
"LBXPRHn"
,
"LBXINRA"
,
"LBXINRHn"
,
"LBXSATSIA"
,
"LBXSATSIHn"
,
"LBXSTBA"
,
"LBXSTBHn"
,
"LBXSCBA"
,
"LBXSCBHn"
,
"LBXSUBA"
,
"LBXSUBHn"
,
"LBXSASSIA"
,
"LBXSASSIHn"
,
"LBXSGLA"
,
"LBXSGLHn"
,
"LBXSBUA"
,
"LBXSBUHn"
,
"LBXSBLA"
,
"LBXSBLHn"
,
"LBXSCRA"
,
"LBXSCRHn"
,
"LBXSNASIA"
,
"LBXSNASIHn"
,
"LBXSCLSIA"
,
"LBXSCLSIHn"
,
"LBXSKSIA"
,
"LBXSKSIHn"
,
"LBXSPCA"
,
"LBXSPCHn"
,
"LBXCRPA"
,
"LBXCRPHn"
,
"LBXSLDSIA"
,
"LBXSLDSIHn"
,
"LBXCTRA"
,
"LBXCTRHn"
,
"LBXCDDA"
,
"LBXCDDHn"
,
"LBXFERSIA"
,
"LBXFERSIHn"
,
"LBXIL6A"
,
"LBXIL6Hn"
,
"LBDFBSIA"
,
"LBDFBSIHn"
,
"LBDSALSIA"
,
"LBDSALSIHn"
,
"LBXSAPSIA"
,
"LBXSAPSIHn"
,
"LBXSGTSIA"
,
"LBXSGTSIHn"
,
"LBXCFDA"
,
"LBXCFDHn"
,
"LBXFIOA"
,
"LBXFIOHn"
,
"LBXPOA"
,
"LBXPOHn"
,
"LBXPCOA"
,
"LBXPCOHn"
,
"LBXSC3SIA"
,
"LBXSC3SIHn"
,
"LBXPHA"
,
"LBXPHHn"
,
"LBXBEH"
,
"LBXBEHn"
,
"LBXA4A"
,
"LBXA4Hn"
,
"LBXTCA"
,
"LBXTCHn"
,
"LBXTRA"
,
"LBXTRHn"
,
"LBXSCKA"
,
"LBXSCKHn"
,
"LBXPSCKA"
,
"LBXPSCKHn"
]
datafile
=
pd
.
read_csv
(
datafile_path
,
csv_separator
)
convert_col
=
[
x
for
x
in
datafile
.
columns
if
x
in
categorical_variables
]
for
col
in
convert_col
:
def
numeric_conversion
(
datafile
):
unique_values
=
datafile
[
col
]
.
unique
(
)
#datafile = pd.read_csv(datafile_path, csv_separator
)
numeric_column_dict
=
{
str
(
x
):
i
for
i
,
x
in
enumerate
(
unique_values
)}
numeric_column
=
[
numeric_column_dict
[
str
(
x
)]
for
x
in
datafile
[
col
]
]
convert_col
=
[
x
for
x
in
datafile
.
columns
if
x
in
categorical_variables
]
datafile
[
col
+
"_numeric"
]
=
numeric_column
verify_num_col
=
[
x
for
x
in
datafile
.
columns
if
x
in
numeric_variables
]
for
col
in
verify_num_col
:
for
col
in
convert_col
:
unique_values
=
datafile
[
col
]
.
unique
()
numeric_column_dict
=
{
str
(
x
):
i
for
i
,
x
in
enumerate
(
unique_values
)}
numeric_column
=
[
numeric_column_dict
[
str
(
x
)]
for
x
in
datafile
[
col
]]
datafile
[
col
+
"_numeric"
]
=
numeric_column
datafile
[
col
]
=
[
float
(
x
)
for
x
in
datafile
[
col
]]
new_datafile_path
=
datafile_path
.
replace
(
".csv"
,
"_numeric.csv"
)
datafile
.
to_csv
(
new_datafile_path
,
index
=
False
)
\ No newline at end of file
verify_num_col
=
[
x
for
x
in
datafile
.
columns
if
x
in
numeric_variables
]
for
col
in
verify_num_col
:
datafile
[
col
]
=
[
float
(
x
)
for
x
in
datafile
[
col
]]
verify_cat_col
=
[
x
for
x
in
datafile
.
columns
if
x
in
categorical_variables
]
for
col
in
verify_cat_col
:
datafile
[
col
]
=
[
str
(
x
)
for
x
in
datafile
[
col
]]
#new_datafile_path = datafile_path.replace(".csv", "_numeric.csv")
#datafile.to_csv(new_datafile_path, index = False, quoting=csv.QUOTE_NONNUMERIC)
return
(
datafile
)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment