derived_variables_generator.py 2.45 KB
Newer Older
Alberto Blázquez Herranz's avatar
Alberto Blázquez Herranz committed
1 2 3 4 5 6 7 8 9 10
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 10 12:41:12 2021

@author: ctb
"""

import datetime
import pandas as pd
import sys
11 12 13
import numeric_converter
import zipfile
import csv
Alberto Blázquez Herranz's avatar
Alberto Blázquez Herranz committed
14 15 16 17 18 19 20 21 22 23

# Command line: <script> DATAFILE [SEPARATOR]
# Fail with a readable usage message instead of a bare IndexError when the
# data file argument is missing.
if len(sys.argv) < 2:
    sys.exit("usage: " + sys.argv[0] + " DATAFILE [SEPARATOR]")

datafile_path = sys.argv[1]

# The optional second argument overrides the default comma separator.
# Any further arguments are ignored instead of silently disabling the override.
csv_separator = sys.argv[2] if len(sys.argv) >= 3 else ","

# `sep` has to be passed by keyword: recent pandas releases accept only the
# file path positionally and raise TypeError for a positional separator.
datafile = pd.read_csv(datafile_path, sep=csv_separator)

# Hand the frame to the project's numeric_converter module and keep whatever
# it returns as the working data set for the derivations below.
datafile = numeric_converter.numeric_conversion(datafile)


Alberto Blázquez Herranz's avatar
Alberto Blázquez Herranz committed
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
# Width, in years, of every age bucket from 30 upwards.
age_bucket_width = 10

# Lower bounds of the age buckets: 30, 40, ..., 90.
age_ranges = list(range(30, 100, age_bucket_width))


if "DMRAGEYR" in datafile.columns:

    # Derive AGE_RANGE: a label such as "40-49" for each value in DMRAGEYR.
    #  - values below 30 are labelled "18-29";
    #  - values of 90 and above all fall in the top bucket ("90-99");
    #  - missing values stay missing instead of being assigned a bucket.
    derived_age_range = []
    for x in datafile["DMRAGEYR"]:
        if pd.isna(x):
            derived_age_range.append(None)
            continue

        age_range = "18-29"
        # Scan from the highest lower bound downwards so the first match is
        # the bucket the age belongs to. Scanning upwards would match the
        # 30 bucket for every age over 30. `>=` makes each lower bound
        # inclusive, so an age of exactly 30 is labelled "30-39".
        for mark in reversed(age_ranges):
            if x >= mark:
                age_range = str(mark) + "-" + str(mark + age_bucket_width - 1)
                break

        derived_age_range.append(age_range)

    datafile["AGE_RANGE"] = derived_age_range
        
    

49
def _month_name_from_date(value):
    """Return the full month name encoded in a slash-separated date string.

    The value is split on "/" and must yield exactly three fields; the middle
    field is read as a month number (1-12) and converted to its full name via
    ``strftime("%B")`` (for example "March"; the spelling follows the active
    locale).
    NOTE(review): this presumably targets day/month/year dates - confirm
    against the data dictionary.

    Returns None when the value is missing or not a string, does not have
    three fields, or its middle field is not a valid month number.
    """
    # Missing cells arrive as float NaN (or None); anything that is not text
    # cannot be split, so treat it as "no date".
    if not isinstance(value, str):
        return None

    fields = value.split("/")
    if len(fields) != 3:
        return None

    try:
        parsed = datetime.datetime.strptime(fields[1], "%m")
    except ValueError:
        # A malformed month field is handled like any other malformed date
        # rather than aborting the whole run.
        return None

    return parsed.strftime("%B")


# Source date column -> derived column. Despite the *_DATE names, each derived
# column holds only the month name extracted from the source column.
for source_column, derived_column in (
    ("DATDS", "DISCHARGE_DATE"),
    ("DATAD", "ADMISSION_DATE"),
):
    if source_column in datafile.columns:
        datafile[derived_column] = [
            _month_name_from_date(x) for x in datafile[source_column]
        ]

    
Alberto Blázquez Herranz's avatar
Alberto Blázquez Herranz committed
90
    
91 92 93 94 95
# Build the output base name by swapping a trailing ".csv" for
# "_numeric_derived". Only the extension at the very end of the path is
# touched; a blanket str.replace would also rewrite ".csv" inside directory
# names. Paths that do not end in ".csv" are used unchanged as the base name.
csv_extension = ".csv"
if datafile_path.endswith(csv_extension):
    new_datafile_path = datafile_path[:-len(csv_extension)] + "_numeric_derived"
else:
    new_datafile_path = datafile_path

# Write the derived data set next to the input, quoting every non-numeric
# field and omitting the DataFrame index.
datafile.to_csv(new_datafile_path + ".csv", index=False, quoting=csv.QUOTE_NONNUMERIC)

# Bundle the freshly written CSV into a zip archive with the same base name.
with zipfile.ZipFile(new_datafile_path + ".zip", 'w') as myzip:
    myzip.write(new_datafile_path + ".csv")