# derived_variables_generator.py (committed by Alberto Blázquez Herranz)
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 10 12:41:12 2021

@author: ctb
"""

import datetime
import pandas as pd
import sys

# Command line: <script> <datafile.csv> [separator]
# The first argument is the CSV file to read; the optional second argument
# overrides the default "," field separator.
if len(sys.argv) < 2:
    # Fail with a readable message instead of a bare IndexError.
    sys.exit("Usage: python derived_variables_generator.py <datafile.csv> [separator]")

datafile_path = sys.argv[1]
csv_separator = ","

# ">= 3" (rather than "== 3") so a supplied separator is still honoured when
# extra trailing arguments are present.
if len(sys.argv) >= 3:
    csv_separator = sys.argv[2]

# The separator must be passed by keyword: pandas deprecated positional
# arguments after the path in 1.3 and rejects them from 2.0 onwards.
datafile = pd.read_csv(datafile_path, sep=csv_separator)


# Lower bounds of the ten-year age bands: 30, 40, ..., 90.
age_ranges = list(range(30, 100, 10))


if "DMRAGEYR" in datafile.columns:
    # Derive the AGE_RANGE column: a text label for the age band that each
    # DMRAGEYR value falls into.

    derived_age_range = []
    for x in datafile["DMRAGEYR"]:
        # A missing age stays missing; it must not be reported as a real band.
        if pd.isna(x):
            derived_age_range.append(None)
            continue

        # Anything below the first mark falls into the lowest band.
        age_range = "18-29"

        # Scan from the highest mark downwards so the first match is the
        # band the age actually belongs to.  (Scanning upwards and stopping
        # at the first mark exceeded would label every age above 30 as
        # "30-39".)  ">=" keeps the lower bound inclusive: 30 -> "30-39".
        for mark in reversed(age_ranges):
            if x >= mark:
                # Bands are ten years wide; ages beyond the last mark's band
                # are grouped into that top band.
                age_range = str(mark) + "-" + str(mark + 9)
                break

        derived_age_range.append(age_range)

    datafile["AGE_RANGE"] = derived_age_range
        
    

if "DSXOS" in datafile.columns:
    # Derive the MONTH_DISCHARGE column: the full month name taken from the
    # month field of each DSXOS value.

    derived_outcome_month = []
    for x in datafile["DSXOS"]:
        # Empty cells are read by pandas as NaN (a float), which has no
        # .split(); keep them as missing instead of aborting the whole run.
        if pd.isna(x):
            derived_outcome_month.append(None)
            continue

        # The month number is the second "/"-separated field
        # (presumably a day/month/year date -- confirm against the data).
        month_num = x.split("/")[1]

        # "%m" accepts both "3" and "03"; "%B" renders the full month name
        # according to the current locale.
        datetime_object = datetime.datetime.strptime(month_num, "%m")
        full_month_name = datetime_object.strftime("%B")

        derived_outcome_month.append(full_month_name)

    datafile["MONTH_DISCHARGE"] = derived_outcome_month


    
# Write the result next to the input as "<name>_derived.csv".
# Only a trailing ".csv" extension (any letter case) is replaced.  A plain
# str.replace would also rewrite ".csv" inside directory names and, when the
# input has a different extension, leave the path unchanged so that the
# input file itself gets overwritten.
csv_extension = ".csv"
if datafile_path.lower().endswith(csv_extension):
    new_datafile_path = datafile_path[:-len(csv_extension)] + "_derived.csv"
else:
    new_datafile_path = datafile_path + "_derived.csv"

datafile.to_csv(new_datafile_path, index=False)