add_missing_values.r 2.38 KB
Newer Older
pxp9's avatar
pxp9 committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
# Load the harmonisation codebook and derive, for each variable type, the
# vector of harmonised variable names declared with that type.
#
# NOTE(review): the working directory is a hard-coded, machine-specific path.
# It is kept because the relative file name below is resolved against it and
# code sourcing this script may rely on the directory change.
setwd("C:/Users/victor/Documents/TFG/r-analytics-master")

# stringsAsFactors = FALSE keeps the name columns as character on every R
# version. On R < 4.0 they would be read as factors, and c() on factors
# (R < 4.1) returns integer codes, so the `%in%` checks against
# `categoric_vars` would never match a column name.
codebook <- read.csv("new_harmon.csv", sep = ",", stringsAsFactors = FALSE)

# Harmonised variable names, grouped by the "Variable.type" column.
binary <- unlist(codebook[codebook["Variable.type"] == "Binary", ]["Harmonised.variable.name"])
categorical <- unlist(codebook[codebook["Variable.type"] == "Categorical", ]["Harmonised.variable.name"])
continuous <- unlist(codebook[codebook["Variable.type"] == "Continuous", ]["Harmonised.variable.name"])

# Binary and categorical variables receive the same "Missing" recoding.
categoric_vars <- c(binary, categorical)

#' Recode missing entries of a categorical column as "Missing"
#'
#' Entries that are `NA`, the empty string `""`, the string `"NA"` or the
#' string `"Unknown"` are replaced by the string `"Missing"`.
#'
#' @param column An atomic vector or factor (one data-frame column).
#' @return `column` with the matching entries replaced. The input is returned
#'   unchanged (same type) when nothing matches, including for empty input.
#'   When a replacement happens the result is a character vector, because a
#'   string is written into it.
replace_with_Missing_categoric <- function(column) {
  # `%in%` never yields NA, so NA entries are handled only by is.na().
  missing_mask <- is.na(column) | column %in% c("", "NA", "Unknown")

  # Nothing to recode (also covers zero-length input, where a 1:length()
  # loop would run over c(1, 0) and fail).
  if (!any(missing_mask)) {
    return(column)
  }

  # Writing a label that is not an existing level into a factor produces NA
  # with a warning, so factors are converted to character first.
  if (is.factor(column)) {
    column <- stats::setNames(as.character(column), names(column))
  }

  column[missing_mask] <- "Missing"
  column
}

#' Recode missing entries of a numerically coded categorical column as 9999
#'
#' Entries that are `NA`, the empty string `""`, the string `"NA"` or the
#' string `"Unknown"` are replaced by the value `9999`.
#'
#' @param column An atomic vector or factor (one data-frame column).
#' @return `column` with the matching entries replaced. The input is returned
#'   unchanged (same type) when nothing matches, including for empty input.
#'   Numeric input stays numeric; in character input the replacement appears
#'   as the string `"9999"`.
replace_with_Missing_num_categoric <- function(column) {
  # `%in%` never yields NA, so NA entries are handled only by is.na().
  missing_mask <- is.na(column) | column %in% c("", "NA", "Unknown")

  # Nothing to recode (also covers zero-length input, where a 1:length()
  # loop would run over c(1, 0) and fail).
  if (!any(missing_mask)) {
    return(column)
  }

  # Writing a value that is not an existing level into a factor produces NA
  # with a warning, so factors are converted to character first.
  if (is.factor(column)) {
    column <- stats::setNames(as.character(column), names(column))
  }

  column[missing_mask] <- 9999
  column
}


#' Blank out missing entries of a continuous column
#'
#' Entries that are `NA`, the string `"NA"` or the string `"Unknown"` are
#' replaced by the empty string `""`.
#'
#' @param column An atomic vector or factor (one data-frame column).
#' @return `column` with the matching entries replaced. The input is returned
#'   unchanged (same type) when nothing matches, including for empty input.
#'   When a replacement happens the result is a character vector, because an
#'   empty string is written into it (numeric input is coerced to character).
replace_with_Missing_continuous <- function(column) {
  # `%in%` never yields NA, so NA entries are handled only by is.na().
  missing_mask <- is.na(column) | column %in% c("NA", "Unknown")

  # Nothing to blank out (also covers zero-length input, where a 1:length()
  # loop would run over c(1, 0) and fail).
  if (!any(missing_mask)) {
    return(column)
  }

  # Writing a label that is not an existing level into a factor produces NA
  # with a warning, so factors are converted to character first.
  if (is.factor(column)) {
    column <- stats::setNames(as.character(column), names(column))
  }

  column[missing_mask] <- ""
  column
}


#' Read a CSV file and recode missing entries column by column
#'
#' Each column of the file is matched by name against three groups and passed
#' to the corresponding recoding helper:
#' * names listed in the global `categoric_vars` ->
#'   `replace_with_Missing_categoric()`
#' * names containing `"_numeric"` -> `replace_with_Missing_num_categoric()`
#' * names listed in the global `continuous` ->
#'   `replace_with_Missing_continuous()`
#'
#' The checks run in that order and are not exclusive, so a column belonging
#' to more than one group is passed through each matching helper in turn.
#' A progress percentage is printed after each column is visited.
#'
#' @param path_to_file Path of the CSV file handed to `read.csv()`. It is
#'   read after the working directory has been changed, so a relative path is
#'   resolved against that directory.
#' @return The data frame read from the file, with the matching columns
#'   recoded.
add_missing_values <- function(path_to_file) {
  # NOTE(review): this permanently switches the caller's working directory to
  # a hard-coded, machine-specific path. It is left in place because callers
  # may rely on the directory change; consider restoring it with on.exit().
  setwd("C:/Users/victor/Documents/TFG/r-analytics-master/ressources/current_db")

  # stringsAsFactors = FALSE keeps text columns as character on R < 4.0,
  # where the default would turn them into factors that cannot take new
  # values such as "Missing".
  data <- read.csv(path_to_file, sep = ",", stringsAsFactors = FALSE)

  data_colnames <- colnames(data)
  num_categoric <- data_colnames[grepl("_numeric", data_colnames)]
  n_cols <- length(data_colnames)

  # seq_along() yields an empty sequence when there are no columns, unlike
  # 1:n_cols which would iterate over c(1, 0).
  for (i in seq_along(data_colnames)) {
    colname <- data_colnames[i]

    print(paste0(round(100 * i / n_cols, digits = 0), "%"))

    if (colname %in% categoric_vars) {
      data[[colname]] <- replace_with_Missing_categoric(data[[colname]])
    }

    if (colname %in% num_categoric) {
      data[[colname]] <- replace_with_Missing_num_categoric(data[[colname]])
    }

    if (colname %in% continuous) {
      data[[colname]] <- replace_with_Missing_continuous(data[[colname]])
    }
  }

  data
}