valid_variables_script_local.R 13.7 KB
Newer Older
1 2 3
rm(list=ls())

dir_name <- readline("Introduce the name of the directory please: ")
4
# C:\Users\guill\Documents\harmonize_scripts
5
# /Users/gnl/Documents/CTB UPM/UNCOVER/uncover_harmonization
6 7 8 9 10 11 12 13 14

setwd(dir_name)

source("dependency_installer.R")


dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient", "openxlsx")
install_dependencies(dep_list)

15
codebook_file <- "20220315_Data_Harmonisation.xlsb.xlsx"
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42

codebook_demo <- read.xlsx(codebook_file , sheet = 2 )
codebook_com_and_rf <- read.xlsx(codebook_file , sheet = 3 ) 

codebook_home_med <- read.xlsx(codebook_file , sheet = 4 ) 
codebook_si_sympt <- read.xlsx(codebook_file , sheet = 5 ) 

codebook_treatments <- read.xlsx(codebook_file , sheet = 6 ) 
codebook_labo <- read.xlsx(codebook_file , sheet = 7 ) 

codebook_complications <- read.xlsx(codebook_file , sheet = 8 ) 
codebook_imaging_data <- read.xlsx(codebook_file , sheet = 9 ) 

codebook_lifestyle_diet <- read.xlsx(codebook_file , sheet = 10 ) 
codebook_dates <- read.xlsx(codebook_file , sheet = 11 )

codebook <- rbind(codebook_demo , codebook_com_and_rf)
codebook <- rbind(codebook , codebook_home_med)
codebook <- rbind(codebook , codebook_si_sympt)
codebook <- rbind(codebook , codebook_treatments)
codebook <- rbind(codebook , codebook_labo)
codebook <- rbind(codebook , codebook_complications)
codebook <- rbind(codebook , codebook_imaging_data)

codebook_lifestyle_diet <- codebook_lifestyle_diet[, !names(codebook_lifestyle_diet) %in% c("X2", "X4" , "X10")] 
codebook <- rbind(codebook , codebook_lifestyle_diet)
codebook <- rbind(codebook , codebook_dates)
43 44 45 46 47 48 49 50 51 52 53 54


codebook_col_names <- as.data.frame(codebook$Harmonised.variable.name)

names(codebook_col_names) <- c("col_names")

setwd(paste(dir_name ,"/harmonized_data", sep=""))

file_name <- readline("Introduce the name of the file to check the values: ")
harmonized_data <- ""

if (grepl(".csv" , file_name , fixed = TRUE)){
55 56
  harmonized_data <- read.csv(file_name)
  #harmonized_data <- read.csv(file_name, sep = ";")
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
}else if (grepl(".xlsx" , file_name , fixed = TRUE)){
  harmonized_data <- read.xlsx(file_name)
}

#----------------------------------------------------------------------------

#Test if column names are valid
check_column_names <- function(col_names){
  
  str_res <- "The column names:"
  valid_colnames <- c()
  repeated_colnames <- c()
  for(i in 1:(nrow(col_names))){
    col_name <- col_names[i,1]
    number_of_column <- check_valid_name(col_name) 
    if( number_of_column == 0){
      str_res<- paste(str_res, col_name, sep=" ")
    }else if (number_of_column == 1){
      valid_colnames = c(valid_colnames, col_name)
    }else{
      repeated_colnames = c(repeated_colnames , col_name)
    }
  }
  
  str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ")
  
  new_list <- list("not_colnames" = str_res , "colnames" = valid_colnames , "repeated_colnames" = repeated_colnames)
  return (new_list)
  
}


#Test if a single variable name is valid
check_valid_name <- function(col_name){
  
  valid <- 0
  
  if(col_name %in% codebook_col_names$col_names){
    
    valid <- length(grep(col_name, names(harmonized_data)))
    
  }
  
  return (valid)
  
}

104
 check_valid_values_continuous <- function(colname , codebook_param , column){
105
   
106 107 108
   missing_values_count <- length(column[column == "."])
   total_values <- length(column)
   
GNajeral's avatar
GNajeral committed
109 110 111
   missing_values_percentage <- round((missing_values_count/(total_values))*100, 2)
   
   
112 113

   column <- as.numeric(column[column != "."])
114 115
   string_values_count <- length(column[is.na(column)])
   
116
   possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
117
   
118
   value_format <- strsplit(possible_values_format, " / ")[[1]]
119

120 121 122 123 124 125 126 127 128 129
   min_value <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1]))))
   max_value <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1]))))
   if(min_value == ""){
     min_value <- str_trim(sub(",.*", "", value_format[1]))
     max_value <- str_trim(sub(".*,", "", value_format[1]))
   }
   min_value <- as.double(min_value)
   max_value <- as.double(max_value)

   failing_values <- column[column < min_value | column > max_value]
130
   number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + string_values_count
131 132

   str_res <- ""
GNajeral's avatar
GNajeral committed
133 134 135 136 137 138 139 140
   if (number_of_failing_values == 0){
     if(missing_values_percentage > 90){
       str_res <- paste(colname, "Missing values:", missing_values_count, "-", paste(round((missing_values_count/(total_values))*100, 2), "%", sep = ""), "of the total")
     }
     else{
       str_res <- paste("No failing values")
     }
   }else{
141 142 143
     failing_values <- failing_values[!is.na(failing_values)]
     failing_value_counts <- table(failing_values)

144 145 146 147
     str_res <- paste(colname, "has wrong values:")
     
     str_res <- paste(str_res, paste(number_of_failing_values, ",", sep = ""))
     
148 149 150 151
     str_res <- paste(str_res, "Missing values:")
     
     str_res <- paste(str_res, paste(missing_values_count, ",", sep = ""))
     
152 153 154 155 156 157
     str_res <- paste(str_res, "should be in the range:", paste(min_value, max_value, sep = "-"), "(continuous),", sep = " ")
     
     first <- names(failing_value_counts)[1]
     
     last <- names(failing_value_counts)[length(failing_value_counts)]
     
158 159
     str_res <- paste(str_res, "Values lie in the range:", paste("[", first, " - ",last, "],", sep = "" ))
     
GNajeral's avatar
GNajeral committed
160 161 162
     str_res <- paste(str_res, "Missing:", paste(round((missing_values_count/(total_values))*100, 2), "%,", sep = "" ))
     
     str_res <- paste(str_res, "Wrong values:", paste(round((number_of_failing_values/(total_values-missing_values_count))*100, 2), "% of the non-missing", sep = "" ))
163
     
164 165 166 167 168 169 170 171 172 173 174 175 176
     
     
     # if(number_of_failing_values < 30) {
     #   # list_values <- paste( failing_values , collapse = " ")
     #   str_res <- paste(str_res, "Wrong values:")
     #  
     #   for (i in seq_along(failing_value_counts)) {
     #     value <- names(failing_value_counts)[i]
     #     count <- failing_value_counts[i]
     #     str_res <- paste(str_res, paste(value, paste("(", count, ")", sep = ""), collapse = " "), sep = " ")
     #   }
     # }
     
177 178 179 180 181 182 183 184
   }

   return(str_res)
 }



 check_valid_values_categorical <- function(colname, codebook_param, column) {
185 186 187 188
   
   missing_values_count <- length(column[column == "."])
   total_values <- length(column)
   
GNajeral's avatar
GNajeral committed
189 190 191
   missing_values_percentage <- round((missing_values_count/(total_values))*100, 2)
   
   
192
   column <- as.numeric(column[column != "."])
193 194
   string_values_count <- length(column[is.na(column)])
   
195 196 197 198 199 200 201 202 203 204 205 206 207
   possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
   possible_values_list <- str_split(possible_values_format, "/")[[1]]

   possible_values_list <- lapply(possible_values_list, str_trim)

   str_res <- ""
   min_value <- 0
   max_value <- 0
   if (length(possible_values_list[[1]]) == 2) {
     separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]]
     min_value <- strtoi(separate_range[1])
     max_value <- strtoi(separate_range[2])
   } else {
208 209
     possible_values_list <- possible_values_list[possible_values_list != "."]
     possible_values_list <- lapply(possible_values_list, strtoi)
210
     min_value <- possible_values_list[1]
211
     max_value <- possible_values_list[length(possible_values_list)]
212 213 214
   }

   failing_values <- column[column < min_value | column > max_value]
215 216
   number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + string_values_count
   
GNajeral's avatar
GNajeral committed
217 218 219 220 221 222 223
   if (number_of_failing_values == 0){
     if(missing_values_percentage > 90){
       str_res <- paste(colname, "Missing values:", missing_values_count, "-", paste(round((missing_values_count/(total_values))*100, 2), "%", sep = ""), "of the total")
     }
     else{
       str_res <- paste("No failing values")
     }
224 225 226 227
   } else {
     range_as_str <- paste(min_value, "-", max_value, " (categorical)")
     failing_values <- failing_values[!is.na(failing_values)]
     failing_value_counts <- table(failing_values)
228 229 230 231 232
     
     str_res <- paste(colname, "has wrong values:")
     
     str_res <- paste(str_res, paste(number_of_failing_values, ",", sep = ""))
     
233 234 235 236
     str_res <- paste(str_res, "Missing values:")
     
     str_res <- paste(str_res, paste(missing_values_count, ",", sep = ""))
     
237 238 239 240 241 242
     str_res <- paste(str_res, "should be in the range:", paste(min_value, max_value, sep = "-"), "(categorical),", sep = " ")
     
     first <- names(failing_value_counts)[1]
     
     last <- names(failing_value_counts)[length(failing_value_counts)]
     
243 244
     str_res <- paste(str_res, "Values lie in the range:", paste("[", first, " - ",last, "],", sep = "" ))
     
GNajeral's avatar
GNajeral committed
245 246
     str_res <- paste(str_res, "Missing:", paste(round((missing_values_count/(total_values))*100, 2), "%,", sep = "" ))
     
247 248
     str_res <- paste(str_res, "Wrong values constitute at least:", paste(round((number_of_failing_values/(total_values-missing_values_count))*100, 2), "% of the non-missing", sep = "" ))
     
249

250 251 252 253 254 255 256 257 258
   #   str_res <- paste(colname, "has failing values:")
   # 
   #   for (i in seq_along(failing_value_counts)) {
   #     value <- names(failing_value_counts)[i]
   #     count <- failing_value_counts[i]
   #     str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ")
   #   }
   # 
   #   str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
259
   }
260 261 262 263 264

   return(str_res)
 }

 check_valid_values_binary <- function(colname, column) {
265 266 267 268 269
   
   missing_values_count <- length(column[column == "."])
   
   total_values <- length(column)
   
GNajeral's avatar
GNajeral committed
270 271
   missing_values_percentage <- round((missing_values_count/(total_values))*100, 2)
   
272
   column <- as.numeric(column[column != "."])
273 274
   string_values_count <- length(column[is.na(column)])
   
275
   failing_values <- column[column < 0 | column > 1]
276 277
   number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + string_values_count
   
278
   str_res <- ""
GNajeral's avatar
GNajeral committed
279 280 281 282 283 284 285 286
   if (number_of_failing_values == 0){
     if(missing_values_percentage > 90){
       str_res <- paste(colname, "Missing values:", missing_values_count, "-", paste(round((missing_values_count/(total_values))*100, 2), "%", sep = ""), "of the total")
     }
    else{
     str_res <- paste("No failing values")
     }
   }
287 288 289 290
   else {
     range_as_str <- "0-1 (binary)"
     failing_values <- failing_values[!is.na(failing_values)]
     failing_value_counts <- table(failing_values)
291 292 293 294 295 296 297 298 299
     
     str_res <- paste(colname, "has wrong values:")
     
     str_res <- paste(str_res, paste(number_of_failing_values, ",", sep = ""))
     
     str_res <- paste(str_res, "Missing values:")
     
     str_res <- paste(str_res, paste(missing_values_count, ",", sep = ""))
     
GNajeral's avatar
GNajeral committed
300
     str_res <- paste(str_res, "should be in the range:", paste("0", "1", sep = "-"), "(binary),", sep = " ")
301 302 303 304 305 306 307
     
     first <- names(failing_value_counts)[1]
     
     last <- names(failing_value_counts)[length(failing_value_counts)]
     
     str_res <- paste(str_res, "Values lie in the range:", paste("[", first, " - ",last, "],", sep = "" ))
     
GNajeral's avatar
GNajeral committed
308 309 310
     str_res <- paste(str_res, "Missing:", paste(round((missing_values_count/(total_values))*100, 2), "%,", sep = "" ))
     
     str_res <- paste(str_res, "Wrong values:", paste(round((number_of_failing_values/(total_values-missing_values_count))*100, 2), "% of the non-missing", sep = "" ))
311
     
312

313 314 315 316 317 318 319 320 321 322
   #   str_res <- paste(colname, "has failing values:")
   # 
   #   for (i in seq_along(failing_value_counts)) {
   #     value <- names(failing_value_counts)[i]
   #     count <- failing_value_counts[i]
   #     str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ")
   #   }
   # 
   #   str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
    }
323 324 325 326

   return(str_res)
 }

327

328
check_valid_values <- function(valid_colnames, codebook_param){
GNajeral's avatar
GNajeral committed
329 330

  res <- paste("Total patients:", length(valid_colnames[,1]))
331 332 333
  
  for(i in  1:(ncol(valid_colnames))){
    name <- names(valid_colnames)[i]
334 335 336 337 338
    
    if (grepl("DAT", name, fixed=TRUE)){
      next
    }
    
339 340 341 342
    column <- valid_colnames[,i]
    
    column_type <- codebook_param$Variable.type[codebook_param$Harmonised.variable.name == name]
    
343
    if (is.na(column_type) ) {
344 345 346 347
      variable <- paste("Variable ", name, " wrong", sep = " ")
      res <- paste(res,  variable , sep="\n")
      next
    }
348
    
349 350 351 352 353 354 355
    if(is.na(codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == name])){
      variable <- paste("Variable ", name, " with no value format", sep = " ")
      res <- paste(res,  variable , sep="\n")
      next
    }
    
    
356 357 358 359 360 361 362
    result = switch(  
      column_type,  
      "Continuous"= check_valid_values_continuous(name , codebook_param , column),
      "Binary"= check_valid_values_binary(name , column),  
      "Categorical"= check_valid_values_categorical(name, codebook_param , column),
      "Calendar date" = paste("No failing values"),
      "ISO country code"= paste("No failing values"),
363 364 365
      {
        paste("some column " , column_type , sep = " ")
      }
366 367
    )
    
GNajeral's avatar
GNajeral committed
368
    if (any(result != "No failing values")){
369 370 371 372 373 374 375 376 377
      res <- paste(res , result, sep="\n")
    }
  }
  
  return(res)
  
}


GNajeral's avatar
GNajeral committed
378

379 380 381 382 383 384 385 386 387 388 389 390 391
data_colnames <- as.data.frame(colnames(harmonized_data))

check_valid_columns <- check_column_names(data_colnames)

columns_not_valid <- check_valid_columns$not_colnames

valid_colnames_column <- as.data.frame(check_valid_columns$colnames)
names(valid_colnames_column) = c("valid_colnames")
valid_colnames_with_data <- subset(harmonized_data , select = valid_colnames_column$valid_colnames)


result <- ""
result<-check_valid_values(valid_colnames_with_data, codebook)
392 393 394 395 396
# Split the string into separate elements
split_columns <- strsplit(columns_not_valid, split = " ", fixed = TRUE)

# Print each element in a separate line
cat(paste(unlist(split_columns), collapse = "\n"), "\n")
397
cat(result)
GNajeral's avatar
GNajeral committed
398