From 3e986eb35381b19c2dd09b0e33c8dc8ccf7a6eee Mon Sep 17 00:00:00 2001 From: GNajeral <90567992+GNajeral@users.noreply.github.com> Date: Thu, 13 Apr 2023 18:47:08 +0200 Subject: [PATCH] Update to local script to show the range in which failing variables lie --- valid_variables_script2.R | 2 +- valid_variables_script_local.R | 56 +++++++++++++++++++++++++--------- 2 files changed, 43 insertions(+), 15 deletions(-) diff --git a/valid_variables_script2.R b/valid_variables_script2.R index 830e657..525784e 100644 --- a/valid_variables_script2.R +++ b/valid_variables_script2.R @@ -16,7 +16,7 @@ source("necessary_functions_connection.R") dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient", "openxlsx") install_dependencies(dep_list) -codebook_file <- "20220315_Data Harmonisation.xlsb.xlsx" +codebook_file <- "20220315_Data_Harmonisation.xlsb.xlsx" codebook_demo <- read.xlsx(codebook_file , sheet = 2 ) codebook_com_and_rf <- read.xlsx(codebook_file , sheet = 3 ) diff --git a/valid_variables_script_local.R b/valid_variables_script_local.R index 6450472..9afd01e 100644 --- a/valid_variables_script_local.R +++ b/valid_variables_script_local.R @@ -2,6 +2,7 @@ rm(list=ls()) dir_name <- readline("Introduce the name of the directory please: ") # C:\Users\guill\Documents\harmonize_scripts +# /Users/gnl/Documents/CTB UPM/UNCOVER/uncover_harmonization setwd(dir_name) @@ -11,7 +12,7 @@ source("dependency_installer.R") dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient", "openxlsx") install_dependencies(dep_list) -codebook_file <- "20220315_Data Harmonisation.xlsb.xlsx" +codebook_file <- "20220315_Data_Harmonisation.xlsb.xlsx" codebook_demo <- read.xlsx(codebook_file , sheet = 2 ) codebook_com_and_rf <- read.xlsx(codebook_file , sheet = 3 ) @@ -101,11 +102,13 @@ check_valid_name <- function(col_name){ } check_valid_values_continuous <- function(colname , codebook_param , column){ + column <- as.numeric(column[column != "."]) possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] - + value_format <- strsplit(possible_values_format, " / ")[[1]] + min_value <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1])))) max_value <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1])))) if(min_value == ""){ @@ -114,9 +117,6 @@ check_valid_name <- function(col_name){ } min_value <- as.double(min_value) max_value <- as.double(max_value) - print(colname) - print(min_value) - print(max_value) failing_values <- column[column < min_value | column > max_value] number_of_failing_values <- length(failing_values[!is.na(failing_values)]) @@ -128,15 +128,36 @@ check_valid_name <- function(col_name){ failing_values <- failing_values[!is.na(failing_values)] failing_value_counts <- table(failing_values) - str_res <- paste(colname, "has failing values:") - - for (i in seq_along(failing_value_counts)) { - value <- names(failing_value_counts)[i] - count <- failing_value_counts[i] - str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") - } - - str_res <- paste(str_res, "should be in range", value_format, "(continuous)", sep = " ") + str_res <- paste(colname, "has wrong values:") + + str_res <- paste(str_res, paste(number_of_failing_values, ",", sep = "")) + + str_res <- paste(str_res, "should be in the range:", paste(min_value, max_value, sep = "-"), "(continuous),", sep = " ") + + first <- names(failing_value_counts)[1] + + last <- names(failing_value_counts)[length(failing_value_counts)] + + print(colname) + print("first") + print(first) + print("last") + print(last) + + str_res <- paste(str_res, "Values lie in the range:", paste("[", first, " - ",last, "]", sep = "" )) + + + # if(number_of_failing_values < 30) { + # # list_values <- paste( failing_values , collapse = " ") + # str_res <- paste(str_res, "Wrong values:") + # + # for (i in seq_along(failing_value_counts)) { + # value <- names(failing_value_counts)[i] + # count <- failing_value_counts[i] + # str_res <- paste(str_res, paste(value, paste("(", count, ")", sep = ""), collapse = " "), sep = " ") + # } + # } + } return(str_res) @@ -324,6 +345,13 @@ check_valid_values <- function(valid_colnames, codebook_param){ next } + if(is.na(codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == name])){ + variable <- paste("Variable ", name, " with no value format", sep = " ") + res <- paste(res, variable , sep="\n") + next + } + + result = switch( column_type, "Continuous"= check_valid_values_continuous(name , codebook_param , column), -- 2.24.1