From 12a192eb6f739977b6b664de3b1db56b0a7c8a8d Mon Sep 17 00:00:00 2001 From: GNajeral <90567992+GNajeral@users.noreply.github.com> Date: Fri, 31 Mar 2023 12:35:30 +0200 Subject: [PATCH] valid variables script local, values not duplicated --- valid_variables_script_local.R | 212 +++++++++++++++++++++++++-------- 1 file changed, 160 insertions(+), 52 deletions(-) diff --git a/valid_variables_script_local.R b/valid_variables_script_local.R index 909c3ef..0c5b2f3 100755 --- a/valid_variables_script_local.R +++ b/valid_variables_script_local.R @@ -1,6 +1,7 @@ rm(list=ls()) dir_name <- readline("Introduce the name of the directory please: ") +# C:\Users\guill\Documents\harmonize_scripts setwd(dir_name) @@ -98,94 +99,201 @@ check_valid_name <- function(col_name){ } -check_valid_values_continuous <- function(colname , codebook_param , column){ +# check_valid_values_continuous <- function(colname , codebook_param , column){ +# +# column <- column[column != "."] +# possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] +# possible_values_list = str_split(possible_values_format , "/")[[1]] +# +# range_as_str <- str_trim(possible_values_list[1]) +# missing_value_format <- str_trim(str_trim(possible_values_list[2])) +# +# separate_range <- str_split(range_as_str , "-")[[1]] +# min_value <- strtoi(separate_range[1]) +# max_value <- strtoi(separate_range[2]) +# +# failing_values <- column[column < min_value | column > max_value] +# number_of_failing_values <- length(failing_values[!is.na(failing_values)]) +# +# str_res <- "" +# if (number_of_failing_values == 0) +# str_res <- "No failing values" +# else{ +# failing_values <- failing_values[!is.na(failing_values)] +# failing_value_counts <- table(failing_values) +# +# str_res <- paste(colname, "has failing values:") +# +# for (i in seq_along(failing_value_counts)) { +# value <- names(failing_value_counts)[i] +# count <- failing_value_counts[i] +# str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") +# } +# +# str_res <- paste(str_res, "should be in range", range_as_str, "(continuous)", sep = " ") +# } +# +# return(str_res) +# } +# +# +# +# check_valid_values_categorical <- function(colname, codebook_param, column) { +# column <- column[column != "."] +# possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] +# possible_values_list <- str_split(possible_values_format, "/")[[1]] +# +# possible_values_list <- lapply(possible_values_list, str_trim) +# +# str_res <- "" +# min_value <- 0 +# max_value <- 0 +# if (length(possible_values_list[[1]]) == 2) { +# separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]] +# min_value <- strtoi(separate_range[1]) +# max_value <- strtoi(separate_range[2]) +# } else { +# possible_values_list <- lapply(possible_values_list, strtoi)[[1]] +# min_value <- possible_values_list[1] +# max_value <- possible_values_list[length(possible_values_list) - 1] +# } +# +# failing_values <- column[column < min_value | column > max_value] +# number_of_failing_values <- length(failing_values[!is.na(failing_values)]) +# +# if (number_of_failing_values == 0) { +# str_res <- "No failing values" +# } else { +# range_as_str <- paste(min_value, "-", max_value, " (categorical)") +# failing_values <- failing_values[!is.na(failing_values)] +# failing_value_counts <- table(failing_values) +# +# str_res <- paste(colname, "has failing values:") +# +# for (i in seq_along(failing_value_counts)) { +# value <- names(failing_value_counts)[i] +# count <- failing_value_counts[i] +# str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") +# } +# +# str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") +# } +# +# return(str_res) +# } +# +# check_valid_values_binary <- function(colname, column) { +# column <- column[column != "."] +# failing_values <- column[column < 0 | column > 1] +# number_of_failing_values <- length(failing_values[!is.na(failing_values)]) +# +# str_res <- "" +# if (number_of_failing_values == 0) +# str_res <- "No failing values" +# else { +# range_as_str <- "0-1 (binary)" +# failing_values <- failing_values[!is.na(failing_values)] +# failing_value_counts <- table(failing_values) +# +# str_res <- paste(colname, "has failing values:") +# +# for (i in seq_along(failing_value_counts)) { +# value <- names(failing_value_counts)[i] +# count <- failing_value_counts[i] +# str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") +# } +# +# str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") +# } +# +# return(str_res) +# } + +check_valid_values_continuous <- function(colname, codebook_param, column) { column <- column[column != "."] possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] - possible_values_list = str_split(possible_values_format , "/")[[1]] + possible_values_list = str_split(possible_values_format, "/")[[1]] - # Fallará cuando el codebook no tenga min-max / . range_as_str <- str_trim(possible_values_list[1]) missing_value_format <- str_trim(str_trim(possible_values_list[2])) - separate_range <- str_split(range_as_str , "-")[[1]] - min_value <- strtoi(separate_range[1]) - max_value <- strtoi(separate_range[2]) + separate_range <- str_split(range_as_str, "-")[[1]] + min_value <- strtoi(separate_range[1]) + max_value <- strtoi(separate_range[2]) failing_values <- column[column < min_value | column > max_value] number_of_failing_values <- length(failing_values[!is.na(failing_values)]) str_res <- "" - if (number_of_failing_values == 0) + if (number_of_failing_values == 0) { str_res <- "No failing values" - else{ - failing_values <- failing_values[!is.na(failing_values)] - str_res <- paste( colname , paste(unlist(failing_values) , collapse =" ")) - #str_res <- paste(colname , collapse =" ") - str_res <- paste(str_res , "should be in range" , range_as_str, "(continuous)", sep = " ") - } + } else { + range_as_str <- paste(min_value, "-", max_value, "(continuous)") - - return(str_res) -} - -check_valid_values_binary <- function(colname , column){ - column <- column[column != "."] - failing_values <- column[column < 0 | column > 1] - number_of_failing_values <- length(failing_values[!is.na(failing_values)]) - - str_res <- "" - if (number_of_failing_values == 0) - str_res <- "No failing values" - else{ - range_as_str <- "0-1 (binary)" - failing_values <- failing_values[!is.na(failing_values)] - str_res <- paste(colname , paste(unlist(failing_values) , collapse =" ")) - #str_res <- paste(colname , collapse =" ") - str_res <- paste(str_res , "should be in range" , range_as_str, sep = " ") + str_res <- paste(colname, "has", number_of_failing_values, "failing values") + str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") } - return(str_res) } -check_valid_values_categorical <- function(colname , codebook_param , column){ + +check_valid_values_categorical <- function(colname, codebook_param, column) { column <- column[column != "."] possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] - possible_values_list <- str_split(possible_values_format , "/")[[1]] + possible_values_list <- str_split(possible_values_format, "/")[[1]] - possible_values_list <- lapply(possible_values_list , str_trim) + possible_values_list <- lapply(possible_values_list, str_trim) str_res <- "" min_value <- 0 - max_value <- 0 - if (length(possible_values_list[[1]]) == 2){ - separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]] - min_value <- strtoi(separate_range[1]) - max_value <- strtoi(separate_range[2]) - - }else{ - - possible_values_list <- lapply(possible_values_list , strtoi)[[1]] + max_value <- 0 + if (length(possible_values_list[[1]]) == 2) { + separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]] + min_value <- strtoi(separate_range[1]) + max_value <- strtoi(separate_range[2]) + } else { + possible_values_list <- lapply(possible_values_list, strtoi)[[1]] min_value <- possible_values_list[1] max_value <- possible_values_list[length(possible_values_list) - 1] } - failing_values <- column[column < min_value | column > max_value ] + failing_values <- column[column < min_value | column > max_value] number_of_failing_values <- length(failing_values[!is.na(failing_values)]) - if(number_of_failing_values == 0){ + if (number_of_failing_values == 0) { str_res <- "No failing values" - }else{ - range_as_str <- paste(min_value , "-" , max_value , " (categorical)") - failing_values <- failing_values[!is.na(failing_values)] - #str_res <- paste(colname , paste(unlist(failing_values) , collapse =" ")) - str_res <- paste(colname , collapse =" ") - str_res <- paste(str_res , "should be in range" , range_as_str, sep = " ") + } else { + range_as_str <- paste(min_value, "-", max_value, " (categorical)") + + str_res <- paste(colname, "has", number_of_failing_values, "failing values") + str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") + } + + return(str_res) +} + +check_valid_values_binary <- function(colname, column) { + column <- column[column != "."] + failing_values <- column[column < 0 | column > 1] + number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + + str_res <- "" + if (number_of_failing_values == 0) + str_res <- "No failing values" + else { + range_as_str <- "0-1 (binary)" + + str_res <- paste(colname, "has", number_of_failing_values, "failing values") + str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") } + return(str_res) } + + check_valid_values <- function(valid_colnames, codebook_param){ res <- "" -- 2.24.1