diff --git a/valid_variables_script_local.R b/valid_variables_script_local.R index 881db4fd54bf91be3403b06185b8cbfcaeb0a5a6..6450472119b880d58996fb75471927c94fb82055 100644 --- a/valid_variables_script_local.R +++ b/valid_variables_script_local.R @@ -51,8 +51,8 @@ file_name <- readline("Introduce the name of the file to check the values: ") harmonized_data <- "" if (grepl(".csv" , file_name , fixed = TRUE)){ - harmonized_data <- read.csv(file_name) - #harmonized_data <- read.csv(file_name, sep = ";") + #harmonized_data <- read.csv(file_name) + harmonized_data <- read.csv(file_name, sep = ";") }else if (grepl(".xlsx" , file_name , fixed = TRUE)){ harmonized_data <- read.xlsx(file_name) } @@ -100,208 +100,208 @@ check_valid_name <- function(col_name){ } - # check_valid_values_continuous <- function(colname , codebook_param , column){ - # - # column <- as.numeric(column[column != "."]) - # possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] - # - # value_format <- strsplit(possible_values_format, " / ")[[1]] - # min_value <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1])))) - # max_value <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1])))) - # if(min_value == ""){ - # min_value <- str_trim(sub(",.*", "", value_format[1])) - # max_value <- str_trim(sub(".*,", "", value_format[1])) - # } - # min_value <- as.double(min_value) - # max_value <- as.double(max_value) - # print(colname) - # print(min_value) - # print(max_value) - # - # failing_values <- column[column < min_value | column > max_value] - # number_of_failing_values <- length(failing_values[!is.na(failing_values)]) - # - # str_res <- "" - # if (number_of_failing_values == 0) - # str_res <- "No failing values" - # else{ - # failing_values <- failing_values[!is.na(failing_values)] - # failing_value_counts <- table(failing_values) - # - # str_res <- paste(colname, "has failing values:") - # - # for (i in seq_along(failing_value_counts)) { - # value <- names(failing_value_counts)[i] - # count <- failing_value_counts[i] - # str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") - # } - # - # str_res <- paste(str_res, "should be in range", value_format, "(continuous)", sep = " ") - # } - # - # return(str_res) - # } - # - # - # - # check_valid_values_categorical <- function(colname, codebook_param, column) { - # column <- as.numeric(column[column != "."]) - # possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] - # possible_values_list <- str_split(possible_values_format, "/")[[1]] - # - # possible_values_list <- lapply(possible_values_list, str_trim) - # - # str_res <- "" - # min_value <- 0 - # max_value <- 0 - # if (length(possible_values_list[[1]]) == 2) { - # separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]] - # min_value <- strtoi(separate_range[1]) - # max_value <- strtoi(separate_range[2]) - # } else { - # possible_values_list <- lapply(possible_values_list, strtoi)[[1]] - # min_value <- possible_values_list[1] - # max_value <- possible_values_list[length(possible_values_list) - 1] - # } - # - # failing_values <- column[column < min_value | column > max_value] - # number_of_failing_values <- length(failing_values[!is.na(failing_values)]) - # - # if (number_of_failing_values == 0) { - # str_res <- "No failing values" - # } else { - # range_as_str <- paste(min_value, "-", max_value, " (categorical)") - # failing_values <- failing_values[!is.na(failing_values)] - # failing_value_counts <- table(failing_values) - # - # str_res <- paste(colname, "has failing values:") - # - # for (i in seq_along(failing_value_counts)) { - # value <- names(failing_value_counts)[i] - # count <- failing_value_counts[i] - # str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") - # } - # - # str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") - # } - # - # return(str_res) - # } - # - # check_valid_values_binary <- function(colname, column) { - # column <- as.numeric(column[column != "."]) - # failing_values <- column[column < 0 | column > 1] - # number_of_failing_values <- length(failing_values[!is.na(failing_values)]) - # - # str_res <- "" - # if (number_of_failing_values == 0) - # str_res <- "No failing values" - # else { - # range_as_str <- "0-1 (binary)" - # failing_values <- failing_values[!is.na(failing_values)] - # failing_value_counts <- table(failing_values) - # - # str_res <- paste(colname, "has failing values:") - # - # for (i in seq_along(failing_value_counts)) { - # value <- names(failing_value_counts)[i] - # count <- failing_value_counts[i] - # str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") - # } - # - # str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") - # } - # - # return(str_res) - # } - -check_valid_values_continuous <- function(colname, codebook_param, column) { - - column <- as.numeric(column[column != "."]) - possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] - - value_format <- strsplit(possible_values_format, " / ")[[1]] - min_value <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1])))) - max_value <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1])))) - if(min_value == ""){ - min_value <- str_trim(sub(",.*", "", value_format[1])) - max_value <- str_trim(sub(".*,", "", value_format[1])) - } - min_value <- as.double(min_value) - max_value <- as.double(max_value) - print(colname) - print(min_value) - print(max_value) - - failing_values <- column[column < min_value | column > max_value] - number_of_failing_values <- length(failing_values[!is.na(failing_values)]) - - str_res <- "" - if (number_of_failing_values == 0) { - str_res <- "No failing values" - } else { - range_as_str <- paste(min_value, "-", max_value, "(continuous)") - - str_res <- paste(colname, "has", number_of_failing_values, "failing values") - str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") - } - - return(str_res) -} - - -check_valid_values_categorical <- function(colname, codebook_param, column) { - column <- as.numeric(column[column != "."]) - possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] - possible_values_list <- str_split(possible_values_format, "/")[[1]] - - possible_values_list <- lapply(possible_values_list, str_trim) - - str_res <- "" - min_value <- 0 - max_value <- 0 - if (length(possible_values_list[[1]]) == 2) { - separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]] - min_value <- strtoi(separate_range[1]) - max_value <- strtoi(separate_range[2]) - } else { - possible_values_list <- lapply(possible_values_list, strtoi)[[1]] - min_value <- possible_values_list[1] - max_value <- possible_values_list[length(possible_values_list) - 1] - } - - failing_values <- column[column < min_value | column > max_value] - number_of_failing_values <- length(failing_values[!is.na(failing_values)]) - - if (number_of_failing_values == 0) { - str_res <- "No failing values" - } else { - range_as_str <- paste(min_value, "-", max_value, " (categorical)") - - str_res <- paste(colname, "has", number_of_failing_values, "failing values") - str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") - } - - return(str_res) -} - -check_valid_values_binary <- function(colname, column) { - column <- as.numeric(column[column != "."]) - failing_values <- column[column < 0 | column > 1] - number_of_failing_values <- length(failing_values[!is.na(failing_values)]) - - str_res <- "" - if (number_of_failing_values == 0) - str_res <- "No failing values" - else { - range_as_str <- "0-1 (binary)" - - str_res <- paste(colname, "has", number_of_failing_values, "failing values") - str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") - } - - return(str_res) -} + check_valid_values_continuous <- function(colname , codebook_param , column){ + + column <- as.numeric(column[column != "."]) + possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] + + value_format <- strsplit(possible_values_format, " / ")[[1]] + min_value <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1])))) + max_value <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1])))) + if(min_value == ""){ + min_value <- str_trim(sub(",.*", "", value_format[1])) + max_value <- str_trim(sub(".*,", "", value_format[1])) + } + min_value <- as.double(min_value) + max_value <- as.double(max_value) + print(colname) + print(min_value) + print(max_value) + + failing_values <- column[column < min_value | column > max_value] + number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + + str_res <- "" + if (number_of_failing_values == 0) + str_res <- "No failing values" + else{ + failing_values <- failing_values[!is.na(failing_values)] + failing_value_counts <- table(failing_values) + + str_res <- paste(colname, "has failing values:") + + for (i in seq_along(failing_value_counts)) { + value <- names(failing_value_counts)[i] + count <- failing_value_counts[i] + str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") + } + + str_res <- paste(str_res, "should be in range", value_format, "(continuous)", sep = " ") + } + + return(str_res) + } + + + + check_valid_values_categorical <- function(colname, codebook_param, column) { + column <- as.numeric(column[column != "."]) + possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] + possible_values_list <- str_split(possible_values_format, "/")[[1]] + + possible_values_list <- lapply(possible_values_list, str_trim) + + str_res <- "" + min_value <- 0 + max_value <- 0 + if (length(possible_values_list[[1]]) == 2) { + separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]] + min_value <- strtoi(separate_range[1]) + max_value <- strtoi(separate_range[2]) + } else { + possible_values_list <- lapply(possible_values_list, strtoi)[[1]] + min_value <- possible_values_list[1] + max_value <- possible_values_list[length(possible_values_list) - 1] + } + + failing_values <- column[column < min_value | column > max_value] + number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + + if (number_of_failing_values == 0) { + str_res <- "No failing values" + } else { + range_as_str <- paste(min_value, "-", max_value, " (categorical)") + failing_values <- failing_values[!is.na(failing_values)] + failing_value_counts <- table(failing_values) + + str_res <- paste(colname, "has failing values:") + + for (i in seq_along(failing_value_counts)) { + value <- names(failing_value_counts)[i] + count <- failing_value_counts[i] + str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") + } + + str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") + } + + return(str_res) + } + + check_valid_values_binary <- function(colname, column) { + column <- as.numeric(column[column != "."]) + failing_values <- column[column < 0 | column > 1] + number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + + str_res <- "" + if (number_of_failing_values == 0) + str_res <- "No failing values" + else { + range_as_str <- "0-1 (binary)" + failing_values <- failing_values[!is.na(failing_values)] + failing_value_counts <- table(failing_values) + + str_res <- paste(colname, "has failing values:") + + for (i in seq_along(failing_value_counts)) { + value <- names(failing_value_counts)[i] + count <- failing_value_counts[i] + str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") + } + + str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") + } + + return(str_res) + } + +# check_valid_values_continuous <- function(colname, codebook_param, column) { +# +# column <- as.numeric(column[column != "."]) +# possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] +# +# value_format <- strsplit(possible_values_format, " / ")[[1]] +# min_value <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1])))) +# max_value <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1])))) +# if(min_value == ""){ +# min_value <- str_trim(sub(",.*", "", value_format[1])) +# max_value <- str_trim(sub(".*,", "", value_format[1])) +# } +# min_value <- as.double(min_value) +# max_value <- as.double(max_value) +# print(colname) +# print(min_value) +# print(max_value) +# +# failing_values <- column[column < min_value | column > max_value] +# number_of_failing_values <- length(failing_values[!is.na(failing_values)]) +# +# str_res <- "" +# if (number_of_failing_values == 0) { +# str_res <- "No failing values" +# } else { +# range_as_str <- paste(min_value, "-", max_value, "(continuous)") +# +# str_res <- paste(colname, "has", number_of_failing_values, "failing values") +# str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") +# } +# +# return(str_res) +# } +# +# +# check_valid_values_categorical <- function(colname, codebook_param, column) { +# column <- as.numeric(column[column != "."]) +# possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] +# possible_values_list <- str_split(possible_values_format, "/")[[1]] +# +# possible_values_list <- lapply(possible_values_list, str_trim) +# +# str_res <- "" +# min_value <- 0 +# max_value <- 0 +# if (length(possible_values_list[[1]]) == 2) { +# separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]] +# min_value <- strtoi(separate_range[1]) +# max_value <- strtoi(separate_range[2]) +# } else { +# possible_values_list <- lapply(possible_values_list, strtoi)[[1]] +# min_value <- possible_values_list[1] +# max_value <- possible_values_list[length(possible_values_list) - 1] +# } +# +# failing_values <- column[column < min_value | column > max_value] +# number_of_failing_values <- length(failing_values[!is.na(failing_values)]) +# +# if (number_of_failing_values == 0) { +# str_res <- "No failing values" +# } else { +# range_as_str <- paste(min_value, "-", max_value, " (categorical)") +# +# str_res <- paste(colname, "has", number_of_failing_values, "failing values") +# str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") +# } +# +# return(str_res) +# } +# +# check_valid_values_binary <- function(colname, column) { +# column <- as.numeric(column[column != "."]) +# failing_values <- column[column < 0 | column > 1] +# number_of_failing_values <- length(failing_values[!is.na(failing_values)]) +# +# str_res <- "" +# if (number_of_failing_values == 0) +# str_res <- "No failing values" +# else { +# range_as_str <- "0-1 (binary)" +# +# str_res <- paste(colname, "has", number_of_failing_values, "failing values") +# str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") +# } +# +# return(str_res) +# } check_valid_values <- function(valid_colnames, codebook_param){