From 097715bbee92e58682b0d15674e99afb2803210b Mon Sep 17 00:00:00 2001 From: GNajeral <90567992+GNajeral@users.noreply.github.com> Date: Tue, 11 Apr 2023 17:58:37 +0200 Subject: [PATCH] Fixed local script --- valid_variables_script_local.R | 291 +++++++++++++++++---------------- 1 file changed, 147 insertions(+), 144 deletions(-) diff --git a/valid_variables_script_local.R b/valid_variables_script_local.R index 5432ed8..881db4f 100644 --- a/valid_variables_script_local.R +++ b/valid_variables_script_local.R @@ -100,153 +100,163 @@ check_valid_name <- function(col_name){ } -# check_valid_values_continuous <- function(colname , codebook_param , column){ -# -# column <- column[column != "."] -# possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] -# possible_values_list = str_split(possible_values_format , "/")[[1]] -# -# range_as_str <- str_trim(possible_values_list[1]) -# missing_value_format <- str_trim(str_trim(possible_values_list[2])) -# -# separate_range <- str_split(range_as_str , "-")[[1]] -# min_value <- strtoi(separate_range[1]) -# max_value <- strtoi(separate_range[2]) -# -# failing_values <- column[column < min_value | column > max_value] -# number_of_failing_values <- length(failing_values[!is.na(failing_values)]) -# -# str_res <- "" -# if (number_of_failing_values == 0) -# str_res <- "No failing values" -# else{ -# failing_values <- failing_values[!is.na(failing_values)] -# failing_value_counts <- table(failing_values) -# -# str_res <- paste(colname, "has failing values:") -# -# for (i in seq_along(failing_value_counts)) { -# value <- names(failing_value_counts)[i] -# count <- failing_value_counts[i] -# str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") -# } -# -# str_res <- paste(str_res, "should be in range", range_as_str, "(continuous)", sep = " ") -# } -# -# return(str_res) -# } -# -# -# -# check_valid_values_categorical <- function(colname, codebook_param, column) { -# column <- column[column != "."] -# possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] -# possible_values_list <- str_split(possible_values_format, "/")[[1]] -# -# possible_values_list <- lapply(possible_values_list, str_trim) -# -# str_res <- "" -# min_value <- 0 -# max_value <- 0 -# if (length(possible_values_list[[1]]) == 2) { -# separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]] -# min_value <- strtoi(separate_range[1]) -# max_value <- strtoi(separate_range[2]) -# } else { -# possible_values_list <- lapply(possible_values_list, strtoi)[[1]] -# min_value <- possible_values_list[1] -# max_value <- possible_values_list[length(possible_values_list) - 1] -# } -# -# failing_values <- column[column < min_value | column > max_value] -# number_of_failing_values <- length(failing_values[!is.na(failing_values)]) -# -# if (number_of_failing_values == 0) { -# str_res <- "No failing values" -# } else { -# range_as_str <- paste(min_value, "-", max_value, " (categorical)") -# failing_values <- failing_values[!is.na(failing_values)] -# failing_value_counts <- table(failing_values) -# -# str_res <- paste(colname, "has failing values:") -# -# for (i in seq_along(failing_value_counts)) { -# value <- names(failing_value_counts)[i] -# count <- failing_value_counts[i] -# str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") -# } -# -# str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") -# } -# -# return(str_res) -# } -# -# check_valid_values_binary <- function(colname, column) { -# column <- column[column != "."] -# failing_values <- column[column < 0 | column > 1] -# number_of_failing_values <- length(failing_values[!is.na(failing_values)]) -# -# str_res <- "" -# if (number_of_failing_values == 0) -# str_res <- "No failing values" -# else { -# range_as_str <- "0-1 (binary)" -# failing_values <- failing_values[!is.na(failing_values)] -# failing_value_counts <- table(failing_values) -# -# str_res <- paste(colname, "has failing values:") -# -# for (i in seq_along(failing_value_counts)) { -# value <- names(failing_value_counts)[i] -# count <- failing_value_counts[i] -# str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") -# } -# -# str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") -# } -# -# return(str_res) -# } + # check_valid_values_continuous <- function(colname , codebook_param , column){ + # + # column <- as.numeric(column[column != "."]) + # possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] + # + # value_format <- strsplit(possible_values_format, " / ")[[1]] + # min_value <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1])))) + # max_value <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1])))) + # if(min_value == ""){ + # min_value <- str_trim(sub(",.*", "", value_format[1])) + # max_value <- str_trim(sub(".*,", "", value_format[1])) + # } + # min_value <- as.double(min_value) + # max_value <- as.double(max_value) + # print(colname) + # print(min_value) + # print(max_value) + # + # failing_values <- column[column < min_value | column > max_value] + # number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + # + # str_res <- "" + # if (number_of_failing_values == 0) + # str_res <- "No failing values" + # else{ + # failing_values <- failing_values[!is.na(failing_values)] + # failing_value_counts <- table(failing_values) + # + # str_res <- paste(colname, "has failing values:") + # + # for (i in seq_along(failing_value_counts)) { + # value <- names(failing_value_counts)[i] + # count <- failing_value_counts[i] + # str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") + # } + # + # str_res <- paste(str_res, "should be in range", value_format, "(continuous)", sep = " ") + # } + # + # return(str_res) + # } + # + # + # + # check_valid_values_categorical <- function(colname, codebook_param, column) { + # column <- as.numeric(column[column != "."]) + # possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] + # possible_values_list <- str_split(possible_values_format, "/")[[1]] + # + # possible_values_list <- lapply(possible_values_list, str_trim) + # + # str_res <- "" + # min_value <- 0 + # max_value <- 0 + # if (length(possible_values_list[[1]]) == 2) { + # separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]] + # min_value <- strtoi(separate_range[1]) + # max_value <- strtoi(separate_range[2]) + # } else { + # possible_values_list <- lapply(possible_values_list, strtoi)[[1]] + # min_value <- possible_values_list[1] + # max_value <- possible_values_list[length(possible_values_list) - 1] + # } + # + # failing_values <- column[column < min_value | column > max_value] + # number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + # + # if (number_of_failing_values == 0) { + # str_res <- "No failing values" + # } else { + # range_as_str <- paste(min_value, "-", max_value, " (categorical)") + # failing_values <- failing_values[!is.na(failing_values)] + # failing_value_counts <- table(failing_values) + # + # str_res <- paste(colname, "has failing values:") + # + # for (i in seq_along(failing_value_counts)) { + # value <- names(failing_value_counts)[i] + # count <- failing_value_counts[i] + # str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") + # } + # + # str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") + # } + # + # return(str_res) + # } + # + # check_valid_values_binary <- function(colname, column) { + # column <- as.numeric(column[column != "."]) + # failing_values <- column[column < 0 | column > 1] + # number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + # + # str_res <- "" + # if (number_of_failing_values == 0) + # str_res <- "No failing values" + # else { + # range_as_str <- "0-1 (binary)" + # failing_values <- failing_values[!is.na(failing_values)] + # failing_value_counts <- table(failing_values) + # + # str_res <- paste(colname, "has failing values:") + # + # for (i in seq_along(failing_value_counts)) { + # value <- names(failing_value_counts)[i] + # count <- failing_value_counts[i] + # str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") + # } + # + # str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") + # } + # + # return(str_res) + # } check_valid_values_continuous <- function(colname, codebook_param, column) { - - column <- column[column != "."] + + column <- as.numeric(column[column != "."]) possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] - possible_values_list = str_split(possible_values_format, "/")[[1]] - - range_as_str <- str_trim(possible_values_list[1]) - missing_value_format <- str_trim(str_trim(possible_values_list[2])) - - separate_range <- str_split(range_as_str, "-")[[1]] - min_value <- strtoi(separate_range[1]) - max_value <- strtoi(separate_range[2]) - + + value_format <- strsplit(possible_values_format, " / ")[[1]] + min_value <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1])))) + max_value <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1])))) + if(min_value == ""){ + min_value <- str_trim(sub(",.*", "", value_format[1])) + max_value <- str_trim(sub(".*,", "", value_format[1])) + } + min_value <- as.double(min_value) + max_value <- as.double(max_value) + print(colname) + print(min_value) + print(max_value) + failing_values <- column[column < min_value | column > max_value] number_of_failing_values <- length(failing_values[!is.na(failing_values)]) - + str_res <- "" if (number_of_failing_values == 0) { str_res <- "No failing values" } else { range_as_str <- paste(min_value, "-", max_value, "(continuous)") - + str_res <- paste(colname, "has", number_of_failing_values, "failing values") str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") } - + return(str_res) } check_valid_values_categorical <- function(colname, codebook_param, column) { - column <- column[column != "."] + column <- as.numeric(column[column != "."]) possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] possible_values_list <- str_split(possible_values_format, "/")[[1]] - + possible_values_list <- lapply(possible_values_list, str_trim) - + str_res <- "" min_value <- 0 max_value <- 0 @@ -259,43 +269,40 @@ check_valid_values_categorical <- function(colname, codebook_param, column) { min_value <- possible_values_list[1] max_value <- possible_values_list[length(possible_values_list) - 1] } - + failing_values <- column[column < min_value | column > max_value] number_of_failing_values <- length(failing_values[!is.na(failing_values)]) - + if (number_of_failing_values == 0) { str_res <- "No failing values" } else { range_as_str <- paste(min_value, "-", max_value, " (categorical)") - + str_res <- paste(colname, "has", number_of_failing_values, "failing values") str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") } - + return(str_res) } check_valid_values_binary <- function(colname, column) { - column <- column[column != "."] + column <- as.numeric(column[column != "."]) failing_values <- column[column < 0 | column > 1] number_of_failing_values <- length(failing_values[!is.na(failing_values)]) - + str_res <- "" if (number_of_failing_values == 0) str_res <- "No failing values" else { range_as_str <- "0-1 (binary)" - + str_res <- paste(colname, "has", number_of_failing_values, "failing values") str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") } - + return(str_res) } - - - check_valid_values <- function(valid_colnames, codebook_param){ res <- "" @@ -307,13 +314,8 @@ check_valid_values <- function(valid_colnames, codebook_param){ next } - #if("DMRBORN" == name | grepl("DAT", name, fixed=TRUE) | grepl("ISO", name , fixed=TRUE) | grepl("BEF", name, fixed=TRUE)){ - # next - #} - column <- valid_colnames[,i] - # Esto falla si tu codebook no es mismo que new_harmon.csv column_type <- codebook_param$Variable.type[codebook_param$Harmonised.variable.name == name] if (is.na(column_type) ) { @@ -334,7 +336,7 @@ check_valid_values <- function(valid_colnames, codebook_param){ } ) - if (result != "No failing values"){ + if (any(result != "No failing values")){ res <- paste(res , result, sep="\n") } } @@ -344,6 +346,7 @@ check_valid_values <- function(valid_colnames, codebook_param){ } + data_colnames <- as.data.frame(colnames(harmonized_data)) check_valid_columns <- check_column_names(data_colnames) @@ -359,4 +362,4 @@ result <- "" result<-check_valid_values(valid_colnames_with_data, codebook) print(columns_not_valid) cat(result) - + -- 2.24.1