From e778774de28eaf0ce871f931aec811bae752dd20 Mon Sep 17 00:00:00 2001 From: GNajeral <90567992+GNajeral@users.noreply.github.com> Date: Mon, 17 Apr 2023 15:38:05 +0200 Subject: [PATCH] updated local script --- valid_variables_script_local.R | 141 ++++++++++----------------------- 1 file changed, 42 insertions(+), 99 deletions(-) diff --git a/valid_variables_script_local.R b/valid_variables_script_local.R index 02f7c9e..bf0b44e 100644 --- a/valid_variables_script_local.R +++ b/valid_variables_script_local.R @@ -106,6 +106,9 @@ check_valid_name <- function(col_name){ missing_values_count <- length(column[column == "."]) total_values <- length(column) + missing_values_percentage <- round((missing_values_count/(total_values))*100, 2) + + column <- as.numeric(column[column != "."]) string_values_count <- length(column[is.na(column)]) @@ -127,9 +130,14 @@ check_valid_name <- function(col_name){ number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + string_values_count str_res <- "" - if (number_of_failing_values == 0) - str_res <- "No failing values" - else{ + if (number_of_failing_values == 0){ + if(missing_values_percentage > 90){ + str_res <- paste(colname, "Missing values:", missing_values_count, "-", paste(round((missing_values_count/(total_values))*100, 2), "%", sep = ""), "of the total") + } + else{ + str_res <- paste("No failing values") + } + }else{ failing_values <- failing_values[!is.na(failing_values)] failing_value_counts <- table(failing_values) @@ -149,7 +157,9 @@ check_valid_name <- function(col_name){ str_res <- paste(str_res, "Values lie in the range:", paste("[", first, " - ",last, "],", sep = "" )) - str_res <- paste(str_res, "Wrong values constitute at least:", paste(round((number_of_failing_values/(total_values-missing_values_count))*100, 2), "% of the non-missing", sep = "" )) + str_res <- paste(str_res, "Missing:", paste(round((missing_values_count/(total_values))*100, 2), "%,", sep = "" )) + + str_res <- paste(str_res, "Wrong values:", paste(round((number_of_failing_values/(total_values-missing_values_count))*100, 2), "% of the non-missing", sep = "" )) @@ -176,6 +186,9 @@ check_valid_name <- function(col_name){ missing_values_count <- length(column[column == "."]) total_values <- length(column) + missing_values_percentage <- round((missing_values_count/(total_values))*100, 2) + + column <- as.numeric(column[column != "."]) string_values_count <- length(column[is.na(column)]) @@ -201,8 +214,13 @@ check_valid_name <- function(col_name){ failing_values <- column[column < min_value | column > max_value] number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + string_values_count - if (number_of_failing_values == 0) { - str_res <- "No failing values" + if (number_of_failing_values == 0){ + if(missing_values_percentage > 90){ + str_res <- paste(colname, "Missing values:", missing_values_count, "-", paste(round((missing_values_count/(total_values))*100, 2), "%", sep = ""), "of the total") + } + else{ + str_res <- paste("No failing values") + } } else { range_as_str <- paste(min_value, "-", max_value, " (categorical)") failing_values <- failing_values[!is.na(failing_values)] @@ -224,6 +242,8 @@ check_valid_name <- function(col_name){ str_res <- paste(str_res, "Values lie in the range:", paste("[", first, " - ",last, "],", sep = "" )) + str_res <- paste(str_res, "Missing:", paste(round((missing_values_count/(total_values))*100, 2), "%,", sep = "" )) + str_res <- paste(str_res, "Wrong values constitute at least:", paste(round((number_of_failing_values/(total_values-missing_values_count))*100, 2), "% of the non-missing", sep = "" )) @@ -247,6 +267,8 @@ check_valid_name <- function(col_name){ total_values <- length(column) + missing_values_percentage <- round((missing_values_count/(total_values))*100, 2) + column <- as.numeric(column[column != "."]) string_values_count <- length(column[is.na(column)]) @@ -254,8 +276,14 @@ check_valid_name <- function(col_name){ number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + string_values_count str_res <- "" - if (number_of_failing_values == 0) - str_res <- "No failing values" + if (number_of_failing_values == 0){ + if(missing_values_percentage > 90){ + str_res <- paste(colname, "Missing values:", missing_values_count, "-", paste(round((missing_values_count/(total_values))*100, 2), "%", sep = ""), "of the total") + } + else{ + str_res <- paste("No failing values") + } + } else { range_as_str <- "0-1 (binary)" failing_values <- failing_values[!is.na(failing_values)] @@ -269,7 +297,7 @@ check_valid_name <- function(col_name){ str_res <- paste(str_res, paste(missing_values_count, ",", sep = "")) - str_res <- paste(str_res, "should be in the range:", paste("0", "1", sep = "-"), "(categorical),", sep = " ") + str_res <- paste(str_res, "should be in the range:", paste("0", "1", sep = "-"), "(binary),", sep = " ") first <- names(failing_value_counts)[1] @@ -277,7 +305,9 @@ check_valid_name <- function(col_name){ str_res <- paste(str_res, "Values lie in the range:", paste("[", first, " - ",last, "],", sep = "" )) - str_res <- paste(str_res, "Wrong values constitute at least:", paste(round((number_of_failing_values/(total_values-missing_values_count))*100, 2), "% of the non-missing", sep = "" )) + str_res <- paste(str_res, "Missing:", paste(round((missing_values_count/(total_values))*100, 2), "%,", sep = "" )) + + str_res <- paste(str_res, "Wrong values:", paste(round((number_of_failing_values/(total_values-missing_values_count))*100, 2), "% of the non-missing", sep = "" )) # str_res <- paste(colname, "has failing values:") @@ -294,97 +324,10 @@ check_valid_name <- function(col_name){ return(str_res) } -# check_valid_values_continuous <- function(colname, codebook_param, column) { -# -# column <- as.numeric(column[column != "."]) -# possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] -# -# value_format <- strsplit(possible_values_format, " / ")[[1]] -# min_value <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1])))) -# max_value <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1])))) -# if(min_value == ""){ -# min_value <- str_trim(sub(",.*", "", value_format[1])) -# max_value <- str_trim(sub(".*,", "", value_format[1])) -# } -# min_value <- as.double(min_value) -# max_value <- as.double(max_value) -# print(colname) -# print(min_value) -# print(max_value) -# -# failing_values <- column[column < min_value | column > max_value] -# number_of_failing_values <- length(failing_values[!is.na(failing_values)]) -# -# str_res <- "" -# if (number_of_failing_values == 0) { -# str_res <- "No failing values" -# } else { -# range_as_str <- paste(min_value, "-", max_value, "(continuous)") -# -# str_res <- paste(colname, "has", number_of_failing_values, "failing values") -# str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") -# } -# -# return(str_res) -# } -# -# -# check_valid_values_categorical <- function(colname, codebook_param, column) { -# column <- as.numeric(column[column != "."]) -# possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] -# possible_values_list <- str_split(possible_values_format, "/")[[1]] -# -# possible_values_list <- lapply(possible_values_list, str_trim) -# -# str_res <- "" -# min_value <- 0 -# max_value <- 0 -# if (length(possible_values_list[[1]]) == 2) { -# separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]] -# min_value <- strtoi(separate_range[1]) -# max_value <- strtoi(separate_range[2]) -# } else { -# possible_values_list <- lapply(possible_values_list, strtoi)[[1]] -# min_value <- possible_values_list[1] -# max_value <- possible_values_list[length(possible_values_list) - 1] -# } -# -# failing_values <- column[column < min_value | column > max_value] -# number_of_failing_values <- length(failing_values[!is.na(failing_values)]) -# -# if (number_of_failing_values == 0) { -# str_res <- "No failing values" -# } else { -# range_as_str <- paste(min_value, "-", max_value, " (categorical)") -# -# str_res <- paste(colname, "has", number_of_failing_values, "failing values") -# str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") -# } -# -# return(str_res) -# } -# -# check_valid_values_binary <- function(colname, column) { -# column <- as.numeric(column[column != "."]) -# failing_values <- column[column < 0 | column > 1] -# number_of_failing_values <- length(failing_values[!is.na(failing_values)]) -# -# str_res <- "" -# if (number_of_failing_values == 0) -# str_res <- "No failing values" -# else { -# range_as_str <- "0-1 (binary)" -# -# str_res <- paste(colname, "has", number_of_failing_values, "failing values") -# str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") -# } -# -# return(str_res) -# } check_valid_values <- function(valid_colnames, codebook_param){ - - res <- "" + + res <- paste("Total patients:", length(valid_colnames[,1])) for(i in 1:(ncol(valid_colnames))){ name <- names(valid_colnames)[i] -- 2.24.1