Commit e270001d authored by GNajeral

Updated version of local script giving more precise statistics

parent 8f14a740
......@@ -52,8 +52,8 @@ file_name <- readline("Introduce the name of the file to check the values: ")
harmonized_data <- ""
if (grepl(".csv" , file_name , fixed = TRUE)){
#harmonized_data <- read.csv(file_name)
harmonized_data <- read.csv(file_name, sep = ";")
harmonized_data <- read.csv(file_name)
#harmonized_data <- read.csv(file_name, sep = ";")
}else if (grepl(".xlsx" , file_name , fixed = TRUE)){
harmonized_data <- read.xlsx(file_name)
}
......@@ -103,8 +103,13 @@ check_valid_name <- function(col_name){
check_valid_values_continuous <- function(colname , codebook_param , column){
missing_values_count <- length(column[column == "."])
total_values <- length(column)
column <- as.numeric(column[column != "."])
string_values_count <- length(column[is.na(column)])
possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
value_format <- strsplit(possible_values_format, " / ")[[1]]
......@@ -119,7 +124,7 @@ check_valid_name <- function(col_name){
max_value <- as.double(max_value)
failing_values <- column[column < min_value | column > max_value]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + string_values_count
str_res <- ""
if (number_of_failing_values == 0)
......@@ -132,13 +137,20 @@ check_valid_name <- function(col_name){
str_res <- paste(str_res, paste(number_of_failing_values, ",", sep = ""))
str_res <- paste(str_res, "Missing values:")
str_res <- paste(str_res, paste(missing_values_count, ",", sep = ""))
str_res <- paste(str_res, "should be in the range:", paste(min_value, max_value, sep = "-"), "(continuous),", sep = " ")
first <- names(failing_value_counts)[1]
last <- names(failing_value_counts)[length(failing_value_counts)]
str_res <- paste(str_res, "Values lie in the range:", paste("[", first, " - ",last, "]", sep = "" ))
str_res <- paste(str_res, "Values lie in the range:", paste("[", first, " - ",last, "],", sep = "" ))
str_res <- paste(str_res, "Wrong values constitute at least:", paste(round((number_of_failing_values/(total_values-missing_values_count))*100, 2), "% of the non-missing", sep = "" ))
# if(number_of_failing_values < 30) {
......@@ -160,7 +172,13 @@ check_valid_name <- function(col_name){
check_valid_values_categorical <- function(colname, codebook_param, column) {
missing_values_count <- length(column[column == "."])
total_values <- length(column)
column <- as.numeric(column[column != "."])
string_values_count <- length(column[is.na(column)])
possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
possible_values_list <- str_split(possible_values_format, "/")[[1]]
......@@ -174,14 +192,15 @@ check_valid_name <- function(col_name){
min_value <- strtoi(separate_range[1])
max_value <- strtoi(separate_range[2])
} else {
possible_values_list <- lapply(possible_values_list, strtoi)[[1]]
possible_values_list <- possible_values_list[possible_values_list != "."]
possible_values_list <- lapply(possible_values_list, strtoi)
min_value <- possible_values_list[1]
max_value <- possible_values_list[length(possible_values_list) - 1]
max_value <- possible_values_list[length(possible_values_list)]
}
failing_values <- column[column < min_value | column > max_value]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + string_values_count
if (number_of_failing_values == 0) {
str_res <- "No failing values"
} else {
......@@ -193,13 +212,20 @@ check_valid_name <- function(col_name){
str_res <- paste(str_res, paste(number_of_failing_values, ",", sep = ""))
str_res <- paste(str_res, "Missing values:")
str_res <- paste(str_res, paste(missing_values_count, ",", sep = ""))
str_res <- paste(str_res, "should be in the range:", paste(min_value, max_value, sep = "-"), "(categorical),", sep = " ")
first <- names(failing_value_counts)[1]
last <- names(failing_value_counts)[length(failing_value_counts)]
str_res <- paste(str_res, "Values lie in the range:", paste("[", first, " - ",last, "]", sep = "" ))
str_res <- paste(str_res, "Values lie in the range:", paste("[", first, " - ",last, "],", sep = "" ))
str_res <- paste(str_res, "Wrong values constitute at least:", paste(round((number_of_failing_values/(total_values-missing_values_count))*100, 2), "% of the non-missing", sep = "" ))
# str_res <- paste(colname, "has failing values:")
#
......@@ -216,10 +242,17 @@ check_valid_name <- function(col_name){
}
check_valid_values_binary <- function(colname, column) {
missing_values_count <- length(column[column == "."])
total_values <- length(column)
column <- as.numeric(column[column != "."])
string_values_count <- length(column[is.na(column)])
failing_values <- column[column < 0 | column > 1]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + string_values_count
str_res <- ""
if (number_of_failing_values == 0)
str_res <- "No failing values"
......@@ -227,6 +260,25 @@ check_valid_name <- function(col_name){
range_as_str <- "0-1 (binary)"
failing_values <- failing_values[!is.na(failing_values)]
failing_value_counts <- table(failing_values)
str_res <- paste(colname, "has wrong values:")
str_res <- paste(str_res, paste(number_of_failing_values, ",", sep = ""))
str_res <- paste(str_res, "Missing values:")
str_res <- paste(str_res, paste(missing_values_count, ",", sep = ""))
str_res <- paste(str_res, "should be in the range:", paste("0", "1", sep = "-"), "(categorical),", sep = " ")
first <- names(failing_value_counts)[1]
last <- names(failing_value_counts)[length(failing_value_counts)]
str_res <- paste(str_res, "Values lie in the range:", paste("[", first, " - ",last, "],", sep = "" ))
str_res <- paste(str_res, "Wrong values constitute at least:", paste(round((number_of_failing_values/(total_values-missing_values_count))*100, 2), "% of the non-missing", sep = "" ))
# str_res <- paste(colname, "has failing values:")
#
......@@ -394,6 +446,10 @@ valid_colnames_with_data <- subset(harmonized_data , select = valid_colnames_col
# Report section: run the value checks and print the outcome to the console.
# Placeholder value; it is overwritten by the assignment on the next line.
result <- ""
# check_valid_values() is defined outside this excerpt; presumably it returns
# the per-column validation report as text -- TODO confirm against its body.
result<-check_valid_values(valid_colnames_with_data, codebook)
# First show columns_not_valid exactly as print() renders it.
print(columns_not_valid)
# Split the string into separate elements
# (fixed = TRUE makes the single-space separator a literal, not a regex).
split_columns <- strsplit(columns_not_valid, split = " ", fixed = TRUE)
# Print each element in a separate line
# (unlist() flattens the list returned by strsplit(); cat() separates its two
# arguments with a space, so the output ends with " \n").
cat(paste(unlist(split_columns), collapse = "\n"), "\n")
# cat() writes the report without the quoting and index prefix print() adds.
cat(result)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment