Commit 2c02e92b authored by GNajeral

Some minor updates to the local script for specific use cases

parent 097715bb
...@@ -51,8 +51,8 @@ file_name <- readline("Introduce the name of the file to check the values: ") ...@@ -51,8 +51,8 @@ file_name <- readline("Introduce the name of the file to check the values: ")
harmonized_data <- "" harmonized_data <- ""
if (grepl(".csv" , file_name , fixed = TRUE)){ if (grepl(".csv" , file_name , fixed = TRUE)){
harmonized_data <- read.csv(file_name) #harmonized_data <- read.csv(file_name)
#harmonized_data <- read.csv(file_name, sep = ";") harmonized_data <- read.csv(file_name, sep = ";")
}else if (grepl(".xlsx" , file_name , fixed = TRUE)){ }else if (grepl(".xlsx" , file_name , fixed = TRUE)){
harmonized_data <- read.xlsx(file_name) harmonized_data <- read.xlsx(file_name)
} }
...@@ -100,208 +100,208 @@ check_valid_name <- function(col_name){ ...@@ -100,208 +100,208 @@ check_valid_name <- function(col_name){
} }
# check_valid_values_continuous <- function(colname , codebook_param , column){ check_valid_values_continuous <- function(colname , codebook_param , column){
#
# column <- as.numeric(column[column != "."]) column <- as.numeric(column[column != "."])
# possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
#
# value_format <- strsplit(possible_values_format, " / ")[[1]] value_format <- strsplit(possible_values_format, " / ")[[1]]
# min_value <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1])))) min_value <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1]))))
# max_value <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1])))) max_value <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1]))))
# if(min_value == ""){ if(min_value == ""){
# min_value <- str_trim(sub(",.*", "", value_format[1])) min_value <- str_trim(sub(",.*", "", value_format[1]))
# max_value <- str_trim(sub(".*,", "", value_format[1])) max_value <- str_trim(sub(".*,", "", value_format[1]))
# } }
# min_value <- as.double(min_value) min_value <- as.double(min_value)
# max_value <- as.double(max_value) max_value <- as.double(max_value)
# print(colname) print(colname)
# print(min_value) print(min_value)
# print(max_value) print(max_value)
#
# failing_values <- column[column < min_value | column > max_value] failing_values <- column[column < min_value | column > max_value]
# number_of_failing_values <- length(failing_values[!is.na(failing_values)]) number_of_failing_values <- length(failing_values[!is.na(failing_values)])
#
# str_res <- "" str_res <- ""
# if (number_of_failing_values == 0) if (number_of_failing_values == 0)
# str_res <- "No failing values" str_res <- "No failing values"
# else{ else{
# failing_values <- failing_values[!is.na(failing_values)] failing_values <- failing_values[!is.na(failing_values)]
# failing_value_counts <- table(failing_values) failing_value_counts <- table(failing_values)
#
# str_res <- paste(colname, "has failing values:") str_res <- paste(colname, "has failing values:")
#
# for (i in seq_along(failing_value_counts)) { for (i in seq_along(failing_value_counts)) {
# value <- names(failing_value_counts)[i] value <- names(failing_value_counts)[i]
# count <- failing_value_counts[i] count <- failing_value_counts[i]
# str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ")
# } }
#
# str_res <- paste(str_res, "should be in range", value_format, "(continuous)", sep = " ") str_res <- paste(str_res, "should be in range", value_format, "(continuous)", sep = " ")
# } }
#
# return(str_res) return(str_res)
# } }
#
#
#
# check_valid_values_categorical <- function(colname, codebook_param, column) { check_valid_values_categorical <- function(colname, codebook_param, column) {
# column <- as.numeric(column[column != "."]) column <- as.numeric(column[column != "."])
# possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
# possible_values_list <- str_split(possible_values_format, "/")[[1]] possible_values_list <- str_split(possible_values_format, "/")[[1]]
#
# possible_values_list <- lapply(possible_values_list, str_trim) possible_values_list <- lapply(possible_values_list, str_trim)
#
# str_res <- "" str_res <- ""
# min_value <- 0 min_value <- 0
# max_value <- 0 max_value <- 0
# if (length(possible_values_list[[1]]) == 2) { if (length(possible_values_list[[1]]) == 2) {
# separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]] separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]]
# min_value <- strtoi(separate_range[1]) min_value <- strtoi(separate_range[1])
# max_value <- strtoi(separate_range[2]) max_value <- strtoi(separate_range[2])
# } else { } else {
# possible_values_list <- lapply(possible_values_list, strtoi)[[1]] possible_values_list <- lapply(possible_values_list, strtoi)[[1]]
# min_value <- possible_values_list[1] min_value <- possible_values_list[1]
# max_value <- possible_values_list[length(possible_values_list) - 1] max_value <- possible_values_list[length(possible_values_list) - 1]
# } }
#
# failing_values <- column[column < min_value | column > max_value] failing_values <- column[column < min_value | column > max_value]
# number_of_failing_values <- length(failing_values[!is.na(failing_values)]) number_of_failing_values <- length(failing_values[!is.na(failing_values)])
#
# if (number_of_failing_values == 0) { if (number_of_failing_values == 0) {
# str_res <- "No failing values" str_res <- "No failing values"
# } else { } else {
# range_as_str <- paste(min_value, "-", max_value, " (categorical)") range_as_str <- paste(min_value, "-", max_value, " (categorical)")
# failing_values <- failing_values[!is.na(failing_values)] failing_values <- failing_values[!is.na(failing_values)]
# failing_value_counts <- table(failing_values) failing_value_counts <- table(failing_values)
#
# str_res <- paste(colname, "has failing values:") str_res <- paste(colname, "has failing values:")
#
# for (i in seq_along(failing_value_counts)) { for (i in seq_along(failing_value_counts)) {
# value <- names(failing_value_counts)[i] value <- names(failing_value_counts)[i]
# count <- failing_value_counts[i] count <- failing_value_counts[i]
# str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ")
# } }
#
# str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
# } }
#
# return(str_res) return(str_res)
# } }
#
# check_valid_values_binary <- function(colname, column) { check_valid_values_binary <- function(colname, column) {
# column <- as.numeric(column[column != "."]) column <- as.numeric(column[column != "."])
# failing_values <- column[column < 0 | column > 1] failing_values <- column[column < 0 | column > 1]
# number_of_failing_values <- length(failing_values[!is.na(failing_values)]) number_of_failing_values <- length(failing_values[!is.na(failing_values)])
#
# str_res <- "" str_res <- ""
# if (number_of_failing_values == 0) if (number_of_failing_values == 0)
# str_res <- "No failing values" str_res <- "No failing values"
# else { else {
# range_as_str <- "0-1 (binary)" range_as_str <- "0-1 (binary)"
# failing_values <- failing_values[!is.na(failing_values)] failing_values <- failing_values[!is.na(failing_values)]
# failing_value_counts <- table(failing_values) failing_value_counts <- table(failing_values)
#
# str_res <- paste(colname, "has failing values:") str_res <- paste(colname, "has failing values:")
#
# for (i in seq_along(failing_value_counts)) { for (i in seq_along(failing_value_counts)) {
# value <- names(failing_value_counts)[i] value <- names(failing_value_counts)[i]
# count <- failing_value_counts[i] count <- failing_value_counts[i]
# str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ")
# } }
#
# str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
# } }
#
# return(str_res) return(str_res)
# } }
check_valid_values_continuous <- function(colname, codebook_param, column) { # check_valid_values_continuous <- function(colname, codebook_param, column) {
#
column <- as.numeric(column[column != "."]) # column <- as.numeric(column[column != "."])
possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] # possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
#
value_format <- strsplit(possible_values_format, " / ")[[1]] # value_format <- strsplit(possible_values_format, " / ")[[1]]
min_value <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1])))) # min_value <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1]))))
max_value <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1])))) # max_value <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1]))))
if(min_value == ""){ # if(min_value == ""){
min_value <- str_trim(sub(",.*", "", value_format[1])) # min_value <- str_trim(sub(",.*", "", value_format[1]))
max_value <- str_trim(sub(".*,", "", value_format[1])) # max_value <- str_trim(sub(".*,", "", value_format[1]))
} # }
min_value <- as.double(min_value) # min_value <- as.double(min_value)
max_value <- as.double(max_value) # max_value <- as.double(max_value)
print(colname) # print(colname)
print(min_value) # print(min_value)
print(max_value) # print(max_value)
#
failing_values <- column[column < min_value | column > max_value] # failing_values <- column[column < min_value | column > max_value]
number_of_failing_values <- length(failing_values[!is.na(failing_values)]) # number_of_failing_values <- length(failing_values[!is.na(failing_values)])
#
str_res <- "" # str_res <- ""
if (number_of_failing_values == 0) { # if (number_of_failing_values == 0) {
str_res <- "No failing values" # str_res <- "No failing values"
} else { # } else {
range_as_str <- paste(min_value, "-", max_value, "(continuous)") # range_as_str <- paste(min_value, "-", max_value, "(continuous)")
#
str_res <- paste(colname, "has", number_of_failing_values, "failing values") # str_res <- paste(colname, "has", number_of_failing_values, "failing values")
str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") # str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
} # }
#
return(str_res) # return(str_res)
} # }
#
#
check_valid_values_categorical <- function(colname, codebook_param, column) { # check_valid_values_categorical <- function(colname, codebook_param, column) {
column <- as.numeric(column[column != "."]) # column <- as.numeric(column[column != "."])
possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] # possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
possible_values_list <- str_split(possible_values_format, "/")[[1]] # possible_values_list <- str_split(possible_values_format, "/")[[1]]
#
possible_values_list <- lapply(possible_values_list, str_trim) # possible_values_list <- lapply(possible_values_list, str_trim)
#
str_res <- "" # str_res <- ""
min_value <- 0 # min_value <- 0
max_value <- 0 # max_value <- 0
if (length(possible_values_list[[1]]) == 2) { # if (length(possible_values_list[[1]]) == 2) {
separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]] # separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]]
min_value <- strtoi(separate_range[1]) # min_value <- strtoi(separate_range[1])
max_value <- strtoi(separate_range[2]) # max_value <- strtoi(separate_range[2])
} else { # } else {
possible_values_list <- lapply(possible_values_list, strtoi)[[1]] # possible_values_list <- lapply(possible_values_list, strtoi)[[1]]
min_value <- possible_values_list[1] # min_value <- possible_values_list[1]
max_value <- possible_values_list[length(possible_values_list) - 1] # max_value <- possible_values_list[length(possible_values_list) - 1]
} # }
#
failing_values <- column[column < min_value | column > max_value] # failing_values <- column[column < min_value | column > max_value]
number_of_failing_values <- length(failing_values[!is.na(failing_values)]) # number_of_failing_values <- length(failing_values[!is.na(failing_values)])
#
if (number_of_failing_values == 0) { # if (number_of_failing_values == 0) {
str_res <- "No failing values" # str_res <- "No failing values"
} else { # } else {
range_as_str <- paste(min_value, "-", max_value, " (categorical)") # range_as_str <- paste(min_value, "-", max_value, " (categorical)")
#
str_res <- paste(colname, "has", number_of_failing_values, "failing values") # str_res <- paste(colname, "has", number_of_failing_values, "failing values")
str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") # str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
} # }
#
return(str_res) # return(str_res)
} # }
#
check_valid_values_binary <- function(colname, column) { # check_valid_values_binary <- function(colname, column) {
column <- as.numeric(column[column != "."]) # column <- as.numeric(column[column != "."])
failing_values <- column[column < 0 | column > 1] # failing_values <- column[column < 0 | column > 1]
number_of_failing_values <- length(failing_values[!is.na(failing_values)]) # number_of_failing_values <- length(failing_values[!is.na(failing_values)])
#
str_res <- "" # str_res <- ""
if (number_of_failing_values == 0) # if (number_of_failing_values == 0)
str_res <- "No failing values" # str_res <- "No failing values"
else { # else {
range_as_str <- "0-1 (binary)" # range_as_str <- "0-1 (binary)"
#
str_res <- paste(colname, "has", number_of_failing_values, "failing values") # str_res <- paste(colname, "has", number_of_failing_values, "failing values")
str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") # str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
} # }
#
return(str_res) # return(str_res)
} # }
check_valid_values <- function(valid_colnames, codebook_param){ check_valid_values <- function(valid_colnames, codebook_param){
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment