Commit 097715bb authored by GNajeral's avatar GNajeral

Fixed local script

parent a89d6844
...@@ -100,153 +100,163 @@ check_valid_name <- function(col_name){ ...@@ -100,153 +100,163 @@ check_valid_name <- function(col_name){
} }
# check_valid_values_continuous <- function(colname , codebook_param , column){ # check_valid_values_continuous <- function(colname , codebook_param , column){
# #
# column <- column[column != "."] # column <- as.numeric(column[column != "."])
# possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] # possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
# possible_values_list = str_split(possible_values_format , "/")[[1]] #
# # value_format <- strsplit(possible_values_format, " / ")[[1]]
# range_as_str <- str_trim(possible_values_list[1]) # min_value <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1]))))
# missing_value_format <- str_trim(str_trim(possible_values_list[2])) # max_value <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1]))))
# # if(min_value == ""){
# separate_range <- str_split(range_as_str , "-")[[1]] # min_value <- str_trim(sub(",.*", "", value_format[1]))
# min_value <- strtoi(separate_range[1]) # max_value <- str_trim(sub(".*,", "", value_format[1]))
# max_value <- strtoi(separate_range[2]) # }
# # min_value <- as.double(min_value)
# failing_values <- column[column < min_value | column > max_value] # max_value <- as.double(max_value)
# number_of_failing_values <- length(failing_values[!is.na(failing_values)]) # print(colname)
# # print(min_value)
# str_res <- "" # print(max_value)
# if (number_of_failing_values == 0) #
# str_res <- "No failing values" # failing_values <- column[column < min_value | column > max_value]
# else{ # number_of_failing_values <- length(failing_values[!is.na(failing_values)])
# failing_values <- failing_values[!is.na(failing_values)] #
# failing_value_counts <- table(failing_values) # str_res <- ""
# # if (number_of_failing_values == 0)
# str_res <- paste(colname, "has failing values:") # str_res <- "No failing values"
# # else{
# for (i in seq_along(failing_value_counts)) { # failing_values <- failing_values[!is.na(failing_values)]
# value <- names(failing_value_counts)[i] # failing_value_counts <- table(failing_values)
# count <- failing_value_counts[i] #
# str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") # str_res <- paste(colname, "has failing values:")
# } #
# # for (i in seq_along(failing_value_counts)) {
# str_res <- paste(str_res, "should be in range", range_as_str, "(continuous)", sep = " ") # value <- names(failing_value_counts)[i]
# } # count <- failing_value_counts[i]
# # str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ")
# return(str_res) # }
# } #
# # str_res <- paste(str_res, "should be in range", value_format, "(continuous)", sep = " ")
# # }
# #
# check_valid_values_categorical <- function(colname, codebook_param, column) { # return(str_res)
# column <- column[column != "."] # }
# possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] #
# possible_values_list <- str_split(possible_values_format, "/")[[1]] #
# #
# possible_values_list <- lapply(possible_values_list, str_trim) # check_valid_values_categorical <- function(colname, codebook_param, column) {
# # column <- as.numeric(column[column != "."])
# str_res <- "" # possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
# min_value <- 0 # possible_values_list <- str_split(possible_values_format, "/")[[1]]
# max_value <- 0 #
# if (length(possible_values_list[[1]]) == 2) { # possible_values_list <- lapply(possible_values_list, str_trim)
# separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]] #
# min_value <- strtoi(separate_range[1]) # str_res <- ""
# max_value <- strtoi(separate_range[2]) # min_value <- 0
# } else { # max_value <- 0
# possible_values_list <- lapply(possible_values_list, strtoi)[[1]] # if (length(possible_values_list[[1]]) == 2) {
# min_value <- possible_values_list[1] # separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]]
# max_value <- possible_values_list[length(possible_values_list) - 1] # min_value <- strtoi(separate_range[1])
# } # max_value <- strtoi(separate_range[2])
# # } else {
# failing_values <- column[column < min_value | column > max_value] # possible_values_list <- lapply(possible_values_list, strtoi)[[1]]
# number_of_failing_values <- length(failing_values[!is.na(failing_values)]) # min_value <- possible_values_list[1]
# # max_value <- possible_values_list[length(possible_values_list) - 1]
# if (number_of_failing_values == 0) { # }
# str_res <- "No failing values" #
# } else { # failing_values <- column[column < min_value | column > max_value]
# range_as_str <- paste(min_value, "-", max_value, " (categorical)") # number_of_failing_values <- length(failing_values[!is.na(failing_values)])
# failing_values <- failing_values[!is.na(failing_values)] #
# failing_value_counts <- table(failing_values) # if (number_of_failing_values == 0) {
# # str_res <- "No failing values"
# str_res <- paste(colname, "has failing values:") # } else {
# # range_as_str <- paste(min_value, "-", max_value, " (categorical)")
# for (i in seq_along(failing_value_counts)) { # failing_values <- failing_values[!is.na(failing_values)]
# value <- names(failing_value_counts)[i] # failing_value_counts <- table(failing_values)
# count <- failing_value_counts[i] #
# str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") # str_res <- paste(colname, "has failing values:")
# } #
# # for (i in seq_along(failing_value_counts)) {
# str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") # value <- names(failing_value_counts)[i]
# } # count <- failing_value_counts[i]
# # str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ")
# return(str_res) # }
# } #
# # str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
# check_valid_values_binary <- function(colname, column) { # }
# column <- column[column != "."] #
# failing_values <- column[column < 0 | column > 1] # return(str_res)
# number_of_failing_values <- length(failing_values[!is.na(failing_values)]) # }
# #
# str_res <- "" # check_valid_values_binary <- function(colname, column) {
# if (number_of_failing_values == 0) # column <- as.numeric(column[column != "."])
# str_res <- "No failing values" # failing_values <- column[column < 0 | column > 1]
# else { # number_of_failing_values <- length(failing_values[!is.na(failing_values)])
# range_as_str <- "0-1 (binary)" #
# failing_values <- failing_values[!is.na(failing_values)] # str_res <- ""
# failing_value_counts <- table(failing_values) # if (number_of_failing_values == 0)
# # str_res <- "No failing values"
# str_res <- paste(colname, "has failing values:") # else {
# # range_as_str <- "0-1 (binary)"
# for (i in seq_along(failing_value_counts)) { # failing_values <- failing_values[!is.na(failing_values)]
# value <- names(failing_value_counts)[i] # failing_value_counts <- table(failing_values)
# count <- failing_value_counts[i] #
# str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") # str_res <- paste(colname, "has failing values:")
# } #
# # for (i in seq_along(failing_value_counts)) {
# str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") # value <- names(failing_value_counts)[i]
# } # count <- failing_value_counts[i]
# # str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ")
# return(str_res) # }
# } #
# str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
# }
#
# return(str_res)
# }
#' Check a continuous column against the range declared in the codebook
#'
#' The range is read from `codebook_param$Possible.values.format` for the row
#' whose `Harmonised.variable.name` equals `colname`. Only the part before
#' " / " is used. It is parsed as "<min>-<max>", with "," accepted as the
#' decimal mark. When that split yields an empty minimum (the spec starts
#' with "-"), the spec is re-split on "," instead, e.g. "-5,5".
#'
#' @param colname Name of the harmonised variable being checked.
#' @param codebook_param Data frame with the columns
#'   `Harmonised.variable.name` and `Possible.values.format`.
#' @param column Vector of raw values; "." entries are ignored and entries
#'   that cannot be converted to a number are not counted.
#' @return "No failing values", or a one-line message with the number of
#'   values outside the range.
check_valid_values_continuous <- function(colname, codebook_param, column) {
  # as.numeric() on a factor returns the level codes, not the labels.
  if (is.factor(column)) {
    column <- as.character(column)
  }
  values <- as.numeric(column[column != "."])
  values <- values[!is.na(values)]

  is_target <- codebook_param$Harmonised.variable.name == colname
  possible_values_format <- as.character(
    codebook_param$Possible.values.format[is_target]
  )
  range_spec <- NA_character_
  if (length(possible_values_format) > 0) {
    range_spec <- strsplit(possible_values_format, " / ")[[1]][1]
  }
  if (is.na(range_spec)) {
    # Fail with a readable message instead of an obscure subscript error.
    stop(
      "No usable 'Possible.values.format' entry for ", colname,
      call. = FALSE
    )
  }

  min_value <- trimws(gsub(",", ".", sub("-.*", "", range_spec)))
  max_value <- trimws(gsub(",", ".", sub(".*-", "", range_spec)))
  if (min_value == "") {
    # The spec starts with "-": treat "," as the separator between bounds.
    min_value <- trimws(sub(",.*", "", range_spec))
    max_value <- trimws(sub(".*,", "", range_spec))
  }
  min_value <- as.double(min_value)
  max_value <- as.double(max_value)

  number_of_failing_values <- sum(
    values < min_value | values > max_value,
    na.rm = TRUE
  )
  if (number_of_failing_values == 0) {
    return("No failing values")
  }

  range_as_str <- paste(min_value, "-", max_value, "(continuous)")
  paste(
    colname, "has", number_of_failing_values, "failing values",
    "should be in range", range_as_str
  )
}
check_valid_values_categorical <- function(colname, codebook_param, column) { check_valid_values_categorical <- function(colname, codebook_param, column) {
column <- column[column != "."] column <- as.numeric(column[column != "."])
possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
possible_values_list <- str_split(possible_values_format, "/")[[1]] possible_values_list <- str_split(possible_values_format, "/")[[1]]
possible_values_list <- lapply(possible_values_list, str_trim) possible_values_list <- lapply(possible_values_list, str_trim)
str_res <- "" str_res <- ""
min_value <- 0 min_value <- 0
max_value <- 0 max_value <- 0
...@@ -259,43 +269,40 @@ check_valid_values_categorical <- function(colname, codebook_param, column) { ...@@ -259,43 +269,40 @@ check_valid_values_categorical <- function(colname, codebook_param, column) {
min_value <- possible_values_list[1] min_value <- possible_values_list[1]
max_value <- possible_values_list[length(possible_values_list) - 1] max_value <- possible_values_list[length(possible_values_list) - 1]
} }
failing_values <- column[column < min_value | column > max_value] failing_values <- column[column < min_value | column > max_value]
number_of_failing_values <- length(failing_values[!is.na(failing_values)]) number_of_failing_values <- length(failing_values[!is.na(failing_values)])
if (number_of_failing_values == 0) { if (number_of_failing_values == 0) {
str_res <- "No failing values" str_res <- "No failing values"
} else { } else {
range_as_str <- paste(min_value, "-", max_value, " (categorical)") range_as_str <- paste(min_value, "-", max_value, " (categorical)")
str_res <- paste(colname, "has", number_of_failing_values, "failing values") str_res <- paste(colname, "has", number_of_failing_values, "failing values")
str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
} }
return(str_res) return(str_res)
} }
#' Check that a binary column only contains 0 or 1
#'
#' @param colname Name of the harmonised variable being checked; used only
#'   in the returned message.
#' @param column Vector of raw values; "." entries are ignored and entries
#'   that cannot be converted to a number are not counted.
#' @return "No failing values", or a one-line message with the number of
#'   values that are neither 0 nor 1.
check_valid_values_binary <- function(colname, column) {
  # as.numeric() on a factor returns the level codes, not the labels.
  if (is.factor(column)) {
    column <- as.character(column)
  }
  values <- as.numeric(column[column != "."])
  values <- values[!is.na(values)]

  # Set membership rather than a 0..1 range test, so 0.5 is rejected too.
  number_of_failing_values <- sum(!values %in% c(0, 1))
  if (number_of_failing_values == 0) {
    return("No failing values")
  }

  range_as_str <- "0-1 (binary)"
  paste(
    colname, "has", number_of_failing_values, "failing values",
    "should be in range", range_as_str
  )
}
check_valid_values <- function(valid_colnames, codebook_param){ check_valid_values <- function(valid_colnames, codebook_param){
res <- "" res <- ""
...@@ -307,13 +314,8 @@ check_valid_values <- function(valid_colnames, codebook_param){ ...@@ -307,13 +314,8 @@ check_valid_values <- function(valid_colnames, codebook_param){
next next
} }
#if("DMRBORN" == name | grepl("DAT", name, fixed=TRUE) | grepl("ISO", name , fixed=TRUE) | grepl("BEF", name, fixed=TRUE)){
# next
#}
column <- valid_colnames[,i] column <- valid_colnames[,i]
# Esto falla si tu codebook no es mismo que new_harmon.csv
column_type <- codebook_param$Variable.type[codebook_param$Harmonised.variable.name == name] column_type <- codebook_param$Variable.type[codebook_param$Harmonised.variable.name == name]
if (is.na(column_type) ) { if (is.na(column_type) ) {
...@@ -334,7 +336,7 @@ check_valid_values <- function(valid_colnames, codebook_param){ ...@@ -334,7 +336,7 @@ check_valid_values <- function(valid_colnames, codebook_param){
} }
) )
if (result != "No failing values"){ if (any(result != "No failing values")){
res <- paste(res , result, sep="\n") res <- paste(res , result, sep="\n")
} }
} }
...@@ -344,6 +346,7 @@ check_valid_values <- function(valid_colnames, codebook_param){ ...@@ -344,6 +346,7 @@ check_valid_values <- function(valid_colnames, codebook_param){
} }
data_colnames <- as.data.frame(colnames(harmonized_data)) data_colnames <- as.data.frame(colnames(harmonized_data))
check_valid_columns <- check_column_names(data_colnames) check_valid_columns <- check_column_names(data_colnames)
...@@ -359,4 +362,4 @@ result <- "" ...@@ -359,4 +362,4 @@ result <- ""
result<-check_valid_values(valid_colnames_with_data, codebook) result<-check_valid_values(valid_colnames_with_data, codebook)
print(columns_not_valid) print(columns_not_valid)
cat(result) cat(result)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment