rm(list=ls()) dir_name <- readline("Introduce the name of the directory please: ") setwd(dir_name) source("dependency_installer.R") dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient", "openxlsx") install_dependencies(dep_list) #source("connection_parameters.R") #source("necessary_functions_connection.R") codebook <- read.csv("new_harmon.csv" , sep = ",") codebook_col_names <- as.data.frame(codebook$Harmonised.variable.name) names(codebook_col_names) <- c("col_names") setwd(paste(dir_name ,"/harmonized_data", sep="")) file_name <- readline("Introduce the name of the file to check the values: ") harmonized_data <- "" if (grepl(".csv" , file_name , fixed = TRUE)){ harmonized_data <- read.csv(file_name) }else if (grepl(".xlsx" , file_name , fixed = TRUE)){ harmonized_data <- read.xlsx(file_name) } #---------------------------------------------------------------------------- #Test if column names are valid check_column_names <- function(col_names){ str_res <- "The column names:" valid_colnames <- c() repeated_colnames <- c() for(i in 1:(nrow(col_names))){ col_name <- col_names[i,1] number_of_column <- check_valid_name(col_name) if( number_of_column == 0){ str_res<- paste(str_res, col_name, sep=" ") }else if (number_of_column == 1){ valid_colnames = c(valid_colnames, col_name) }else{ repeated_colnames = c(repeated_colnames , col_name) } } str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ") new_list <- list("not_colnames" = str_res , "colnames" = valid_colnames , "repeated_colnames" = repeated_colnames) return (new_list) } #Test if a single variable name is valid check_valid_name <- function(col_name){ valid <- 0 if(col_name %in% codebook_col_names$col_names){ valid <- length(grep(col_name, names(harmonized_data))) } return (valid) } check_valid_values_continuous <- function(colname , codebook_param , column){ column <- column[column != "."] possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] possible_values_list = str_split(possible_values_format , "/")[[1]] # Fallará cuando el codebook no tenga min-max / . range_as_str <- str_trim(possible_values_list[1]) missing_value_format <- str_trim(str_trim(possible_values_list[2])) separate_range <- str_split(range_as_str , "-")[[1]] min_value <- strtoi(separate_range[1]) max_value <- strtoi(separate_range[2]) failing_values <- column[column < min_value | column > max_value] number_of_failing_values <- length(failing_values[!is.na(failing_values)]) str_res <- "" if (number_of_failing_values == 0) str_res <- "No failing values" else{ failing_values <- failing_values[!is.na(failing_values)] str_res <- paste("The failing values of column ", colname , paste(unlist(failing_values) , collapse =" ")) } return(str_res) } check_valid_values_binary <- function(colname , column){ column <- column[column != "."] failing_values <- column[column < 0 | column > 1] number_of_failing_values <- length(failing_values[!is.na(failing_values)]) str_res <- "" if (number_of_failing_values == 0) str_res <- "No failing values" else{ failing_values <- failing_values[!is.na(failing_values)] str_res <- paste("The failing values of column ", colname , paste(unlist(failing_values) , collapse =" ")) } return(str_res) } check_valid_values_categorical <- function(colname , codebook_param , column){ column <- column[column != "."] possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] possible_values_list <- str_split(possible_values_format , "/")[[1]] possible_values_list <- lapply(possible_values_list , str_trim) str_res <- "" min_value <- 0 max_value <- 0 if (length(possible_values_list[[1]]) == 2){ separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]] min_value <- strtoi(separate_range[1]) max_value <- strtoi(separate_range[2]) }else{ possible_values_list <- lapply(possible_values_list , strtoi)[[1]] min_value <- possible_values_list[1] max_value <- possible_values_list[length(possible_values_list) - 1] } failing_values <- column[column < min_value | column > max_value ] number_of_failing_values <- length(failing_values[!is.na(failing_values)]) if(number_of_failing_values == 0){ str_res <- "No failing values" }else{ failing_values <- failing_values[!is.na(failing_values)] str_res <- paste("The failing values of column ", colname , paste(unlist(failing_values) , collapse =" ")) } } check_valid_values <- function(valid_colnames, codebook_param){ res <- "" for(i in 1:(ncol(valid_colnames))){ name <- names(valid_colnames)[i] #if("DMRBORN" == name | grepl("DAT", name, fixed=TRUE) | grepl("ISO", name , fixed=TRUE) | grepl("BEF", name, fixed=TRUE)){ # next #} column <- valid_colnames[,i] # Esto falla si tu codebook no es mismo que new_harmon.csv column_type <- codebook_param$Variable.type[codebook_param$Harmonised.variable.name == name] result = switch( column_type, "Continuous"= check_valid_values_continuous(name , codebook_param , column), "Binary"= check_valid_values_binary(name , column), "Categorical"= check_valid_values_categorical(name, codebook_param , column), "Calendar date" = paste("No failing values"), "ISO country code"= paste("No failing values"), ) if (result != "No failing values"){ res <- paste(res , result, sep="\n") } } return(res) } data_colnames <- as.data.frame(colnames(harmonized_data)) check_valid_columns <- check_column_names(data_colnames) columns_not_valid <- check_valid_columns$not_colnames valid_colnames_column <- as.data.frame(check_valid_columns$colnames) names(valid_colnames_column) = c("valid_colnames") valid_colnames_with_data <- subset(harmonized_data , select = valid_colnames_column$valid_colnames) result <- "" result<-check_valid_values(valid_colnames_with_data, codebook) print(check_valid_columns) #datashield.logout(connections) cat(result) file_name<- paste(hospital_name,"_invalid_values.txt", sep="") dir.create("../invalid_values", showWarnings = FALSE) setwd("../invalid_values") cat(check_valid_columns,file=file_name,sep="\n") cat(result,file=file_name,append=TRUE) #datashield.logout(connections)