rm(list=ls()) dir_name <- readline("Introduce the name of the directory please: ") # C:\Users\guill\Documents\harmonize_scripts # /Users/gnl/Documents/CTB UPM/UNCOVER/uncover_harmonization setwd(dir_name) source("dependency_installer.R") dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient", "openxlsx") install_dependencies(dep_list) codebook_file <- "20220315_Data_Harmonisation.xlsb.xlsx" codebook_demo <- read.xlsx(codebook_file , sheet = 2 ) codebook_com_and_rf <- read.xlsx(codebook_file , sheet = 3 ) codebook_home_med <- read.xlsx(codebook_file , sheet = 4 ) codebook_si_sympt <- read.xlsx(codebook_file , sheet = 5 ) codebook_treatments <- read.xlsx(codebook_file , sheet = 6 ) codebook_labo <- read.xlsx(codebook_file , sheet = 7 ) codebook_complications <- read.xlsx(codebook_file , sheet = 8 ) codebook_imaging_data <- read.xlsx(codebook_file , sheet = 9 ) codebook_lifestyle_diet <- read.xlsx(codebook_file , sheet = 10 ) codebook_dates <- read.xlsx(codebook_file , sheet = 11 ) codebook <- rbind(codebook_demo , codebook_com_and_rf) codebook <- rbind(codebook , codebook_home_med) codebook <- rbind(codebook , codebook_si_sympt) codebook <- rbind(codebook , codebook_treatments) codebook <- rbind(codebook , codebook_labo) codebook <- rbind(codebook , codebook_complications) codebook <- rbind(codebook , codebook_imaging_data) codebook_lifestyle_diet <- codebook_lifestyle_diet[, !names(codebook_lifestyle_diet) %in% c("X2", "X4" , "X10")] codebook <- rbind(codebook , codebook_lifestyle_diet) codebook <- rbind(codebook , codebook_dates) codebook_col_names <- as.data.frame(codebook$Harmonised.variable.name) names(codebook_col_names) <- c("col_names") setwd(paste(dir_name ,"/harmonized_data", sep="")) file_name <- readline("Introduce the name of the file to check the values: ") harmonized_data <- "" if (grepl(".csv" , file_name , fixed = TRUE)){ harmonized_data <- read.csv(file_name) #harmonized_data <- read.csv(file_name, sep = ";") }else if (grepl(".xlsx" , file_name , fixed = TRUE)){ harmonized_data <- read.xlsx(file_name) } #---------------------------------------------------------------------------- #Test if column names are valid check_column_names <- function(col_names){ str_res <- "The column names:" valid_colnames <- c() repeated_colnames <- c() for(i in 1:(nrow(col_names))){ col_name <- col_names[i,1] number_of_column <- check_valid_name(col_name) if( number_of_column == 0){ str_res<- paste(str_res, col_name, sep=" ") }else if (number_of_column == 1){ valid_colnames = c(valid_colnames, col_name) }else{ repeated_colnames = c(repeated_colnames , col_name) } } str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ") new_list <- list("not_colnames" = str_res , "colnames" = valid_colnames , "repeated_colnames" = repeated_colnames) return (new_list) } #Test if a single variable name is valid check_valid_name <- function(col_name){ valid <- 0 if(col_name %in% codebook_col_names$col_names){ valid <- length(grep(col_name, names(harmonized_data))) } return (valid) } check_valid_values_continuous <- function(colname , codebook_param , column){ missing_values_count <- length(column[column == "."]) total_values <- length(column) missing_values_percentage <- round((missing_values_count/(total_values))*100, 2) column <- as.numeric(column[column != "."]) string_values_count <- length(column[is.na(column)]) possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] value_format <- strsplit(possible_values_format, " / ")[[1]] min_value <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1])))) max_value <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1])))) if(min_value == ""){ min_value <- str_trim(sub(",.*", "", value_format[1])) max_value <- str_trim(sub(".*,", "", value_format[1])) } min_value <- as.double(min_value) max_value <- as.double(max_value) failing_values <- column[column < min_value | column > max_value] number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + string_values_count str_res <- "" if (number_of_failing_values == 0){ if(missing_values_percentage > 90){ str_res <- paste(colname, "Missing values:", missing_values_count, "-", paste(round((missing_values_count/(total_values))*100, 2), "%", sep = ""), "of the total") } else{ str_res <- paste("No failing values") } }else{ failing_values <- failing_values[!is.na(failing_values)] failing_value_counts <- table(failing_values) str_res <- paste(colname, "has wrong values:") str_res <- paste(str_res, paste(number_of_failing_values, ",", sep = "")) str_res <- paste(str_res, "Missing values:") str_res <- paste(str_res, paste(missing_values_count, ",", sep = "")) str_res <- paste(str_res, "should be in the range:", paste(min_value, max_value, sep = "-"), "(continuous),", sep = " ") first <- names(failing_value_counts)[1] last <- names(failing_value_counts)[length(failing_value_counts)] str_res <- paste(str_res, "Values lie in the range:", paste("[", first, " - ",last, "],", sep = "" )) str_res <- paste(str_res, "Missing:", paste(round((missing_values_count/(total_values))*100, 2), "%,", sep = "" )) str_res <- paste(str_res, "Wrong values:", paste(round((number_of_failing_values/(total_values-missing_values_count))*100, 2), "% of the non-missing", sep = "" )) # if(number_of_failing_values < 30) { # # list_values <- paste( failing_values , collapse = " ") # str_res <- paste(str_res, "Wrong values:") # # for (i in seq_along(failing_value_counts)) { # value <- names(failing_value_counts)[i] # count <- failing_value_counts[i] # str_res <- paste(str_res, paste(value, paste("(", count, ")", sep = ""), collapse = " "), sep = " ") # } # } } return(str_res) } check_valid_values_categorical <- function(colname, codebook_param, column) { missing_values_count <- length(column[column == "."]) total_values <- length(column) missing_values_percentage <- round((missing_values_count/(total_values))*100, 2) column <- as.numeric(column[column != "."]) string_values_count <- length(column[is.na(column)]) possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] possible_values_list <- str_split(possible_values_format, "/")[[1]] possible_values_list <- lapply(possible_values_list, str_trim) str_res <- "" min_value <- 0 max_value <- 0 if (length(possible_values_list[[1]]) == 2) { separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]] min_value <- strtoi(separate_range[1]) max_value <- strtoi(separate_range[2]) } else { possible_values_list <- possible_values_list[possible_values_list != "."] possible_values_list <- lapply(possible_values_list, strtoi) min_value <- possible_values_list[1] max_value <- possible_values_list[length(possible_values_list)] } failing_values <- column[column < min_value | column > max_value] number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + string_values_count if (number_of_failing_values == 0){ if(missing_values_percentage > 90){ str_res <- paste(colname, "Missing values:", missing_values_count, "-", paste(round((missing_values_count/(total_values))*100, 2), "%", sep = ""), "of the total") } else{ str_res <- paste("No failing values") } } else { range_as_str <- paste(min_value, "-", max_value, " (categorical)") failing_values <- failing_values[!is.na(failing_values)] failing_value_counts <- table(failing_values) str_res <- paste(colname, "has wrong values:") str_res <- paste(str_res, paste(number_of_failing_values, ",", sep = "")) str_res <- paste(str_res, "Missing values:") str_res <- paste(str_res, paste(missing_values_count, ",", sep = "")) str_res <- paste(str_res, "should be in the range:", paste(min_value, max_value, sep = "-"), "(categorical),", sep = " ") first <- names(failing_value_counts)[1] last <- names(failing_value_counts)[length(failing_value_counts)] str_res <- paste(str_res, "Values lie in the range:", paste("[", first, " - ",last, "],", sep = "" )) str_res <- paste(str_res, "Missing:", paste(round((missing_values_count/(total_values))*100, 2), "%,", sep = "" )) str_res <- paste(str_res, "Wrong values constitute at least:", paste(round((number_of_failing_values/(total_values-missing_values_count))*100, 2), "% of the non-missing", sep = "" )) # str_res <- paste(colname, "has failing values:") # # for (i in seq_along(failing_value_counts)) { # value <- names(failing_value_counts)[i] # count <- failing_value_counts[i] # str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") # } # # str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") } return(str_res) } check_valid_values_binary <- function(colname, column) { missing_values_count <- length(column[column == "."]) total_values <- length(column) missing_values_percentage <- round((missing_values_count/(total_values))*100, 2) column <- as.numeric(column[column != "."]) string_values_count <- length(column[is.na(column)]) failing_values <- column[column < 0 | column > 1] number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + string_values_count str_res <- "" if (number_of_failing_values == 0){ if(missing_values_percentage > 90){ str_res <- paste(colname, "Missing values:", missing_values_count, "-", paste(round((missing_values_count/(total_values))*100, 2), "%", sep = ""), "of the total") } else{ str_res <- paste("No failing values") } } else { range_as_str <- "0-1 (binary)" failing_values <- failing_values[!is.na(failing_values)] failing_value_counts <- table(failing_values) str_res <- paste(colname, "has wrong values:") str_res <- paste(str_res, paste(number_of_failing_values, ",", sep = "")) str_res <- paste(str_res, "Missing values:") str_res <- paste(str_res, paste(missing_values_count, ",", sep = "")) str_res <- paste(str_res, "should be in the range:", paste("0", "1", sep = "-"), "(binary),", sep = " ") first <- names(failing_value_counts)[1] last <- names(failing_value_counts)[length(failing_value_counts)] str_res <- paste(str_res, "Values lie in the range:", paste("[", first, " - ",last, "],", sep = "" )) str_res <- paste(str_res, "Missing:", paste(round((missing_values_count/(total_values))*100, 2), "%,", sep = "" )) str_res <- paste(str_res, "Wrong values:", paste(round((number_of_failing_values/(total_values-missing_values_count))*100, 2), "% of the non-missing", sep = "" )) # str_res <- paste(colname, "has failing values:") # # for (i in seq_along(failing_value_counts)) { # value <- names(failing_value_counts)[i] # count <- failing_value_counts[i] # str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ") # } # # str_res <- paste(str_res, "should be in range", range_as_str, sep = " ") } return(str_res) } check_valid_values <- function(valid_colnames, codebook_param){ res <- paste("Total patients:", length(valid_colnames[,1])) for(i in 1:(ncol(valid_colnames))){ name <- names(valid_colnames)[i] if (grepl("DAT", name, fixed=TRUE)){ next } column <- valid_colnames[,i] column_type <- codebook_param$Variable.type[codebook_param$Harmonised.variable.name == name] if (is.na(column_type) ) { variable <- paste("Variable ", name, " wrong", sep = " ") res <- paste(res, variable , sep="\n") next } if(is.na(codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == name])){ variable <- paste("Variable ", name, " with no value format", sep = " ") res <- paste(res, variable , sep="\n") next } result = switch( column_type, "Continuous"= check_valid_values_continuous(name , codebook_param , column), "Binary"= check_valid_values_binary(name , column), "Categorical"= check_valid_values_categorical(name, codebook_param , column), "Calendar date" = paste("No failing values"), "ISO country code"= paste("No failing values"), { paste("some column " , column_type , sep = " ") } ) if (any(result != "No failing values")){ res <- paste(res , result, sep="\n") } } return(res) } data_colnames <- as.data.frame(colnames(harmonized_data)) check_valid_columns <- check_column_names(data_colnames) columns_not_valid <- check_valid_columns$not_colnames valid_colnames_column <- as.data.frame(check_valid_columns$colnames) names(valid_colnames_column) = c("valid_colnames") valid_colnames_with_data <- subset(harmonized_data , select = valid_colnames_column$valid_colnames) result <- "" result<-check_valid_values(valid_colnames_with_data, codebook) # Split the string into separate elements split_columns <- strsplit(columns_not_valid, split = " ", fixed = TRUE) # Print each element in a separate line cat(paste(unlist(split_columns), collapse = "\n"), "\n") cat(result)