diff --git a/valid_variables_script_local.R b/valid_variables_script_local.R index a37aa4955791cb63a54f8f7420ecda091bb64d9a..458304722850db603b162bc502fa956d1d4bc73f 100755 --- a/valid_variables_script_local.R +++ b/valid_variables_script_local.R @@ -10,10 +10,35 @@ source("dependency_installer.R") dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient", "openxlsx") install_dependencies(dep_list) -#source("connection_parameters.R") -#source("necessary_functions_connection.R") +codebook_file <- "20220315_Data Harmonisation.xlsb.xlsx" + +codebook_demo <- read.xlsx(codebook_file , sheet = 2 ) +codebook_com_and_rf <- read.xlsx(codebook_file , sheet = 3 ) + +codebook_home_med <- read.xlsx(codebook_file , sheet = 4 ) +codebook_si_sympt <- read.xlsx(codebook_file , sheet = 5 ) + +codebook_treatments <- read.xlsx(codebook_file , sheet = 6 ) +codebook_labo <- read.xlsx(codebook_file , sheet = 7 ) + +codebook_complications <- read.xlsx(codebook_file , sheet = 8 ) +codebook_imaging_data <- read.xlsx(codebook_file , sheet = 9 ) + +codebook_lifestyle_diet <- read.xlsx(codebook_file , sheet = 10 ) +codebook_dates <- read.xlsx(codebook_file , sheet = 11 ) + +codebook <- rbind(codebook_demo , codebook_com_and_rf) +codebook <- rbind(codebook , codebook_home_med) +codebook <- rbind(codebook , codebook_si_sympt) +codebook <- rbind(codebook , codebook_treatments) +codebook <- rbind(codebook , codebook_labo) +codebook <- rbind(codebook , codebook_complications) +codebook <- rbind(codebook , codebook_imaging_data) + +codebook_lifestyle_diet <- codebook_lifestyle_diet[, !names(codebook_lifestyle_diet) %in% c("X2", "X4" , "X10")] +codebook <- rbind(codebook , codebook_lifestyle_diet) +codebook <- rbind(codebook , codebook_dates) -codebook <- read.csv("new_harmon.csv" , sep = ",") codebook_col_names <- as.data.frame(codebook$Harmonised.variable.name) @@ -74,6 +99,7 @@ check_valid_name <- function(col_name){ } check_valid_values_continuous <- function(colname , codebook_param , column){ + column <- column[column != "."] possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] possible_values_list = str_split(possible_values_format , "/")[[1]] @@ -94,7 +120,9 @@ check_valid_values_continuous <- function(colname , codebook_param , column){ str_res <- "No failing values" else{ failing_values <- failing_values[!is.na(failing_values)] - str_res <- paste("The failing values of column ", colname , paste(unlist(failing_values) , collapse =" ")) + #str_res <- paste( colname , paste(unlist(failing_values) , collapse =" ")) + str_res <- paste(colname , collapse =" ") + str_res <- paste(str_res , "should be in range" , range_as_str, "(continuous)", sep = " ") } @@ -110,8 +138,11 @@ check_valid_values_binary <- function(colname , column){ if (number_of_failing_values == 0) str_res <- "No failing values" else{ + range_as_str <- "0-1 (binary)" failing_values <- failing_values[!is.na(failing_values)] - str_res <- paste("The failing values of column ", colname , paste(unlist(failing_values) , collapse =" ")) + #str_res <- paste(colname , paste(unlist(failing_values) , collapse =" ")) + str_res <- paste(colname , collapse =" ") + str_res <- paste(str_res , "should be in range" , range_as_str, sep = " ") } @@ -146,8 +177,11 @@ check_valid_values_categorical <- function(colname , codebook_param , column){ if(number_of_failing_values == 0){ str_res <- "No failing values" }else{ + range_as_str <- paste(min_value , "-" , max_value , " (categorical)") failing_values <- failing_values[!is.na(failing_values)] - str_res <- paste("The failing values of column ", colname , paste(unlist(failing_values) , collapse =" ")) + #str_res <- paste(colname , paste(unlist(failing_values) , collapse =" ")) + str_res <- paste(colname , collapse =" ") + str_res <- paste(str_res , "should be in range" , range_as_str, sep = " ") } } @@ -158,6 +192,11 @@ check_valid_values <- function(valid_colnames, codebook_param){ for(i in 1:(ncol(valid_colnames))){ name <- names(valid_colnames)[i] + + if (grepl("DAT", name, fixed=TRUE)){ + next + } + #if("DMRBORN" == name | grepl("DAT", name, fixed=TRUE) | grepl("ISO", name , fixed=TRUE) | grepl("BEF", name, fixed=TRUE)){ # next #} @@ -167,6 +206,12 @@ check_valid_values <- function(valid_colnames, codebook_param){ # Esto falla si tu codebook no es mismo que new_harmon.csv column_type <- codebook_param$Variable.type[codebook_param$Harmonised.variable.name == name] + if (is.na(column_type) ) { + variable <- paste("Variable ", name, " wrong", sep = " ") + res <- paste(res, variable , sep="\n") + next + } + result = switch( column_type, "Continuous"= check_valid_values_continuous(name , codebook_param , column), @@ -174,6 +219,9 @@ check_valid_values <- function(valid_colnames, codebook_param){ "Categorical"= check_valid_values_categorical(name, codebook_param , column), "Calendar date" = paste("No failing values"), "ISO country code"= paste("No failing values"), + { + paste("some column " , column_type , sep = " ") + } ) if (result != "No failing values"){ @@ -199,20 +247,6 @@ valid_colnames_with_data <- subset(harmonized_data , select = valid_colnames_col result <- "" result<-check_valid_values(valid_colnames_with_data, codebook) -print(check_valid_columns) -#datashield.logout(connections) +print(columns_not_valid) cat(result) - - -file_name<- paste(hospital_name,"_invalid_values.txt", sep="") - - -dir.create("../invalid_values", showWarnings = FALSE) -setwd("../invalid_values") - -cat(check_valid_columns,file=file_name,sep="\n") -cat(result,file=file_name,append=TRUE) - -#datashield.logout(connections) -