rm(list=ls()) dir_name <- readline("Introduce the name of the directory please: ") #/Users/gnl/Documents/CTB UPM/UNCOVER/uncover_harmonization setwd(dir_name) source("dependency_installer.R") source("connection_parameters.R") source("necessary_functions_connection.R") #source("required_folder_checker.R") #source("argument_hasher.R") dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient", "openxlsx") install_dependencies(dep_list) codebook_file <- "20220315_Data Harmonisation.xlsb.xlsx" codebook_demo <- read.xlsx(codebook_file , sheet = 2 ) codebook_com_and_rf <- read.xlsx(codebook_file , sheet = 3 ) codebook_home_med <- read.xlsx(codebook_file , sheet = 4 ) codebook_si_sympt <- read.xlsx(codebook_file , sheet = 5 ) codebook_treatments <- read.xlsx(codebook_file , sheet = 6 ) codebook_labo <- read.xlsx(codebook_file , sheet = 7 ) codebook_complications <- read.xlsx(codebook_file , sheet = 8 ) codebook_imaging_data <- read.xlsx(codebook_file , sheet = 9 ) codebook_lifestyle_diet <- read.xlsx(codebook_file , sheet = 10 ) codebook_dates <- read.xlsx(codebook_file , sheet = 11 ) codebook <- rbind(codebook_demo , codebook_com_and_rf) codebook <- rbind(codebook , codebook_home_med) codebook <- rbind(codebook , codebook_si_sympt) codebook <- rbind(codebook , codebook_treatments) codebook <- rbind(codebook , codebook_labo) codebook <- rbind(codebook , codebook_complications) codebook <- rbind(codebook , codebook_imaging_data) codebook_lifestyle_diet <- codebook_lifestyle_diet[, !names(codebook_lifestyle_diet) %in% c("X2", "X4" , "X10")] codebook <- rbind(codebook , codebook_lifestyle_diet) codebook <- rbind(codebook , codebook_dates) codebook_col_names <- as.data.frame(codebook$Harmonised.variable.name) names(codebook_col_names) <- c("col_names") categoric_vars = c("DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN", "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN", "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC") #---------------------------------------------------------------------------- #Test if column names are valid check_column_names <- function(codebook_param, colnames){ str_res <- "The column names:" valid_colnames <- c() for(i in 1:(nrow(colnames))){ colname <- colnames[i,1] number_of_column <- check_valid_name(colname , colnames) if(number_of_column != 1){ str_res<- paste(str_res, colname, sep=" ") }else{ valid_colnames <- c(valid_colnames, colname) } } str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ") result <- list("not_colnames" = str_res , "colnames" = valid_colnames) return (result) } #Test if a single variable name is valid check_valid_name <- function(col_name , col_names){ valid <- 0 if(col_name %in% codebook_col_names$col_names){ valid <- length(grep(col_name, col_names)) } return (valid) } remove_space <- function(x){ searchString <- ' ' replacementString <- '' res = sub(searchString,replacementString,x) return(res) } remove_spaces_from_ds <- function(ds){ res<- lapply(ds,remove_space ) return(as.data.frame(res)) } is_number <- function(x){ res <- FALSE if(length(x)!=0){ x <- str_replace(x,",",".") aux <- as.numeric(x) if(!is.na(aux)) res <- TRUE } return(res) } # A esta funcion la llamamos unicamente con las columnas que el sabemos que el nombre es correcto # Usa codebook param. Si algún cambia el codebook agradeceremos esto. check_values_format <- function(valid_columns, codebook_param){ res <- "" variables_out_of_range = "Variables out of range:" # for(i in 1:length(valid_columns[[1]])){ for(i in 1:9){ current_column <- valid_columns[[1]][[i]] print(current_column) variable_type <- codebook_param$Variable.type[codebook$Harmonised.variable.name == current_column] if (is.na(variable_type)){ next } print(variable_type) if(variable_type == "Continuous"){ ################## ESTO PODRÍA IR EN UNA FUNC DIFERENTE ############# ### parse del formato de una variable continua ## ## esta sentencia funciona codebook$Possible.values.format[codebook$Harmonised.variable.name == "CMXDE"] pruebala en el interprete. value_format <- strsplit(codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == current_column], " / ")[[1]] high_limit <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1])))) low_limit <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1])))) if(low_limit == ""){ high_limit <- str_trim(sub(",.*", "", value_format[1])) low_limit <- str_trim(strtrimsub(".*,", "", value_format[1])) } ### parse del formato de una variable continua ## tryCatch( error = function(cnd) { if(grepl("list them with datashield.errors()",cnd)) error <- paste("Unable to analyse data" , datashield.errors() , sep = " ") else error <- paste("Unable to analyse data" , cnd, sep = " ") print(error) res <- c(res, error) }, { ds.dataFrameSubset(df.name = "data", V1.name = paste("data$", current_column, sep=""), V2.name = high_limit, Boolean.operator = "<=", newobj = "inRangeHigh", keep.NAs = TRUE, datasources = connections) ds.dataFrameSubset(df.name = "inRangeHigh", V1.name = paste("inRangeHigh$", current_column, sep=""), V2.name = low_limit, Boolean.operator = ">=", newobj = "inRange", keep.NAs = TRUE, datasources = connections) summary <- ds.summary(paste("inRange$", current_column, sep="")) if(ds.length(paste("data$", current_column, sep=""))[[1]] > summary[[1]][[2]]){ variables_out_of_range <- paste(variables_out_of_range, current_column, sep = " ") print(paste(current_column, "does not follow the established format", sep=" ")) } } ) ################## FIN ESTO PODRÍA IR EN UNA FUNC DIFERENTE ############# }else if (variable_type == "Categorical" || variable_type == "Binary"){ value_format <- lapply(strsplit(mierda, "/") , str_trim)[[1]] tryCatch( error = function(cnd) { if(grepl("list them with datashield.errors()",cnd)) error <- paste("Unable to analyse data" , datashield.errors() , sep = " ") else error <- paste("Unable to analyse data" , cnd, sep = " ") print(error) res <- c(res, error) }, { contingency_table <- ds.table(paste("data$",current_column,sep="")) row_names <- rownames(contingency_table[[1]][[3]]) for (i in 1:length(row_names)) { if(row_names[i] == "NA") next if(!row_names[i] %in% value_format){ variables_out_of_range <- paste(variables_out_of_range, current_column, sep = " ") print(paste(current_column, "does not follow the established format", sep=" ")) } } } ) } } return (variables_out_of_range) } auxConnections <- connect() connections <- auxConnections[[1]] inp <- auxConnections[[2]] #Conexión a la base de datos # ds.dim("data", datasources = connections) # colnames <- ds.colnames("data") # colnames # # ds.dataFrameSubset(df.name = "data", # V1.name = "data$DMXBMI", # V2.name = "130", # Boolean.operator = "<=", # newobj = "inRangeHigh", # keep.NAs = TRUE, # datasources = connections) # # ds.dataFrameSubset(df.name = "inRangeHigh", # V1.name = "inRangeHigh$DMXBMI", # V2.name = "11", # Boolean.operator = ">=", # newobj = "inRange", # keep.NAs = TRUE, # datasources = connections) # # summary <- ds.summary("inRange$DMXBMI") # if(ds.length("data$DMXBMI")[[1]] > summary[[1]][[2]]){ # res <- c(res, paste(current_column, "does not follow the established format" , sep="\n")) # } #---------------------------------------------------------------------------- #Check valid column names datastructure_name <- "data" data_colnames <- ds.colnames(x=datastructure_name, datasources= connections) data_colnames <- as.data.frame(data_colnames) check_valid_columns <- check_column_names(codebook ,data_colnames) valid_columns <- as.data.frame(check_valid_columns$colnames) res <- "" res <- check_values_format(valid_columns, codebook) print(res) datashield.logout(connections)