From ef2562624c28b01909aa55e46203886e68c7c95b Mon Sep 17 00:00:00 2001 From: GNajeral <90567992+GNajeral@users.noreply.github.com> Date: Mon, 6 Mar 2023 15:53:04 +0100 Subject: [PATCH] First working script --- valid_variables_script2.R | 56 ++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/valid_variables_script2.R b/valid_variables_script2.R index 5ce27d1..1b27f93 100644 --- a/valid_variables_script2.R +++ b/valid_variables_script2.R @@ -49,9 +49,6 @@ codebook_col_names <- as.data.frame(codebook$Harmonised.variable.name) names(codebook_col_names) <- c("col_names") -categoric_vars = c("DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN", "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN", "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC") - - #---------------------------------------------------------------------------- #Test if column names are valid @@ -132,8 +129,8 @@ is_number <- function(x){ check_values_format <- function(valid_columns, codebook_param){ res <- "" variables_out_of_range = "Variables out of range:" -# for(i in 1:length(valid_columns[[1]])){ - for(i in 1:9){ + for(i in 1:length(valid_columns[[1]])){ + #for(i in 1:1){ current_column <- valid_columns[[1]][[i]] print(current_column) variable_type <- codebook_param$Variable.type[codebook$Harmonised.variable.name == current_column] @@ -150,12 +147,13 @@ check_values_format <- function(valid_columns, codebook_param){ ### parse del formato de una variable continua ## ## esta sentencia funciona codebook$Possible.values.format[codebook$Harmonised.variable.name == "CMXDE"] pruebala en el interprete. - value_format <- strsplit(codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == current_column], " / ")[[1]] - high_limit <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1])))) - low_limit <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1])))) + possible_values <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == current_column] + value_format <- strsplit(possible_values, " / ")[[1]] + low_limit <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1])))) + high_limit <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1])))) if(low_limit == ""){ - high_limit <- str_trim(sub(",.*", "", value_format[1])) - low_limit <- str_trim(strtrimsub(".*,", "", value_format[1])) + low_limit <- str_trim(sub(",.*", "", value_format[1])) + high_limit <- str_trim(strtrimsub(".*,", "", value_format[1])) } ### parse del formato de una variable continua ## @@ -169,6 +167,7 @@ check_values_format <- function(valid_columns, codebook_param){ res <- c(res, error) }, { + print(paste("Higher Limit: ", high_limit)) ds.dataFrameSubset(df.name = "data", V1.name = paste("data$", current_column, sep=""), V2.name = high_limit, @@ -177,7 +176,7 @@ check_values_format <- function(valid_columns, codebook_param){ keep.NAs = TRUE, datasources = connections) - + print(paste("Lower Limit: ", low_limit)) ds.dataFrameSubset(df.name = "inRangeHigh", V1.name = paste("inRangeHigh$", current_column, sep=""), V2.name = low_limit, @@ -190,13 +189,18 @@ check_values_format <- function(valid_columns, codebook_param){ if(ds.length(paste("data$", current_column, sep=""))[[1]] > summary[[1]][[2]]){ variables_out_of_range <- paste(variables_out_of_range, current_column, sep = " ") print(paste(current_column, "does not follow the established format", sep=" ")) + print(paste("It should follow the following format: ", possible_values)) + } + else{ + print(paste(paste("Data in: ", current_column), " was valid")) } } ) ################## FIN ESTO PODRÍA IR EN UNA FUNC DIFERENTE ############# }else if (variable_type == "Categorical" || variable_type == "Binary"){ - value_format <- lapply(strsplit(mierda, "/") , str_trim)[[1]] + possible_values <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == current_column] + value_format <- lapply(strsplit(possible_values, "/") , str_trim)[[1]] tryCatch( error = function(cnd) { if(grepl("list them with datashield.errors()",cnd)) @@ -209,14 +213,22 @@ check_values_format <- function(valid_columns, codebook_param){ { contingency_table <- ds.table(paste("data$",current_column,sep="")) row_names <- rownames(contingency_table[[1]][[3]]) + result <- FALSE for (i in 1:length(row_names)) { if(row_names[i] == "NA") next if(!row_names[i] %in% value_format){ variables_out_of_range <- paste(variables_out_of_range, current_column, sep = " ") print(paste(current_column, "does not follow the established format", sep=" ")) + print(paste("It should follow the following format:", possible_values)) + print(paste("Instead of:", paste(row_names, collapse = " "))) + result <- TRUE + break } } + if(!result){ + print(paste(paste("Data in: ", current_column), " was valid")) + } } ) } @@ -232,28 +244,28 @@ inp <- auxConnections[[2]] #Conexión a la base de datos - # ds.dim("data", datasources = connections) - # colnames <- ds.colnames("data") - # colnames - # + ds.dim("data", datasources = connections) + colnames <- ds.colnames("data") + colnames + # ds.dataFrameSubset(df.name = "data", - # V1.name = "data$DMXBMI", - # V2.name = "130", + # V1.name = "data$DMRAGEYR", + # V2.name = "150", # Boolean.operator = "<=", # newobj = "inRangeHigh", # keep.NAs = TRUE, # datasources = connections) # # ds.dataFrameSubset(df.name = "inRangeHigh", - # V1.name = "inRangeHigh$DMXBMI", - # V2.name = "11", + # V1.name = "inRangeHigh$DMRAGEYR", + # V2.name = "0", # Boolean.operator = ">=", # newobj = "inRange", # keep.NAs = TRUE, # datasources = connections) # - # summary <- ds.summary("inRange$DMXBMI") - # if(ds.length("data$DMXBMI")[[1]] > summary[[1]][[2]]){ + # summary <- ds.summary("inRange$DMRAGEYR") + # if(ds.length("data$DMRAGEYR")[[1]] > summary[[1]][[2]]){ # res <- c(res, paste(current_column, "does not follow the established format" , sep="\n")) # } -- 2.24.1