# valid_variables_script2.R
#
# Checks a DataSHIELD data frame ("data") against the harmonisation codebook:
#   1. which of its column names are registered in the codebook, and
#   2. for the registered continuous laboratory variables, whether every value
#      lies inside the range given in the codebook.
#
# Provenance: added by commit 830788517cd6399becc241547ba943fdcf039035
# ("valid_variables_script2 file added", GNajeral, 2023-03-01).

# ---------------------------------------------------------------------------
# Setup
# ---------------------------------------------------------------------------

dir_name <- readline("Introduce the name of the directory please: ")
# e.g. /Users/gnl/Documents/CTB UPM/UNCOVER/uncover_harmonization

if (!dir.exists(dir_name)) {
  stop("Directory not found: '", dir_name, "'", call. = FALSE)
}

# The helper scripts and the codebook below are referenced by relative path,
# so the working directory has to be the project directory.
setwd(dir_name)

source("dependency_installer.R")
source("connection_parameters.R")
source("necessary_functions_connection.R")
# source("required_folder_checker.R")
# source("argument_hasher.R")

dep_list <- c(
  "jsonlite", "stringr", "DSI", "DSOpal", "DSLite", "fields", "metafor",
  "ggplot2", "gridExtra", "data.table", "dsBaseClient", "openxlsx"
)
# install_dependencies() is not defined in this file; presumably it comes from
# dependency_installer.R and also attaches the packages (read.xlsx and the
# ds.* functions are called unqualified below) -- TODO confirm.
install_dependencies(dep_list)

# ---------------------------------------------------------------------------
# Codebook
# ---------------------------------------------------------------------------

codebook_file <- "20220315_Data Harmonisation.xlsb.xlsx"

# One data frame per codebook sheet (sheets 2 to 11 of the workbook).
codebook_demo <- read.xlsx(codebook_file, sheet = 2)
codebook_com_and_rf <- read.xlsx(codebook_file, sheet = 3)
codebook_home_med <- read.xlsx(codebook_file, sheet = 4)
codebook_si_sympt <- read.xlsx(codebook_file, sheet = 5)
codebook_treatments <- read.xlsx(codebook_file, sheet = 6)
codebook_labo <- read.xlsx(codebook_file, sheet = 7)
codebook_complications <- read.xlsx(codebook_file, sheet = 8)
codebook_imaging_data <- read.xlsx(codebook_file, sheet = 9)
codebook_lifestyle_diet <- read.xlsx(codebook_file, sheet = 10)
codebook_dates <- read.xlsx(codebook_file, sheet = 11)

# Columns X2, X4 and X10 are dropped from the lifestyle/diet sheet before it
# is stacked with the others.
extra_columns <- c("X2", "X4", "X10")
codebook_lifestyle_diet <- codebook_lifestyle_diet[
  , !names(codebook_lifestyle_diet) %in% extra_columns
]

codebook <- rbind(
  codebook_demo,
  codebook_com_and_rf,
  codebook_home_med,
  codebook_si_sympt,
  codebook_treatments,
  codebook_labo,
  codebook_complications,
  codebook_imaging_data,
  codebook_lifestyle_diet,
  codebook_dates
)

# Every harmonised variable name known to the codebook, in a single column
# called "col_names".
codebook_col_names <- data.frame(
  col_names = codebook$Harmonised.variable.name,
  stringsAsFactors = FALSE
)

# Harmonised variable names listed as categorical. Not referenced anywhere
# else in this script.
categoric_vars <- c(
  "DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU",
  "DSXOS", "DSXHO", "DSXIC",
  "TRXAV", "TRXRIB", "TRXLR", "TRXRM", "TRXIA", "TRXIB", "TRXCH", "TRXAB",
  "TRXCS", "TRXHEP", "TRXAF", "TRXCP", "TRXOT", "TRXECM", "TRXIV", "TRXNIV",
  "TRXNO", "TRXOX", "TRXRR", "TRXTR", "TRXVA", "TRXPE", "TRXPV", "TRXIT",
  "TRXNMB", "TRXAC", "TRXINA", "TRXIS", "TRXIM", "TRXVC", "TRXVD", "TRXZN",
  "CSXCOT", "CSXCTR",
  "SMXASAH", "SMXFEA", "SMXCOA", "SMXSTA", "SMXSBA", "SMXRNA", "SMXMYA",
  "SMXARA", "SMXCPA", "SMXAPA", "SMXINA", "SMXNAA", "SMXDIA", "SMXFAA",
  "SMXHEA", "SMXCNA", "SMXACA", "SMXSLA", "SMXTLA", "SMXSYA", "SMXWHA",
  "SMXLYA", "SMXANA", "SMXIWA", "SMXSRA", "SMXBLA",
  "CMXPRG", "CMXCVD", "CMXCMP", "CMXHT", "CMXDI", "CMXCKD", "CMXCLD",
  "CMXCPD", "CMXASM", "CMXCND", "CMXRHE", "CMXCCI", "CMXCBD", "CMXDE",
  "CMXPU", "CMXST", "CMXLY", "CMXAP",
  "RFXSM", "RFXFSM", "RFXOB", "RFXTB", "RFXIMD", "RFXHIV", "RFXAIDS",
  "RFXUI", "RFXHC", "RFXONC", "RFXMN",
  "HMRACI", "HMRARB", "HMRAHO", "HMRNS", "HMROS", "HMRCS", "HMRIS", "HMRAV",
  "HMRAB", "HMRCOV",
  "IMDXCT", "IMDXCTCR", "IMDXCTTE", "IMDXCTAB", "IMDXXR", "IMDXPN",
  "COXRD", "COXAR", "COXPM", "COXMOD", "COXPT", "COXEC", "COXSH", "COXIO",
  "COXPE", "COXST", "COXDIC", "COXRIO", "COXKF", "COXHF", "COXBC"
)

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Split the column names of a data set into those registered in the codebook
# and those that are not.
#
# codebook_param  Unused; kept so existing calls keep working. The lookup is
#                 done by check_valid_name() against the global
#                 codebook_col_names.
# colnames        Data frame whose first column holds the column names to
#                 check.
#
# Returns a list with
#   not_colnames  one sentence naming every rejected column, and
#   colnames      character vector of the accepted column names (empty when
#                 none is accepted).
check_column_names <- function(codebook_param, colnames) {
  candidates <- if (NCOL(colnames) == 0) {
    character(0)
  } else {
    as.character(colnames[, 1])
  }

  occurrences <- vapply(
    candidates,
    check_valid_name,
    integer(1),
    col_names = colnames,
    USE.NAMES = FALSE
  )
  # A name is accepted only when it is registered and occurs exactly once.
  accepted <- occurrences == 1L

  str_res <- paste(
    c(
      "The column names:",
      candidates[!accepted],
      "are not registered in the harmonized data codebook \n"
    ),
    collapse = " "
  )

  list("not_colnames" = str_res, "colnames" = candidates[accepted])
}

# Count how often a single variable name occurs among the supplied names.
#
# col_name   Name to look up (length-one character).
# col_names  Vector or data frame of names to search in.
#
# Returns 0 when col_name is not in the global codebook_col_names; otherwise
# the number of exact matches of col_name in col_names. Matching is exact on
# purpose: a regular-expression search would also count names that merely
# contain col_name.
check_valid_name <- function(col_name, col_names) {
  if (!(col_name %in% codebook_col_names$col_names)) {
    return(0L)
  }

  searched <- as.character(unlist(col_names, use.names = FALSE))
  sum(searched == col_name, na.rm = TRUE)
}

# Remove the first space of every element of x.
#
# NOTE(review): sub() replaces only the first match, so "a b c" becomes
# "ab c". If every space is meant to go, this should be gsub() -- confirm the
# intent before changing it.
remove_space <- function(x) {
  sub(" ", "", x)
}

# Apply remove_space() to every column of a data set and return the result as
# a data frame.
remove_spaces_from_ds <- function(ds) {
  as.data.frame(lapply(ds, remove_space))
}

# Tell whether x can be read as a number, accepting "," as decimal mark.
#
# Returns FALSE for empty input and TRUE only when every element of x
# converts to a non-missing number.
is_number <- function(x) {
  if (length(x) == 0) {
    return(FALSE)
  }

  # Only the first comma is replaced: it is treated as the decimal mark.
  # The "NAs introduced by coercion" warning is the expected outcome for
  # non-numeric input here, so it is silenced.
  parsed <- suppressWarnings(as.numeric(sub(",", ".", x, fixed = TRUE)))
  all(!is.na(parsed))
}

# Check the continuous laboratory variables against their codebook range.
#
# For every name in the first column of valid_columns that is a "Continuous"
# variable of codebook_labo, the first " / "-separated entry of its
# Possible.values.format is read as "<low>-<high>". The server-side data frame
# "data" is subset to the rows inside that range (missing values are kept) and
# the column is reported when the subset is shorter than the original.
#
# Example: for a range of 11-130 on LBXAPTTA the subsets are
# data$LBXAPTTA <= "130" (object "inRangeHigh") followed by
# inRangeHigh$LBXAPTTA >= "11" (object "inRange").
#
# Uses the globals codebook_labo and connections, and creates the objects
# "inRangeHigh" and "inRange" on the servers.
#
# valid_columns  Data frame (or list) whose first element holds the column
#                names to check.
#
# Returns a character vector with one message per column that has values
# outside its range; empty when there is none.
check_values_format <- function(valid_columns) {
  if (length(valid_columns) == 0) {
    return(character(0))
  }
  column_names <- as.character(valid_columns[[1]])

  # Length of a server-side vector pooled over all studies. unlist() copes
  # with the value coming back either as a list or as a plain vector.
  pooled_length <- function(x) {
    pooled <- ds.length(x, type = "combine", datasources = connections)
    unlist(pooled, use.names = FALSE)[[1]]
  }

  res <- character(0)

  for (current_column in column_names) {
    index <- match(current_column, codebook_labo$Harmonised.variable.name)
    if (is.na(index)) {
      next
    }
    if (!isTRUE(codebook_labo$Variable.type[index] == "Continuous")) {
      next
    }

    value_format <- strsplit(
      as.character(codebook_labo$Possible.values.format[index]),
      " / ",
      fixed = TRUE
    )[[1]]
    # "<low>-<high>": everything before the first dash and everything after
    # the last one.
    low_limit <- suppressWarnings(as.numeric(sub("-.*", "", value_format[1])))
    high_limit <- suppressWarnings(as.numeric(sub(".*-", "", value_format[1])))

    if (is.na(low_limit) || is.na(high_limit)) {
      warning(
        "No numeric range found in the codebook for ", current_column,
        "; range check skipped",
        call. = FALSE
      )
      next
    }

    message("Checking range of ", current_column)

    # V2.name is handed over as text, as in the worked example above.
    ds.dataFrameSubset(
      df.name = "data",
      V1.name = paste0("data$", current_column),
      V2.name = as.character(high_limit),
      Boolean.operator = "<=",
      newobj = "inRangeHigh",
      keep.NAs = TRUE,
      datasources = connections
    )

    ds.dataFrameSubset(
      df.name = "inRangeHigh",
      V1.name = paste0("inRangeHigh$", current_column),
      V2.name = as.character(low_limit),
      Boolean.operator = ">=",
      newobj = "inRange",
      keep.NAs = TRUE,
      datasources = connections
    )

    total_rows <- pooled_length(paste0("data$", current_column))
    rows_in_range <- pooled_length(paste0("inRange$", current_column))

    # Missing values are kept by both subsets, so any shortfall is caused by
    # values outside the range.
    if (total_rows > rows_in_range) {
      res <- c(
        res,
        paste(current_column, "does not follow the established format")
      )
    }
  }

  res
}

# ---------------------------------------------------------------------------
# Connect to the database
# ---------------------------------------------------------------------------

# connect() is not defined in this file (presumably it comes from
# necessary_functions_connection.R). Its first element is used as the
# DataSHIELD connections object; the second is kept as `inp`.
auxConnections <- connect()
connections <- auxConnections[[1]]
inp <- auxConnections[[2]]

# ---------------------------------------------------------------------------
# Check valid column names and value ranges
# ---------------------------------------------------------------------------

datastructure_name <- "data"

print(ds.dim(datastructure_name, datasources = connections))

data_colnames <- ds.colnames(x = datastructure_name, datasources = connections)
print(data_colnames)

data_colnames <- as.data.frame(data_colnames)

check_valid_columns <- check_column_names(codebook, data_colnames)
valid_columns <- as.data.frame(check_valid_columns$colnames)

res <- check_values_format(valid_columns)
print(res)