valid_variables_script2.R 9.79 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
rm(list=ls())

dir_name <- readline("Introduce the name of the directory please: ")
#/Users/gnl/Documents/CTB UPM/UNCOVER/uncover_harmonization

setwd(dir_name)

source("dependency_installer.R")
source("connection_parameters.R")
source("necessary_functions_connection.R")
#source("required_folder_checker.R")
#source("argument_hasher.R")


dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient", "openxlsx")
install_dependencies(dep_list)

codebook_file <- "20220315_Data Harmonisation.xlsb.xlsx"

codebook_demo <- read.xlsx(codebook_file , sheet = 2 )
codebook_com_and_rf <- read.xlsx(codebook_file , sheet = 3 ) 

codebook_home_med <- read.xlsx(codebook_file , sheet = 4 ) 
codebook_si_sympt <- read.xlsx(codebook_file , sheet = 5 ) 

codebook_treatments <- read.xlsx(codebook_file , sheet = 6 ) 
codebook_labo <- read.xlsx(codebook_file , sheet = 7 ) 

codebook_complications <- read.xlsx(codebook_file , sheet = 8 ) 
codebook_imaging_data <- read.xlsx(codebook_file , sheet = 9 ) 

codebook_lifestyle_diet <- read.xlsx(codebook_file , sheet = 10 ) 
codebook_dates <- read.xlsx(codebook_file , sheet = 11 )

codebook <- rbind(codebook_demo , codebook_com_and_rf)
codebook <- rbind(codebook , codebook_home_med)
codebook <- rbind(codebook , codebook_si_sympt)
codebook <- rbind(codebook , codebook_treatments)
codebook <- rbind(codebook , codebook_labo)
codebook <- rbind(codebook , codebook_complications)
codebook <- rbind(codebook , codebook_imaging_data)

codebook_lifestyle_diet <- codebook_lifestyle_diet[, !names(codebook_lifestyle_diet) %in% c("X2", "X4" , "X10")] 
codebook <- rbind(codebook , codebook_lifestyle_diet)
codebook <- rbind(codebook , codebook_dates)


codebook_col_names <- as.data.frame(codebook$Harmonised.variable.name)

names(codebook_col_names) <- c("col_names")

#----------------------------------------------------------------------------

#Test if column names are valid
check_column_names <- function(codebook_param, colnames){
  
  str_res <- "The column names:"
  valid_colnames <- c()
  
  for(i in 1:(nrow(colnames))){
    colname <- colnames[i,1]
    number_of_column <- check_valid_name(colname ,  colnames) 
    if(number_of_column != 1){
      str_res<- paste(str_res, colname, sep=" ")
    }else{
      valid_colnames <- c(valid_colnames, colname)
    }
  }
  
  str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ")
  
  result <- list("not_colnames" = str_res , "colnames" = valid_colnames)
  
  return (result)
}

#Test if a single variable name is valid
check_valid_name <- function(col_name , col_names){
  
  valid <- 0
  
  if(col_name %in% codebook_col_names$col_names){
    
    valid <- length(grep(col_name, col_names))
    
  }
  
  return (valid)
  
}

remove_space <- function(x){
  searchString <- ' '
  replacementString <- ''
  res = sub(searchString,replacementString,x)
  return(res)
}

remove_spaces_from_ds <- function(ds){
  
  res<- lapply(ds,remove_space )
  
  return(as.data.frame(res))
  
}

is_number <- function(x){
  res <- FALSE
  
  
  if(length(x)!=0){
    x <- str_replace(x,",",".")
    
    aux <- as.numeric(x)
    
    
    if(!is.na(aux))
      res <- TRUE
  }
  
  
  return(res)
  
}

126 127 128 129

# A esta funcion la llamamos unicamente con las columnas que el sabemos que el nombre es correcto
# Usa codebook param. Si algún cambia el codebook agradeceremos esto.
check_values_format <- function(valid_columns, codebook_param){
130
  res <- ""
131
  variables_out_of_range = "Variables out of range:"
GNajeral's avatar
GNajeral committed
132 133
  for(i in 1:length(valid_columns[[1]])){
  #for(i in 1:1){
134
    current_column <- valid_columns[[1]][[i]]
135
    print(current_column)
136
    variable_type <- codebook_param$Variable.type[codebook$Harmonised.variable.name == current_column]
137 138 139 140
    
    if (is.na(variable_type)){
      next
    }
GNajeral's avatar
GNajeral committed
141 142
    
    print(variable_type)
143
   
144
    if(variable_type == "Continuous"){
145 146 147 148 149
      
      ################## ESTO PODRÍA IR EN UNA FUNC DIFERENTE #############
      
      ### parse del formato de una variable continua ##
      ## esta sentencia funciona codebook$Possible.values.format[codebook$Harmonised.variable.name == "CMXDE"] pruebala en el interprete.
GNajeral's avatar
GNajeral committed
150 151 152 153
      possible_values <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == current_column]
      value_format <- strsplit(possible_values, " / ")[[1]]
      low_limit <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1]))))
      high_limit <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1]))))
GNajeral's avatar
GNajeral committed
154
      if(low_limit == ""){
GNajeral's avatar
GNajeral committed
155 156
        low_limit <- str_trim(sub(",.*", "", value_format[1]))
        high_limit <- str_trim(strtrimsub(".*,", "", value_format[1]))
GNajeral's avatar
GNajeral committed
157
      }
158 159
      ### parse del formato de una variable continua ##
      
160 161 162 163 164 165 166 167 168 169
      tryCatch(
        error = function(cnd) {
          if(grepl("list them with datashield.errors()",cnd))
            error <- paste("Unable to analyse data" , datashield.errors() , sep = " ")
          else
            error <- paste("Unable to analyse data" , cnd, sep = " ")
          print(error)
          res <- c(res, error)
        },
        {
GNajeral's avatar
GNajeral committed
170
          print(paste("Higher Limit: ", high_limit))
171 172 173 174 175 176 177 178
          ds.dataFrameSubset(df.name = "data",
                                   V1.name = paste("data$", current_column, sep=""),
                                   V2.name = high_limit,
                                   Boolean.operator = "<=",
                                   newobj = "inRangeHigh",
                                   keep.NAs = TRUE,
                                   datasources = connections)
          
GNajeral's avatar
GNajeral committed
179
          print(paste("Lower Limit: ", low_limit))
180 181 182 183 184 185 186 187 188 189 190 191
          ds.dataFrameSubset(df.name = "inRangeHigh",
                                   V1.name = paste("inRangeHigh$", current_column, sep=""),
                                   V2.name = low_limit,
                                   Boolean.operator = ">=",
                                   newobj = "inRange",
                                   keep.NAs = TRUE,
                                   datasources = connections)
          
          summary <- ds.summary(paste("inRange$", current_column, sep=""))
          if(ds.length(paste("data$", current_column, sep=""))[[1]] > summary[[1]][[2]]){
            variables_out_of_range <- paste(variables_out_of_range, current_column, sep = " ")
            print(paste(current_column,  "does not follow the established format", sep=" "))
GNajeral's avatar
GNajeral committed
192 193 194 195
            print(paste("It should follow the following format: ", possible_values))
          }
          else{
            print(paste(paste("Data in: ", current_column), " was valid"))
196 197 198
          }
        }
      )
199 200
      ################## FIN ESTO PODRÍA IR EN UNA FUNC DIFERENTE #############
      
201
    }else if (variable_type == "Categorical" || variable_type == "Binary"){
GNajeral's avatar
GNajeral committed
202 203
      possible_values <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == current_column]
      value_format <- lapply(strsplit(possible_values, "/") , str_trim)[[1]]
GNajeral's avatar
GNajeral committed
204 205 206 207 208 209 210 211 212 213 214 215
      tryCatch(
        error = function(cnd) {
          if(grepl("list them with datashield.errors()",cnd))
            error <- paste("Unable to analyse data" , datashield.errors() , sep = " ")
          else
            error <- paste("Unable to analyse data" , cnd, sep = " ")
          print(error)
          res <- c(res, error)
        },
        {
          contingency_table <- ds.table(paste("data$",current_column,sep=""))
          row_names <- rownames(contingency_table[[1]][[3]])
GNajeral's avatar
GNajeral committed
216
          result <- FALSE
GNajeral's avatar
GNajeral committed
217 218 219 220
          for (i in 1:length(row_names)) {
            if(row_names[i] == "NA")
              next
            if(!row_names[i] %in% value_format){
Pepe Márquez Romero's avatar
Pepe Márquez Romero committed
221
              variables_out_of_range <- paste(variables_out_of_range, current_column, "the range should be:" , possible_values ,sep = " ")
Pepe Márquez Romero's avatar
Pepe Márquez Romero committed
222
              variables_out_of_range <- paste(variables_out_of_range , "\n" , sep = "")
GNajeral's avatar
GNajeral committed
223
              print(paste(current_column,  "does not follow the established format", sep=" "))
GNajeral's avatar
GNajeral committed
224 225 226 227
              print(paste("It should follow the following format:", possible_values))
              print(paste("Instead of:", paste(row_names, collapse = " ")))
              result <- TRUE
              break
GNajeral's avatar
GNajeral committed
228 229
            }
          }
GNajeral's avatar
GNajeral committed
230 231 232
          if(!result){
            print(paste(paste("Data in: ", current_column), " was valid"))
          }
GNajeral's avatar
GNajeral committed
233 234
        }
      )
235 236
    }
  }
237
  return (variables_out_of_range)
238 239 240 241 242 243 244 245 246 247
}



auxConnections <- connect()
connections <- auxConnections[[1]]
inp <- auxConnections[[2]]

#Conexión a la base de datos

GNajeral's avatar
GNajeral committed
248 249 250 251
 ds.dim("data", datasources = connections)
 colnames <- ds.colnames("data")
 colnames

252
 # ds.dataFrameSubset(df.name = "data",
GNajeral's avatar
GNajeral committed
253 254
 #                    V1.name = "data$DMRAGEYR",
 #                    V2.name = "150",
255 256 257 258 259 260
 #                    Boolean.operator = "<=",
 #                    newobj = "inRangeHigh",
 #                    keep.NAs = TRUE,
 #                    datasources = connections)
 # 
 # ds.dataFrameSubset(df.name = "inRangeHigh",
GNajeral's avatar
GNajeral committed
261 262
 #                    V1.name = "inRangeHigh$DMRAGEYR",
 #                    V2.name = "0",
263 264 265 266 267
 #                    Boolean.operator = ">=",
 #                    newobj = "inRange",
 #                    keep.NAs = TRUE,
 #                    datasources = connections)
 # 
GNajeral's avatar
GNajeral committed
268 269
 # summary <- ds.summary("inRange$DMRAGEYR")
 # if(ds.length("data$DMRAGEYR")[[1]] > summary[[1]][[2]]){
270 271
 #   res <- c(res, paste(current_column,  "does not follow the established format" , sep="\n"))
 # }
272 273 274 275 276 277 278 279 280 281 282 283 284


#----------------------------------------------------------------------------

#Check valid column names
datastructure_name <- "data"
data_colnames <- ds.colnames(x=datastructure_name, datasources= connections)

data_colnames <- as.data.frame(data_colnames)

check_valid_columns <- check_column_names(codebook ,data_colnames)
valid_columns <- as.data.frame(check_valid_columns$colnames)
res <- ""
285
res <- check_values_format(valid_columns, codebook)
286 287
print(res)

288
datashield.logout(connections)
289