From 3a5868393ae76764f885d8a16365ab4bee42de53 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pepe=20M=C3=A1rquez=20Romero?=
 <pepe.marquezromero@gmail.com>
Date: Thu, 9 Feb 2023 17:26:32 +0100
Subject: [PATCH] modificando para que el chequeo de variables no sea en
 datashield, sea en local

---
 .gitignore               |    2 +
 valid_variables_script.R | 1119 ++++++++++++++++++--------------------
 2 files changed, 530 insertions(+), 591 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..615bdc1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.xlsx
+harmonized_data/*.csv
diff --git a/valid_variables_script.R b/valid_variables_script.R
index 6c62d8a..35fa2c9 100755
--- a/valid_variables_script.R
+++ b/valid_variables_script.R
@@ -1,591 +1,528 @@
-rm(list=ls())
-
-setwd("C:/Users/Victor/Documents/TFG/r-analytics-master")
-
-source("required_folder_checker.R")
-source("argument_hasher.R")
-source("dependency_installer.R")
-
-# install.packages("https://cran.r-project.org/src/contrib/Archive/DSI/DSI_1.2.0.tar.gz", repos=NULL, type="source")
-# install.packages("https://cran.r-project.org/src/contrib/Archive/DSOpal/DSOpal_1.2.0.tar.gz", repos=NULL, type="source")
-# install.packages("https://cran.r-project.org/src/contrib/Archive/DSLite/DSLite_1.2.0.tar.gz", repos=NULL, type="source")
-# install.packages("https://cran.r-project.org/src/contrib/Archive/opalr/opalr_2.1.0.tar.gz", repos=NULL, type="source")
-
-
-dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient")
-install_dependencies(dep_list)
-
-#,"DSI","DSOpal","DSLite"
-
-setwd("C:/Users/victor/Documents/TFG/r-analytics-master")
-source("connection_parameters.R")
-source("necessary_functions_connection.R")
-
-
-setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data")
-
-
-ComAndRF <- data.frame(read.csv("Com&RF.csv", sep=","))[1:64,1:5]
-Complications <- data.frame(read.csv("Complications.csv", sep=";"))[1:20,1:5]
-Dates <-  data.frame(read.csv("Dates.csv", sep=";"))[1:12,1:5]
-Demographics <-  data.frame(read.csv("Demographics.csv", sep=";"))[1:9,1:5]
-Home_med <-  data.frame(read.csv("Home_med.csv", sep=";"))[1:13,1:5]
-Imaging_data <-  data.frame(read.csv("Imaging_data.csv", sep=";"))[1:11,1:5]
-Labo <-  data.frame(read.csv("Labo.csv", sep=";"))[1:143,1:5]
-SiAndSympt <-  data.frame(read.csv("Si&Sympt.csv", sep=";"))[1:50,1:5]
-Treatment <-  data.frame(read.csv("Treatment.csv", sep=";"))[1:32,1:5]
-LifestyleAndDiet <-  data.frame(read.csv("Lifestyle&Diet.csv", sep=";"))[1:165,1:5]
-
-
-
-harmonised_data <- rbind(SiAndSympt,ComAndRF)
-harmonised_data <- rbind(harmonised_data,Treatment)
-harmonised_data <- rbind(harmonised_data,Dates)
-harmonised_data <- rbind(harmonised_data,Demographics)
-harmonised_data <- rbind(harmonised_data,Home_med)
-harmonised_data <- rbind(harmonised_data,Imaging_data)
-harmonised_data <- rbind(harmonised_data,Complications)
-harmonised_data <- rbind(harmonised_data,Labo)
-harmonised_data <- rbind(harmonised_data,LifestyleAndDiet)
-
-rm(list=c("SiAndSympt",
-          "Complications",
-          "ComAndRF",
-          "Dates",
-          "Demographics",
-          "Home_med",
-          "Imaging_data",
-          "Complications",
-          "Labo",
-          "LifestyleAndDiet"))
-
-categoric_vars = c("DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN",                         "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN",                         "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN",                         "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC")
-
-
-#----------------------------------------------------------------------------
-
-#Test if column names are valid
-check_column_names <- function(x){
-  
-  str_res <- "The column names:"
-  
-  for(i in 1:(nrow(data_colnames))){
-    if(!check_valid_name(data_colnames[i,1])){
-      str_res<- paste(str_res, data_colnames[i,1], sep=" ")
-    }
-  }
-  
-  str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ")
-  
-  return (str_res)
-  
-}
-
-#Test if a single variable name is valid
-check_valid_name <- function(x){
-  
-  valid <- FALSE
-  
-  aux <- as.data.frame(strsplit(x , split = "_"))
-  
-  if(aux[1,1] %in% harmonised_data$Harmonised.variable.name)
-    valid <- TRUE
-  
-  return (valid)
-  
-}
-
-valid_data_colnames <- function(x){
-  
-  valid_colnames = c()
-  
-  for(i in  1:(nrow(data_colnames))){
-    if(check_valid_name(data_colnames[i,1])){
-      valid_colnames = c(valid_colnames,data_colnames[i,1])
-    }
-  }
-  
-  
-  return(valid_colnames)
-}
-
-remove_space <- function(x){
-  searchString <- ' '
-  replacementString <- ''
-  res = sub(searchString,replacementString,x)
-  return(res)
-}
-
-remove_spaces_from_ds <- function(ds){
-  
-  res<- lapply(ds,remove_space )
-  
-  return(as.data.frame(res))
-  
-}
-
-is_number <- function(x){
-  res <- FALSE
-  
-  
-  if(length(x)!=0){
-    x <- str_replace(x,",",".")
-    
-    aux <- as.numeric(x)
-
-    
-    if(!is.na(aux))
-      res <- TRUE
-  }
-  
-  
-  return(res)
-  
-}
-
-check_values_not_categoric <- function(values, colname){
-  
-  valid_vals <- values
-  
-  possible_vals <- possible_values(colname)
-  
-  for(i in  1:length(values)){
-    
-    res<- FALSE
-    
-    value <- values[[i]]
-    
-    if(is_number(value)){
-      value <- str_replace(value,",",".")
-      value <- as.numeric(value)
-    }
-    
-    if(is.null(value)){
-      res <- TRUE
-    }
-      
-    else if( value == "NA" | value == "nan" | value == ".")
-      res <- TRUE
-    else{
-      
-      if(nrow(possible_vals) == 2 & (!grepl("DAT",colname, fixed=TRUE)) & colname !="CSXCTR"){
-        
-        if(colname=="LBXBEH" | colname=="LBXBEHn" | colname=="LBXBEM"){
-          lower = possible_vals[1,1][[1]]
-          higher = possible_vals[2,1][[1]]
-          
-        }else{
-          bounds <- as.data.frame(strsplit(possible_vals[1,1], '-'))
-          lowerAux <- str_replace(bounds [1,1],",",".")
-          higherAux <- str_replace(bounds [2,1],",",".")
-          lower <- as.numeric(remove_space(lowerAux))
-          higher <- as.numeric(remove_space(higherAux))
-        }
-        
-        
-        if ((value >= lower & value <= higher))
-          res <- TRUE
-      }
-      
-      if(nrow(possible_vals) == 3 | colname=="CSXCTR"){
-        if(value == 0 | value == 1)
-          res <- TRUE
-      }
-      
-      if(nrow(possible_vals) > 3){
-        
-        lower <- strtoi(remove_space(possible_vals[1,1]))
-        higher_b <- nrow(possible_vals)-1
-        higher <- strtoi(remove_space(possible_vals[higher_b,1]))
-        
-        if ((value >= lower & value <= higher))
-          res <- TRUE
-      }
-    }
-    
-    if(res == FALSE)
-      valid_vals[i] <- res
-    
-    
-  }
-  
-  return(valid_vals)
-}
-
-possible_values <- function(x){
-  
-  if(x=="LBXBEH" | x=="LBXBEHn" | x=="LBXBEM")
-    res <- t(as.data.frame(list(-20,20)))
-  
-  else{
-    
-    possible_value <- subset(harmonised_data,harmonised_data$Harmonised.variable.name==x)[1,5]
-    res <- strsplit(x=possible_value,split="/")
-  }
-  
-  return(as.data.frame(res))
-}
-
-possible_values_categoric <- function(x){
-  
-  possible_value <- subset(harmonised_data,harmonised_data$Harmonised.variable.name==x)[1,4]
-  res <- strsplit(x=possible_value,split="/")
-  
-  return(as.data.frame(res))
-}
-
-check_values_categoric <- function(values, colname){
-  
-  possible_vals <- possible_values_categoric(colname)
-  
-  res <- TRUE
-  
-  for(i in 1:length(values)){
-    if(!(values[[i]] %in% as.matrix(remove_spaces_from_ds(possible_vals)))){
-      res <- FALSE
-    }
-    
-  }
-  
-  return(res)
-  
-}
-
-
-get_values_from_quantiles <- function(x){
-  
-  data_summary <- ds.summary(x)
-  
-  low_quantile <- data_summary[[1]][3][[1]][[1]]
-  
-  high_quantile <- data_summary[[1]][3][[1]][[7]]
-  
-  return(list(low_quantile,high_quantile))
-  
-  
-}
-
-notify_error <- function(invalid_name_list,invalid_value_list,wrong_categoric, wrong_categoric_values, missing_numeric){
-  
-  res <- ""
-  
-  if(length(invalid_name_list) !=0){
-    res <- "There are invalid values in the numeric fields:"
-    
-    
-    for(i in 1:length(invalid_name_list)){
-      res <- paste(res, invalid_name_list[i], sep=" ")
-    }
-    
-    res <- paste(res, "\n", sep="")
-    
-    for(i in 1:length(invalid_name_list)){
-      res <- paste(res, error_message(invalid_name_list[i], invalid_value_list[[i]]), sep=" ")
-    }
-  }
-  
-  
-  if(length(wrong_categoric)!=0){
-    
-    res <- paste(res, "\n############################################################################ \n", sep="")
-    
-    res <- paste(res,"\nThe following categoric values are invalid:", sep=" ")
-    
-    for(i in 1:length(wrong_categoric)){
-      res <- paste(res, wrong_categoric[i], sep=" ")
-    }
-    
-    res <- paste(res, "\n", sep="")
-    
-    for(i in 1:length(wrong_categoric)){
-      res <- paste(res, error_message_categoric(wrong_categoric[i], wrong_categoric_values[[i]]), sep=" ")
-    }
-  }
-  
-  if(length(missing_numeric)!=0){
-    
-    res <- paste(res, "\n############################################################################ \n", sep="")
-    
-    res<- paste(res, "\nThe following fields are missing a numeric field:")
-    
-    for(i in 1:length(wrong_categoric)){
-      res <- paste(res, missing_numeric[i], sep=" ")
-    }
-  }
-  
-  res <- paste(res, "\n", sep="")
-  
-  
-  return(res)
-  
-}
-
-error_message_categoric <- function(colname, invalid_values){
-  
-  res<- "\nValues in the field"
-  res<- paste(res, colname, sep=" ")
-  res<- paste(res, "should be", sep=" ")
-  
-  range <- possible_values_categoric(colname)
-  
-  for(i in 1:nrow(range)){
-    res <- paste(res, remove_space(range[i,1]), sep=" ")
-  }
-  
-  
-  res<- paste(res, "\nBut values were:", sep=" ")
-  
-  for(j in 1:length(invalid_values)){
-    res<- paste(res, invalid_values[[j]], sep=" ")
-  }
-  
-  res<- paste(res, "\n\n", sep="")
-  
-  
-  return(res)
-  
-}
-
-error_message <- function(colname, invalid_values){
-  
-  res<- "\nValues in the field"
-  res<- paste(res, colname, sep=" ")
-  res<- paste(res, "should be", sep=" ")
-  
-  if(grepl("numeric", colname,fixed=TRUE))
-    new_colname <- strsplit(x=colname,split="_")[[1]][1]
-  else
-    new_colname <- colname
-  
-  range <- subset(harmonised_data, harmonised_data$Harmonised.variable.name == new_colname)
-  range <- range[5]
-  range <- as.data.frame(strsplit(range[1,1], '/'))
-  
-  #Range of values or null
-  if(nrow(range) == 2 & !grepl("DAT",colname, fixed=TRUE)){
-    bounds <- as.data.frame(strsplit(range[1,1], '-'))
-    lower <- remove_space(bounds [1,1])
-    higher <- remove_space(bounds [2,1])
-    res<- paste(res, "numbers between", sep=" ")
-    res<- paste(res, lower, sep=" ")
-    res<- paste(res, "and", sep=" ")
-    res<- paste(res, higher, sep=" ")
-    res<- paste(res, "(both included)", sep=" ")
-  }
-  
-  if(nrow(range) == 3){
-    res<- paste(res, "0 or 1", sep=" ")
-  }
-  
-  if(nrow(range) > 3){
-    lower <- strtoi(remove_space(range[1,1]))
-    higher_b <- nrow(range)-1
-    higher <- strtoi(remove_space(range[higher_b,1]))
-    res<- paste(res, "numbers between", sep=" ")
-    res<- paste(res, lower, sep=" ")
-    res<- paste(res, "and", sep=" ")
-    res<- paste(res, higher, sep=" ")
-    res<- paste(res, "(both included)", sep=" ")
-  }
-  
-  if(grepl("DAT",colname, fixed=TRUE)){
-    res<- paste(res, "dates with the following format: dd/mm/yyyy", sep=" ")
-  }
-  
-  
-  res<- paste(res, "\nBut values were:", sep=" ")
-  
-  for(j in 1:length(invalid_values)){
-    res<- paste(res, invalid_values[[j]], sep=" ")
-  }
-  
-  res<- paste(res, "\n", sep="")
-  
-  if(!is_number(invalid_values[[1]]))
-    res<- paste(res, "(It's missing a \"numeric\" field)", sep="")
-  
-  res<- paste(res, "\n", sep="")
-  
-  return(res)
-  
-}
-
-check_valid_values <- function(){
-  
-  invalid_name_list <- c()
-  cannot_analyse_list <- c()
-  invalid_values_list <- list()
-  wrong_categoric_values <- list()
-  wrong_categoric <- c()
-  missing_numeric <- c()
-  j<- 1
-  k <- 1
-  
-  for(i in  1:(nrow(valid_colnames))){
-    
-    data_table ="empty"
-    
-    if(!grepl("DMRBORN",valid_colnames[i,1], fixed=TRUE) & (!grepl("DAT",valid_colnames[i,1], fixed=TRUE)) & (!grepl("ISO",valid_colnames[i,1], fixed=TRUE))& (!grepl("BEF",valid_colnames[i,1], fixed=TRUE))){
-      
-      column <- "data$"
-      column <- paste(column, valid_colnames[i,1], sep="")
-      
-      tryCatch(
-        error = function(cnd) {
-          print("Unable to analyse data")
-          res <- FALSE
-        },
-        data_table <- as.data.frame(ds.table(column))
-      )
-      
-      if(data_table == "empty"){
-        
-        cannot_analyse_list <- c(cannot_analyse_list,valid_colnames[i,1])
-        
-      }else{
-        
-        
-        if (data_table[[1]] == "All studies failed for reasons identified below")
-          values <- get_values_from_quantiles(column)
-        else
-          values <- row.names(data_table)
-        
-        numeric_col<- paste(valid_colnames[i,1],"_numeric", sep="")
-        
-        if( valid_colnames[i,1] %in% categoric_vars ){
-          
-          #is_numeric <- grepl("numeric",valid_colnames[i,1], fixed=TRUE)
-          has_numeric <-  numeric_col %in% valid_colnames$`valid_data_colnames(data_colnames)`
-          
-          if(!has_numeric)
-            missing_numeric <- c(missing_numeric, valid_colnames[i,1])
-          
-          
-          if (data_table[[1]] == "All studies failed for reasons identified below"){
-            
-            cannot_analyse_list <- c(cannot_analyse_list,valid_colnames[i,1])
-            
-          }else  if(!check_values_categoric(values,valid_colnames[i,1])){
-            
-            print("Wrong categoric value:")
-            print(valid_colnames[i,1])
-            
-            wrong_categoric <- c(wrong_categoric, valid_colnames[i,1])
-            wrong_categoric_values[[k]] <- values
-            k <- k+1
-          }
-          
-          # if((!is_numeric & !has_numeric) | is_numeric)
-        }else{
-          
-          if(grepl("numeric", valid_colnames[i,1],fixed=TRUE))
-            new_colname <- strsplit(x=valid_colnames[i,1],split="_")[[1]][1]
-          else
-            new_colname <- valid_colnames[i,1]
-          
-          valid <- check_values_not_categoric(values, new_colname)
-          
-          if (FALSE %in% valid){
-            invalid_name_list <- c(invalid_name_list,valid_colnames[i,1])
-            invalid_values_list[[j]] <- values
-            j <- j+1
-          }
-          
-          
-          #print(valid_colnames[i,1])
-          #print(values)
-          
-        }#else
-        # print("This variable has a numeric version")
-      }
-      
-      
-    }
-  }
-  
-  missing_numeric
-  if(length(invalid_name_list)>0 | length(wrong_categoric)>0 | length(missing_numeric)>0){
-    res <- notify_error(invalid_name_list, invalid_values_list, wrong_categoric, wrong_categoric_values, missing_numeric)
-  }else{
-    res <- "All values are valid \n"
-  }
-  
-  if(length(cannot_analyse_list)>0){
-    res <- paste(res, "\n############################################################################ \n", sep="")
-    
-    res <- paste(res, notify_unable_analyse(cannot_analyse_list), sep="\n" )
-  }
-    
-  
-  
-  
-  return(res)
-  
-}
-
-notify_unable_analyse <- function(x){
-  
-  res <- "\nCould not obtain data from the fields:"
-  
-  for(i in 1:length(x)){
-    res <- paste(res, x[i], sep=" ")
-  }
-  
-  return (res)
-  
-  
-}
-
-auxConnections <- connect()
-connections <- auxConnections[[1]]
-inp <- auxConnections[[2]]
-
-#Conexión a la base de datos
-
-ds.dim("data", datasources = connections)
-ds.colnames("data")
-
-#----------------------------------------------------------------------------
-
-#Check valid column names
-datastructure_name <- "data"
-data_colnames <- ds.colnames(x=datastructure_name, datasources= connections)
-
-data_colnames <- as.data.frame(data_colnames)
-
-check_valid_columns <- check_column_names(data_colnames)
-
-valid_colnames <- as.data.frame(valid_data_colnames(data_colnames))
-
-
-
-#possible_values("CSXCTR")
-result <- ""
-result<-check_valid_values()
-print(check_valid_columns)
-datashield.logout(connections) 
-cat(result)
-
-
-
-# ds.dataFrameSubset(df.name = "data", V1.name = "data$DMXWT", "400" , Boolean.operator = '>', newobj = "columna")
-# # 
-# ds.summary("columna$DMXWT")
-# ds.dim("columna$DMXWT")
-# ds.table("columna$DMXWT")
-
-file_name<- paste(hospital_name,"_invalid_values.txt", sep="")
-
-#ds.heatmapPlot("data$LBDSALSIA", "data$RFXHC_numeric",type="combine", datasources = connections)
-
-
-#setwd("C:/Users/victor/Desktop/TFG/r-analytics-master/invalid_values")
-dir.create("../invalid_values", showWarnings = FALSE)
-setwd("../invalid_values")
-
-
-cat(check_valid_columns,file=file_name,sep="\n")
-cat(result,file=file_name,append=TRUE)
-
-datashield.logout(connections) 
-  
+rm(list=ls())
+
+dir_name <- readline("Introduce the name of the directory please: ")
+
+setwd(dir_name)
+
+source("required_folder_checker.R")
+source("argument_hasher.R")
+source("dependency_installer.R")
+
+
+dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient", "openxlsx")
+install_dependencies(dep_list)
+
+
+
+setwd(dir_name)
+
+#source("connection_parameters.R")
+#source("necessary_functions_connection.R")
+
+codebook <- read.csv("harmon.csv" , sep = ";")
+
+codebook_col_names <- as.data.frame(codebook$Harmonised.variable.name)
+
+names(codebook_col_names) <- c("col_names")
+
+setwd(paste(dir_name ,"/harmonized_data", sep=""))
+
+file_name <- readline("Introduce the name of the file to check the values: ")
+harmonized_data <- ""
+
+if (grepl(".csv" , file_name , fixed = TRUE)){
+  harmonized_data <- read.csv(file_name)
+}else if (grepl(".xlsx" , file_name , fixed = TRUE)){
+  harmonized_data <- read.xlsx(file_name)
+}
+
+
+categoric_vars = c("DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN",                         "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN",                         "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN",                         "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC")
+
+
+#----------------------------------------------------------------------------
+
+#Test if column names are valid
+check_column_names <- function(col_names){
+  
+  str_res <- "The column names:"
+  valid_colnames <- c()
+  for(i in 1:(nrow(col_names))){
+    col_name <- col_names[i,1]
+    if(!check_valid_name(col_name)){
+      str_res<- paste(str_res, col_name, sep=" ")
+    }else{
+      valid_colnames = c(valid_colnames, col_name)
+    }
+  }
+  
+  str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ")
+  
+  new_list <- list("not_colnames" = str_res , "colnames" = valid_colnames)
+  return (new_list)
+  
+}
+
+#Test if a single variable name is valid
+check_valid_name <- function(col_name){
+  
+  valid <- FALSE
+  
+  if(col_name %in% codebook_col_names$col_names)
+    valid <- TRUE
+  
+  return (valid)
+  
+}
+
+remove_space <- function(x){
+  searchString <- ' '
+  replacementString <- ''
+  res = sub(searchString,replacementString,x)
+  return(res)
+}
+
+remove_spaces_from_ds <- function(ds){
+  
+  res<- lapply(ds,remove_space )
+  
+  return(as.data.frame(res))
+  
+}
+
+is_number <- function(x){
+  res <- FALSE
+  
+  
+  if(length(x)!=0){
+    x <- str_replace(x,",",".")
+    
+    aux <- as.numeric(x)
+
+    
+    if(!is.na(aux))
+      res <- TRUE
+  }
+  
+  
+  return(res)
+  
+}
+
+check_values_not_categoric <- function(values, colname){
+  
+  valid_vals <- values
+  
+  possible_vals <- possible_values(colname)
+  
+  for(i in  1:length(values)){
+    
+    res<- FALSE
+    
+    value <- values[[i]]
+    
+    if(is_number(value)){
+      value <- str_replace(value,",",".")
+      value <- as.numeric(value)
+    }
+    
+    if(is.null(value)){
+      res <- TRUE
+    }
+      
+    else if( value == "NA" | value == "nan" | value == ".")
+      res <- TRUE
+    else{
+      
+      if(nrow(possible_vals) == 2 & (!grepl("DAT",colname, fixed=TRUE)) & colname !="CSXCTR"){
+        
+        if(colname=="LBXBEH" | colname=="LBXBEHn" | colname=="LBXBEM"){
+          lower = possible_vals[1,1][[1]]
+          higher = possible_vals[2,1][[1]]
+          
+        }else{
+          bounds <- as.data.frame(strsplit(possible_vals[1,1], '-'))
+          lowerAux <- str_replace(bounds [1,1],",",".")
+          higherAux <- str_replace(bounds [2,1],",",".")
+          lower <- as.numeric(remove_space(lowerAux))
+          higher <- as.numeric(remove_space(higherAux))
+        }
+        
+        
+        if ((value >= lower & value <= higher))
+          res <- TRUE
+      }
+      
+      if(nrow(possible_vals) == 3 | colname=="CSXCTR"){
+        if(value == 0 | value == 1)
+          res <- TRUE
+      }
+      
+      if(nrow(possible_vals) > 3){
+        
+        lower <- strtoi(remove_space(possible_vals[1,1]))
+        higher_b <- nrow(possible_vals)-1
+        higher <- strtoi(remove_space(possible_vals[higher_b,1]))
+        
+        if ((value >= lower & value <= higher))
+          res <- TRUE
+      }
+    }
+    
+    if(res == FALSE)
+      valid_vals[i] <- res
+    
+    
+  }
+  
+  return(valid_vals)
+}
+
+possible_values <- function(x){
+  
+  if(x=="LBXBEH" | x=="LBXBEHn" | x=="LBXBEM")
+    res <- t(as.data.frame(list(-20,20)))
+  
+  else{
+    
+    possible_value <- subset(harmonized_data,harmonized_data$harmonized.variable.name==x)[1,5]
+    res <- strsplit(x=possible_value,split="/")
+  }
+  
+  return(as.data.frame(res))
+}
+
+possible_values_categoric <- function(x){
+  
+  possible_value <- subset(harmonized_data,harmonized_data$harmonized.variable.name==x)[1,4]
+  res <- strsplit(x=possible_value,split="/")
+  
+  return(as.data.frame(res))
+}
+
+check_values_categoric <- function(values, colname){
+  
+  possible_vals <- possible_values_categoric(colname)
+  
+  res <- TRUE
+  
+  for(i in 1:length(values)){
+    if(!(values[[i]] %in% as.matrix(remove_spaces_from_ds(possible_vals)))){
+      res <- FALSE
+    }
+    
+  }
+  
+  return(res)
+  
+}
+
+
+get_values_from_quantiles <- function(x){
+  
+  data_summary <- summary(x)
+  
+  low_quantile <- data_summary[[1]][3][[1]][[1]]
+  
+  high_quantile <- data_summary[[1]][3][[1]][[7]]
+  
+  return(list(low_quantile,high_quantile))
+  
+  
+}
+
+notify_error <- function(invalid_name_list,invalid_value_list,wrong_categoric, wrong_categoric_values, missing_numeric){
+  
+  res <- ""
+  
+  if(length(invalid_name_list) !=0){
+    res <- "There are invalid values in the numeric fields:"
+    
+    
+    for(i in 1:length(invalid_name_list)){
+      res <- paste(res, invalid_name_list[i], sep=" ")
+    }
+    
+    res <- paste(res, "\n", sep="")
+    
+    for(i in 1:length(invalid_name_list)){
+      res <- paste(res, error_message(invalid_name_list[i], invalid_value_list[[i]]), sep=" ")
+    }
+  }
+  
+  
+  if(length(wrong_categoric)!=0){
+    
+    res <- paste(res, "\n############################################################################ \n", sep="")
+    
+    res <- paste(res,"\nThe following categoric values are invalid:", sep=" ")
+    
+    for(i in 1:length(wrong_categoric)){
+      res <- paste(res, wrong_categoric[i], sep=" ")
+    }
+    
+    res <- paste(res, "\n", sep="")
+    
+    for(i in 1:length(wrong_categoric)){
+      res <- paste(res, error_message_categoric(wrong_categoric[i], wrong_categoric_values[[i]]), sep=" ")
+    }
+  }
+  
+  if(length(missing_numeric)!=0){
+    
+    res <- paste(res, "\n############################################################################ \n", sep="")
+    
+    res<- paste(res, "\nThe following fields are missing a numeric field:")
+    
+    for(i in 1:length(wrong_categoric)){
+      res <- paste(res, missing_numeric[i], sep=" ")
+    }
+  }
+  
+  res <- paste(res, "\n", sep="")
+  
+  
+  return(res)
+  
+}
+
+error_message_categoric <- function(colname, invalid_values){
+  
+  res<- "\nValues in the field"
+  res<- paste(res, colname, sep=" ")
+  res<- paste(res, "should be", sep=" ")
+  
+  range <- possible_values_categoric(colname)
+  
+  for(i in 1:nrow(range)){
+    res <- paste(res, remove_space(range[i,1]), sep=" ")
+  }
+  
+  
+  res<- paste(res, "\nBut values were:", sep=" ")
+  
+  for(j in 1:length(invalid_values)){
+    res<- paste(res, invalid_values[[j]], sep=" ")
+  }
+  
+  res<- paste(res, "\n\n", sep="")
+  
+  
+  return(res)
+  
+}
+
+error_message <- function(colname, invalid_values){
+  
+  res<- "\nValues in the field"
+  res<- paste(res, colname, sep=" ")
+  res<- paste(res, "should be", sep=" ")
+  
+  if(grepl("numeric", colname,fixed=TRUE))
+    new_colname <- strsplit(x=colname,split="_")[[1]][1]
+  else
+    new_colname <- colname
+  
+  range <- subset(harmonized_data, harmonized_data$harmonized.variable.name == new_colname)
+  range <- range[5]
+  range <- as.data.frame(strsplit(range[1,1], '/'))
+  
+  #Range of values or null
+  if(nrow(range) == 2 & !grepl("DAT",colname, fixed=TRUE)){
+    bounds <- as.data.frame(strsplit(range[1,1], '-'))
+    lower <- remove_space(bounds [1,1])
+    higher <- remove_space(bounds [2,1])
+    res<- paste(res, "numbers between", sep=" ")
+    res<- paste(res, lower, sep=" ")
+    res<- paste(res, "and", sep=" ")
+    res<- paste(res, higher, sep=" ")
+    res<- paste(res, "(both included)", sep=" ")
+  }
+  
+  if(nrow(range) == 3){
+    res<- paste(res, "0 or 1", sep=" ")
+  }
+  
+  if(nrow(range) > 3){
+    lower <- strtoi(remove_space(range[1,1]))
+    higher_b <- nrow(range)-1
+    higher <- strtoi(remove_space(range[higher_b,1]))
+    res<- paste(res, "numbers between", sep=" ")
+    res<- paste(res, lower, sep=" ")
+    res<- paste(res, "and", sep=" ")
+    res<- paste(res, higher, sep=" ")
+    res<- paste(res, "(both included)", sep=" ")
+  }
+  
+  if(grepl("DAT",colname, fixed=TRUE)){
+    res<- paste(res, "dates with the following format: dd/mm/yyyy", sep=" ")
+  }
+  
+  
+  res<- paste(res, "\nBut values were:", sep=" ")
+  
+  for(j in 1:length(invalid_values)){
+    res<- paste(res, invalid_values[[j]], sep=" ")
+  }
+  
+  res<- paste(res, "\n", sep="")
+  
+  if(!is_number(invalid_values[[1]]))
+    res<- paste(res, "(It's missing a \"numeric\" field)", sep="")
+  
+  res<- paste(res, "\n", sep="")
+  
+  return(res)
+  
+}
+
+check_valid_values <- function(){
+  
+  invalid_name_list <- c()
+  cannot_analyse_list <- c()
+  invalid_values_list <- list()
+  wrong_categoric_values <- list()
+  wrong_categoric <- c()
+  missing_numeric <- c()
+  j<- 1
+  k <- 1
+  
+  for(i in  1:(nrow(valid_colnames))){
+    
+    data_table ="empty"
+    
+    if(!grepl("DMRBORN",valid_colnames[i,1], fixed=TRUE) & (!grepl("DAT",valid_colnames[i,1], fixed=TRUE)) & (!grepl("ISO",valid_colnames[i,1], fixed=TRUE))& (!grepl("BEF",valid_colnames[i,1], fixed=TRUE))){
+      
+      column <- "data$"
+      column <- paste(column, valid_colnames[i,1], sep="")
+      
+      tryCatch(
+        error = function(cnd) {
+          print("Unable to analyse data")
+          res <- FALSE
+        },
+        data_table <- as.data.frame(table(column))
+      )
+      
+      if(data_table == "empty"){
+        
+        cannot_analyse_list <- c(cannot_analyse_list,valid_colnames[i,1])
+        
+      }else{
+        
+        
+        if (data_table[[1]] == "All studies failed for reasons identified below")
+          values <- get_values_from_quantiles(column)
+        else
+          values <- row.names(data_table)
+        
+        numeric_col<- paste(valid_colnames[i,1],"_numeric", sep="")
+        
+        if( valid_colnames[i,1] %in% categoric_vars ){
+          
+          #is_numeric <- grepl("numeric",valid_colnames[i,1], fixed=TRUE)
+          has_numeric <-  numeric_col %in% valid_colnames$`valid_data_colnames(data_colnames)`
+          
+          if(!has_numeric)
+            missing_numeric <- c(missing_numeric, valid_colnames[i,1])
+          
+          
+          if (data_table[[1]] == "All studies failed for reasons identified below"){
+            
+            cannot_analyse_list <- c(cannot_analyse_list,valid_colnames[i,1])
+            
+          }else  if(!check_values_categoric(values,valid_colnames[i,1])){
+            
+            print("Wrong categoric value:")
+            print(valid_colnames[i,1])
+            
+            wrong_categoric <- c(wrong_categoric, valid_colnames[i,1])
+            wrong_categoric_values[[k]] <- values
+            k <- k+1
+          }
+          
+        }else{
+          
+          if(grepl("numeric", valid_colnames[i,1],fixed=TRUE))
+            new_colname <- strsplit(x=valid_colnames[i,1],split="_")[[1]][1]
+          else
+            new_colname <- valid_colnames[i,1]
+          
+          valid <- check_values_not_categoric(values, new_colname)
+          
+          if (FALSE %in% valid){
+            invalid_name_list <- c(invalid_name_list,valid_colnames[i,1])
+            invalid_values_list[[j]] <- values
+            j <- j+1
+          }
+          
+          
+        }
+      }
+      
+      
+    }
+  }
+  
+  missing_numeric
+  if(length(invalid_name_list)>0 | length(wrong_categoric)>0 | length(missing_numeric)>0){
+    res <- notify_error(invalid_name_list, invalid_values_list, wrong_categoric, wrong_categoric_values, missing_numeric)
+  }else{
+    res <- "All values are valid \n"
+  }
+  
+  if(length(cannot_analyse_list)>0){
+    res <- paste(res, "\n############################################################################ \n", sep="")
+    
+    res <- paste(res, notify_unable_analyse(cannot_analyse_list), sep="\n" )
+  }
+    
+  
+  
+  
+  return(res)
+  
+}
+
+notify_unable_analyse <- function(x){
+  
+  res <- "\nCould not obtain data from the fields:"
+  
+  for(i in 1:length(x)){
+    res <- paste(res, x[i], sep=" ")
+  }
+  
+  return (res)
+  
+  
+}
+
+data_colnames <- as.data.frame(colnames(harmonized_data))
+
+check_valid_columns <- check_column_names(data_colnames)
+
+columns_not_valid <- check_valid_columns$not_colnames
+
+valid_colnames <- as.data.frame(check_valid_columns$colnames)
+names(valid_colnames) = c("valid_colnames")
+
+
+result <- ""
+result<-check_valid_values()
+print(check_valid_columns)
+#datashield.logout(connections) 
+cat(result)
+
+
+
+file_name<- paste(hospital_name,"_invalid_values.txt", sep="")
+
+
+dir.create("../invalid_values", showWarnings = FALSE)
+setwd("../invalid_values")
+
+cat(check_valid_columns,file=file_name,sep="\n")
+cat(result,file=file_name,append=TRUE)
+
+#datashield.logout(connections) 
+  
-- 
2.24.1