rm(list=ls()) setwd("C:/Users/Victor/Documents/TFG/r-analytics-master") source("required_folder_checker.R") source("argument_hasher.R") source("dependency_installer.R") # install.packages("https://cran.r-project.org/src/contrib/Archive/DSI/DSI_1.2.0.tar.gz", repos=NULL, type="source") # install.packages("https://cran.r-project.org/src/contrib/Archive/DSOpal/DSOpal_1.2.0.tar.gz", repos=NULL, type="source") # install.packages("https://cran.r-project.org/src/contrib/Archive/DSLite/DSLite_1.2.0.tar.gz", repos=NULL, type="source") # install.packages("https://cran.r-project.org/src/contrib/Archive/opalr/opalr_2.1.0.tar.gz", repos=NULL, type="source") dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient") install_dependencies(dep_list) #,"DSI","DSOpal","DSLite" setwd("C:/Users/victor/Documents/TFG/r-analytics-master") source("connection_parameters.R") source("necessary_functions_connection.R") setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data") ComAndRF <- data.frame(read.csv("Com&RF.csv", sep=","))[1:64,1:5] Complications <- data.frame(read.csv("Complications.csv", sep=";"))[1:20,1:5] Dates <- data.frame(read.csv("Dates.csv", sep=";"))[1:12,1:5] Demographics <- data.frame(read.csv("Demographics.csv", sep=";"))[1:9,1:5] Home_med <- data.frame(read.csv("Home_med.csv", sep=";"))[1:13,1:5] Imaging_data <- data.frame(read.csv("Imaging_data.csv", sep=";"))[1:11,1:5] Labo <- data.frame(read.csv("Labo.csv", sep=";"))[1:143,1:5] SiAndSympt <- data.frame(read.csv("Si&Sympt.csv", sep=";"))[1:50,1:5] Treatment <- data.frame(read.csv("Treatment.csv", sep=";"))[1:32,1:5] LifestyleAndDiet <- data.frame(read.csv("Lifestyle&Diet.csv", sep=";"))[1:165,1:5] harmonised_data <- rbind(SiAndSympt,ComAndRF) harmonised_data <- rbind(harmonised_data,Treatment) harmonised_data <- rbind(harmonised_data,Dates) harmonised_data <- rbind(harmonised_data,Demographics) harmonised_data <- rbind(harmonised_data,Home_med) harmonised_data <- rbind(harmonised_data,Imaging_data) harmonised_data <- rbind(harmonised_data,Complications) harmonised_data <- rbind(harmonised_data,Labo) harmonised_data <- rbind(harmonised_data,LifestyleAndDiet) rm(list=c("SiAndSympt", "Complications", "ComAndRF", "Dates", "Demographics", "Home_med", "Imaging_data", "Complications", "Labo", "LifestyleAndDiet")) categoric_vars = c("DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN", "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN", "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC") #---------------------------------------------------------------------------- #Test if column names are valid check_column_names <- function(x){ str_res <- "The column names:" for(i in 1:(nrow(data_colnames))){ if(!check_valid_name(data_colnames[i,1])){ str_res<- paste(str_res, data_colnames[i,1], sep=" ") } } str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ") return (str_res) } #Test if a single variable name is valid check_valid_name <- function(x){ valid <- FALSE aux <- as.data.frame(strsplit(x , split = "_")) if(aux[1,1] %in% harmonised_data$Harmonised.variable.name) valid <- TRUE return (valid) } valid_data_colnames <- function(x){ valid_colnames = c() for(i in 1:(nrow(data_colnames))){ if(check_valid_name(data_colnames[i,1])){ valid_colnames = c(valid_colnames,data_colnames[i,1]) } } return(valid_colnames) } remove_space <- function(x){ searchString <- ' ' replacementString <- '' res = sub(searchString,replacementString,x) return(res) } remove_spaces_from_ds <- function(ds){ res<- lapply(ds,remove_space ) return(as.data.frame(res)) } is_number <- function(x){ res <- FALSE if(length(x)!=0){ x <- str_replace(x,",",".") aux <- as.numeric(x) if(!is.na(aux)) res <- TRUE } return(res) } check_values_not_categoric <- function(values, colname){ valid_vals <- values possible_vals <- possible_values(colname) for(i in 1:length(values)){ res<- FALSE value <- values[[i]] if(is_number(value)){ value <- str_replace(value,",",".") value <- as.numeric(value) } if(is.null(value)){ res <- TRUE } else if( value == "NA" | value == "nan" | value == ".") res <- TRUE else{ if(nrow(possible_vals) == 2 & (!grepl("DAT",colname, fixed=TRUE)) & colname !="CSXCTR"){ if(colname=="LBXBEH" | colname=="LBXBEHn" | colname=="LBXBEM"){ lower = possible_vals[1,1][[1]] higher = possible_vals[2,1][[1]] }else{ bounds <- as.data.frame(strsplit(possible_vals[1,1], '-')) lowerAux <- str_replace(bounds [1,1],",",".") higherAux <- str_replace(bounds [2,1],",",".") lower <- as.numeric(remove_space(lowerAux)) higher <- as.numeric(remove_space(higherAux)) } if ((value >= lower & value <= higher)) res <- TRUE } if(nrow(possible_vals) == 3 | colname=="CSXCTR"){ if(value == 0 | value == 1) res <- TRUE } if(nrow(possible_vals) > 3){ lower <- strtoi(remove_space(possible_vals[1,1])) higher_b <- nrow(possible_vals)-1 higher <- strtoi(remove_space(possible_vals[higher_b,1])) if ((value >= lower & value <= higher)) res <- TRUE } } if(res == FALSE) valid_vals[i] <- res } return(valid_vals) } possible_values <- function(x){ if(x=="LBXBEH" | x=="LBXBEHn" | x=="LBXBEM") res <- t(as.data.frame(list(-20,20))) else{ possible_value <- subset(harmonised_data,harmonised_data$Harmonised.variable.name==x)[1,5] res <- strsplit(x=possible_value,split="/") } return(as.data.frame(res)) } possible_values_categoric <- function(x){ possible_value <- subset(harmonised_data,harmonised_data$Harmonised.variable.name==x)[1,4] res <- strsplit(x=possible_value,split="/") return(as.data.frame(res)) } check_values_categoric <- function(values, colname){ possible_vals <- possible_values_categoric(colname) res <- TRUE for(i in 1:length(values)){ if(!(values[[i]] %in% as.matrix(remove_spaces_from_ds(possible_vals)))){ res <- FALSE } } return(res) } get_values_from_quantiles <- function(x){ data_summary <- ds.summary(x) low_quantile <- data_summary[[1]][3][[1]][[1]] high_quantile <- data_summary[[1]][3][[1]][[7]] return(list(low_quantile,high_quantile)) } notify_error <- function(invalid_name_list,invalid_value_list,wrong_categoric, wrong_categoric_values, missing_numeric){ res <- "" if(length(invalid_name_list) !=0){ res <- "There are invalid values in the numeric fields:" for(i in 1:length(invalid_name_list)){ res <- paste(res, invalid_name_list[i], sep=" ") } res <- paste(res, "\n", sep="") for(i in 1:length(invalid_name_list)){ res <- paste(res, error_message(invalid_name_list[i], invalid_value_list[[i]]), sep=" ") } } if(length(wrong_categoric)!=0){ res <- paste(res, "\n############################################################################ \n", sep="") res <- paste(res,"\nThe following categoric values are invalid:", sep=" ") for(i in 1:length(wrong_categoric)){ res <- paste(res, wrong_categoric[i], sep=" ") } res <- paste(res, "\n", sep="") for(i in 1:length(wrong_categoric)){ res <- paste(res, error_message_categoric(wrong_categoric[i], wrong_categoric_values[[i]]), sep=" ") } } if(length(missing_numeric)!=0){ res <- paste(res, "\n############################################################################ \n", sep="") res<- paste(res, "\nThe following fields are missing a numeric field:") for(i in 1:length(wrong_categoric)){ res <- paste(res, missing_numeric[i], sep=" ") } } res <- paste(res, "\n", sep="") return(res) } error_message_categoric <- function(colname, invalid_values){ res<- "\nValues in the field" res<- paste(res, colname, sep=" ") res<- paste(res, "should be", sep=" ") range <- possible_values_categoric(colname) for(i in 1:nrow(range)){ res <- paste(res, remove_space(range[i,1]), sep=" ") } res<- paste(res, "\nBut values were:", sep=" ") for(j in 1:length(invalid_values)){ res<- paste(res, invalid_values[[j]], sep=" ") } res<- paste(res, "\n\n", sep="") return(res) } error_message <- function(colname, invalid_values){ res<- "\nValues in the field" res<- paste(res, colname, sep=" ") res<- paste(res, "should be", sep=" ") if(grepl("numeric", colname,fixed=TRUE)) new_colname <- strsplit(x=colname,split="_")[[1]][1] else new_colname <- colname range <- subset(harmonised_data, harmonised_data$Harmonised.variable.name == new_colname) range <- range[5] range <- as.data.frame(strsplit(range[1,1], '/')) #Range of values or null if(nrow(range) == 2 & !grepl("DAT",colname, fixed=TRUE)){ bounds <- as.data.frame(strsplit(range[1,1], '-')) lower <- remove_space(bounds [1,1]) higher <- remove_space(bounds [2,1]) res<- paste(res, "numbers between", sep=" ") res<- paste(res, lower, sep=" ") res<- paste(res, "and", sep=" ") res<- paste(res, higher, sep=" ") res<- paste(res, "(both included)", sep=" ") } if(nrow(range) == 3){ res<- paste(res, "0 or 1", sep=" ") } if(nrow(range) > 3){ lower <- strtoi(remove_space(range[1,1])) higher_b <- nrow(range)-1 higher <- strtoi(remove_space(range[higher_b,1])) res<- paste(res, "numbers between", sep=" ") res<- paste(res, lower, sep=" ") res<- paste(res, "and", sep=" ") res<- paste(res, higher, sep=" ") res<- paste(res, "(both included)", sep=" ") } if(grepl("DAT",colname, fixed=TRUE)){ res<- paste(res, "dates with the following format: dd/mm/yyyy", sep=" ") } res<- paste(res, "\nBut values were:", sep=" ") for(j in 1:length(invalid_values)){ res<- paste(res, invalid_values[[j]], sep=" ") } res<- paste(res, "\n", sep="") if(!is_number(invalid_values[[1]])) res<- paste(res, "(It's missing a \"numeric\" field)", sep="") res<- paste(res, "\n", sep="") return(res) } check_valid_values <- function(){ invalid_name_list <- c() cannot_analyse_list <- c() invalid_values_list <- list() wrong_categoric_values <- list() wrong_categoric <- c() missing_numeric <- c() j<- 1 k <- 1 for(i in 1:(nrow(valid_colnames))){ data_table ="empty" if(!grepl("DMRBORN",valid_colnames[i,1], fixed=TRUE) & (!grepl("DAT",valid_colnames[i,1], fixed=TRUE)) & (!grepl("ISO",valid_colnames[i,1], fixed=TRUE))& (!grepl("BEF",valid_colnames[i,1], fixed=TRUE))){ column <- "data$" column <- paste(column, valid_colnames[i,1], sep="") tryCatch( error = function(cnd) { print("Unable to analyse data") res <- FALSE }, data_table <- as.data.frame(ds.table(column)) ) if(data_table == "empty"){ cannot_analyse_list <- c(cannot_analyse_list,valid_colnames[i,1]) }else{ if (data_table[[1]] == "All studies failed for reasons identified below") values <- get_values_from_quantiles(column) else values <- row.names(data_table) numeric_col<- paste(valid_colnames[i,1],"_numeric", sep="") if( valid_colnames[i,1] %in% categoric_vars ){ #is_numeric <- grepl("numeric",valid_colnames[i,1], fixed=TRUE) has_numeric <- numeric_col %in% valid_colnames$`valid_data_colnames(data_colnames)` if(!has_numeric) missing_numeric <- c(missing_numeric, valid_colnames[i,1]) if (data_table[[1]] == "All studies failed for reasons identified below"){ cannot_analyse_list <- c(cannot_analyse_list,valid_colnames[i,1]) }else if(!check_values_categoric(values,valid_colnames[i,1])){ print("Wrong categoric value:") print(valid_colnames[i,1]) wrong_categoric <- c(wrong_categoric, valid_colnames[i,1]) wrong_categoric_values[[k]] <- values k <- k+1 } # if((!is_numeric & !has_numeric) | is_numeric) }else{ if(grepl("numeric", valid_colnames[i,1],fixed=TRUE)) new_colname <- strsplit(x=valid_colnames[i,1],split="_")[[1]][1] else new_colname <- valid_colnames[i,1] valid <- check_values_not_categoric(values, new_colname) if (FALSE %in% valid){ invalid_name_list <- c(invalid_name_list,valid_colnames[i,1]) invalid_values_list[[j]] <- values j <- j+1 } #print(valid_colnames[i,1]) #print(values) }#else # print("This variable has a numeric version") } } } missing_numeric if(length(invalid_name_list)>0 | length(wrong_categoric)>0 | length(missing_numeric)>0){ res <- notify_error(invalid_name_list, invalid_values_list, wrong_categoric, wrong_categoric_values, missing_numeric) }else{ res <- "All values are valid \n" } if(length(cannot_analyse_list)>0){ res <- paste(res, "\n############################################################################ \n", sep="") res <- paste(res, notify_unable_analyse(cannot_analyse_list), sep="\n" ) } return(res) } notify_unable_analyse <- function(x){ res <- "\nCould not obtain data from the fields:" for(i in 1:length(x)){ res <- paste(res, x[i], sep=" ") } return (res) } auxConnections <- connect() connections <- auxConnections[[1]] inp <- auxConnections[[2]] #Conexión a la base de datos ds.dim("data", datasources = connections) ds.colnames("data") #---------------------------------------------------------------------------- #Check valid column names datastructure_name <- "data" data_colnames <- ds.colnames(x=datastructure_name, datasources= connections) data_colnames <- as.data.frame(data_colnames) check_valid_columns <- check_column_names(data_colnames) valid_colnames <- as.data.frame(valid_data_colnames(data_colnames)) #possible_values("CSXCTR") result <- "" result<-check_valid_values() print(check_valid_columns) datashield.logout(connections) cat(result) # ds.dataFrameSubset(df.name = "data", V1.name = "data$DMXWT", "400" , Boolean.operator = '>', newobj = "columna") # # # ds.summary("columna$DMXWT") # ds.dim("columna$DMXWT") # ds.table("columna$DMXWT") file_name<- paste(hospital_name,"_invalid_values.txt", sep="") #ds.heatmapPlot("data$LBDSALSIA", "data$RFXHC_numeric",type="combine", datasources = connections) #setwd("C:/Users/victor/Desktop/TFG/r-analytics-master/invalid_values") dir.create("../invalid_values", showWarnings = FALSE) setwd("../invalid_values") cat(check_valid_columns,file=file_name,sep="\n") cat(result,file=file_name,append=TRUE) datashield.logout(connections)