diff --git a/valid_variables_script.R b/valid_variables_script.R old mode 100755 new mode 100644 index f96d4158ceadf483e0240ccc6e83671fb2568e07..9576c763d125b853969d5c405ad92b4264da1ac5 --- a/valid_variables_script.R +++ b/valid_variables_script.R @@ -1,35 +1,63 @@ rm(list=ls()) -dir_name <- readline("Introduce the name of the directory please: ") - -setwd(dir_name) +setwd("C:/Users/Victor/Documents/TFG/r-analytics-master") +source("required_folder_checker.R") +source("argument_hasher.R") source("dependency_installer.R") +# install.packages("https://cran.r-project.org/src/contrib/Archive/DSI/DSI_1.2.0.tar.gz", repos=NULL, type="source") +# install.packages("https://cran.r-project.org/src/contrib/Archive/DSOpal/DSOpal_1.2.0.tar.gz", repos=NULL, type="source") +# install.packages("https://cran.r-project.org/src/contrib/Archive/DSLite/DSLite_1.2.0.tar.gz", repos=NULL, type="source") +# install.packages("https://cran.r-project.org/src/contrib/Archive/opalr/opalr_2.1.0.tar.gz", repos=NULL, type="source") + -dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient", "openxlsx") +dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient") install_dependencies(dep_list) -#source("connection_parameters.R") -#source("necessary_functions_connection.R") +#,"DSI","DSOpal","DSLite" -codebook <- read.csv("new_harmon.csv" , sep = ",") +setwd("C:/Users/victor/Documents/TFG/r-analytics-master") +source("connection_parameters.R") +source("necessary_functions_connection.R") -codebook_col_names <- as.data.frame(codebook$Harmonised.variable.name) -names(codebook_col_names) <- c("col_names") +setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data") -setwd(paste(dir_name ,"/harmonized_data", sep="")) -file_name <- readline("Introduce the name of the file to check the values: ") -harmonized_data <- "" +ComAndRF <- data.frame(read.csv("Com&RF.csv", sep=","))[1:64,1:5] +Complications <- data.frame(read.csv("Complications.csv", sep=";"))[1:20,1:5] +Dates <- data.frame(read.csv("Dates.csv", sep=";"))[1:12,1:5] +Demographics <- data.frame(read.csv("Demographics.csv", sep=";"))[1:9,1:5] +Home_med <- data.frame(read.csv("Home_med.csv", sep=";"))[1:13,1:5] +Imaging_data <- data.frame(read.csv("Imaging_data.csv", sep=";"))[1:11,1:5] +Labo <- data.frame(read.csv("Labo.csv", sep=";"))[1:143,1:5] +SiAndSympt <- data.frame(read.csv("Si&Sympt.csv", sep=";"))[1:50,1:5] +Treatment <- data.frame(read.csv("Treatment.csv", sep=";"))[1:32,1:5] +LifestyleAndDiet <- data.frame(read.csv("Lifestyle&Diet.csv", sep=";"))[1:165,1:5] + -if (grepl(".csv" , file_name , fixed = TRUE)){ - harmonized_data <- read.csv(file_name) -}else if (grepl(".xlsx" , file_name , fixed = TRUE)){ - harmonized_data <- read.xlsx(file_name) -} +harmonised_data <- rbind(SiAndSympt,ComAndRF) +harmonised_data <- rbind(harmonised_data,Treatment) +harmonised_data <- rbind(harmonised_data,Dates) +harmonised_data <- rbind(harmonised_data,Demographics) +harmonised_data <- rbind(harmonised_data,Home_med) +harmonised_data <- rbind(harmonised_data,Imaging_data) +harmonised_data <- rbind(harmonised_data,Complications) +harmonised_data <- rbind(harmonised_data,Labo) +harmonised_data <- rbind(harmonised_data,LifestyleAndDiet) + +rm(list=c("SiAndSympt", + "Complications", + "ComAndRF", + "Dates", + "Demographics", + "Home_med", + "Imaging_data", + "Complications", + "Labo", + "LifestyleAndDiet")) categoric_vars = c("DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN", "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN", "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC") @@ -37,45 +65,50 @@ categoric_vars = c("DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DS #---------------------------------------------------------------------------- #Test if column names are valid -check_column_names <- function(col_names){ +check_column_names <- function(x){ str_res <- "The column names:" - valid_colnames <- c() - repeated_colnames <- c() - for(i in 1:(nrow(col_names))){ - col_name <- col_names[i,1] - number_of_column <- check_valid_name(col_name) - if( number_of_column == 0){ - str_res<- paste(str_res, col_name, sep=" ") - }else if (number_of_column == 1){ - valid_colnames = c(valid_colnames, col_name) - }else{ - repeated_colnames = c(repeated_colnames , col_name) + + for(i in 1:(nrow(data_colnames))){ + if(!check_valid_name(data_colnames[i,1])){ + str_res<- paste(str_res, data_colnames[i,1], sep=" ") } } str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ") - new_list <- list("not_colnames" = str_res , "colnames" = valid_colnames , "repeated_colnames" = repeated_colnames) - return (new_list) + return (str_res) } #Test if a single variable name is valid -check_valid_name <- function(col_name){ +check_valid_name <- function(x){ - valid <- 0 + valid <- FALSE - if(col_name %in% codebook_col_names$col_names){ - - valid <- length(grep(col_name, names(harmonized_data))) + aux <- as.data.frame(strsplit(x , split = "_")) - } + if(aux[1,1] %in% harmonised_data$Harmonised.variable.name) + valid <- TRUE return (valid) } +valid_data_colnames <- function(x){ + + valid_colnames = c() + + for(i in 1:(nrow(data_colnames))){ + if(check_valid_name(data_colnames[i,1])){ + valid_colnames = c(valid_colnames,data_colnames[i,1]) + } + } + + + return(valid_colnames) +} + remove_space <- function(x){ searchString <- ' ' replacementString <- '' @@ -99,7 +132,7 @@ is_number <- function(x){ x <- str_replace(x,",",".") aux <- as.numeric(x) - + if(!is.na(aux)) res <- TRUE @@ -130,7 +163,7 @@ check_values_not_categoric <- function(values, colname){ if(is.null(value)){ res <- TRUE } - + else if( value == "NA" | value == "nan" | value == ".") res <- TRUE else{ @@ -186,7 +219,7 @@ possible_values <- function(x){ else{ - possible_value <- subset(harmonized_data,harmonized_data$harmonized.variable.name==x)[1,5] + possible_value <- subset(harmonised_data,harmonised_data$Harmonised.variable.name==x)[1,5] res <- strsplit(x=possible_value,split="/") } @@ -195,7 +228,7 @@ possible_values <- function(x){ possible_values_categoric <- function(x){ - possible_value <- subset(harmonized_data,harmonized_data$harmonized.variable.name==x)[1,4] + possible_value <- subset(harmonised_data,harmonised_data$Harmonised.variable.name==x)[1,4] res <- strsplit(x=possible_value,split="/") return(as.data.frame(res)) @@ -221,7 +254,7 @@ check_values_categoric <- function(values, colname){ get_values_from_quantiles <- function(x){ - data_summary <- summary(x) + data_summary <- ds.summary(x) low_quantile <- data_summary[[1]][3][[1]][[1]] @@ -324,7 +357,7 @@ error_message <- function(colname, invalid_values){ else new_colname <- colname - range <- subset(harmonized_data, harmonized_data$harmonized.variable.name == new_colname) + range <- subset(harmonised_data, harmonised_data$Harmonised.variable.name == new_colname) range <- range[5] range <- as.data.frame(strsplit(range[1,1], '/')) @@ -377,7 +410,7 @@ error_message <- function(colname, invalid_values){ } -check_valid_values <- function(valid_colnames){ +check_valid_values <- function(){ invalid_name_list <- c() cannot_analyse_list <- c() @@ -389,59 +422,85 @@ check_valid_values <- function(valid_colnames){ k <- 1 for(i in 1:(nrow(valid_colnames))){ - name <- names(valid_colnames_with_data)[i] - if("DMRBORN" == name | grepl("DAT",colname, fixed=TRUE) | "ISO" == name | "BEF" == name){ - next - } - - column <- valid_colnames[,i] - - data_table <- as.data.frame(table(column)) - - values <- row.names(data_table) - numeric_col<- paste(valid_colnames[,i],"_numeric", sep="") + data_table ="empty" - if( name %in% categoric_vars ){ + if(!grepl("DMRBORN",valid_colnames[i,1], fixed=TRUE) & (!grepl("DAT",valid_colnames[i,1], fixed=TRUE)) & (!grepl("ISO",valid_colnames[i,1], fixed=TRUE))& (!grepl("BEF",valid_colnames[i,1], fixed=TRUE))){ - #is_numeric <- grepl("numeric",valid_colnames[i,1], fixed=TRUE) - has_numeric <- numeric_col %in% valid_colnames$`valid_data_colnames(data_colnames)` + column <- "data$" + column <- paste(column, valid_colnames[i,1], sep="") - if(!has_numeric) - missing_numeric <- c(missing_numeric, valid_colnames[i,1]) + tryCatch( + error = function(cnd) { + print("Unable to analyse data") + res <- FALSE + }, + data_table <- as.data.frame(ds.table(column)) + ) - - if(!check_values_categoric(values,valid_colnames[i,1])){ + if(data_table == "empty"){ - print("Wrong categoric value:") - print(valid_colnames[i,1]) + cannot_analyse_list <- c(cannot_analyse_list,valid_colnames[i,1]) - wrong_categoric <- c(wrong_categoric, valid_colnames[i,1]) - wrong_categoric_values[[k]] <- values - k <- k+1 - } - - }else{ - - if(grepl("numeric", valid_colnames[i,1],fixed=TRUE)) - new_colname <- strsplit(x=valid_colnames[i,1],split="_")[[1]][1] - else - new_colname <- valid_colnames[i,1] - - valid <- check_values_not_categoric(values, new_colname) - - if (FALSE %in% valid){ - invalid_name_list <- c(invalid_name_list,valid_colnames[i,1]) - invalid_values_list[[j]] <- values - j <- j+1 + }else{ + + + if (data_table[[1]] == "All studies failed for reasons identified below") + values <- get_values_from_quantiles(column) + else + values <- row.names(data_table) + + numeric_col<- paste(valid_colnames[i,1],"_numeric", sep="") + + if( valid_colnames[i,1] %in% categoric_vars ){ + + #is_numeric <- grepl("numeric",valid_colnames[i,1], fixed=TRUE) + has_numeric <- numeric_col %in% valid_colnames$`valid_data_colnames(data_colnames)` + + if(!has_numeric) + missing_numeric <- c(missing_numeric, valid_colnames[i,1]) + + + if (data_table[[1]] == "All studies failed for reasons identified below"){ + + cannot_analyse_list <- c(cannot_analyse_list,valid_colnames[i,1]) + + }else if(!check_values_categoric(values,valid_colnames[i,1])){ + + print("Wrong categoric value:") + print(valid_colnames[i,1]) + + wrong_categoric <- c(wrong_categoric, valid_colnames[i,1]) + wrong_categoric_values[[k]] <- values + k <- k+1 + } + + # if((!is_numeric & !has_numeric) | is_numeric) + }else{ + + if(grepl("numeric", valid_colnames[i,1],fixed=TRUE)) + new_colname <- strsplit(x=valid_colnames[i,1],split="_")[[1]][1] + else + new_colname <- valid_colnames[i,1] + + valid <- check_values_not_categoric(values, new_colname) + + if (FALSE %in% valid){ + invalid_name_list <- c(invalid_name_list,valid_colnames[i,1]) + invalid_values_list[[j]] <- values + j <- j+1 + } + + + #print(valid_colnames[i,1]) + #print(values) + + }#else + # print("This variable has a numeric version") } } - - - - } missing_numeric @@ -456,7 +515,7 @@ check_valid_values <- function(valid_colnames){ res <- paste(res, notify_unable_analyse(cannot_analyse_list), sep="\n" ) } - + @@ -477,33 +536,57 @@ notify_unable_analyse <- function(x){ } -data_colnames <- as.data.frame(colnames(harmonized_data)) +auxConnections <- connect() +connections <- auxConnections[[1]] +inp <- auxConnections[[2]] + +#Conexión a la base de datos + +ds.dim("data", datasources = connections) +ds.colnames("data") + +#---------------------------------------------------------------------------- + +#Check valid column names +datastructure_name <- "data" +data_colnames <- ds.colnames(x=datastructure_name, datasources= connections) + +data_colnames <- as.data.frame(data_colnames) check_valid_columns <- check_column_names(data_colnames) -columns_not_valid <- check_valid_columns$not_colnames +valid_colnames <- as.data.frame(valid_data_colnames(data_colnames)) -valid_colnames <- as.data.frame(check_valid_columns$colnames) -names(valid_colnames) = c("valid_colnames") -valid_colnames_with_data <- subset(harmonized_data , select = valid_colnames$valid_colnames) +#possible_values("CSXCTR") result <- "" -result<-check_valid_values(valid_colnames_with_data) +result<-check_valid_values() print(check_valid_columns) -#datashield.logout(connections) +datashield.logout(connections) cat(result) +# ds.dataFrameSubset(df.name = "data", V1.name = "data$DMXWT", "400" , Boolean.operator = '>', newobj = "columna") +# # +# ds.summary("columna$DMXWT") +# ds.dim("columna$DMXWT") +# ds.table("columna$DMXWT") + file_name<- paste(hospital_name,"_invalid_values.txt", sep="") +#ds.heatmapPlot("data$LBDSALSIA", "data$RFXHC_numeric",type="combine", datasources = connections) + +#setwd("C:/Users/victor/Desktop/TFG/r-analytics-master/invalid_values") dir.create("../invalid_values", showWarnings = FALSE) setwd("../invalid_values") + cat(check_valid_columns,file=file_name,sep="\n") cat(result,file=file_name,append=TRUE) -#datashield.logout(connections) +datashield.logout(connections) + diff --git a/valid_variables_script_local.R b/valid_variables_script_local.R new file mode 100755 index 0000000000000000000000000000000000000000..a37aa4955791cb63a54f8f7420ecda091bb64d9a --- /dev/null +++ b/valid_variables_script_local.R @@ -0,0 +1,218 @@ +rm(list=ls()) + +dir_name <- readline("Introduce the name of the directory please: ") + +setwd(dir_name) + +source("dependency_installer.R") + + +dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient", "openxlsx") +install_dependencies(dep_list) + +#source("connection_parameters.R") +#source("necessary_functions_connection.R") + +codebook <- read.csv("new_harmon.csv" , sep = ",") + +codebook_col_names <- as.data.frame(codebook$Harmonised.variable.name) + +names(codebook_col_names) <- c("col_names") + +setwd(paste(dir_name ,"/harmonized_data", sep="")) + +file_name <- readline("Introduce the name of the file to check the values: ") +harmonized_data <- "" + +if (grepl(".csv" , file_name , fixed = TRUE)){ + harmonized_data <- read.csv(file_name) +}else if (grepl(".xlsx" , file_name , fixed = TRUE)){ + harmonized_data <- read.xlsx(file_name) +} + +#---------------------------------------------------------------------------- + +#Test if column names are valid +check_column_names <- function(col_names){ + + str_res <- "The column names:" + valid_colnames <- c() + repeated_colnames <- c() + for(i in 1:(nrow(col_names))){ + col_name <- col_names[i,1] + number_of_column <- check_valid_name(col_name) + if( number_of_column == 0){ + str_res<- paste(str_res, col_name, sep=" ") + }else if (number_of_column == 1){ + valid_colnames = c(valid_colnames, col_name) + }else{ + repeated_colnames = c(repeated_colnames , col_name) + } + } + + str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ") + + new_list <- list("not_colnames" = str_res , "colnames" = valid_colnames , "repeated_colnames" = repeated_colnames) + return (new_list) + +} + + +#Test if a single variable name is valid +check_valid_name <- function(col_name){ + + valid <- 0 + + if(col_name %in% codebook_col_names$col_names){ + + valid <- length(grep(col_name, names(harmonized_data))) + + } + + return (valid) + +} + +check_valid_values_continuous <- function(colname , codebook_param , column){ + column <- column[column != "."] + possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] + possible_values_list = str_split(possible_values_format , "/")[[1]] + + # Fallará cuando el codebook no tenga min-max / . + range_as_str <- str_trim(possible_values_list[1]) + missing_value_format <- str_trim(str_trim(possible_values_list[2])) + + separate_range <- str_split(range_as_str , "-")[[1]] + min_value <- strtoi(separate_range[1]) + max_value <- strtoi(separate_range[2]) + + failing_values <- column[column < min_value | column > max_value] + number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + + str_res <- "" + if (number_of_failing_values == 0) + str_res <- "No failing values" + else{ + failing_values <- failing_values[!is.na(failing_values)] + str_res <- paste("The failing values of column ", colname , paste(unlist(failing_values) , collapse =" ")) + } + + + return(str_res) +} + +check_valid_values_binary <- function(colname , column){ + column <- column[column != "."] + failing_values <- column[column < 0 | column > 1] + number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + + str_res <- "" + if (number_of_failing_values == 0) + str_res <- "No failing values" + else{ + failing_values <- failing_values[!is.na(failing_values)] + str_res <- paste("The failing values of column ", colname , paste(unlist(failing_values) , collapse =" ")) + } + + + return(str_res) +} + +check_valid_values_categorical <- function(colname , codebook_param , column){ + column <- column[column != "."] + possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] + possible_values_list <- str_split(possible_values_format , "/")[[1]] + + possible_values_list <- lapply(possible_values_list , str_trim) + + str_res <- "" + min_value <- 0 + max_value <- 0 + if (length(possible_values_list[[1]]) == 2){ + separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]] + min_value <- strtoi(separate_range[1]) + max_value <- strtoi(separate_range[2]) + + }else{ + + possible_values_list <- lapply(possible_values_list , strtoi)[[1]] + min_value <- possible_values_list[1] + max_value <- possible_values_list[length(possible_values_list) - 1] + } + + failing_values <- column[column < min_value | column > max_value ] + number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + + if(number_of_failing_values == 0){ + str_res <- "No failing values" + }else{ + failing_values <- failing_values[!is.na(failing_values)] + str_res <- paste("The failing values of column ", colname , paste(unlist(failing_values) , collapse =" ")) + } + +} + +check_valid_values <- function(valid_colnames, codebook_param){ + + res <- "" + + for(i in 1:(ncol(valid_colnames))){ + name <- names(valid_colnames)[i] + #if("DMRBORN" == name | grepl("DAT", name, fixed=TRUE) | grepl("ISO", name , fixed=TRUE) | grepl("BEF", name, fixed=TRUE)){ + # next + #} + + column <- valid_colnames[,i] + + # Esto falla si tu codebook no es mismo que new_harmon.csv + column_type <- codebook_param$Variable.type[codebook_param$Harmonised.variable.name == name] + + result = switch( + column_type, + "Continuous"= check_valid_values_continuous(name , codebook_param , column), + "Binary"= check_valid_values_binary(name , column), + "Categorical"= check_valid_values_categorical(name, codebook_param , column), + "Calendar date" = paste("No failing values"), + "ISO country code"= paste("No failing values"), + ) + + if (result != "No failing values"){ + res <- paste(res , result, sep="\n") + } + } + + return(res) + +} + + +data_colnames <- as.data.frame(colnames(harmonized_data)) + +check_valid_columns <- check_column_names(data_colnames) + +columns_not_valid <- check_valid_columns$not_colnames + +valid_colnames_column <- as.data.frame(check_valid_columns$colnames) +names(valid_colnames_column) = c("valid_colnames") +valid_colnames_with_data <- subset(harmonized_data , select = valid_colnames_column$valid_colnames) + + +result <- "" +result<-check_valid_values(valid_colnames_with_data, codebook) +print(check_valid_columns) +#datashield.logout(connections) +cat(result) + + + +file_name<- paste(hospital_name,"_invalid_values.txt", sep="") + + +dir.create("../invalid_values", showWarnings = FALSE) +setwd("../invalid_values") + +cat(check_valid_columns,file=file_name,sep="\n") +cat(result,file=file_name,append=TRUE) + +#datashield.logout(connections) +