Commit 03c015c4 authored by Pepe Márquez Romero's avatar Pepe Márquez Romero

recuperando script de victor, renombrando el script de analisis de harmonizacion en local

parent 3d03e9b5
This diff is collapsed.
rm(list=ls())
dir_name <- readline("Introduce the name of the directory please: ")
setwd(dir_name)
source("dependency_installer.R")
dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient", "openxlsx")
install_dependencies(dep_list)
#source("connection_parameters.R")
#source("necessary_functions_connection.R")
codebook <- read.csv("new_harmon.csv" , sep = ",")
codebook_col_names <- as.data.frame(codebook$Harmonised.variable.name)
names(codebook_col_names) <- c("col_names")
setwd(paste(dir_name ,"/harmonized_data", sep=""))
file_name <- readline("Introduce the name of the file to check the values: ")
harmonized_data <- ""
if (grepl(".csv" , file_name , fixed = TRUE)){
harmonized_data <- read.csv(file_name)
}else if (grepl(".xlsx" , file_name , fixed = TRUE)){
harmonized_data <- read.xlsx(file_name)
}
#----------------------------------------------------------------------------
#Test if column names are valid
check_column_names <- function(col_names){
str_res <- "The column names:"
valid_colnames <- c()
repeated_colnames <- c()
for(i in 1:(nrow(col_names))){
col_name <- col_names[i,1]
number_of_column <- check_valid_name(col_name)
if( number_of_column == 0){
str_res<- paste(str_res, col_name, sep=" ")
}else if (number_of_column == 1){
valid_colnames = c(valid_colnames, col_name)
}else{
repeated_colnames = c(repeated_colnames , col_name)
}
}
str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ")
new_list <- list("not_colnames" = str_res , "colnames" = valid_colnames , "repeated_colnames" = repeated_colnames)
return (new_list)
}
#Test if a single variable name is valid
check_valid_name <- function(col_name){
valid <- 0
if(col_name %in% codebook_col_names$col_names){
valid <- length(grep(col_name, names(harmonized_data)))
}
return (valid)
}
check_valid_values_continuous <- function(colname , codebook_param , column){
column <- column[column != "."]
possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
possible_values_list = str_split(possible_values_format , "/")[[1]]
# Fallará cuando el codebook no tenga min-max / .
range_as_str <- str_trim(possible_values_list[1])
missing_value_format <- str_trim(str_trim(possible_values_list[2]))
separate_range <- str_split(range_as_str , "-")[[1]]
min_value <- strtoi(separate_range[1])
max_value <- strtoi(separate_range[2])
failing_values <- column[column < min_value | column > max_value]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
str_res <- ""
if (number_of_failing_values == 0)
str_res <- "No failing values"
else{
failing_values <- failing_values[!is.na(failing_values)]
str_res <- paste("The failing values of column ", colname , paste(unlist(failing_values) , collapse =" "))
}
return(str_res)
}
check_valid_values_binary <- function(colname , column){
column <- column[column != "."]
failing_values <- column[column < 0 | column > 1]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
str_res <- ""
if (number_of_failing_values == 0)
str_res <- "No failing values"
else{
failing_values <- failing_values[!is.na(failing_values)]
str_res <- paste("The failing values of column ", colname , paste(unlist(failing_values) , collapse =" "))
}
return(str_res)
}
check_valid_values_categorical <- function(colname , codebook_param , column){
column <- column[column != "."]
possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
possible_values_list <- str_split(possible_values_format , "/")[[1]]
possible_values_list <- lapply(possible_values_list , str_trim)
str_res <- ""
min_value <- 0
max_value <- 0
if (length(possible_values_list[[1]]) == 2){
separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]]
min_value <- strtoi(separate_range[1])
max_value <- strtoi(separate_range[2])
}else{
possible_values_list <- lapply(possible_values_list , strtoi)[[1]]
min_value <- possible_values_list[1]
max_value <- possible_values_list[length(possible_values_list) - 1]
}
failing_values <- column[column < min_value | column > max_value ]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
if(number_of_failing_values == 0){
str_res <- "No failing values"
}else{
failing_values <- failing_values[!is.na(failing_values)]
str_res <- paste("The failing values of column ", colname , paste(unlist(failing_values) , collapse =" "))
}
}
check_valid_values <- function(valid_colnames, codebook_param){
res <- ""
for(i in 1:(ncol(valid_colnames))){
name <- names(valid_colnames)[i]
#if("DMRBORN" == name | grepl("DAT", name, fixed=TRUE) | grepl("ISO", name , fixed=TRUE) | grepl("BEF", name, fixed=TRUE)){
# next
#}
column <- valid_colnames[,i]
# Esto falla si tu codebook no es mismo que new_harmon.csv
column_type <- codebook_param$Variable.type[codebook_param$Harmonised.variable.name == name]
result = switch(
column_type,
"Continuous"= check_valid_values_continuous(name , codebook_param , column),
"Binary"= check_valid_values_binary(name , column),
"Categorical"= check_valid_values_categorical(name, codebook_param , column),
"Calendar date" = paste("No failing values"),
"ISO country code"= paste("No failing values"),
)
if (result != "No failing values"){
res <- paste(res , result, sep="\n")
}
}
return(res)
}
data_colnames <- as.data.frame(colnames(harmonized_data))
check_valid_columns <- check_column_names(data_colnames)
columns_not_valid <- check_valid_columns$not_colnames
valid_colnames_column <- as.data.frame(check_valid_columns$colnames)
names(valid_colnames_column) = c("valid_colnames")
valid_colnames_with_data <- subset(harmonized_data , select = valid_colnames_column$valid_colnames)
result <- ""
result<-check_valid_values(valid_colnames_with_data, codebook)
print(check_valid_columns)
#datashield.logout(connections)
cat(result)
file_name<- paste(hospital_name,"_invalid_values.txt", sep="")
dir.create("../invalid_values", showWarnings = FALSE)
setwd("../invalid_values")
cat(check_valid_columns,file=file_name,sep="\n")
cat(result,file=file_name,append=TRUE)
#datashield.logout(connections)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment