Commit 3d03e9b5 authored by Pepe Márquez Romero

cambiando el codebook y empezando a cambiar el análisis de los valores

parent 3a586839
...@@ -4,22 +4,16 @@ dir_name <- readline("Introduce the name of the directory please: ") ...@@ -4,22 +4,16 @@ dir_name <- readline("Introduce the name of the directory please: ")
setwd(dir_name) setwd(dir_name)
source("required_folder_checker.R")
source("argument_hasher.R")
source("dependency_installer.R") source("dependency_installer.R")
dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient", "openxlsx") dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient", "openxlsx")
install_dependencies(dep_list) install_dependencies(dep_list)
setwd(dir_name)
#source("connection_parameters.R") #source("connection_parameters.R")
#source("necessary_functions_connection.R") #source("necessary_functions_connection.R")
codebook <- read.csv("harmon.csv" , sep = ";") codebook <- read.csv("new_harmon.csv" , sep = ",")
codebook_col_names <- as.data.frame(codebook$Harmonised.variable.name) codebook_col_names <- as.data.frame(codebook$Harmonised.variable.name)
...@@ -47,18 +41,22 @@ check_column_names <- function(col_names){ ...@@ -47,18 +41,22 @@ check_column_names <- function(col_names){
str_res <- "The column names:" str_res <- "The column names:"
valid_colnames <- c() valid_colnames <- c()
repeated_colnames <- c()
for(i in 1:(nrow(col_names))){ for(i in 1:(nrow(col_names))){
col_name <- col_names[i,1] col_name <- col_names[i,1]
if(!check_valid_name(col_name)){ number_of_column <- check_valid_name(col_name)
if( number_of_column == 0){
str_res<- paste(str_res, col_name, sep=" ") str_res<- paste(str_res, col_name, sep=" ")
}else{ }else if (number_of_column == 1){
valid_colnames = c(valid_colnames, col_name) valid_colnames = c(valid_colnames, col_name)
}else{
repeated_colnames = c(repeated_colnames , col_name)
} }
} }
str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ") str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ")
new_list <- list("not_colnames" = str_res , "colnames" = valid_colnames) new_list <- list("not_colnames" = str_res , "colnames" = valid_colnames , "repeated_colnames" = repeated_colnames)
return (new_list) return (new_list)
} }
...@@ -66,10 +64,13 @@ check_column_names <- function(col_names){ ...@@ -66,10 +64,13 @@ check_column_names <- function(col_names){
#Test if a single variable name is valid #Test if a single variable name is valid
check_valid_name <- function(col_name){ check_valid_name <- function(col_name){
valid <- FALSE valid <- 0
if(col_name %in% codebook_col_names$col_names) if(col_name %in% codebook_col_names$col_names){
valid <- TRUE
valid <- length(grep(col_name, names(harmonized_data)))
}
return (valid) return (valid)
...@@ -98,7 +99,7 @@ is_number <- function(x){ ...@@ -98,7 +99,7 @@ is_number <- function(x){
x <- str_replace(x,",",".") x <- str_replace(x,",",".")
aux <- as.numeric(x) aux <- as.numeric(x)
if(!is.na(aux)) if(!is.na(aux))
res <- TRUE res <- TRUE
...@@ -129,7 +130,7 @@ check_values_not_categoric <- function(values, colname){ ...@@ -129,7 +130,7 @@ check_values_not_categoric <- function(values, colname){
if(is.null(value)){ if(is.null(value)){
res <- TRUE res <- TRUE
} }
else if( value == "NA" | value == "nan" | value == ".") else if( value == "NA" | value == "nan" | value == ".")
res <- TRUE res <- TRUE
else{ else{
...@@ -376,7 +377,7 @@ error_message <- function(colname, invalid_values){ ...@@ -376,7 +377,7 @@ error_message <- function(colname, invalid_values){
} }
check_valid_values <- function(){ check_valid_values <- function(valid_colnames){
invalid_name_list <- c() invalid_name_list <- c()
cannot_analyse_list <- c() cannot_analyse_list <- c()
...@@ -388,80 +389,59 @@ check_valid_values <- function(){ ...@@ -388,80 +389,59 @@ check_valid_values <- function(){
k <- 1 k <- 1
for(i in 1:(nrow(valid_colnames))){ for(i in 1:(nrow(valid_colnames))){
name <- names(valid_colnames_with_data)[i]
if("DMRBORN" == name | grepl("DAT",colname, fixed=TRUE) | "ISO" == name | "BEF" == name){
next
}
column <- valid_colnames[,i]
data_table ="empty" data_table <- as.data.frame(table(column))
if(!grepl("DMRBORN",valid_colnames[i,1], fixed=TRUE) & (!grepl("DAT",valid_colnames[i,1], fixed=TRUE)) & (!grepl("ISO",valid_colnames[i,1], fixed=TRUE))& (!grepl("BEF",valid_colnames[i,1], fixed=TRUE))){ values <- row.names(data_table)
numeric_col<- paste(valid_colnames[,i],"_numeric", sep="")
if( name %in% categoric_vars ){
column <- "data$" #is_numeric <- grepl("numeric",valid_colnames[i,1], fixed=TRUE)
column <- paste(column, valid_colnames[i,1], sep="") has_numeric <- numeric_col %in% valid_colnames$`valid_data_colnames(data_colnames)`
tryCatch( if(!has_numeric)
error = function(cnd) { missing_numeric <- c(missing_numeric, valid_colnames[i,1])
print("Unable to analyse data")
res <- FALSE
},
data_table <- as.data.frame(table(column))
)
if(data_table == "empty"){
if(!check_values_categoric(values,valid_colnames[i,1])){
cannot_analyse_list <- c(cannot_analyse_list,valid_colnames[i,1])
}else{
if (data_table[[1]] == "All studies failed for reasons identified below")
values <- get_values_from_quantiles(column)
else
values <- row.names(data_table)
numeric_col<- paste(valid_colnames[i,1],"_numeric", sep="") print("Wrong categoric value:")
print(valid_colnames[i,1])
if( valid_colnames[i,1] %in% categoric_vars ){ wrong_categoric <- c(wrong_categoric, valid_colnames[i,1])
wrong_categoric_values[[k]] <- values
#is_numeric <- grepl("numeric",valid_colnames[i,1], fixed=TRUE) k <- k+1
has_numeric <- numeric_col %in% valid_colnames$`valid_data_colnames(data_colnames)` }
if(!has_numeric) }else{
missing_numeric <- c(missing_numeric, valid_colnames[i,1])
if(grepl("numeric", valid_colnames[i,1],fixed=TRUE))
new_colname <- strsplit(x=valid_colnames[i,1],split="_")[[1]][1]
if (data_table[[1]] == "All studies failed for reasons identified below"){ else
new_colname <- valid_colnames[i,1]
cannot_analyse_list <- c(cannot_analyse_list,valid_colnames[i,1])
valid <- check_values_not_categoric(values, new_colname)
}else if(!check_values_categoric(values,valid_colnames[i,1])){
if (FALSE %in% valid){
print("Wrong categoric value:") invalid_name_list <- c(invalid_name_list,valid_colnames[i,1])
print(valid_colnames[i,1]) invalid_values_list[[j]] <- values
j <- j+1
wrong_categoric <- c(wrong_categoric, valid_colnames[i,1])
wrong_categoric_values[[k]] <- values
k <- k+1
}
}else{
if(grepl("numeric", valid_colnames[i,1],fixed=TRUE))
new_colname <- strsplit(x=valid_colnames[i,1],split="_")[[1]][1]
else
new_colname <- valid_colnames[i,1]
valid <- check_values_not_categoric(values, new_colname)
if (FALSE %in% valid){
invalid_name_list <- c(invalid_name_list,valid_colnames[i,1])
invalid_values_list[[j]] <- values
j <- j+1
}
}
} }
} }
} }
missing_numeric missing_numeric
...@@ -476,7 +456,7 @@ check_valid_values <- function(){ ...@@ -476,7 +456,7 @@ check_valid_values <- function(){
res <- paste(res, notify_unable_analyse(cannot_analyse_list), sep="\n" ) res <- paste(res, notify_unable_analyse(cannot_analyse_list), sep="\n" )
} }
...@@ -505,10 +485,11 @@ columns_not_valid <- check_valid_columns$not_colnames ...@@ -505,10 +485,11 @@ columns_not_valid <- check_valid_columns$not_colnames
valid_colnames <- as.data.frame(check_valid_columns$colnames) valid_colnames <- as.data.frame(check_valid_columns$colnames)
names(valid_colnames) = c("valid_colnames") names(valid_colnames) = c("valid_colnames")
valid_colnames_with_data <- subset(harmonized_data , select = valid_colnames$valid_colnames)
result <- "" result <- ""
result<-check_valid_values() result<-check_valid_values(valid_colnames_with_data)
print(check_valid_columns) print(check_valid_columns)
#datashield.logout(connections) #datashield.logout(connections)
cat(result) cat(result)
...@@ -525,4 +506,4 @@ cat(check_valid_columns,file=file_name,sep="\n") ...@@ -525,4 +506,4 @@ cat(check_valid_columns,file=file_name,sep="\n")
cat(result,file=file_name,append=TRUE) cat(result,file=file_name,append=TRUE)
#datashield.logout(connections) #datashield.logout(connections)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment