Commit 47ee4e03 authored by GNajeral's avatar GNajeral

Some very minor changes for 'Guille'

Added the ";" separator
Added local line for the wd path
parent 2ebe9c3e
File added
install_dependencies(dep_list)
codebook_file <- "20220315_Data Harmonisation.xlsb.xlsx"
codebook_demo <- read.xlsx(codebook_file , sheet = 2 )
codebook_com_and_rf <- read.xlsx(codebook_file , sheet = 3 )
codebook_home_med <- read.xlsx(codebook_file , sheet = 4 )
codebook_si_sympt <- read.xlsx(codebook_file , sheet = 5 )
codebook_treatments <- read.xlsx(codebook_file , sheet = 6 )
codebook_labo <- read.xlsx(codebook_file , sheet = 7 )
codebook_complications <- read.xlsx(codebook_file , sheet = 8 )
codebook_imaging_data <- read.xlsx(codebook_file , sheet = 9 )
codebook_lifestyle_diet <- read.xlsx(codebook_file , sheet = 10 )
codebook_dates <- read.xlsx(codebook_file , sheet = 11 )
codebook <- rbind(codebook_demo , codebook_com_and_rf)
codebook <- rbind(codebook , codebook_home_med)
codebook <- rbind(codebook , codebook_si_sympt)
codebook <- rbind(codebook , codebook_treatments)
codebook <- rbind(codebook , codebook_labo)
codebook <- rbind(codebook , codebook_complications)
codebook <- rbind(codebook , codebook_imaging_data)
codebook_lifestyle_diet <- codebook_lifestyle_diet[, !names(codebook_lifestyle_diet) %in% c("X2", "X4" , "X10")]
codebook <- rbind(codebook , codebook_lifestyle_diet)
codebook <- rbind(codebook , codebook_dates)
codebook_col_names <- as.data.frame(codebook$Harmonised.variable.name)
names(codebook_col_names) <- c("col_names")
setwd(paste(dir_name ,"/harmonized_data", sep=""))
file_name <- readline("Introduce the name of the file to check the values: ")
harmonized_data <- ""
if (grepl(".csv" , file_name , fixed = TRUE)){
harmonized_data <- read.csv(file_name)
}else if (grepl(".xlsx" , file_name , fixed = TRUE)){
harmonized_data <- read.xlsx(file_name)
}
#Test if column names are valid
check_column_names <- function(col_names){
str_res <- "The column names:"
valid_colnames <- c()
repeated_colnames <- c()
for(i in 1:(nrow(col_names))){
col_name <- col_names[i,1]
number_of_column <- check_valid_name(col_name)
if( number_of_column == 0){
str_res<- paste(str_res, col_name, sep=" ")
}else if (number_of_column == 1){
valid_colnames = c(valid_colnames, col_name)
}else{
repeated_colnames = c(repeated_colnames , col_name)
}
}
str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ")
new_list <- list("not_colnames" = str_res , "colnames" = valid_colnames , "repeated_colnames" = repeated_colnames)
return (new_list)
}
#Test if a single variable name is valid
check_valid_name <- function(col_name){
valid <- 0
if(col_name %in% codebook_col_names$col_names){
valid <- length(grep(col_name, names(harmonized_data)))
}
return (valid)
}
check_valid_values_continuous <- function(colname, codebook_param, column) {
column <- column[column != "."]
possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
possible_values_list = str_split(possible_values_format, "/")[[1]]
range_as_str <- str_trim(possible_values_list[1])
missing_value_format <- str_trim(str_trim(possible_values_list[2]))
separate_range <- str_split(range_as_str, "-")[[1]]
min_value <- strtoi(separate_range[1])
max_value <- strtoi(separate_range[2])
failing_values <- column[column < min_value | column > max_value]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
str_res <- ""
if (number_of_failing_values == 0) {
str_res <- "No failing values"
} else {
range_as_str <- paste(min_value, "-", max_value, "(continuous)")
str_res <- paste(colname, "has", number_of_failing_values, "failing values")
str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
}
return(str_res)
}
check_valid_values_categorical <- function(colname, codebook_param, column) {
column <- column[column != "."]
possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
possible_values_list <- str_split(possible_values_format, "/")[[1]]
possible_values_list <- lapply(possible_values_list, str_trim)
str_res <- ""
min_value <- 0
max_value <- 0
if (length(possible_values_list[[1]]) == 2) {
separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]]
min_value <- strtoi(separate_range[1])
max_value <- strtoi(separate_range[2])
} else {
possible_values_list <- lapply(possible_values_list, strtoi)[[1]]
min_value <- possible_values_list[1]
max_value <- possible_values_list[length(possible_values_list) - 1]
}
failing_values <- column[column < min_value | column > max_value]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
if (number_of_failing_values == 0) {
str_res <- "No failing values"
} else {
range_as_str <- paste(min_value, "-", max_value, " (categorical)")
str_res <- paste(colname, "has", number_of_failing_values, "failing values")
str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
}
return(str_res)
}
check_valid_values_binary <- function(colname, column) {
column <- column[column != "."]
failing_values <- column[column < 0 | column > 1]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
str_res <- ""
if (number_of_failing_values == 0)
str_res <- "No failing values"
else {
range_as_str <- "0-1 (binary)"
str_res <- paste(colname, "has", number_of_failing_values, "failing values")
str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
}
return(str_res)
}
check_valid_values <- function(valid_colnames, codebook_param){
res <- ""
for(i in 1:(ncol(valid_colnames))){
name <- names(valid_colnames)[i]
if (grepl("DAT", name, fixed=TRUE)){
next
}
#if("DMRBORN" == name | grepl("DAT", name, fixed=TRUE) | grepl("ISO", name , fixed=TRUE) | grepl("BEF", name, fixed=TRUE)){
# next
#}
column <- valid_colnames[,i]
# Esto falla si tu codebook no es mismo que new_harmon.csv
column_type <- codebook_param$Variable.type[codebook_param$Harmonised.variable.name == name]
if (is.na(column_type) ) {
variable <- paste("Variable ", name, " wrong", sep = " ")
res <- paste(res, variable , sep="\n")
next
}
result = switch(
column_type,
"Continuous"= check_valid_values_continuous(name , codebook_param , column),
"Binary"= check_valid_values_binary(name , column),
"Categorical"= check_valid_values_categorical(name, codebook_param , column),
"Calendar date" = paste("No failing values"),
"ISO country code"= paste("No failing values"),
{
paste("some column " , column_type , sep = " ")
}
)
if (result != "No failing values"){
res <- paste(res , result, sep="\n")
}
}
return(res)
}
data_colnames <- as.data.frame(colnames(harmonized_data))
check_valid_columns <- check_column_names(data_colnames)
columns_not_valid <- check_valid_columns$not_colnames
valid_colnames_column <- as.data.frame(check_valid_columns$colnames)
names(valid_colnames_column) = c("valid_colnames")
valid_colnames_with_data <- subset(harmonized_data , select = valid_colnames_column$valid_colnames)
result <- ""
result<-check_valid_values(valid_colnames_with_data, codebook)
print(columns_not_valid)
cat(result)
check_valid_values_continuous <- function(colname , codebook_param , column){
column <- column[column != "."]
possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
possible_values_list = str_split(possible_values_format , "/")[[1]]
range_as_str <- str_trim(possible_values_list[1])
missing_value_format <- str_trim(str_trim(possible_values_list[2]))
separate_range <- str_split(range_as_str , "-")[[1]]
min_value <- strtoi(separate_range[1])
max_value <- strtoi(separate_range[2])
failing_values <- column[column < min_value | column > max_value]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
str_res <- ""
if (number_of_failing_values == 0)
str_res <- "No failing values"
else{
failing_values <- failing_values[!is.na(failing_values)]
failing_value_counts <- table(failing_values)
str_res <- paste(colname, "has failing values:")
for (i in seq_along(failing_value_counts)) {
value <- names(failing_value_counts)[i]
count <- failing_value_counts[i]
str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ")
}
str_res <- paste(str_res, "should be in range", range_as_str, "(continuous)", sep = " ")
}
return(str_res)
}
check_valid_values_categorical <- function(colname, codebook_param, column) {
column <- column[column != "."]
possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
possible_values_list <- str_split(possible_values_format, "/")[[1]]
possible_values_list <- lapply(possible_values_list, str_trim)
str_res <- ""
min_value <- 0
max_value <- 0
if (length(possible_values_list[[1]]) == 2) {
separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]]
min_value <- strtoi(separate_range[1])
max_value <- strtoi(separate_range[2])
} else {
possible_values_list <- lapply(possible_values_list, strtoi)[[1]]
min_value <- possible_values_list[1]
max_value <- possible_values_list[length(possible_values_list) - 1]
}
failing_values <- column[column < min_value | column > max_value]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
if (number_of_failing_values == 0) {
str_res <- "No failing values"
} else {
range_as_str <- paste(min_value, "-", max_value, " (categorical)")
failing_values <- failing_values[!is.na(failing_values)]
failing_value_counts <- table(failing_values)
str_res <- paste(colname, "has failing values:")
for (i in seq_along(failing_value_counts)) {
value <- names(failing_value_counts)[i]
count <- failing_value_counts[i]
str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ")
}
str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
}
return(str_res)
}
check_valid_values_binary <- function(colname, column) {
column <- column[column != "."]
failing_values <- column[column < 0 | column > 1]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
str_res <- ""
if (number_of_failing_values == 0)
str_res <- "No failing values"
else {
range_as_str <- "0-1 (binary)"
failing_values <- failing_values[!is.na(failing_values)]
failing_value_counts <- table(failing_values)
str_res <- paste(colname, "has failing values:")
for (i in seq_along(failing_value_counts)) {
value <- names(failing_value_counts)[i]
count <- failing_value_counts[i]
str_res <- paste(str_res, paste(value, "(", count, "times)", collapse = " "), sep = " ")
}
str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
}
return(str_res)
}
check_valid_values <- function(valid_colnames, codebook_param){
res <- ""
for(i in 1:(ncol(valid_colnames))){
name <- names(valid_colnames)[i]
if (grepl("DAT", name, fixed=TRUE)){
next
}
#if("DMRBORN" == name | grepl("DAT", name, fixed=TRUE) | grepl("ISO", name , fixed=TRUE) | grepl("BEF", name, fixed=TRUE)){
# next
#}
column <- valid_colnames[,i]
# Esto falla si tu codebook no es mismo que new_harmon.csv
column_type <- codebook_param$Variable.type[codebook_param$Harmonised.variable.name == name]
if (is.na(column_type) ) {
variable <- paste("Variable ", name, " wrong", sep = " ")
res <- paste(res, variable , sep="\n")
next
}
result = switch(
column_type,
"Continuous"= check_valid_values_continuous(name , codebook_param , column),
"Binary"= check_valid_values_binary(name , column),
"Categorical"= check_valid_values_categorical(name, codebook_param , column),
"Calendar date" = paste("No failing values"),
"ISO country code"= paste("No failing values"),
{
paste("some column " , column_type , sep = " ")
}
)
if (result != "No failing values"){
res <- paste(res , result, sep="\n")
}
}
return(res)
}
data_colnames <- as.data.frame(colnames(harmonized_data))
check_valid_columns <- check_column_names(data_colnames)
columns_not_valid <- check_valid_columns$not_colnames
valid_colnames_column <- as.data.frame(check_valid_columns$colnames)
names(valid_colnames_column) = c("valid_colnames")
valid_colnames_with_data <- subset(harmonized_data , select = valid_colnames_column$valid_colnames)
result <- ""
result<-check_valid_values(valid_colnames_with_data, codebook)
print(columns_not_valid)
cat(result)
check_valid_values_continuous <- function(colname, codebook_param, column) {
column <- column[column != "."]
possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
possible_values_list = str_split(possible_values_format, "/")[[1]]
range_as_str <- str_trim(possible_values_list[1])
missing_value_format <- str_trim(str_trim(possible_values_list[2]))
separate_range <- str_split(range_as_str, "-")[[1]]
min_value <- strtoi(separate_range[1])
max_value <- strtoi(separate_range[2])
failing_values <- column[column < min_value | column > max_value]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
str_res <- ""
if (number_of_failing_values == 0) {
str_res <- "No failing values"
} else {
range_as_str <- paste(min_value, "-", max_value, "(continuous)")
str_res <- paste(colname, "has", number_of_failing_values, "failing values")
str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
}
return(str_res)
}
check_valid_values_categorical <- function(colname, codebook_param, column) {
column <- column[column != "."]
possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
possible_values_list <- str_split(possible_values_format, "/")[[1]]
possible_values_list <- lapply(possible_values_list, str_trim)
str_res <- ""
min_value <- 0
max_value <- 0
if (length(possible_values_list[[1]]) == 2) {
separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]]
min_value <- strtoi(separate_range[1])
max_value <- strtoi(separate_range[2])
} else {
possible_values_list <- lapply(possible_values_list, strtoi)[[1]]
min_value <- possible_values_list[1]
max_value <- possible_values_list[length(possible_values_list) - 1]
}
failing_values <- column[column < min_value | column > max_value]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
if (number_of_failing_values == 0) {
str_res <- "No failing values"
} else {
range_as_str <- paste(min_value, "-", max_value, " (categorical)")
str_res <- paste(colname, "has", number_of_failing_values, "failing values")
str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
}
return(str_res)
}
check_valid_values_binary <- function(colname, column) {
column <- column[column != "."]
failing_values <- column[column < 0 | column > 1]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
str_res <- ""
if (number_of_failing_values == 0)
str_res <- "No failing values"
else {
range_as_str <- "0-1 (binary)"
str_res <- paste(colname, "has", number_of_failing_values, "failing values")
str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
}
return(str_res)
}
check_valid_values <- function(valid_colnames, codebook_param){
res <- ""
for(i in 1:(ncol(valid_colnames))){
name <- names(valid_colnames)[i]
if (grepl("DAT", name, fixed=TRUE)){
next
}
#if("DMRBORN" == name | grepl("DAT", name, fixed=TRUE) | grepl("ISO", name , fixed=TRUE) | grepl("BEF", name, fixed=TRUE)){
# next
#}
column <- valid_colnames[,i]
# Esto falla si tu codebook no es mismo que new_harmon.csv
column_type <- codebook_param$Variable.type[codebook_param$Harmonised.variable.name == name]
if (is.na(column_type) ) {
variable <- paste("Variable ", name, " wrong", sep = " ")
res <- paste(res, variable , sep="\n")
next
}
result = switch(
column_type,
"Continuous"= check_valid_values_continuous(name , codebook_param , column),
"Binary"= check_valid_values_binary(name , column),
"Categorical"= check_valid_values_categorical(name, codebook_param , column),
"Calendar date" = paste("No failing values"),
"ISO country code"= paste("No failing values"),
{
paste("some column " , column_type , sep = " ")
}
)
if (result != "No failing values"){
res <- paste(res , result, sep="\n")
}
}
return(res)
}
data_colnames <- as.data.frame(colnames(harmonized_data))
check_valid_columns <- check_column_names(data_colnames)
columns_not_valid <- check_valid_columns$not_colnames
valid_colnames_column <- as.data.frame(check_valid_columns$colnames)
names(valid_colnames_column) = c("valid_colnames")
valid_colnames_with_data <- subset(harmonized_data , select = valid_colnames_column$valid_colnames)
result <- ""
result<-check_valid_values(valid_colnames_with_data, codebook)
print(columns_not_valid)
cat(result)
check_valid_values_continuous <- function(colname, codebook_param, column) {
column <- column[column != "."]
possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
possible_values_list = str_split(possible_values_format, "/")[[1]]
range_as_str <- str_trim(possible_values_list[1])
missing_value_format <- str_trim(str_trim(possible_values_list[2]))
separate_range <- str_split(range_as_str, "-")[[1]]
min_value <- strtoi(separate_range[1])
max_value <- strtoi(separate_range[2])
failing_values <- column[column < min_value | column > max_value]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
str_res <- ""
if (number_of_failing_values == 0) {
str_res <- "No failing values"
} else {
range_as_str <- paste(min_value, "-", max_value, "(continuous)")
str_res <- paste(colname, "has", number_of_failing_values, "failing values")
str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
}
return(str_res)
}
check_valid_values_categorical <- function(colname, codebook_param, column) {
column <- column[column != "."]
possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
possible_values_list <- str_split(possible_values_format, "/")[[1]]
possible_values_list <- lapply(possible_values_list, str_trim)
str_res <- ""
min_value <- 0
max_value <- 0
if (length(possible_values_list[[1]]) == 2) {
separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]]
min_value <- strtoi(separate_range[1])
max_value <- strtoi(separate_range[2])
} else {
possible_values_list <- lapply(possible_values_list, strtoi)[[1]]
min_value <- possible_values_list[1]
max_value <- possible_values_list[length(possible_values_list) - 1]
}
failing_values <- column[column < min_value | column > max_value]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
if (number_of_failing_values == 0) {
str_res <- "No failing values"
} else {
range_as_str <- paste(min_value, "-", max_value, " (categorical)")
str_res <- paste(colname, "has", number_of_failing_values, "failing values")
str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
}
return(str_res)
}
check_valid_values_binary <- function(colname, column) {
column <- column[column != "."]
failing_values <- column[column < 0 | column > 1]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
str_res <- ""
if (number_of_failing_values == 0)
str_res <- "No failing values"
else {
range_as_str <- "0-1 (binary)"
str_res <- paste(colname, "has", number_of_failing_values, "failing values")
str_res <- paste(str_res, "should be in range", range_as_str, sep = " ")
}
return(str_res)
}
check_valid_values <- function(valid_colnames, codebook_param){
res <- ""
for(i in 1:(ncol(valid_colnames))){
name <- names(valid_colnames)[i]
if (grepl("DAT", name, fixed=TRUE)){
next
}
#if("DMRBORN" == name | grepl("DAT", name, fixed=TRUE) | grepl("ISO", name , fixed=TRUE) | grepl("BEF", name, fixed=TRUE)){
# next
#}
column <- valid_colnames[,i]
# Esto falla si tu codebook no es mismo que new_harmon.csv
column_type <- codebook_param$Variable.type[codebook_param$Harmonised.variable.name == name]
if (is.na(column_type) ) {
variable <- paste("Variable ", name, " wrong", sep = " ")
res <- paste(res, variable , sep="\n")
next
}
result = switch(
column_type,
"Continuous"= check_valid_values_continuous(name , codebook_param , column),
"Binary"= check_valid_values_binary(name , column),
"Categorical"= check_valid_values_categorical(name, codebook_param , column),
"Calendar date" = paste("No failing values"),
"ISO country code"= paste("No failing values"),
{
paste("some column " , column_type , sep = " ")
}
)
if (result != "No failing values"){
res <- paste(res , result, sep="\n")
}
}
return(res)
}
data_colnames <- as.data.frame(colnames(harmonized_data))
check_valid_columns <- check_column_names(data_colnames)
columns_not_valid <- check_valid_columns$not_colnames
valid_colnames_column <- as.data.frame(check_valid_columns$colnames)
names(valid_colnames_column) = c("valid_colnames")
valid_colnames_with_data <- subset(harmonized_data , select = valid_colnames_column$valid_colnames)
result <- ""
result<-check_valid_values(valid_colnames_with_data, codebook)
print(columns_not_valid)
cat(result)
*.xlsx
harmonized_data/*
.Rproj.user
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
......@@ -2,6 +2,7 @@ rm(list=ls())
dir_name <- readline("Introduce the name of the directory please: ")
#/Users/gnl/Documents/CTB UPM/UNCOVER/uncover_harmonization
#C:/Users/guill/Documents/harmonize_scripts
setwd(dir_name)
......
......@@ -51,7 +51,7 @@ file_name <- readline("Introduce the name of the file to check the values: ")
harmonized_data <- ""
if (grepl(".csv" , file_name , fixed = TRUE)){
harmonized_data <- read.csv(file_name)
harmonized_data <- read.csv(file_name, sep=";")
}else if (grepl(".xlsx" , file_name , fixed = TRUE)){
harmonized_data <- read.xlsx(file_name)
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment