Commit ef256262 authored by GNajeral's avatar GNajeral

First working script

parent 841190e3
......@@ -49,9 +49,6 @@ codebook_col_names <- as.data.frame(codebook$Harmonised.variable.name)
names(codebook_col_names) <- c("col_names")
categoric_vars = c("DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN", "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN", "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC")
#----------------------------------------------------------------------------
#Test if column names are valid
......@@ -132,8 +129,8 @@ is_number <- function(x){
check_values_format <- function(valid_columns, codebook_param){
res <- ""
variables_out_of_range = "Variables out of range:"
# for(i in 1:length(valid_columns[[1]])){
for(i in 1:9){
for(i in 1:length(valid_columns[[1]])){
#for(i in 1:1){
current_column <- valid_columns[[1]][[i]]
print(current_column)
variable_type <- codebook_param$Variable.type[codebook$Harmonised.variable.name == current_column]
......@@ -150,12 +147,13 @@ check_values_format <- function(valid_columns, codebook_param){
### parse del formato de una variable continua ##
## esta sentencia funciona codebook$Possible.values.format[codebook$Harmonised.variable.name == "CMXDE"] pruebala en el interprete.
value_format <- strsplit(codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == current_column], " / ")[[1]]
high_limit <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1]))))
low_limit <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1]))))
possible_values <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == current_column]
value_format <- strsplit(possible_values, " / ")[[1]]
low_limit <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1]))))
high_limit <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1]))))
if(low_limit == ""){
high_limit <- str_trim(sub(",.*", "", value_format[1]))
low_limit <- str_trim(strtrimsub(".*,", "", value_format[1]))
low_limit <- str_trim(sub(",.*", "", value_format[1]))
high_limit <- str_trim(strtrimsub(".*,", "", value_format[1]))
}
### parse del formato de una variable continua ##
......@@ -169,6 +167,7 @@ check_values_format <- function(valid_columns, codebook_param){
res <- c(res, error)
},
{
print(paste("Higher Limit: ", high_limit))
ds.dataFrameSubset(df.name = "data",
V1.name = paste("data$", current_column, sep=""),
V2.name = high_limit,
......@@ -177,7 +176,7 @@ check_values_format <- function(valid_columns, codebook_param){
keep.NAs = TRUE,
datasources = connections)
print(paste("Lower Limit: ", low_limit))
ds.dataFrameSubset(df.name = "inRangeHigh",
V1.name = paste("inRangeHigh$", current_column, sep=""),
V2.name = low_limit,
......@@ -190,13 +189,18 @@ check_values_format <- function(valid_columns, codebook_param){
if(ds.length(paste("data$", current_column, sep=""))[[1]] > summary[[1]][[2]]){
variables_out_of_range <- paste(variables_out_of_range, current_column, sep = " ")
print(paste(current_column, "does not follow the established format", sep=" "))
print(paste("It should follow the following format: ", possible_values))
}
else{
print(paste(paste("Data in: ", current_column), " was valid"))
}
}
)
################## FIN ESTO PODRÍA IR EN UNA FUNC DIFERENTE #############
}else if (variable_type == "Categorical" || variable_type == "Binary"){
value_format <- lapply(strsplit(mierda, "/") , str_trim)[[1]]
possible_values <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == current_column]
value_format <- lapply(strsplit(possible_values, "/") , str_trim)[[1]]
tryCatch(
error = function(cnd) {
if(grepl("list them with datashield.errors()",cnd))
......@@ -209,14 +213,22 @@ check_values_format <- function(valid_columns, codebook_param){
{
contingency_table <- ds.table(paste("data$",current_column,sep=""))
row_names <- rownames(contingency_table[[1]][[3]])
result <- FALSE
for (i in 1:length(row_names)) {
if(row_names[i] == "NA")
next
if(!row_names[i] %in% value_format){
variables_out_of_range <- paste(variables_out_of_range, current_column, sep = " ")
print(paste(current_column, "does not follow the established format", sep=" "))
print(paste("It should follow the following format:", possible_values))
print(paste("Instead of:", paste(row_names, collapse = " ")))
result <- TRUE
break
}
}
if(!result){
print(paste(paste("Data in: ", current_column), " was valid"))
}
}
)
}
......@@ -232,28 +244,28 @@ inp <- auxConnections[[2]]
#Conexión a la base de datos
# ds.dim("data", datasources = connections)
# colnames <- ds.colnames("data")
# colnames
#
ds.dim("data", datasources = connections)
colnames <- ds.colnames("data")
colnames
# ds.dataFrameSubset(df.name = "data",
# V1.name = "data$DMXBMI",
# V2.name = "130",
# V1.name = "data$DMRAGEYR",
# V2.name = "150",
# Boolean.operator = "<=",
# newobj = "inRangeHigh",
# keep.NAs = TRUE,
# datasources = connections)
#
# ds.dataFrameSubset(df.name = "inRangeHigh",
# V1.name = "inRangeHigh$DMXBMI",
# V2.name = "11",
# V1.name = "inRangeHigh$DMRAGEYR",
# V2.name = "0",
# Boolean.operator = ">=",
# newobj = "inRange",
# keep.NAs = TRUE,
# datasources = connections)
#
# summary <- ds.summary("inRange$DMXBMI")
# if(ds.length("data$DMXBMI")[[1]] > summary[[1]][[2]]){
# summary <- ds.summary("inRange$DMRAGEYR")
# if(ds.length("data$DMRAGEYR")[[1]] > summary[[1]][[2]]){
# res <- c(res, paste(current_column, "does not follow the established format" , sep="\n"))
# }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment