Commit ef256262 authored by GNajeral

First working script

parent 841190e3
...@@ -49,9 +49,6 @@ codebook_col_names <- as.data.frame(codebook$Harmonised.variable.name) ...@@ -49,9 +49,6 @@ codebook_col_names <- as.data.frame(codebook$Harmonised.variable.name)
names(codebook_col_names) <- c("col_names") names(codebook_col_names) <- c("col_names")
categoric_vars = c("DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN", "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN", "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC")
#---------------------------------------------------------------------------- #----------------------------------------------------------------------------
#Test if column names are valid #Test if column names are valid
...@@ -132,8 +129,8 @@ is_number <- function(x){ ...@@ -132,8 +129,8 @@ is_number <- function(x){
check_values_format <- function(valid_columns, codebook_param){ check_values_format <- function(valid_columns, codebook_param){
res <- "" res <- ""
variables_out_of_range = "Variables out of range:" variables_out_of_range = "Variables out of range:"
# for(i in 1:length(valid_columns[[1]])){ for(i in 1:length(valid_columns[[1]])){
for(i in 1:9){ #for(i in 1:1){
current_column <- valid_columns[[1]][[i]] current_column <- valid_columns[[1]][[i]]
print(current_column) print(current_column)
variable_type <- codebook_param$Variable.type[codebook$Harmonised.variable.name == current_column] variable_type <- codebook_param$Variable.type[codebook$Harmonised.variable.name == current_column]
...@@ -150,12 +147,13 @@ check_values_format <- function(valid_columns, codebook_param){ ...@@ -150,12 +147,13 @@ check_values_format <- function(valid_columns, codebook_param){
### parse del formato de una variable continua ## ### parse del formato de una variable continua ##
## esta sentencia funciona codebook$Possible.values.format[codebook$Harmonised.variable.name == "CMXDE"] pruebala en el interprete. ## esta sentencia funciona codebook$Possible.values.format[codebook$Harmonised.variable.name == "CMXDE"] pruebala en el interprete.
value_format <- strsplit(codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == current_column], " / ")[[1]] possible_values <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == current_column]
high_limit <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1])))) value_format <- strsplit(possible_values, " / ")[[1]]
low_limit <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1])))) low_limit <- str_trim(gsub(",", ".", (sub("-.*", "", value_format[1]))))
high_limit <- str_trim(gsub(",", ".", (sub(".*-", "", value_format[1]))))
if(low_limit == ""){ if(low_limit == ""){
high_limit <- str_trim(sub(",.*", "", value_format[1])) low_limit <- str_trim(sub(",.*", "", value_format[1]))
low_limit <- str_trim(strtrimsub(".*,", "", value_format[1])) high_limit <- str_trim(strtrimsub(".*,", "", value_format[1]))
} }
### parse del formato de una variable continua ## ### parse del formato de una variable continua ##
...@@ -169,6 +167,7 @@ check_values_format <- function(valid_columns, codebook_param){ ...@@ -169,6 +167,7 @@ check_values_format <- function(valid_columns, codebook_param){
res <- c(res, error) res <- c(res, error)
}, },
{ {
print(paste("Higher Limit: ", high_limit))
ds.dataFrameSubset(df.name = "data", ds.dataFrameSubset(df.name = "data",
V1.name = paste("data$", current_column, sep=""), V1.name = paste("data$", current_column, sep=""),
V2.name = high_limit, V2.name = high_limit,
...@@ -177,7 +176,7 @@ check_values_format <- function(valid_columns, codebook_param){ ...@@ -177,7 +176,7 @@ check_values_format <- function(valid_columns, codebook_param){
keep.NAs = TRUE, keep.NAs = TRUE,
datasources = connections) datasources = connections)
print(paste("Lower Limit: ", low_limit))
ds.dataFrameSubset(df.name = "inRangeHigh", ds.dataFrameSubset(df.name = "inRangeHigh",
V1.name = paste("inRangeHigh$", current_column, sep=""), V1.name = paste("inRangeHigh$", current_column, sep=""),
V2.name = low_limit, V2.name = low_limit,
...@@ -190,13 +189,18 @@ check_values_format <- function(valid_columns, codebook_param){ ...@@ -190,13 +189,18 @@ check_values_format <- function(valid_columns, codebook_param){
if(ds.length(paste("data$", current_column, sep=""))[[1]] > summary[[1]][[2]]){ if(ds.length(paste("data$", current_column, sep=""))[[1]] > summary[[1]][[2]]){
variables_out_of_range <- paste(variables_out_of_range, current_column, sep = " ") variables_out_of_range <- paste(variables_out_of_range, current_column, sep = " ")
print(paste(current_column, "does not follow the established format", sep=" ")) print(paste(current_column, "does not follow the established format", sep=" "))
print(paste("It should follow the following format: ", possible_values))
}
else{
print(paste(paste("Data in: ", current_column), " was valid"))
} }
} }
) )
################## FIN ESTO PODRÍA IR EN UNA FUNC DIFERENTE ############# ################## FIN ESTO PODRÍA IR EN UNA FUNC DIFERENTE #############
}else if (variable_type == "Categorical" || variable_type == "Binary"){ }else if (variable_type == "Categorical" || variable_type == "Binary"){
value_format <- lapply(strsplit(mierda, "/") , str_trim)[[1]] possible_values <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == current_column]
value_format <- lapply(strsplit(possible_values, "/") , str_trim)[[1]]
tryCatch( tryCatch(
error = function(cnd) { error = function(cnd) {
if(grepl("list them with datashield.errors()",cnd)) if(grepl("list them with datashield.errors()",cnd))
...@@ -209,14 +213,22 @@ check_values_format <- function(valid_columns, codebook_param){ ...@@ -209,14 +213,22 @@ check_values_format <- function(valid_columns, codebook_param){
{ {
contingency_table <- ds.table(paste("data$",current_column,sep="")) contingency_table <- ds.table(paste("data$",current_column,sep=""))
row_names <- rownames(contingency_table[[1]][[3]]) row_names <- rownames(contingency_table[[1]][[3]])
result <- FALSE
for (i in 1:length(row_names)) { for (i in 1:length(row_names)) {
if(row_names[i] == "NA") if(row_names[i] == "NA")
next next
if(!row_names[i] %in% value_format){ if(!row_names[i] %in% value_format){
variables_out_of_range <- paste(variables_out_of_range, current_column, sep = " ") variables_out_of_range <- paste(variables_out_of_range, current_column, sep = " ")
print(paste(current_column, "does not follow the established format", sep=" ")) print(paste(current_column, "does not follow the established format", sep=" "))
print(paste("It should follow the following format:", possible_values))
print(paste("Instead of:", paste(row_names, collapse = " ")))
result <- TRUE
break
} }
} }
if(!result){
print(paste(paste("Data in: ", current_column), " was valid"))
}
} }
) )
} }
...@@ -232,28 +244,28 @@ inp <- auxConnections[[2]] ...@@ -232,28 +244,28 @@ inp <- auxConnections[[2]]
#Conexión a la base de datos #Conexión a la base de datos
# ds.dim("data", datasources = connections) ds.dim("data", datasources = connections)
# colnames <- ds.colnames("data") colnames <- ds.colnames("data")
# colnames colnames
#
# ds.dataFrameSubset(df.name = "data", # ds.dataFrameSubset(df.name = "data",
# V1.name = "data$DMXBMI", # V1.name = "data$DMRAGEYR",
# V2.name = "130", # V2.name = "150",
# Boolean.operator = "<=", # Boolean.operator = "<=",
# newobj = "inRangeHigh", # newobj = "inRangeHigh",
# keep.NAs = TRUE, # keep.NAs = TRUE,
# datasources = connections) # datasources = connections)
# #
# ds.dataFrameSubset(df.name = "inRangeHigh", # ds.dataFrameSubset(df.name = "inRangeHigh",
# V1.name = "inRangeHigh$DMXBMI", # V1.name = "inRangeHigh$DMRAGEYR",
# V2.name = "11", # V2.name = "0",
# Boolean.operator = ">=", # Boolean.operator = ">=",
# newobj = "inRange", # newobj = "inRange",
# keep.NAs = TRUE, # keep.NAs = TRUE,
# datasources = connections) # datasources = connections)
# #
# summary <- ds.summary("inRange$DMXBMI") # summary <- ds.summary("inRange$DMRAGEYR")
# if(ds.length("data$DMXBMI")[[1]] > summary[[1]][[2]]){ # if(ds.length("data$DMRAGEYR")[[1]] > summary[[1]][[2]]){
# res <- c(res, paste(current_column, "does not follow the established format" , sep="\n")) # res <- c(res, paste(current_column, "does not follow the established format" , sep="\n"))
# } # }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment