rm(list=ls()) setwd("/home/david/Documentos/Uncover/ScriptsVictor") source("dependency_installer.R") dep_list = c("magrittr","officer","dplyr","stringr","DSI","DSOpal","DSLite","dsBaseClient") install_dependencies(dep_list) #,"DSI","DSOpal","DSLite" source("connection_parameters.R") source("necessary_functions_connection.R") codebook <- data.frame(read.csv("new_harmon.csv", sep = ",")) codebook_variable_names <- codebook[,2] namesDescMes <- codebook[,c(2,4,6)] categoric_vars = c( #Si&Sympt "CSXCTR","SMXASAH","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXSEA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CSXCOT", #Com&RF "CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN","CMXCHF","CMXVDP","CMXVDC","CMXCA","CMXVD","CMXABL","CMXAD","CMXNDP","CMXNDD","CMXCOPD","CMXPCD","CMXRI","CMXMLD","CMXNPM","CMXCLH","CMXMET","CMXTU","CMXMY","CMXNP","RFXMD","CMXANX","CMXBDO","CMXEPI","CMXOAR","CMXSPI","CMXSOT","CMXOKD","CMXUTI","CMXINC","CMXHPP","CMXHGL","RFXLC","RFXPM","RFXAP","CMXIHD","CMXATH", #Treatment "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", #Dates "DSXHO","DSXIC","DSXOS", #Home med "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","HMRANC","HMRAT","HMRINS", #Demographics "DMRGENDR","DMRRETH1","DMROCCU","DMRHREDU", #Imaging Data "IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXXRI","IMDXPN", #Complications "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC","COXRF","COXLF","COXADE","COXSTN","COXNOC" ) date_vars <- c("DATCOT","DATSO","DATAD","DATADI","DATDSI","DATDS","DATPR","DATOS") #Check which hospitals contain a given value hospitals_contains_variable <- function(variable){ res <- c() for(i in seq_along(full_variable_list)){ if (variable %in% unlist(full_variable_list[i])) res <- c(res, hospital_names[i]) } return(res) } get_working_hospitals <- function(errors){ working <- c() for(i in seq_along(hosp_to_analyze)){ hosp_in_err <- grepl(hosp_to_analyze[i], errors, fixed = TRUE) if(!(TRUE %in% hosp_in_err)) working <- c(working, hosp_to_analyze[i]) } return(working) } auxConnections <- connect() connections <- auxConnections[[1]] inp <- auxConnections[[2]] #-------------------------------------------------------- #Extract the column names from the hospital datasets full_variable_list <- ds.colnames("data") data_dimentions <- head(ds.dim("data"),-1) data_dimentions <- lapply(data_dimentions, `[[`, 1) names(data_dimentions) <- names(full_variable_list) hospital_names <- names(full_variable_list) #Intersect the elements from the variables present in the hospitals and the ones in the codebook # COMPARAR LAS VARIABLES QUE TIENEN CADA HOSPITAL CON LAS DEL CODEBOOK Y ASI ELIMINAR LAS QUE NO ESTEN for(i in seq_along(full_variable_list)){ hospital_variable_list <- unlist(full_variable_list[i]) full_variable_list[[i]] <- Reduce(intersect, list(hospital_variable_list,codebook_variable_names)) } #Make a list containing all the variables to analyze # HACES LA UNION DE LAS VARIABLES DE TODOS LOS HOSPITALES variables_to_analyze <- Reduce(union, full_variable_list) #aux_vars <- list("DMRAGEYR","LBXSBUA","LBXCRPA","LBXPLTSIA","CSXCHRA","CSXOSTA","LBXFIOA","SMXBLA","SMXSBA","SMXACA","RFXONC","LBXNENOA","LBXLYMNOA","LBXSBLA","LBXSCBA","DMRGENDR", "CSXRRA","CSXOSTA","SMXACA","LBXSBUA","LBXCRPA") # CON ESTOS SELECCIONAS LAS VARIABLES QUE QUIERES ANALIZAR aux_vars <- list("DMRGENDR", "SMXACA", "DMRAGEYR","CSXBTPA","CSXDIA","CSXOSTA","CSXRRA","LBXSBUA") vars_list <- as.list(variables_to_analyze) variables_to_analyze <- unlist(Reduce(intersect, list(aux_vars,vars_list))) df_list <- list() variances_list <- list() df_categoric_list <- list() variances_list_categoric <- list() for(i in seq_along(variables_to_analyze)){ #Select the variable to analyse variable <- variables_to_analyze[i] dataset_variable <- paste0("data$", variable) is_categoric <- variable %in% categoric_vars is_date <- variable %in% date_vars print(variable) hosp_to_analyze <- hospitals_contains_variable(variable) #If the variable is categoric if(is_categoric){ print("categoric") OK <- FALSE try({ categoric_table <- ds.table(dataset_variable, datasources = connections[hosp_to_analyze]) OK <- TRUE }) if(OK){ result <- TRUE list_bool <- !grepl("failed", categoric_table[[1]]) list_bool <- c(TRUE, TRUE, TRUE) for (i in seq_along(list_bool)){ result <- result & list_bool[i] } if(result){ categoric_table <- as.data.frame(categoric_table[[1]][[2]]) categoric_table[,"column variance"] <- NA for( j in seq_len(nrow(categoric_table))){ row_values <- as.numeric(as.vector(categoric_table[j,])) row_variance <- var(row_values[!is.na(row_values)]) categoric_table[j,"column variance"] <- row_variance } categoric_table <- categoric_table %>% mutate_if(is.numeric, round, 3) categoric_table <- as.data.frame(t(categoric_table)) #aux_df <- #INCLUIR UNA COLUMNA QUE TENGA LOS NOMBRES DE LOS HOSPITALES categoric_table["Hospitals"] <- row.names(categoric_table) aux_table <- categoric_table[c(ncol(categoric_table), 1:(ncol(categoric_table)-1))] aux_list <- list(aux_table) names(aux_list) <- variable df_categoric_list <- append(df_categoric_list, aux_list) vector_aux <- as.numeric(aux_table["column variance",][,-1]) mean_aux <- list(mean(vector_aux[!is.na(vector_aux)])) names(mean_aux) <- variable variances_list_categoric <- append(variances_list_categoric,mean_aux) }else{ print("This variable cannot be analyzed") print(paste("Error in the hospitals:", aux_table["Hospitals"])) } } # If the variable is nuemric }else if (!is_categoric & !is_date){ print("numeric") #Obtain the means and the missing values for each hospital res <- tryCatch({ # OBTIENE LAS MEDIAS DE LOS HOSPITALES means <- as.data.frame(ds.mean(dataset_variable, datasources = connections[hosp_to_analyze])[[1]])[c(1,2)] working_hospitals <- hosp_to_analyze follow <- TRUE res<- list(follow, means, working_hospitals) return(res) },error = function(e){ errors <- datashield.errors() #If all the hospitals that contain the variable give errors, the variable is skipped if(length(errors) != length(hosp_to_analyze)){ working_hospitals <- get_working_hospitals(errors) means <- as.data.frame(ds.mean(dataset_variable, datasources = connections[working_hospitals])[[1]])[c(1,2)] follow <- TRUE print("Hospitals that contain the variable:") print(hosp_to_analyze) print("Hospitals that do not give errors:") print(working_hospitals) }else{ err_mes <- paste("Unable to analyze data for the variable:", variable) print(err_mes) follow <- FALSE } res<- list(follow, means, working_hospitals) return(res) }) follow <- unlist(res[1]) means <- as.data.frame(res[2]) working_hospitals <- unlist(res[3]) if(follow == TRUE){ # PONES BONITO LOS VALORES MISSING means["Hospital"] <- rownames(means) df_aux <- means[,c(3,1,2)] colnames(df_aux) <- c("Hospital", "Mean", "Missing") df_aux$Mean<-format(round(df_aux$Mean,2),nsmall=2) missing <- as.list(df_aux$Missing) percentages <- lapply(mapply(FUN = `/`, missing, data_dimentions[working_hospitals], SIMPLIFY = FALSE), round, 4) percentages <- lapply(percentages,"*", 100) for(j in 1:length(percentages)){ value <- paste(paste(paste(missing[j], "("),percentages[j],sep=""),"%)",sep="") percentages[j] <- value } df_aux$Missing <- unlist(percentages) } means_var <- var(means[,1]) #Store the obtained values aux_list <- list(means_var) names(aux_list) <- variable variances_list<- append(variances_list,aux_list) aux_list <- list(df_aux) names(aux_list) <- variable df_list<- append(df_list,aux_list) }else if (!is_categoric & is_date){ print("date") } } datashield.logout(connections) doc_1 <- read_docx() %>% body_add_par("Data quality report", style = "heading 1")%>% body_add_par("", style = "Normal")%>% body_add_par(value = "This document contains information on every variable present in the hospital's datasets.", style = "Normal") %>% body_add_par("The variables will be displayed following a decreasing order of the means variance. If a variable is missing a hospital, it means that there has been an error analyzing that variable in that specific hospital.", style = "Normal")%>% body_add_par("", style = "Normal")%>% body_add_par("", style = "Normal")%>% body_add_par("Numeric variables", style = "heading 1")%>% body_add_par("", style = "Normal")%>% body_add_par("", style = "Normal")%>% print(doc_1, target = "data_quality_report.docx") ordered_variance_list <- variances_list[order(unlist(variances_list), decreasing = TRUE)] ordered_variance_list_Not_NA <- ordered_variance_list[!is.na(ordered_variance_list)] ordered_variance_list_NA <- ordered_variance_list[is.na(ordered_variance_list)] ordered_names_Not_NA <- names(ordered_variance_list_Not_NA) ordered_names_NA <- names(ordered_variance_list_NA) # SOLO RECORRE LAS VARIABLES NUMERICAS QUE TIENE VARIANZA NO NULA for (i in 1:length(ordered_variance_list_Not_NA)) { element <- ordered_names_Not_NA[i] tab1 <- df_list[[element]] variance <- variances_list[[element]] if(!is.na(variance)) variance <- format(round(variance, 2), nsmall = 2) print(element) codebook_entry <- subset(codebook, codebook$Harmonised.variable.name == element) description <- unlist(codebook_entry[4]) unit <- str_split(unlist(codebook_entry[6]), " ")[[1]][1] range <- codebook_entry[[7]] mes_range <- paste("The range of possible values is:", range) mes_var <- paste("The means variance obtained for this variable is", variance) mes_desc_unit <- paste(paste(paste("This variable represents", paste("'",paste(description,"'",sep=""), sep = "")), "and is measured in"),unit) doc_table <- read_docx(path = "data_quality_report.docx") %>% body_add_par(element, style = "heading 2")%>% body_add_par(mes_var)%>% body_add_par(mes_desc_unit)%>% body_add_par(mes_range)%>% body_add_par("", style = "Normal")%>% body_add_table(tab1, style = "Table Professional")%>% print(doc_table, target = "data_quality_report.docx") } doc_table <- read_docx(path = "data_quality_report.docx") %>% body_add_par("", style = "Normal")%>% body_add_par("", style = "Normal")%>% body_add_par("The following variables only appear in one hospital, the variance between the means will not be displayed.", style = "Normal")%>% body_add_par("", style = "Normal")%>% body_add_par("", style = "Normal")%>% print(doc_table, target = "data_quality_report.docx") for (i in 1:length(ordered_variance_list_NA)) { element <- ordered_names_NA[i] tab1 <- df_list[[element]] print(element) codebook_entry <- subset(codebook, codebook$Harmonised.variable.name == element) description <- unlist(codebook_entry[4]) unit <- str_split(unlist(codebook_entry[6]), " ")[[1]][1] range <- codebook_entry[[7]] mes_range <- paste("The range of possible values is:", range) mes_desc_unit <- paste(paste(paste("This variable represents", paste("'",paste(description,"'",sep=""), sep = "")), "and is measured in"),unit) doc_table <- read_docx(path = "data_quality_report.docx") %>% body_add_par(element, style = "heading 2")%>% body_add_par(mes_desc_unit)%>% body_add_par(mes_range)%>% body_add_par("", style = "Normal")%>% body_add_table(tab1, style = "Table Professional")%>% print(doc_table, target = "data_quality_report.docx") } if (length(df_categoric_list) != 0){ doc_table <- read_docx(path = "data_quality_report.docx") %>% body_add_par("", style = "Normal")%>% body_add_par("", style = "Normal")%>% body_add_par("Categoric variables", style = "heading 1")%>% body_add_par("", style = "Normal")%>% body_add_par("", style = "Normal")%>% print(doc_table, target = "data_quality_report.docx") categoric_names <- names(df_categoric_list) ordered_variance_list_categoric <- variances_list_categoric[order(unlist(variances_list_categoric), decreasing = TRUE)] ordered_variance_list_Not_NA_categoric <- ordered_variance_list_categoric[!is.na(ordered_variance_list_categoric)] ordered_variance_list_NA_categoric <- ordered_variance_list_categoric[is.na(ordered_variance_list_categoric)] ordered_names_Not_NA_categoric <- names(ordered_variance_list_Not_NA_categoric) ordered_names_NA_categoric <- names(ordered_variance_list_NA_categoric) for (i in 1:length(ordered_names_Not_NA_categoric)) { element <- ordered_names_Not_NA_categoric[i] tab1 <- as.data.frame(df_categoric_list[[element]]) var <- unlist(ordered_variance_list_Not_NA_categoric[element]) print(element) codebook_entry <- subset(codebook, codebook$Harmonised.variable.name == element) description <- unlist(codebook_entry[4]) unit <- str_split(unlist(codebook_entry[6]), " ")[[1]][1] mes_desc_unit <- paste(paste(paste("This variable represents", paste("'",paste(description,"'",sep=""), sep = "")), "and the possible values are"),unit) mes_var <- paste("The variance between rows is:", var) doc_table <- read_docx(path = "data_quality_report.docx") %>% body_add_par(element, style = "heading 2")%>% body_add_par(mes_desc_unit)%>% body_add_par(mes_var)%>% body_add_par("", style = "Normal")%>% body_add_table(tab1, style = "Table Professional")%>% print(doc_table, target = "data_quality_report.docx") } doc_table <- read_docx(path = "data_quality_report.docx") %>% body_add_par("", style = "Normal")%>% body_add_par("", style = "Normal")%>% body_add_par("The following variables only appear in one hospital, the variance between the means will not be displayed.", style = "Normal")%>% body_add_par("", style = "Normal")%>% body_add_par("", style = "Normal")%>% print(doc_table, target = "data_quality_report.docx") if(length(ordered_names_NA_categoric)>0){ for (i in 1:length(ordered_names_NA_categoric)) { element <- ordered_names_NA_categoric[i] tab1 <- as.data.frame(df_categoric_list[[element]]) print(element) codebook_entry <- subset(codebook, codebook$Harmonised.variable.name == element) description <- unlist(codebook_entry[4]) unit <- str_split(unlist(codebook_entry[6]), " ")[[1]][1] mes_desc_unit <- paste(paste(paste("This variable represents", paste("'",paste(description,"'",sep=""), sep = "")), "and the possible values are"),unit) doc_table <- read_docx(path = "data_quality_report.docx") %>% body_add_par(element, style = "heading 2")%>% body_add_par(mes_desc_unit)%>% body_add_par("", style = "Normal")%>% body_add_table(tab1, style = "Table Professional")%>% print(doc_table, target = "data_quality_report.docx") } } }