rm(list=ls())

setwd("/home/david/Documentos/Uncover/ScriptsVictor")

source("dependency_installer.R")


dep_list = c("magrittr","officer","dplyr","stringr","DSI","DSOpal","DSLite","dsBaseClient")
install_dependencies(dep_list)

#,"DSI","DSOpal","DSLite"

source("connection_parameters.R")
source("necessary_functions_connection.R")

codebook <- data.frame(read.csv("new_harmon.csv", sep = ","))
codebook_variable_names <- codebook[,2]

namesDescMes <- codebook[,c(2,4,6)]

categoric_vars = c(
  #Si&Sympt
  "CSXCTR","SMXASAH","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXSEA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CSXCOT",
  
  #Com&RF
  "CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN","CMXCHF","CMXVDP","CMXVDC","CMXCA","CMXVD","CMXABL","CMXAD","CMXNDP","CMXNDD","CMXCOPD","CMXPCD","CMXRI","CMXMLD","CMXNPM","CMXCLH","CMXMET","CMXTU","CMXMY","CMXNP","RFXMD","CMXANX","CMXBDO","CMXEPI","CMXOAR","CMXSPI","CMXSOT","CMXOKD","CMXUTI","CMXINC","CMXHPP","CMXHGL","RFXLC","RFXPM","RFXAP","CMXIHD","CMXATH",
  
  #Treatment
  "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN",
  
  #Dates
  "DSXHO","DSXIC","DSXOS",
  
  #Home med
  "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","HMRANC","HMRAT","HMRINS",
  
  #Demographics
  "DMRGENDR","DMRRETH1","DMROCCU","DMRHREDU",
  
  #Imaging Data
  "IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXXRI","IMDXPN",
  
  #Complications
  "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC","COXRF","COXLF","COXADE","COXSTN","COXNOC"
  
  
)

date_vars <- c("DATCOT","DATSO","DATAD","DATADI","DATDSI","DATDS","DATPR","DATOS")

#Check which hospitals contain a given value
hospitals_contains_variable <- function(variable){
  
  res <- c()
  
  for(i in seq_along(full_variable_list)){
    if (variable %in% unlist(full_variable_list[i]))
      res <- c(res, hospital_names[i])
  }
  
  return(res)
}

get_working_hospitals <- function(errors){
  
  working <- c()
  
  for(i in seq_along(hosp_to_analyze)){
    hosp_in_err <- grepl(hosp_to_analyze[i], errors, fixed = TRUE)
    if(!(TRUE %in% hosp_in_err))
      working <- c(working, hosp_to_analyze[i])
  }
  
  return(working)
}


auxConnections <- connect()
connections <- auxConnections[[1]]
inp <- auxConnections[[2]]


#--------------------------------------------------------

#Extract the column names from the hospital datasets
full_variable_list <- ds.colnames("data")
data_dimentions <- head(ds.dim("data"),-1)
data_dimentions <- lapply(data_dimentions, `[[`, 1)
names(data_dimentions) <- names(full_variable_list)

hospital_names <- names(full_variable_list)

#Intersect the elements from the variables present in the hospitals and the ones in the codebook
# COMPARAR LAS VARIABLES QUE TIENEN CADA HOSPITAL CON LAS DEL CODEBOOK Y ASI ELIMINAR LAS QUE NO ESTEN
for(i in seq_along(full_variable_list)){
  hospital_variable_list <- unlist(full_variable_list[i])
  full_variable_list[[i]] <- Reduce(intersect, list(hospital_variable_list,codebook_variable_names))
}

#Make a list containing all the variables to analyze
# HACES LA UNION DE LAS VARIABLES DE TODOS LOS HOSPITALES
variables_to_analyze <- Reduce(union, full_variable_list)
#aux_vars <- list("DMRAGEYR","LBXSBUA","LBXCRPA","LBXPLTSIA","CSXCHRA","CSXOSTA","LBXFIOA","SMXBLA","SMXSBA","SMXACA","RFXONC","LBXNENOA","LBXLYMNOA","LBXSBLA","LBXSCBA","DMRGENDR", "CSXRRA","CSXOSTA","SMXACA","LBXSBUA","LBXCRPA")
# CON ESTOS SELECCIONAS LAS VARIABLES QUE QUIERES ANALIZAR
aux_vars <- list("DMRGENDR", "SMXACA", "DMRAGEYR","CSXBTPA","CSXDIA","CSXOSTA","CSXRRA","LBXSBUA")
vars_list <- as.list(variables_to_analyze)
variables_to_analyze <- unlist(Reduce(intersect, list(aux_vars,vars_list)))

df_list <- list()
variances_list <- list()
df_categoric_list <- list()
variances_list_categoric <- list()

for(i in seq_along(variables_to_analyze)){

  #Select the variable to analyse
  variable <- variables_to_analyze[i]
  dataset_variable <- paste0("data$", variable)
  is_categoric <- variable %in% categoric_vars
  is_date <- variable %in% date_vars
  print(variable)
  
  hosp_to_analyze <- hospitals_contains_variable(variable)
  
  
  #If the variable is categoric
  if(is_categoric){
    print("categoric")
    
    OK <- FALSE
    try({
      categoric_table <- ds.table(dataset_variable, datasources = connections[hosp_to_analyze])
      OK <- TRUE
    })
    
    if(OK){
      result <- TRUE
      list_bool <- !grepl("failed", categoric_table[[1]])
      list_bool <- c(TRUE, TRUE, TRUE)
      for (i in seq_along(list_bool)){
        result <- result & list_bool[i]
      }
      if(result){
        
        categoric_table <- as.data.frame(categoric_table[[1]][[2]])
        categoric_table[,"column variance"] <- NA
        
        for( j in seq_len(nrow(categoric_table))){
          
          row_values <- as.numeric(as.vector(categoric_table[j,]))
          row_variance <-  var(row_values[!is.na(row_values)])
          categoric_table[j,"column variance"] <- row_variance
        }
        
        
        categoric_table <- categoric_table %>% mutate_if(is.numeric, round, 3)
        categoric_table <- as.data.frame(t(categoric_table))
        
        #aux_df <- #INCLUIR UNA COLUMNA QUE TENGA LOS NOMBRES DE LOS HOSPITALES
        categoric_table["Hospitals"] <- row.names(categoric_table)
        aux_table <- categoric_table[c(ncol(categoric_table), 1:(ncol(categoric_table)-1))]
        
        aux_list <- list(aux_table)
        names(aux_list) <- variable
        df_categoric_list <- append(df_categoric_list, aux_list)
        
        vector_aux <- as.numeric(aux_table["column variance",][,-1])
        mean_aux <- list(mean(vector_aux[!is.na(vector_aux)]))
        names(mean_aux) <- variable
        variances_list_categoric <- append(variances_list_categoric,mean_aux)
        
      }else{
        print("This variable cannot be analyzed")
        print(paste("Error in the hospitals:", aux_table["Hospitals"]))
      }
    }
    
  # If the variable is nuemric  
  }else
    if (!is_categoric & !is_date){
    print("numeric")
    
    #Obtain the means and the missing values for each hospital
    res <- tryCatch({
      # OBTIENE LAS MEDIAS DE LOS HOSPITALES
      means <- as.data.frame(ds.mean(dataset_variable, datasources = connections[hosp_to_analyze])[[1]])[c(1,2)]
      working_hospitals <- hosp_to_analyze
      follow <- TRUE
      res<- list(follow, means, working_hospitals)
      return(res)
  
    },error = function(e){
      errors <- datashield.errors()
      
      #If all the hospitals that contain the variable give errors, the variable is skipped
      if(length(errors) != length(hosp_to_analyze)){
        working_hospitals <- get_working_hospitals(errors)
        means <- as.data.frame(ds.mean(dataset_variable, datasources = connections[working_hospitals])[[1]])[c(1,2)]
        follow <- TRUE
        print("Hospitals that contain the variable:")
        print(hosp_to_analyze)
        print("Hospitals that do not give errors:")
        print(working_hospitals)
      }else{
        err_mes <- paste("Unable to analyze data for the variable:", variable)
        print(err_mes)
        follow <- FALSE
      }
      
      res<- list(follow, means, working_hospitals)
      
      return(res)
    })
    
    follow <- unlist(res[1])
    means <- as.data.frame(res[2])
    working_hospitals <- unlist(res[3])
    
    if(follow == TRUE){
      # PONES BONITO LOS VALORES MISSING
      means["Hospital"] <- rownames(means)
      df_aux <- means[,c(3,1,2)]
      colnames(df_aux) <- c("Hospital", "Mean", "Missing")
      df_aux$Mean<-format(round(df_aux$Mean,2),nsmall=2)
      missing <- as.list(df_aux$Missing)
      percentages <- lapply(mapply(FUN = `/`, missing, data_dimentions[working_hospitals], SIMPLIFY = FALSE), round, 4)
      percentages <- lapply(percentages,"*", 100)
      
      for(j in 1:length(percentages)){
        value <- paste(paste(paste(missing[j], "("),percentages[j],sep=""),"%)",sep="")
        percentages[j] <- value
      }
      
      df_aux$Missing <- unlist(percentages)
    }
    

    means_var <- var(means[,1])
    
    #Store the obtained values
    aux_list <- list(means_var)
    names(aux_list) <- variable
    variances_list<- append(variances_list,aux_list)
    aux_list <- list(df_aux)
    names(aux_list) <- variable
    df_list<- append(df_list,aux_list)
    
  }else if (!is_categoric & is_date){
    print("date")
  }
  
}



 datashield.logout(connections)








doc_1 <- read_docx() %>%
  body_add_par("Data quality report", style = "heading 1")%>%
  body_add_par("", style = "Normal")%>%
  body_add_par(value = "This document contains information on every variable present in the hospital's datasets.", style = "Normal") %>% 
  body_add_par("The variables will be displayed following a decreasing order of the means variance. If a variable is missing a hospital, it means that there has been an error analyzing that variable in that specific hospital.", style = "Normal")%>%
  body_add_par("", style = "Normal")%>%
  body_add_par("", style = "Normal")%>%
  body_add_par("Numeric variables", style = "heading 1")%>%
  body_add_par("", style = "Normal")%>%
  body_add_par("", style = "Normal")%>%
print(doc_1, target = "data_quality_report.docx") 
  

ordered_variance_list <- variances_list[order(unlist(variances_list), decreasing = TRUE)]
ordered_variance_list_Not_NA <- ordered_variance_list[!is.na(ordered_variance_list)]
ordered_variance_list_NA <- ordered_variance_list[is.na(ordered_variance_list)]
ordered_names_Not_NA <- names(ordered_variance_list_Not_NA)
ordered_names_NA <- names(ordered_variance_list_NA)


# SOLO RECORRE LAS VARIABLES NUMERICAS QUE TIENE VARIANZA NO NULA
for (i in 1:length(ordered_variance_list_Not_NA)) {
  element <- ordered_names_Not_NA[i]
  
  tab1 <- df_list[[element]]
  variance <- variances_list[[element]]
  if(!is.na(variance))
    variance <- format(round(variance, 2), nsmall = 2)
  
  print(element)
  
  codebook_entry <- subset(codebook, codebook$Harmonised.variable.name == element)
  description <- unlist(codebook_entry[4])
  unit <- str_split(unlist(codebook_entry[6]), " ")[[1]][1]
  range <- codebook_entry[[7]]
  mes_range <- paste("The range of possible values is:", range)
  
  mes_var <- paste("The means variance obtained for this variable is", variance)
  mes_desc_unit <- paste(paste(paste("This variable represents", paste("'",paste(description,"'",sep=""), sep = "")), "and is measured in"),unit)
  
  
  
  doc_table <- read_docx(path = "data_quality_report.docx") %>%
    body_add_par(element, style = "heading 2")%>%
    body_add_par(mes_var)%>%
    body_add_par(mes_desc_unit)%>%
    body_add_par(mes_range)%>%
    body_add_par("", style = "Normal")%>%
    body_add_table(tab1, style = "Table Professional")%>%
  print(doc_table, target = "data_quality_report.docx")
  
}  

doc_table <- read_docx(path = "data_quality_report.docx") %>%
  body_add_par("", style = "Normal")%>%
  body_add_par("", style = "Normal")%>%
  body_add_par("The following variables only appear in one hospital, the variance between the means will not be displayed.", style = "Normal")%>%
  body_add_par("", style = "Normal")%>%
  body_add_par("", style = "Normal")%>%
print(doc_table, target = "data_quality_report.docx")



for (i in 1:length(ordered_variance_list_NA)) {
  
  element <- ordered_names_NA[i]
  
  tab1 <- df_list[[element]]
  
  print(element)
  
  codebook_entry <- subset(codebook, codebook$Harmonised.variable.name == element)
  description <- unlist(codebook_entry[4])
  unit <- str_split(unlist(codebook_entry[6]), " ")[[1]][1]
  range <- codebook_entry[[7]]
  mes_range <- paste("The range of possible values is:", range)
  
  mes_desc_unit <- paste(paste(paste("This variable represents", paste("'",paste(description,"'",sep=""), sep = "")), "and is measured in"),unit)
  
  doc_table <- read_docx(path = "data_quality_report.docx") %>%
    body_add_par(element, style = "heading 2")%>%
    body_add_par(mes_desc_unit)%>%
    body_add_par(mes_range)%>%
    body_add_par("", style = "Normal")%>%
    body_add_table(tab1, style = "Table Professional")%>%
    print(doc_table, target = "data_quality_report.docx")
  
}  



if (length(df_categoric_list) != 0){

  doc_table <- read_docx(path = "data_quality_report.docx") %>%
  body_add_par("", style = "Normal")%>%
  body_add_par("", style = "Normal")%>%
  body_add_par("Categoric variables", style = "heading 1")%>%
  body_add_par("", style = "Normal")%>%
  body_add_par("", style = "Normal")%>%
  print(doc_table, target = "data_quality_report.docx")
  categoric_names <- names(df_categoric_list)

ordered_variance_list_categoric <- variances_list_categoric[order(unlist(variances_list_categoric), decreasing = TRUE)]
ordered_variance_list_Not_NA_categoric <- ordered_variance_list_categoric[!is.na(ordered_variance_list_categoric)]
ordered_variance_list_NA_categoric <- ordered_variance_list_categoric[is.na(ordered_variance_list_categoric)]
ordered_names_Not_NA_categoric <- names(ordered_variance_list_Not_NA_categoric)
ordered_names_NA_categoric <- names(ordered_variance_list_NA_categoric)

for (i in 1:length(ordered_names_Not_NA_categoric)) {

  element <- ordered_names_Not_NA_categoric[i]

  tab1 <- as.data.frame(df_categoric_list[[element]])
  var <- unlist(ordered_variance_list_Not_NA_categoric[element])

  print(element)

  codebook_entry <- subset(codebook, codebook$Harmonised.variable.name == element)
  description <- unlist(codebook_entry[4])
  unit <- str_split(unlist(codebook_entry[6]), " ")[[1]][1]

  mes_desc_unit <- paste(paste(paste("This variable represents", paste("'",paste(description,"'",sep=""), sep = "")), "and the possible values are"),unit)
  mes_var <- paste("The variance between rows is:", var)

  doc_table <- read_docx(path = "data_quality_report.docx") %>%
    body_add_par(element, style = "heading 2")%>%
    body_add_par(mes_desc_unit)%>%
    body_add_par(mes_var)%>%
    body_add_par("", style = "Normal")%>%
    body_add_table(tab1, style = "Table Professional")%>%
    print(doc_table, target = "data_quality_report.docx")

}

doc_table <- read_docx(path = "data_quality_report.docx") %>%
  body_add_par("", style = "Normal")%>%
  body_add_par("", style = "Normal")%>%
  body_add_par("The following variables only appear in one hospital, the variance between the means will not be displayed.", style = "Normal")%>%
  body_add_par("", style = "Normal")%>%
  body_add_par("", style = "Normal")%>%
  print(doc_table, target = "data_quality_report.docx")


if(length(ordered_names_NA_categoric)>0){
  for (i in 1:length(ordered_names_NA_categoric)) {

    element <- ordered_names_NA_categoric[i]

    tab1 <- as.data.frame(df_categoric_list[[element]])

    print(element)

    codebook_entry <- subset(codebook, codebook$Harmonised.variable.name == element)
    description <- unlist(codebook_entry[4])
    unit <- str_split(unlist(codebook_entry[6]), " ")[[1]][1]

    mes_desc_unit <- paste(paste(paste("This variable represents", paste("'",paste(description,"'",sep=""), sep = "")), "and the possible values are"),unit)

    doc_table <- read_docx(path = "data_quality_report.docx") %>%
      body_add_par(element, style = "heading 2")%>%
      body_add_par(mes_desc_unit)%>%
      body_add_par("", style = "Normal")%>%
      body_add_table(tab1, style = "Table Professional")%>%
      print(doc_table, target = "data_quality_report.docx")

  }


}
}