scripts de harmonizacion de David

3570d2a0 · pxp9 · 3570d2a0 · 3570d2a0 · 3570d2a0 · 3570d2a0
Commit 3570d2a0 authored Feb 01, 2023 by pxp9
21 changed files
--- a/README.txt
+++ b/README.txt
+empty_cell_fixer.r + add_missing_values.r
+
+Cambia las casillas vacías, puntos, "NA" y "Unknown" por "Missing", y en la columna "_numeric" correspondiente los cambia por 9999
+
+dora_test.r
+
+Ejecuta la función personalizada que calcula el score derivado de las variables
+
+full_quality_report.r
+
+Genera un documento Word con los hospitales y las variables especificadas, donde se ordenan de mayor a menor varianza entre los hospitales
+
+survival_curve.r
+
+Genera una curva de supervivencia
+
+table1_script.r + necessary_functions_table1_v2_subset.r
+
+Genera una tabla con la información sobre los hospitales y las variables indicadas.
+
+valid_variables_script.r
+
+Comprueba que los valores de las variables del hospital indicado estén dentro de los rangos
+(Se puede cambiar para que refleje lo que se desea obtener)
+
+treatment_heatmap.r
+
+Devuelve un mapa de calor que indica el porcentaje de pacientes que han recibido cada tratamiento, separados por olas.
+
+ressourceCleaner.r
+
+Deja los datos con Yes/No Male/Female etc
+
+ressourceHarmonizer.r
+
+Deriva las variables "_numeric"
\ No newline at end of file
--- a/add_missing_values.r
+++ b/add_missing_values.r
+setwd("C:/Users/victor/Documents/TFG/r-analytics-master")  # hard-coded project root; new_harmon.csv is read relative to it
+codebook <- read.csv("new_harmon.csv", sep = ",")  # codebook table; the columns used below are Variable.type and Harmonised.variable.name
+# Harmonised variable names, grouped by the codebook's Variable.type value.
+binary <- unlist(codebook[codebook["Variable.type"] == "Binary",]["Harmonised.variable.name"])
+categorical <- unlist(codebook[codebook["Variable.type"] == "Categorical",]["Harmonised.variable.name"])
+continuous <- unlist(codebook[codebook["Variable.type"] == "Continuous",]["Harmonised.variable.name"])
+# Binary and categorical variables are handled together by add_missing_values().
+categoric_vars <- c(binary, categorical)
+
+# Replace the missing entries of a categorical column with the label "Missing".
+#
+# column: vector holding the cells of one column.
+# Returns the same vector with every NA, "", "NA" and "Unknown" entry set to
+# "Missing"; all other entries are left untouched. A zero-length input is
+# returned unchanged (the previous 1:length(column) loop failed on it).
+replace_with_Missing_categoric <- function(column){
+  
+  is_missing <- is.na(column) | column %in% c("", "NA", "Unknown")
+  
+  # Assign only when something matched: a sub-assignment with an all-FALSE
+  # mask would still coerce a numeric column to character.
+  if (any(is_missing)) {
+    column[is_missing] <- "Missing"
+  }
+  
+  return (column)
+}
+
+# Replace the missing entries of a "_numeric" column with the code 9999.
+#
+# column: vector holding the cells of one column.
+# Returns the same vector with every NA, "", "NA" and "Unknown" entry set to
+# 9999 (stored as "9999" when the column is character); other entries are
+# left untouched. A zero-length input is returned unchanged (the previous
+# 1:length(column) loop failed on it).
+replace_with_Missing_num_categoric <- function(column){
+  
+  is_missing <- is.na(column) | column %in% c("", "NA", "Unknown")
+  
+  # Assign only when something matched, so a column without missing values
+  # keeps its original type (e.g. integer is not promoted to double).
+  if (any(is_missing)) {
+    column[is_missing] <- 9999
+  }
+  
+  return (column)
+}
+
+
+# Blank out the missing entries of a continuous column.
+#
+# column: vector holding the cells of one column.
+# Returns the same vector with every NA, "NA" and "Unknown" entry set to the
+# empty string ""; other entries are left untouched. A zero-length input is
+# returned unchanged (the previous 1:length(column) loop failed on it).
+replace_with_Missing_continuous <- function(column){
+  
+  is_missing <- is.na(column) | column %in% c("NA", "Unknown")
+  
+  # Assign only when something matched: writing "" turns a numeric column
+  # into character, which must not happen for columns without missing values.
+  if (any(is_missing)) {
+    column[is_missing] <- ""
+  }
+  
+  return (column)
+  
+}
+
+
+# Read one harmonised CSV and normalise its missing values column by column.
+#
+# path_to_file: file name, resolved relative to the hard-coded current_db
+#   directory this function switches into (the working directory stays
+#   changed after the call, as before).
+# Returns the data frame after applying, per column:
+#   - name listed in `categoric_vars`  -> replace_with_Missing_categoric()
+#   - name containing "_numeric"       -> replace_with_Missing_num_categoric()
+#   - name listed in `continuous`      -> replace_with_Missing_continuous()
+# A percentage is printed after each column as a progress indicator.
+add_missing_values <- function(path_to_file){
+  
+  setwd("C:/Users/victor/Documents/TFG/r-analytics-master/ressources/current_db")
+  data <- read.csv(path_to_file, sep = ",")
+  
+  data_colnames <- colnames(data)
+  
+  num_categoric <- data_colnames[grepl("_numeric", data_colnames)]
+  
+  total <- length(data_colnames)
+  
+  # seq_along() instead of 1:total: a file without columns is a no-op rather
+  # than a loop over the indices 1 and 0.
+  for (i in seq_along(data_colnames)){
+    colname <- data_colnames[i]
+    
+    progress <- round((100*i/total),digits = 0)
+    print(paste0(progress,"%"))
+    
+    if(colname %in% categoric_vars){
+      data[colname] <- replace_with_Missing_categoric(data[[colname]])
+    }
+    
+    if(colname %in% num_categoric){
+      data[colname] <- replace_with_Missing_num_categoric(data[[colname]])
+    }
+    
+    if(colname %in% continuous){
+      data[colname] <- replace_with_Missing_continuous(data[[colname]])
+    }
+    
+  }
+  
+  return(data)
+  
+}
+
+
--- a/admissions_histogram.R
+++ b/admissions_histogram.R
+# Entry script: loads helpers, opens the server connections and draws the admissions graph.
+
+rm(list=ls())  # NOTE: wipes every object in the calling workspace
+
+setwd("C:/Users/victor/Documents/TFG/r-analytics-master")  # hard-coded project root; the source() calls below are relative to it
+
+
+source("dependency_installer.R")  # provides install_dependencies()
+source("connection_parameters.R")
+source("necessary_functions_graph.R")
+
+dep_list = c( "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient")
+install_dependencies(dep_list)
+
+dir.create("./admissions_analysis", showWarnings = FALSE)  # output folder; no warning if it already exists
+setwd("./admissions_analysis")
+# NOTE(review): connect() is not defined in this script; presumably it comes from one of the sourced files — confirm.
+aux <- connect()
+connections <- aux[[1]]
+inp <- aux[[2]]
+
+
+obtain_admissions_graph(connections,inp)
+
+
--- a/connection_parameters.R
+++ b/connection_parameters.R
+
+# Server display names. connect() indexes this vector and the five vectors below with the same selection, so position i in each describes one server.
+hospital_names <- c(    # Add the Baskent and Sacrocuore ones
+  "Princesa",
+  "CIPH",
+  "UMF_Iasis",
+  "SMUC",
+  "HM",
+  "Porto",
+  "FJD",
+  "Coimbra",
+  "UNAV",
+  "TU",
+  "Ankara Impatient",
+  "Konya Impatient",
+  "Istambul Impatient",
+  "Izmir Impatient",
+  "Alanya Impatient",
+  "Adana Impatient",
+  "Ankara Outpatient",
+  "Konya Outpatient",
+  "Istambul Outpatient",
+  "Izmir Outpatient",
+  "Alanya Outpatient",
+  "Sacrocuore Emergency",
+  "Sacrocuore Employees",
+  "Sacrocuore Verona",
+  "Sacrocuore Isaric",
+  "TUDublin",
+  "UMF_Cluj",
+  "UdeA",
+  "Inantro",
+  "UNSA",
+  "UZA"
+)
+# Project name per server; connect() joins it with the resource name as "<project>.<resource>".
+project_names <- c(
+  "FIBHULP",
+  "CIPH_unCoVer",
+  "umfiasi",
+  "SMUC",
+  "FiHM",
+  "uncover-up",
+  "IISFJD",
+  "IPC",
+  "unCOVer-UNAV",
+  "TU_Uncover",
+  "BU",
+  "BU",
+  "BU",
+  "BU",
+  "BU",
+  "BU",
+  "BU",
+  "BU",
+  "BU",
+  "BU",
+  "BU",
+  "S_uncover",
+  "S_uncover",
+  "S_uncover",
+  "S_uncover",
+  "TUDublin",
+  "UMF_Cluj",
+  "INS_Data",
+  "INANTRO",
+  "UnCoVer-BiH-Final",
+  "UZA"
+)
+# Resource name per server; second half of the "<project>.<resource>" identifier built in connect().
+resource_names <- c(
+  "Harmonized_variables_2",
+  "CIPH_numeric_derivated",
+  "20220719_HarmonisedUMFIasi",
+  "SMUC_resource",
+  "20220720_HarmonisedHM",
+  "Resource_derived",
+  "IISFJD_Harmonized_1",
+  "IPC_Harmonized",
+  "UNAV_rsc",
+  "TU_Harmonized",
+  "inpatient_ankara",
+  "inpatient_konya",
+  "inpatient_istanbul",
+  "inpatient_izmir",
+  "inpatient_alanya",
+  "inpatient_adana",
+  "outpatient_ankara",
+  "outpatient_konya",
+  "outpatient_istanbul",
+  "outpatient_izmir",
+  "outpatient_alanya",
+  "emergency",
+  "employees",
+  "verona",
+  "isaric",
+  "TUDublin_harmonised",
+  "Romania",
+  "colombia_all",
+  "Inantro",
+  "20220722_HarmonizedUNSA",
+  "UZA_prelim"
+)
+# Server URL per entry; several entries share one URL and differ only in project/resource.
+urls <- c(
+  "https://192.168.1.200:8001",
+  "https://192.168.1.200:8002",
+  "https://192.168.1.200:8003",
+  "https://192.168.1.200:8006",
+  "https://192.168.1.50:9002",
+  "https://192.168.1.102",
+  "https://uncover.itg.be",
+  "https://uncover.itg.be",
+  "https://192.168.1.50:9001",
+  "https://192.168.1.200:8004",
+  "https://192.168.1.101:8443",
+  "https://192.168.1.101:8443",
+  "https://192.168.1.101:8443",
+  "https://192.168.1.101:8443",
+  "https://192.168.1.101:8443",
+  "https://192.168.1.101:8443",
+  "https://192.168.1.101:8443",
+  "https://192.168.1.101:8443",
+  "https://192.168.1.101:8443",
+  "https://192.168.1.101:8443",
+  "https://192.168.1.101:8443",
+  "https://192.168.1.50:8890",
+  "https://192.168.1.50:8890",
+  "https://192.168.1.50:8890",
+  "https://192.168.1.50:8890",
+  "https://uncover.itg.be",
+  "https://192.168.1.200:8005",
+  "https://fenfisdi.udea.edu.co/opal",
+  "https://192.168.1.200:8007",
+  "https://192.168.1.200:8008",
+  "https://uncover.itg.be"
+  
+)
+
+users <- c(
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "emertens",
+  "emertens",
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "emertens",
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "user_analisis",
+  "emertens"
+)
+
+pass <- c(
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "3^z4AV.)hG5~PT/]",
+  "3^z4AV.)hG5~PT/]",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "3^z4AV.)hG5~PT/]",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "Ekfl07UUgz",
+  "3^z4AV.)hG5~PT/]"
+)
--- a/data_quality_report.docx
+++ b/data_quality_report.docx
--- a/dependency_installer.R
+++ b/dependency_installer.R
+
+# Make sure every package in `dep_list` is installed and attached.
+#
+# dep_list: character vector of package names.
+# Each package is attached if it is already available. Otherwise it is
+# installed first: "dsBaseClient" and "DSI" (pinned to ref 1.3.3) are taken
+# from the datashield GitHub organisation through remotes, everything else
+# from CRAN. "Rtools" is never attached.
+# Called for its side effects; the value of the loop (NULL) is returned
+# invisibly.
+# ToDo: exception control for unexistent/misspelled packages.
+
+install_dependencies <- function(dep_list) {
+  
+  # Deprecated since executing server is Ubuntu.
+  #if(Sys.info()["sysname"] == "Windows"){
+  #  dep_list[length(dep_list)+1] = "Rtools"
+  #}
+  for (p in dep_list) {
+
+    # TRUE is spelled out: T is an ordinary variable that can be reassigned.
+    if (!require(p, character.only = TRUE)) {
+      if (p == "dsBaseClient") {
+        library(remotes)
+        install_github("datashield/dsBaseClient", dependencies = TRUE)
+      } else if (p == "DSI") {
+        library(remotes)
+        install_github('datashield/DSI', ref = '1.3.3', dependencies = TRUE)
+      } else {
+        install.packages(p, dependencies = TRUE, repos = "https://cran.us.r-project.org")
+      }
+    }
+    # Attach after a possible install; this fails loudly if the install did not work.
+    if (p != "Rtools") {
+      library(p, character.only = TRUE)
+    }
+  }
+  
+}
+
--- a/dora_test.R
+++ b/dora_test.R
+rm(list=ls())  # NOTE: wipes every object in the calling workspace
+# Entry script: assigns the server-side DORA_scoresDS result to DORA_table and tabulates its DORA_class column.
+setwd("C:/Users/victor/Documents/TFG/r-analytics-master")  # hard-coded project root; the source() calls below are relative to it
+
+source("dependency_installer.R")  # provides install_dependencies()
+
+
+dep_list = c("magrittr","officer","dplyr","stringr","DSI","DSOpal","DSLite","dsBaseClient")
+install_dependencies(dep_list)
+
+#,"DSI","DSOpal","DSLite"
+
+source("connection_parameters.R")  # defines the per-server vectors read by connect()
+source("necessary_functions_connection.R")  # defines connect()
+
+auxConnections <- connect()  # returns list(connections, selected indices)
+connections <- auxConnections[[1]]
+inp <- auxConnections[[2]]
+# Build the call DORA_scoresDS("data") and assign its result on each server under the name DORA_table.
+calltext <- call("DORA_scoresDS", "data")
+newobj <- "DORA_table"
+datashield.assign(connections, newobj, calltext)
+
+ds.colnames("DORA_table")
+ds.table("DORA_table$DORA_class")
+
+datashield.logout(connections)
--- a/empty_cell_fixer.r
+++ b/empty_cell_fixer.r
+
+# Batch driver: runs add_missing_values() on the entries of the current_db
+# folder and writes each result, under the same file name, to current_db/ready.
+setwd("C:/Users/victor/Documents/TFG/r-analytics-master")
+source("add_missing_values.R")
+
+# [-6] drops the sixth directory entry by position.
+# NOTE(review): presumably an entry that must not be processed — confirm it
+# still sits at position 6.
+list_of_files <- list.files("C:/Users/victor/Documents/TFG/r-analytics-master/ressources/current_db")[-6]
+
+#ist_of_files <- "Harmonized_LIH.csv"
+
+# Iterate over the names themselves: with an empty listing the previous
+# 1:length(list_of_files) loop ran for the indices 1 and 0.
+for (file_name in list_of_files) {
+  
+  print(paste ("Fixing file:", file_name))
+  
+  setwd("C:/Users/victor/Documents/TFG/r-analytics-master/ressources/current_db")
+  
+  ready_file <- add_missing_values(file_name)
+  
+  setwd("C:/Users/victor/Documents/TFG/r-analytics-master/ressources/current_db/ready")
+  
+  write.csv(x=ready_file, file = file_name, row.names = FALSE)
+  
+}
--- a/full_quality_report.R
+++ b/full_quality_report.R
--- a/general_histogram.R
+++ b/general_histogram.R
+rm(list=ls())  # NOTE: wipes every object in the calling workspace
+# Entry script: connects to the selected servers, renders the result of obtainData() into "<name>.png" and logs out.
+setwd("C:/Users/victor/Desktop/TFG/r-analytics-master")  # NOTE(review): "Desktop" here, "Documents" in the other scripts of this commit — confirm
+
+source("required_folder_checker.R")
+source("argument_hasher.R")
+source("dependency_installer.R")  # provides install_dependencies()
+source("connection_parameters.R")
+source("necessary_function_table1s.R")
+
+dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient")
+install_dependencies(dep_list)
+
+
+dir.create("./demographic_analisis", showWarnings = FALSE)  # output folder; no warning if it already exists
+setwd("./demographic_analisis")
+
+# NOTE(review): connect(), checkHasOutput(), getHname() and obtainData() are not defined in this script; presumably they come from the sourced files — confirm.
+aux <- connect()
+connections <- aux[[1]]
+inp <- aux[[2]]
+
+hasOutput  <- checkHasOutput(inp)
+
+name <- getHname(inp)
+
+png(paste(name,".png",sep=""),width=800, height=400)  # open the PNG device; closed by dev.off() below
+
+res <- obtainData(connections, hasOutput, inp)
+
+print(res)  # printed while the PNG device is open
+dev.off()
+
+datashield.logout(connections) 
+
+res
+
+
--- a/harmon.csv
+++ b/harmon.csv
--- a/necessary_functions_connection.R
+++ b/necessary_functions_connection.R
+
+# Interactively pick one or more servers and log in to them.
+#
+# Prints a numbered menu, reads the selected numbers with scan(), picks the
+# matching entries of the global vectors hospital_names, project_names,
+# resource_names, urls, users and pass, logs in through DSI and assigns the
+# resource on every server as the data frame `data`.
+#
+# Returns list(connections, inp): the DSI connections and the numbers typed
+# by the user.
+# Stops with an error when nothing is selected or a number is outside the
+# range of hospital_names.
+connect <- function (){
+  
+  cat("\n\n\n----------------------------------------------------------------------------------------------------------")
+  cat("\nPlease select the number corresponding to the hospital you want to analyse, if you want to do a combined analysis select multiple hospitals")
+  cat("\n  
+        Princesa -> 1  
+        CIPH -> 2  
+        UMF_Iasis -> 3  
+        SMUC -> 4  
+        HM -> 5  
+        Porto -> 6  
+        FJD -> 7  
+        Coimbra -> 8  
+        UNAV -> 9 
+        TU -> 10 
+        
+          Baskent:
+          
+          Ankara Impatient -> 11
+          Konya Impatient -> 12
+          Istambul Impatient -> 13
+          Izmir Impatient -> 14
+          Alanya Impatient -> 15
+          Adana Impatient -> 16
+          
+          Ankara Outpatient -> 17
+          Konya Outpatient -> 18
+          Istambul Outpatient -> 19
+          Izmir Outpatient -> 20
+          Alanya Outpatient -> 21
+          
+        Sacrocuore:
+        
+          Emergency database -> 22
+          Employees database -> 23
+          Verona database -> 24
+          Isaric -> 25
+        
+        TU Dublin -> 26
+        UMF Cluj -> 27
+        UdeA -> 28
+        Inantro -> 29
+        UNSA -> 30
+        UZA -> 31
+          
+      ")
+  
+  inp <- scan()
+  
+  # Without this check an empty or out-of-range selection produced NA server
+  # entries that only failed later, during login.
+  if (length(inp) == 0 || anyNA(inp) || any(inp < 1 | inp > length(hospital_names))) {
+    stop("Invalid selection: enter one or more numbers between 1 and ",
+         length(hospital_names), call. = FALSE)
+  }
+  
+  builder <- DSI::newDSLoginBuilder()
+  
+  # Local copies restricted to the selection; the global vectors are untouched.
+  hospital_names <- hospital_names[inp]
+  project_names <- project_names[inp]
+  resource_names <- resource_names[inp]
+  urls <- urls[inp]
+  users <- users[inp]
+  pass <- pass[inp]
+  
+  print(hospital_names)
+  print(project_names)
+  print(resource_names)
+  print(urls)
+  print(users)
+  # The passwords are deliberately no longer printed: echoing credentials
+  # leaks them into the console and any captured log.
+  
+  for(i in seq_along(urls)){
+    print(paste("Connecting to Server with URL:", urls[i], sep=" "))
+    builder$append(server = hospital_names[i], url = urls[i],
+                   user = users[i], password = pass[i],
+                   resource = paste(project_names[i], resource_names[i], sep="."),
+                   driver = "OpalDriver", options="list(ssl_verifyhost=0,ssl_verifypeer=0)")
+  }
+  
+  logindata <- builder$build()
+  
+  connections <- DSI::datashield.login(logins = logindata, assign = TRUE, symbol = "D", failSafe = TRUE)
+  
+  # Expose the resource assigned as D as a server-side data frame named `data`.
+  datashield.assign.expr(connections, symbol = 'data', expr = quote(as.resource.data.frame(D)))
+  #datashield.assign.expr(connections, symbol = 'auxDf', expr = quote(as.resource.data.frame(D)))
+  
+  
+  print("Successful connection to servers.")
+  
+  return(list(connections,inp))
+}
--- a/necessary_functions_table1_v2_subset.R
+++ b/necessary_functions_table1_v2_subset.R
--- a/new_harmon.csv
+++ b/new_harmon.csv
--- a/old_codebook.zip
+++ b/old_codebook.zip
--- a/ressourceCleaner.R
+++ b/ressourceCleaner.R
+
+rm(list=ls())
+
+# dplyr supplies %>%, select() and contains() used below; stringr supplies
+# str_replace(), which this script calls further down. Neither was loaded
+# anywhere in this script, so it only ran in sessions that already had both
+# attached.
+library(dplyr)
+library(stringr)
+
+# The second setwd() overrides the first; both are kept as in the original.
+setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources")
+setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient"))
+
+# Change this to the name of the resource file that should be cleaned
+hospital <- data.frame(read.csv("konya_outpatient.csv", sep=","))
+
+
+# Drop every column whose name contains "numeric".
+hospital <- hospital %>% select(-contains("numeric"))
+
+# hospital["NOT.HARMONISED"] <- NULL
+# 
+# names <- colnames(hospital)
+# for (i in 1:length(names)){
+# 
+#   if(grepl("NOT.HARMONISED", names[i])){
+#     hospital[names[i]] <- NULL
+#     print(paste("quito ", names[i]))
+#   }
+# 
+# }
+# hospital <- hospital[-1,]
+
+setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data")
+
+# Codebook sheets: a fixed number of leading rows and the first five columns
+# of each file. Com&RF.csv is comma-separated, the others use ";".
+ComAndRF <- read.csv("Com&RF.csv", sep=",")[1:64,1:5]
+Complications <- read.csv("Complications.csv", sep=";")[1:20,1:5]
+Dates <- read.csv("Dates.csv", sep=";")[1:12,1:5]
+Demographics <- read.csv("Demographics.csv", sep=";")[1:9,1:5]
+Home_med <- read.csv("Home_med.csv", sep=";")[1:13,1:5]
+Imaging_data <- read.csv("Imaging_data.csv", sep=";")[1:11,1:5]
+Labo <- read.csv("Labo.csv", sep=";")[1:143,1:5]
+SiAndSympt <- read.csv("Si&Sympt.csv", sep=";")[1:50,1:5]
+Treatment <- read.csv("Treatment.csv", sep=";")[1:32,1:5]
+LifestyleAndDiet <- read.csv("Lifestyle&Diet.csv", sep=";")[1:165,1:5]
+
+
+# Stack all sheets into one codebook, in the same order as before.
+harmonised_data <- rbind(SiAndSympt, ComAndRF, Treatment, Dates, Demographics,
+                         Home_med, Imaging_data, Complications, Labo,
+                         LifestyleAndDiet)
+
+# Drop the per-sheet tables. The previous list named "Complications" twice
+# (which raised a warning) and never removed Treatment.
+rm(list=c("SiAndSympt",
+          "ComAndRF",
+          "Treatment",
+          "Dates",
+          "Demographics",
+          "Home_med",
+          "Imaging_data",
+          "Complications",
+          "Labo",
+          "LifestyleAndDiet"))
+
+# Variables treated as yes/no: every codebook entry whose format is
+# "No/Yes / missing" (either capitalisation of "missing"), followed by a
+# hand-maintained list of extra variable names.
+noYesValues <- c(
+  harmonised_data$Harmonised.variable.name[
+    harmonised_data$Harmonised.data.format.unit %in% c("No/Yes / missing", "No/Yes / Missing")
+  ],
+  "CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS", "SMXASAH", "CMXATH", "CMXNO"
+)
+
+categoric_vars = c("CMXATH", "CMXNO", "SMXASAH","CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS","DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN",                         "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN",                         "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN",                         "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC")
+
+personalized <- c("DMRGENDR", "DSXOS", "CSXCTR", "SMXFEA", "CSXCOT")
+
+# Tell whether a single cell can be read as a number.
+#
+# x: one cell value; a decimal comma is accepted ("3,5" counts as 3.5).
+# Returns TRUE when the value converts to a number, FALSE otherwise
+# (including NA, "" and zero-length input). Only the first element is
+# inspected if a longer vector is passed.
+is_number <- function(x){
+  
+  if (length(x) == 0) {
+    return(FALSE)
+  }
+  
+  # Base sub() instead of stringr::str_replace(): same first-comma
+  # replacement, without depending on a package this script never loads.
+  candidate <- sub(",", ".", x[1], fixed = TRUE)
+  
+  # Non-numeric text is the expected case here, so the "NAs introduced by
+  # coercion" warning is silenced rather than raised once per cell.
+  !is.na(suppressWarnings(as.numeric(candidate)))
+}
+
+
+# Normalise one yes/no cell.
+#
+# x: a single cell value.
+# Returns "No" for 0, "No", " No" or "NO"; "Yes" for 1, "Yes", " Yes" or
+# "SI"; and "" for NA or anything else.
+replaceNoYesValues <- function(x){
+  
+  if (is.na(x)) {
+    return ("")
+  }
+  
+  if (x == 0 | x %in% c("No", " No", "NO")) {
+    return ("No")
+  }
+  
+  if (x == 1 | x %in% c("Yes", " Yes", "SI")) {
+    return ("Yes")
+  }
+  
+  ""
+}
+
+# Clean one cell of a non-categorical column.
+#
+# x: a single cell value.
+# Returns the value with its first "," replaced by "." when is_number()
+# accepts it, and "" otherwise.
+fixNonCategoric <- function(x){
+  
+  if (is_number(x)) {
+    return(str_replace(x,",","."))
+  }
+  
+  ""
+}
+
+# Translate one cell of a specially-coded column into its harmonised label.
+#
+# x: a single cell value (number or text).
+# colname: name of the column the cell belongs to.
+# For the columns DMRGENDR, CSXCTR, SMXFEA, DMRRETH1, DMROCCU, DMRHREDU,
+# DSXOS and CSXCOT, NA becomes "" and known codes/spellings become the
+# harmonised label. Unknown values are returned unchanged, except for DSXOS
+# and CSXCOT where they become "". For any other column x is returned as is.
+personalizedFun <- function(x, colname){
+  
+  handled <- c("DMRGENDR", "CSXCTR", "SMXFEA", "DMRRETH1", "DMROCCU",
+               "DMRHREDU", "DSXOS", "CSXCOT")
+  
+  if (!(colname %in% handled)) {
+    return(x)
+  }
+  
+  # Every handled column maps a missing cell to the empty string.
+  if (is.na(x)) {
+    return("")
+  }
+  
+  # Map the codes 1, 2, ... to labels[1], labels[2], ...; `default` is
+  # returned when no code matches.
+  by_code <- function(value, labels, default) {
+    for (code in seq_along(labels)) {
+      if (value == code) {
+        return(labels[code])
+      }
+    }
+    default
+  }
+  
+  if (colname == "DMRGENDR") {
+    if (x == 1 | x %in% c("F", "f", "Female")) {
+      return("Female")
+    }
+    if (x == 0 | x %in% c("M", "m", "Male")) {
+      return("Male")
+    }
+    return(x)
+  }
+  
+  if (colname == "CSXCTR") {
+    # The "M" / "?" variants are mis-encoded spellings found in the data.
+    if (x == 1 | x %in% c("positive", "PositivM", "POSITIVM", "POS?T?VM")) {
+      return("Positive")
+    }
+    # NOTE(review): "NMGAT?VM" used to be listed under Positive. It follows
+    # the same garbling as the other NEGATIVE spellings ("NMGATIVM",
+    # "NAGAT?VM", "NMGAT?V"), so it is mapped to Negative here — confirm
+    # against the source data.
+    if (x == 0 | x %in% c("negative", "negativM", "NMGATIVM", "NAGATIVM",
+                          "NMGATIV", "negativeM", "NAGAT?VM", "NMGAT?V",
+                          "NMGAT?VM")) {
+      return("Negative")
+    }
+    return(x)
+  }
+  
+  if (colname == "SMXFEA") {
+    if (x == 1) {
+      return("Yes")
+    }
+    if (x == 0) {
+      return("No")
+    }
+    if (x == ".") {
+      return("")
+    }
+    return(x)
+  }
+  
+  if (colname == "DMRRETH1") {
+    return(by_code(x, c("Asian", "Black", "Hispanic", "White", "Multiracial", "Other"), x))
+  }
+  
+  if (colname == "DMROCCU") {
+    # Code 6 is deliberately blanked.
+    return(by_code(x, c("Unemployed", "Student", "Employed", "Self-employed", "Retired", ""), x))
+  }
+  
+  if (colname == "DMRHREDU") {
+    return(by_code(x, c("High School", "Bachelors", "Postgraduate", "Other"), x))
+  }
+  
+  if (colname == "DSXOS") {
+    if (x == 0 | x == "Recovered") {
+      return("Recovered")
+    }
+    if (x == 1 | x == "Deceased") {
+      return("Deceased")
+    }
+    if (x == 2 | x == "Transferred") {
+      return("Transferred")
+    }
+    return("")
+  }
+  
+  # Only CSXCOT is left at this point.
+  by_code(x, c("PCR", "antigen", "other"), "")
+}
+
+# Rewrite a dotted date as a slashed one.
+#
+# x: a single value. When it contains a ".", it is parsed as day.month.year
+# and returned formatted as day/month/year; otherwise it is returned as is.
+dotToBar <- function (x){
+  
+  if (!grepl(".", x, fixed = TRUE)) {
+    return(x)
+  }
+  
+  parsed <- as.Date(x, format = "%d.%m.%Y")
+  format(parsed, "%d/%m/%Y")
+  
+}
+
+
+
+# The script starts with rm(list=ls()), so an unconditional rm(newDf) only
+# ever produced an "object not found" warning.
+if (exists("newDf")) rm(newDf)
+
+# Cleaned copy; every cell is always read from the untouched `hospital`.
+newDf <- hospital
+
+names <- colnames(hospital)
+
+# seq_len() keeps both loops empty for a table without columns or rows
+# (1:0 would iterate over 1 and 0).
+for (j in seq_len(ncol(hospital))){
+  
+  percentage <- trunc(j/ncol(hospital)*100)
+  mes <- paste0(toString(percentage),"% completed")
+  print(mes)
+  
+  print(names[j])
+  
+  for(i in seq_len(nrow(hospital))){
+    
+    # Yes/no columns are normalised; columns that are neither categorical,
+    # DMRBORN, nor have "DAT" in their name are treated as numeric.
+    if(names[j] %in% noYesValues){
+      newDf[i,j] <- replaceNoYesValues(hospital[i,j])
+    }else if(!(names[j] %in% categoric_vars) && names[j] != "DMRBORN" && !grepl("DAT",names[j], fixed=TRUE)){
+      newDf[i,j] <- fixNonCategoric(hospital[i,j])
+    }
+    
+    # Specially-coded columns overwrite whatever the step above produced.
+    if(names[j] %in% personalized){
+      newDf[i,j] <- personalizedFun(hospital[i,j],names[j])
+    }
+    
+    # Missing cells and lone dots always end up empty.
+    if (is.na(hospital[i,j]))
+      newDf[i,j] <- ""
+    
+    else if (hospital[i,j] == ".")
+      newDf[i,j] <- ""
+    
+  }
+}
+
+# The second setwd() overrides the first; both are kept as in the original.
+setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/newRessources")
+setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient/clean"))
+
+
+
+
+write.csv(x=newDf, file = "konya_outpatient.csv", row.names = FALSE)
+
+
--- a/ressourceHarmonizer.R
+++ b/ressourceHarmonizer.R
+
+rm(list=ls())
+
+# dplyr supplies bind_cols() and stringr supplies str_replace(), both called
+# further down. Neither was loaded anywhere in this script, so it only ran in
+# sessions that already had both attached.
+library(dplyr)
+library(stringr)
+
+# The second setwd() overrides the first; both are kept as in the original.
+setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/newRessources")
+setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient/clean"))
+
+
+# Cleaned resource file to derive the "_numeric" columns from.
+hospital <- data.frame(read.csv("konya_outpatient.csv", sep=","))
+
+
+setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data")
+
+# Codebook sheets. Com&RF.csv is comma-separated, the others use ";".
+# NOTE(review): ComAndRF is read whole here, whereas ressourceCleaner.R keeps
+# only [1:64,1:5] like the other sheets; rbind() below needs matching
+# columns — confirm whether the slice is missing.
+ComAndRF <- read.csv("Com&RF.csv", sep=",")
+Complications <- read.csv("Complications.csv", sep=";")[1:20,1:5]
+Dates <- read.csv("Dates.csv", sep=";")[1:12,1:5]
+Demographics <- read.csv("Demographics.csv", sep=";")[1:9,1:5]
+Home_med <- read.csv("Home_med.csv", sep=";")[1:13,1:5]
+Imaging_data <- read.csv("Imaging_data.csv", sep=";")[1:11,1:5]
+Labo <- read.csv("Labo.csv", sep=";")[1:143,1:5]
+SiAndSympt <- read.csv("Si&Sympt.csv", sep=";")[1:50,1:5]
+Treatment <- read.csv("Treatment.csv", sep=";")[1:32,1:5]
+LifestyleAndDiet <- read.csv("Lifestyle&Diet.csv", sep=";")[1:165,1:5]
+
+
+# Stack all sheets into one codebook, in the same order as before.
+harmonised_data <- rbind(SiAndSympt, ComAndRF, Treatment, Dates, Demographics,
+                         Home_med, Imaging_data, Complications, Labo,
+                         LifestyleAndDiet)
+
+# Drop the per-sheet tables. The previous list named "Complications" twice
+# (which raised a warning) and never removed Treatment.
+rm(list=c("SiAndSympt",
+          "ComAndRF",
+          "Treatment",
+          "Dates",
+          "Demographics",
+          "Home_med",
+          "Imaging_data",
+          "Complications",
+          "Labo",
+          "LifestyleAndDiet"))
+
+# Variables treated as yes/no: every codebook entry whose format is
+# "No/Yes / missing" (either capitalisation of "missing"), followed by a
+# hand-maintained list of extra variable names.
+noYesValues <- c(
+  harmonised_data$Harmonised.variable.name[
+    harmonised_data$Harmonised.data.format.unit %in% c("No/Yes / missing", "No/Yes / Missing")
+  ],
+  "CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS", "SMXASAH", "CMXATH", "CMXNO","SMXFEA"
+)
+
+categoric_vars = c("CMXATH", "CMXNO", "SMXASAH","CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS","DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN",                         "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN",                         "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN",                         "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC")
+
+personalized <- c("DMRGENDR", "DMRRETH1", "DMROCCU", "DMRHREDU",  "DSXOS")
+
+# Tell whether a single cell can be read as a number.
+#
+# x: one cell value; a decimal comma is accepted ("3,5" counts as 3.5).
+# Returns TRUE when the value converts to a number, FALSE otherwise
+# (including NA, "" and zero-length input). Only the first element is
+# inspected if a longer vector is passed.
+is_number <- function(x){
+
+  if (length(x) == 0) {
+    return(FALSE)
+  }
+
+  # Base sub() instead of stringr::str_replace(): same first-comma
+  # replacement, without depending on a package this script never loads.
+  candidate <- sub(",", ".", x[1], fixed = TRUE)
+
+  # Non-numeric text is the expected case here, so the "NAs introduced by
+  # coercion" warning is silenced rather than raised once per cell.
+  !is.na(suppressWarnings(as.numeric(candidate)))
+}
+
+# Keep a cell only when it is numeric.
+#
+# x: a single cell value.
+# Returns x unchanged when is_number() accepts it, and "" otherwise.
+fixNonCategoric <- function(x){
+
+  if (is_number(x)) x else ""
+
+}
+
+# Derive the numeric code for one harmonised cell.
+#
+# x: a single cell value (harmonised label).
+# colname: name of the column the cell belongs to.
+# Columns listed in the global `noYesValues` map "No"/"Yes" to 0/1. The
+# columns DMRGENDR, DSXOS, DMRRETH1, DMROCCU, DMRHREDU and CSXCOT map their
+# labels to the codes listed below. In all of these NA becomes "". Labels
+# that are not recognised are returned unchanged, except for CSXCOT where
+# they become "". For any other column x is returned as is.
+getNumericValue <- function (x,colname) {
+  
+  # NA -> "", known label -> its code, anything else -> unchanged.
+  recode <- function(value, codes) {
+    if (is.na(value)) {
+      return("")
+    }
+    if (value %in% names(codes)) {
+      return(codes[[as.character(value)]])
+    }
+    value
+  }
+  
+  if(colname %in% noYesValues){
+    x <- recode(x, c("No" = 0, "Yes" = 1))
+  }
+  
+  if(colname == "DMRGENDR"){
+    # The previous version had no NA check in this branch, so a missing
+    # gender stopped the script with "missing value where TRUE/FALSE needed".
+    x <- recode(x, c("Female" = 1, "Male" = 0))
+    
+  }else if(colname == "DSXOS"){
+    if(!is.na(x) && x == "Missing")
+      x <- ""
+    else
+      x <- recode(x, c("Recovered" = 0, "Deceased" = 1, "Transferred" = 2))
+    
+  }else if(colname == "DMRRETH1"){
+    x <- recode(x, c("Asian" = 1, "Black" = 2, "Hispanic" = 3, "White" = 4,
+                     "Multiracial" = 5, "Other" = 6))
+    
+  }else if(colname == "DMROCCU"){
+    x <- recode(x, c("Unemployed" = 1, "Student" = 2, "Employed" = 3,
+                     "Self-employed" = 4, "Retired" = 5))
+    
+  }else if(colname == "DMRHREDU"){
+    x <- recode(x, c("High School" = 1, "Bachelors" = 2, "Postgraduate" = 3,
+                     "Other" = 4))
+    
+  }else if(colname =="CSXCOT"){
+    # Unlike the other columns, an unknown label is blanked here.
+    test_codes <- c("PCR" = 1, "antigen" = 2, "other" = 3)
+    if(is.na(x) || !(x %in% names(test_codes)))
+      x <- ""
+    else
+      x <- test_codes[[as.character(x)]]
+  }
+  
+  return(x)
+  
+}
+
+# Turn a missing cell into an empty string.
+#
+# x: a single value. Returns "" when x is NA, otherwise x itself.
+noNa <- function(x){
+  
+  if (is.na(x)) "" else x
+}
+
+
+
+# The script starts with rm(list=ls()), so an unconditional rm(newDf) only
+# ever produced an "object not found" warning.
+if (exists("newDf")) rm(newDf)
+
+newDf <- hospital
+
+names <- colnames(hospital)
+
+# One "<name>_numeric" column for every yes/no or personalised variable
+# present in the file, in file order. Built in one step instead of growing
+# the vector inside a loop; indexing after paste0() keeps the result empty
+# when nothing matches.
+list_numeric <- paste0(names, "_numeric")[names %in% c(noYesValues, personalized)]
+
+
+numericDf <- data.frame(matrix(NA, nrow = nrow(hospital), ncol = length(list_numeric)))
+colnames(numericDf) <-  list_numeric
+
+
+newDf<-bind_cols(newDf,numericDf)
+
+
+#newDf[,"DMRGENDR_numeric"] <- NA
+
+
+# seq_len() keeps both loops empty for a table without columns or rows
+# (1:0 would iterate over 1 and 0).
+for (j in seq_len(ncol(hospital))){
+  
+  percentage <- trunc(j/ncol(hospital)*100)
+  mes <- paste0(toString(percentage),"% completed")
+  print(mes)
+  
+  numeric_col <- paste0(names[j], "_numeric")
+  
+  print(names[j])
+  
+  for(i in seq_len(nrow(hospital))){
+    
+    # Row counter printed every 10000 rows.
+    if(i %% 10000 == 0)
+    print(i)
+    
+    # Fill the derived column from the original (label) value.
+    if(numeric_col %in% list_numeric){
+      newDf[i,numeric_col] <- getNumericValue(hospital[i,j],names[j])
+    }
+    
+    # Missing cells in the label column are written out as empty strings.
+    if(is.na(hospital[i,j]))
+      newDf[i,j] <- ""
+    
+  }
+}
+
+
+
+# The second setwd() overrides the first; both are kept as in the original.
+setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/newRessources/Numeric_derived")
+setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient/clean/harmonized"))
+
+
+write.csv(x=newDf, file = "konya_outpatient.csv", row.names = FALSE)
+
+
+
+
+
+
--- a/survival_curve.R
+++ b/survival_curve.R
+# Survival-curve script: setup, dependencies and connection parameters.
+source("dependency_installer.R")  # provides install_dependencies()
+source("required_folder_checker.R")
+source("argument_hasher.R")
+
+dep_list = c("survival", "lubridate", "survminer", "stringr", "DSI", "DSOpal", "DSLite", "fields", "hrbrthemes", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient")
+install_dependencies(dep_list)
+
+
+image_format <- ".png"
+
+args <- commandArgs(trailingOnly = TRUE)  # only referenced by the commented-out argument parsing further down
+# Hard-coded connection parameters for two servers; only the second entry is kept below.
+hospital_names <- c("HM","Princesa")
+project_names <- c("RESOURCE_GUIDE","RESOURCE_GUIDE")
+resource_names <- c("HM_rs", "Princesa_rs")
+urls <- c("https://192.168.1.50:8844","https://192.168.1.50:8844")
+users <- c("opal_admin","opal_admin")
+pass <- c("5f%R!&wfbUF*7gZ14mg","5f%R!&wfbUF*7gZ14mg")  # SECURITY NOTE(review): plaintext password committed to version control
+# Keep only entry 2 ("Princesa") of every vector.
+hospital_names <- hospital_names[2]
+project_names <- project_names[2]
+resource_names <- resource_names[2]
+urls <- urls[2]
+users <- users[2]
+pass <- pass[2]
+
+# project_names_o <- args[1]
+# project_names <- str_split(project_names_o, ";")[[1]]
+# 
+# resource_names_o <- args[2]
+# resource_names <- str_split(resource_names_o, ";")[[1]]
+# 
+# urls_o <- args[3]
+# urls <- str_split(urls_o, ";")[[1]]
+# 
+# users_o <- args[4]
+# users <- str_split(users_o, ";")[[1]]
+# 
+# pass_o <- args[5]
+# pass <- str_split(pass_o, ";")[[1]]
+# 
+# hospital_name <- args[6]
+# extra_filter <- args[7]
+
+
+
+# Log in to every configured server, expose the resource as data frame `data` and read its dimensions.
+json_output <- c()
+
+
+builder <- DSI::newDSLoginBuilder()
+
+url_ctr <- 0  # numbers the servers as study0, study1, ...
+for(i in 1:length(urls)){
+  print(paste("Connecting to Server with URL:", urls[i], sep=" "))
+  builder$append(server = paste("study", url_ctr, sep=""), url = urls[i],
+                 user = users[i], password = pass[i],
+                 resource = paste(project_names[i], resource_names[i], sep="."),
+                 driver = "OpalDriver", options="list(ssl_verifyhost=0,ssl_verifypeer=0)")
+  
+  url_ctr <- url_ctr+1
+}
+
+logindata <- builder$build()
+connections <- DSI::datashield.login(logins = logindata, assign = TRUE, symbol = "D")
+
+datashield.assign.expr(connections, symbol = 'data', expr = quote(as.resource.data.frame(D)))
+datastructure_name <- "data"  # name of the server-side data frame used by the ds.* calls below
+ds.colnames(x=datastructure_name, datasources= connections)
+
+data_dim <- ds.dim(x=datastructure_name, datasources= connections)
+data_dim_rows <- data_dim$`dimensions of data in combined studies`[1]
+data_dim_cols <- data_dim$`dimensions of data in combined studies`[2]
+
+
+# Draw a synthetic sample that mimics the distribution of a remote numeric
+# variable without downloading individual-level data.
+#
+# Five of the statistics returned by ds.quantileMean() are rounded and used as
+# break points; uniform random values are then drawn between consecutive break
+# points, a quarter of the sample in each of the four ranges.
+#
+# Arguments:
+#   df    name (string) of the server-side data frame.
+#   var   name (string) of the column of `df` to reconstruct.
+#   size  number of values to generate.
+#
+# Returns a numeric vector of length `size`.
+#
+# Uses the global `connections` and resets the RNG seed to 1 on every call, so
+# repeated calls with the same inputs return the same values.
+get_reconstructed_population <- function(df, var, size){
+
+  quantile_data <- ds.quantileMean(x = paste(df, var, sep = "$"), datasources = connections)
+
+  # NOTE(review): positions 1, 3, 4, 5 and 7 are presumably the 5%, 25%, 50%,
+  # 75% and 95% quantiles of the pooled ds.quantileMean() result, so the lower
+  # and upper bounds would be percentiles rather than the true min/max --
+  # confirm against the dsBaseClient documentation.
+  quantiles <- c(
+    round(quantile_data[[1]]),
+    round(quantile_data[[3]]),
+    round(quantile_data[[4]]),
+    round(quantile_data[[5]]),
+    round(quantile_data[[7]])
+  )
+
+  # runif() drops the fractional part of its count, so passing size/4 to each
+  # call returned fewer than `size` values whenever `size` was not a multiple
+  # of 4. Use whole bin sizes and give the remainder to the first bins.
+  bin_sizes <- rep(size %/% 4, 4) + as.integer(seq_len(4) <= size %% 4)
+
+  set.seed(1)
+  reconstructed_population <- c(
+    runif(bin_sizes[1], quantiles[1], quantiles[2]),
+    runif(bin_sizes[2], quantiles[2], quantiles[3]),
+    runif(bin_sizes[3], quantiles[3], quantiles[4]),
+    runif(bin_sizes[4], quantiles[4], quantiles[5]))
+
+  return(reconstructed_population)
+}
+
+# Survival-curve script, part 2: split the remote data by outcome, rebuild a
+# synthetic follow-up time for each group, and plot the overall survival curve.
+
+# Rows with DSXOS_numeric == 1 go to OutFilteredDEATH, rows with 0 to OutFilteredALIVE.
+ds.dataFrameSubset(df.name = datastructure_name, V1.name = "data$DSXOS_numeric", V2.name = "1", Boolean.operator = "==", newobj = "OutFilteredDEATH")
+ds.dataFrameSubset(df.name = datastructure_name, V1.name = "data$DSXOS_numeric", V2.name = "0", Boolean.operator = "==", newobj = "OutFilteredALIVE")
+
+# Number of rows in each subset (first entry of the last element of ds.dim()).
+data_dim_DEATH <- ds.dim(x="OutFilteredDEATH", datasources= connections)
+data_dim_DEATH <- data_dim_DEATH[[length(data_dim_DEATH)]][1]
+data_dim_ALIVE <- ds.dim(x="OutFilteredALIVE", datasources= connections)
+data_dim_ALIVE <- data_dim_ALIVE[[length(data_dim_ALIVE)]][1]
+
+# Build each group's data frame in one step; unlike assigning a scalar column
+# afterwards, this also works when a group has zero rows.
+reconstr_pop_time_outcome_death <- get_reconstructed_population("OutFilteredDEATH", "DATLGT", data_dim_DEATH)
+df_death <- data.frame(out_time = reconstr_pop_time_outcome_death,
+                       status = rep("death", length(reconstr_pop_time_outcome_death)),
+                       stringsAsFactors = FALSE)
+
+reconstr_pop_time_outcome_alive <- get_reconstructed_population("OutFilteredALIVE", "DATLGT", data_dim_ALIVE)
+df_alive <- data.frame(out_time = reconstr_pop_time_outcome_alive,
+                       status = rep("alive", length(reconstr_pop_time_outcome_alive)),
+                       stringsAsFactors = FALSE)
+
+full_status_df <- rbind(df_alive, df_death)
+
+# Random placeholder grouping: even draws become "MALE", odd draws "FEMALE".
+# Vectorised, so an empty data frame yields an empty column instead of an error.
+samplenum <- sample(0:100000, nrow(full_status_df), replace = TRUE)
+filtercol <- ifelse(samplenum %% 2 == 0, "MALE", "FEMALE")
+
+# `extra_filter` (the name of the grouping column) is only assigned in the
+# commented-out command-line parsing, so the script used to stop here with
+# "object 'extra_filter' not found". Take it from the 7th argument when one is
+# given, otherwise fall back to a fixed column name.
+if (!exists("extra_filter")) {
+  extra_filter <- if (length(args) >= 7) args[7] else "extra_filter"
+}
+
+full_status_df[extra_filter] <- filtercol
+# Event indicator for Surv(): 1 = death, 0 = alive.
+full_status_df["status_surv"] <- as.numeric(full_status_df$status != "alive")
+
+
+#filename <- paste0(hospital_name, "survival_curve", sep="")
+#filename <- paste(filename, "outcome", sep="_")
+#filename <- paste(filename, image_format, sep="")
+
+# NOTE(review): this changes the working directory for the rest of the session.
+dir.create("./survAlberto", showWarnings = FALSE)
+setwd("./survAlberto")
+
+print("survival_curve.png")
+png("survival_curve.png", width = 750, height = 500)
+
+survplot <- ggsurvplot(
+  fit = survfit(Surv(out_time, status_surv) ~ 1, data = full_status_df),
+  xlab = "Days",
+  ylab = "Overall survival probability")
+
+# Print explicitly: a bare `survplot` is not drawn when the script is run via
+# source(), which would leave an empty PNG.
+print(survplot)
+dev.off()
+
+datashield.logout(connections)
+
+
--- a/table1_script.R
+++ b/table1_script.R
+
+
+# Table 1 script: reads the codebook, connects through the sourced helpers,
+# builds a summary table for the selected variables and saves it as a PNG.
+# NOTE(review): rm(list=ls()) wipes every object in the calling session's
+# global environment.
+rm(list=ls())
+
+# NOTE(review): absolute, machine-specific path; the script only runs where
+# this directory exists.
+setwd("C:/Users/victor/Documents/TFG/r-analytics-master")
+
+#Obtain the data from the codebook
+# Keep columns 1, 3 and 4 of the ";"-separated file and rename them.
+codebook <- read.csv("harmon.csv", sep = ";")
+codebook <- codebook[,c(1,3,4)]
+colnames(codebook) <- c("variable", "description", "unit")
+
+# connect(), apply_filters(), obtain_table1() and install_dependencies() are
+# not defined in this script; presumably they come from these sourced files.
+source("dependency_installer.R")
+source("connection_parameters_aux.R")
+source("necessary_functions_table1_v2_subset.R")
+
+dep_list = c("DSI","DSOpal","DSLite", "dsBaseClient")
+install_dependencies(dep_list)
+
+# Output is written to ./table_1 (the working directory is changed to it).
+dir.create("./table_1", showWarnings = FALSE)
+setwd("./table_1")
+
+
+#Create the connection
+# connect() returns a list: element 1 is the connection object, element 2 is
+# stored as `inp` and passed on to obtain_table1().
+auxConnections <- connect()
+connections <- auxConnections[[1]]
+inp <- auxConnections[[2]]
+
+#Only do this if you want to subset the data for a specific value of a categoric variable
+# NOTE(review): called without arguments, so it presumably works on globals
+# such as `connections` -- verify in the sourced helper.
+apply_filters()
+
+#Select the variables to be analyzed (Add the ones you consider relevant and remove the ones that you do not)
+# The second assignment overwrites the first: only DMRGENDR and DMRAGEYR are used.
+varToAnalize <- c("DMRGENDR","DMRAGEYR","DSXOS", "RFXSM", "TRXAV",  "CMXCVD", "RFXOB")
+varToAnalize <- c("DMRGENDR","DMRAGEYR")
+
+table1 <- obtain_table1(connections, inp, varToAnalize)
+
+# The logout below is disabled, so the connection stays open after the script ends.
+#datashield.logout(connections)
+
+# Render the table to a PNG whose size scales with its rows (30 px each) and
+# columns (150 px each).
+library(gridExtra)
+png("table1TFGFiltro.png", height = 30*nrow(table1), width = 150*ncol(table1))
+grid.table(table1)
+dev.off()
--- a/treatment_heatmap_script.r
+++ b/treatment_heatmap_script.r
+
+
+# Treatment heatmap script: loads the treatment variable list, connects through
+# the sourced helpers and calls get_treatments_map().
+# NOTE(review): rm(list=ls()) wipes every object in the calling session's
+# global environment.
+rm(list=ls())
+
+library("ggplot2")
+
+# NOTE(review): absolute, machine-specific paths here and below.
+setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data")
+
+# Read the first 32 rows and 5 columns of the ";"-separated treatment sheet.
+# `descriptions` keeps columns 1 and 3, indexed by harmonised variable name;
+# `Treatments` is then reduced to the vector of harmonised variable names.
+Treatments <-  data.frame(read.csv("Treatment.csv", sep=";"))[1:32,1:5]
+descriptions <- Treatments[, c(1,3)]
+rownames(descriptions) <- descriptions$Harmonised.variable.name
+Treatments <- Treatments$Harmonised.variable.name
+
+setwd("C:/Users/victor/Documents/TFG/r-analytics-master")
+
+
+# connect(), get_treatments_map() and install_dependencies() are not defined in
+# this script; presumably they come from these sourced files.
+source("dependency_installer.R")
+source("connection_parameters.R")
+source("necessary_functions_treatments_heatmap.R")
+
+dep_list = c("DSI","DSOpal","DSLite", "ggplot2", "dsBaseClient")
+install_dependencies(dep_list)
+
+
+#Create the connection
+# connect() returns a list: element 1 is the connection object, element 2 is
+# stored as `inp` and passed on to get_treatments_map().
+auxConnections <- connect()
+connections <- auxConnections[[1]]
+inp <- auxConnections[[2]]
+
+
+# NOTE(review): `Treatments` and `descriptions` are not passed in, so the helper
+# presumably reads them as globals -- verify in the sourced file.
+get_treatments_map(connections, inp)
+
+
+
+# Lists the column names of the server-side object "data".
+# NOTE(review): there is no datashield.logout() call in this script.
+ds.colnames("data")
--- a/valid_variables_script.R
+++ b/valid_variables_script.R