diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..615bdc100e439f698e43b4d252dd7844517b7b79 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.xlsx +harmonized_data/*.csv diff --git a/ressourceCleaner.R b/ressourceCleaner.R index cb4f7ea753e46cd387115a1a5fe8e64da49aadc7..69090e8f7c5b3b0bcdf73d112258b1dffe1cf46b 100755 --- a/ressourceCleaner.R +++ b/ressourceCleaner.R @@ -1,280 +1,285 @@ - -rm(list=ls()) - -setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources") -setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient")) - -#Cambiarlo por el nombre del ressource que se desea limpiar -hospital <- data.frame(read.csv("konya_outpatient.csv", sep=",")) - - -hospital <- hospital %>% select(-contains("numeric")) - -# hospital["NOT.HARMONISED"] <- NULL -# -# names <- colnames(hospital) -# for (i in 1:length(names)){ -# -# if(grepl("NOT.HARMONISED", names[i])){ -# hospital[names[i]] <- NULL -# print(paste("quito ", names[i])) -# } -# -# } -# hospital <- hospital[-1,] - -setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data") - -ComAndRF <- data.frame(read.csv("Com&RF.csv", sep=","))[1:64,1:5] -Complications <- data.frame(read.csv("Complications.csv", sep=";"))[1:20,1:5] -Dates <- data.frame(read.csv("Dates.csv", sep=";"))[1:12,1:5] -Demographics <- data.frame(read.csv("Demographics.csv", sep=";"))[1:9,1:5] -Home_med <- data.frame(read.csv("Home_med.csv", sep=";"))[1:13,1:5] -Imaging_data <- data.frame(read.csv("Imaging_data.csv", sep=";"))[1:11,1:5] -Labo <- data.frame(read.csv("Labo.csv", sep=";"))[1:143,1:5] -SiAndSympt <- data.frame(read.csv("Si&Sympt.csv", sep=";"))[1:50,1:5] -Treatment <- data.frame(read.csv("Treatment.csv", sep=";"))[1:32,1:5] -LifestyleAndDiet <- data.frame(read.csv("Lifestyle&Diet.csv", sep=";"))[1:165,1:5] - - -harmonised_data <- rbind(SiAndSympt,ComAndRF) -harmonised_data <- rbind(harmonised_data,Treatment) - -harmonised_data <- rbind(harmonised_data,Dates) -harmonised_data <- rbind(harmonised_data,Demographics) -harmonised_data <- rbind(harmonised_data,Home_med) -harmonised_data <- rbind(harmonised_data,Imaging_data) -harmonised_data <- rbind(harmonised_data,Complications) -harmonised_data <- rbind(harmonised_data,Labo) -harmonised_data <- rbind(harmonised_data,LifestyleAndDiet) - -rm(list=c("SiAndSympt", - "Complications", - "ComAndRF", - "Dates", - "Demographics", - "Home_med", - "Imaging_data", - "Complications", - "Labo", - "LifestyleAndDiet")) - -noYesValues <- subset(harmonised_data, harmonised_data$Harmonised.data.format.unit == "No/Yes / missing" | harmonised_data$Harmonised.data.format.unit == "No/Yes / Missing") -noYesValues <- noYesValues$Harmonised.variable.name -noYesValues <- c(noYesValues,"CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS", "SMXASAH", "CMXATH", "CMXNO") - -categoric_vars = c("CMXATH", "CMXNO", "SMXASAH","CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS","DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN", "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN", "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC") - -personalized <- c("DMRGENDR", "DSXOS", "CSXCTR", "SMXFEA", "CSXCOT") - -is_number <- function(x){ - res <- FALSE - - if(length(x)!=0){ - x <- str_replace(x,",",".") - - aux <- as.numeric(x) - - if(!is.na(aux)) - res <- TRUE - } - return(res) -} - - -replaceNoYesValues <- function(x){ - - #Replace the value with Yes or No - if(is.na(x)) - x <- "" - else if (x==0 | x =="No" | x ==" No" | x =="NO") - x <- "No" - else if (x==1 | x == "Yes" | x == " Yes" | x=="SI") - x <- "Yes" - else - x <- "" - - return (x) - -} - -fixNonCategoric <- function(x){ - - if(!is_number(x)){ - x <- "" - }else{ - x <- str_replace(x,",",".") - } - - return(x) - -} - -personalizedFun <- function(x, colname){ - - if(colname == "DMRGENDR"){ - if(is.na(x)) - x <- "" - else if(x == 1 | x == "F"| x == "f" | x== "Female") - x <- "Female" - else if (x == 0 | x =="M" | x == "m" | x== "Male") - x <- "Male" - } - - if(colname == "CSXCTR"){ - if(is.na(x)) - x <- "" - else if(x == 1 | x == "positive" | x=="PositivM" | x=="POSITIVM" | x =="POS?T?VM" | x =="NMGAT?VM") - x <- "Positive" - else if (x == 0 | x == "negative" | x=="negativM" | x=="NMGATIVM" | x=="NAGATIVM" | x =="NMGATIV" | x =="negativeM" | x == "NAGAT?VM" | x =="NMGAT?V") - x <- "Negative" - } - - if(colname == "SMXFEA"){ - if(is.na(x)) - x <- "" - else if(x == 1) - x <- "Yes" - else if (x == 0) - x <- "No" - else if (x == ".") - x <- "" - } - - if(colname == "DMRRETH1"){ - if(is.na(x)) - x <- "" - else if(x ==1) - x <- "Asian" - else if (x == 2) - x <- "Black" - else if (x == 3) - x <- "Hispanic" - else if (x == 4) - x <- "White" - else if (x == 5) - x <- "Multiracial" - else if (x == 6) - x <- "Other" - } - - if(colname == "DMROCCU"){ - if(is.na(x)) - x <- "" - else if(x ==1) - x <- "Unemployed" - else if (x == 2) - x <- "Student" - else if (x == 3) - x <- "Employed" - else if (x == 4) - x <- "Self-employed" - else if (x == 5) - x <- "Retired" - else if (x == 6) - x <- "" - } - - if(colname == "DMRHREDU"){ - if(is.na(x)) - x <- "" - else if(x ==1) - x <- "High School" - else if (x == 2) - x <- "Bachelors" - else if (x == 3) - x <- "Postgraduate" - else if (x == 4) - x <- "Other" - } - - if(colname =="DSXOS"){ - - if(is.na(x)) - x <- "" - else if (x==0 | x == "Recovered") - x <- "Recovered" - else if (x==1 | x == "Deceased") - x <- "Deceased" - else if (x==2 | x == "Transferred") - x <- "Transferred" - else - x <- "" - } - - if(colname =="CSXCOT"){ - if(is.na(x)) - x <- "" - else if (x==1 ) - x <- "PCR" - else if (x==2 ) - x <- "antigen" - else if (x==3 ) - x <- "other" - else - x <- "" - } - - - return(x) - -} - -dotToBar <- function (x){ - - if (grepl(".", x, fixed = TRUE)) - res <- format(as.Date(x, format = "%d.%m.%Y"), "%d/%m/%Y") - else res <- x - return(res) - -} - - - -rm(newDf) - -newDf <- hospital - - - -names <- colnames(hospital) - - -for (j in 1:ncol(hospital)){ - - percentage <- trunc(j/ncol(hospital)*100) - mes <- paste(toString(percentage),"% completed", sep="") - print(mes) - - print(names[j]) - - for(i in 1:nrow(hospital)){ - - if(names[j] %in% noYesValues){ - newDf[i,j] <- replaceNoYesValues(hospital[i,j]) - }else if(!(names[j] %in% categoric_vars) & names[j] != "DMRBORN" & !grepl("DAT",names[j], fixed=TRUE)){ - newDf[i,j] <- fixNonCategoric(hospital[i,j]) - } - - if(names[j] %in% personalized){ - newDf[i,j] <- personalizedFun(hospital[i,j],names[j]) - } - - if (is.na(hospital[i,j])) - newDf[i,j] <- "" - - else if (hospital[i,j] == ".") - newDf[i,j] <- "" - - } -} - -setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/newRessources") -setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient/clean")) - - - - -write.csv(x=newDf, file = "konya_outpatient.csv", row.names = FALSE) - - + +rm(list=ls()) + +setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources") +setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient")) + +#Cambiarlo por el nombre del ressource que se desea limpiar +hospital <- data.frame(read.csv("konya_outpatient.csv", sep=",")) + + +hospital <- hospital %>% select(-contains("numeric")) + +# hospital["NOT.HARMONISED"] <- NULL +# +# names <- colnames(hospital) +# for (i in 1:length(names)){ +# +# if(grepl("NOT.HARMONISED", names[i])){ +# hospital[names[i]] <- NULL +# print(paste("quito ", names[i])) +# } +# +# } +# hospital <- hospital[-1,] + +setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data") + +ComAndRF <- data.frame(read.csv("Com&RF.csv", sep=","))[1:64,1:5] +Complications <- data.frame(read.csv("Complications.csv", sep=";"))[1:20,1:5] +Dates <- data.frame(read.csv("Dates.csv", sep=";"))[1:12,1:5] +Demographics <- data.frame(read.csv("Demographics.csv", sep=";"))[1:9,1:5] +Home_med <- data.frame(read.csv("Home_med.csv", sep=";"))[1:13,1:5] +Imaging_data <- data.frame(read.csv("Imaging_data.csv", sep=";"))[1:11,1:5] +Labo <- data.frame(read.csv("Labo.csv", sep=";"))[1:143,1:5] +SiAndSympt <- data.frame(read.csv("Si&Sympt.csv", sep=";"))[1:50,1:5] +Treatment <- data.frame(read.csv("Treatment.csv", sep=";"))[1:32,1:5] +LifestyleAndDiet <- data.frame(read.csv("Lifestyle&Diet.csv", sep=";"))[1:165,1:5] + + +harmonised_data <- rbind(SiAndSympt,ComAndRF) +harmonised_data <- rbind(harmonised_data,Treatment) + +harmonised_data <- rbind(harmonised_data,Dates) +harmonised_data <- rbind(harmonised_data,Demographics) +harmonised_data <- rbind(harmonised_data,Home_med) +harmonised_data <- rbind(harmonised_data,Imaging_data) +harmonised_data <- rbind(harmonised_data,Complications) +harmonised_data <- rbind(harmonised_data,Labo) +harmonised_data <- rbind(harmonised_data,LifestyleAndDiet) + +rm(list=c("SiAndSympt", + "Complications", + "ComAndRF", + "Dates", + "Demographics", + "Home_med", + "Imaging_data", + "Complications", + "Labo", + "LifestyleAndDiet")) + +noYesValues <- subset(harmonised_data, harmonised_data$Harmonised.data.format.unit == "No/Yes / missing" | harmonised_data$Harmonised.data.format.unit == "No/Yes / Missing") +noYesValues <- noYesValues$Harmonised.variable.name +noYesValues <- c(noYesValues,"CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS", "SMXASAH", "CMXATH", "CMXNO") + +categoric_vars = c("CMXATH", "CMXNO", "SMXASAH","CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS","DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN", "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN", "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC") + +personalized <- c("DMRGENDR", "DSXOS", "CSXCTR", "SMXFEA", "CSXCOT") + +is_number <- function(x){ + res <- FALSE + + if(length(x)!=0){ + x <- str_replace(x,",",".") + + aux <- as.numeric(x) + + if(!is.na(aux)) + res <- TRUE + } + return(res) +} + + +replaceNoYesValues <- function(x){ + + #Replace the value with Yes or No + if(is.na(x)){ + x <- "" + return (x) + } + + x <- trimws(toupper(x)) + if (x=="0" | x =="NO") + x <- "No" + else if (x=="1" | x == "YES" | x=="SI") + x <- "Yes" + else + x <- "" + + return (x) + +} + +fixNonCategoric <- function(x){ + + if(!is_number(x)){ + x <- "" + }else{ + x <- str_replace(x,",",".") + } + + return(x) + +} + +personalizedFun <- function(x, colname){ + + if(colname == "DMRGENDR"){ + if(is.na(x)) + x <- "" + else if(x == 1 | x == "F"| x == "f" | x== "Female") + x <- "Female" + else if (x == 0 | x =="M" | x == "m" | x== "Male") + x <- "Male" + } + + if(colname == "CSXCTR"){ + if(is.na(x)) + x <- "" + else if(x == 1 | x == "positive" | x=="PositivM" | x=="POSITIVM" | x =="POS?T?VM" | x =="NMGAT?VM") + x <- "Positive" + else if (x == 0 | x == "negative" | x=="negativM" | x=="NMGATIVM" | x=="NAGATIVM" | x =="NMGATIV" | x =="negativeM" | x == "NAGAT?VM" | x =="NMGAT?V") + x <- "Negative" + } + + if(colname == "SMXFEA"){ + x <- replaceNoYesValues(x) + #if(is.na(x)) + # x <- "" + #else if(x == 1) + # x <- "Yes" + #else if (x == 0) + # x <- "No" + #else if (x == ".") + # x <- "" + } + + if(colname == "DMRRETH1"){ + if(is.na(x)) + x <- "" + else if(x ==1) + x <- "Asian" + else if (x == 2) + x <- "Black" + else if (x == 3) + x <- "Hispanic" + else if (x == 4) + x <- "White" + else if (x == 5) + x <- "Multiracial" + else if (x == 6) + x <- "Other" + } + + if(colname == "DMROCCU"){ + if(is.na(x)) + x <- "" + else if(x ==1) + x <- "Unemployed" + else if (x == 2) + x <- "Student" + else if (x == 3) + x <- "Employed" + else if (x == 4) + x <- "Self-employed" + else if (x == 5) + x <- "Retired" + else if (x == 6) + x <- "" + } + + if(colname == "DMRHREDU"){ + if(is.na(x)) + x <- "" + else if(x ==1) + x <- "High School" + else if (x == 2) + x <- "Bachelors" + else if (x == 3) + x <- "Postgraduate" + else if (x == 4) + x <- "Other" + } + + if(colname =="DSXOS"){ + + if(is.na(x)) + x <- "" + else if (x==0 | x == "Recovered") + x <- "Recovered" + else if (x==1 | x == "Deceased") + x <- "Deceased" + else if (x==2 | x == "Transferred") + x <- "Transferred" + else + x <- "" + } + + if(colname =="CSXCOT"){ + if(is.na(x)) + x <- "" + else if (x==1 ) + x <- "PCR" + else if (x==2 ) + x <- "antigen" + else if (x==3 ) + x <- "other" + else + x <- "" + } + + + return(x) + +} + +dotToBar <- function (x){ + + if (grepl(".", x, fixed = TRUE)) + res <- format(as.Date(x, format = "%d.%m.%Y"), "%d/%m/%Y") + else res <- x + return(res) + +} + + + +rm(newDf) + +newDf <- hospital + + + +names <- colnames(hospital) + + +for (j in 1:ncol(hospital)){ + + percentage <- trunc(j/ncol(hospital)*100) + mes <- paste(toString(percentage),"% completed", sep="") + print(mes) + + print(names[j]) + + for(i in 1:nrow(hospital)){ + + if(names[j] %in% noYesValues){ + newDf[i,j] <- replaceNoYesValues(hospital[i,j]) + }else if(!(names[j] %in% categoric_vars) & names[j] != "DMRBORN" & !grepl("DAT",names[j], fixed=TRUE)){ + newDf[i,j] <- fixNonCategoric(hospital[i,j]) + } + + if(names[j] %in% personalized){ + newDf[i,j] <- personalizedFun(hospital[i,j],names[j]) + } + + if (is.na(hospital[i,j])) + newDf[i,j] <- "" + + else if (hospital[i,j] == ".") + newDf[i,j] <- "" + + } +} + +setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/newRessources") +setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient/clean")) + + + + +write.csv(x=newDf, file = "konya_outpatient.csv", row.names = FALSE) + + diff --git a/valid_variables_script.R b/valid_variables_script.R old mode 100755 new mode 100644 index 6c62d8a1bdb734de2dd83263b53d41fd89a9d92c..9576c763d125b853969d5c405ad92b4264da1ac5 --- a/valid_variables_script.R +++ b/valid_variables_script.R @@ -1,591 +1,592 @@ -rm(list=ls()) - -setwd("C:/Users/Victor/Documents/TFG/r-analytics-master") - -source("required_folder_checker.R") -source("argument_hasher.R") -source("dependency_installer.R") - -# install.packages("https://cran.r-project.org/src/contrib/Archive/DSI/DSI_1.2.0.tar.gz", repos=NULL, type="source") -# install.packages("https://cran.r-project.org/src/contrib/Archive/DSOpal/DSOpal_1.2.0.tar.gz", repos=NULL, type="source") -# install.packages("https://cran.r-project.org/src/contrib/Archive/DSLite/DSLite_1.2.0.tar.gz", repos=NULL, type="source") -# install.packages("https://cran.r-project.org/src/contrib/Archive/opalr/opalr_2.1.0.tar.gz", repos=NULL, type="source") - - -dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient") -install_dependencies(dep_list) - -#,"DSI","DSOpal","DSLite" - -setwd("C:/Users/victor/Documents/TFG/r-analytics-master") -source("connection_parameters.R") -source("necessary_functions_connection.R") - - -setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data") - - -ComAndRF <- data.frame(read.csv("Com&RF.csv", sep=","))[1:64,1:5] -Complications <- data.frame(read.csv("Complications.csv", sep=";"))[1:20,1:5] -Dates <- data.frame(read.csv("Dates.csv", sep=";"))[1:12,1:5] -Demographics <- data.frame(read.csv("Demographics.csv", sep=";"))[1:9,1:5] -Home_med <- data.frame(read.csv("Home_med.csv", sep=";"))[1:13,1:5] -Imaging_data <- data.frame(read.csv("Imaging_data.csv", sep=";"))[1:11,1:5] -Labo <- data.frame(read.csv("Labo.csv", sep=";"))[1:143,1:5] -SiAndSympt <- data.frame(read.csv("Si&Sympt.csv", sep=";"))[1:50,1:5] -Treatment <- data.frame(read.csv("Treatment.csv", sep=";"))[1:32,1:5] -LifestyleAndDiet <- data.frame(read.csv("Lifestyle&Diet.csv", sep=";"))[1:165,1:5] - - - -harmonised_data <- rbind(SiAndSympt,ComAndRF) -harmonised_data <- rbind(harmonised_data,Treatment) -harmonised_data <- rbind(harmonised_data,Dates) -harmonised_data <- rbind(harmonised_data,Demographics) -harmonised_data <- rbind(harmonised_data,Home_med) -harmonised_data <- rbind(harmonised_data,Imaging_data) -harmonised_data <- rbind(harmonised_data,Complications) -harmonised_data <- rbind(harmonised_data,Labo) -harmonised_data <- rbind(harmonised_data,LifestyleAndDiet) - -rm(list=c("SiAndSympt", - "Complications", - "ComAndRF", - "Dates", - "Demographics", - "Home_med", - "Imaging_data", - "Complications", - "Labo", - "LifestyleAndDiet")) - -categoric_vars = c("DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN", "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN", "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC") - - -#---------------------------------------------------------------------------- - -#Test if column names are valid -check_column_names <- function(x){ - - str_res <- "The column names:" - - for(i in 1:(nrow(data_colnames))){ - if(!check_valid_name(data_colnames[i,1])){ - str_res<- paste(str_res, data_colnames[i,1], sep=" ") - } - } - - str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ") - - return (str_res) - -} - -#Test if a single variable name is valid -check_valid_name <- function(x){ - - valid <- FALSE - - aux <- as.data.frame(strsplit(x , split = "_")) - - if(aux[1,1] %in% harmonised_data$Harmonised.variable.name) - valid <- TRUE - - return (valid) - -} - -valid_data_colnames <- function(x){ - - valid_colnames = c() - - for(i in 1:(nrow(data_colnames))){ - if(check_valid_name(data_colnames[i,1])){ - valid_colnames = c(valid_colnames,data_colnames[i,1]) - } - } - - - return(valid_colnames) -} - -remove_space <- function(x){ - searchString <- ' ' - replacementString <- '' - res = sub(searchString,replacementString,x) - return(res) -} - -remove_spaces_from_ds <- function(ds){ - - res<- lapply(ds,remove_space ) - - return(as.data.frame(res)) - -} - -is_number <- function(x){ - res <- FALSE - - - if(length(x)!=0){ - x <- str_replace(x,",",".") - - aux <- as.numeric(x) - - - if(!is.na(aux)) - res <- TRUE - } - - - return(res) - -} - -check_values_not_categoric <- function(values, colname){ - - valid_vals <- values - - possible_vals <- possible_values(colname) - - for(i in 1:length(values)){ - - res<- FALSE - - value <- values[[i]] - - if(is_number(value)){ - value <- str_replace(value,",",".") - value <- as.numeric(value) - } - - if(is.null(value)){ - res <- TRUE - } - - else if( value == "NA" | value == "nan" | value == ".") - res <- TRUE - else{ - - if(nrow(possible_vals) == 2 & (!grepl("DAT",colname, fixed=TRUE)) & colname !="CSXCTR"){ - - if(colname=="LBXBEH" | colname=="LBXBEHn" | colname=="LBXBEM"){ - lower = possible_vals[1,1][[1]] - higher = possible_vals[2,1][[1]] - - }else{ - bounds <- as.data.frame(strsplit(possible_vals[1,1], '-')) - lowerAux <- str_replace(bounds [1,1],",",".") - higherAux <- str_replace(bounds [2,1],",",".") - lower <- as.numeric(remove_space(lowerAux)) - higher <- as.numeric(remove_space(higherAux)) - } - - - if ((value >= lower & value <= higher)) - res <- TRUE - } - - if(nrow(possible_vals) == 3 | colname=="CSXCTR"){ - if(value == 0 | value == 1) - res <- TRUE - } - - if(nrow(possible_vals) > 3){ - - lower <- strtoi(remove_space(possible_vals[1,1])) - higher_b <- nrow(possible_vals)-1 - higher <- strtoi(remove_space(possible_vals[higher_b,1])) - - if ((value >= lower & value <= higher)) - res <- TRUE - } - } - - if(res == FALSE) - valid_vals[i] <- res - - - } - - return(valid_vals) -} - -possible_values <- function(x){ - - if(x=="LBXBEH" | x=="LBXBEHn" | x=="LBXBEM") - res <- t(as.data.frame(list(-20,20))) - - else{ - - possible_value <- subset(harmonised_data,harmonised_data$Harmonised.variable.name==x)[1,5] - res <- strsplit(x=possible_value,split="/") - } - - return(as.data.frame(res)) -} - -possible_values_categoric <- function(x){ - - possible_value <- subset(harmonised_data,harmonised_data$Harmonised.variable.name==x)[1,4] - res <- strsplit(x=possible_value,split="/") - - return(as.data.frame(res)) -} - -check_values_categoric <- function(values, colname){ - - possible_vals <- possible_values_categoric(colname) - - res <- TRUE - - for(i in 1:length(values)){ - if(!(values[[i]] %in% as.matrix(remove_spaces_from_ds(possible_vals)))){ - res <- FALSE - } - - } - - return(res) - -} - - -get_values_from_quantiles <- function(x){ - - data_summary <- ds.summary(x) - - low_quantile <- data_summary[[1]][3][[1]][[1]] - - high_quantile <- data_summary[[1]][3][[1]][[7]] - - return(list(low_quantile,high_quantile)) - - -} - -notify_error <- function(invalid_name_list,invalid_value_list,wrong_categoric, wrong_categoric_values, missing_numeric){ - - res <- "" - - if(length(invalid_name_list) !=0){ - res <- "There are invalid values in the numeric fields:" - - - for(i in 1:length(invalid_name_list)){ - res <- paste(res, invalid_name_list[i], sep=" ") - } - - res <- paste(res, "\n", sep="") - - for(i in 1:length(invalid_name_list)){ - res <- paste(res, error_message(invalid_name_list[i], invalid_value_list[[i]]), sep=" ") - } - } - - - if(length(wrong_categoric)!=0){ - - res <- paste(res, "\n############################################################################ \n", sep="") - - res <- paste(res,"\nThe following categoric values are invalid:", sep=" ") - - for(i in 1:length(wrong_categoric)){ - res <- paste(res, wrong_categoric[i], sep=" ") - } - - res <- paste(res, "\n", sep="") - - for(i in 1:length(wrong_categoric)){ - res <- paste(res, error_message_categoric(wrong_categoric[i], wrong_categoric_values[[i]]), sep=" ") - } - } - - if(length(missing_numeric)!=0){ - - res <- paste(res, "\n############################################################################ \n", sep="") - - res<- paste(res, "\nThe following fields are missing a numeric field:") - - for(i in 1:length(wrong_categoric)){ - res <- paste(res, missing_numeric[i], sep=" ") - } - } - - res <- paste(res, "\n", sep="") - - - return(res) - -} - -error_message_categoric <- function(colname, invalid_values){ - - res<- "\nValues in the field" - res<- paste(res, colname, sep=" ") - res<- paste(res, "should be", sep=" ") - - range <- possible_values_categoric(colname) - - for(i in 1:nrow(range)){ - res <- paste(res, remove_space(range[i,1]), sep=" ") - } - - - res<- paste(res, "\nBut values were:", sep=" ") - - for(j in 1:length(invalid_values)){ - res<- paste(res, invalid_values[[j]], sep=" ") - } - - res<- paste(res, "\n\n", sep="") - - - return(res) - -} - -error_message <- function(colname, invalid_values){ - - res<- "\nValues in the field" - res<- paste(res, colname, sep=" ") - res<- paste(res, "should be", sep=" ") - - if(grepl("numeric", colname,fixed=TRUE)) - new_colname <- strsplit(x=colname,split="_")[[1]][1] - else - new_colname <- colname - - range <- subset(harmonised_data, harmonised_data$Harmonised.variable.name == new_colname) - range <- range[5] - range <- as.data.frame(strsplit(range[1,1], '/')) - - #Range of values or null - if(nrow(range) == 2 & !grepl("DAT",colname, fixed=TRUE)){ - bounds <- as.data.frame(strsplit(range[1,1], '-')) - lower <- remove_space(bounds [1,1]) - higher <- remove_space(bounds [2,1]) - res<- paste(res, "numbers between", sep=" ") - res<- paste(res, lower, sep=" ") - res<- paste(res, "and", sep=" ") - res<- paste(res, higher, sep=" ") - res<- paste(res, "(both included)", sep=" ") - } - - if(nrow(range) == 3){ - res<- paste(res, "0 or 1", sep=" ") - } - - if(nrow(range) > 3){ - lower <- strtoi(remove_space(range[1,1])) - higher_b <- nrow(range)-1 - higher <- strtoi(remove_space(range[higher_b,1])) - res<- paste(res, "numbers between", sep=" ") - res<- paste(res, lower, sep=" ") - res<- paste(res, "and", sep=" ") - res<- paste(res, higher, sep=" ") - res<- paste(res, "(both included)", sep=" ") - } - - if(grepl("DAT",colname, fixed=TRUE)){ - res<- paste(res, "dates with the following format: dd/mm/yyyy", sep=" ") - } - - - res<- paste(res, "\nBut values were:", sep=" ") - - for(j in 1:length(invalid_values)){ - res<- paste(res, invalid_values[[j]], sep=" ") - } - - res<- paste(res, "\n", sep="") - - if(!is_number(invalid_values[[1]])) - res<- paste(res, "(It's missing a \"numeric\" field)", sep="") - - res<- paste(res, "\n", sep="") - - return(res) - -} - -check_valid_values <- function(){ - - invalid_name_list <- c() - cannot_analyse_list <- c() - invalid_values_list <- list() - wrong_categoric_values <- list() - wrong_categoric <- c() - missing_numeric <- c() - j<- 1 - k <- 1 - - for(i in 1:(nrow(valid_colnames))){ - - data_table ="empty" - - if(!grepl("DMRBORN",valid_colnames[i,1], fixed=TRUE) & (!grepl("DAT",valid_colnames[i,1], fixed=TRUE)) & (!grepl("ISO",valid_colnames[i,1], fixed=TRUE))& (!grepl("BEF",valid_colnames[i,1], fixed=TRUE))){ - - column <- "data$" - column <- paste(column, valid_colnames[i,1], sep="") - - tryCatch( - error = function(cnd) { - print("Unable to analyse data") - res <- FALSE - }, - data_table <- as.data.frame(ds.table(column)) - ) - - if(data_table == "empty"){ - - cannot_analyse_list <- c(cannot_analyse_list,valid_colnames[i,1]) - - }else{ - - - if (data_table[[1]] == "All studies failed for reasons identified below") - values <- get_values_from_quantiles(column) - else - values <- row.names(data_table) - - numeric_col<- paste(valid_colnames[i,1],"_numeric", sep="") - - if( valid_colnames[i,1] %in% categoric_vars ){ - - #is_numeric <- grepl("numeric",valid_colnames[i,1], fixed=TRUE) - has_numeric <- numeric_col %in% valid_colnames$`valid_data_colnames(data_colnames)` - - if(!has_numeric) - missing_numeric <- c(missing_numeric, valid_colnames[i,1]) - - - if (data_table[[1]] == "All studies failed for reasons identified below"){ - - cannot_analyse_list <- c(cannot_analyse_list,valid_colnames[i,1]) - - }else if(!check_values_categoric(values,valid_colnames[i,1])){ - - print("Wrong categoric value:") - print(valid_colnames[i,1]) - - wrong_categoric <- c(wrong_categoric, valid_colnames[i,1]) - wrong_categoric_values[[k]] <- values - k <- k+1 - } - - # if((!is_numeric & !has_numeric) | is_numeric) - }else{ - - if(grepl("numeric", valid_colnames[i,1],fixed=TRUE)) - new_colname <- strsplit(x=valid_colnames[i,1],split="_")[[1]][1] - else - new_colname <- valid_colnames[i,1] - - valid <- check_values_not_categoric(values, new_colname) - - if (FALSE %in% valid){ - invalid_name_list <- c(invalid_name_list,valid_colnames[i,1]) - invalid_values_list[[j]] <- values - j <- j+1 - } - - - #print(valid_colnames[i,1]) - #print(values) - - }#else - # print("This variable has a numeric version") - } - - - } - } - - missing_numeric - if(length(invalid_name_list)>0 | length(wrong_categoric)>0 | length(missing_numeric)>0){ - res <- notify_error(invalid_name_list, invalid_values_list, wrong_categoric, wrong_categoric_values, missing_numeric) - }else{ - res <- "All values are valid \n" - } - - if(length(cannot_analyse_list)>0){ - res <- paste(res, "\n############################################################################ \n", sep="") - - res <- paste(res, notify_unable_analyse(cannot_analyse_list), sep="\n" ) - } - - - - - return(res) - -} - -notify_unable_analyse <- function(x){ - - res <- "\nCould not obtain data from the fields:" - - for(i in 1:length(x)){ - res <- paste(res, x[i], sep=" ") - } - - return (res) - - -} - -auxConnections <- connect() -connections <- auxConnections[[1]] -inp <- auxConnections[[2]] - -#Conexión a la base de datos - -ds.dim("data", datasources = connections) -ds.colnames("data") - -#---------------------------------------------------------------------------- - -#Check valid column names -datastructure_name <- "data" -data_colnames <- ds.colnames(x=datastructure_name, datasources= connections) - -data_colnames <- as.data.frame(data_colnames) - -check_valid_columns <- check_column_names(data_colnames) - -valid_colnames <- as.data.frame(valid_data_colnames(data_colnames)) - - - -#possible_values("CSXCTR") -result <- "" -result<-check_valid_values() -print(check_valid_columns) -datashield.logout(connections) -cat(result) - - - -# ds.dataFrameSubset(df.name = "data", V1.name = "data$DMXWT", "400" , Boolean.operator = '>', newobj = "columna") -# # -# ds.summary("columna$DMXWT") -# ds.dim("columna$DMXWT") -# ds.table("columna$DMXWT") - -file_name<- paste(hospital_name,"_invalid_values.txt", sep="") - -#ds.heatmapPlot("data$LBDSALSIA", "data$RFXHC_numeric",type="combine", datasources = connections) - - -#setwd("C:/Users/victor/Desktop/TFG/r-analytics-master/invalid_values") -dir.create("../invalid_values", showWarnings = FALSE) -setwd("../invalid_values") - - -cat(check_valid_columns,file=file_name,sep="\n") -cat(result,file=file_name,append=TRUE) - -datashield.logout(connections) - +rm(list=ls()) + +setwd("C:/Users/Victor/Documents/TFG/r-analytics-master") + +source("required_folder_checker.R") +source("argument_hasher.R") +source("dependency_installer.R") + +# install.packages("https://cran.r-project.org/src/contrib/Archive/DSI/DSI_1.2.0.tar.gz", repos=NULL, type="source") +# install.packages("https://cran.r-project.org/src/contrib/Archive/DSOpal/DSOpal_1.2.0.tar.gz", repos=NULL, type="source") +# install.packages("https://cran.r-project.org/src/contrib/Archive/DSLite/DSLite_1.2.0.tar.gz", repos=NULL, type="source") +# install.packages("https://cran.r-project.org/src/contrib/Archive/opalr/opalr_2.1.0.tar.gz", repos=NULL, type="source") + + +dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient") +install_dependencies(dep_list) + +#,"DSI","DSOpal","DSLite" + +setwd("C:/Users/victor/Documents/TFG/r-analytics-master") +source("connection_parameters.R") +source("necessary_functions_connection.R") + + +setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data") + + +ComAndRF <- data.frame(read.csv("Com&RF.csv", sep=","))[1:64,1:5] +Complications <- data.frame(read.csv("Complications.csv", sep=";"))[1:20,1:5] +Dates <- data.frame(read.csv("Dates.csv", sep=";"))[1:12,1:5] +Demographics <- data.frame(read.csv("Demographics.csv", sep=";"))[1:9,1:5] +Home_med <- data.frame(read.csv("Home_med.csv", sep=";"))[1:13,1:5] +Imaging_data <- data.frame(read.csv("Imaging_data.csv", sep=";"))[1:11,1:5] +Labo <- data.frame(read.csv("Labo.csv", sep=";"))[1:143,1:5] +SiAndSympt <- data.frame(read.csv("Si&Sympt.csv", sep=";"))[1:50,1:5] +Treatment <- data.frame(read.csv("Treatment.csv", sep=";"))[1:32,1:5] +LifestyleAndDiet <- data.frame(read.csv("Lifestyle&Diet.csv", sep=";"))[1:165,1:5] + + + +harmonised_data <- rbind(SiAndSympt,ComAndRF) +harmonised_data <- rbind(harmonised_data,Treatment) +harmonised_data <- rbind(harmonised_data,Dates) +harmonised_data <- rbind(harmonised_data,Demographics) +harmonised_data <- rbind(harmonised_data,Home_med) +harmonised_data <- rbind(harmonised_data,Imaging_data) +harmonised_data <- rbind(harmonised_data,Complications) +harmonised_data <- rbind(harmonised_data,Labo) +harmonised_data <- rbind(harmonised_data,LifestyleAndDiet) + +rm(list=c("SiAndSympt", + "Complications", + "ComAndRF", + "Dates", + "Demographics", + "Home_med", + "Imaging_data", + "Complications", + "Labo", + "LifestyleAndDiet")) + +categoric_vars = c("DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN", "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN", "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC") + + +#---------------------------------------------------------------------------- + +#Test if column names are valid +check_column_names <- function(x){ + + str_res <- "The column names:" + + for(i in 1:(nrow(data_colnames))){ + if(!check_valid_name(data_colnames[i,1])){ + str_res<- paste(str_res, data_colnames[i,1], sep=" ") + } + } + + str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ") + + return (str_res) + +} + +#Test if a single variable name is valid +check_valid_name <- function(x){ + + valid <- FALSE + + aux <- as.data.frame(strsplit(x , split = "_")) + + if(aux[1,1] %in% harmonised_data$Harmonised.variable.name) + valid <- TRUE + + return (valid) + +} + +valid_data_colnames <- function(x){ + + valid_colnames = c() + + for(i in 1:(nrow(data_colnames))){ + if(check_valid_name(data_colnames[i,1])){ + valid_colnames = c(valid_colnames,data_colnames[i,1]) + } + } + + + return(valid_colnames) +} + +remove_space <- function(x){ + searchString <- ' ' + replacementString <- '' + res = sub(searchString,replacementString,x) + return(res) +} + +remove_spaces_from_ds <- function(ds){ + + res<- lapply(ds,remove_space ) + + return(as.data.frame(res)) + +} + +is_number <- function(x){ + res <- FALSE + + + if(length(x)!=0){ + x <- str_replace(x,",",".") + + aux <- as.numeric(x) + + + if(!is.na(aux)) + res <- TRUE + } + + + return(res) + +} + +check_values_not_categoric <- function(values, colname){ + + valid_vals <- values + + possible_vals <- possible_values(colname) + + for(i in 1:length(values)){ + + res<- FALSE + + value <- values[[i]] + + if(is_number(value)){ + value <- str_replace(value,",",".") + value <- as.numeric(value) + } + + if(is.null(value)){ + res <- TRUE + } + + else if( value == "NA" | value == "nan" | value == ".") + res <- TRUE + else{ + + if(nrow(possible_vals) == 2 & (!grepl("DAT",colname, fixed=TRUE)) & colname !="CSXCTR"){ + + if(colname=="LBXBEH" | colname=="LBXBEHn" | colname=="LBXBEM"){ + lower = possible_vals[1,1][[1]] + higher = possible_vals[2,1][[1]] + + }else{ + bounds <- as.data.frame(strsplit(possible_vals[1,1], '-')) + lowerAux <- str_replace(bounds [1,1],",",".") + higherAux <- str_replace(bounds [2,1],",",".") + lower <- as.numeric(remove_space(lowerAux)) + higher <- as.numeric(remove_space(higherAux)) + } + + + if ((value >= lower & value <= higher)) + res <- TRUE + } + + if(nrow(possible_vals) == 3 | colname=="CSXCTR"){ + if(value == 0 | value == 1) + res <- TRUE + } + + if(nrow(possible_vals) > 3){ + + lower <- strtoi(remove_space(possible_vals[1,1])) + higher_b <- nrow(possible_vals)-1 + higher <- strtoi(remove_space(possible_vals[higher_b,1])) + + if ((value >= lower & value <= higher)) + res <- TRUE + } + } + + if(res == FALSE) + valid_vals[i] <- res + + + } + + return(valid_vals) +} + +possible_values <- function(x){ + + if(x=="LBXBEH" | x=="LBXBEHn" | x=="LBXBEM") + res <- t(as.data.frame(list(-20,20))) + + else{ + + possible_value <- subset(harmonised_data,harmonised_data$Harmonised.variable.name==x)[1,5] + res <- strsplit(x=possible_value,split="/") + } + + return(as.data.frame(res)) +} + +possible_values_categoric <- function(x){ + + possible_value <- subset(harmonised_data,harmonised_data$Harmonised.variable.name==x)[1,4] + res <- strsplit(x=possible_value,split="/") + + return(as.data.frame(res)) +} + +check_values_categoric <- function(values, colname){ + + possible_vals <- possible_values_categoric(colname) + + res <- TRUE + + for(i in 1:length(values)){ + if(!(values[[i]] %in% as.matrix(remove_spaces_from_ds(possible_vals)))){ + res <- FALSE + } + + } + + return(res) + +} + + +get_values_from_quantiles <- function(x){ + + data_summary <- ds.summary(x) + + low_quantile <- data_summary[[1]][3][[1]][[1]] + + high_quantile <- data_summary[[1]][3][[1]][[7]] + + return(list(low_quantile,high_quantile)) + + +} + +notify_error <- function(invalid_name_list,invalid_value_list,wrong_categoric, wrong_categoric_values, missing_numeric){ + + res <- "" + + if(length(invalid_name_list) !=0){ + res <- "There are invalid values in the numeric fields:" + + + for(i in 1:length(invalid_name_list)){ + res <- paste(res, invalid_name_list[i], sep=" ") + } + + res <- paste(res, "\n", sep="") + + for(i in 1:length(invalid_name_list)){ + res <- paste(res, error_message(invalid_name_list[i], invalid_value_list[[i]]), sep=" ") + } + } + + + if(length(wrong_categoric)!=0){ + + res <- paste(res, "\n############################################################################ \n", sep="") + + res <- paste(res,"\nThe following categoric values are invalid:", sep=" ") + + for(i in 1:length(wrong_categoric)){ + res <- paste(res, wrong_categoric[i], sep=" ") + } + + res <- paste(res, "\n", sep="") + + for(i in 1:length(wrong_categoric)){ + res <- paste(res, error_message_categoric(wrong_categoric[i], wrong_categoric_values[[i]]), sep=" ") + } + } + + if(length(missing_numeric)!=0){ + + res <- paste(res, "\n############################################################################ \n", sep="") + + res<- paste(res, "\nThe following fields are missing a numeric field:") + + for(i in 1:length(wrong_categoric)){ + res <- paste(res, missing_numeric[i], sep=" ") + } + } + + res <- paste(res, "\n", sep="") + + + return(res) + +} + +error_message_categoric <- function(colname, invalid_values){ + + res<- "\nValues in the field" + res<- paste(res, colname, sep=" ") + res<- paste(res, "should be", sep=" ") + + range <- possible_values_categoric(colname) + + for(i in 1:nrow(range)){ + res <- paste(res, remove_space(range[i,1]), sep=" ") + } + + + res<- paste(res, "\nBut values were:", sep=" ") + + for(j in 1:length(invalid_values)){ + res<- paste(res, invalid_values[[j]], sep=" ") + } + + res<- paste(res, "\n\n", sep="") + + + return(res) + +} + +error_message <- function(colname, invalid_values){ + + res<- "\nValues in the field" + res<- paste(res, colname, sep=" ") + res<- paste(res, "should be", sep=" ") + + if(grepl("numeric", colname,fixed=TRUE)) + new_colname <- strsplit(x=colname,split="_")[[1]][1] + else + new_colname <- colname + + range <- subset(harmonised_data, harmonised_data$Harmonised.variable.name == new_colname) + range <- range[5] + range <- as.data.frame(strsplit(range[1,1], '/')) + + #Range of values or null + if(nrow(range) == 2 & !grepl("DAT",colname, fixed=TRUE)){ + bounds <- as.data.frame(strsplit(range[1,1], '-')) + lower <- remove_space(bounds [1,1]) + higher <- remove_space(bounds [2,1]) + res<- paste(res, "numbers between", sep=" ") + res<- paste(res, lower, sep=" ") + res<- paste(res, "and", sep=" ") + res<- paste(res, higher, sep=" ") + res<- paste(res, "(both included)", sep=" ") + } + + if(nrow(range) == 3){ + res<- paste(res, "0 or 1", sep=" ") + } + + if(nrow(range) > 3){ + lower <- strtoi(remove_space(range[1,1])) + higher_b <- nrow(range)-1 + higher <- strtoi(remove_space(range[higher_b,1])) + res<- paste(res, "numbers between", sep=" ") + res<- paste(res, lower, sep=" ") + res<- paste(res, "and", sep=" ") + res<- paste(res, higher, sep=" ") + res<- paste(res, "(both included)", sep=" ") + } + + if(grepl("DAT",colname, fixed=TRUE)){ + res<- paste(res, "dates with the following format: dd/mm/yyyy", sep=" ") + } + + + res<- paste(res, "\nBut values were:", sep=" ") + + for(j in 1:length(invalid_values)){ + res<- paste(res, invalid_values[[j]], sep=" ") + } + + res<- paste(res, "\n", sep="") + + if(!is_number(invalid_values[[1]])) + res<- paste(res, "(It's missing a \"numeric\" field)", sep="") + + res<- paste(res, "\n", sep="") + + return(res) + +} + +check_valid_values <- function(){ + + invalid_name_list <- c() + cannot_analyse_list <- c() + invalid_values_list <- list() + wrong_categoric_values <- list() + wrong_categoric <- c() + missing_numeric <- c() + j<- 1 + k <- 1 + + for(i in 1:(nrow(valid_colnames))){ + + data_table ="empty" + + if(!grepl("DMRBORN",valid_colnames[i,1], fixed=TRUE) & (!grepl("DAT",valid_colnames[i,1], fixed=TRUE)) & (!grepl("ISO",valid_colnames[i,1], fixed=TRUE))& (!grepl("BEF",valid_colnames[i,1], fixed=TRUE))){ + + column <- "data$" + column <- paste(column, valid_colnames[i,1], sep="") + + tryCatch( + error = function(cnd) { + print("Unable to analyse data") + res <- FALSE + }, + data_table <- as.data.frame(ds.table(column)) + ) + + if(data_table == "empty"){ + + cannot_analyse_list <- c(cannot_analyse_list,valid_colnames[i,1]) + + }else{ + + + if (data_table[[1]] == "All studies failed for reasons identified below") + values <- get_values_from_quantiles(column) + else + values <- row.names(data_table) + + numeric_col<- paste(valid_colnames[i,1],"_numeric", sep="") + + if( valid_colnames[i,1] %in% categoric_vars ){ + + #is_numeric <- grepl("numeric",valid_colnames[i,1], fixed=TRUE) + has_numeric <- numeric_col %in% valid_colnames$`valid_data_colnames(data_colnames)` + + if(!has_numeric) + missing_numeric <- c(missing_numeric, valid_colnames[i,1]) + + + if (data_table[[1]] == "All studies failed for reasons identified below"){ + + cannot_analyse_list <- c(cannot_analyse_list,valid_colnames[i,1]) + + }else if(!check_values_categoric(values,valid_colnames[i,1])){ + + print("Wrong categoric value:") + print(valid_colnames[i,1]) + + wrong_categoric <- c(wrong_categoric, valid_colnames[i,1]) + wrong_categoric_values[[k]] <- values + k <- k+1 + } + + # if((!is_numeric & !has_numeric) | is_numeric) + }else{ + + if(grepl("numeric", valid_colnames[i,1],fixed=TRUE)) + new_colname <- strsplit(x=valid_colnames[i,1],split="_")[[1]][1] + else + new_colname <- valid_colnames[i,1] + + valid <- check_values_not_categoric(values, new_colname) + + if (FALSE %in% valid){ + invalid_name_list <- c(invalid_name_list,valid_colnames[i,1]) + invalid_values_list[[j]] <- values + j <- j+1 + } + + + #print(valid_colnames[i,1]) + #print(values) + + }#else + # print("This variable has a numeric version") + } + + + } + } + + missing_numeric + if(length(invalid_name_list)>0 | length(wrong_categoric)>0 | length(missing_numeric)>0){ + res <- notify_error(invalid_name_list, invalid_values_list, wrong_categoric, wrong_categoric_values, missing_numeric) + }else{ + res <- "All values are valid \n" + } + + if(length(cannot_analyse_list)>0){ + res <- paste(res, "\n############################################################################ \n", sep="") + + res <- paste(res, notify_unable_analyse(cannot_analyse_list), sep="\n" ) + } + + + + + return(res) + +} + +notify_unable_analyse <- function(x){ + + res <- "\nCould not obtain data from the fields:" + + for(i in 1:length(x)){ + res <- paste(res, x[i], sep=" ") + } + + return (res) + + +} + +auxConnections <- connect() +connections <- auxConnections[[1]] +inp <- auxConnections[[2]] + +#Conexión a la base de datos + +ds.dim("data", datasources = connections) +ds.colnames("data") + +#---------------------------------------------------------------------------- + +#Check valid column names +datastructure_name <- "data" +data_colnames <- ds.colnames(x=datastructure_name, datasources= connections) + +data_colnames <- as.data.frame(data_colnames) + +check_valid_columns <- check_column_names(data_colnames) + +valid_colnames <- as.data.frame(valid_data_colnames(data_colnames)) + + + +#possible_values("CSXCTR") +result <- "" +result<-check_valid_values() +print(check_valid_columns) +datashield.logout(connections) +cat(result) + + + +# ds.dataFrameSubset(df.name = "data", V1.name = "data$DMXWT", "400" , Boolean.operator = '>', newobj = "columna") +# # +# ds.summary("columna$DMXWT") +# ds.dim("columna$DMXWT") +# ds.table("columna$DMXWT") + +file_name<- paste(hospital_name,"_invalid_values.txt", sep="") + +#ds.heatmapPlot("data$LBDSALSIA", "data$RFXHC_numeric",type="combine", datasources = connections) + + +#setwd("C:/Users/victor/Desktop/TFG/r-analytics-master/invalid_values") +dir.create("../invalid_values", showWarnings = FALSE) +setwd("../invalid_values") + + +cat(check_valid_columns,file=file_name,sep="\n") +cat(result,file=file_name,append=TRUE) + +datashield.logout(connections) + + diff --git a/valid_variables_script_local.R b/valid_variables_script_local.R new file mode 100755 index 0000000000000000000000000000000000000000..a37aa4955791cb63a54f8f7420ecda091bb64d9a --- /dev/null +++ b/valid_variables_script_local.R @@ -0,0 +1,218 @@ +rm(list=ls()) + +dir_name <- readline("Introduce the name of the directory please: ") + +setwd(dir_name) + +source("dependency_installer.R") + + +dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient", "openxlsx") +install_dependencies(dep_list) + +#source("connection_parameters.R") +#source("necessary_functions_connection.R") + +codebook <- read.csv("new_harmon.csv" , sep = ",") + +codebook_col_names <- as.data.frame(codebook$Harmonised.variable.name) + +names(codebook_col_names) <- c("col_names") + +setwd(paste(dir_name ,"/harmonized_data", sep="")) + +file_name <- readline("Introduce the name of the file to check the values: ") +harmonized_data <- "" + +if (grepl(".csv" , file_name , fixed = TRUE)){ + harmonized_data <- read.csv(file_name) +}else if (grepl(".xlsx" , file_name , fixed = TRUE)){ + harmonized_data <- read.xlsx(file_name) +} + +#---------------------------------------------------------------------------- + +#Test if column names are valid +check_column_names <- function(col_names){ + + str_res <- "The column names:" + valid_colnames <- c() + repeated_colnames <- c() + for(i in 1:(nrow(col_names))){ + col_name <- col_names[i,1] + number_of_column <- check_valid_name(col_name) + if( number_of_column == 0){ + str_res<- paste(str_res, col_name, sep=" ") + }else if (number_of_column == 1){ + valid_colnames = c(valid_colnames, col_name) + }else{ + repeated_colnames = c(repeated_colnames , col_name) + } + } + + str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ") + + new_list <- list("not_colnames" = str_res , "colnames" = valid_colnames , "repeated_colnames" = repeated_colnames) + return (new_list) + +} + + +#Test if a single variable name is valid +check_valid_name <- function(col_name){ + + valid <- 0 + + if(col_name %in% codebook_col_names$col_names){ + + valid <- length(grep(col_name, names(harmonized_data))) + + } + + return (valid) + +} + +check_valid_values_continuous <- function(colname , codebook_param , column){ + column <- column[column != "."] + possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] + possible_values_list = str_split(possible_values_format , "/")[[1]] + + # Fallará cuando el codebook no tenga min-max / . + range_as_str <- str_trim(possible_values_list[1]) + missing_value_format <- str_trim(str_trim(possible_values_list[2])) + + separate_range <- str_split(range_as_str , "-")[[1]] + min_value <- strtoi(separate_range[1]) + max_value <- strtoi(separate_range[2]) + + failing_values <- column[column < min_value | column > max_value] + number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + + str_res <- "" + if (number_of_failing_values == 0) + str_res <- "No failing values" + else{ + failing_values <- failing_values[!is.na(failing_values)] + str_res <- paste("The failing values of column ", colname , paste(unlist(failing_values) , collapse =" ")) + } + + + return(str_res) +} + +check_valid_values_binary <- function(colname , column){ + column <- column[column != "."] + failing_values <- column[column < 0 | column > 1] + number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + + str_res <- "" + if (number_of_failing_values == 0) + str_res <- "No failing values" + else{ + failing_values <- failing_values[!is.na(failing_values)] + str_res <- paste("The failing values of column ", colname , paste(unlist(failing_values) , collapse =" ")) + } + + + return(str_res) +} + +check_valid_values_categorical <- function(colname , codebook_param , column){ + column <- column[column != "."] + possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname] + possible_values_list <- str_split(possible_values_format , "/")[[1]] + + possible_values_list <- lapply(possible_values_list , str_trim) + + str_res <- "" + min_value <- 0 + max_value <- 0 + if (length(possible_values_list[[1]]) == 2){ + separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]] + min_value <- strtoi(separate_range[1]) + max_value <- strtoi(separate_range[2]) + + }else{ + + possible_values_list <- lapply(possible_values_list , strtoi)[[1]] + min_value <- possible_values_list[1] + max_value <- possible_values_list[length(possible_values_list) - 1] + } + + failing_values <- column[column < min_value | column > max_value ] + number_of_failing_values <- length(failing_values[!is.na(failing_values)]) + + if(number_of_failing_values == 0){ + str_res <- "No failing values" + }else{ + failing_values <- failing_values[!is.na(failing_values)] + str_res <- paste("The failing values of column ", colname , paste(unlist(failing_values) , collapse =" ")) + } + +} + +check_valid_values <- function(valid_colnames, codebook_param){ + + res <- "" + + for(i in 1:(ncol(valid_colnames))){ + name <- names(valid_colnames)[i] + #if("DMRBORN" == name | grepl("DAT", name, fixed=TRUE) | grepl("ISO", name , fixed=TRUE) | grepl("BEF", name, fixed=TRUE)){ + # next + #} + + column <- valid_colnames[,i] + + # Esto falla si tu codebook no es mismo que new_harmon.csv + column_type <- codebook_param$Variable.type[codebook_param$Harmonised.variable.name == name] + + result = switch( + column_type, + "Continuous"= check_valid_values_continuous(name , codebook_param , column), + "Binary"= check_valid_values_binary(name , column), + "Categorical"= check_valid_values_categorical(name, codebook_param , column), + "Calendar date" = paste("No failing values"), + "ISO country code"= paste("No failing values"), + ) + + if (result != "No failing values"){ + res <- paste(res , result, sep="\n") + } + } + + return(res) + +} + + +data_colnames <- as.data.frame(colnames(harmonized_data)) + +check_valid_columns <- check_column_names(data_colnames) + +columns_not_valid <- check_valid_columns$not_colnames + +valid_colnames_column <- as.data.frame(check_valid_columns$colnames) +names(valid_colnames_column) = c("valid_colnames") +valid_colnames_with_data <- subset(harmonized_data , select = valid_colnames_column$valid_colnames) + + +result <- "" +result<-check_valid_values(valid_colnames_with_data, codebook) +print(check_valid_columns) +#datashield.logout(connections) +cat(result) + + + +file_name<- paste(hospital_name,"_invalid_values.txt", sep="") + + +dir.create("../invalid_values", showWarnings = FALSE) +setwd("../invalid_values") + +cat(check_valid_columns,file=file_name,sep="\n") +cat(result,file=file_name,append=TRUE) + +#datashield.logout(connections) +