Commit a6b18322 authored by Pepe Marquez's avatar Pepe Marquez

Merge branch 'cambios-pepe' into 'master'

Revision de codigo y propuesta de mejora.

See merge request !1
parents 3570d2a0 03c015c4
*.xlsx
harmonized_data/*.csv
rm(list=ls()) rm(list=ls())
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources") setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources")
setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient")) setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient"))
#Cambiarlo por el nombre del ressource que se desea limpiar #Cambiarlo por el nombre del ressource que se desea limpiar
hospital <- data.frame(read.csv("konya_outpatient.csv", sep=",")) hospital <- data.frame(read.csv("konya_outpatient.csv", sep=","))
hospital <- hospital %>% select(-contains("numeric")) hospital <- hospital %>% select(-contains("numeric"))
# hospital["NOT.HARMONISED"] <- NULL # hospital["NOT.HARMONISED"] <- NULL
# #
# names <- colnames(hospital) # names <- colnames(hospital)
# for (i in 1:length(names)){ # for (i in 1:length(names)){
# #
# if(grepl("NOT.HARMONISED", names[i])){ # if(grepl("NOT.HARMONISED", names[i])){
# hospital[names[i]] <- NULL # hospital[names[i]] <- NULL
# print(paste("quito ", names[i])) # print(paste("quito ", names[i]))
# } # }
# #
# } # }
# hospital <- hospital[-1,] # hospital <- hospital[-1,]
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data") setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data")
ComAndRF <- data.frame(read.csv("Com&RF.csv", sep=","))[1:64,1:5] ComAndRF <- data.frame(read.csv("Com&RF.csv", sep=","))[1:64,1:5]
Complications <- data.frame(read.csv("Complications.csv", sep=";"))[1:20,1:5] Complications <- data.frame(read.csv("Complications.csv", sep=";"))[1:20,1:5]
Dates <- data.frame(read.csv("Dates.csv", sep=";"))[1:12,1:5] Dates <- data.frame(read.csv("Dates.csv", sep=";"))[1:12,1:5]
Demographics <- data.frame(read.csv("Demographics.csv", sep=";"))[1:9,1:5] Demographics <- data.frame(read.csv("Demographics.csv", sep=";"))[1:9,1:5]
Home_med <- data.frame(read.csv("Home_med.csv", sep=";"))[1:13,1:5] Home_med <- data.frame(read.csv("Home_med.csv", sep=";"))[1:13,1:5]
Imaging_data <- data.frame(read.csv("Imaging_data.csv", sep=";"))[1:11,1:5] Imaging_data <- data.frame(read.csv("Imaging_data.csv", sep=";"))[1:11,1:5]
Labo <- data.frame(read.csv("Labo.csv", sep=";"))[1:143,1:5] Labo <- data.frame(read.csv("Labo.csv", sep=";"))[1:143,1:5]
SiAndSympt <- data.frame(read.csv("Si&Sympt.csv", sep=";"))[1:50,1:5] SiAndSympt <- data.frame(read.csv("Si&Sympt.csv", sep=";"))[1:50,1:5]
Treatment <- data.frame(read.csv("Treatment.csv", sep=";"))[1:32,1:5] Treatment <- data.frame(read.csv("Treatment.csv", sep=";"))[1:32,1:5]
LifestyleAndDiet <- data.frame(read.csv("Lifestyle&Diet.csv", sep=";"))[1:165,1:5] LifestyleAndDiet <- data.frame(read.csv("Lifestyle&Diet.csv", sep=";"))[1:165,1:5]
harmonised_data <- rbind(SiAndSympt,ComAndRF) harmonised_data <- rbind(SiAndSympt,ComAndRF)
harmonised_data <- rbind(harmonised_data,Treatment) harmonised_data <- rbind(harmonised_data,Treatment)
harmonised_data <- rbind(harmonised_data,Dates) harmonised_data <- rbind(harmonised_data,Dates)
harmonised_data <- rbind(harmonised_data,Demographics) harmonised_data <- rbind(harmonised_data,Demographics)
harmonised_data <- rbind(harmonised_data,Home_med) harmonised_data <- rbind(harmonised_data,Home_med)
harmonised_data <- rbind(harmonised_data,Imaging_data) harmonised_data <- rbind(harmonised_data,Imaging_data)
harmonised_data <- rbind(harmonised_data,Complications) harmonised_data <- rbind(harmonised_data,Complications)
harmonised_data <- rbind(harmonised_data,Labo) harmonised_data <- rbind(harmonised_data,Labo)
harmonised_data <- rbind(harmonised_data,LifestyleAndDiet) harmonised_data <- rbind(harmonised_data,LifestyleAndDiet)
rm(list=c("SiAndSympt", rm(list=c("SiAndSympt",
"Complications", "Complications",
"ComAndRF", "ComAndRF",
"Dates", "Dates",
"Demographics", "Demographics",
"Home_med", "Home_med",
"Imaging_data", "Imaging_data",
"Complications", "Complications",
"Labo", "Labo",
"LifestyleAndDiet")) "LifestyleAndDiet"))
noYesValues <- subset(harmonised_data, harmonised_data$Harmonised.data.format.unit == "No/Yes / missing" | harmonised_data$Harmonised.data.format.unit == "No/Yes / Missing") noYesValues <- subset(harmonised_data, harmonised_data$Harmonised.data.format.unit == "No/Yes / missing" | harmonised_data$Harmonised.data.format.unit == "No/Yes / Missing")
noYesValues <- noYesValues$Harmonised.variable.name noYesValues <- noYesValues$Harmonised.variable.name
noYesValues <- c(noYesValues,"CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS", "SMXASAH", "CMXATH", "CMXNO") noYesValues <- c(noYesValues,"CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS", "SMXASAH", "CMXATH", "CMXNO")
categoric_vars = c("CMXATH", "CMXNO", "SMXASAH","CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS","DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN", "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN", "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC") categoric_vars = c("CMXATH", "CMXNO", "SMXASAH","CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS","DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN", "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN", "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC")
personalized <- c("DMRGENDR", "DSXOS", "CSXCTR", "SMXFEA", "CSXCOT") personalized <- c("DMRGENDR", "DSXOS", "CSXCTR", "SMXFEA", "CSXCOT")
is_number <- function(x){ is_number <- function(x){
res <- FALSE res <- FALSE
if(length(x)!=0){ if(length(x)!=0){
x <- str_replace(x,",",".") x <- str_replace(x,",",".")
aux <- as.numeric(x) aux <- as.numeric(x)
if(!is.na(aux)) if(!is.na(aux))
res <- TRUE res <- TRUE
} }
return(res) return(res)
} }
replaceNoYesValues <- function(x){ replaceNoYesValues <- function(x){
#Replace the value with Yes or No #Replace the value with Yes or No
if(is.na(x)) if(is.na(x)){
x <- "" x <- ""
else if (x==0 | x =="No" | x ==" No" | x =="NO") return (x)
x <- "No" }
else if (x==1 | x == "Yes" | x == " Yes" | x=="SI")
x <- "Yes" x <- trimws(toupper(x))
else if (x=="0" | x =="NO")
x <- "" x <- "No"
else if (x=="1" | x == "YES" | x=="SI")
return (x) x <- "Yes"
else
} x <- ""
fixNonCategoric <- function(x){ return (x)
if(!is_number(x)){ }
x <- ""
}else{ fixNonCategoric <- function(x){
x <- str_replace(x,",",".")
} if(!is_number(x)){
x <- ""
return(x) }else{
x <- str_replace(x,",",".")
} }
personalizedFun <- function(x, colname){ return(x)
if(colname == "DMRGENDR"){ }
if(is.na(x))
x <- "" personalizedFun <- function(x, colname){
else if(x == 1 | x == "F"| x == "f" | x== "Female")
x <- "Female" if(colname == "DMRGENDR"){
else if (x == 0 | x =="M" | x == "m" | x== "Male") if(is.na(x))
x <- "Male" x <- ""
} else if(x == 1 | x == "F"| x == "f" | x== "Female")
x <- "Female"
if(colname == "CSXCTR"){ else if (x == 0 | x =="M" | x == "m" | x== "Male")
if(is.na(x)) x <- "Male"
x <- "" }
else if(x == 1 | x == "positive" | x=="PositivM" | x=="POSITIVM" | x =="POS?T?VM" | x =="NMGAT?VM")
x <- "Positive" if(colname == "CSXCTR"){
else if (x == 0 | x == "negative" | x=="negativM" | x=="NMGATIVM" | x=="NAGATIVM" | x =="NMGATIV" | x =="negativeM" | x == "NAGAT?VM" | x =="NMGAT?V") if(is.na(x))
x <- "Negative" x <- ""
} else if(x == 1 | x == "positive" | x=="PositivM" | x=="POSITIVM" | x =="POS?T?VM" | x =="NMGAT?VM")
x <- "Positive"
if(colname == "SMXFEA"){ else if (x == 0 | x == "negative" | x=="negativM" | x=="NMGATIVM" | x=="NAGATIVM" | x =="NMGATIV" | x =="negativeM" | x == "NAGAT?VM" | x =="NMGAT?V")
if(is.na(x)) x <- "Negative"
x <- "" }
else if(x == 1)
x <- "Yes" if(colname == "SMXFEA"){
else if (x == 0) x <- replaceNoYesValues(x)
x <- "No" #if(is.na(x))
else if (x == ".") # x <- ""
x <- "" #else if(x == 1)
} # x <- "Yes"
#else if (x == 0)
if(colname == "DMRRETH1"){ # x <- "No"
if(is.na(x)) #else if (x == ".")
x <- "" # x <- ""
else if(x ==1) }
x <- "Asian"
else if (x == 2) if(colname == "DMRRETH1"){
x <- "Black" if(is.na(x))
else if (x == 3) x <- ""
x <- "Hispanic" else if(x ==1)
else if (x == 4) x <- "Asian"
x <- "White" else if (x == 2)
else if (x == 5) x <- "Black"
x <- "Multiracial" else if (x == 3)
else if (x == 6) x <- "Hispanic"
x <- "Other" else if (x == 4)
} x <- "White"
else if (x == 5)
if(colname == "DMROCCU"){ x <- "Multiracial"
if(is.na(x)) else if (x == 6)
x <- "" x <- "Other"
else if(x ==1) }
x <- "Unemployed"
else if (x == 2) if(colname == "DMROCCU"){
x <- "Student" if(is.na(x))
else if (x == 3) x <- ""
x <- "Employed" else if(x ==1)
else if (x == 4) x <- "Unemployed"
x <- "Self-employed" else if (x == 2)
else if (x == 5) x <- "Student"
x <- "Retired" else if (x == 3)
else if (x == 6) x <- "Employed"
x <- "" else if (x == 4)
} x <- "Self-employed"
else if (x == 5)
if(colname == "DMRHREDU"){ x <- "Retired"
if(is.na(x)) else if (x == 6)
x <- "" x <- ""
else if(x ==1) }
x <- "High School"
else if (x == 2) if(colname == "DMRHREDU"){
x <- "Bachelors" if(is.na(x))
else if (x == 3) x <- ""
x <- "Postgraduate" else if(x ==1)
else if (x == 4) x <- "High School"
x <- "Other" else if (x == 2)
} x <- "Bachelors"
else if (x == 3)
if(colname =="DSXOS"){ x <- "Postgraduate"
else if (x == 4)
if(is.na(x)) x <- "Other"
x <- "" }
else if (x==0 | x == "Recovered")
x <- "Recovered" if(colname =="DSXOS"){
else if (x==1 | x == "Deceased")
x <- "Deceased" if(is.na(x))
else if (x==2 | x == "Transferred") x <- ""
x <- "Transferred" else if (x==0 | x == "Recovered")
else x <- "Recovered"
x <- "" else if (x==1 | x == "Deceased")
} x <- "Deceased"
else if (x==2 | x == "Transferred")
if(colname =="CSXCOT"){ x <- "Transferred"
if(is.na(x)) else
x <- "" x <- ""
else if (x==1 ) }
x <- "PCR"
else if (x==2 ) if(colname =="CSXCOT"){
x <- "antigen" if(is.na(x))
else if (x==3 ) x <- ""
x <- "other" else if (x==1 )
else x <- "PCR"
x <- "" else if (x==2 )
} x <- "antigen"
else if (x==3 )
x <- "other"
return(x) else
x <- ""
} }
dotToBar <- function (x){
return(x)
if (grepl(".", x, fixed = TRUE))
res <- format(as.Date(x, format = "%d.%m.%Y"), "%d/%m/%Y") }
else res <- x
return(res) dotToBar <- function (x){
} if (grepl(".", x, fixed = TRUE))
res <- format(as.Date(x, format = "%d.%m.%Y"), "%d/%m/%Y")
else res <- x
return(res)
rm(newDf)
}
newDf <- hospital
rm(newDf)
names <- colnames(hospital)
newDf <- hospital
for (j in 1:ncol(hospital)){
percentage <- trunc(j/ncol(hospital)*100) names <- colnames(hospital)
mes <- paste(toString(percentage),"% completed", sep="")
print(mes)
for (j in 1:ncol(hospital)){
print(names[j])
percentage <- trunc(j/ncol(hospital)*100)
for(i in 1:nrow(hospital)){ mes <- paste(toString(percentage),"% completed", sep="")
print(mes)
if(names[j] %in% noYesValues){
newDf[i,j] <- replaceNoYesValues(hospital[i,j]) print(names[j])
}else if(!(names[j] %in% categoric_vars) & names[j] != "DMRBORN" & !grepl("DAT",names[j], fixed=TRUE)){
newDf[i,j] <- fixNonCategoric(hospital[i,j]) for(i in 1:nrow(hospital)){
}
if(names[j] %in% noYesValues){
if(names[j] %in% personalized){ newDf[i,j] <- replaceNoYesValues(hospital[i,j])
newDf[i,j] <- personalizedFun(hospital[i,j],names[j]) }else if(!(names[j] %in% categoric_vars) & names[j] != "DMRBORN" & !grepl("DAT",names[j], fixed=TRUE)){
} newDf[i,j] <- fixNonCategoric(hospital[i,j])
}
if (is.na(hospital[i,j]))
newDf[i,j] <- "" if(names[j] %in% personalized){
newDf[i,j] <- personalizedFun(hospital[i,j],names[j])
else if (hospital[i,j] == ".") }
newDf[i,j] <- ""
if (is.na(hospital[i,j]))
} newDf[i,j] <- ""
}
else if (hospital[i,j] == ".")
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/newRessources") newDf[i,j] <- ""
setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient/clean"))
}
}
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/newRessources")
write.csv(x=newDf, file = "konya_outpatient.csv", row.names = FALSE) setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient/clean"))
write.csv(x=newDf, file = "konya_outpatient.csv", row.names = FALSE)
rm(list=ls()) rm(list=ls())
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master") setwd("C:/Users/Victor/Documents/TFG/r-analytics-master")
source("required_folder_checker.R") source("required_folder_checker.R")
source("argument_hasher.R") source("argument_hasher.R")
source("dependency_installer.R") source("dependency_installer.R")
# install.packages("https://cran.r-project.org/src/contrib/Archive/DSI/DSI_1.2.0.tar.gz", repos=NULL, type="source") # install.packages("https://cran.r-project.org/src/contrib/Archive/DSI/DSI_1.2.0.tar.gz", repos=NULL, type="source")
# install.packages("https://cran.r-project.org/src/contrib/Archive/DSOpal/DSOpal_1.2.0.tar.gz", repos=NULL, type="source") # install.packages("https://cran.r-project.org/src/contrib/Archive/DSOpal/DSOpal_1.2.0.tar.gz", repos=NULL, type="source")
# install.packages("https://cran.r-project.org/src/contrib/Archive/DSLite/DSLite_1.2.0.tar.gz", repos=NULL, type="source") # install.packages("https://cran.r-project.org/src/contrib/Archive/DSLite/DSLite_1.2.0.tar.gz", repos=NULL, type="source")
# install.packages("https://cran.r-project.org/src/contrib/Archive/opalr/opalr_2.1.0.tar.gz", repos=NULL, type="source") # install.packages("https://cran.r-project.org/src/contrib/Archive/opalr/opalr_2.1.0.tar.gz", repos=NULL, type="source")
dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient") dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient")
install_dependencies(dep_list) install_dependencies(dep_list)
#,"DSI","DSOpal","DSLite" #,"DSI","DSOpal","DSLite"
setwd("C:/Users/victor/Documents/TFG/r-analytics-master") setwd("C:/Users/victor/Documents/TFG/r-analytics-master")
source("connection_parameters.R") source("connection_parameters.R")
source("necessary_functions_connection.R") source("necessary_functions_connection.R")
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data") setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data")
ComAndRF <- data.frame(read.csv("Com&RF.csv", sep=","))[1:64,1:5] ComAndRF <- data.frame(read.csv("Com&RF.csv", sep=","))[1:64,1:5]
Complications <- data.frame(read.csv("Complications.csv", sep=";"))[1:20,1:5] Complications <- data.frame(read.csv("Complications.csv", sep=";"))[1:20,1:5]
Dates <- data.frame(read.csv("Dates.csv", sep=";"))[1:12,1:5] Dates <- data.frame(read.csv("Dates.csv", sep=";"))[1:12,1:5]
Demographics <- data.frame(read.csv("Demographics.csv", sep=";"))[1:9,1:5] Demographics <- data.frame(read.csv("Demographics.csv", sep=";"))[1:9,1:5]
Home_med <- data.frame(read.csv("Home_med.csv", sep=";"))[1:13,1:5] Home_med <- data.frame(read.csv("Home_med.csv", sep=";"))[1:13,1:5]
Imaging_data <- data.frame(read.csv("Imaging_data.csv", sep=";"))[1:11,1:5] Imaging_data <- data.frame(read.csv("Imaging_data.csv", sep=";"))[1:11,1:5]
Labo <- data.frame(read.csv("Labo.csv", sep=";"))[1:143,1:5] Labo <- data.frame(read.csv("Labo.csv", sep=";"))[1:143,1:5]
SiAndSympt <- data.frame(read.csv("Si&Sympt.csv", sep=";"))[1:50,1:5] SiAndSympt <- data.frame(read.csv("Si&Sympt.csv", sep=";"))[1:50,1:5]
Treatment <- data.frame(read.csv("Treatment.csv", sep=";"))[1:32,1:5] Treatment <- data.frame(read.csv("Treatment.csv", sep=";"))[1:32,1:5]
LifestyleAndDiet <- data.frame(read.csv("Lifestyle&Diet.csv", sep=";"))[1:165,1:5] LifestyleAndDiet <- data.frame(read.csv("Lifestyle&Diet.csv", sep=";"))[1:165,1:5]
harmonised_data <- rbind(SiAndSympt,ComAndRF) harmonised_data <- rbind(SiAndSympt,ComAndRF)
harmonised_data <- rbind(harmonised_data,Treatment) harmonised_data <- rbind(harmonised_data,Treatment)
harmonised_data <- rbind(harmonised_data,Dates) harmonised_data <- rbind(harmonised_data,Dates)
harmonised_data <- rbind(harmonised_data,Demographics) harmonised_data <- rbind(harmonised_data,Demographics)
harmonised_data <- rbind(harmonised_data,Home_med) harmonised_data <- rbind(harmonised_data,Home_med)
harmonised_data <- rbind(harmonised_data,Imaging_data) harmonised_data <- rbind(harmonised_data,Imaging_data)
harmonised_data <- rbind(harmonised_data,Complications) harmonised_data <- rbind(harmonised_data,Complications)
harmonised_data <- rbind(harmonised_data,Labo) harmonised_data <- rbind(harmonised_data,Labo)
harmonised_data <- rbind(harmonised_data,LifestyleAndDiet) harmonised_data <- rbind(harmonised_data,LifestyleAndDiet)
rm(list=c("SiAndSympt", rm(list=c("SiAndSympt",
"Complications", "Complications",
"ComAndRF", "ComAndRF",
"Dates", "Dates",
"Demographics", "Demographics",
"Home_med", "Home_med",
"Imaging_data", "Imaging_data",
"Complications", "Complications",
"Labo", "Labo",
"LifestyleAndDiet")) "LifestyleAndDiet"))
categoric_vars = c("DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN", "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN", "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC") categoric_vars = c("DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN", "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN", "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC")
#---------------------------------------------------------------------------- #----------------------------------------------------------------------------
#Test if column names are valid #Test if column names are valid
check_column_names <- function(x){ check_column_names <- function(x){
str_res <- "The column names:" str_res <- "The column names:"
for(i in 1:(nrow(data_colnames))){ for(i in 1:(nrow(data_colnames))){
if(!check_valid_name(data_colnames[i,1])){ if(!check_valid_name(data_colnames[i,1])){
str_res<- paste(str_res, data_colnames[i,1], sep=" ") str_res<- paste(str_res, data_colnames[i,1], sep=" ")
} }
} }
str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ") str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ")
return (str_res) return (str_res)
} }
#Test if a single variable name is valid #Test if a single variable name is valid
check_valid_name <- function(x){ check_valid_name <- function(x){
valid <- FALSE valid <- FALSE
aux <- as.data.frame(strsplit(x , split = "_")) aux <- as.data.frame(strsplit(x , split = "_"))
if(aux[1,1] %in% harmonised_data$Harmonised.variable.name) if(aux[1,1] %in% harmonised_data$Harmonised.variable.name)
valid <- TRUE valid <- TRUE
return (valid) return (valid)
} }
valid_data_colnames <- function(x){ valid_data_colnames <- function(x){
valid_colnames = c() valid_colnames = c()
for(i in 1:(nrow(data_colnames))){ for(i in 1:(nrow(data_colnames))){
if(check_valid_name(data_colnames[i,1])){ if(check_valid_name(data_colnames[i,1])){
valid_colnames = c(valid_colnames,data_colnames[i,1]) valid_colnames = c(valid_colnames,data_colnames[i,1])
} }
} }
return(valid_colnames) return(valid_colnames)
} }
remove_space <- function(x){ remove_space <- function(x){
searchString <- ' ' searchString <- ' '
replacementString <- '' replacementString <- ''
res = sub(searchString,replacementString,x) res = sub(searchString,replacementString,x)
return(res) return(res)
} }
remove_spaces_from_ds <- function(ds){ remove_spaces_from_ds <- function(ds){
res<- lapply(ds,remove_space ) res<- lapply(ds,remove_space )
return(as.data.frame(res)) return(as.data.frame(res))
} }
is_number <- function(x){ is_number <- function(x){
res <- FALSE res <- FALSE
if(length(x)!=0){ if(length(x)!=0){
x <- str_replace(x,",",".") x <- str_replace(x,",",".")
aux <- as.numeric(x) aux <- as.numeric(x)
if(!is.na(aux)) if(!is.na(aux))
res <- TRUE res <- TRUE
} }
return(res) return(res)
} }
check_values_not_categoric <- function(values, colname){ check_values_not_categoric <- function(values, colname){
valid_vals <- values valid_vals <- values
possible_vals <- possible_values(colname) possible_vals <- possible_values(colname)
for(i in 1:length(values)){ for(i in 1:length(values)){
res<- FALSE res<- FALSE
value <- values[[i]] value <- values[[i]]
if(is_number(value)){ if(is_number(value)){
value <- str_replace(value,",",".") value <- str_replace(value,",",".")
value <- as.numeric(value) value <- as.numeric(value)
} }
if(is.null(value)){ if(is.null(value)){
res <- TRUE res <- TRUE
} }
else if( value == "NA" | value == "nan" | value == ".") else if( value == "NA" | value == "nan" | value == ".")
res <- TRUE res <- TRUE
else{ else{
if(nrow(possible_vals) == 2 & (!grepl("DAT",colname, fixed=TRUE)) & colname !="CSXCTR"){ if(nrow(possible_vals) == 2 & (!grepl("DAT",colname, fixed=TRUE)) & colname !="CSXCTR"){
if(colname=="LBXBEH" | colname=="LBXBEHn" | colname=="LBXBEM"){ if(colname=="LBXBEH" | colname=="LBXBEHn" | colname=="LBXBEM"){
lower = possible_vals[1,1][[1]] lower = possible_vals[1,1][[1]]
higher = possible_vals[2,1][[1]] higher = possible_vals[2,1][[1]]
}else{ }else{
bounds <- as.data.frame(strsplit(possible_vals[1,1], '-')) bounds <- as.data.frame(strsplit(possible_vals[1,1], '-'))
lowerAux <- str_replace(bounds [1,1],",",".") lowerAux <- str_replace(bounds [1,1],",",".")
higherAux <- str_replace(bounds [2,1],",",".") higherAux <- str_replace(bounds [2,1],",",".")
lower <- as.numeric(remove_space(lowerAux)) lower <- as.numeric(remove_space(lowerAux))
higher <- as.numeric(remove_space(higherAux)) higher <- as.numeric(remove_space(higherAux))
} }
if ((value >= lower & value <= higher)) if ((value >= lower & value <= higher))
res <- TRUE res <- TRUE
} }
if(nrow(possible_vals) == 3 | colname=="CSXCTR"){ if(nrow(possible_vals) == 3 | colname=="CSXCTR"){
if(value == 0 | value == 1) if(value == 0 | value == 1)
res <- TRUE res <- TRUE
} }
if(nrow(possible_vals) > 3){ if(nrow(possible_vals) > 3){
lower <- strtoi(remove_space(possible_vals[1,1])) lower <- strtoi(remove_space(possible_vals[1,1]))
higher_b <- nrow(possible_vals)-1 higher_b <- nrow(possible_vals)-1
higher <- strtoi(remove_space(possible_vals[higher_b,1])) higher <- strtoi(remove_space(possible_vals[higher_b,1]))
if ((value >= lower & value <= higher)) if ((value >= lower & value <= higher))
res <- TRUE res <- TRUE
} }
} }
if(res == FALSE) if(res == FALSE)
valid_vals[i] <- res valid_vals[i] <- res
} }
return(valid_vals) return(valid_vals)
} }
possible_values <- function(x){ possible_values <- function(x){
if(x=="LBXBEH" | x=="LBXBEHn" | x=="LBXBEM") if(x=="LBXBEH" | x=="LBXBEHn" | x=="LBXBEM")
res <- t(as.data.frame(list(-20,20))) res <- t(as.data.frame(list(-20,20)))
else{ else{
possible_value <- subset(harmonised_data,harmonised_data$Harmonised.variable.name==x)[1,5] possible_value <- subset(harmonised_data,harmonised_data$Harmonised.variable.name==x)[1,5]
res <- strsplit(x=possible_value,split="/") res <- strsplit(x=possible_value,split="/")
} }
return(as.data.frame(res)) return(as.data.frame(res))
} }
possible_values_categoric <- function(x){ possible_values_categoric <- function(x){
possible_value <- subset(harmonised_data,harmonised_data$Harmonised.variable.name==x)[1,4] possible_value <- subset(harmonised_data,harmonised_data$Harmonised.variable.name==x)[1,4]
res <- strsplit(x=possible_value,split="/") res <- strsplit(x=possible_value,split="/")
return(as.data.frame(res)) return(as.data.frame(res))
} }
check_values_categoric <- function(values, colname){ check_values_categoric <- function(values, colname){
possible_vals <- possible_values_categoric(colname) possible_vals <- possible_values_categoric(colname)
res <- TRUE res <- TRUE
for(i in 1:length(values)){ for(i in 1:length(values)){
if(!(values[[i]] %in% as.matrix(remove_spaces_from_ds(possible_vals)))){ if(!(values[[i]] %in% as.matrix(remove_spaces_from_ds(possible_vals)))){
res <- FALSE res <- FALSE
} }
} }
return(res) return(res)
} }
get_values_from_quantiles <- function(x){ get_values_from_quantiles <- function(x){
data_summary <- ds.summary(x) data_summary <- ds.summary(x)
low_quantile <- data_summary[[1]][3][[1]][[1]] low_quantile <- data_summary[[1]][3][[1]][[1]]
high_quantile <- data_summary[[1]][3][[1]][[7]] high_quantile <- data_summary[[1]][3][[1]][[7]]
return(list(low_quantile,high_quantile)) return(list(low_quantile,high_quantile))
} }
notify_error <- function(invalid_name_list,invalid_value_list,wrong_categoric, wrong_categoric_values, missing_numeric){ notify_error <- function(invalid_name_list,invalid_value_list,wrong_categoric, wrong_categoric_values, missing_numeric){
res <- "" res <- ""
if(length(invalid_name_list) !=0){ if(length(invalid_name_list) !=0){
res <- "There are invalid values in the numeric fields:" res <- "There are invalid values in the numeric fields:"
for(i in 1:length(invalid_name_list)){ for(i in 1:length(invalid_name_list)){
res <- paste(res, invalid_name_list[i], sep=" ") res <- paste(res, invalid_name_list[i], sep=" ")
} }
res <- paste(res, "\n", sep="") res <- paste(res, "\n", sep="")
for(i in 1:length(invalid_name_list)){ for(i in 1:length(invalid_name_list)){
res <- paste(res, error_message(invalid_name_list[i], invalid_value_list[[i]]), sep=" ") res <- paste(res, error_message(invalid_name_list[i], invalid_value_list[[i]]), sep=" ")
} }
} }
if(length(wrong_categoric)!=0){ if(length(wrong_categoric)!=0){
res <- paste(res, "\n############################################################################ \n", sep="") res <- paste(res, "\n############################################################################ \n", sep="")
res <- paste(res,"\nThe following categoric values are invalid:", sep=" ") res <- paste(res,"\nThe following categoric values are invalid:", sep=" ")
for(i in 1:length(wrong_categoric)){ for(i in 1:length(wrong_categoric)){
res <- paste(res, wrong_categoric[i], sep=" ") res <- paste(res, wrong_categoric[i], sep=" ")
} }
res <- paste(res, "\n", sep="") res <- paste(res, "\n", sep="")
for(i in 1:length(wrong_categoric)){ for(i in 1:length(wrong_categoric)){
res <- paste(res, error_message_categoric(wrong_categoric[i], wrong_categoric_values[[i]]), sep=" ") res <- paste(res, error_message_categoric(wrong_categoric[i], wrong_categoric_values[[i]]), sep=" ")
} }
} }
if(length(missing_numeric)!=0){ if(length(missing_numeric)!=0){
res <- paste(res, "\n############################################################################ \n", sep="") res <- paste(res, "\n############################################################################ \n", sep="")
res<- paste(res, "\nThe following fields are missing a numeric field:") res<- paste(res, "\nThe following fields are missing a numeric field:")
for(i in 1:length(wrong_categoric)){ for(i in 1:length(wrong_categoric)){
res <- paste(res, missing_numeric[i], sep=" ") res <- paste(res, missing_numeric[i], sep=" ")
} }
} }
res <- paste(res, "\n", sep="") res <- paste(res, "\n", sep="")
return(res) return(res)
} }
error_message_categoric <- function(colname, invalid_values){ error_message_categoric <- function(colname, invalid_values){
res<- "\nValues in the field" res<- "\nValues in the field"
res<- paste(res, colname, sep=" ") res<- paste(res, colname, sep=" ")
res<- paste(res, "should be", sep=" ") res<- paste(res, "should be", sep=" ")
range <- possible_values_categoric(colname) range <- possible_values_categoric(colname)
for(i in 1:nrow(range)){ for(i in 1:nrow(range)){
res <- paste(res, remove_space(range[i,1]), sep=" ") res <- paste(res, remove_space(range[i,1]), sep=" ")
} }
res<- paste(res, "\nBut values were:", sep=" ") res<- paste(res, "\nBut values were:", sep=" ")
for(j in 1:length(invalid_values)){ for(j in 1:length(invalid_values)){
res<- paste(res, invalid_values[[j]], sep=" ") res<- paste(res, invalid_values[[j]], sep=" ")
} }
res<- paste(res, "\n\n", sep="") res<- paste(res, "\n\n", sep="")
return(res) return(res)
} }
error_message <- function(colname, invalid_values){ error_message <- function(colname, invalid_values){
res<- "\nValues in the field" res<- "\nValues in the field"
res<- paste(res, colname, sep=" ") res<- paste(res, colname, sep=" ")
res<- paste(res, "should be", sep=" ") res<- paste(res, "should be", sep=" ")
if(grepl("numeric", colname,fixed=TRUE)) if(grepl("numeric", colname,fixed=TRUE))
new_colname <- strsplit(x=colname,split="_")[[1]][1] new_colname <- strsplit(x=colname,split="_")[[1]][1]
else else
new_colname <- colname new_colname <- colname
range <- subset(harmonised_data, harmonised_data$Harmonised.variable.name == new_colname) range <- subset(harmonised_data, harmonised_data$Harmonised.variable.name == new_colname)
range <- range[5] range <- range[5]
range <- as.data.frame(strsplit(range[1,1], '/')) range <- as.data.frame(strsplit(range[1,1], '/'))
#Range of values or null #Range of values or null
if(nrow(range) == 2 & !grepl("DAT",colname, fixed=TRUE)){ if(nrow(range) == 2 & !grepl("DAT",colname, fixed=TRUE)){
bounds <- as.data.frame(strsplit(range[1,1], '-')) bounds <- as.data.frame(strsplit(range[1,1], '-'))
lower <- remove_space(bounds [1,1]) lower <- remove_space(bounds [1,1])
higher <- remove_space(bounds [2,1]) higher <- remove_space(bounds [2,1])
res<- paste(res, "numbers between", sep=" ") res<- paste(res, "numbers between", sep=" ")
res<- paste(res, lower, sep=" ") res<- paste(res, lower, sep=" ")
res<- paste(res, "and", sep=" ") res<- paste(res, "and", sep=" ")
res<- paste(res, higher, sep=" ") res<- paste(res, higher, sep=" ")
res<- paste(res, "(both included)", sep=" ") res<- paste(res, "(both included)", sep=" ")
} }
if(nrow(range) == 3){ if(nrow(range) == 3){
res<- paste(res, "0 or 1", sep=" ") res<- paste(res, "0 or 1", sep=" ")
} }
if(nrow(range) > 3){ if(nrow(range) > 3){
lower <- strtoi(remove_space(range[1,1])) lower <- strtoi(remove_space(range[1,1]))
higher_b <- nrow(range)-1 higher_b <- nrow(range)-1
higher <- strtoi(remove_space(range[higher_b,1])) higher <- strtoi(remove_space(range[higher_b,1]))
res<- paste(res, "numbers between", sep=" ") res<- paste(res, "numbers between", sep=" ")
res<- paste(res, lower, sep=" ") res<- paste(res, lower, sep=" ")
res<- paste(res, "and", sep=" ") res<- paste(res, "and", sep=" ")
res<- paste(res, higher, sep=" ") res<- paste(res, higher, sep=" ")
res<- paste(res, "(both included)", sep=" ") res<- paste(res, "(both included)", sep=" ")
} }
if(grepl("DAT",colname, fixed=TRUE)){ if(grepl("DAT",colname, fixed=TRUE)){
res<- paste(res, "dates with the following format: dd/mm/yyyy", sep=" ") res<- paste(res, "dates with the following format: dd/mm/yyyy", sep=" ")
} }
res<- paste(res, "\nBut values were:", sep=" ") res<- paste(res, "\nBut values were:", sep=" ")
for(j in 1:length(invalid_values)){ for(j in 1:length(invalid_values)){
res<- paste(res, invalid_values[[j]], sep=" ") res<- paste(res, invalid_values[[j]], sep=" ")
} }
res<- paste(res, "\n", sep="") res<- paste(res, "\n", sep="")
if(!is_number(invalid_values[[1]])) if(!is_number(invalid_values[[1]]))
res<- paste(res, "(It's missing a \"numeric\" field)", sep="") res<- paste(res, "(It's missing a \"numeric\" field)", sep="")
res<- paste(res, "\n", sep="") res<- paste(res, "\n", sep="")
return(res) return(res)
} }
check_valid_values <- function(){ check_valid_values <- function(){
invalid_name_list <- c() invalid_name_list <- c()
cannot_analyse_list <- c() cannot_analyse_list <- c()
invalid_values_list <- list() invalid_values_list <- list()
wrong_categoric_values <- list() wrong_categoric_values <- list()
wrong_categoric <- c() wrong_categoric <- c()
missing_numeric <- c() missing_numeric <- c()
j<- 1 j<- 1
k <- 1 k <- 1
for(i in 1:(nrow(valid_colnames))){ for(i in 1:(nrow(valid_colnames))){
data_table ="empty" data_table ="empty"
if(!grepl("DMRBORN",valid_colnames[i,1], fixed=TRUE) & (!grepl("DAT",valid_colnames[i,1], fixed=TRUE)) & (!grepl("ISO",valid_colnames[i,1], fixed=TRUE))& (!grepl("BEF",valid_colnames[i,1], fixed=TRUE))){ if(!grepl("DMRBORN",valid_colnames[i,1], fixed=TRUE) & (!grepl("DAT",valid_colnames[i,1], fixed=TRUE)) & (!grepl("ISO",valid_colnames[i,1], fixed=TRUE))& (!grepl("BEF",valid_colnames[i,1], fixed=TRUE))){
column <- "data$" column <- "data$"
column <- paste(column, valid_colnames[i,1], sep="") column <- paste(column, valid_colnames[i,1], sep="")
tryCatch( tryCatch(
error = function(cnd) { error = function(cnd) {
print("Unable to analyse data") print("Unable to analyse data")
res <- FALSE res <- FALSE
}, },
data_table <- as.data.frame(ds.table(column)) data_table <- as.data.frame(ds.table(column))
) )
if(data_table == "empty"){ if(data_table == "empty"){
cannot_analyse_list <- c(cannot_analyse_list,valid_colnames[i,1]) cannot_analyse_list <- c(cannot_analyse_list,valid_colnames[i,1])
}else{ }else{
if (data_table[[1]] == "All studies failed for reasons identified below") if (data_table[[1]] == "All studies failed for reasons identified below")
values <- get_values_from_quantiles(column) values <- get_values_from_quantiles(column)
else else
values <- row.names(data_table) values <- row.names(data_table)
numeric_col<- paste(valid_colnames[i,1],"_numeric", sep="") numeric_col<- paste(valid_colnames[i,1],"_numeric", sep="")
if( valid_colnames[i,1] %in% categoric_vars ){ if( valid_colnames[i,1] %in% categoric_vars ){
#is_numeric <- grepl("numeric",valid_colnames[i,1], fixed=TRUE) #is_numeric <- grepl("numeric",valid_colnames[i,1], fixed=TRUE)
has_numeric <- numeric_col %in% valid_colnames$`valid_data_colnames(data_colnames)` has_numeric <- numeric_col %in% valid_colnames$`valid_data_colnames(data_colnames)`
if(!has_numeric) if(!has_numeric)
missing_numeric <- c(missing_numeric, valid_colnames[i,1]) missing_numeric <- c(missing_numeric, valid_colnames[i,1])
if (data_table[[1]] == "All studies failed for reasons identified below"){ if (data_table[[1]] == "All studies failed for reasons identified below"){
cannot_analyse_list <- c(cannot_analyse_list,valid_colnames[i,1]) cannot_analyse_list <- c(cannot_analyse_list,valid_colnames[i,1])
}else if(!check_values_categoric(values,valid_colnames[i,1])){ }else if(!check_values_categoric(values,valid_colnames[i,1])){
print("Wrong categoric value:") print("Wrong categoric value:")
print(valid_colnames[i,1]) print(valid_colnames[i,1])
wrong_categoric <- c(wrong_categoric, valid_colnames[i,1]) wrong_categoric <- c(wrong_categoric, valid_colnames[i,1])
wrong_categoric_values[[k]] <- values wrong_categoric_values[[k]] <- values
k <- k+1 k <- k+1
} }
# if((!is_numeric & !has_numeric) | is_numeric) # if((!is_numeric & !has_numeric) | is_numeric)
}else{ }else{
if(grepl("numeric", valid_colnames[i,1],fixed=TRUE)) if(grepl("numeric", valid_colnames[i,1],fixed=TRUE))
new_colname <- strsplit(x=valid_colnames[i,1],split="_")[[1]][1] new_colname <- strsplit(x=valid_colnames[i,1],split="_")[[1]][1]
else else
new_colname <- valid_colnames[i,1] new_colname <- valid_colnames[i,1]
valid <- check_values_not_categoric(values, new_colname) valid <- check_values_not_categoric(values, new_colname)
if (FALSE %in% valid){ if (FALSE %in% valid){
invalid_name_list <- c(invalid_name_list,valid_colnames[i,1]) invalid_name_list <- c(invalid_name_list,valid_colnames[i,1])
invalid_values_list[[j]] <- values invalid_values_list[[j]] <- values
j <- j+1 j <- j+1
} }
#print(valid_colnames[i,1]) #print(valid_colnames[i,1])
#print(values) #print(values)
}#else }#else
# print("This variable has a numeric version") # print("This variable has a numeric version")
} }
} }
} }
missing_numeric missing_numeric
if(length(invalid_name_list)>0 | length(wrong_categoric)>0 | length(missing_numeric)>0){ if(length(invalid_name_list)>0 | length(wrong_categoric)>0 | length(missing_numeric)>0){
res <- notify_error(invalid_name_list, invalid_values_list, wrong_categoric, wrong_categoric_values, missing_numeric) res <- notify_error(invalid_name_list, invalid_values_list, wrong_categoric, wrong_categoric_values, missing_numeric)
}else{ }else{
res <- "All values are valid \n" res <- "All values are valid \n"
} }
if(length(cannot_analyse_list)>0){ if(length(cannot_analyse_list)>0){
res <- paste(res, "\n############################################################################ \n", sep="") res <- paste(res, "\n############################################################################ \n", sep="")
res <- paste(res, notify_unable_analyse(cannot_analyse_list), sep="\n" ) res <- paste(res, notify_unable_analyse(cannot_analyse_list), sep="\n" )
} }
return(res) return(res)
} }
notify_unable_analyse <- function(x){ notify_unable_analyse <- function(x){
res <- "\nCould not obtain data from the fields:" res <- "\nCould not obtain data from the fields:"
for(i in 1:length(x)){ for(i in 1:length(x)){
res <- paste(res, x[i], sep=" ") res <- paste(res, x[i], sep=" ")
} }
return (res) return (res)
} }
auxConnections <- connect() auxConnections <- connect()
connections <- auxConnections[[1]] connections <- auxConnections[[1]]
inp <- auxConnections[[2]] inp <- auxConnections[[2]]
#Conexión a la base de datos #Conexión a la base de datos
ds.dim("data", datasources = connections) ds.dim("data", datasources = connections)
ds.colnames("data") ds.colnames("data")
#---------------------------------------------------------------------------- #----------------------------------------------------------------------------
#Check valid column names #Check valid column names
datastructure_name <- "data" datastructure_name <- "data"
data_colnames <- ds.colnames(x=datastructure_name, datasources= connections) data_colnames <- ds.colnames(x=datastructure_name, datasources= connections)
data_colnames <- as.data.frame(data_colnames) data_colnames <- as.data.frame(data_colnames)
check_valid_columns <- check_column_names(data_colnames) check_valid_columns <- check_column_names(data_colnames)
valid_colnames <- as.data.frame(valid_data_colnames(data_colnames)) valid_colnames <- as.data.frame(valid_data_colnames(data_colnames))
#possible_values("CSXCTR") #possible_values("CSXCTR")
result <- "" result <- ""
result<-check_valid_values() result<-check_valid_values()
print(check_valid_columns) print(check_valid_columns)
datashield.logout(connections) datashield.logout(connections)
cat(result) cat(result)
# ds.dataFrameSubset(df.name = "data", V1.name = "data$DMXWT", "400" , Boolean.operator = '>', newobj = "columna") # ds.dataFrameSubset(df.name = "data", V1.name = "data$DMXWT", "400" , Boolean.operator = '>', newobj = "columna")
# # # #
# ds.summary("columna$DMXWT") # ds.summary("columna$DMXWT")
# ds.dim("columna$DMXWT") # ds.dim("columna$DMXWT")
# ds.table("columna$DMXWT") # ds.table("columna$DMXWT")
file_name<- paste(hospital_name,"_invalid_values.txt", sep="") file_name<- paste(hospital_name,"_invalid_values.txt", sep="")
#ds.heatmapPlot("data$LBDSALSIA", "data$RFXHC_numeric",type="combine", datasources = connections) #ds.heatmapPlot("data$LBDSALSIA", "data$RFXHC_numeric",type="combine", datasources = connections)
#setwd("C:/Users/victor/Desktop/TFG/r-analytics-master/invalid_values") #setwd("C:/Users/victor/Desktop/TFG/r-analytics-master/invalid_values")
dir.create("../invalid_values", showWarnings = FALSE) dir.create("../invalid_values", showWarnings = FALSE)
setwd("../invalid_values") setwd("../invalid_values")
cat(check_valid_columns,file=file_name,sep="\n") cat(check_valid_columns,file=file_name,sep="\n")
cat(result,file=file_name,append=TRUE) cat(result,file=file_name,append=TRUE)
datashield.logout(connections) datashield.logout(connections)
rm(list=ls())
dir_name <- readline("Introduce the name of the directory please: ")
setwd(dir_name)
source("dependency_installer.R")
dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient", "openxlsx")
install_dependencies(dep_list)
#source("connection_parameters.R")
#source("necessary_functions_connection.R")
codebook <- read.csv("new_harmon.csv" , sep = ",")
codebook_col_names <- as.data.frame(codebook$Harmonised.variable.name)
names(codebook_col_names) <- c("col_names")
setwd(paste(dir_name ,"/harmonized_data", sep=""))
file_name <- readline("Introduce the name of the file to check the values: ")
harmonized_data <- ""
if (grepl(".csv" , file_name , fixed = TRUE)){
harmonized_data <- read.csv(file_name)
}else if (grepl(".xlsx" , file_name , fixed = TRUE)){
harmonized_data <- read.xlsx(file_name)
}
#----------------------------------------------------------------------------
#Test if column names are valid
check_column_names <- function(col_names){
str_res <- "The column names:"
valid_colnames <- c()
repeated_colnames <- c()
for(i in 1:(nrow(col_names))){
col_name <- col_names[i,1]
number_of_column <- check_valid_name(col_name)
if( number_of_column == 0){
str_res<- paste(str_res, col_name, sep=" ")
}else if (number_of_column == 1){
valid_colnames = c(valid_colnames, col_name)
}else{
repeated_colnames = c(repeated_colnames , col_name)
}
}
str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ")
new_list <- list("not_colnames" = str_res , "colnames" = valid_colnames , "repeated_colnames" = repeated_colnames)
return (new_list)
}
#Test if a single variable name is valid
check_valid_name <- function(col_name){
valid <- 0
if(col_name %in% codebook_col_names$col_names){
valid <- length(grep(col_name, names(harmonized_data)))
}
return (valid)
}
check_valid_values_continuous <- function(colname , codebook_param , column){
column <- column[column != "."]
possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
possible_values_list = str_split(possible_values_format , "/")[[1]]
# Fallará cuando el codebook no tenga min-max / .
range_as_str <- str_trim(possible_values_list[1])
missing_value_format <- str_trim(str_trim(possible_values_list[2]))
separate_range <- str_split(range_as_str , "-")[[1]]
min_value <- strtoi(separate_range[1])
max_value <- strtoi(separate_range[2])
failing_values <- column[column < min_value | column > max_value]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
str_res <- ""
if (number_of_failing_values == 0)
str_res <- "No failing values"
else{
failing_values <- failing_values[!is.na(failing_values)]
str_res <- paste("The failing values of column ", colname , paste(unlist(failing_values) , collapse =" "))
}
return(str_res)
}
check_valid_values_binary <- function(colname , column){
column <- column[column != "."]
failing_values <- column[column < 0 | column > 1]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
str_res <- ""
if (number_of_failing_values == 0)
str_res <- "No failing values"
else{
failing_values <- failing_values[!is.na(failing_values)]
str_res <- paste("The failing values of column ", colname , paste(unlist(failing_values) , collapse =" "))
}
return(str_res)
}
check_valid_values_categorical <- function(colname , codebook_param , column){
column <- column[column != "."]
possible_values_format <- codebook_param$Possible.values.format[codebook_param$Harmonised.variable.name == colname]
possible_values_list <- str_split(possible_values_format , "/")[[1]]
possible_values_list <- lapply(possible_values_list , str_trim)
str_res <- ""
min_value <- 0
max_value <- 0
if (length(possible_values_list[[1]]) == 2){
separate_range <- str_split(possible_values_list[[1]][1], "-")[[1]]
min_value <- strtoi(separate_range[1])
max_value <- strtoi(separate_range[2])
}else{
possible_values_list <- lapply(possible_values_list , strtoi)[[1]]
min_value <- possible_values_list[1]
max_value <- possible_values_list[length(possible_values_list) - 1]
}
failing_values <- column[column < min_value | column > max_value ]
number_of_failing_values <- length(failing_values[!is.na(failing_values)])
if(number_of_failing_values == 0){
str_res <- "No failing values"
}else{
failing_values <- failing_values[!is.na(failing_values)]
str_res <- paste("The failing values of column ", colname , paste(unlist(failing_values) , collapse =" "))
}
}
check_valid_values <- function(valid_colnames, codebook_param){
res <- ""
for(i in 1:(ncol(valid_colnames))){
name <- names(valid_colnames)[i]
#if("DMRBORN" == name | grepl("DAT", name, fixed=TRUE) | grepl("ISO", name , fixed=TRUE) | grepl("BEF", name, fixed=TRUE)){
# next
#}
column <- valid_colnames[,i]
# Esto falla si tu codebook no es mismo que new_harmon.csv
column_type <- codebook_param$Variable.type[codebook_param$Harmonised.variable.name == name]
result = switch(
column_type,
"Continuous"= check_valid_values_continuous(name , codebook_param , column),
"Binary"= check_valid_values_binary(name , column),
"Categorical"= check_valid_values_categorical(name, codebook_param , column),
"Calendar date" = paste("No failing values"),
"ISO country code"= paste("No failing values"),
)
if (result != "No failing values"){
res <- paste(res , result, sep="\n")
}
}
return(res)
}
data_colnames <- as.data.frame(colnames(harmonized_data))
check_valid_columns <- check_column_names(data_colnames)
columns_not_valid <- check_valid_columns$not_colnames
valid_colnames_column <- as.data.frame(check_valid_columns$colnames)
names(valid_colnames_column) = c("valid_colnames")
valid_colnames_with_data <- subset(harmonized_data , select = valid_colnames_column$valid_colnames)
result <- ""
result<-check_valid_values(valid_colnames_with_data, codebook)
print(check_valid_columns)
#datashield.logout(connections)
cat(result)
file_name<- paste(hospital_name,"_invalid_values.txt", sep="")
dir.create("../invalid_values", showWarnings = FALSE)
setwd("../invalid_values")
cat(check_valid_columns,file=file_name,sep="\n")
cat(result,file=file_name,append=TRUE)
#datashield.logout(connections)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment