Commit 3a586839 authored by Pepe Márquez Romero's avatar Pepe Márquez Romero

modificando para que el chequeo de variables no sea en datashield, sea en local

parent 6d57f049
*.xlsx
harmonized_data/*.csv
rm(list=ls()) rm(list=ls())
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master") dir_name <- readline("Introduce the name of the directory please: ")
setwd(dir_name)
source("required_folder_checker.R") source("required_folder_checker.R")
source("argument_hasher.R") source("argument_hasher.R")
source("dependency_installer.R") source("dependency_installer.R")
# install.packages("https://cran.r-project.org/src/contrib/Archive/DSI/DSI_1.2.0.tar.gz", repos=NULL, type="source")
# install.packages("https://cran.r-project.org/src/contrib/Archive/DSOpal/DSOpal_1.2.0.tar.gz", repos=NULL, type="source")
# install.packages("https://cran.r-project.org/src/contrib/Archive/DSLite/DSLite_1.2.0.tar.gz", repos=NULL, type="source")
# install.packages("https://cran.r-project.org/src/contrib/Archive/opalr/opalr_2.1.0.tar.gz", repos=NULL, type="source")
dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient") dep_list = c("jsonlite", "stringr","DSI","DSOpal","DSLite", "fields", "metafor", "ggplot2", "gridExtra", "data.table", "dsBaseClient", "openxlsx")
install_dependencies(dep_list) install_dependencies(dep_list)
#,"DSI","DSOpal","DSLite"
setwd("C:/Users/victor/Documents/TFG/r-analytics-master")
source("connection_parameters.R")
source("necessary_functions_connection.R")
setwd(dir_name)
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data") #source("connection_parameters.R")
#source("necessary_functions_connection.R")
codebook <- read.csv("harmon.csv" , sep = ";")
ComAndRF <- data.frame(read.csv("Com&RF.csv", sep=","))[1:64,1:5] codebook_col_names <- as.data.frame(codebook$Harmonised.variable.name)
Complications <- data.frame(read.csv("Complications.csv", sep=";"))[1:20,1:5]
Dates <- data.frame(read.csv("Dates.csv", sep=";"))[1:12,1:5]
Demographics <- data.frame(read.csv("Demographics.csv", sep=";"))[1:9,1:5]
Home_med <- data.frame(read.csv("Home_med.csv", sep=";"))[1:13,1:5]
Imaging_data <- data.frame(read.csv("Imaging_data.csv", sep=";"))[1:11,1:5]
Labo <- data.frame(read.csv("Labo.csv", sep=";"))[1:143,1:5]
SiAndSympt <- data.frame(read.csv("Si&Sympt.csv", sep=";"))[1:50,1:5]
Treatment <- data.frame(read.csv("Treatment.csv", sep=";"))[1:32,1:5]
LifestyleAndDiet <- data.frame(read.csv("Lifestyle&Diet.csv", sep=";"))[1:165,1:5]
names(codebook_col_names) <- c("col_names")
setwd(paste(dir_name ,"/harmonized_data", sep=""))
harmonised_data <- rbind(SiAndSympt,ComAndRF) file_name <- readline("Introduce the name of the file to check the values: ")
harmonised_data <- rbind(harmonised_data,Treatment) harmonized_data <- ""
harmonised_data <- rbind(harmonised_data,Dates)
harmonised_data <- rbind(harmonised_data,Demographics) if (grepl(".csv" , file_name , fixed = TRUE)){
harmonised_data <- rbind(harmonised_data,Home_med) harmonized_data <- read.csv(file_name)
harmonised_data <- rbind(harmonised_data,Imaging_data) }else if (grepl(".xlsx" , file_name , fixed = TRUE)){
harmonised_data <- rbind(harmonised_data,Complications) harmonized_data <- read.xlsx(file_name)
harmonised_data <- rbind(harmonised_data,Labo) }
harmonised_data <- rbind(harmonised_data,LifestyleAndDiet)
rm(list=c("SiAndSympt",
"Complications",
"ComAndRF",
"Dates",
"Demographics",
"Home_med",
"Imaging_data",
"Complications",
"Labo",
"LifestyleAndDiet"))
categoric_vars = c("DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN", "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN", "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC") categoric_vars = c("DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN", "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN", "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC")
...@@ -65,50 +43,38 @@ categoric_vars = c("DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DS ...@@ -65,50 +43,38 @@ categoric_vars = c("DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DS
#---------------------------------------------------------------------------- #----------------------------------------------------------------------------
#Test if column names are valid #Test if column names are valid
check_column_names <- function(x){ check_column_names <- function(col_names){
str_res <- "The column names:" str_res <- "The column names:"
valid_colnames <- c()
for(i in 1:(nrow(data_colnames))){ for(i in 1:(nrow(col_names))){
if(!check_valid_name(data_colnames[i,1])){ col_name <- col_names[i,1]
str_res<- paste(str_res, data_colnames[i,1], sep=" ") if(!check_valid_name(col_name)){
str_res<- paste(str_res, col_name, sep=" ")
}else{
valid_colnames = c(valid_colnames, col_name)
} }
} }
str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ") str_res<- paste(str_res,"are not registered in the harmonized data codebook \n", sep=" ")
return (str_res) new_list <- list("not_colnames" = str_res , "colnames" = valid_colnames)
return (new_list)
} }
#Test if a single variable name is valid #Test if a single variable name is valid
check_valid_name <- function(x){ check_valid_name <- function(col_name){
valid <- FALSE valid <- FALSE
aux <- as.data.frame(strsplit(x , split = "_")) if(col_name %in% codebook_col_names$col_names)
if(aux[1,1] %in% harmonised_data$Harmonised.variable.name)
valid <- TRUE valid <- TRUE
return (valid) return (valid)
} }
valid_data_colnames <- function(x){
valid_colnames = c()
for(i in 1:(nrow(data_colnames))){
if(check_valid_name(data_colnames[i,1])){
valid_colnames = c(valid_colnames,data_colnames[i,1])
}
}
return(valid_colnames)
}
remove_space <- function(x){ remove_space <- function(x){
searchString <- ' ' searchString <- ' '
replacementString <- '' replacementString <- ''
...@@ -219,7 +185,7 @@ possible_values <- function(x){ ...@@ -219,7 +185,7 @@ possible_values <- function(x){
else{ else{
possible_value <- subset(harmonised_data,harmonised_data$Harmonised.variable.name==x)[1,5] possible_value <- subset(harmonized_data,harmonized_data$harmonized.variable.name==x)[1,5]
res <- strsplit(x=possible_value,split="/") res <- strsplit(x=possible_value,split="/")
} }
...@@ -228,7 +194,7 @@ possible_values <- function(x){ ...@@ -228,7 +194,7 @@ possible_values <- function(x){
possible_values_categoric <- function(x){ possible_values_categoric <- function(x){
possible_value <- subset(harmonised_data,harmonised_data$Harmonised.variable.name==x)[1,4] possible_value <- subset(harmonized_data,harmonized_data$harmonized.variable.name==x)[1,4]
res <- strsplit(x=possible_value,split="/") res <- strsplit(x=possible_value,split="/")
return(as.data.frame(res)) return(as.data.frame(res))
...@@ -254,7 +220,7 @@ check_values_categoric <- function(values, colname){ ...@@ -254,7 +220,7 @@ check_values_categoric <- function(values, colname){
get_values_from_quantiles <- function(x){ get_values_from_quantiles <- function(x){
data_summary <- ds.summary(x) data_summary <- summary(x)
low_quantile <- data_summary[[1]][3][[1]][[1]] low_quantile <- data_summary[[1]][3][[1]][[1]]
...@@ -357,7 +323,7 @@ error_message <- function(colname, invalid_values){ ...@@ -357,7 +323,7 @@ error_message <- function(colname, invalid_values){
else else
new_colname <- colname new_colname <- colname
range <- subset(harmonised_data, harmonised_data$Harmonised.variable.name == new_colname) range <- subset(harmonized_data, harmonized_data$harmonized.variable.name == new_colname)
range <- range[5] range <- range[5]
range <- as.data.frame(strsplit(range[1,1], '/')) range <- as.data.frame(strsplit(range[1,1], '/'))
...@@ -435,7 +401,7 @@ check_valid_values <- function(){ ...@@ -435,7 +401,7 @@ check_valid_values <- function(){
print("Unable to analyse data") print("Unable to analyse data")
res <- FALSE res <- FALSE
}, },
data_table <- as.data.frame(ds.table(column)) data_table <- as.data.frame(table(column))
) )
if(data_table == "empty"){ if(data_table == "empty"){
...@@ -475,7 +441,6 @@ check_valid_values <- function(){ ...@@ -475,7 +441,6 @@ check_valid_values <- function(){
k <- k+1 k <- k+1
} }
# if((!is_numeric & !has_numeric) | is_numeric)
}else{ }else{
if(grepl("numeric", valid_colnames[i,1],fixed=TRUE)) if(grepl("numeric", valid_colnames[i,1],fixed=TRUE))
...@@ -492,11 +457,7 @@ check_valid_values <- function(){ ...@@ -492,11 +457,7 @@ check_valid_values <- function(){
} }
#print(valid_colnames[i,1]) }
#print(values)
}#else
# print("This variable has a numeric version")
} }
...@@ -536,56 +497,32 @@ notify_unable_analyse <- function(x){ ...@@ -536,56 +497,32 @@ notify_unable_analyse <- function(x){
} }
auxConnections <- connect() data_colnames <- as.data.frame(colnames(harmonized_data))
connections <- auxConnections[[1]]
inp <- auxConnections[[2]]
#Conexión a la base de datos
ds.dim("data", datasources = connections)
ds.colnames("data")
#----------------------------------------------------------------------------
#Check valid column names
datastructure_name <- "data"
data_colnames <- ds.colnames(x=datastructure_name, datasources= connections)
data_colnames <- as.data.frame(data_colnames)
check_valid_columns <- check_column_names(data_colnames) check_valid_columns <- check_column_names(data_colnames)
valid_colnames <- as.data.frame(valid_data_colnames(data_colnames)) columns_not_valid <- check_valid_columns$not_colnames
valid_colnames <- as.data.frame(check_valid_columns$colnames)
names(valid_colnames) = c("valid_colnames")
#possible_values("CSXCTR")
result <- "" result <- ""
result<-check_valid_values() result<-check_valid_values()
print(check_valid_columns) print(check_valid_columns)
datashield.logout(connections) #datashield.logout(connections)
cat(result) cat(result)
# ds.dataFrameSubset(df.name = "data", V1.name = "data$DMXWT", "400" , Boolean.operator = '>', newobj = "columna")
# #
# ds.summary("columna$DMXWT")
# ds.dim("columna$DMXWT")
# ds.table("columna$DMXWT")
file_name<- paste(hospital_name,"_invalid_values.txt", sep="") file_name<- paste(hospital_name,"_invalid_values.txt", sep="")
#ds.heatmapPlot("data$LBDSALSIA", "data$RFXHC_numeric",type="combine", datasources = connections)
#setwd("C:/Users/victor/Desktop/TFG/r-analytics-master/invalid_values")
dir.create("../invalid_values", showWarnings = FALSE) dir.create("../invalid_values", showWarnings = FALSE)
setwd("../invalid_values") setwd("../invalid_values")
cat(check_valid_columns,file=file_name,sep="\n") cat(check_valid_columns,file=file_name,sep="\n")
cat(result,file=file_name,append=TRUE) cat(result,file=file_name,append=TRUE)
datashield.logout(connections) #datashield.logout(connections)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment