From 6d57f049d5b5da690f484076a80c674fc132d278 Mon Sep 17 00:00:00 2001 From: pxp9 Date: Thu, 2 Feb 2023 16:32:00 +0100 Subject: [PATCH] =?UTF-8?q?peque=C3=B1a=20mejora=20en=20la=20sustitucion?= =?UTF-8?q?=20de=20Yes/No?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ressourceCleaner.R | 565 +++++++++++++++++++++++---------------------- 1 file changed, 285 insertions(+), 280 deletions(-) diff --git a/ressourceCleaner.R b/ressourceCleaner.R index cb4f7ea..69090e8 100755 --- a/ressourceCleaner.R +++ b/ressourceCleaner.R @@ -1,280 +1,285 @@ - -rm(list=ls()) - -setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources") -setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient")) - -#Cambiarlo por el nombre del ressource que se desea limpiar -hospital <- data.frame(read.csv("konya_outpatient.csv", sep=",")) - - -hospital <- hospital %>% select(-contains("numeric")) - -# hospital["NOT.HARMONISED"] <- NULL -# -# names <- colnames(hospital) -# for (i in 1:length(names)){ -# -# if(grepl("NOT.HARMONISED", names[i])){ -# hospital[names[i]] <- NULL -# print(paste("quito ", names[i])) -# } -# -# } -# hospital <- hospital[-1,] - -setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data") - -ComAndRF <- data.frame(read.csv("Com&RF.csv", sep=","))[1:64,1:5] -Complications <- data.frame(read.csv("Complications.csv", sep=";"))[1:20,1:5] -Dates <- data.frame(read.csv("Dates.csv", sep=";"))[1:12,1:5] -Demographics <- data.frame(read.csv("Demographics.csv", sep=";"))[1:9,1:5] -Home_med <- data.frame(read.csv("Home_med.csv", sep=";"))[1:13,1:5] -Imaging_data <- data.frame(read.csv("Imaging_data.csv", sep=";"))[1:11,1:5] -Labo <- data.frame(read.csv("Labo.csv", sep=";"))[1:143,1:5] -SiAndSympt <- data.frame(read.csv("Si&Sympt.csv", sep=";"))[1:50,1:5] -Treatment <- data.frame(read.csv("Treatment.csv", sep=";"))[1:32,1:5] -LifestyleAndDiet <- data.frame(read.csv("Lifestyle&Diet.csv", sep=";"))[1:165,1:5] - - -harmonised_data <- rbind(SiAndSympt,ComAndRF) -harmonised_data <- rbind(harmonised_data,Treatment) - -harmonised_data <- rbind(harmonised_data,Dates) -harmonised_data <- rbind(harmonised_data,Demographics) -harmonised_data <- rbind(harmonised_data,Home_med) -harmonised_data <- rbind(harmonised_data,Imaging_data) -harmonised_data <- rbind(harmonised_data,Complications) -harmonised_data <- rbind(harmonised_data,Labo) -harmonised_data <- rbind(harmonised_data,LifestyleAndDiet) - -rm(list=c("SiAndSympt", - "Complications", - "ComAndRF", - "Dates", - "Demographics", - "Home_med", - "Imaging_data", - "Complications", - "Labo", - "LifestyleAndDiet")) - -noYesValues <- subset(harmonised_data, harmonised_data$Harmonised.data.format.unit == "No/Yes / missing" | harmonised_data$Harmonised.data.format.unit == "No/Yes / Missing") -noYesValues <- noYesValues$Harmonised.variable.name -noYesValues <- c(noYesValues,"CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS", "SMXASAH", "CMXATH", "CMXNO") - -categoric_vars = c("CMXATH", "CMXNO", "SMXASAH","CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS","DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN", "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN", "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC") - -personalized <- c("DMRGENDR", "DSXOS", "CSXCTR", "SMXFEA", "CSXCOT") - -is_number <- function(x){ - res <- FALSE - - if(length(x)!=0){ - x <- str_replace(x,",",".") - - aux <- as.numeric(x) - - if(!is.na(aux)) - res <- TRUE - } - return(res) -} - - -replaceNoYesValues <- function(x){ - - #Replace the value with Yes or No - if(is.na(x)) - x <- "" - else if (x==0 | x =="No" | x ==" No" | x =="NO") - x <- "No" - else if (x==1 | x == "Yes" | x == " Yes" | x=="SI") - x <- "Yes" - else - x <- "" - - return (x) - -} - -fixNonCategoric <- function(x){ - - if(!is_number(x)){ - x <- "" - }else{ - x <- str_replace(x,",",".") - } - - return(x) - -} - -personalizedFun <- function(x, colname){ - - if(colname == "DMRGENDR"){ - if(is.na(x)) - x <- "" - else if(x == 1 | x == "F"| x == "f" | x== "Female") - x <- "Female" - else if (x == 0 | x =="M" | x == "m" | x== "Male") - x <- "Male" - } - - if(colname == "CSXCTR"){ - if(is.na(x)) - x <- "" - else if(x == 1 | x == "positive" | x=="PositivM" | x=="POSITIVM" | x =="POS?T?VM" | x =="NMGAT?VM") - x <- "Positive" - else if (x == 0 | x == "negative" | x=="negativM" | x=="NMGATIVM" | x=="NAGATIVM" | x =="NMGATIV" | x =="negativeM" | x == "NAGAT?VM" | x =="NMGAT?V") - x <- "Negative" - } - - if(colname == "SMXFEA"){ - if(is.na(x)) - x <- "" - else if(x == 1) - x <- "Yes" - else if (x == 0) - x <- "No" - else if (x == ".") - x <- "" - } - - if(colname == "DMRRETH1"){ - if(is.na(x)) - x <- "" - else if(x ==1) - x <- "Asian" - else if (x == 2) - x <- "Black" - else if (x == 3) - x <- "Hispanic" - else if (x == 4) - x <- "White" - else if (x == 5) - x <- "Multiracial" - else if (x == 6) - x <- "Other" - } - - if(colname == "DMROCCU"){ - if(is.na(x)) - x <- "" - else if(x ==1) - x <- "Unemployed" - else if (x == 2) - x <- "Student" - else if (x == 3) - x <- "Employed" - else if (x == 4) - x <- "Self-employed" - else if (x == 5) - x <- "Retired" - else if (x == 6) - x <- "" - } - - if(colname == "DMRHREDU"){ - if(is.na(x)) - x <- "" - else if(x ==1) - x <- "High School" - else if (x == 2) - x <- "Bachelors" - else if (x == 3) - x <- "Postgraduate" - else if (x == 4) - x <- "Other" - } - - if(colname =="DSXOS"){ - - if(is.na(x)) - x <- "" - else if (x==0 | x == "Recovered") - x <- "Recovered" - else if (x==1 | x == "Deceased") - x <- "Deceased" - else if (x==2 | x == "Transferred") - x <- "Transferred" - else - x <- "" - } - - if(colname =="CSXCOT"){ - if(is.na(x)) - x <- "" - else if (x==1 ) - x <- "PCR" - else if (x==2 ) - x <- "antigen" - else if (x==3 ) - x <- "other" - else - x <- "" - } - - - return(x) - -} - -dotToBar <- function (x){ - - if (grepl(".", x, fixed = TRUE)) - res <- format(as.Date(x, format = "%d.%m.%Y"), "%d/%m/%Y") - else res <- x - return(res) - -} - - - -rm(newDf) - -newDf <- hospital - - - -names <- colnames(hospital) - - -for (j in 1:ncol(hospital)){ - - percentage <- trunc(j/ncol(hospital)*100) - mes <- paste(toString(percentage),"% completed", sep="") - print(mes) - - print(names[j]) - - for(i in 1:nrow(hospital)){ - - if(names[j] %in% noYesValues){ - newDf[i,j] <- replaceNoYesValues(hospital[i,j]) - }else if(!(names[j] %in% categoric_vars) & names[j] != "DMRBORN" & !grepl("DAT",names[j], fixed=TRUE)){ - newDf[i,j] <- fixNonCategoric(hospital[i,j]) - } - - if(names[j] %in% personalized){ - newDf[i,j] <- personalizedFun(hospital[i,j],names[j]) - } - - if (is.na(hospital[i,j])) - newDf[i,j] <- "" - - else if (hospital[i,j] == ".") - newDf[i,j] <- "" - - } -} - -setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/newRessources") -setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient/clean")) - - - - -write.csv(x=newDf, file = "konya_outpatient.csv", row.names = FALSE) - - + +rm(list=ls()) + +setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources") +setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient")) + +#Cambiarlo por el nombre del ressource que se desea limpiar +hospital <- data.frame(read.csv("konya_outpatient.csv", sep=",")) + + +hospital <- hospital %>% select(-contains("numeric")) + +# hospital["NOT.HARMONISED"] <- NULL +# +# names <- colnames(hospital) +# for (i in 1:length(names)){ +# +# if(grepl("NOT.HARMONISED", names[i])){ +# hospital[names[i]] <- NULL +# print(paste("quito ", names[i])) +# } +# +# } +# hospital <- hospital[-1,] + +setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data") + +ComAndRF <- data.frame(read.csv("Com&RF.csv", sep=","))[1:64,1:5] +Complications <- data.frame(read.csv("Complications.csv", sep=";"))[1:20,1:5] +Dates <- data.frame(read.csv("Dates.csv", sep=";"))[1:12,1:5] +Demographics <- data.frame(read.csv("Demographics.csv", sep=";"))[1:9,1:5] +Home_med <- data.frame(read.csv("Home_med.csv", sep=";"))[1:13,1:5] +Imaging_data <- data.frame(read.csv("Imaging_data.csv", sep=";"))[1:11,1:5] +Labo <- data.frame(read.csv("Labo.csv", sep=";"))[1:143,1:5] +SiAndSympt <- data.frame(read.csv("Si&Sympt.csv", sep=";"))[1:50,1:5] +Treatment <- data.frame(read.csv("Treatment.csv", sep=";"))[1:32,1:5] +LifestyleAndDiet <- data.frame(read.csv("Lifestyle&Diet.csv", sep=";"))[1:165,1:5] + + +harmonised_data <- rbind(SiAndSympt,ComAndRF) +harmonised_data <- rbind(harmonised_data,Treatment) + +harmonised_data <- rbind(harmonised_data,Dates) +harmonised_data <- rbind(harmonised_data,Demographics) +harmonised_data <- rbind(harmonised_data,Home_med) +harmonised_data <- rbind(harmonised_data,Imaging_data) +harmonised_data <- rbind(harmonised_data,Complications) +harmonised_data <- rbind(harmonised_data,Labo) +harmonised_data <- rbind(harmonised_data,LifestyleAndDiet) + +rm(list=c("SiAndSympt", + "Complications", + "ComAndRF", + "Dates", + "Demographics", + "Home_med", + "Imaging_data", + "Complications", + "Labo", + "LifestyleAndDiet")) + +noYesValues <- subset(harmonised_data, harmonised_data$Harmonised.data.format.unit == "No/Yes / missing" | harmonised_data$Harmonised.data.format.unit == "No/Yes / Missing") +noYesValues <- noYesValues$Harmonised.variable.name +noYesValues <- c(noYesValues,"CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS", "SMXASAH", "CMXATH", "CMXNO") + +categoric_vars = c("CMXATH", "CMXNO", "SMXASAH","CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS","DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN", "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN", "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC") + +personalized <- c("DMRGENDR", "DSXOS", "CSXCTR", "SMXFEA", "CSXCOT") + +is_number <- function(x){ + res <- FALSE + + if(length(x)!=0){ + x <- str_replace(x,",",".") + + aux <- as.numeric(x) + + if(!is.na(aux)) + res <- TRUE + } + return(res) +} + + +replaceNoYesValues <- function(x){ + + #Replace the value with Yes or No + if(is.na(x)){ + x <- "" + return (x) + } + + x <- trimws(toupper(x)) + if (x=="0" | x =="NO") + x <- "No" + else if (x=="1" | x == "YES" | x=="SI") + x <- "Yes" + else + x <- "" + + return (x) + +} + +fixNonCategoric <- function(x){ + + if(!is_number(x)){ + x <- "" + }else{ + x <- str_replace(x,",",".") + } + + return(x) + +} + +personalizedFun <- function(x, colname){ + + if(colname == "DMRGENDR"){ + if(is.na(x)) + x <- "" + else if(x == 1 | x == "F"| x == "f" | x== "Female") + x <- "Female" + else if (x == 0 | x =="M" | x == "m" | x== "Male") + x <- "Male" + } + + if(colname == "CSXCTR"){ + if(is.na(x)) + x <- "" + else if(x == 1 | x == "positive" | x=="PositivM" | x=="POSITIVM" | x =="POS?T?VM" | x =="NMGAT?VM") + x <- "Positive" + else if (x == 0 | x == "negative" | x=="negativM" | x=="NMGATIVM" | x=="NAGATIVM" | x =="NMGATIV" | x =="negativeM" | x == "NAGAT?VM" | x =="NMGAT?V") + x <- "Negative" + } + + if(colname == "SMXFEA"){ + x <- replaceNoYesValues(x) + #if(is.na(x)) + # x <- "" + #else if(x == 1) + # x <- "Yes" + #else if (x == 0) + # x <- "No" + #else if (x == ".") + # x <- "" + } + + if(colname == "DMRRETH1"){ + if(is.na(x)) + x <- "" + else if(x ==1) + x <- "Asian" + else if (x == 2) + x <- "Black" + else if (x == 3) + x <- "Hispanic" + else if (x == 4) + x <- "White" + else if (x == 5) + x <- "Multiracial" + else if (x == 6) + x <- "Other" + } + + if(colname == "DMROCCU"){ + if(is.na(x)) + x <- "" + else if(x ==1) + x <- "Unemployed" + else if (x == 2) + x <- "Student" + else if (x == 3) + x <- "Employed" + else if (x == 4) + x <- "Self-employed" + else if (x == 5) + x <- "Retired" + else if (x == 6) + x <- "" + } + + if(colname == "DMRHREDU"){ + if(is.na(x)) + x <- "" + else if(x ==1) + x <- "High School" + else if (x == 2) + x <- "Bachelors" + else if (x == 3) + x <- "Postgraduate" + else if (x == 4) + x <- "Other" + } + + if(colname =="DSXOS"){ + + if(is.na(x)) + x <- "" + else if (x==0 | x == "Recovered") + x <- "Recovered" + else if (x==1 | x == "Deceased") + x <- "Deceased" + else if (x==2 | x == "Transferred") + x <- "Transferred" + else + x <- "" + } + + if(colname =="CSXCOT"){ + if(is.na(x)) + x <- "" + else if (x==1 ) + x <- "PCR" + else if (x==2 ) + x <- "antigen" + else if (x==3 ) + x <- "other" + else + x <- "" + } + + + return(x) + +} + +dotToBar <- function (x){ + + if (grepl(".", x, fixed = TRUE)) + res <- format(as.Date(x, format = "%d.%m.%Y"), "%d/%m/%Y") + else res <- x + return(res) + +} + + + +rm(newDf) + +newDf <- hospital + + + +names <- colnames(hospital) + + +for (j in 1:ncol(hospital)){ + + percentage <- trunc(j/ncol(hospital)*100) + mes <- paste(toString(percentage),"% completed", sep="") + print(mes) + + print(names[j]) + + for(i in 1:nrow(hospital)){ + + if(names[j] %in% noYesValues){ + newDf[i,j] <- replaceNoYesValues(hospital[i,j]) + }else if(!(names[j] %in% categoric_vars) & names[j] != "DMRBORN" & !grepl("DAT",names[j], fixed=TRUE)){ + newDf[i,j] <- fixNonCategoric(hospital[i,j]) + } + + if(names[j] %in% personalized){ + newDf[i,j] <- personalizedFun(hospital[i,j],names[j]) + } + + if (is.na(hospital[i,j])) + newDf[i,j] <- "" + + else if (hospital[i,j] == ".") + newDf[i,j] <- "" + + } +} + +setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/newRessources") +setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient/clean")) + + + + +write.csv(x=newDf, file = "konya_outpatient.csv", row.names = FALSE) + + -- 2.24.1