# Cleans one hospital outpatient resource (CSV) against the harmonised
# variable catalogue and writes the cleaned table to the "clean" folder.
#
# Steps:
#   1. Read the raw hospital CSV and drop every column whose name contains
#      "numeric".
#   2. Read the harmonised variable definitions and collect the names of the
#      variables whose declared format is No/Yes.
#   3. Normalise every column: No/Yes variables, personalised categorical
#      variables, and non-categorical (numeric) variables.
#   4. Write the cleaned table as CSV.

# Start from an empty workspace (kept from the original script).
rm(list = ls())

# dplyr supplies %>%, select() and contains(); stringr supplies str_replace().
# The script used them without attaching the packages.
library(dplyr)
library(stringr)

# ---- Read the raw resource ----

# Change the directory / file name to the resource that has to be cleaned.
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient")
hospital <- read.csv("konya_outpatient.csv", sep = ",")
hospital <- hospital %>% select(-contains("numeric"))

# Previously used (disabled) removal of the NOT.HARMONISED columns:
# hospital["NOT.HARMONISED"] <- NULL
#
# names <- colnames(hospital)
# for (i in 1:length(names)){
#
#   if(grepl("NOT.HARMONISED", names[i])){
#     hospital[names[i]] <- NULL
#     print(paste("quito ", names[i]))
#   }
#
# }

# Drop the first data row.
# NOTE(review): in the whitespace-collapsed source it is ambiguous whether this
# statement was active or the last line of the disabled block above -- confirm.
# drop = FALSE keeps a data frame even when only one column is left.
hospital <- hospital[-1, , drop = FALSE]

# ---- Read the harmonised variable catalogue ----

setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data")

# Only the first five columns and a fixed number of rows are used per file.
ComAndRF <- read.csv("Com&RF.csv", sep = ",")[1:64, 1:5]
Complications <- read.csv("Complications.csv", sep = ";")[1:20, 1:5]
Dates <- read.csv("Dates.csv", sep = ";")[1:12, 1:5]
Demographics <- read.csv("Demographics.csv", sep = ";")[1:9, 1:5]
Home_med <- read.csv("Home_med.csv", sep = ";")[1:13, 1:5]
Imaging_data <- read.csv("Imaging_data.csv", sep = ";")[1:11, 1:5]
Labo <- read.csv("Labo.csv", sep = ";")[1:143, 1:5]
SiAndSympt <- read.csv("Si&Sympt.csv", sep = ";")[1:50, 1:5]
Treatment <- read.csv("Treatment.csv", sep = ";")[1:32, 1:5]
LifestyleAndDiet <- read.csv("Lifestyle&Diet.csv", sep = ";")[1:165, 1:5]

# Stack the catalogues in the same order as before.
harmonised_data <- rbind(
  SiAndSympt, ComAndRF, Treatment, Dates, Demographics,
  Home_med, Imaging_data, Complications, Labo, LifestyleAndDiet
)

# Release the per-file tables. The original list named "Complications" twice
# (which raises an "object not found" warning) and left out "Treatment".
rm(list = c(
  "SiAndSympt", "Complications", "ComAndRF", "Dates", "Demographics",
  "Home_med", "Imaging_data", "Treatment", "Labo", "LifestyleAndDiet"
))

# ---- Variable groups ----

# Variables whose harmonised format is No/Yes, plus a few added by hand.
is_no_yes_format <- harmonised_data$Harmonised.data.format.unit %in%
  c("No/Yes / missing", "No/Yes / Missing")
noYesValues <- harmonised_data$Harmonised.variable.name[is_no_yes_format]
noYesValues <- c(
  noYesValues,
  "CSXCOTAB", "CSXCOTAG", "IMDIT", "RFXHIV_RFXAIDS", "SMXASAH", "CMXATH",
  "CMXNO"
)

# Categorical variables: these are never passed through fixNonCategoric().
categoric_vars <- c(
  "CMXATH", "CMXNO", "SMXASAH", "CSXCOTAB", "CSXCOTAG", "IMDIT",
  "RFXHIV_RFXAIDS", "DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU",
  "DSXOS", "DSXHO", "DSXIC",
  "TRXAV", "TRXRIB", "TRXLR", "TRXRM", "TRXIA", "TRXIB", "TRXCH", "TRXAB",
  "TRXCS", "TRXHEP", "TRXAF", "TRXCP", "TRXOT", "TRXECM", "TRXIV", "TRXNIV",
  "TRXNO", "TRXOX", "TRXRR", "TRXTR", "TRXVA", "TRXPE", "TRXPV", "TRXIT",
  "TRXNMB", "TRXAC", "TRXINA", "TRXIS", "TRXIM", "TRXVC", "TRXVD", "TRXZN",
  "CSXCOT", "CSXCTR", "SMXASAH", "SMXFEA", "SMXCOA", "SMXSTA", "SMXSBA",
  "SMXRNA", "SMXMYA", "SMXARA", "SMXCPA", "SMXAPA", "SMXINA", "SMXNAA",
  "SMXDIA", "SMXFAA", "SMXHEA", "SMXCNA", "SMXACA", "SMXSLA", "SMXTLA",
  "SMXSYA", "SMXWHA", "SMXLYA", "SMXANA", "SMXIWA", "SMXSRA", "SMXBLA",
  "CMXPRG", "CMXCVD", "CMXCMP", "CMXHT", "CMXDI", "CMXCKD", "CMXCLD", "CMXCPD",
  "CMXASM", "CMXCND", "CMXRHE", "CMXCCI", "CMXCBD", "CMXDE", "CMXPU", "CMXST",
  "CMXLY", "CMXAP", "RFXSM", "RFXFSM", "RFXOB", "RFXTB", "RFXIMD", "RFXHIV",
  "RFXAIDS", "RFXUI", "RFXHC", "RFXONC", "RFXMN",
  "HMRACI", "HMRARB", "HMRAHO", "HMRNS", "HMROS", "HMRCS", "HMRIS", "HMRAV",
  "HMRAB", "HMRCOV", "IMDXCT", "IMDXCTCR", "IMDXCTTE", "IMDXCTAB", "IMDXXR",
  "IMDXPN",
  "COXRD", "COXAR", "COXPM", "COXMOD", "COXPT", "COXEC", "COXSH", "COXIO",
  "COXPE", "COXST", "COXDIC", "COXRIO", "COXKF", "COXHF", "COXBC"
)

# Variables that are recoded through personalizedFun().
personalized <- c("DMRGENDR", "DSXOS", "CSXCTR", "SMXFEA", "CSXCOT")

# ---- Cell-level helpers ----

# Tell whether a single value can be read as a number.
#
# x: one cell value. The first "," is treated as a decimal separator.
# Returns TRUE when as.numeric() can parse the value, FALSE otherwise
# (including NA and zero-length input).
is_number <- function(x) {
  res <- FALSE
  if (length(x) != 0) {
    x <- str_replace(x, ",", ".")
    # Non-numeric text is expected here; silence the per-cell
    # "NAs introduced by coercion" warning.
    aux <- suppressWarnings(as.numeric(x))
    if (!is.na(aux)) {
      res <- TRUE
    }
  }
  res
}

# Normalise a single No/Yes value.
#
# x: one cell value.
# Returns "No" for 0 / "no", "Yes" for 1 / "yes" / "si" (case-insensitive,
# surrounding whitespace ignored) and "" for NA or anything else.
replaceNoYesValues <- function(x) {
  if (is.na(x)) {
    return("")
  }
  x <- trimws(toupper(x))
  if (x %in% c("0", "NO")) {
    "No"
  } else if (x %in% c("1", "YES", "SI")) {
    "Yes"
  } else {
    ""
  }
}

# Normalise a single value of a non-categorical variable.
#
# x: one cell value.
# Returns the value as text with the first "," replaced by ".", or "" when
# the value is not a number.
fixNonCategoric <- function(x) {
  if (!is_number(x)) {
    x <- ""
  } else {
    x <- str_replace(x, ",", ".")
  }
  x
}

# Recode a single value of a variable that has its own coding scheme.
#
# x:       one cell value.
# colname: name of the variable the value belongs to.
# Returns the recoded label. For DMRGENDR, CSXCTR, DMRRETH1, DMROCCU and
# DMRHREDU an unrecognised value is returned unchanged; for DSXOS and CSXCOT
# it becomes "". Values of any other variable are returned unchanged.
personalizedFun <- function(x, colname) {
  if (colname == "DMRGENDR") {
    if (is.na(x)) {
      x <- ""
    } else if (x == 1 || x == "F" || x == "f" || x == "Female") {
      x <- "Female"
    } else if (x == 0 || x == "M" || x == "m" || x == "Male") {
      x <- "Male"
    }
  }

  if (colname == "CSXCTR") {
    # The extra spellings are garbled variants found in the raw data.
    # NOTE(review): "NMGAT?VM" is mapped to "Positive" although it looks like
    # a garbled "NEGATIVE" (compare "NMGAT?V" / "NAGAT?VM" below). Behaviour is
    # kept as-is; confirm against the raw data before changing it.
    if (is.na(x)) {
      x <- ""
    } else if (x == 1 || x == "positive" || x == "PositivM" ||
      x == "POSITIVM" || x == "POS?T?VM" || x == "NMGAT?VM") {
      x <- "Positive"
    } else if (x == 0 || x == "negative" || x == "negativM" ||
      x == "NMGATIVM" || x == "NAGATIVM" || x == "NMGATIV" ||
      x == "negativeM" || x == "NAGAT?VM" || x == "NMGAT?V") {
      x <- "Negative"
    }
  }

  if (colname == "SMXFEA") {
    x <- replaceNoYesValues(x)
  }

  if (colname == "DMRRETH1") {
    if (is.na(x)) {
      x <- ""
    } else if (x == 1) {
      x <- "Asian"
    } else if (x == 2) {
      x <- "Black"
    } else if (x == 3) {
      x <- "Hispanic"
    } else if (x == 4) {
      x <- "White"
    } else if (x == 5) {
      x <- "Multiracial"
    } else if (x == 6) {
      x <- "Other"
    }
  }

  if (colname == "DMROCCU") {
    if (is.na(x)) {
      x <- ""
    } else if (x == 1) {
      x <- "Unemployed"
    } else if (x == 2) {
      x <- "Student"
    } else if (x == 3) {
      x <- "Employed"
    } else if (x == 4) {
      x <- "Self-employed"
    } else if (x == 5) {
      x <- "Retired"
    } else if (x == 6) {
      x <- ""
    }
  }

  if (colname == "DMRHREDU") {
    if (is.na(x)) {
      x <- ""
    } else if (x == 1) {
      x <- "High School"
    } else if (x == 2) {
      x <- "Bachelors"
    } else if (x == 3) {
      x <- "Postgraduate"
    } else if (x == 4) {
      x <- "Other"
    }
  }

  if (colname == "DSXOS") {
    if (is.na(x)) {
      x <- ""
    } else if (x == 0 || x == "Recovered") {
      x <- "Recovered"
    } else if (x == 1 || x == "Deceased") {
      x <- "Deceased"
    } else if (x == 2 || x == "Transferred") {
      x <- "Transferred"
    } else {
      x <- ""
    }
  }

  if (colname == "CSXCOT") {
    if (is.na(x)) {
      x <- ""
    } else if (x == 1) {
      x <- "PCR"
    } else if (x == 2) {
      x <- "antigen"
    } else if (x == 3) {
      x <- "other"
    } else {
      x <- ""
    }
  }

  x
}

# Convert a "dd.mm.YYYY" date string into "dd/mm/YYYY".
#
# x: one cell value.
# Returns the reformatted date when x contains a ".", otherwise x unchanged.
# Not called anywhere in this script.
dotToBar <- function(x) {
  if (grepl(".", x, fixed = TRUE)) {
    res <- format(as.Date(x, format = "%d.%m.%Y"), "%d/%m/%Y")
  } else {
    res <- x
  }
  res
}

# ---- Column-level cleaning ----

# Clean one whole column with the same rules the script applied cell by cell.
#
# values:  the raw column (one vector of the hospital table).
# colname: name of the column.
# Uses the script-level vectors noYesValues, categoric_vars and personalized.
# Returns the cleaned column, same length as `values`.
cleanColumn <- function(values, colname) {
  if (length(values) == 0) {
    return(values)
  }

  cleaned <- values
  if (colname %in% noYesValues) {
    cleaned <- vapply(values, replaceNoYesValues, character(1),
      USE.NAMES = FALSE
    )
  } else if (!(colname %in% categoric_vars) && colname != "DMRBORN" &&
    !grepl("DAT", colname, fixed = TRUE)) {
    cleaned <- vapply(values, fixNonCategoric, character(1),
      USE.NAMES = FALSE
    )
  }

  # Personalised recoding works on the raw value and overrides the result
  # above. unlist() keeps the column numeric when no value was recoded.
  if (colname %in% personalized) {
    cleaned <- unlist(
      lapply(values, personalizedFun, colname = colname),
      use.names = FALSE
    )
  }

  # Missing values and the "." placeholder always end up empty. Guarded so an
  # untouched column without blanks keeps its original type.
  blank <- is.na(values) | values == "."
  if (any(blank)) {
    cleaned[blank] <- ""
  }
  cleaned
}

# ---- Clean every column ----

newDf <- hospital
col_names <- colnames(hospital)
n_cols <- ncol(hospital)

# seq_len() makes the loop a no-op for an empty table (1:0 would run twice).
for (j in seq_len(n_cols)) {
  percentage <- trunc(j / n_cols * 100)
  print(paste0(toString(percentage), "% completed"))
  print(col_names[j])
  newDf[[j]] <- cleanColumn(hospital[[j]], col_names[j])
}

# ---- Write the cleaned resource ----

setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient/clean")
write.csv(x = newDf, file = "konya_outpatient.csv", row.names = FALSE)