# Adds numeric companion columns to the Konya outpatient extract.
#
# Steps:
#   1. Read the hospital extract and the harmonised-variable dictionaries.
#   2. Collect the variables whose harmonised format is "No/Yes / missing".
#   3. For every hospital column that is either a No/Yes variable or listed in
#      `personalized`, append a "<name>_numeric" column with the recoded value.
#   4. Replace NA cells in the original columns with "" and write the result.
#
# NOTE(review): the script still clears the global environment and changes the
# working directory, as the original did; consider passing paths explicitly.
rm(list = ls())

# ---- Load data ----

setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient/clean")
hospital <- read.csv("konya_outpatient.csv", sep = ",")

setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data")
ComAndRF <- read.csv("Com&RF.csv", sep = ",")
Complications <- read.csv("Complications.csv", sep = ";")[1:20, 1:5]
Dates <- read.csv("Dates.csv", sep = ";")[1:12, 1:5]
Demographics <- read.csv("Demographics.csv", sep = ";")[1:9, 1:5]
Home_med <- read.csv("Home_med.csv", sep = ";")[1:13, 1:5]
Imaging_data <- read.csv("Imaging_data.csv", sep = ";")[1:11, 1:5]
Labo <- read.csv("Labo.csv", sep = ";")[1:143, 1:5]
SiAndSympt <- read.csv("Si&Sympt.csv", sep = ";")[1:50, 1:5]
Treatment <- read.csv("Treatment.csv", sep = ";")[1:32, 1:5]
LifestyleAndDiet <- read.csv("Lifestyle&Diet.csv", sep = ";")[1:165, 1:5]

# Stack every dictionary into one table (same order as before).
harmonised_data <- rbind(
  SiAndSympt, ComAndRF, Treatment, Dates, Demographics, Home_med,
  Imaging_data, Complications, Labo, LifestyleAndDiet
)

# Drop the per-topic tables; the original listed "Complications" twice and
# left "Treatment" behind.
rm(
  SiAndSympt, Complications, ComAndRF, Dates, Demographics, Home_med,
  Imaging_data, Treatment, Labo, LifestyleAndDiet
)

# ---- Variable lists ----

# Variables declared as No/Yes in the dictionary, plus a hand-maintained list.
# `%in%` never yields NA, so rows with a missing format are skipped.
no_yes_units <- c("No/Yes / missing", "No/Yes / Missing")
noYesValues <- harmonised_data$Harmonised.variable.name[
  harmonised_data$Harmonised.data.format.unit %in% no_yes_units
]
noYesValues <- c(
  noYesValues,
  "CSXCOTAB", "CSXCOTAG", "IMDIT", "RFXHIV_RFXAIDS", "SMXASAH", "CMXATH",
  "CMXNO", "SMXFEA"
)

categoric_vars <- c(
  "CMXATH", "CMXNO", "SMXASAH", "CSXCOTAB", "CSXCOTAG", "IMDIT",
  "RFXHIV_RFXAIDS", "DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU",
  "DSXOS", "DSXHO", "DSXIC",
  "TRXAV", "TRXRIB", "TRXLR", "TRXRM", "TRXIA", "TRXIB", "TRXCH", "TRXAB",
  "TRXCS", "TRXHEP", "TRXAF", "TRXCP", "TRXOT", "TRXECM", "TRXIV", "TRXNIV",
  "TRXNO", "TRXOX", "TRXRR", "TRXTR", "TRXVA", "TRXPE", "TRXPV", "TRXIT",
  "TRXNMB", "TRXAC", "TRXINA", "TRXIS", "TRXIM", "TRXVC", "TRXVD", "TRXZN",
  "CSXCOT", "CSXCTR", "SMXASAH", "SMXFEA", "SMXCOA", "SMXSTA", "SMXSBA",
  "SMXRNA", "SMXMYA", "SMXARA", "SMXCPA", "SMXAPA", "SMXINA", "SMXNAA",
  "SMXDIA", "SMXFAA", "SMXHEA", "SMXCNA", "SMXACA", "SMXSLA", "SMXTLA",
  "SMXSYA", "SMXWHA", "SMXLYA", "SMXANA", "SMXIWA", "SMXSRA", "SMXBLA",
  "CMXPRG", "CMXCVD", "CMXCMP", "CMXHT", "CMXDI", "CMXCKD", "CMXCLD",
  "CMXCPD", "CMXASM", "CMXCND", "CMXRHE", "CMXCCI", "CMXCBD", "CMXDE",
  "CMXPU", "CMXST", "CMXLY", "CMXAP",
  "RFXSM", "RFXFSM", "RFXOB", "RFXTB", "RFXIMD", "RFXHIV", "RFXAIDS",
  "RFXUI", "RFXHC", "RFXONC", "RFXMN",
  "HMRACI", "HMRARB", "HMRAHO", "HMRNS", "HMROS", "HMRCS", "HMRIS", "HMRAV",
  "HMRAB", "HMRCOV",
  "IMDXCT", "IMDXCTCR", "IMDXCTTE", "IMDXCTAB", "IMDXXR", "IMDXPN",
  "COXRD", "COXAR", "COXPM", "COXMOD", "COXPT", "COXEC", "COXSH", "COXIO",
  "COXPE", "COXST", "COXDIC", "COXRIO", "COXKF", "COXHF", "COXBC"
)

# Non No/Yes columns that also receive a "_numeric" companion column.
personalized <- c("DMRGENDR", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS")

# ---- Helpers ----

# Tells whether a single value parses as a number once the first decimal comma
# is turned into a dot. Empty input and NA give FALSE.
is_number <- function(x) {
  if (length(x) == 0) {
    return(FALSE)
  }
  # Base `sub()` replaces the first comma only, like the stringr call it
  # supersedes; failed coercions are expected, so their warning is muted.
  parsed <- suppressWarnings(as.numeric(sub(",", ".", x, fixed = TRUE)))
  !is.na(parsed)
}

# Returns `x` when it parses as a number, "" otherwise.
fixNonCategoric <- function(x) {
  if (!is_number(x)) {
    x <- ""
  }
  x
}

# Recodes one value through parallel `labels` / `codes`.
#   x       : a single value.
#   labels  : character vector of recognised spellings.
#   codes   : vector or list of replacements, aligned with `labels`.
#   default : returned when `x` matches no label (defaults to `x` itself).
# NA is always recoded to "".
recode_value <- function(x, labels, codes, default = x) {
  if (is.na(x)) {
    return("")
  }
  hit <- match(x, labels)
  if (is.na(hit)) default else codes[[hit]]
}

# Converts one cell of column `colname` to its numeric code.
#   x       : a single cell value.
#   colname : name of the hospital column the cell comes from.
# Returns the numeric code, "" for missing values, or `x` unchanged when the
# value is not one of the recognised labels (except CSXCOT, which gives "").
# Reads the global `noYesValues`.
getNumericValue <- function(x, colname) {
  if (colname %in% noYesValues) {
    x <- recode_value(x, c("No", "Yes"), c(0, 1))
  }
  if (colname == "DMRGENDR") {
    # NA used to reach `if (x == "Female")` and abort the run; it now maps to
    # "" like every other column.
    x <- recode_value(x, c("Female", "Male"), c(1, 0))
  }
  if (colname == "DSXOS") {
    x <- recode_value(
      x,
      c("Missing", "Recovered", "Deceased", "Transferred"),
      list("", 0, 1, 2)
    )
  }
  if (colname == "DMRRETH1") {
    x <- recode_value(
      x,
      c("Asian", "Black", "Hispanic", "White", "Multiracial", "Other"),
      c(1, 2, 3, 4, 5, 6)
    )
  }
  if (colname == "DMROCCU") {
    x <- recode_value(
      x,
      c("Unemployed", "Student", "Employed", "Self-employed", "Retired"),
      c(1, 2, 3, 4, 5)
    )
  }
  if (colname == "DMRHREDU") {
    x <- recode_value(
      x,
      c("High School", "Bachelors", "Postgraduate", "Other"),
      c(1, 2, 3, 4)
    )
  }
  if (colname == "CSXCOT") {
    # NOTE(review): CSXCOT is not in `personalized`, so this branch is only
    # reached if the dictionary lists CSXCOT as a No/Yes variable - confirm.
    x <- recode_value(x, c("PCR", "antigen", "other"), c(1, 2, 3), default = "")
  }
  x
}

# Replaces a single NA with "".
noNa <- function(x) {
  if (is.na(x)) {
    x <- ""
  }
  x
}

# ---- Build the output table ----

newDf <- hospital
col_names <- colnames(hospital)

# Columns that get a "<name>_numeric" companion, in hospital column order.
needs_numeric <- col_names %in% noYesValues | col_names %in% personalized
list_numeric <- paste0(col_names[needs_numeric], "_numeric")

numericDf <- data.frame(
  matrix(NA, nrow = nrow(hospital), ncol = length(list_numeric))
)
colnames(numericDf) <- list_numeric
newDf <- cbind(newDf, numericDf)

n_cols <- ncol(hospital)
for (j in seq_len(n_cols)) {
  print(paste0(trunc(j / n_cols * 100), "% completed"))
  print(col_names[j])

  column <- hospital[[j]]

  if (needs_numeric[j] && length(column) > 0) {
    # One assignment per column instead of one per cell. `unlist()` applies
    # the same logical < integer < double < character promotion that the
    # cell-by-cell assignment produced.
    newDf[[paste0(col_names[j], "_numeric")]] <- unlist(
      lapply(column, getNumericValue, colname = col_names[j])
    )
  }

  # Blank out missing cells in the original column. The guard matters: an
  # assignment that selects nothing would still coerce the column to character.
  missing_cells <- is.na(column)
  if (any(missing_cells)) {
    newDf[[j]][missing_cells] <- ""
  }
}

# ---- Write result ----

setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient/clean/harmonized")
write.csv(x = newDf, file = "konya_outpatient.csv", row.names = FALSE)