# Start from an empty global environment so objects left over from an earlier
# run cannot leak into this one.
# NOTE(review): this also wipes anything else in the caller's workspace.
rm(list = ls())

# Folder that holds the raw hospital export. The original script first changed
# to the parent "ressources" folder and then immediately to this one; only the
# second call had any effect, so the first was dropped.
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient")

# Change this to the name of the resource file that should be cleaned.
hospital <- read.csv("konya_outpatient.csv", sep = ",")

# Drop every column whose name contains "numeric" (case-insensitive, like
# tidyselect's contains()). Done in base R because the script never attaches
# dplyr, so `%>%` / select() only worked when the session happened to have it
# loaded already.
is_numeric_copy <- grepl("numeric", names(hospital), ignore.case = TRUE)
hospital <- hospital[, !is_numeric_copy, drop = FALSE]
rm(is_numeric_copy)

# hospital["NOT.HARMONISED"] <- NULL
# 
# names <- colnames(hospital)
# for (i in 1:length(names)){
# 
#   if(grepl("NOT.HARMONISED", names[i])){
#     hospital[names[i]] <- NULL
#     print(paste("quito ", names[i]))
#   }
# 
# }
# hospital <- hospital[-1,]

# Folder that holds the harmonised data dictionary, one CSV file per section.
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data")

# Build the combined dictionary: for each section file keep the first `n_rows`
# rows and the first five columns, then stack the sections in the order listed.
#
# Everything happens inside local() so that no per-section table is left in
# the global environment. The original cleaned up with an rm() call that named
# "Complications" twice (raising an "object not found" warning) and never
# removed "Treatment".
harmonised_data <- local({
  sections <- list(
    list(file = "Si&Sympt.csv", sep = ";", n_rows = 50),
    # NOTE(review): this is the only file read with sep = ","; confirm that it
    # really uses a different delimiter from the other dictionary files.
    list(file = "Com&RF.csv", sep = ",", n_rows = 64),
    list(file = "Treatment.csv", sep = ";", n_rows = 32),
    list(file = "Dates.csv", sep = ";", n_rows = 12),
    list(file = "Demographics.csv", sep = ";", n_rows = 9),
    list(file = "Home_med.csv", sep = ";", n_rows = 13),
    list(file = "Imaging_data.csv", sep = ";", n_rows = 11),
    list(file = "Complications.csv", sep = ";", n_rows = 20),
    list(file = "Labo.csv", sep = ";", n_rows = 143),
    list(file = "Lifestyle&Diet.csv", sep = ";", n_rows = 165)
  )

  tables <- lapply(sections, function(section) {
    read.csv(section$file, sep = section$sep)[seq_len(section$n_rows), 1:5]
  })

  # The list is unnamed on purpose: a named list would prefix the row names.
  do.call(rbind, tables)
})

# Names of the variables that must be normalised to "Yes" / "No".
# First take every dictionary entry whose format/unit is one of the two
# spellings of the No/Yes marker (rows with a missing unit are not selected),
# then append a few variable names that are listed by hand.
no_yes_units <- c("No/Yes / missing", "No/Yes / Missing")
has_no_yes_unit <- harmonised_data$Harmonised.data.format.unit %in% no_yes_units
noYesValues <- c(
  harmonised_data$Harmonised.variable.name[has_no_yes_unit],
  "CSXCOTAB", "CSXCOTAG", "IMDIT", "RFXHIV_RFXAIDS", "SMXASAH", "CMXATH", "CMXNO"
)
rm(no_yes_units, has_no_yes_unit)

# Columns treated as categorical: the cleaning loop never runs the numeric
# clean-up on them. Entries are kept in their original order (including the
# repeated "SMXASAH"); the line breaks simply group names by prefix.
categoric_vars <- c(
  "CMXATH", "CMXNO", "SMXASAH", "CSXCOTAB", "CSXCOTAG", "IMDIT",
  "RFXHIV_RFXAIDS",
  "DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU",
  "DSXOS", "DSXHO", "DSXIC",
  "TRXAV", "TRXRIB", "TRXLR", "TRXRM", "TRXIA", "TRXIB", "TRXCH", "TRXAB",
  "TRXCS", "TRXHEP", "TRXAF", "TRXCP", "TRXOT", "TRXECM", "TRXIV", "TRXNIV",
  "TRXNO", "TRXOX", "TRXRR", "TRXTR", "TRXVA", "TRXPE", "TRXPV", "TRXIT",
  "TRXNMB", "TRXAC", "TRXINA", "TRXIS", "TRXIM", "TRXVC", "TRXVD", "TRXZN",
  "CSXCOT", "CSXCTR",
  "SMXASAH", "SMXFEA", "SMXCOA", "SMXSTA", "SMXSBA", "SMXRNA", "SMXMYA",
  "SMXARA", "SMXCPA", "SMXAPA", "SMXINA", "SMXNAA", "SMXDIA", "SMXFAA",
  "SMXHEA", "SMXCNA", "SMXACA", "SMXSLA", "SMXTLA", "SMXSYA", "SMXWHA",
  "SMXLYA", "SMXANA", "SMXIWA", "SMXSRA", "SMXBLA",
  "CMXPRG", "CMXCVD", "CMXCMP", "CMXHT", "CMXDI", "CMXCKD", "CMXCLD",
  "CMXCPD", "CMXASM", "CMXCND", "CMXRHE", "CMXCCI", "CMXCBD", "CMXDE",
  "CMXPU", "CMXST", "CMXLY", "CMXAP",
  "RFXSM", "RFXFSM", "RFXOB", "RFXTB", "RFXIMD", "RFXHIV", "RFXAIDS",
  "RFXUI", "RFXHC", "RFXONC", "RFXMN",
  "HMRACI", "HMRARB", "HMRAHO", "HMRNS", "HMROS", "HMRCS", "HMRIS", "HMRAV",
  "HMRAB", "HMRCOV",
  "IMDXCT", "IMDXCTCR", "IMDXCTTE", "IMDXCTAB", "IMDXXR", "IMDXPN",
  "COXRD", "COXAR", "COXPM", "COXMOD", "COXPT", "COXEC", "COXSH", "COXIO",
  "COXPE", "COXST", "COXDIC", "COXRIO", "COXKF", "COXHF", "COXBC"
)

# Columns whose values are additionally recoded by personalizedFun().
personalized <- c("DMRGENDR", "DSXOS", "CSXCTR", "SMXFEA", "CSXCOT")

# Tell whether a single value can be read as a number.
#
# A comma is accepted as the decimal separator ("3,5" counts as 3.5).
#
# x: one value (character, numeric, NA, ...).
# Returns TRUE when `x` has length one and converts to a non-missing number,
# FALSE otherwise (including NA, "", empty input and input with more than one
# element).
#
# Uses base sub() instead of stringr::str_replace(), which this script never
# loads, and silences the "NAs introduced by coercion" warning that
# as.numeric() raises for every non-numeric value.
is_number <- function(x) {
  if (length(x) != 1L) {
    return(FALSE)
  }

  # Only the first comma is replaced, as in the original implementation.
  normalised <- sub(",", ".", x, fixed = TRUE)
  !is.na(suppressWarnings(as.numeric(normalised)))
}


# Normalise one yes/no answer.
#
# x: a single value.
# Returns "No" for 0 / "no", "Yes" for 1 / "yes" / "si" (case and surrounding
# whitespace are ignored), and "" for a missing or unrecognised value.
replaceNoYesValues <- function(x) {
  if (is.na(x)) {
    return("")
  }

  answer <- trimws(toupper(x))

  if (answer %in% c("0", "NO")) {
    "No"
  } else if (answer %in% c("1", "YES", "SI")) {
    "Yes"
  } else {
    ""
  }
}

# Clean one value of a non-categorical column.
#
# x: a single value.
# Returns "" when is_number() rejects `x`; otherwise `x` as text with a comma
# decimal separator replaced by a dot.
#
# Uses base sub() instead of stringr::str_replace(), which this script never
# loads.
fixNonCategoric <- function(x) {
  if (!is_number(x)) {
    return("")
  }

  sub(",", ".", x, fixed = TRUE)
}

# Recode one value of a column that needs its own value mapping.
#
# x:       a single cell value.
# colname: name of the column the value comes from.
#
# Returns the recoded value:
#   * "SMXFEA" is delegated to replaceNoYesValues().
#   * "DMRGENDR", "CSXCTR", "DMRRETH1", "DMROCCU", "DMRHREDU": NA becomes "",
#     recognised codes/spellings become their label, anything else is returned
#     unchanged.
#   * "DSXOS", "CSXCOT": NA and unrecognised values become "".
#   * Any other column: `x` is returned unchanged.
#
# Fix: the spelling "NMGAT?VM" used to be listed in the Positive branch of
# CSXCTR although it belongs with the other garbled negative spellings
# ("NMGATIVM", "NAGAT?VM", "NMGAT?V"), so those results were recoded as
# "Positive". It is now mapped to "Negative".
personalizedFun <- function(x, colname) {
  # Map the integer codes 1..length(labels) to their label; any other value
  # yields `fallback`.
  label_for_code <- function(labels, fallback) {
    for (code in seq_along(labels)) {
      if (x == code) {
        return(labels[[code]])
      }
    }
    fallback
  }

  if (colname == "SMXFEA") {
    return(replaceNoYesValues(x))
  }

  handled <- c(
    "DMRGENDR", "CSXCTR", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "CSXCOT"
  )
  if (!(colname %in% handled)) {
    return(x)
  }

  # Every handled column turns a missing value into an empty string.
  if (is.na(x)) {
    return("")
  }

  switch(
    colname,
    DMRGENDR = {
      if (x == 1 || any(x == c("F", "f", "Female"))) {
        "Female"
      } else if (x == 0 || any(x == c("M", "m", "Male"))) {
        "Male"
      } else {
        x
      }
    },
    CSXCTR = {
      # The odd spellings are presumably mis-encoded source text - TODO confirm.
      positive_spellings <- c("positive", "PositivM", "POSITIVM", "POS?T?VM")
      negative_spellings <- c(
        "negative", "negativM", "NMGATIVM", "NAGATIVM", "NMGATIV",
        "negativeM", "NAGAT?VM", "NMGAT?V", "NMGAT?VM"
      )
      if (x == 1 || any(x == positive_spellings)) {
        "Positive"
      } else if (x == 0 || any(x == negative_spellings)) {
        "Negative"
      } else {
        x
      }
    },
    DMRRETH1 = label_for_code(
      c("Asian", "Black", "Hispanic", "White", "Multiracial", "Other"),
      fallback = x
    ),
    # Code 6 is deliberately mapped to an empty string.
    DMROCCU = label_for_code(
      c("Unemployed", "Student", "Employed", "Self-employed", "Retired", ""),
      fallback = x
    ),
    DMRHREDU = label_for_code(
      c("High School", "Bachelors", "Postgraduate", "Other"),
      fallback = x
    ),
    DSXOS = {
      if (x == 0 || x == "Recovered") {
        "Recovered"
      } else if (x == 1 || x == "Deceased") {
        "Deceased"
      } else if (x == 2 || x == "Transferred") {
        "Transferred"
      } else {
        ""
      }
    },
    CSXCOT = label_for_code(c("PCR", "antigen", "other"), fallback = "")
  )
}

# Rewrite a dot-separated date ("dd.mm.yyyy") with slashes ("dd/mm/yyyy").
#
# x: a single value.
# Returns `x` unchanged when it contains no dot; otherwise the value parsed
# with the format "%d.%m.%Y" and printed as "%d/%m/%Y" (NA when it does not
# parse).
dotToBar <- function(x) {
  if (!grepl(".", x, fixed = TRUE)) {
    return(x)
  }

  parsed <- as.Date(x, format = "%d.%m.%Y")
  format(parsed, "%d/%m/%Y")
}


# Clean the hospital table cell by cell into `newDf`.
#
# Changes from the original:
#   * dropped `rm(newDf)`: the object never exists at this point, so the call
#     only produced an "object not found" warning;
#   * seq_len() replaces 1:n so an empty table is skipped instead of being
#     indexed with c(1, 0);
#   * the per-column membership tests are computed once per column instead of
#     once per cell;
#   * dropped the first of two consecutive setwd() calls, which was
#     overridden immediately.
newDf <- hospital

col_names <- colnames(hospital)
n_cols <- ncol(hospital)
n_rows <- nrow(hospital)

for (j in seq_len(n_cols)) {
  col_name <- col_names[j]

  # Progress report: share of columns reached so far, then the column name.
  print(paste0(trunc(j / n_cols * 100), "% completed"))
  print(col_name)

  # Decide once per column which cleaning steps apply.
  is_no_yes <- col_name %in% noYesValues
  # Numeric clean-up applies only to columns that are neither yes/no,
  # categorical, "DMRBORN", nor named with "DAT".
  is_plain_numeric <- !is_no_yes &&
    !(col_name %in% categoric_vars) &&
    col_name != "DMRBORN" &&
    !grepl("DAT", col_name, fixed = TRUE)
  is_personalized <- col_name %in% personalized

  for (i in seq_len(n_rows)) {
    value <- hospital[i, j]

    if (is_no_yes) {
      newDf[i, j] <- replaceNoYesValues(value)
    } else if (is_plain_numeric) {
      newDf[i, j] <- fixNonCategoric(value)
    }

    # A personalized recoding overrides the generic result above.
    if (is_personalized) {
      newDf[i, j] <- personalizedFun(value, col_name)
    }

    # Missing values and the "." placeholder always end up as empty strings,
    # whatever the earlier steps produced.
    if (is.na(value) || value == ".") {
      newDf[i, j] <- ""
    }
  }
}

# Output folder for the cleaned file.
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient/clean")

write.csv(x = newDf, file = "konya_outpatient.csv", row.names = FALSE)