differential_expresssion_analysis.R

#install.packages("DBI")
#install.packages("RMySQL")
library(DBI)
library(RMySQL)
library(limma)
library(ggplot2)
library(pheatmap)
library(dplyr)
library(reshape2)
library(grid)
library(gtable)
#dbDisconnect(con)
con <- dbConnect(MySQL(), 
                 dbname = "disnet_biolayer", 
                 host = "ares.ctb.upm.es", 
                 port = 30604, 
                 user = "gene_expression", 
                 password = "gene_expression")


graphs_dir <- "C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_dea_groupwise_graphs"
gds_ids <- dbGetQuery(con, "SELECT DISTINCT gds_id FROM expr_processed_annot WHERE flag = 1")


differential_expression_analysis <- function(gds_id, con) {
  #extraction of data from expr_values table
  query_expr_values <- paste0("SELECT gsm_id, id_ref, gene_symbol, value FROM expr_values WHERE gds_id = '", gds_id, "'")
  expr_values <- dbGetQuery(con, query_expr_values,  columns = c("gsm_id" = "character", 
                                                                 "id_ref" = "character", 
                                                                 "gene_symbol" = "character", 
                                                                 "value" = "numeric"))
  
  
  if (nrow(expr_values) == 0) {
    warning(paste("No data found for gds_id:", gds_id))
    return(NULL)
  }
  
  if (!is.numeric(expr_values$value)) {
    expr_values$value <- as.numeric(expr_values$value)
  }
  
  
  #extraction of data from expr_processed_annot table with flag = 1
  query_expr_processed_annot <- paste0("SELECT gsm_id, disease_state FROM expr_processed_annot WHERE gds_id = '", gds_id, "' AND flag = 1")
  expr_processed_annot <- dbGetQuery(con, query_expr_processed_annot)
  
  if (nrow(expr_processed_annot) == 0) {
    warning(paste("No data found in expr_processed_annot for gds_id:", gds_id))
    return(NULL)
  }
  
  #ensure there are enough disease state levels
  if (length(unique(expr_processed_annot$disease_state)) < 2) {
    warning(paste("Not enough disease state levels for gds_id:", gds_id))
    return(NULL)
  }
  
  #to keep gene_symbol for reference at the end
  gene_symbols <- unique(expr_values[, c("id_ref", "gene_symbol")])
  
  #creation of the expression matrix without gene_symbol
  expression <- reshape2::dcast(expr_values, id_ref ~ gsm_id, value.var = "value", fun.aggregate = mean)
  rownames(expression) <- expression$id_ref
  expression <- expression[, -1]  # Remove the id_ref column
  
  if (ncol(expression) == 0) {
    warning(paste("No expression data available for gds_id:", gds_id))
    return(NULL)
  }
  
  #normalize expression matrix
  exprs_matrix <- as.matrix(expression)
  exprs_normalized <- normalizeBetweenArrays(exprs_matrix, method = 'quantile')
  
  #create design matrix
  target <- expr_processed_annot
  target$disease_state <- factor(target$disease_state)
  
  if (length(levels(target$disease_state)) < 2) {
    warning(paste("Not enough disease state levels for gds_id:", gds_id))
    return(NULL)
  }
  
  design <- model.matrix(~ 0 + disease_state, data = target)
  colnames(design) <- levels(target$disease_state)
  
  # Fit linear model
  fit <- lmFit(exprs_normalized, design)
  
  #to handle NA coefficients by excluding probes with missing data
  na_probes <- rownames(fit$coefficients)[apply(is.na(fit$coefficients), 1, any)]
  if (length(na_probes) > 0) {
    warning(paste("Partial NA coefficients for", length(na_probes), "probe(s). These will be excluded from the analysis."))
    fit <- fit[-match(na_probes, rownames(fit$coefficients)), ]
  }
  
  #create and apply contrasts
  contrast_matrix <- makeContrasts(
    control_vs_disease = c - d, 
    levels = design
  )
  
  fit2 <- contrasts.fit(fit, contrast_matrix)
  fit2 <- eBayes(fit2)
  
  #handle zero sample variances by offsetting or excluding
  zero_variances <- fit2$sigma == 0
  if (any(zero_variances)) {
    warning("Zero sample variances detected, have been offset away from zero or excluded")
    fit2$sigma[zero_variances] <- min(fit2$sigma[!zero_variances]) * 1e-6
  }
  
  # results
  results <- topTable(fit2, adjust = "fdr", number = Inf)
  results$significant <- ifelse(results$adj.P.Val < 0.05 & abs(results$logFC) > 1, 
                                ifelse(results$logFC > 0, "o", "u"),
                                "n")
  
  #check if results contain any significant genes
  if (nrow(results) == 0) {
    warning(paste("No significant results for gds_id:", gds_id))
    return(NULL)
  }
  
  #add gene symbols back to results to have a reference
  results$id_ref <- rownames(results)
  results <- merge(results, gene_symbols, by = "id_ref", all.x = TRUE)
  
  #add gds_id column to know from which dataset comes from 
  results$gds_id <- gds_id
  results <- results[, c("gds_id", "id_ref", "gene_symbol", "logFC", "AveExpr", "t", "P.Value", "adj.P.Val", "B", "significant")]
  return(list(results = results, gene_symbols=gene_symbols, exprs_normalized = exprs_normalized,target=target))
}


# Function: individual_expression_analysis
# Description: Performs individual expression analysis by calculating z-scores for each gene in each sample, 
#              based on normalized gene expression data. It identifies significant gene expression changes 
#              in individual samples compared to a control group.
# Inputs:
#   - gds_id: Identifier for the gene expression dataset.
#   - target: Data frame containing sample metadata, including the disease state (control or diseased).
#   - exprs_normalized: Matrix of normalized gene expression data, where rows represent genes and columns 
#                        represent samples.
#   - gene_symbols: Data frame containing gene symbols or identifiers corresponding to the rows of exprs_normalized.
#   - output_dir: Directory path where the output results will be stored.
# Outputs:
#   - A data frame containing the following columns:
#     - gds_id: Identifier for the gene expression dataset.
#     - gsm_id: Identifier for the individual sample.
#     - id_ref: Identifier for the gene with significant expression changes in the sample.
#     - gene_symbol: Symbol or identifier for the gene with significant expression changes.
#     - z_score: Z-score indicating the magnitude and direction of expression change for the gene in the sample.
#   - If no individual significant results are found for a given gds_id, the function issues a warning message indicating this outcome.

individual_expression_analysis <- function(gds_id, target, exprs_normalized, gene_symbols, output_dir) {
  if (!is.matrix(exprs_normalized) || ncol(exprs_normalized) < 2 || nrow(exprs_normalized) < 2) {
    warning(paste("exprs_normalized does not have at least dos dimensiones para gds_id:", gds_id))
    return(NULL)
  }
  
  if (sum(target$disease_state == 'c') == 0) {
    warning("No control samples found.")
    return(NULL)
  }
  
  gsm_control <- subset(target, disease_state == 'c')
  gsm_control_ids <- gsm_control$gsm_id
  
  exprs_control <- exprs_normalized[, gsm_control_ids]
  
  gene_means_control <- rowMeans(exprs_control, na.rm = TRUE)
  gene_sds_control <- apply(exprs_control, 1, sd, na.rm = TRUE)
  
  z <- sweep(exprs_normalized, 1, gene_means_control, "-")  
  z <- sweep(z, 1, gene_sds_control, "/")
  
  cutoff <- 2.5
  
  results_list <- list()
  for (i in 1:ncol(exprs_normalized)) {
    df <- data.frame(
      gsm_id = colnames(exprs_normalized)[i],
      id_ref = rownames(exprs_normalized),
      z_score = z[, i]
    )
    df$significant <- ifelse(abs(df$z_score) > cutoff, 
                             ifelse(df$z_score > 0, "o", "u"), 
                             "n")
    results_list[[length(results_list) + 1]] <- df
  }
  
  individual_results <- do.call(rbind, results_list)
  individual_results <- merge(individual_results, gene_symbols, by = "id_ref", all.x = TRUE)
  individual_results$gds_id <- gds_id
  individual_results <- individual_results[, c("gds_id", "gsm_id", "id_ref", "gene_symbol", "z_score", "significant")]
  
  return(individual_results)
}


randomized_model <- function(exprs_normalized, target) {
  set.seed(123)  # Para la reproducibilidad
  G <- nrow(exprs_normalized)  # Número de genes
  n <- ncol(exprs_normalized)  # Número de muestras
  
  num_simulations <- 1000
  z_cutoff <- 2.5
  max_subjects_per_gene_random <- numeric(num_simulations)
  
  for (sim in 1:num_simulations) {
    # Generar una matriz de Z-scores aleatorizados
    random_z_scores <- matrix(rnorm(G * n), nrow = G, ncol = n)
    
    # Contar el número de sujetos con Z-score mayor que el umbral en cada gen
    diseased_subjects <- which(target$disease_state == 'd')
    counts_per_gene <- rowSums(random_z_scores[, diseased_subjects] > z_cutoff)
    
    # Almacenar el máximo número de sujetos por gen en esta simulación
    max_subjects_per_gene_random[sim] <- max(counts_per_gene)
  }
  
  # Determinar el umbral X (percentil 95 de las simulaciones)
  X_rand <- quantile(max_subjects_per_gene_random, 0.95)
  return(X_rand)
}


# This function generates a volcano plot from differential expression results and saves it as a PNG file.
# 
# Args:
#   results: A data frame containing the results of differential expression analysis with columns for 
#            log fold change (logFC), p-values (P.Value), and significance status (significant).
#   gds_id: A string representing the GDS ID, which will be used in the plot title and filename.
#   graphs_dir: A string representing the directory where the plot will be saved.
# 
# Output:
#   A volcano plot is saved as a PNG file in the specified directory.

create_volcano_plot <- function(results, gds_id, graphs_dir) {
  
  # Map the significant values to corresponding labels for better readability in the legend
  results$significant_label <- factor(results$significant, levels = c("n", "o", "u"),
                                      labels = c("not significant", "overexpressed", "underexpressed"))
  
  # Create the volcano plot using ggplot2
  volcano_plot <- ggplot(results, aes(x = logFC, y = -log10(P.Value), color = significant_label)) +
    geom_point() +  # Plot the points
    theme_minimal() +  # Use a minimal theme for the plot
    labs(
      title = paste("Volcano Plot for", gds_id),  # Title of the plot
      x = "Log Fold Change",  # X-axis label
      y = "-log10 adj-p-value",  # Y-axis label
      color = "Differential Expression"  # Legend title
    ) +
    scale_color_manual(
      values = c("not significant" = "grey", "overexpressed" = "red", "underexpressed" = "blue")  # Custom colors
    ) +
    theme(
      plot.title = element_text(size = 16, face = "bold", hjust = 0.5),  # Customize plot title
      axis.title.x = element_text(size = 14),  # Customize X-axis title
      axis.title.y = element_text(size = 14),  # Customize Y-axis title
      axis.text.x = element_text(size = 12),  # Customize X-axis text
      axis.text.y = element_text(size = 12),  # Customize Y-axis text
      legend.position = "right",  # Position the legend on the right
      legend.justification = "center",  # Center the legend
      legend.title = element_text(size = 14),  # Customize legend title
      legend.text = element_text(size = 12),  # Customize legend text
      legend.box.background = element_rect(color = "black", linewidth = 0.5, linetype = "solid"),  # Legend box background
      legend.box.margin = margin(5, 5, 5, 5),  # Margin around legend box
      legend.box.spacing = unit(0.5, "cm"),  # Spacing within legend box
      legend.direction = "vertical"  # Legend direction
    )
  
  # Save the plot as a PNG file
  ggsave(file.path(graphs_dir, paste0("volcano_plot_", gds_id, ".png")), plot = volcano_plot, bg = 'white')
}


create_heatmap <- function(exprs_normalized, results, gds_id, graphs_dir, target) {
  # Seleccionar los 50 genes más significativos
  top_genes <- results[order(results$adj.P.Val), ][1:50, ]
  top_gene_names <- top_genes$id_ref
  
  if (length(top_gene_names) < 50) {
    warning(paste("Less than 50 significant genes for gds_id:", gds_id))
  }
  
  exprs_top <- exprs_normalized[top_gene_names, ]
  
  if (any(!is.finite(as.matrix(exprs_top)))) {
    warning(paste("Non-finite values detected in the top expression data for gds_id:", gds_id))
    return(NULL)
  }
  
  if (!is.factor(target$disease_state)) {
    target$disease_state <- factor(target$disease_state)
  }
  
  annotation_col <- data.frame(DiseaseState = target$disease_state)
  rownames(annotation_col) <- target$gsm_id
  annotation_colors <- list(DiseaseState = c("c" = "white", "d" = "grey"))
  
  heatmap <- pheatmap(
    exprs_top,
    scale = "row",
    clustering_distance_rows = "correlation",
    clustering_distance_cols = "correlation",
    main = paste("Heatmap of top 50 genes present in the most GSMs for", gds_id),
    fontsize_row = 8,
    fontsize_col = 4,
    cluster_cols = FALSE, 
    annotation_col = annotation_col,
    annotation_colors = annotation_colors
  )
  
  
  ggsave(file.path(graphs_dir, paste0("heatmap_", gds_id, ".png")), plot = heatmap)
}


all_results <- list()
all_individual_results <- list()
summary_results <- list()
for (gds_id in gds_ids$gds_id) {
  de_analysis <- differential_expression_analysis(gds_id, con)

  
  if (!is.null(de_analysis)) {
    create_volcano_plot(de_analysis$results, gds_id, graphs_dir)
    #create_heatmap(de_analysis$exprs_normalized, de_analysis$results, gds_id, graphs_dir, de_analysis$target)
    all_results[[length(all_results) + 1]] <- de_analysis$results
  }
  
  
  individual_results <- individual_expression_analysis(gds_id, de_analysis$target, de_analysis$exprs_normalized, de_analysis$gene_symbols, output_dir)
  if (is.null(individual_results)) next
  
  #DEG only in diseased individuals
  filtered_ind_results <- individual_results %>%
    filter(significant != "n") %>%
    inner_join(de_analysis$target %>% filter(disease_state == "d"), by = "gsm_id")
  
  #define X with the random model
  X <- randomized_model(de_analysis$exprs_normalized, de_analysis$target)
  
  #calculate the number of individuals with DEG
  gene_counts <- filtered_ind_results %>%
    group_by(gene_symbol) %>%
    summarize(num_subjects = n_distinct(gsm_id))
  
  #filter the genes that are DEG in a number of individuals > x
  genes_above_threshold <- gene_counts %>%
    filter(num_subjects >= X)
  
  # get results
  final_results <- filtered_ind_results %>%
    filter(gene_symbol %in% genes_above_threshold$gene_symbol)
  
  
  summary_results[[length(summary_results) + 1]] <- data.frame(
    gds_id = gds_id,
    gsm_min = X,
    sig_gene_count = nrow(genes_above_threshold)
  )
  
  all_results[[length(all_results) + 1]] <- de_analysis$results
  all_individual_results[[length(all_individual_results) + 1]] <- final_results
}


if (length(all_results) > 0) {
  combined_results <- do.call(rbind, all_results)
  colnames(combined_results)[colnames(combined_results) == "adj.P.Val"] <- "adj_p_val"
  colnames(combined_results)[colnames(combined_results) == "P.Value"] <- "p_val"
  combined_results_round <- combined_results %>% mutate(across(where(is.numeric), \(x) round(x, 4)))
  
  significant_results <- subset(combined_results_round, significant != 'n')
 # write.csv(significant_results, file = "C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_dea_groupwise/significant_results.csv", row.names = FALSE)
  # write.csv(combined_results_round, file = "C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_dea_groupwise/combined_results.csv", row.names = FALSE)
}


#upload group wise results to MySQL
#dbWriteTable(con, "expr_dea_groupwise", combined_results_round, append = TRUE, row.names = FALSE)


if (length(all_individual_results) > 0) {
  combined_individual_results <- do.call(rbind, all_individual_results)
  combined_individual_results_round <- combined_individual_results %>% 
    mutate(across(where(is.numeric), \(x) round(x, 4)))
  
}


individual_summary <- do.call(rbind, summary_results)
#write.csv(individual_summary, file = "C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_dea_perindividual/individual_summary.csv", row.names = FALSE)


df_without_disease_state <- combined_individual_results_round %>% select(-disease_state)

#write.csv(df_without_disease_state, file = "C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_dea_perindividual/combined_individual_results.csv", row.names = FALSE)


#dbWriteTable(con, "expr_dea_perindividual", df_without_disease_state, append = TRUE, row.names = FALSE)


#Plot the adj p value of all the gds_id
p <- ggplot(combined_results, aes(x = adj_p_val, color = as.factor(gds_id))) +
  geom_histogram(binwidth = 0.01, fill = NA, position = "identity") +
  labs(title = "Histogram of adjusted p-value for each GDS",
       x = "Adjusted p-value",
       y = "Frequency") +
  theme_minimal() +
  xlim(0, 1.3) +
  scale_color_discrete(name = "GDS ID")

ggsave(file.path(graphs_dir, "adj_p_val_distribution_all_gds_ids.png"), plot = p, bg = 'white')


#initialize an empty list to store all results
all_results <- list()
all_individual_results <- list()

for (gds_id in gds_ids$gds_id) {
  de_analysis <- differential_expression_analysis(gds_id, con)
  individual_results <- individual_expression_analysis(gds_id, de_analysis$target,de_analysis$exprs_normalized,de_analysis$gene_symbols, output_dir)
  # disease_associated_genes <- randomized_model(gds_id, de_analysis$exprs_normalized, de_analysis$target)
  if (!is.null(individual_results)) {
    all_individual_results[[length(all_individual_results) + 1]] <- individual_results
  }
  if (!is.null(de_analysis)) {
    # create_volcano_plot(de_analysis$results, gds_id, graphs_dir)
    #create_heatmap(de_analysis$exprs_normalized, de_analysis$results, gds_id, graphs_dir, de_analysis$target)
    all_results[[length(all_results) + 1]] <- de_analysis$results
  }
}