#install.packages("DBI") #install.packages("RMySQL") library(DBI) library(RMySQL) library(limma) library(ggplot2) library(pheatmap) library(dplyr) library(reshape2) library(grid) library(gtable) #dbDisconnect(con) con <- dbConnect(MySQL(), dbname = "disnet_biolayer", host = "ares.ctb.upm.es", port = 30604, user = "gene_expression", password = "gene_expression") graphs_dir <- "C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_dea_groupwise_graphs" gds_ids <- dbGetQuery(con, "SELECT DISTINCT gds_id FROM expr_processed_annot WHERE flag = 1") differential_expression_analysis <- function(gds_id, con) { #extraction of data from expr_values table query_expr_values <- paste0("SELECT gsm_id, id_ref, gene_symbol, value FROM expr_values WHERE gds_id = '", gds_id, "'") expr_values <- dbGetQuery(con, query_expr_values, columns = c("gsm_id" = "character", "id_ref" = "character", "gene_symbol" = "character", "value" = "numeric")) if (nrow(expr_values) == 0) { warning(paste("No data found for gds_id:", gds_id)) return(NULL) } if (!is.numeric(expr_values$value)) { expr_values$value <- as.numeric(expr_values$value) } #extraction of data from expr_processed_annot table with flag = 1 query_expr_processed_annot <- paste0("SELECT gsm_id, disease_state FROM expr_processed_annot WHERE gds_id = '", gds_id, "' AND flag = 1") expr_processed_annot <- dbGetQuery(con, query_expr_processed_annot) if (nrow(expr_processed_annot) == 0) { warning(paste("No data found in expr_processed_annot for gds_id:", gds_id)) return(NULL) } #ensure there are enough disease state levels if (length(unique(expr_processed_annot$disease_state)) < 2) { warning(paste("Not enough disease state levels for gds_id:", gds_id)) return(NULL) } #to keep gene_symbol for reference at the end gene_symbols <- unique(expr_values[, c("id_ref", "gene_symbol")]) #creation of the expression matrix without gene_symbol expression <- reshape2::dcast(expr_values, id_ref ~ gsm_id, value.var = "value", fun.aggregate = mean) rownames(expression) <- expression$id_ref expression <- expression[, -1] # Remove the id_ref column if (ncol(expression) == 0) { warning(paste("No expression data available for gds_id:", gds_id)) return(NULL) } #normalize expression matrix exprs_matrix <- as.matrix(expression) exprs_normalized <- normalizeBetweenArrays(exprs_matrix, method = 'quantile') #create design matrix target <- expr_processed_annot target$disease_state <- factor(target$disease_state) if (length(levels(target$disease_state)) < 2) { warning(paste("Not enough disease state levels for gds_id:", gds_id)) return(NULL) } design <- model.matrix(~ 0 + disease_state, data = target) colnames(design) <- levels(target$disease_state) # Fit linear model fit <- lmFit(exprs_normalized, design) #to handle NA coefficients by excluding probes with missing data na_probes <- rownames(fit$coefficients)[apply(is.na(fit$coefficients), 1, any)] if (length(na_probes) > 0) { warning(paste("Partial NA coefficients for", length(na_probes), "probe(s). These will be excluded from the analysis.")) fit <- fit[-match(na_probes, rownames(fit$coefficients)), ] } #create and apply contrasts contrast_matrix <- makeContrasts( control_vs_disease = c - d, levels = design ) fit2 <- contrasts.fit(fit, contrast_matrix) fit2 <- eBayes(fit2) #handle zero sample variances by offsetting or excluding zero_variances <- fit2$sigma == 0 if (any(zero_variances)) { warning("Zero sample variances detected, have been offset away from zero or excluded") fit2$sigma[zero_variances] <- min(fit2$sigma[!zero_variances]) * 1e-6 } # results results <- topTable(fit2, adjust = "fdr", number = Inf) results$significant <- ifelse(results$adj.P.Val < 0.05 & abs(results$logFC) > 1, ifelse(results$logFC > 0, "o", "u"), "n") #check if results contain any significant genes if (nrow(results) == 0) { warning(paste("No significant results for gds_id:", gds_id)) return(NULL) } #add gene symbols back to results to have a reference results$id_ref <- rownames(results) results <- merge(results, gene_symbols, by = "id_ref", all.x = TRUE) #add gds_id column to know from which dataset comes from results$gds_id <- gds_id results <- results[, c("gds_id", "id_ref", "gene_symbol", "logFC", "AveExpr", "t", "P.Value", "adj.P.Val", "B", "significant")] return(list(results = results, gene_symbols=gene_symbols, exprs_normalized = exprs_normalized,target=target)) } # Function: individual_expression_analysis # Description: Performs individual expression analysis by calculating z-scores for each gene in each sample, # based on normalized gene expression data. It identifies significant gene expression changes # in individual samples compared to a control group. # Inputs: # - gds_id: Identifier for the gene expression dataset. # - target: Data frame containing sample metadata, including the disease state (control or diseased). # - exprs_normalized: Matrix of normalized gene expression data, where rows represent genes and columns # represent samples. # - gene_symbols: Data frame containing gene symbols or identifiers corresponding to the rows of exprs_normalized. # - output_dir: Directory path where the output results will be stored. # Outputs: # - A data frame containing the following columns: # - gds_id: Identifier for the gene expression dataset. # - gsm_id: Identifier for the individual sample. # - id_ref: Identifier for the gene with significant expression changes in the sample. # - gene_symbol: Symbol or identifier for the gene with significant expression changes. # - z_score: Z-score indicating the magnitude and direction of expression change for the gene in the sample. # - If no individual significant results are found for a given gds_id, the function issues a warning message indicating this outcome. individual_expression_analysis <- function(gds_id, target, exprs_normalized, gene_symbols, output_dir) { if (!is.matrix(exprs_normalized) || ncol(exprs_normalized) < 2 || nrow(exprs_normalized) < 2) { warning(paste("exprs_normalized does not have at least dos dimensiones para gds_id:", gds_id)) return(NULL) } if (sum(target$disease_state == 'c') == 0) { warning("No control samples found.") return(NULL) } gsm_control <- subset(target, disease_state == 'c') gsm_control_ids <- gsm_control$gsm_id exprs_control <- exprs_normalized[, gsm_control_ids] gene_means_control <- rowMeans(exprs_control, na.rm = TRUE) gene_sds_control <- apply(exprs_control, 1, sd, na.rm = TRUE) z <- sweep(exprs_normalized, 1, gene_means_control, "-") z <- sweep(z, 1, gene_sds_control, "/") cutoff <- 2.5 results_list <- list() for (i in 1:ncol(exprs_normalized)) { df <- data.frame( gsm_id = colnames(exprs_normalized)[i], id_ref = rownames(exprs_normalized), z_score = z[, i] ) df$significant <- ifelse(abs(df$z_score) > cutoff, ifelse(df$z_score > 0, "o", "u"), "n") results_list[[length(results_list) + 1]] <- df } individual_results <- do.call(rbind, results_list) individual_results <- merge(individual_results, gene_symbols, by = "id_ref", all.x = TRUE) individual_results$gds_id <- gds_id individual_results <- individual_results[, c("gds_id", "gsm_id", "id_ref", "gene_symbol", "z_score", "significant")] return(individual_results) } randomized_model <- function(exprs_normalized, target) { set.seed(123) # Para la reproducibilidad G <- nrow(exprs_normalized) # Número de genes n <- ncol(exprs_normalized) # Número de muestras num_simulations <- 1000 z_cutoff <- 2.5 max_subjects_per_gene_random <- numeric(num_simulations) for (sim in 1:num_simulations) { # Generar una matriz de Z-scores aleatorizados random_z_scores <- matrix(rnorm(G * n), nrow = G, ncol = n) # Contar el número de sujetos con Z-score mayor que el umbral en cada gen diseased_subjects <- which(target$disease_state == 'd') counts_per_gene <- rowSums(random_z_scores[, diseased_subjects] > z_cutoff) # Almacenar el máximo número de sujetos por gen en esta simulación max_subjects_per_gene_random[sim] <- max(counts_per_gene) } # Determinar el umbral X (percentil 95 de las simulaciones) X_rand <- quantile(max_subjects_per_gene_random, 0.95) return(X_rand) } # This function generates a volcano plot from differential expression results and saves it as a PNG file. # # Args: # results: A data frame containing the results of differential expression analysis with columns for # log fold change (logFC), p-values (P.Value), and significance status (significant). # gds_id: A string representing the GDS ID, which will be used in the plot title and filename. # graphs_dir: A string representing the directory where the plot will be saved. # # Output: # A volcano plot is saved as a PNG file in the specified directory. create_volcano_plot <- function(results, gds_id, graphs_dir) { # Map the significant values to corresponding labels for better readability in the legend results$significant_label <- factor(results$significant, levels = c("n", "o", "u"), labels = c("not significant", "overexpressed", "underexpressed")) # Create the volcano plot using ggplot2 volcano_plot <- ggplot(results, aes(x = logFC, y = -log10(P.Value), color = significant_label)) + geom_point() + # Plot the points theme_minimal() + # Use a minimal theme for the plot labs( title = paste("Volcano Plot for", gds_id), # Title of the plot x = "Log Fold Change", # X-axis label y = "-log10 adj-p-value", # Y-axis label color = "Differential Expression" # Legend title ) + scale_color_manual( values = c("not significant" = "grey", "overexpressed" = "red", "underexpressed" = "blue") # Custom colors ) + theme( plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Customize plot title axis.title.x = element_text(size = 14), # Customize X-axis title axis.title.y = element_text(size = 14), # Customize Y-axis title axis.text.x = element_text(size = 12), # Customize X-axis text axis.text.y = element_text(size = 12), # Customize Y-axis text legend.position = "right", # Position the legend on the right legend.justification = "center", # Center the legend legend.title = element_text(size = 14), # Customize legend title legend.text = element_text(size = 12), # Customize legend text legend.box.background = element_rect(color = "black", linewidth = 0.5, linetype = "solid"), # Legend box background legend.box.margin = margin(5, 5, 5, 5), # Margin around legend box legend.box.spacing = unit(0.5, "cm"), # Spacing within legend box legend.direction = "vertical" # Legend direction ) # Save the plot as a PNG file ggsave(file.path(graphs_dir, paste0("volcano_plot_", gds_id, ".png")), plot = volcano_plot, bg = 'white') } create_heatmap <- function(exprs_normalized, results, gds_id, graphs_dir, target) { # Seleccionar los 50 genes más significativos top_genes <- results[order(results$adj.P.Val), ][1:50, ] top_gene_names <- top_genes$id_ref if (length(top_gene_names) < 50) { warning(paste("Less than 50 significant genes for gds_id:", gds_id)) } exprs_top <- exprs_normalized[top_gene_names, ] if (any(!is.finite(as.matrix(exprs_top)))) { warning(paste("Non-finite values detected in the top expression data for gds_id:", gds_id)) return(NULL) } if (!is.factor(target$disease_state)) { target$disease_state <- factor(target$disease_state) } annotation_col <- data.frame(DiseaseState = target$disease_state) rownames(annotation_col) <- target$gsm_id annotation_colors <- list(DiseaseState = c("c" = "white", "d" = "grey")) heatmap <- pheatmap( exprs_top, scale = "row", clustering_distance_rows = "correlation", clustering_distance_cols = "correlation", main = paste("Heatmap of top 50 genes present in the most GSMs for", gds_id), fontsize_row = 8, fontsize_col = 4, cluster_cols = FALSE, annotation_col = annotation_col, annotation_colors = annotation_colors ) ggsave(file.path(graphs_dir, paste0("heatmap_", gds_id, ".png")), plot = heatmap) } all_results <- list() all_individual_results <- list() summary_results <- list() for (gds_id in gds_ids$gds_id) { de_analysis <- differential_expression_analysis(gds_id, con) if (!is.null(de_analysis)) { create_volcano_plot(de_analysis$results, gds_id, graphs_dir) #create_heatmap(de_analysis$exprs_normalized, de_analysis$results, gds_id, graphs_dir, de_analysis$target) all_results[[length(all_results) + 1]] <- de_analysis$results } individual_results <- individual_expression_analysis(gds_id, de_analysis$target, de_analysis$exprs_normalized, de_analysis$gene_symbols, output_dir) if (is.null(individual_results)) next #DEG only in diseased individuals filtered_ind_results <- individual_results %>% filter(significant != "n") %>% inner_join(de_analysis$target %>% filter(disease_state == "d"), by = "gsm_id") #define X with the random model X <- randomized_model(de_analysis$exprs_normalized, de_analysis$target) #calculate the number of individuals with DEG gene_counts <- filtered_ind_results %>% group_by(gene_symbol) %>% summarize(num_subjects = n_distinct(gsm_id)) #filter the genes that are DEG in a number of individuals > x genes_above_threshold <- gene_counts %>% filter(num_subjects >= X) # get results final_results <- filtered_ind_results %>% filter(gene_symbol %in% genes_above_threshold$gene_symbol) summary_results[[length(summary_results) + 1]] <- data.frame( gds_id = gds_id, gsm_min = X, sig_gene_count = nrow(genes_above_threshold) ) all_results[[length(all_results) + 1]] <- de_analysis$results all_individual_results[[length(all_individual_results) + 1]] <- final_results } if (length(all_results) > 0) { combined_results <- do.call(rbind, all_results) colnames(combined_results)[colnames(combined_results) == "adj.P.Val"] <- "adj_p_val" colnames(combined_results)[colnames(combined_results) == "P.Value"] <- "p_val" combined_results_round <- combined_results %>% mutate(across(where(is.numeric), \(x) round(x, 4))) significant_results <- subset(combined_results_round, significant != 'n') # write.csv(significant_results, file = "C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_dea_groupwise/significant_results.csv", row.names = FALSE) # write.csv(combined_results_round, file = "C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_dea_groupwise/combined_results.csv", row.names = FALSE) } #upload group wise results to MySQL #dbWriteTable(con, "expr_dea_groupwise", combined_results_round, append = TRUE, row.names = FALSE) if (length(all_individual_results) > 0) { combined_individual_results <- do.call(rbind, all_individual_results) combined_individual_results_round <- combined_individual_results %>% mutate(across(where(is.numeric), \(x) round(x, 4))) } individual_summary <- do.call(rbind, summary_results) #write.csv(individual_summary, file = "C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_dea_perindividual/individual_summary.csv", row.names = FALSE) df_without_disease_state <- combined_individual_results_round %>% select(-disease_state) #write.csv(df_without_disease_state, file = "C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_dea_perindividual/combined_individual_results.csv", row.names = FALSE) #dbWriteTable(con, "expr_dea_perindividual", df_without_disease_state, append = TRUE, row.names = FALSE) #Plot the adj p value of all the gds_id p <- ggplot(combined_results, aes(x = adj_p_val, color = as.factor(gds_id))) + geom_histogram(binwidth = 0.01, fill = NA, position = "identity") + labs(title = "Histogram of adjusted p-value for each GDS", x = "Adjusted p-value", y = "Frequency") + theme_minimal() + xlim(0, 1.3) + scale_color_discrete(name = "GDS ID") ggsave(file.path(graphs_dir, "adj_p_val_distribution_all_gds_ids.png"), plot = p, bg = 'white') #initialize an empty list to store all results all_results <- list() all_individual_results <- list() for (gds_id in gds_ids$gds_id) { de_analysis <- differential_expression_analysis(gds_id, con) individual_results <- individual_expression_analysis(gds_id, de_analysis$target,de_analysis$exprs_normalized,de_analysis$gene_symbols, output_dir) # disease_associated_genes <- randomized_model(gds_id, de_analysis$exprs_normalized, de_analysis$target) if (!is.null(individual_results)) { all_individual_results[[length(all_individual_results) + 1]] <- individual_results } if (!is.null(de_analysis)) { # create_volcano_plot(de_analysis$results, gds_id, graphs_dir) #create_heatmap(de_analysis$exprs_normalized, de_analysis$results, gds_id, graphs_dir, de_analysis$target) all_results[[length(all_results) + 1]] <- de_analysis$results } }