P-value as a covariate

1. Stratified histograms by covariate “p-value”
2. Significant signals change across methods and covariates
3. The performance of covariate p-value by method IHW
4. GO and KEGG analysis for ST results
5. Validate the IHW results by method CAMT

Library R packages and basic setttings

library(tidyverse)
library(circlize)
library(ComplexHeatmap)
library(UpSetR)
library(ggplotify)
library(missMethyl)
library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
library(limma)
library(DT)
library(cowplot)
library(IHW)
selected_measure <- c("mean", "sd.b", "mad", "precision", "dhs", "direction",
                      "chr", "refgene.pos", "dip", "probe.type", "cpg.loc", "sd.m")

1. Stratified histograms by covariate “p-value”

# stratified histogram of EWAS29's p-value by z-score transfromed p-value from EWAS28
EWAS28 <- readRDS("Data/dataset/EWAS28.RDS")
EWAS29 <- readRDS("Data/dataset/EWAS29.RDS")

overlap <- intersect(rownames(EWAS28), rownames(EWAS29))
EWAS28_pvalue <- EWAS28[overlap, "P.value"]
EWAS29_pvalue <- EWAS29[overlap, "P.value"]

## z-score tranformation for EWAS28's p-value
ztransform <- function(p.value, tol = 1E-15) {
  # Transform p-values to z-scores
  p.value[p.value <= tol] <- tol
  p.value[p.value >= 1 - tol] <- 1 - tol
  z <- -qnorm(p.value)
  return(z)
}
EWAS28_pvalue_zscore <- ztransform(EWAS28_pvalue)

## histogram of all raw p-values
p1 <- ggplot(data = EWAS29[overlap, ], aes(x = P.value, y = stat(density))) + 
  geom_histogram(fill = "#00AFBB", color = "black", binwidth = 0.05, size = 0.05, boundary = 0) +
  scale_y_continuous(breaks = c(0, 0.4, 0.8, 1.2)) +
  theme_classic() +
  theme(axis.title = element_text(size = 10),
        axis.text = element_text(size = 8))

## histogram stratified by covariate p-value
ihw.obj <- ihw(EWAS29_pvalue, EWAS28_pvalue_zscore, alpha = 0.05, covariate_type = "ordinal")
result_df <- as.data.frame(ihw.obj)
result_df$ihwGroup <- groups_by_filter(result_df$group, 4)

p2 <- ggplot(data = result_df, aes(x = pvalue, y = stat(density))) + 
  geom_histogram(fill = "#00AFBB", color = "black", binwidth = 0.05, boundary = 0, size = 0.05) +
  facet_wrap(~ ihwGroup, nrow = 1, scales = "fixed") + 
  scale_y_continuous(breaks = c(0, 0.4, 0.8, 1.2)) +
  theme_classic()  +
  theme(axis.title = element_text(size = 10),
        axis.text = element_text(size = 8))

## put figures together
fs18 <- plot_grid(p1, p2, nrow = 1, labels = "AUTO", rel_widths = c(1, 3))
fs18

2. Significant signals change across methods and covariates

# read in adjusted p-values to a list
pvalues <- list()
for(s in c("AdaPT", "BL", "CAMT", "FDRreg", "IHW")){
  pvalues[[s]] <- readRDS(paste0("Data/pvalue_as_covariate_", s, ".RDS"))
}

# calculate the significant DMPs' number
discovery <- as.data.frame(lapply(pvalues, function(x)(apply(x, 2, function(x)sum(x < 0.05)))))
discovery <- as.data.frame(t(discovery))

# extract BH and ST signal numbers
BH <- unique(discovery$BH)
ST <- unique(discovery$ST)
plot_data <- discovery %>%
  select(c(selected_measure, "pvalue"))

plot_data <- as.matrix((t(plot_data) - ST)/ST)*100
cell_lable <- paste0(round(plot_data, 0), "%")
cell_lable <- matrix(cell_lable, nrow = nrow(plot_data), ncol = ncol(plot_data))

col_fun = colorRamp2(c(-200, 0, 400), c("royalblue", "white", "red"), space = "RGB")
panel_A <- Heatmap(plot_data,
        col = col_fun,
        rect_gp = gpar(col = "grey", lwd = 0.25),
        cell_fun = function(j, i, x, y, width, height, fill){
          if(abs(plot_data[i, j]) > 1) grid.text(cell_lable[i, j], x, y, gp = gpar(fontsize = 6))
        },
        clustering_distance_rows = "euclidean",
        clustering_distance_columns = "euclidean",
        clustering_method_rows = "ward.D",
        clustering_method_columns = "ward.D",
        show_row_dend = FALSE,
        show_column_dend = FALSE,
        heatmap_legend_param = list(title = "Color Key",
                                    labels_gp = gpar(fontsize = 5),
                                    labels = c("-100%", 0, "200%", "400%"),
                                    direction = "horizontal",
                                    title_position = "lefttop",
                                    title_gp = gpar(fontsize = 8),
                                    grid_height = unit(2, "mm"),
                                    grid_width = unit(4, "mm")),
        row_names_gp = gpar(fontsize = 8),
        column_names_gp = gpar(fontsize = 6),
        column_names_side ="top")
draw(panel_A, heatmap_legend_side = "bottom")

panel_A <- grid.grabExpr(draw(panel_A, heatmap_legend_side = "bottom"))

2. The performance of covariate p-value by method IHW

2.1 The intersections

raw_probes <- rownames(pvalues[[1]])
sig_probes <- lapply(pvalues, function(x)apply(x, 2, function(x)raw_probes[x < 0.05]))
selected_method <- "IHW"
upset_list <- sig_probes[[selected_method]]

# upset
selected_measure <- c(selected_measure, "pvalue", "ST")
panel_B <- upset(fromList(upset_list),
      sets = selected_measure, 
      order.by = "freq",
      point.size = 1.5,
      line.size = 0.7, 
      mb.ratio = c(0, 1))

panel_B <- as.ggplot(panel_B)
panel_B

2.2 GO and KEGG enrichment analysis

## look at the results obtained by covariate pvalue with method IHW
sigCpGs <- upset_list[["pvalue"]]
gst <- gometh(sig.cpg=sigCpGs, all.cpg=raw_probes, collection = "GO", prior.prob = TRUE, array.type = "450K", plot.bias = FALSE)

# Top 5 GO categories
top10_GO <- topGO(gst, number=10)
datatable(top10_GO, rownames = TRUE, filter= "none", options = list(pageLength = 5, scrollX=T))

top10_GO$ratio <- top10_GO$DE / top10_GO$N
# plot top 10 terms
top10_GO$Term <- factor(top10_GO$Term, levels = as.character(top10_GO$Term[10:1]))
panel_C <- ggplot(data = top10_GO, mapping = aes(x = Term, y = ratio)) + 
  geom_col(aes(fill = FDR), width = 0.5) + 
  coord_flip() + 
  theme_bw() + 
  ylab("Ratio") +
  theme(legend.position = "none", 
        axis.title.y = element_blank(),
        axis.text = element_text(size = 6))
panel_C

# Top 10 KEGG categories
kegg <- gometh(sig.cpg = sigCpGs, all.cpg = raw_probes, collection = "KEGG", prior.prob=TRUE)
datatable(topKEGG(kegg, number = 10), rownames = TRUE, filter="none", options = list(pageLength = 5, scrollX=T))

# organize figures
top_row <- plot_grid(panel_A, panel_C, rel_widths = c(1, 1), 
                     labels = c("A", "C"), label_size = 18)
f6 <- plot_grid(top_row, panel_B, ncol = 1, labels = c("", "B"), label_size = 18)

3. GO and KEGG analysis for ST results

reference <- upset_list[["ST"]]

# Top 20 GO categories
gst <- gometh(sig.cpg=reference, all.cpg=raw_probes, collection = "GO", prior.prob = TRUE, array.type = "450K", plot.bias = FALSE)
datatable(topGO(gst, number=20), rownames = TRUE, filter="none", options = list(pageLength = 5, scrollX=T))

# Top 10 KEGG categories
kegg <- gometh(sig.cpg = reference, all.cpg = raw_probes, collection = "KEGG", prior.prob=TRUE)
datatable(topKEGG(kegg, number = 10), rownames = TRUE, filter="none", options = list(pageLength = 5, scrollX=T))

4. Validate the IHW results by method CAMT

4.1 The intersections

selected_method <- "CAMT"
upset_list <- sig_probes[[selected_method]]

# convert list to data frame
selected <- unique(unlist(upset_list))
upset_df <- as.data.frame(matrix(nrow = length(selected), ncol = length(upset_list),
                                 dimnames = list(selected, names(upset_list))))
for(i in 1:nrow(upset_df)){
  for(j in 1:ncol(upset_df)){
    if(rownames(upset_df)[i] %in% upset_list[[colnames(upset_df)[j]]]){
      upset_df[i, j] <- 1
    }else{
      upset_df[i, j] <- 0
    }
  }
}

# add annotations for EWAS28 significant probes by method BH
EWAS28_sigprobes <- readRDS("Data/EWAS28_sigprobes.RDs")
upset_df$EWAS28 <- ifelse(rownames(upset_df) %in% EWAS28_sigprobes, "yes", "no")

# upset
fs19_1 <- upset(upset_df,
      sets = selected_measure, 
      order.by = "freq",
      queries = list(list(query = elements, params = list("EWAS28", "yes"), color = "red", active = TRUE)),
      mb.ratio = c(0.6, 0.4),
      point.size = 1.5,
      line.size = 0.5)
fs19_1 <- as.ggplot(fs19_1)
fs19_1

4.2 GO and KEGG enrichment analysis

sigCpGs <- upset_list[["pvalue"]]
gst <- gometh(sig.cpg=sigCpGs, all.cpg=raw_probes, collection = "GO", prior.prob = TRUE, array.type = "450K", plot.bias = FALSE)

# Top 10 GO categories
top10_GO <- topGO(gst, number=10)
datatable(top10_GO, rownames = TRUE, filter="none", options = list(pageLength = 5, scrollX=T))

top10_GO$ratio <- top10_GO$DE / top10_GO$N
top10_GO$Term <- factor(top10_GO$Term, levels = as.character(top10_GO$Term[10:1]))

options(digits = 2)

fs19_2 <- ggplot(data = top10_GO, mapping = aes(x = Term, y = ratio)) + 
  geom_col(aes(fill = FDR), width = 0.5) + 
  coord_flip() + 
  theme_classic() +
  theme(axis.title.y = element_blank(),
        legend.title = element_text(size = 7),
        legend.text = element_text(size = 5),
        legend.key.size = unit(4, "mm"))
fs19_2

# Top 10 KEGG categories
kegg <- gometh(sig.cpg = sigCpGs, all.cpg = raw_probes, collection = "KEGG", prior.prob=TRUE)
datatable(topKEGG(kegg, number = 10), rownames = TRUE, filter="none", options = list(pageLength = 5, scrollX=T))

# organize figures
fs19 <- plot_grid(fs19_1, fs19_2, ncol = 1, labels = c("A", "B"), label_size = 18,
                  rel_heights = c(2, 1))

P-value as a covariate

Bai Ling

21 January 2020

Contents

Library R packages and basic setttings

1. Stratified histograms by covariate “p-value”

2. Significant signals change across methods and covariates

2. The performance of covariate p-value by method IHW

2.1 The intersections

2.2 GO and KEGG enrichment analysis

3. GO and KEGG analysis for ST results

4. Validate the IHW results by method CAMT

4.1 The intersections

4.2 GO and KEGG enrichment analysis