Contents


Library R packages and basic settings

library(tidyverse)
library(circlize)
library(ComplexHeatmap)
library(reshape2)
library(UpSetR)
library(ggplotify)
library(cowplot)
library(VennDiagram)
library(ggpubr)
library(readxl)
library(ggforce)

measure_col <- c("BH"="#D55E00", "ST"="#CC79A7", "sd.b"="#7570B3", "sd.m"="#E7298A", "mean"="#A6CEE3", 
                 "mad"="#1F78B4", "dip"="#B2DF8A", "precision"="#FFFF00", "icc.b"="#FB9A99", 
                 "icc.m"="#E31A1C", "refgene.pos"="#FDBF6F", "chr"="#A6761D", "cpg.loc"="#1B9E77", 
                 "dhs"="#949494", "direction"="#7FFFD4", "probe.type" ="#FFE4E1")
method_col <- c("CAMT" = "#FF0000", "AdaPT" = "#E69F00", "IHW" = "#56B4E9", 
                "FDRreg" = "#009E73", "BL" = "#0072B2", BH="#D55E00", ST="#CC79A7")

1. Down-sampling analysis


EWAS51 is a dataset with binary phenotype (FASD and control). We sampled each phenotype randomly at sample size 10, 20, 30, 40, 50, 60, 70, 80, 90 and 100 by method IHW and CAMT. The power of each method with different covariates are displyed as follows.

1.1 Method IHW

selected_measure <- c("BH", "ST", "chr", "cpg.loc", "dhs", "dip", "direction", "mad", "mean",
                      "precision", "probe.type", "refgene.pos", "sd.b", "sd.m")
selected_samplesize <- c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100)
IHW_power <- readRDS("Data/EWAS51_IHW_downsample_power.RDS")

IHW_power <- IHW_power %>%
  filter(measure %in% selected_measure, sample_size %in% selected_samplesize)

cdata <- plyr::ddply(IHW_power, c("measure", "sample_size"), summarise,
               N    = sum(!is.na(Power)),
               mean = mean(Power, na.rm = TRUE),
               sd   = sd(Power, na.rm = TRUE),
               se   = sd / sqrt(N))

fs13 <- ggplot(data = cdata, aes(x = factor(sample_size), y = mean, colour = measure)) +
 geom_errorbar(aes(ymin=mean-se, ymax=mean+se), 
               width = 0.8, 
               position = position_dodge(width = 0.5),
               lwd = 0.5) +
  geom_point(position = position_dodge(width = 0.5), size = 0.5, shape = 16) +
  scale_color_manual(values = measure_col) +
  xlab("Sample Size") + ylab("Percentage") +
  theme_bw() +
  theme(axis.title = element_text(size = 10),
        axis.text = element_text(size = 8),
        legend.text = element_text(size = 6),
        legend.title = element_text(size = 8))

fs13



1.2 Method CAMT

CAMT_power <- readRDS("Data/EWAS51_CAMT_downsample_power.RDS")
CAMT_power <- CAMT_power %>%
  filter(measure %in% selected_measure, sample_size %in% selected_samplesize)

cdata <- plyr::ddply(CAMT_power, c("measure", "sample_size"), summarise,
               N    = sum(!is.na(Power)),
               mean = mean(Power, na.rm = TRUE),
               sd   = sd(Power, na.rm = TRUE),
               se   = sd / sqrt(N))

f4 <- ggplot(data = cdata, aes(x = factor(sample_size), y = mean, colour = measure)) +
 geom_errorbar(aes(ymin=mean-se, ymax=mean+se), width = 0.8, 
               position = position_dodge(width = 0.5),
               lwd = 0.5) +
  geom_point(position = position_dodge(width = 0.5), size = 0.5, shape = 16) +
  scale_color_manual(values = measure_col) +
  xlab("Sample Size") + ylab("Percentage") +
  theme_bw() +
  theme(axis.title = element_text(size = 10),
        axis.text = element_text(size = 8),
        legend.text = element_text(size = 6),
        legend.title = element_text(size = 8))

f4

2. Increasing power to discovery true signals


2.1 Overview of dataset EWAS45 across different methods and covariates

selected_measure <- c("mean", "sd.b", "mad", "icc.m", "icc.b", "precision", "dhs", "direction",
                      "chr", "refgene.pos", "dip", "probe.type", "cpg.loc", "sd.m")

# read in adjusted p-values' data into a list
pvalues <- list()
for(i in c("AdaPT", "BL", "CAMT", "FDRreg", "IHW")){
  pvalues[[i]] <- readRDS(paste0("Data/EWAS45_", i, "_adjusted_pvalues.RDS"))
}

# calculate the number of significant DMPs
discovery <- as.data.frame(lapply(pvalues, function(x)(apply(x, 2, function(x)sum(x < 0.05)))))
discovery <- as.data.frame(t(discovery))

# extract BH and ST DMPs' numbers
BH <- unique(discovery$BH)
ST <- unique(discovery$ST)
plot_data <- discovery %>%
  select(selected_measure)

plot_data <- as.matrix((plot_data - ST)/ST)*100
cell_lable <- paste0(round(plot_data, 0), "%")
cell_lable <- matrix(cell_lable, nrow = nrow(plot_data), ncol = ncol(plot_data))

col_fun = colorRamp2(c(-200, -100, 0, 200), 
                     c("royalblue", "blue", "white", "red"), 
                     space = "RGB")
panel_A <- Heatmap(plot_data,
        name = "Color Key",
        col = col_fun,
        rect_gp = gpar(col = "grey", lwd = 0.25),
        cell_fun = function(j, i, x, y, width, height, fill){
          if(abs(plot_data[i, j]) > 1) grid.text(cell_lable[i, j], x, y, gp = gpar(fontsize = 8))
        },
        clustering_distance_rows = "euclidean",
        clustering_distance_columns = "euclidean",
        clustering_method_rows = "ward.D",
        clustering_method_columns = "ward.D",
        row_names_gp = gpar(fontsize = 8),
        column_names_gp = gpar(fontsize = 8),
        heatmap_legend_param = list(title_gp = gpar(fontsize = 8), 
                                    labels_gp = gpar(fontsize = 6),
                                    labels = c("-200%", "-100%", 0, "100%", "200%")))
draw(panel_A)

panel_A <- grid.grabExpr(draw(panel_A))



2.2 Boxplot of gold standard CpGs’ significance orders by covariate under different methods

# get the significance order for each cpg
ranks <- lapply(pvalues, function(x)as.data.frame(apply(x, 2, function(x)rank(x, ties.method = "random"))))
selected_method <- "IHW"

gold_standard <- read.csv("Data/GoldStandardCpG.T4.csv", header = TRUE, row.names = 1)
selected_cpg <- gold_standard$x[gold_standard$x %in% rownames(ranks[[1]])]

reference_line <- median(ranks[[selected_method]][selected_cpg, "ST"])
        
panel_B <- ranks[[selected_method]][selected_cpg, ] %>%
  select(selected_measure) %>%
  melt(value.name = "Significance_rank") %>%
  group_by(variable) %>%
  mutate(significance_rank_median = median(Significance_rank)) %>%
  ggplot(aes(x = reorder(variable, Significance_rank, FUN = median), y = Significance_rank, color = variable, fill = variable)) +
  geom_sina(size = 0.25, alpha = 0.2, shape = 16) +
  geom_hline(aes(yintercept = reference_line), color = "black", size = 0.5, linetype = "dashed") +
  geom_segment(aes(x = variable, xend = variable,
                   y = reference_line, yend = significance_rank_median),
               size = 0.25, color = "black") +
  stat_summary(fun.y = median, geom = "point", size = 1.5, shape = 21, color = "black") +
  scale_fill_manual(values = measure_col) +
  scale_color_manual(values = measure_col) +
  scale_x_discrete(limits = selected_measure) +
  xlab("") + ylab("Significance Rank") + labs(fill = "") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8),
        axis.title = element_text(size = 10),
        legend.position = "none")
panel_B

selected_method <- c("AdaPT", "BL", "CAMT", "FDRreg")
fs14 <- as.data.frame(ranks[selected_method])[selected_cpg, ] %>%
  melt(value.name = "Significance_rank") %>%
  mutate(method = sapply(str_split(variable, fixed("."), n = 2), function(x)unlist(x)[1]),
         measure = sapply(str_split(variable, fixed("."), n = 2), function(x)unlist(x)[2])) %>%
  filter(measure %in% selected_measure) %>%
  select(Significance_rank, method, measure) %>%
  group_by(method, measure) %>%
  mutate(significance_rank_median = median(Significance_rank)) %>%
  ggplot(aes(x = reorder(measure, Significance_rank, FUN = median), y = Significance_rank, color = measure, fill = measure)) +
  geom_sina(size = 0.25, alpha = 0.2, shape = 16, show.legend = FALSE) +
  geom_hline(aes(yintercept = reference_line), color = "black", size = 0.5, linetype = "dashed") +
  geom_segment(aes(x = measure, xend = measure,
                   y = reference_line, yend = significance_rank_median),
               size = 0.25, color = "black") +
  stat_summary(fun.y = median, geom = "point", size = 1.5, shape = 21, color = "black") +
  scale_fill_manual(values = measure_col) +
  scale_color_manual(values = measure_col) +
  scale_x_discrete(limits = selected_measure) +
  facet_wrap(~ method, nrow = 2) +
  xlab("") + ylab("Significance Rank") + labs(fill = "") +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.y = element_text(size = 8),
        axis.title.y = element_text(size = 10))

fs14



2.3 Boxplot of gold standard CpGs’ significance orders by different methods

# choose the most powerful covariate mean
subset_rank <- as.data.frame(lapply(ranks, function(x)x[, "mean", drop = FALSE]))
colnames(subset_rank) <- names(ranks)

selected_method <- c("CAMT", "AdaPT", "IHW", "BL", "FDRreg")
panel_C <- subset_rank[selected_cpg, ] %>%
  select(selected_method) %>%
  melt(value.name = "Significance_rank") %>%
  ggplot(aes(x = variable, y = Significance_rank)) +
  geom_boxplot(aes(fill = variable), width = 0.72, fatten = 1) +
  scale_fill_manual(values = method_col) + 
  scale_x_discrete(limits = selected_method) +
  xlab("") + ylab("Significance Rank") + labs(fill = "method") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 30, hjust = 0.5, vjust = 0.5, size = 8),
        axis.title = element_text(size = 10),
        legend.position = "none") +
  geom_hline(yintercept = reference_line, linetype = "dashed", color = "black", size = 0.5)

panel_C



# organize figures into Figure 5
bottom_row <- plot_grid(panel_B, panel_C, nrow = 1, align = "h", 
                        labels = c("B", "C"), label_size = 18)
f5 <- plot_grid(panel_A, bottom_row, ncol = 1,
                labels = c("A", ""), label_size = 18)
f5

2.4 Validation in another data set EWAS27

selected_measure <- c("mad", "mean", "sd.b", "precision", "sd.m", "dhs", "dip", "chr",  "direction", "probe.type", "refgene.pos", "cpg.loc")

# read in adjusted p-values' data into a list
pvalues <- list()
for(i in c("AdaPT", "BL", "CAMT", "FDRreg", "IHW")){
  pvalues[[i]] <- readRDS(paste0("Data/EWAS27_", i, "_adjusted_pvalues.RDS"))
}

## heatmap
discovery <- as.data.frame(lapply(pvalues, function(x)(apply(x, 2, function(x)sum(x < 0.05)))))
discovery <- as.data.frame(t(discovery))

# extract BH and ST signal numbers
BH <- unique(discovery$BH)
ST <- unique(discovery$ST)
plot_data <- discovery %>%
  select(selected_measure)

plot_data <- as.matrix((plot_data - ST)/ST)*100
cell_lable <- paste0(round(plot_data, 0), "%")
cell_lable <- matrix(cell_lable, nrow = nrow(plot_data), ncol = ncol(plot_data))
col_fun = colorRamp2(c(-100, 0, 200), c("blue", "white", "red"), space = "RGB")
fs15_1 <- Heatmap(plot_data,
        name = "Color Key",
        col = col_fun,
        rect_gp = gpar(col = "grey", lwd = 0.25),
        cell_fun = function(j, i, x, y, width, height, fill){
          if(abs(plot_data[i, j]) > 1) grid.text(cell_lable[i, j], x, y, gp = gpar(fontsize = 8))
        },
        clustering_distance_rows = "euclidean",
        clustering_distance_columns = "euclidean",
        clustering_method_rows = "ward.D",
        clustering_method_columns = "ward.D",
        row_names_gp = gpar(fontsize = 8),
        column_names_gp = gpar(fontsize = 8),
        heatmap_legend_param = list(labels_gp = gpar(fontsize = 6),
                                    title_gp = gpar(fontsize = 8),
                                    at = c(-100, 0, 100, 200),
                                    labels = c("-100%", 0, "100%", "200%")))
fs15_1 <- grid.grabExpr(draw(fs15_1))

## boxplot
ranks <- lapply(pvalues, function(x)as.data.frame(apply(x, 2, function(x)rank(x, ties.method = "average"))))

selected_method <- "IHW"
gold_standard <- read.csv("Data/GoldStandardCpG.T4.csv", 
                          header = TRUE, row.names = 1)
selected_cpg <- gold_standard$x[gold_standard$x %in% rownames(ranks[[1]])]

reference_line <- median(ranks[[selected_method]][selected_cpg, "ST"])

fs15_2 <- ranks[[selected_method]][selected_cpg, ] %>%
  select(selected_measure) %>%
  melt(value.name = "Significance_rank") %>%
  group_by(variable) %>%
  mutate(significance_rank_median = median(Significance_rank)) %>%
  ggplot(aes(x = reorder(variable, Significance_rank, FUN = median), y = Significance_rank, color = variable, fill = variable)) +
  geom_sina(size = 0.25, alpha = 0.2, shape = 16) +
  geom_hline(aes(yintercept = reference_line), color = "black", size = 0.5, linetype = "dashed") +
  geom_segment(aes(x = variable, xend = variable,
                   y = reference_line, yend = significance_rank_median),
               size = 0.25, color = "black") +
  stat_summary(fun.y = median, geom = "point", size = 1.5, shape = 21, color = "black") +
  scale_fill_manual(values = measure_col) +
  scale_color_manual(values = measure_col) +
  scale_x_discrete(limits = selected_measure) +
  xlab("") + ylab("Significance Rank") + labs(fill = "") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8),
        axis.title = element_text(size = 10),
        legend.position = "none")

subset_rank <- as.data.frame(lapply(ranks, function(x)x[, "mean", drop = FALSE]))
colnames(subset_rank) <- names(ranks)

selected_method <- c("CAMT", "AdaPT", "IHW", "BL", "FDRreg")
fs15_3 <- subset_rank[selected_cpg, ] %>%
  select(selected_method) %>%
  melt(value.name = "Significance_rank") %>%
  ggplot(aes(x = variable, y = Significance_rank)) +
  geom_boxplot(aes(fill = variable), width = 0.72, fatten = 1) +
  scale_fill_manual(values = method_col) + 
  scale_x_discrete(limits = selected_method) +
  xlab("") + ylab("Significance Rank") + labs(fill = "method") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 30, hjust = 0.5, vjust = 0.5, size = 8),
        axis.title = element_text(size = 10),
        legend.position = "none") +
  geom_hline(yintercept = reference_line, linetype = "dashed", color = "black", size = 0.5)
## put all figures together
bottom_row <- plot_grid(fs15_2, fs15_3, labels = c("B", "C"), align = "h", nrow = 1, label_size = 18)
fs15 <- plot_grid(fs15_1, bottom_row, labels = c("A", ""), ncol = 1, label_size = 18)
fs15

3. Some well-known smoking-associated CpGs were recovered


3.2 Compare the methylation level of cg18092474 between groups

methy_data <- readRDS("Data/cg18092474.RDS")

# count samples within each group
table(methy_data$group)
## 
## current smoker  former smoker          never 
##             22            263            179


fs16 <- methy_data %>%
  ggplot(aes(x = group, y = bVals, fill = group)) + 
  stat_boxplot(geom = "errorbar", width = 0.3) + 
  geom_boxplot(width = 0.6) + 
  scale_x_discrete(labels = c("current smoker (n=22)", "former smoker (n=263)", "never (n=179)")) +
  annotate("text", label = "BH: 0.090       ST: 0.088\nIHW: 0.021       CAMT: 0.019",
           x = 2, y = 0.9:1, color = "red") +
  geom_segment(aes(x = 1, y = 0.9, xend = 3, yend = 0.9), size = 0.25) +
  geom_segment(aes(x = 1, y = 0.8, xend = 1, yend = 0.9), size = 0.1) +
  geom_segment(aes(x = 3, y = 0.85, xend = 3, yend = 0.9), size = 0.1) +
  ylim(0, 1) +
  xlab("Group") +
  theme_bw() + 
  theme(legend.position = "none") 
fs16



3.3 Intersetion of method IHW and CAMT

fs17 <- draw.pairwise.venn(area1 = length(IHW),
                           area2 = length(CAMT),
                           cross.area = length(intersect(IHW, CAMT)),
                           category = c("IHW", "CAMT"),
                           fill = c("pink", "skyblue"),
                           cat.pos = c(0, 0),
                           scaled = FALSE)