07_atacseq.Rmd

---
title: "Bender et al (2024) -- EMBO"
subtitle: ATAC-seq
author:
- name: Alexander Bender
  affiliation: Institute of Molecular Tumor Biology, Muenster/Germany
date: "`r paste('Compiled:', format(Sys.time(), '%d-%b-%Y'))`"
output:
  rmdformats::readthedown:
    code_folding: show
    keep_md: false
    highlight: tango
    toc_float:
      collapsed: false
editor_options: 
  markdown: 
    wrap: 200
params:
  save_final: true
  run_homer: false
---

<style>
body {
text-align: justify}
</style>

# Setup

Define the root directory that contains the folder with the source data. Then run the startup script, which
loads the packages and defines document-specific variables.

```{r setup}

# The directory holding all source data is mounted into the Docker container
# as "/projectdir/". The startup script is sourced from there.
rootdir <- "/projectdir/"
startup_script <- paste0(rootdir, "/runStartup.R")
source(startup_script)
```

# ATAC-seq & motifs enrichment

## Differential analysis

```{r atac_testing}

# Read the raw count table (peak identifiers become row names), strip the BAM
# suffix from the sample names and keep only the per-replicate columns.
atac_counts <- read.delim(
  paste0(rootdir, "/source_data/GSE250619_atacseq_rawCounts.tsv.gz"),
  row.names = "peak"
)
colnames(atac_counts) <- gsub("_filtered.bam", "", colnames(atac_counts))
atac_counts <- dplyr::select(atac_counts, contains("_rep"))

# The experimental group is the sample name without its replicate suffix.
dds_atac <- DESeqDataSetFromMatrix(
  countData = atac_counts,
  colData = data.frame(
    group = gsub("_rep.*", "", colnames(atac_counts)),
    stringsAsFactors = TRUE
  ),
  design = ~group
)

# Size factors are estimated only on the 20% of peaks with the largest mean
# FPM across samples and then assigned to the full dataset.
mean_fpm <- rowMeans(DESeq2::fpm(dds_atac))
n_top <- ceiling(nrow(dds_atac) * .2)
use_rows <- head(order(mean_fpm, decreasing = TRUE), n_top)
sizeFactors(dds_atac) <- DESeq2::sizeFactors(estimateSizeFactors(dds_atac[use_rows, ]))

# PU.1 ChIP-seq peaks from Pundhir et al and CEBPa peaks from Jakobsen et al

# Convert a headerless peak table into a GRanges object.
#
# @param x data.frame whose columns V1, V2 and V3 hold the sequence name,
#   start and end of each peak (the names read.delim assigns with
#   header = FALSE).
# @return GRanges with one range per row of `x`.
read2granges <- function(x) {
  GenomicRanges::makeGRangesFromDataFrame(
    x,
    seqnames.field = "V1",
    start.field = "V2",
    end.field = "V3"
  )
}

# One GRanges per cell type; list names are the cell types.
PU1_peaks <- lapply(
  c(
    LSK = paste0(rootdir, "/source_data/LSK_PU1_IDR.txt.gz"),
    GMP = paste0(rootdir, "/source_data/GMP_PU1_IDR.txt.gz")
  ),
  function(peak_file) read2granges(read.delim(peak_file, header = FALSE))
)

CEBPA_peaks <- lapply(
  c(GMP = paste0(rootdir, "/source_data/GMP_CEBPA_IDR.txt.gz")),
  function(peak_file) read2granges(read.delim(peak_file, header = FALSE))
)

# Fit the DESeq2 model once; all contrasts below are extracted from this fit.
dds_atac <- DESeq2::DESeq(dds_atac)

# Build every pairwise contrast between groups; the ones needed are picked later.
cons <- make_all_contrasts(dds_atac$group, deseq2 = TRUE)
cons

# For every contrast run two tests:
#   greaterAbs -- is |log2FC| larger than 0 (regions that change)
#   lessAbs    -- is |log2FC| smaller than log2(1.75) (regions that do not change)
# The two thresholds differ so that the resulting lists do not overlap.
# The result is a list (per contrast) of lists (per test) of data.frames.
list.atacseq_contrasts <- bplapply(names(cons), BPPARAM = bpparam, function(contrast_name) {
  contrast_spec <- cons[[contrast_name]]

  sapply(c("greaterAbs", "lessAbs"), function(alt) {
    lfc_threshold <- switch(alt,
      greaterAbs = 0,
      lessAbs = log2(1.75)
    )

    tested <- DESeq2::results(
      object = dds_atac, contrast = contrast_spec, alpha = 0.05,
      lfcThreshold = lfc_threshold, altHypothesis = alt
    )
    shrunk <- DESeq2::lfcShrink(
      dds = dds_atac, contrast = contrast_spec, res = tested,
      type = "ashr", quiet = TRUE
    )

    # baseMean is stored on the log2(x + 1) scale; the peak name is kept both
    # as row name and as the first column ("Gene").
    tbl <- dplyr::mutate(data.frame(shrunk), baseMean = log2(baseMean + 1))
    tbl <- data.frame(Gene = rownames(tbl), tbl)

    # Rows with an NA in any column are dropped.
    # NOTE(review): an earlier comment said NA padj values are set to 1 so that
    # both lists contain identical peaks; the code removes those rows instead --
    # confirm which behaviour is intended.
    tbl <- tbl[complete.cases(tbl), ]

    # Flag peaks that overlap PU.1 / CEBPA binding sites.
    peak_ranges <- GenomicRanges::makeGRangesFromDataFrame(string2chr(tbl$Gene))
    tbl$PU1_LSK <- peak_ranges %over% PU1_peaks$LSK
    tbl$PU1_GMP <- peak_ranges %over% PU1_peaks$GMP
    tbl$CEBPA_GMP <- peak_ranges %over% CEBPA_peaks$GMP

    tbl
  }, simplify = FALSE)
})
names(list.atacseq_contrasts) <- names(cons)

# Cutoffs used to call a region differential in the MA-plots and below.
padj_cut <- 0.05
lfc_cut <- log2(2)

# URE vs WT within each cell type, once for all peaks and once restricted to
# peaks overlapping a PU.1 peak of the same cell type.
lsk <- list.atacseq_contrasts$URE_LSK_vs_WT_LSK$greaterAbs
gmp <- list.atacseq_contrasts$URE_GMP_vs_WT_GMP$greaterAbs

lskpu1 <- lsk[lsk[["PU1_LSK"]], , drop = FALSE]
gmppu1 <- gmp[gmp[["PU1_GMP"]], , drop = FALSE]

# MA-plot of one contrast. Points are binned (geom_bin2d) rather than drawn
# individually, which keeps the file size small.
#
# Regions are classified as "up", "down" or "ns" using the cutoffs `padj_cut`
# and `lfc_cut`, which are read from the enclosing environment. The legend
# shows each category with its number of regions, ordered up / down / ns.
#
# @param x data.frame with the columns baseMean, log2FoldChange and padj.
# @return A ggplot object.
do_maplot <- function(x) {
  category_colors <- c(up = "firebrick", down = "dodgerblue", ns = "grey")

  x2 <- x %>%
    mutate(category = case_when(
      padj < padj_cut & log2FoldChange > lfc_cut ~ "up",
      padj < padj_cut & log2FoldChange < -lfc_cut ~ "down",
      TRUE ~ "ns"
    )) %>%
    group_by(category) %>%
    dplyr::mutate(categoryN = paste(category, paste0("(", n(), ")"))) %>%
    dplyr::ungroup()

  # Only categories that actually occur get a legend entry. Colours are bound
  # to the legend labels by name, so a missing category (e.g. no "up" regions)
  # cannot shift the colours of the remaining ones.
  present <- intersect(names(category_colors), x2$category)
  legend_labels <- x2$categoryN[match(present, x2$category)]
  legend_colors <- stats::setNames(unname(category_colors[present]), legend_labels)
  x2$categoryN <- factor(x2$categoryN, levels = legend_labels)

  ggplot(x2, aes(x = baseMean, y = log2FoldChange, color = categoryN, fill = categoryN)) +
    geom_bin2d(bins = 100, size = .5) +
    scale_color_manual(values = legend_colors, name = "") +
    scale_fill_manual(values = legend_colors, name = "") +
    theme(legend.position = "top") +
    ylim(c(-7, 7)) +
    guides(color = guide_legend(ncol = 1))
}

# Figure 6A-D: MA-plots for all peaks (A: LSK, B: GMP) and for the peaks that
# overlap a PU.1 peak (C: LSK, D: GMP). Wrapping the assignment in parentheses
# stores the plot and prints it.
(Figure_6A <- do_maplot(lsk))

(Figure_6B <- do_maplot(gmp))

(Figure_6C <- do_maplot(lskpu1))

(Figure_6D <- do_maplot(gmppu1))
```

## Motif enrichment

```{r atac_assignment}

# Change to download from journal upon publication
signatures <- openxlsx::read.xlsx(paste0(outdir, "/Dataset_EV2_raw.xlsx"))

# Transcript-to-gene map that also carries the TSS coordinate of every
# transcript, made from the GENCODE vM25 GTF.
tx2gene <- data.table::fread(paste0(rootdir, "/source_data/tx2gene.txt.gz"), data.table = FALSE)

# Rebuild the gene key as "<id>_<name>": the id is the part before the first
# underscore with everything from the first dot onwards removed, the name is
# the part after the last underscore.
gene_id <- gsub("\\..*", "", gsub("_.*", "", tx2gene$gene))
gene_name <- gsub(".*_", "", tx2gene$gene)
tx2gene$gene <- paste(gene_id, gene_name, sep = "_")
rm(gene_id, gene_name)

# Attach the coordinates to the signature genes and turn every TSS into a
# 1-bp range; all remaining columns are kept as metadata.
signature_genes_coords <- dplyr::left_join(x = signatures, y = tx2gene, by = "gene")
signature_genes_coords <- dplyr::select(signature_genes_coords, -length, -start, -end)
signature_genes_coords <- GenomicRanges::makeGRangesFromDataFrame(
  signature_genes_coords,
  start.field = "TSS",
  end.field = "TSS",
  keep.extra.columns = TRUE
)

# Assign signature TSSs to differential ATAC-seq regions within 10kb if they
# follow the same "pattern" as the gene expression changes.
use_width <- 10000

# For each of the four signatures, returns a SimpleList with
#   regulatory_elements -- GRanges of the differential regions assigned to it
#   summary_pu1         -- counts of assigned regions with/without a PU.1 peak
#   summary_cebpa       -- counts of assigned regions with/without a CEBPA peak
REs <- lapply(stats::setNames(nm = paste0("signature", 1:4)), function(sig) {
  tss <- signature_genes_coords[signature_genes_coords$signature == sig]

  # Signatures 1 and 4 use the URE_GMP vs WT_LSK contrast, signatures 2 and 3
  # use URE_LSK vs WT_GMP.
  tested <- switch(sig,
    signature1 = ,
    signature4 = list.atacseq_contrasts$URE_GMP_vs_WT_LSK$greaterAbs,
    signature2 = ,
    signature3 = list.atacseq_contrasts$URE_LSK_vs_WT_GMP$greaterAbs
  )

  # Signatures 1 and 2 take regions with a negative fold change, signatures 3
  # and 4 regions with a positive fold change.
  is_signif <- tested$padj < padj_cut
  if (sig %in% c("signature1", "signature2")) {
    has_direction <- tested$log2FoldChange < -lfc_cut
  } else {
    has_direction <- tested$log2FoldChange > lfc_cut
  }
  differential <- tested[is_signif & has_direction, ]

  # Replace the peak-name column by its parsed coordinates and build a GRanges
  # that keeps the statistics and overlap flags as metadata.
  differential <- GenomicRanges::makeGRangesFromDataFrame(
    cbind(string2chr(differential$Gene), differential[, -1]),
    keep.extra.columns = TRUE
  )

  # Assign regions to signature TSS.
  # NOTE(review): fix = "start" anchors each window at the TSS, so the window
  # covers use_width bp to one side only rather than +/- use_width -- confirm
  # that this is the intended meaning of "within 10kb".
  tss_windows <- GenomicRanges::resize(tss, fix = "start", width = use_width)
  assigned <- IRanges::subsetByOverlaps(differential, tss_windows)

  SimpleList(
    regulatory_elements = assigned,
    # Overlap of assigned regions with PU.1
    summary_pu1 = data.frame(
      signature = sig,
      celltype = c("LSK", "GMP"),
      bound = c(sum(assigned$PU1_LSK), sum(assigned$PU1_GMP)),
      unbound = c(sum(!assigned$PU1_LSK), sum(!assigned$PU1_GMP))
    ),
    # Overlap of assigned regions with CEBPa
    summary_cebpa = data.frame(
      signature = sig,
      celltype = c("GMP"),
      bound = c(sum(assigned$CEBPA_GMP)),
      unbound = c(sum(!assigned$CEBPA_GMP))
    )
  )
})

# Collect the assigned regions per signature. Their sequences are extracted
# further below because findMotifs.pl is run on sequences (fasta files) here
# rather than on coordinates.
REs_regions <- lapply(REs, function(re) re$regulatory_elements)

# Background: regions from the DESeq2 "lessAbs" test, i.e. regions that were
# significantly not changing. Only contrasts whose name contains both "WT" and
# "URE" are considered, and a region must pass in every one of them (intersect).
wt_vs_ure_contrasts <- grep(
  "URE",
  grep("WT", names(list.atacseq_contrasts), value = TRUE),
  value = TRUE
)
unchanged_per_contrast <- lapply(wt_vs_ure_contrasts, function(contrast_name) {
  less_abs <- list.atacseq_contrasts[[contrast_name]]$lessAbs
  rownames(less_abs[less_abs$padj < 0.05, ])
})
REs_regions$background <- makeGRangesFromDataFrame(
  string2chr(Reduce(intersect, unchanged_per_contrast))
)

# Output directory for the fasta files used in motif scanning.
motifdir <- paste0(outdir, "/motifs/")

if (!dir.exists(motifdir)) {
  dir.create(motifdir, recursive = TRUE)
}

# Write one fasta file per region set ("<set>_REs.fa"). Each sequence is named
# "<seqname>:<start>-<end>" after the region it was extracted from.
invisible(lapply(names(REs_regions), function(set_name) {
  regions <- REs_regions[[set_name]]
  sequences <- getSeq(BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10, regions)
  names(sequences) <- paste(
    seqnames(regions),
    paste(start(regions), end(regions), sep = "-"),
    sep = ":"
  )

  Biostrings::writeXStringSet(x = sequences, filepath = paste0(motifdir, "/", set_name, "_REs.fa"))
  NULL
}))

# NOTE(review): GRCm38 is not created anywhere in this document; presumably it
# comes from runStartup.R -- confirm.
rm(GRCm38)

# Hand the output directory and the number of workers to the bash chunk below.
Sys.setenv(MOTIFDIR = motifdir, CORES = mc_workers)
```

## Run Homer findMotifs.pl

The output of this is Figure 6 E-H, arranged manually as a table in Inkscape.

```{bash run_homer, eval=params$run_homer}

# This is how the files were named that were used for enrichment analysis
# $ ls *.fa
# $ background_REs.fa  signature1_REs.fa  signature2_REs.fa  signature3_REs.fa  signature4_REs.fa

# Stop right away if the motif directory is missing or MOTIFDIR is unset;
# otherwise everything below would run in the wrong directory.
cd "$MOTIFDIR" || exit 1

HOMERBIN="/software/homer/bin/"
export PATH="$HOMERBIN:$PATH"
BACKGROUND="background_REs.fa"

MOTIFS_REFERENCE="HOCOMOCOv11_core_MOUSE_mono_homer_format_0.0005.motif"

# Download the motif collection once; abort if the download fails.
if [[ ! -f "$MOTIFS_REFERENCE" ]]; then
  wget https://hocomoco11.autosome.ru/final_bundle/hocomoco11/core/MOUSE/mono/HOCOMOCOv11_core_MOUSE_mono_homer_format_0.0005.motif || exit 1
fi

# Run one enrichment per signature against the shared background. The glob is
# used directly (instead of parsing `ls` output); a pattern without any match
# is skipped.
for FOREGROUND in signature[1-4]_REs.fa; do
  [[ -f "$FOREGROUND" ]] || continue
  NAME="${FOREGROUND%.fa}"
  "${HOMERBIN}/findMotifs.pl" "$FOREGROUND" fasta "${NAME}_motifs" -fasta "$BACKGROUND" -mcheck "$MOTIFS_REFERENCE" -mknown "$MOTIFS_REFERENCE" -p "$CORES"
done
    
```

The output is what is provided in **Figure 6E-H**.

## Transcription factor overlap

```{r atac_tf}

# The background for the Chi2 tests is the entire regulome, i.e. every peak of
# the ATAC-seq count table, flagged for overlap with the TF peak sets.
regulome <- makeGRangesFromDataFrame(string2chr(rownames(dds_atac)))
regulome$PU1_LSK <- regulome %over% PU1_peaks$LSK
regulome$PU1_GMP <- regulome %over% PU1_peaks$GMP
regulome$CEBPA_GMP <- regulome %over% CEBPA_peaks$GMP

# PU.1: per-signature overlap tables from the REs object, followed by the
# regulome-wide counts as "background" rows.
pu1_per_signature <- do.call(rbind, lapply(REs, function(re) re$summary_pu1))
pu1_background <- data.frame(
  signature = "background",
  celltype = c("LSK", "GMP"),
  bound = c(sum(regulome$PU1_LSK), sum(regulome$PU1_GMP)),
  unbound = c(sum(!regulome$PU1_LSK), sum(!regulome$PU1_GMP))
)
REs_summary_PU1 <- remove_rownames(rbind(pu1_per_signature, pu1_background))

# CEBPA: same layout, GMP only.
cebpa_per_signature <- do.call(rbind, lapply(REs, function(re) re$summary_cebpa))
cebpa_background <- data.frame(
  signature = "background",
  celltype = c("GMP"),
  bound = c(sum(regulome$CEBPA_GMP)),
  unbound = c(sum(!regulome$CEBPA_GMP))
)
REs_summary_CEBPA <- remove_rownames(rbind(cebpa_per_signature, cebpa_background))

# Chi-squared test of TF binding in signature regions versus the regulome.
#
# Every non-background row of `summary_tbl` is compared, as a 2x2 table of
# bound/unbound counts, with the background row of the same cell type.
#
# @param summary_tbl data.frame with the columns signature, celltype, bound
#   and unbound. Rows with signature == "background" hold the regulome-wide
#   counts; exactly one such row is required per cell type.
# @return data.frame with the columns signature, celltype, bound, unbound,
#   pvalue (character, formatted to 2 significant digits), FDR (BH-adjusted
#   across all tests in `summary_tbl`) and direction. The background rows are
#   appended at the end with NA in pvalue, FDR and direction.
overlap_stats_vs_background <- function(summary_tbl) {
  is_background <- summary_tbl$signature == "background"
  background <- summary_tbl[is_background, ]
  targets <- summary_tbl[!is_background, ]
  stopifnot(nrow(targets) > 0)

  tested <- lapply(seq_len(nrow(targets)), function(i) {
    target <- targets[i, c("bound", "unbound")]

    # Match the background by cell type rather than by row position.
    bg <- background[background$celltype == targets$celltype[i], c("bound", "unbound")]
    stopifnot(nrow(bg) == 1)

    data.frame(
      signature = targets$signature[i],
      celltype = targets$celltype[i],
      bound = target$bound,
      unbound = target$unbound,
      pvalue = chisq.test(as.matrix(rbind(target, bg)))$p.value,
      fraction_target = target$bound / (target$bound + target$unbound),
      fraction_background = bg$bound / (bg$bound + bg$unbound)
    )
  }) %>% do.call(rbind, .)

  # Adjust the unrounded p-values; rounding is applied for display only.
  tested$FDR <- p.adjust(tested$pvalue, method = "BH")
  tested$pvalue <- vapply(tested$pvalue, format, character(1), digits = 2)

  # Enrichment/depletion is judged relative to the background: a signature is
  # "enriched" if a larger fraction of its regions is bound than in the regulome.
  tested$direction <- ifelse(
    tested$fraction_target > tested$fraction_background,
    "enriched", "depleted"
  )
  tested <- tested[, c("signature", "celltype", "bound", "unbound", "pvalue", "FDR", "direction")]

  background$pvalue <- NA
  background$FDR <- NA
  background$direction <- NA

  rbind(tested, background)
}

# Calculate overlap stats for PU.1
REs_summary_pvalue_PU1 <- overlap_stats_vs_background(REs_summary_PU1)

# Calculate overlap stats for CEBPA
REs_summary_pvalue_CEBPA <- overlap_stats_vs_background(REs_summary_CEBPA)

# Make the actual plots. It's quite custom to ensure that the geom_signif
# part looks somewhat aesthetic.

# Stack the PU.1 and CEBPA tables; the TF column tells them apart.
REs_summary_pvalue <-
  rbind(
    REs_summary_pvalue_PU1 %>% mutate(TF = "PU.1"),
    REs_summary_pvalue_CEBPA %>% mutate(TF = "CEBPA")
  )

# value = percentage of regions bound by the TF, computed per
# signature / cell type / TF combination. The grouping is only needed for that
# calculation and is removed again so that it does not leak into later steps.
REs_summary_pvalue_plotobj <-
  REs_summary_pvalue %>%
  group_by(signature, celltype, TF) %>%
  mutate(value = round(100 * bound / sum(bound, unbound), 2)) %>%
  ungroup() %>%
  mutate(signature = factor(signature, levels = c(paste0("signature", 1:4), "background")))

# Build the annotation table for geom_signif: one bracket per signature and
# cell type, spanning from the signature to "background" and labelled with
# the FDR.
#
# @param plotobj table with the columns signature, celltype, TF and FDR.
# @param tf value of the TF column to select.
# @param celltype_levels optional factor levels for the celltype column; if
#   NULL the column is left as it is in `plotobj`.
# @return data.frame with the columns start, end, celltype and label, sorted
#   by celltype.
build_signif_table <- function(plotobj, tf, celltype_levels = NULL) {
  signature_ids <- setdiff(unique(plotobj$signature), "background")

  lapply(signature_ids, function(sig) {
    hits <- plotobj %>% filter(signature == sig & TF == tf)

    celltype <- hits$celltype
    if (!is.null(celltype_levels)) {
      celltype <- factor(celltype, levels = celltype_levels)
    }

    data.frame(
      start = sig, end = "background", celltype = celltype,
      label = format(hits$FDR, digits = 2)
    )
  }) %>%
    do.call(rbind, .) %>%
    arrange(celltype)
}

signif_pu1 <- build_signif_table(REs_summary_pvalue_plotobj, "PU.1", celltype_levels = c("LSK", "GMP"))

# Bracket heights are hand-tuned, in row order: signature1-4 for LSK, then
# signature1-4 for GMP.
signif_pu1$height <- c(c(40, 50, 60, 70) + 2.5, 65, 75, 85, 95)

signif_cebpa <- build_signif_table(REs_summary_pvalue_plotobj, "CEBPA")

# Hand-tuned bracket heights for signature1-4 (GMP only).
signif_cebpa$height <- c(47.5, 57.5, 67.5, 77.5)

# Figure 6I: percentage of regulatory elements bound by PU.1 per signature and
# in the background, one panel per cell type, with FDR brackets on top.
Figure_6I <-
  ggplot(
    data = REs_summary_pvalue_plotobj %>%
      dplyr::filter(TF == "PU.1") %>%
      dplyr::mutate(celltype = factor(celltype, levels = c("LSK", "GMP"))),
    mapping = aes(x = signature, y = value)
  ) +
  geom_col() +
  geom_label(
    aes(label = paste0(round(value, 2), "%")),
    vjust = .5, color = "black", size = list.ggplot$textsize
  ) +
  suppressWarnings(
    geom_signif(
      data = signif_pu1 %>% mutate(label = paste0("FDR = ", label)),
      aes(xmin = start, xmax = end, annotations = label, y_position = height),
      manual = TRUE, textsize = 4, tip_length = .025
    )
  ) +
  facet_wrap(~celltype, ncol = 2) +
  guides(x = guide_axis(angle = 45)) +
  labs(x = "", y = "regulatory elements bound by PU.1 [%]") +
  # The axis runs to 120 to leave room for the brackets; the 120 tick is unlabelled.
  scale_y_continuous(breaks = seq(0, 120, 20), labels = c(seq(0, 100, 20), "")) +
  theme(strip.background = element_blank(), strip.text.x = element_blank())

Figure_6I

# Figure 6J: percentage of regulatory elements bound by CEBPa per signature and
# in the background, with FDR brackets on top.
Figure_6J <-
  ggplot(
    data = REs_summary_pvalue_plotobj %>%
      dplyr::filter(TF == "CEBPA") %>%
      dplyr::mutate(celltype = factor(celltype, levels = c("LSK", "GMP"))),
    mapping = aes(x = signature, y = value)
  ) +
  geom_col() +
  geom_label(
    aes(label = paste0(round(value, 2), "%")),
    vjust = .5, color = "black", size = list.ggplot$textsize
  ) +
  suppressWarnings(
    geom_signif(
      data = signif_cebpa %>% mutate(label = paste0("FDR = ", label)),
      aes(xmin = start, xmax = end, annotations = label, y_position = height),
      manual = TRUE, textsize = 4, tip_length = .025
    )
  ) +
  facet_wrap(~celltype, ncol = 2) +
  guides(x = guide_axis(angle = 45)) +
  labs(x = "", y = "regulatory elements bound by CEBPa [%]") +
  # The axis runs to 120 to leave room for the brackets; the 120 tick is unlabelled.
  scale_y_continuous(breaks = seq(0, 120, 20), labels = c(seq(0, 100, 20), "")) +
  theme(strip.background = element_blank(), strip.text.x = element_blank())

# Pad with an empty panel to force the same panel width as Figure_6I.
Figure_6J <- (Figure_6J | plot_spacer())

Figure_6J
```

## ChromVAR

Go the other way around: match motifs to differential regions and see which transcription factor motifs undergo the most opening and closing.

```{r chromVar}

# Assemble the chromVAR input from the full ATAC-seq dataset (all peaks, all samples).
# First add the variance-stabilized values (design-aware, blind = FALSE) as an extra assay.
assay(dds_atac, "vst") <- assay(DESeq2::vst(dds_atac, blind = FALSE))

# Peak coordinates are parsed from the row names of the count table and used as
# rowRanges; assays and sample annotation are taken over from dds_atac.
atac.gr <- makeGRangesFromDataFrame(string2chr(rownames(dds_atac)), starts.in.df.are.0based = FALSE)
atac.se <- SummarizedExperiment(
  assays = assays(dds_atac),
  colData = colData(dds_atac),
  rowRanges = atac.gr
)
# Drop unused factor levels from the sample annotation
colData(atac.se) <- droplevels.data.frame(colData(atac.se))

# Overwrite the "counts" assay with the vst values and then remove the separate
# "vst" assay, so that the assay named "counts" now holds vst values.
# Original rationale: chromVAR uses the first assay automatically.
# NOTE(review): this assumes "counts" is the first assay of atac.se -- confirm.
assay(atac.se, "counts") <- assay(atac.se, "vst")
assay(atac.se, "vst") <- NULL

# Match motifs using HOCOMOCO collection in jaspar format
hoco_url <- "https://hocomoco11.autosome.org/final_bundle/hocomoco11/core/MOUSE/mono/HOCOMOCOv11_core_MOUSE_mono_jaspar_format.txt"
hoco <- tempfile()
download.file(hoco_url, hoco)

# Warnings from readLines are suppressed as before.
lbl <- suppressWarnings(readLines(hoco))

# The parser below expects records of exactly five lines: a ">" header followed
# by four tab-separated rows (A, C, G, T). Check that layout up front, so that
# an empty download or a shifted file fails with a clear message instead of
# being parsed into wrong matrices.
if (length(lbl) < 5) {
  stop("HOCOMOCO motif file is empty or truncated: ", hoco_url, call. = FALSE)
}
record_starts <- seq(1, length(lbl) - 4, by = 5)
if (!all(startsWith(lbl[record_starts], ">"))) {
  stop("HOCOMOCO motif file is not made of 5-line records starting with '>': ", hoco_url, call. = FALSE)
}

# One PFMatrix per record; the motif ID/name is the header without the leading
# ">" and without everything from "_MOUSE" onwards.
motifs.hocomoco <- lapply(record_starts, function(x) {
  nm <- gsub(">", "", gsub("_MOUSE.*", "", lbl[x]))
  sp <- strsplit(lbl[(x + 1):(x + 4)], "\t")
  mt <- lapply(sp, function(y) t(as.matrix(as.numeric(y)))) %>% do.call(rbind, .)
  rownames(mt) <- c("A", "C", "G", "T")
  PFMatrix(ID = nm, name = nm, profileMatrix = mt)
}) %>% as(., "SimpleList")

names(motifs.hocomoco) <- unlist(lapply(motifs.hocomoco, function(x) x@ID))

# Add the GC content of every peak, then record which motif matches which peak.
atac.se <- chromVAR::addGCBias(
  object = atac.se,
  genome = BSgenome.Mmusculus.UCSC.mm10
)
motifs.jaspar.matched <- motifmatchr::matchMotifs(
  motifs.hocomoco,
  rowRanges(atac.se),
  genome = BSgenome.Mmusculus.UCSC.mm10
)

# Per-motif, per-sample deviations; the scores are stored as a data.frame with
# the motif name in the "Gene" column.
chromvar.dev <- chromVAR::computeDeviations(
  object = atac.se,
  annotations = motifs.jaspar.matched
)
chromvar.dev.score <- rownames_to_column(
  data.frame(chromVAR::deviationScores(chromvar.dev)),
  "Gene"
)

# Figure 7A: deviation scores of the PU.1 (Spi1), Runx1 and Cebpa motifs per
# sample, split by cell type and coloured by genotype; the crossbar marks the
# median of each group.
Figure_7A <-
  reshape2::melt(chromvar.dev.score) %>%
  # Derive group, cell type and genotype from the sample name
  mutate(group = factor(gsub("_rep.*", "", variable), levels = c("WT_LSK", "URE_LSK", "WT_GMP", "URE_GMP"))) %>%
  mutate(celltype = factor(gsub(".*_", "", group), levels = c("LSK", "GMP"))) %>%
  mutate(genotype = factor(gsub("_.*", "", group), levels = c("WT", "URE"))) %>%
  filter(grepl("RUNX1|SPI1|CEBPA", Gene)) %>%
  # Display names: title case, with Spi1 shown as PU.1
  mutate(Gene = str_to_title(Gene)) %>%
  mutate(Gene = gsub("Spi1", "PU.1", Gene)) %>%
  mutate(Gene = factor(Gene, c("PU.1", "Runx1", "Cebpa"))) %>%
  ggplot(aes(x = celltype, y = value, color = genotype)) +
  geom_point(size = 2, position = position_jitter(seed = 1, width = .1)) +
  stat_summary(
    fun = "median", fun.min = "median", fun.max = "median",
    size = 0.25, geom = "crossbar", show.legend = FALSE
  ) +
  facet_wrap(~Gene, scales = "free") +
  guides(x = guide_axis(angle = 45)) +
  labs(x = "", y = "deviation score") +
  scale_color_manual(name = "", values = list.ggplot$genotype_colors) +
  theme(legend.position = "top")

Figure_7A
```