07_CTCF_Threshold.Rmd

---
title: "CTCF threshold analysis"
author: "Mikhail Dozmorov"
date: "`r Sys.Date()`"
output:
  pdf_document:
    toc: no
  html_document:
    theme: cerulean
    toc: yes
---

```{r setup, echo=FALSE, message=FALSE, warning=FALSE}
# Set up the environment
library(knitr)
opts_chunk$set(cache.path='cache/', fig.path='img/', cache=F, tidy=T, fig.keep='high', echo=F, dpi=100, warnings=F, message=F, comment=NA, warning=F, results='as.is', fig.width = 10, fig.height = 6) #out.width=700, 
library(pander)
panderOptions('table.split.table', Inf)
set.seed(1)
```

# Libraries

```{r libraries}
library(tidyverse)
library(readxl)
library(writexl)
library(cowplot)
library(stringr)
library("ggsci")
library(scales)
# scales::show_col(pal_lancet("lanonc")(8))
mycols = pal_lancet("lanonc")(8)
# Color palette for the heatmap, https://www.nceas.ucsb.edu/~frazier/RSpatialGuides/colorPaletteCheatsheet.pdf
col3 <- colorRampPalette(c('blue', 'white', 'red'))(20)
# col3 <- colorRampPalette(c('blue', 'gray', 'yellow'))(20)
# col3 <- colorRampPalette(c('green', 'black', 'red'))(20)
# col3 <- colorRamps::green2red(n = 20)
library(GenomicRanges)
library(rtracklayer)
library(plyranges)
```

# Settings

```{r settings}
dir_data <- "/Users/mdozmorov/Documents/Data/GoogleDrive/CTCF.dev"
# Project folder path
dir_project <- "/Users/mdozmorov/Documents/Work/GitHub/CTCF.dev"

# Input files
fileNameIn1 <- file.path(dir_project, "RData", "BEDfiles", "hg38.MA0139.1.bed")
fileNameIn2 <- file.path(dir_project, "data", "GRCh38-CTCF.bed")
fileNameIn3 <- file.path(dir_project, "data", "UCSC_CTCF.tsv")
# Output files
fileNameOut1 <- file.path(dir_project, "results", "Figure_human_pvalues_threshold.svg")
fileNameOut2 <- file.path(dir_project, "results", "overlap_meme_below.csv")
fileNameOut3 <- file.path(dir_project, "results", "overlap_meme_above.csv")
```

# Load data

```{r data}
# Read in data
mtx <- read_tsv(fileNameIn1, col_names = FALSE, col_types = c("cddcdcdd"))
gr_meme <- GRanges(seqnames = mtx$X1, ranges = IRanges(start = mtx$X2, end = mtx$X3), strand = mtx$X6)
gr_meme$pval <- mtx$X8
gr_meme$qval <- mtx$X9

mtx1 <- read_tsv(fileNameIn2, col_names = FALSE, col_types = c("cddccc"))
gr_encode <- GRanges(seqnames = mtx1$X1, ranges = IRanges(start = mtx1$X2, end = mtx1$X3))
```

## Summary statistics

```{r}
print("Summary of full MEME p-value distribution")
summary(gr_meme$pval)

print(paste("Length of full MEME CTCF set:", length(gr_meme)))
print(paste("Length of ENCODE SCREEN CTCF set:", length(gr_encode)))
```

## Scanning the threshold

```{r}
# thresholds <- seq(from = 1E-8, to = 1E-4, length.out = 10) %>% rev()
thresholds <- c(1.00E-4, 5.00E-4, 1.00E-5, 5.00E-5, 1.00E-6, 5.00E-6, 1.00E-7, 5.00E-7, 1.00E-8)
# formatC(thresholds, format = "e")

gr_meme_pos_list <- vector(mode = "numeric", length = length(thresholds))
gr_meme_neg_list <- vector(mode = "numeric", length = length(thresholds))
for (i in 1:length(thresholds)) {
  pval_cutoff <- thresholds[i]
  gr_meme_cutoff <- gr_meme[gr_meme$pval < pval_cutoff]
  gr_meme_overlaps <- countOverlaps(gr_meme_cutoff, gr_encode)
  gr_meme_pos_list[i] <- sum(gr_meme_overlaps > 0) / length(gr_meme_overlaps)
  gr_meme_neg_list[i] <- sum(gr_meme_overlaps == 0) / length(gr_meme_overlaps)
}

gr_summary <- data.frame(Threshold = thresholds, TP = gr_meme_pos_list, FP = gr_meme_neg_list)

mtx_to_plot <- pivot_longer(gr_summary, cols = c("TP", "FP"), names_to = "Type")
ggplot(mtx_to_plot, aes(x = Threshold, y = value, color = Type)) +
  geom_line() +
  scale_x_continuous(breaks = thresholds, labels = scientific) +
  scale_y_continuous(breaks = seq(0, 1, 0.2))   + 
  scale_color_manual(values = mycols[1:2]) +
  coord_trans(x ="log10") +
  geom_vline(xintercept = 1E-6, linetype = "dashed", color = "black") +
  theme_bw() +
  theme(panel.grid.major = element_blank(), 
      panel.grid.minor = element_blank(), 
      # axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
      plot.title = element_text(hjust = 0.5),
      legend.text = element_text(size = 6)) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1)) +
  xlab("FIMO p-value threshold") +
  ylab("FP/TP rate")

ggsave(filename = fileNameOut1, width = 4.5, height = 1.5)
# dev.off()
# svg(filename = fileNameOut1, width = 7, height = 3)
# print(p)
# dev.off()
```

## Overlap with cell-specific sites

```{r}
# Separate FIMO CTCF peaks into above and below the threshold
pval_threshold <- 1E-6
gr_meme_above <- gr_meme %>% filter(pval <= pval_threshold) %>% keepStandardChromosomes(., pruning.mode = "tidy") %>% sort()
gr_meme_below <- gr_meme %>% filter(pval >  pval_threshold) %>% keepStandardChromosomes(., pruning.mode = "tidy") %>% sort()
print(paste("Number of CTCFs above", pval_threshold, "p-value threshold", length(gr_meme_above)))
print(paste("Number of CTCFs below", pval_threshold, "p-value threshold", length(gr_meme_below)))
# Get all cell type-specific CTCF sites
# IDs of all CTCF experiments
mtx <- read_tsv(fileNameIn3, col_names = FALSE)
# Session for UCSC broser
mySession <- browserSession()
genome_id <- "hg38"
genome(mySession) <- genome_id
# Overlap counts
overlap_meme_above <- cbind()
overlap_meme_below <- cbind()
for (i in 158:nrow(mtx)) {
  print(paste(i, "out of", nrow(mtx)))
  table_name <- paste0("encTfChipPk", mtx[i, "X4"])
  query <- ucscTableQuery(mySession, table = table_name)
  gr_ctcf <- getTable(query) %>% makeGRangesFromDataFrame(.) %>% keepStandardChromosomes(., pruning.mode = "tidy") %>% sort()
  overlap_meme_below <- cbind(overlap_meme_below, countOverlaps(gr_meme_below, gr_ctcf))
  overlap_meme_above <- cbind(overlap_meme_above, countOverlaps(gr_meme_above, gr_ctcf))
  gc()
  write_csv(as.data.frame(overlap_meme_below), fileNameOut2)
  write_csv(as.data.frame(overlap_meme_above), fileNameOut3)
}

overlap_meme_below <- read_csv(paste0(fileNameOut2, ".gz"))
dim(overlap_meme_below)
overlap_meme_above <- read_csv(paste0(fileNameOut3, ".gz"))
dim(overlap_meme_above)

# The proportion of CTCF sites detected at p-value > 1E-6 overlapping at least one experimentally detected cell type-specific CTCF sites was XX +/-YY.
# In contrast, this number for CTCF sites detected at p-value <= 1E-6 was XX+/-YY, suggesting higher proportion of false positives among less significant CTCF sites.
overlap_meme_above_summary <- rowSums(overlap_meme_above)
overlap_meme_below_summary <- rowSums(overlap_meme_below)
print(paste("Average number of times less significant CTCF sites overlap with experimental CTCF sites", mean(overlap_meme_below_summary), "+/-", sd(overlap_meme_below_summary)))
# [1] "Average number of times less significant CTCF sites overlap with experimental CTCF sites 7.43024371211658 +/- 29.2354917718536"
print(paste("Average number of times more significant CTCF sites overlap with experimental CTCF sites", mean(overlap_meme_above_summary), "+/-", sd(overlap_meme_above_summary)))
# [1] "Average number of times more significant CTCF sites overlap with experimental CTCF sites 120.151631212219 +/- 81.188988520227"

# Among less significant CTCF sites, XX% did not overlap any cell type-specific CTCF sites.
# In contrast, only YY% more significant sites overlap
print(paste("The proportion of less significant CTCF sites overlapping at least one experimental CTCF site", 1 - (sum(overlap_meme_below_summary == 0) / length(overlap_meme_below_summary)) ))
# [1] "The proportion of less significant CTCF sites overlapping at least one experimental CTCF site 0.218351650508075"
print(paste("The proportion of more significant CTCF sites overlapping at least one experimental CTCF site", 1 - (sum(overlap_meme_above_summary == 0) / length(overlap_meme_above_summary)) ))
# [1] "The proportion of more significant CTCF sites overlapping at least one experimental CTCF site 0.888007013981819"

# P-value of the differences
res <- wilcox.test(overlap_meme_above_summary, overlap_meme_below_summary)
res$p.value
```