-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path07_CTCF_Threshold.Rmd
186 lines (159 loc) · 7.75 KB
/
07_CTCF_Threshold.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
---
title: "CTCF threshold analysis"
author: "Mikhail Dozmorov"
date: "`r Sys.Date()`"
output:
pdf_document:
toc: no
html_document:
theme: cerulean
toc: yes
---
```{r setup, echo=FALSE, message=FALSE, warning=FALSE}
# Set up the environment
library(knitr)
opts_chunk$set(cache.path='cache/', fig.path='img/', cache=F, tidy=T, fig.keep='high', echo=F, dpi=100, warnings=F, message=F, comment=NA, warning=F, results='as.is', fig.width = 10, fig.height = 6) #out.width=700,
library(pander)
panderOptions('table.split.table', Inf)
set.seed(1)
```
# Libraries
```{r libraries}
library(tidyverse)
library(readxl)
library(writexl)
library(cowplot)
library(stringr)
library("ggsci")
library(scales)
# scales::show_col(pal_lancet("lanonc")(8))
mycols = pal_lancet("lanonc")(8)
# Color palette for the heatmap, https://www.nceas.ucsb.edu/~frazier/RSpatialGuides/colorPaletteCheatsheet.pdf
col3 <- colorRampPalette(c('blue', 'white', 'red'))(20)
# col3 <- colorRampPalette(c('blue', 'gray', 'yellow'))(20)
# col3 <- colorRampPalette(c('green', 'black', 'red'))(20)
# col3 <- colorRamps::green2red(n = 20)
library(GenomicRanges)
library(rtracklayer)
library(plyranges)
```
# Settings
```{r settings}
dir_data <- "/Users/mdozmorov/Documents/Data/GoogleDrive/CTCF.dev"
# Project folder path
dir_project <- "/Users/mdozmorov/Documents/Work/GitHub/CTCF.dev"
# Input files
fileNameIn1 <- file.path(dir_project, "RData", "BEDfiles", "hg38.MA0139.1.bed")
fileNameIn2 <- file.path(dir_project, "data", "GRCh38-CTCF.bed")
fileNameIn3 <- file.path(dir_project, "data", "UCSC_CTCF.tsv")
# Output files
fileNameOut1 <- file.path(dir_project, "results", "Figure_human_pvalues_threshold.svg")
fileNameOut2 <- file.path(dir_project, "results", "overlap_meme_below.csv")
fileNameOut3 <- file.path(dir_project, "results", "overlap_meme_above.csv")
```
# Load data
```{r data}
# Read in data
mtx <- read_tsv(fileNameIn1, col_names = FALSE, col_types = c("cddcdcdd"))
gr_meme <- GRanges(seqnames = mtx$X1, ranges = IRanges(start = mtx$X2, end = mtx$X3), strand = mtx$X6)
gr_meme$pval <- mtx$X8
gr_meme$qval <- mtx$X9
mtx1 <- read_tsv(fileNameIn2, col_names = FALSE, col_types = c("cddccc"))
gr_encode <- GRanges(seqnames = mtx1$X1, ranges = IRanges(start = mtx1$X2, end = mtx1$X3))
```
## Summary statistics
```{r}
print("Summary of full MEME p-value distribution")
summary(gr_meme$pval)
print(paste("Length of full MEME CTCF set:", length(gr_meme)))
print(paste("Length of ENCODE SCREEN CTCF set:", length(gr_encode)))
```
## Scanning the threshold
```{r}
# thresholds <- seq(from = 1E-8, to = 1E-4, length.out = 10) %>% rev()
thresholds <- c(1.00E-4, 5.00E-4, 1.00E-5, 5.00E-5, 1.00E-6, 5.00E-6, 1.00E-7, 5.00E-7, 1.00E-8)
# formatC(thresholds, format = "e")
gr_meme_pos_list <- vector(mode = "numeric", length = length(thresholds))
gr_meme_neg_list <- vector(mode = "numeric", length = length(thresholds))
for (i in 1:length(thresholds)) {
pval_cutoff <- thresholds[i]
gr_meme_cutoff <- gr_meme[gr_meme$pval < pval_cutoff]
gr_meme_overlaps <- countOverlaps(gr_meme_cutoff, gr_encode)
gr_meme_pos_list[i] <- sum(gr_meme_overlaps > 0) / length(gr_meme_overlaps)
gr_meme_neg_list[i] <- sum(gr_meme_overlaps == 0) / length(gr_meme_overlaps)
}
gr_summary <- data.frame(Threshold = thresholds, TP = gr_meme_pos_list, FP = gr_meme_neg_list)
mtx_to_plot <- pivot_longer(gr_summary, cols = c("TP", "FP"), names_to = "Type")
ggplot(mtx_to_plot, aes(x = Threshold, y = value, color = Type)) +
geom_line() +
scale_x_continuous(breaks = thresholds, labels = scientific) +
scale_y_continuous(breaks = seq(0, 1, 0.2)) +
scale_color_manual(values = mycols[1:2]) +
coord_trans(x ="log10") +
geom_vline(xintercept = 1E-6, linetype = "dashed", color = "black") +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
# axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
plot.title = element_text(hjust = 0.5),
legend.text = element_text(size = 6)) +
theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1)) +
xlab("FIMO p-value threshold") +
ylab("FP/TP rate")
ggsave(filename = fileNameOut1, width = 4.5, height = 1.5)
# dev.off()
# svg(filename = fileNameOut1, width = 7, height = 3)
# print(p)
# dev.off()
```
## Overlap with cell-specific sites
```{r}
# Separate FIMO CTCF peaks into above and below the threshold
pval_threshold <- 1E-6
gr_meme_above <- gr_meme %>% filter(pval <= pval_threshold) %>% keepStandardChromosomes(., pruning.mode = "tidy") %>% sort()
gr_meme_below <- gr_meme %>% filter(pval > pval_threshold) %>% keepStandardChromosomes(., pruning.mode = "tidy") %>% sort()
print(paste("Number of CTCFs above", pval_threshold, "p-value threshold", length(gr_meme_above)))
print(paste("Number of CTCFs below", pval_threshold, "p-value threshold", length(gr_meme_below)))
# Get all cell type-specific CTCF sites
# IDs of all CTCF experiments
mtx <- read_tsv(fileNameIn3, col_names = FALSE)
# Session for UCSC broser
mySession <- browserSession()
genome_id <- "hg38"
genome(mySession) <- genome_id
# Overlap counts
overlap_meme_above <- cbind()
overlap_meme_below <- cbind()
for (i in 158:nrow(mtx)) {
print(paste(i, "out of", nrow(mtx)))
table_name <- paste0("encTfChipPk", mtx[i, "X4"])
query <- ucscTableQuery(mySession, table = table_name)
gr_ctcf <- getTable(query) %>% makeGRangesFromDataFrame(.) %>% keepStandardChromosomes(., pruning.mode = "tidy") %>% sort()
overlap_meme_below <- cbind(overlap_meme_below, countOverlaps(gr_meme_below, gr_ctcf))
overlap_meme_above <- cbind(overlap_meme_above, countOverlaps(gr_meme_above, gr_ctcf))
gc()
write_csv(as.data.frame(overlap_meme_below), fileNameOut2)
write_csv(as.data.frame(overlap_meme_above), fileNameOut3)
}
overlap_meme_below <- read_csv(paste0(fileNameOut2, ".gz"))
dim(overlap_meme_below)
overlap_meme_above <- read_csv(paste0(fileNameOut3, ".gz"))
dim(overlap_meme_above)
# The proportion of CTCF sites detected at p-value > 1E-6 overlapping at least one experimentally detected cell type-specific CTCF sites was XX +/-YY.
# In contrast, this number for CTCF sites detected at p-value <= 1E-6 was XX+/-YY, suggesting higher proportion of false positives among less significant CTCF sites.
overlap_meme_above_summary <- rowSums(overlap_meme_above)
overlap_meme_below_summary <- rowSums(overlap_meme_below)
print(paste("Average number of times less significant CTCF sites overlap with experimental CTCF sites", mean(overlap_meme_below_summary), "+/-", sd(overlap_meme_below_summary)))
# [1] "Average number of times less significant CTCF sites overlap with experimental CTCF sites 7.43024371211658 +/- 29.2354917718536"
print(paste("Average number of times more significant CTCF sites overlap with experimental CTCF sites", mean(overlap_meme_above_summary), "+/-", sd(overlap_meme_above_summary)))
# [1] "Average number of times more significant CTCF sites overlap with experimental CTCF sites 120.151631212219 +/- 81.188988520227"
# Among less significant CTCF sites, XX% did not overlap any cell type-specific CTCF sites.
# In contrast, only YY% more significant sites overlap
print(paste("The proportion of less significant CTCF sites overlapping at least one experimental CTCF site", 1 - (sum(overlap_meme_below_summary == 0) / length(overlap_meme_below_summary)) ))
# [1] "The proportion of less significant CTCF sites overlapping at least one experimental CTCF site 0.218351650508075"
print(paste("The proportion of more significant CTCF sites overlapping at least one experimental CTCF site", 1 - (sum(overlap_meme_above_summary == 0) / length(overlap_meme_above_summary)) ))
# [1] "The proportion of more significant CTCF sites overlapping at least one experimental CTCF site 0.888007013981819"
# P-value of the differences
res <- wilcox.test(overlap_meme_above_summary, overlap_meme_below_summary)
res$p.value
```