-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path1. CAGEfightR.R
263 lines (198 loc) · 12 KB
/
1. CAGEfightR.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
#### CAGEfightR core analysis
#### Axel Thieffry - November 2020
set.seed(42)
library(tidyverse)
library(tidylog)
library(magrittr)
library(stringr)
library(reshape2)
library(CAGEfightR)
library(RColorBrewer)
library(BiocParallel)
library(TeMPO)
register(MulticoreParam(workers=6))
library(TxDb.Athaliana.BioMart.plantsmart28)
txdb <- TxDb.Athaliana.BioMart.plantsmart28
library(BSgenome.Athaliana.TAIR.TAIR9)
library(org.At.tair.db)
odb <- org.At.tair.db
options(scipen=10) # disable scientific notation
'%!in%' <- function(x,y)!('%in%'(x,y))
'select' <- dplyr::select
'rename' <- dplyr::rename
# 1. PREPARATION ####
# -------------------
myseqinfo <- readRDS('data/RDS files/myseqinfo.rds')
colData <- readRDS('data/RDS files/colData.rds')
# 2. MAKE CAGE CTSSs ####
# -----------------------
# 2a. Create CTSS object
bw_plus <- list.files(path='data/CAGE bws/bigwigs_raw', pattern='*.plus.chrWchr.bw', full.names=T) %>% BigWigFileList()
bw_minus <- list.files(path='data/CAGE bws/bigwigs_raw', pattern='*.minus.chrWchr.bw', full.names=T) %>% BigWigFileList()
names(bw_plus) <- names(bw_minus) <- colData$Name
# 2b. quantify CTSS across samples
CTSSs <- quantifyCTSSs(plusStrand=bw_plus, minusStrand=bw_minus, genome=myseqinfo, design=colData)
#### remove noise and normalize in TPM
# 2c. calculate support based on readcounts
CTSSs <- calcSupport(CTSSs, inputAssay='counts', outputColumn='CTSS_support', unexpressed=0)
# 2d. remove CTSS supported by less than 3 libraries (min group size)
CTSSs <- subset(CTSSs, CTSS_support >= 3)
# 2e. normalize to TPM: add supported_TPM assay as well as a 'totalTags' column to colData
CTSSs <- calcTPM(CTSSs, inputAssay='counts', outputAssay='TPM', outputColumn='totalTags')
# sanitycheck: total TPM is 1 million
assay(CTSSs, 'TPM') %>% colSums() %>% as.data.frame()
# 2f. add TPM support
CTSSs <- calcSupport(CTSSs, inputAssay='TPM', outputColumn='TPM_support', unexpressed=0)
# 2g. calculate pooled TPM: as score in the rowRanges
CTSSs <- calcPooled(CTSSs, inputAssay='TPM', outputColumn='pooled_TPM')
# 3. UNIDIRECTIONAL TAG CLUSTERING ####
# -------------------------------------
# 3a. clustering
score(rowRanges(CTSSs)) <- rowRanges(CTSSs)$pooled_TPM # score column is needed for the next step, use pooled_TPM signal
TCs <- clusterUnidirectionally(CTSSs)
# 3b. quantify TCs with CTSS counts
TCs <- quantifyClusters(CTSSs, clusters=TCs, inputAssay='counts', sparse=F)
# 3c. calculate TPM from CTSS counts
TCs <- calcTPM(TCs, totalTags='totalTags', inputAssay='counts', outputAssay='TPM')
# 3d. calculate TPM support
TCs <- calcSupport(TCs, inputAssay='TPM', outputColumn='TPM_support', unexpressed=1)
# 3e. keep only TCs with a minimum of 1 TPM in at least 3 samples (min group size)
TCs <- subset(TCs, TPM_support >= 3)
# 4. BIDIRECTIONAL TAG CLUSTERING ####
# ------------------------------------
# 4a. clustering
enhancers <- clusterBidirectionally(CTSSs, window=251)
# 4b. calculate bidirectionality support (must have counts > 0 in both arms)
enhancers <- calcBidirectionality(enhancers, samples=CTSSs)
# 4c. plot enhancer bidirectionality support
mcols(enhancers) %>%
as.data.frame() %>%
ggplot(aes(x=as.factor(bidirectionality))) +
geom_bar(stat='count') +
geom_vline(xintercept=3.5, col='red', lty=2) +
labs(x='Support (libraries with bidirectional signal)', y='Enhancers',
title='Bidirectional support of enhancers',
subtitle='Sliding window length: 500bp\n') +
theme(aspect.ratio=1)
# 4d. keep only enhancers that are bidirectional in at least in 3 samples
enhancers <- subset(enhancers, bidirectionality >= 3)
# 4e. quantify enhancers (counts)
enhancers <- quantifyClusters(CTSSs, clusters=enhancers, inputAssay='counts')
# 4f. quantify enhancers (TPM)
enhancers <- calcTPM(enhancers, totalTags='totalTags', inputAssay='counts', outputAssay='TPM')
# 4g. calculate pooled TPM as a new column
enhancers <- calcPooled(enhancers, inputAssay='TPM', outputColumn='TPM_pooled')
# 5. BUILD CUSTOM TAIR10 ANNOTATION ####
# --------------------------------------
# 5a. Build sense annotation hierarchy (first to appear is higher priority)
# NB: using a GRangesList for the txModels will make assignTxType to not use any argument/option, so everything has to be build here
if(FALSE) {
Antis_tair <- transcripts(txdb) %>% invertStrand()
# make reverse strand (aka PROMPT regions, antisense to gene and up to 400bp upstream of TSS) for TAIR10
Reverse_tair <- suppressWarnings( promoters(txdb, upstream=400, downstream=0) %>% trim() %>% invertStrand() )
# build
custom_hierarchy_tair10 <- GRangesList('promoter' = suppressWarnings( promoters(txdb, upstream=100, downstream=100) %>% trim() %>% granges() ),
'fiveUTR' = fiveUTRsByTranscript(txdb) %>% unlist() %>% granges(),
'threeUTR' = threeUTRsByTranscript(txdb) %>% unlist() %>% granges(),
'CDS' = cds(txdb) %>% granges(),
'exon' = exons(txdb) %>% granges(),
'intron' = intronsByTranscript(txdb) %>% unlist() %>% granges(),
'proximal' = suppressWarnings( promoters(txdb, upstream=400, downstream=0) %>% trim() %>% granges() ),
'reverse' = Reverse_tair,
'antisense' = Antis_tair)
# fix its seqinfo
seqlevelsStyle(custom_hierarchy_tair10) <- seqlevelsStyle(myseqinfo)
seqlevels(custom_hierarchy_tair10) <- seqlevels(myseqinfo)
seqinfo(custom_hierarchy_tair10) <- myseqinfo
# save
saveRDS(custom_hierarchy_tair10, file='data/RDS files/TAIR10_custom_annotation_hierarchy.rds')
}
custom_hierarchy_tair10 <- readRDS('data/RDS files/TAIR10_custom_annotation_hierarchy.rds')
# 5b. Build antisense annotation hierarchy (first to appear is higher priority)
if(FALSE) {
antisense_tmp <- invertStrand(custom_hierarchy_tair10)
names(antisense_tmp) <- paste0('antisense_', names(antisense_tmp))
# merge with sense hierarchy (after, because order defines priority)
custom_annotation_hierarchy_extended_TAIR10 <- c(custom_hierarchy_tair10, antisense_tmp)
# remove redundant categories
data.frame('antisense_annot'=names(custom_annotation_hierarchy_extended_TAIR10))
custom_annotation_hierarchy_extended_TAIR10$reverse <- NULL # because it does not overlap the gene!!!!
custom_annotation_hierarchy_extended_TAIR10$antisense <- NULL # because it will be annotated
custom_annotation_hierarchy_extended_TAIR10$antisense_antisense <- NULL # because redundant
custom_annotation_hierarchy_extended_TAIR10$antisense_promoter <- NULL # because it will either be 5'UTR or reverse
custom_annotation_hierarchy_extended_TAIR10$antisense_reverse <- NULL # because reverse already captures them
custom_annotation_hierarchy_extended_TAIR10$antisense_proximal <- NULL # because reverse already captures them
# save
saveRDS(custom_annotation_hierarchy_extended_TAIR10, 'data/RDS files/TAIR10_custom_annotation_hierarchy_extended_antisense.rds')
}
custom_hierarchy_tair10_extended_antisense <- readRDS('data/RDS files/TAIR10_custom_annotation_hierarchy_extended_antisense.rds')
# 6. ANNOTATE CAGE TCs & enhancers ####
# -------------------------------------
# 6a. fix seqinfo of Arabidopsis transcripts database
seqlevelsStyle(txdb) <- seqlevelsStyle(myseqinfo)
seqlevels(txdb) <- seqlevels(myseqinfo)
# 6b. assign TX to CAGE TCs: sense TCs are up to 400bp upstream of the TX start will be assigned to that TX
TCs <- assignTxID(TCs, txModels=txdb, outputColumn='txID', swap='thick', upstream=400, downstream=0)
# 6c. assign TxType based on custom hierarchical annotation
TCs <- assignTxType(TCs, txModels=custom_hierarchy_tair10, outputColumn='txType', swap='thick', noOverlap='intergenic')
# 6d. assign extended antisense TxType
TCs <- assignTxType(TCs, txModels=custom_hierarchy_tair10_extended_antisense, outputColumn='txType_extended', swap='thick', noOverlap='intergenic')
# 6e. relevel TxType to somewhat follow a canonical gene structure
rowRanges(TCs)$txType %<>% factor(levels=c('reverse', 'proximal', 'promoter', 'fiveUTR', 'CDS', 'exon', 'intron', 'threeUTR', 'antisense', 'intergenic'))
rowRanges(TCs)$txType_extended %<>% factor(levels=c('reverse', 'proximal', 'promoter', 'fiveUTR', 'antisense_fiveUTR', 'CDS', 'antisense_CDS', 'exon', 'antisense_exon', 'intron', 'antisense_intron','threeUTR', 'antisense_threeUTR', 'antisense', 'intergenic'))
# 6f. annotate enhancers
enhancers <- assignTxType(enhancers, txModels=txdb, outputColumn='txType')
rowRanges(enhancers)$txType %<>% factor(levels=c('reverse', 'proximal', 'promoter', 'fiveUTR', 'CDS', 'exon', 'intron', 'threeUTR', 'antisense', 'intergenic'))
enhancers <- assignGeneID(enhancers, geneModels=txdb, outputColumn='geneID', upstream=0, downstream=0, swap='thick')
# 6g. only keep intronic and intergenic enhancers
enhancers %<>% subset(txType %in% c('intron', 'intergenic')) # 170 enhancers
# 6h. only keep enhancers on canonical chromosomes
seqlevels(enhancers, pruning.mode='coarse') <- paste0('Chr', 1:5) # 155
# 6i. save
saveRDS(TCs, 'data/RDS files/SE_TCs.rds')
saveRDS(enhancers, 'data/RDS files/SE_enhancers.rds')
# 7. CAGE TC SHAPE STATISTICS ####
# --------------------------------
# 7a. temporarily assign the pooled_TPM value to the score column as it will be needed for the next step:
score(rowRanges(CTSSs)) <- rowRanges(CTSSs)$pooled_TPM
TCs <- calcShape(TCs, pooled=CTSSs, outputColumn='shape_IQR', shapeFunction=shapeIQR, lower=0.1, upper=0.9)
TCs <- calcShape(TCs, pooled=CTSSs, outputColumn='shape_entropy', shapeFunction=shapeEntropy)
TCs <- calcShape(TCs, pooled=CTSSs, outputColumn='shape_mean', shapeFunction=shapeMean)
# 7b. remove tmp column
score(rowRanges(CTSSs)) <- NULL
# 8. ANNOTATION AT GENE-LEVEL ####
# --------------------------------
# 8a. normal sense genes
TCs <- assignGeneID(TCs, geneModels=txdb, outputColumn='geneID', upstream=400, downstream=0, swap='thick')
# 8b. antisense gene (must extend to the PROMPT region before inverting strand!)
extended_genes <- genes(txdb)
promoters_genes <- promoters(extended_genes, upstream=400, downstream=0) %>% trim()
start(extended_genes) <- ifelse(strand(extended_genes)=='+', start(promoters_genes) , start(extended_genes))
end(extended_genes) <- ifelse(strand(extended_genes)=='-', end(promoters_genes), end(extended_genes))
TCs <- assignGeneID(TCs, geneModels=invertStrand(extended_genes), outputColumn='geneID_anti', upstream=0, downstream=0, swap='thick')
# 8c. gene symbols
symbols <- mapIds(odb, keys=rowRanges(TCs)$geneID, keytype='TAIR', column='SYMBOL')
symbols_anti <- mapIds(odb, keys=rowRanges(TCs)$geneID_anti, keytype='TAIR', column='SYMBOL')
rowRanges(TCs)$symbol <- as.character(symbols)
rowRanges(TCs)$symbol_anti <- as.character(symbols_anti)
# 8d. Quantify expression at Gene-level
genelevel <- quantifyGenes(TCs, genes='geneID', inputAssay='counts')
genelevel <- calcTPM(genelevel, inputAssay='counts', outputAssay='TPM')
# 9. FILTERING TCs BASED ON COMPOSITION - to use for later call of DTUs ####
# --------------------------------------------------------------------------
# 9a. remove TSSs not belonging to any gene
intragenicTSSs <- subset(TCs, !is.na(geneID))
# 9b. calculate composition: the number of samples expressing TSSs above 10% of the total gene expression
intragenicTSSs <- calcComposition(intragenicTSSs, inputAssay='counts', outputColumn='composition', unexpressed=0.1, genes='geneID')
# 9c. subset for composition in a minimum of 3 libraries (smallest sample group size)
intragenicTSSs <- subset(intragenicTSSs, composition >= 3)
# 10. SAVE EVERYTHING ####
# ------------------------
saveRDS(CTSSs, file='data/RDS files/SE_CTSSs.rds')
saveRDS(TCs, file='data/RDS files/SE_TCs.rds')
saveRDS(enhancers, file='data/RDS files/SE_enhancers.rds')
saveRDS(genelevel, file='data/RDS files/SE_genelevel.rds')
saveRDS(intragenicTSSs, file='data/RDS files/SE_intragenicTSSs.rds')
# 10b. bedfiles
rowRanges(TCs) %>% export.bed('data/beds/TCs.bed')
rowRanges(enhancers) %>% export.bed('data/beds/enhancers.bed')