-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path13. PROTEIN DOMAINS.R
164 lines (136 loc) · 7.05 KB
/
13. PROTEIN DOMAINS.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#### ROTEIN DOMAIN ANALYSIS
#### Axel Thieffry - December 2020
set.seed(42)
library(tidyverse)
library(tidylog)
library(ggforce)
library(magrittr)
library(stringr)
library(reshape2)
library(CAGEfightR)
library(RColorBrewer)
library(GenomicRanges)
library(SummarizedExperiment)
library(gprofiler2)
library(eulerr)
library(BiocParallel)
library(patchwork)
library(WriteXLS)
library(TxDb.Athaliana.BioMart.plantsmart28)
library(BSgenome.Athaliana.TAIR.TAIR9)
register(MulticoreParam(workers=6))
options(scipen=10) # disable scientific notation
'%!in%' <- function(x,y)!('%in%'(x,y))
'select' <- dplyr::select
'rename' <- dplyr::rename
'count' <- dplyr::count
'l' <- length
'h' <- head
# 1. READ INPUTS ####
# -------------------
universe <- readRDS('data/RDS files/universe_geneID.rds')$geneID
myseqinfo <- readRDS('data/RDS files/myseqinfo.rds')
idmapping <- readRDS('data/RDS files/id_mapping.rds')
idmapping2 <- readRDS('data/RDS files/id_mapping2.rds')
intragenicTCs <- readRDS('data/RDS files/SE_intragenicTSSs.rds')
deTCs <- readRDS('data/RDS files/DE_TCs.rds')
# 2. MAP PROTEIN DOMAINS TO TAIR10 ####
# -------------------------------------
# 2a. read TAIR10 protein domains & parse
# source: https://www.arabidopsis.org/download_files/Proteins/Domains/all.domains.txt
prot_domains <- read.csv2('data/ProteinDomains/TAIR10_all.domains', h=F, '\t', strip.white=T, blank.lines.skip=T, stringsAsFactors=F, na.strings=c('', 'NUL', 'NULL')) %>% as_tibble() %>% select(-V2)
# rename columns
colnames(prot_domains) <- c('tx_id', 'len', 'app', 'dom_id', 'dom_desc', 'start_dom', 'end_dom', 'evalue', 'date', 'interpro_id', 'interpro_name')
# clean a bit
prot_domains %<>% select(-len, -evalue, -date)
# get geneID from tx_id and re-order
prot_domains %<>% mutate('geneID'=str_remove(tx_id, '\\.[:digit:]+$')) %>% select(tx_id, geneID, everything())
# put 'no description' to NA
prot_domains %<>% mutate('dom_desc'=ifelse(dom_desc=='no description', NA, dom_desc))
# 2b. extract CDS regions from TAIR10 annotation
txdb <- TxDb.Athaliana.BioMart.plantsmart28
cds_regions <- cdsBy(txdb, 'tx', use.names=T)
# fix seqinfo of CDS regions
seqlevels(cds_regions) <- seqlevels(myseqinfo)
seqinfo(cds_regions) <- myseqinfo
# 2c. make IRanges from protein domains
prot_domains_gr <- with(prot_domains, IRanges(start=start_dom * 3, end=end_dom * 3))
mcols(prot_domains_gr) <- prot_domains
names(prot_domains_gr) <- mcols(prot_domains_gr)$dom_id
# 2d. map to genome
on_genome <- pmapFromTranscripts(x=prot_domains_gr, transcripts=cds_regions[mcols(prot_domains_gr)$tx_id])
mcols(on_genome) <- mcols(prot_domains_gr)
# 2e. sort and export
for_export <- on_genome
names(for_export) <- mcols(for_export)$dom_id
for_export <- for_export[width(for_export) > 0]
for_export <- for_export[elementNROWS(for_export) > 0]
export(for_export, 'data/beds/protein_domains_on_genome.bed')
# 3. ANNOTATE TCs WITH DOMAINS ####
# ---------------------------------
# 3a. flatten domains and remove those with negative width
flat_domains <- unlist(on_genome)
flat_domains$tx_id <- names(flat_domains)
flat_domains$geneID <- str_remove(flat_domains$tx_id, '\\.[:digit:]+$')
flat_domains <- subset(flat_domains, width > 0)
# 3b. TCs *in* domain
disruptive_TCs <- rowRanges(intragenicTCs)
hits <- findOverlaps(swapRanges(disruptive_TCs), flat_domains)
disruptive_TCs$in_domain <- extractList(flat_domains$geneID, as(hits, 'List'))
disruptive_TCs$in_domain <- mapply(is.element, disruptive_TCs$geneID, disruptive_TCs$in_domain)
# 3c. TC *downstream* domain
disruptive_TCs$downstream_domain <- follow(swapRanges(disruptive_TCs), flat_domains) # find ANY domain downstream of TC
disruptive_TCs$downstream_domain <- flat_domains$geneID[disruptive_TCs$downstream_domain] # substitute with GENEID of that domain
disruptive_TCs$downstream_domain <- with(disruptive_TCs, ifelse(geneID == downstream_domain, T, F)) # test if protein domain geneID is same as CAGE TC geneID
disruptive_TCs$downstream_domain <- ifelse(is.na(disruptive_TCs$downstream_domain), F, disruptive_TCs$downstream_domain) # change NAs to FALSE
# 3d. protein domain disruption: stats for text
disruptive_TCs %<>% subset(in_domain | downstream_domain)
l(disruptive_TCs) # 454 intragenic TCs
n_distinct(disruptive_TCs$geneID) # in 428 genes
sum(disruptive_TCs$in_domain) # 220 TCs in a domain
subset(disruptive_TCs, in_domain)$geneID %>% n_distinct() # for 216 unique genes
sum(disruptive_TCs$downstream_domain) # 339 TCs downstream a domain
subset(disruptive_TCs, downstream_domain)$geneID %>% n_distinct() # for 317 unique genes
# 3e. make dataframe version
disruptive_TCs_df <- as.data.frame(disruptive_TCs) %>%
rownames_to_column('TC_id') %>%
as_tibble()
# 3f. proportion of intragenic TCs that are disruptive, by txType category
left_join(disruptive_TCs_df$txType %>% table() %>% enframe(name='txType', value='disruptive_TCs'),
rowRanges(intragenicTCs)$txType %>% table() %>% enframe(name='txType',value='total_TCs'),
by='txType') %>%
subset(disruptive_TCs != 0) %>%
mutate('percent'=round(disruptive_TCs/total_TCs*100, 2))
# 4. DOMAIN-DISRUPTIVE TCs DE in TIMECOURSE ###
# ---------------------------------------------
# 4a. get DE TCs in timecourse & make wider version of direction
deTCs_timecourse <- deTCs %>% filter(coef %in% c('t10', 't3010', 't30'))
deTCs_timecourse_wide <- deTCs_timecourse %>%
select(coef, TC_id, direction) %>%
pivot_wider(names_from='coef', values_from='direction')
# 4b. add info to disruptive TCs
disruptive_TCs_df %<>% mutate('TC_DE_in_timecourse'=ifelse(TC_id %in% unique(deTCs_timecourse$TC_id), T, F))
disruptive_TCs_df %<>% left_join(deTCs_timecourse_wide, by='TC_id')
# 4c. stats
# number of disruptive TCs that are DE in timecourse
filter(disruptive_TCs_df, TC_DE_in_timecourse) %>% nrow() # 127 TCs
filter(disruptive_TCs_df, TC_DE_in_timecourse)$geneID %>% n_distinct() # in 125 genes
# 4d. GSEA on genes with induced, domain-disrupting CAGE TC
go_all <- filter(disruptive_TCs_df, TC_DE_in_timecourse) %>%
pull(geneID) %>%
unique() %>%
gost(organism='athaliana', significant=T, user_threshold=0.05, correction_method='fdr', custom_bg=universe, ordered_query=F)
go_all$result %>%
mutate('p_value_transformed'=-log(p_value, base=10)) %>%
slice_max(order_by=p_value_transformed, n=10, with_ties=F) %>%
ggplot(aes(x=10:1, y=p_value_transformed)) +
geom_col(fill='grey30', alpha=.5) +
geom_text(aes(label=term_name, y=.1), hjust=0, col='black') +
coord_flip() +
cowplot::theme_cowplot() + theme(axis.text.y=element_blank(), aspect.ratio=.75) +
scale_y_continuous(expand=c(0, 0)) +
labs(x='Top GO term', y='-log(adj.P.Value, base=10)', title='GSEA',
subtitle='Genes with domain-disrupting TC induced\nin flg22 treatment (N=125)')
# 4e. export and save
left_join(disruptive_TCs_df, select(idmapping2, -symbol), by='geneID') %>%
WriteXLS(ExcelFileName='data/excel_outputs/protein_domain_disruptive_TCs.xlsx', row.names=F, col.names=T, AdjWidth=T, BoldHeaderRow=T, verbose=T)