-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathptm.Rmd
550 lines (452 loc) · 21.6 KB
/
ptm.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
---
title: "The Growing Landscape of Protein Modifications"
author: "E. Keith Keenan & Matthew D. Hirschey"
date: "`r format(Sys.time(), '%B %d, %Y')`"
output:
pdf_document:
toc: yes
html_notebook:
df_print: paged
toc: yes
---
This is an [R Markdown](http://rmarkdown.rstudio.com) notebook accompanying a review on protein modifications. When you execute code within the notebook, the results appear beneath the code and Figures will be save to the working directory.
##Load libraries
```{r load_block, warning=FALSE, echo=TRUE}
library(tidyverse)
library(janitor)
library(viridis)
library(XML)
library(feather)
library(rmarkdown)
library(beepr) #long analysis; get some coffee, and comeback when ready
library(patchwork)
#clear environment
#rm(list=ls())
#print Session information for provenance and reproducibility
utils:::print.sessionInfo(sessionInfo()[-8])
#You can remove an item from sessionInfo(), which is a list with a class attribute, by printing the resulting object omitting one of the list items (omitted list of packages installed, but not loaded)
#Set theme
theme_set(theme_light())
```
##Figure 3a
Overall goal is to quanitfy known landscape of protein amino acids. Chose to get data from Uniprot, as a comprehensive and validated resouce containing data for human proteins.
```{r fig1}
ptm_raw <- read_tsv("https://www.uniprot.org/docs/ptmlist.txt", col_names = FALSE, skip = 48)
#skip 48 first lines which contain data file dictionary
#URL points to a datafile, to increase reproducibility; datafile is also downloaded 12/21/2018 and saved in working data directory in case URL link breaks
#alt
#ptm_raw <- read_tsv("data/ptmlist.txt", col_names = FALSE, skip = 48)
#make working df
ptm <- ptm_raw %>%
separate(X1, c("key", "value"), sep = 3) %>%
mutate(id = if_else(grepl("ID", key), value, NA_character_)) %>% #must call NA_char so that fill fxn works
fill(id) #need fill fxn to populate ids across all observations, so that spread can work
#clean more
ptm$key <- str_trim(ptm$key, side = "right") #use stringr pkg to remove white space *janitor works on colnames only
#drop rows, duplicate rows are causing problems with spread, and don't need them
ptm <- ptm %>%
filter(!key %in% c("//", "TR", "DR", "---"))
#This is code I used to ensure that there were no duplicates
#ptm <- ptm %>%
# unite(key_id, c("key", "id"), sep = "_", remove = FALSE)
#ptm_dup <- get_dupes(ptm, key_id)
#I check the ptm_dup df and made to sure to drop the keys that had more than one entry (immediate preceeding code chunk)
#spread data
ptm <- ptm %>%
spread(key, value) #not clever names, but appropriate
#gives a tibble of 645 observations, therefore 645 unique PTMs
#double check to see no duplicates
#get_dupes(ptm, id)
#more cleaning steps
ptm$MM <- as.numeric(ptm$MM)
ptm$MA <- as.numeric(ptm$MA)
ptm$KW <- str_replace(ptm$KW, "\\.", "") #need two \\ to mean literal "."
ptm$KW <- as.factor(ptm$KW)
ptm$FT <- str_trim(ptm$FT, side = "left") #use stringr pkg to remove white space
ptm$TG <- str_trim(ptm$TG, side = "left")
ptm$TG <- str_replace(ptm$TG, "\\.", "") #need two \\ to mean literal "."
ptm$KW <- fct_explicit_na(ptm$KW, na_level = "Other") #get rid of NAs in KW by making a factor
ptm <- ptm %>% select(-Cop, -Dis) #remove copyright and distribution columns
ptm$KW <- str_trim(ptm$KW, side = "both")
#a little bit of eda
count(ptm, FT, sort = TRUE)
#should I include crosslinks? Or just modifications?
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
summarize(n = n())
#This code snippet give me the total number of unique modifications, with CROSSLINK removed; total is 498. But I know the list doens't include 3 recently published modifications discovered by our group, plus a few others by YMZ (considering this a lag in DB updating), so OK to conclude over 500.
count(ptm, KW, sort = TRUE) #number of modifications by keyword
#a lot of glycoproteins!
#ptm %>%
# filter(FT == "MOD_RES") %>% #include only modified AAs, no cross links, no lipids, no glycoproteins?
# count(TG, sort = TRUE) %>% #target (TG) is exactly what I need
# mutate(TG = fct_reorder(TG, n)) %>%
# ggplot(aes(TG, n)) +
# geom_col() +
# coord_flip() +
# labs(x = "") +
# expand_limits(y = 40)
#commented this out because it only includes modified AAs; not sure if this is useful
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
count(TG, sort = TRUE)
kw_levels <-
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
count(KW, sort = TRUE) %>%
slice(1:10) %>%
filter(!is.na(KW)) %>%
pull(KW)
#ptm$KW <- factor(ptm$KW, levels = kw_levels)
fig3a <-
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
ggplot() +
geom_bar(aes(fct_rev(fct_infreq(TG, ordered = TRUE)), fill = fct_infreq(KW) %>% fct_lump_n(10)), color = "black") +
coord_flip() +
labs(x = "", y = "Count") +
expand_limits(y = 80) +
scale_fill_viridis(discrete = TRUE,
direction = -1,
option = "viridis",
name = "Most Frequent \nModification Class") +
NULL
#save plot
ggsave("output/fig3a.pdf", plot = last_plot(), dpi = 600)
#save ptm
#save(ptm, file=here::here("data", "ptm.RData"))
```
```{r}
#####################################################
#pull
ptm_vec <- ptm %>%
filter(!is.na(id)) %>%
filter(!FT == "CROSSLNK") %>%
select(id, AC) %>%
filter(str_detect(id, "Blocked amino end", negate = TRUE)) %>%
filter(str_detect(id, "\\.\\.\\.", negate = TRUE)) %>%
#tidyr::separate(id, into = c("id", "tmp"), sep = "\\([:alpha:]{3}\\)$") %>%
distinct(id) %>%
pull(id)
ptm_vec <- str_trim(ptm_vec, side = "both")
save(ptm_vec, file=here::here("data", "ptm_vec.RData"))
#####################################################
```
```{r}
#Run pir.Rmd first to generate pir_sys
ptm_dr <- ptm_raw %>%
separate(X1, c("key", "value"), sep = 3) %>%
mutate(id = if_else(grepl("ID", key), value, NA_character_)) %>% #must call NA_char so that fill fxn works
fill(id)
#clean more
ptm_dr$key <- str_trim(ptm_dr$key, side = "right") #use stringr pkg to remove white space *janitor works on colnames only
ptm_dr <- ptm_dr %>%
filter(key == "ID" | str_detect(value, "RESID") | key == "AC") %>%
spread(key, value) %>%
select(ID, AC, DR) %>%
rename(DR_ID = ID)
ptm_dr$DR <- str_trim(ptm_dr$DR, side = "left")
ptm_dr$DR <- str_remove(ptm_dr$DR, "RESID\\;\\s")
ptm_dr$DR <- str_remove(ptm_dr$DR, "\\.")
ptm_dr$AC <- str_trim(ptm_dr$AC, side = "left")
ptm$AC <- str_trim(ptm$AC, side = "left")
ptm <- ptm %>%
left_join(ptm_dr, by = "AC")
ptm <- ptm %>%
left_join(pir_sys, by = c("DR" = "id"))
save(ptm, file=here::here("data", "ptm.RData"))
```
##Figure 3b
Goal is to determine how these modifications are distributed; thought it'd be interesting to visualize by average added mass (MA) to a protein, with several small changes in molecular mass, with some very large additions of mass
```{r fig2}
fig3b <- ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
ggplot() +
#geom_point(aes(x = MA, y =0, color = fct_lump(KW, 0)), shape = "|", size = 15, alpha = 1/2) +
geom_density(aes(x = MA, ..scaled.., color = fct_lump(KW,0))) +
geom_rug(aes(x = MA, y = 0, color = fct_lump(KW,0)), sides = "b", alpha = 1/2, position = "jitter", size = 1) +
labs(x = "Mass Appended (Da)", y = "Distribtution of Modifications (scaled)") +
scale_color_viridis(discrete = TRUE, direction = 1) +
scale_y_continuous(limits = c(0,1)) +
theme(legend.position = "") +
NULL
#save plot
ggsave("output/fig3b.pdf", plot = last_plot(), width = 5, height = 5, dpi = 600)
#the reason some average masses (MA) are so abundant is because you find the same modifications across several AAs, thereby weighting some more
#NB several glycans and lipids are variable masses, and therefore are entered as NA, so not reflected in the graph
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
count(KW, sort = TRUE)
#combine figures for editor
fig3ab <-
fig3a + fig3b +
plot_annotation(tag_levels = 'A')
ggsave("output/fig3ab.pdf", plot = fig3ab, width = 8, height = 4, dpi = 600)
```
##AA Analyses
###Lysine Analysis
In this code chunk, the goal is to count and summarize lysine modifications.
```{r lysine}
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
filter(TG == "Lysine") %>%
count(ID, sort = TRUE)
#code chunk to make a tibble that is easy to view all attributes; no need to save as an object in environment
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
filter(TG == "Lysine") %>%
arrange(MA) #sorts by mass, low to high
```
###Cysteine Analysis
In this code chunk, the goal is to count and summarize cysteine modifications. Counted 57 (as of Feb 2019), however does not include 3 published modifiations: succination, 2,3-dicarboxylpropylation (i.e. itaconylation), or s-acetylation, so OK to conclude 60, at least.
```{r cysteine}
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
filter(TG == "Cysteine") %>%
count(ID, sort = TRUE)
#code chunk to make a tibble that is easy to view all attributes; no need to save as an object in environment
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
filter(TG == "Cysteine") %>%
arrange(MA) #sorts by mass, low to high
```
###Serine Analysis
In this code chunk, the goal is to count and summarize serine modifications. Counted 70 (as of Feb 2019).
```{r serine}
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
filter(TG == "Serine") %>%
count(ID, sort = TRUE)
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
filter(TG == "Threonine") %>%
count(ID, sort = TRUE)
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
filter(TG == "Tyrosine") %>%
count(ID, sort = TRUE)
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
filter(TG == "Serine") %>%
filter(str_detect(CF, "P")) %>%
count(ID, sort = TRUE)
#13 serine modifications contain phosphate (12 carbon-phosphate, 1 phosphate only)
#code chunk to make a tibble that is easy to view all attributes; no need to save as an object in environment
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
filter(TG == "Serine") %>%
arrange(MA) #sorts by mass, low to high
```
###Phenylalanine Analysis
In this code chunk, the goal is to count and summarize phenylalanine modifications. Counted 5 (as of Feb 2019); 1?
```{r phenylalanine}
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
filter(TG == "Phenylalanine") %>%
arrange(MA) #sorts by mass, low to high
```
###Protein Backbone Analysis
In this code chunk, the goal is to count and summarize backbone modifications. First look at backbone alone; next look at the part of the protein where these are ascribed; then look at distribution of all backbone modifications on amino acids (glycine is the most); but, these are all n- or c-term modifications; if you look at protien core modifications, these are all serine/threonine/tyrosine and cysteine.
```{r backbone}
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
filter(str_detect(PA, "backbone")) %>%
arrange(MA) #sorts by mass, low to high
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
filter(str_detect(PA, "backbone")) %>%
count(PA, sort = TRUE)
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
filter(str_detect(PA, "backbone")) %>%
count(PP, sort = TRUE)
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
filter(str_detect(PA, "backbone")) %>%
count(TG, sort = TRUE)
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
filter(str_detect(PA, "backbone")) %>%
filter(TG == "Glycine")
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
filter(str_detect(PA, "backbone")) %>%
filter(str_detect(PP, "core")) %>%
count(TG, sort = TRUE)
ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
filter(str_detect(PA, "backbone")) %>%
filter(str_detect(PP, "core")) %>%
arrange(TG)
```
##Figure 4
In this figure, the goal is to determine how many acyl-CoA species have been measured
Import metabolite data from HMDB (http://www.hmdb.ca/downloads); all metabolites http://www.hmdb.ca/system/downloads/current/hmdb_metabolites.zip
```{r import_fig4, eval=FALSE, include=FALSE}
# Parse the XML files
proteins_raw <- xmlToDataFrame("data/hmdb_proteins.xml")
metabolites_raw <- xmlToDataFrame("data/hmdb_metabolites.xml")
#use feather next time here
save (temp) because extracting XML took a long time for a 4GB XML file!
save(proteins_raw, file = "data/proteins_raw.Rda")
save(metabolites_raw, file = "data/metabolites_raw.Rda")
#Commented out parsing and saving code, but then set this code chunk to eval/include = FALSE, so re-running the entire code fresh will simply skip this chunk, and load the raw data files in the next chunk as a df
```
```{r reload_fig4}
#reload
load("data/proteins_raw.Rda")
load("data/metabolites_raw.Rda")
```
```{r clean_fig4}
#as.X
proteins_raw <- as_tibble(proteins_raw)
metabolites_raw <- as_tibble(metabolites_raw)
metabolites_raw$average_molecular_weight <- as.numeric(metabolites_raw$average_molecular_weight)
#clean
metabolites <- metabolites_raw %>%
select(one_of("accession", "name", "average_molecular_weight", "chemical_formula", "smiles", "normal_concentrations")) %>%
clean_names() %>%
remove_empty("rows")
```
```{r fig4}
#count CoAs
CoA <- metabolites %>%
filter(str_detect(name, "CoA")) %>%
mutate(average_molecular_weight_noCoA = round(average_molecular_weight - 767.534, 2)) %>% #substract weight of CoA for appended mass
arrange(average_molecular_weight_noCoA)
CoA <- CoA %>% #number of carbons
mutate(carbon_num = str_extract(chemical_formula, "C\\d+")) %>% #extract C then digit, then one or more
mutate(carbon_num = str_extract(carbon_num, "\\d+")) %>% #to extract digit only
mutate(carbon_num = as.numeric(carbon_num)) %>% #numeric, to do substraction next
mutate(carbon_num_acyl = carbon_num - 21) %>% #remove number of carbons in CoA alone, to get acyls
slice(-1:-6) %>% #typos in the dataset
arrange(carbon_num_acyl) %>%
mutate (type = "CoA")
CoA <- CoA %>% #number of oxygens
mutate(o2_num = str_extract(chemical_formula, "O\\d+")) %>% #extract C then digit, then one or more
mutate(o2_num = str_extract(o2_num, "\\d+")) %>% #to extract digit only
mutate(o2_num = as.numeric(o2_num)) %>% #numeric, to do substraction next
mutate(o2_num_acyl = o2_num - 16) %>% #remove number of oxygens in CoA alone, to get acyls
slice(-1) #remove dephosphoCoA
CoA <- CoA %>%
separate(smiles, into = c("smiles1", "smiles2"), sep = "S", remove = FALSE, extra = "merge") #split smiles code on the Sulfur(S); needed to split this from the next code chunk, so it runs first; if not, then throws an error that 'smiles1' does not exist (length is zero)
#https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system
CoA <- CoA %>%
mutate(smiles_acyl = if_else(str_detect(smiles1, "P"), smiles2, smiles1)) #this is the code that pulls the 'acyl' from the split smiles data, and discards the CoA, becuase it contains the phosphate; this will allow me to look only at the possible protein appendage.
CoA <- CoA %>%
mutate(acyl_description = if_else(str_detect(smiles_acyl, "\\(O\\)\\=O"), "Carboxyl",
if_else(str_detect(smiles_acyl, "CO"), "Hydroxyl",
if_else(str_detect(smiles_acyl, "C\\(O\\)C"), "Hydroxyl",
if_else(str_detect(smiles_acyl, "C\\=C"), "Methylene",
if_else(str_detect(smiles_acyl, "C\\(\\=C\\)"), "Methylene",
if_else(str_detect(smiles_acyl, "CC\\=O"), "Aldehyde", #hardcode aldehyde
if_else(str_detect(smiles_acyl, "C\\=O"), "Straight", #hardcode formyl
if_else(str_detect(smiles_acyl, "C\\(C\\)"), "Branched",
if_else(str_detect(smiles_acyl, "CCC"), "Straight",
if_else(str_detect(smiles_acyl, "CC\\(\\=O\\)"), "Straight", #hardcode acetyl
"Other")))))))))))
CoA %>%
filter(str_detect(smiles_acyl, "N")) %>% #looking
count(smiles_acyl, sort = TRUE)
CoA %>%
count(average_molecular_weight, sort = TRUE) #code chunk to count acyl-CoAs, both total and discrete Molecular Weights
ggplot(CoA) +
geom_bar(aes(x = carbon_num_acyl, fill = acyl_description), color = "black", width = 0.8) +
labs(x = "Acyl-CoA Chain Length (# of carbons)", y = "Count") +
expand_limits(y = 40) +
scale_fill_viridis(discrete = TRUE, direction = -1, option = "magma", name = "Acyl Class") +
scale_x_continuous(breaks = c(0,2,4,6,8,10,12,14,16,18,20,22,24,26,28), limits = c(0,28)) +
NULL
#save plot
ggsave("output/fig4.pdf", plot = last_plot(), width = 5, height = 5, dpi = 600)
CoA %>%
count(average_molecular_weight_noCoA, sort = TRUE)
```
##Acyl-phosphate Analysis
Code chunk to count acyl-phosphates. 10 total counted, although strangely two are listed at 266 Da. Same or different?
```{r acyl_phosphates}
phosphate <- metabolites %>%
filter(str_detect(smiles, "C\\(=O\\)OP")) %>% #regex the smiles code for carbonyl-phosphate bond
arrange(average_molecular_weight) %>%
mutate (type = "Acyl Phosphate") %>% #duplicate entry!
mutate(pre_post = if_else(str_detect(smiles, "C\\(=O\\)OP"), "pre", "post")) %>%
separate(smiles, into = c("pre_smiles", "post_smiles"), sep = "P", remove = FALSE, extra = "merge") %>% #merge is to prevent discarding data
mutate(added_carbons = if_else(grepl("pre", pre_post), str_count(pre_smiles, "C"), str_count(post_smiles, "C")))
phosphate
phosphate %>%
count(average_molecular_weight, sort = TRUE) #code chunk to count acyl-CoAs, both total and discrete Molecular Weights
```
##Figure 5
In this figure, the goal is to determine how many reactive [human] metabolites there are and to determine how many are associated with PTMs
```{r fig5a}
#count thioesters
thioester <- metabolites %>%
filter(str_detect(smiles, "C\\(=O\\)S") | str_detect(smiles, "SC\\(=O\\)")) %>% #regex the smiles code for thioesters at either orientation
arrange(average_molecular_weight) %>%
mutate(type = "Thioester") %>%
mutate(pre_post = if_else(str_detect(smiles, "C\\(=O\\)S"), "pre", "post")) %>%
separate(smiles, into = c("pre_smiles", "post_smiles"), sep = "S", remove = FALSE, extra = "merge" ) %>% #merge is to prevent discarding data "Expected 2 pieces. Additional pieces discarded in 26 rows"
mutate(added_carbons = if_else(grepl("pre", pre_post), str_count(pre_smiles, "C"), str_count(post_smiles, "C"))) #entry errors
#because the smiles code has thioesters with orientations that could add carbon on either sides of the code, the first mutate adds a pre_post column to indicate which side will be added; then the separate will split at the thiol; then the last mutate will count the number of carbons in either the pre or post column, depending.
#these include all from CoA list, except "CoA-"
#anti_join(CoA, thioester, by = "name")
#semi_join(CoA, thioester, by = "name") leaves 355, which is one less than in the CoA df
#sum(str_count(thioester$smiles, "C\\(=O\\)S")) #346
#sum(str_count(thioester$smiles, "SC\\(=O\\)")) #80
#thioester$added_carbons <- as.factor(thioester$added_carbons)
match2 <- ptm %>%
filter(!FT == "CROSSLNK") %>% #omit AA cross links only
filter(TG == "Lysine") %>%
select(MA) %>%
round(2) %>%
distinct() %>%
pull()
match1 <- CoA %>%
select(average_molecular_weight_noCoA) %>%
round(2) %>%
distinct %>%
mutate(mod = if_else(average_molecular_weight_noCoA %in% match2, TRUE, FALSE)) %>%
left_join(CoA, by = "average_molecular_weight_noCoA")
match1 %>% count(mod, sort = TRUE)
fig5a <-
ggplot(match1) +
geom_histogram(aes(x = average_molecular_weight_noCoA, fill = mod), color = "black", binwidth = 25, center = 0, position = "stack", alpha = 0.9) +
labs(x = "Molecular Weight (Da)", y = "Possible modifications (#)") +
scale_fill_viridis(discrete = TRUE, direction = 1, option = "cividis", name = "Acyl-CoA Species", labels = c("Not Matching Lysine Modification", "Matching Lysine Modification")) +
NULL
#save plot
ggsave("output/fig5a.pdf", plot = last_plot(), width = 7, height = 5, dpi = 600)
```
```{r fig5b}
#count aldehydes
aldehyde <- metabolites %>%
filter(str_detect(name, "aldehyde")) %>% #str_detect for aldehydes give too many false positives
arrange(average_molecular_weight) %>%
mutate(type = "Aldehyde") %>%
mutate(added_carbons = str_count(smiles, "C"))
#Merge thioesters, phosphates, aldehydes
carbon <- full_join(thioester, phosphate) %>%
full_join(aldehyde) %>%
arrange(average_molecular_weight) %>%
select(-c("smiles", "pre_smiles", "post_smiles", "pre_post", "normal_concentrations"))
fig5b <-
ggplot(carbon) +
geom_bar(aes(x = added_carbons, fill = type), color = "black", width = 0.8) +
labs(x = "Predicted carbons appended", y = "Possible modifications (#)") +
scale_fill_viridis(discrete = TRUE, direction = -1, option = "inferno", name = "Class") +
NULL
#save plot
ggsave("output/fig5b.pdf", plot = last_plot(), width = 5, height = 5, dpi = 600)
#combine figures for editor
fig5ab <- fig5a / fig5b +
plot_annotation(tag_levels = 'A')
ggsave("output/fig5ab.pdf", plot = fig5ab, width = 8, height = 5, dpi = 600)
```
##Save final files
Code chunk to save files
```{r save}
write_delim(ptm, "output/table_s1.csv", delim = ",", na = "")
write_delim(metabolites, "output/table_s2.csv", delim = ",", na = "")
write_delim(carbon, "output/table_s3.csv", delim = ",", na = "")
beep(sound = 8) #because mario is awesome
```