-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathproteinorthograph_dot.Rmd
154 lines (114 loc) · 6.74 KB
/
proteinorthograph_dot.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
---
title: "ProteinOrtho Abundance Graph"
author: "Nikeisha Caruana"
date: "11 January 2017"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r, include = FALSE}
rm(list=ls())
library(stringr)
library(dplyr)
library(splitstackshape)
library(tidyr)
library(ggplot2)
library(RColorBrewer)
options(stringsAsFactors = FALSE)
```
The code below merges two species ProteinGroups.txt files created by MaxQuant, with a ProteinOrtho file from the same two species in order to visualize the abundance of the orthologous proteins between the two species.
Read in protein orthofiles and trinotate annotation reports
```{r read, include = TRUE}
proteinortho <- read.csv("raw_data/proteinortho_data/slimeglyco_comparison.proteinortho", sep='\t', header = TRUE)
proteingroups_SL <- read.csv("raw_data/proteinGroups_PNGaseFSL.txt",sep='\t', header = TRUE)
proteingroups_SA <- read.csv("raw_data/proteinGroups_SA.txt",sep='\t', header = TRUE)
trinotateannotation_SL <- read.csv("raw_data/trinotate_annotation_report_SL.txt",sep='\t', header = TRUE)
trinotateannotation_SA <- read.csv("raw_data/trinotate_annotation_report_SA.txt",sep='\t', header = TRUE)
```
Expand protein groups so we have one row per protein.
```{r expand function, include = TRUE}
expand_groups <- function(protein_data){
group_list = lapply(1:nrow(protein_data),function(ri){
prot_ids = unlist(str_split(protein_data[ri,]$Protein.IDs,pattern = ";"))
prot_ids = str_match(prot_ids,"lcl\\|(.*)")[,2]
prot_ids = str_replace(prot_ids,"_m\\.",replacement = "\\|m\\.")
other_cols = protein_data[rep(ri,length(prot_ids)),]
data.frame(prot_ids,other_cols)
})
do.call(rbind,group_list)
}
```
Expand and clean up files
```{r expand and clean, include = TRUE, warning=FALSE}
expanded_SA <- expand_groups(proteingroups_SA)
expanded_SA$prot_ids <- str_replace(expanded_SA$prot_ids,"\\|m\\..*", "")
expanded_SL <- expand_groups(proteingroups_SL)
expanded_SL$prot_ids <- str_match(expanded_SL$prot_ids,"TRINITY_DN[0-9]*_c[0-9]_g[0-9]_i[0-9]")
expanded_cleaned_SA <- expanded_SA %>% select(prot_ids,iBAQ) %>% rename(protID_SA = prot_ids) %>% rename(iBAQ_SA=iBAQ) %>% na.omit()
expanded_cleaned_SL <- expanded_SL %>% select(prot_ids,iBAQ) %>% rename(protID_SL = prot_ids) %>% rename(iBAQ_SL=iBAQ) %>% na.omit()
proteinortho_cleaned <- proteinortho %>% add_rownames(var="orthogroup") %>% rename(protID_SL = TrinitySepio.fasta) %>% rename(protID_SA = TrinitySepia.fasta) %>% cSplit("protID_SA", ",", "long",type.convert=FALSE) %>% cSplit("protID_SL", ",", "long",type.convert=FALSE) %>% as.data.frame()
```
Merge the Abundance information to the proteinortho table.
```{r merge, include = TRUE}
merged <- proteinortho_cleaned %>% left_join(expanded_cleaned_SL,by="protID_SL") %>% left_join(expanded_cleaned_SA,by="protID_SA")
#Select for IDs that have an abundance in at least one species.
merged_abun <- merged[rowSums(is.na(merged)) !=2,]
```
Pull out transcript_id and top BLASTX hit columns for trinotate report
```{r edit trinotate report, include = TRUE}
trin_SA <- trinotateannotation_SA %>% select(transcript_id,sprot_Top_BLASTX_hit) %>% rename(protID_SA = transcript_id) %>% rename(swissprot_SA=sprot_Top_BLASTX_hit) %>% na.omit()
trin_SA$swissprot_SA <- str_extract(trin_SA$swissprot_SA,"[A-Z0-9]{2,5}_[A-Z0-9]{3,5}")
trin_SL <- trinotateannotation_SL %>% select(transcript_id,sprot_Top_BLASTX_hit) %>% rename(protID_SL = transcript_id) %>% rename(swissprot_SL=sprot_Top_BLASTX_hit) %>% na.omit()
trin_SL$swissprot_SL <- str_extract(trin_SL$swissprot_SL,"[A-Z0-9]{2,5}_[A-Z0-9]{3,5}")
```
Merge abundance table with SwissProt identifiers
```{r merge abundance with trinotate, include = TRUE}
merged_trinotate <- merged_abun %>% left_join(trin_SL,by="protID_SL") %>% left_join(trin_SA,by="protID_SA")
```
```{r merge proteinortho and top 20 iBAQ for both species}
top20_SL <- expanded_cleaned_SL[order(expanded_cleaned_SL$iBAQ_SL,decreasing=TRUE)[1:20],]
top20_SL$protID_SL <- as.character(top20_SL$protID_SL)
top20_SA <- expanded_cleaned_SA[order(expanded_cleaned_SA$iBAQ_SA,decreasing=TRUE)[1:20],]
merged_trinotate_top20_SA <- top20_SA %>% left_join(trin_SA,by="protID_SA")
merged_trinotate_top20_SL <- top20_SL %>% left_join(trin_SL,by="protID_SL")
#write.csv(x = merged_trinotate_top20_SA, file = "raw_data/SAoverall_20.csv")
#write.csv(x = merged_trinotate_top20_SL, file = "raw_data/SLoverall_20.csv")
top20overall_edit_SL <- read.csv("raw_data/SLoverall_20edit.csv",sep=',', header = TRUE)
top20overall_edit_SA <- read.csv("raw_data/SAoverall_20edit.csv",sep=',', header = TRUE)
top20merged_overall <- merge(top20overall_edit_SA,top20overall_edit_SL, all = TRUE)
```
Collapse the list back
```{r collapse function, include = TRUE}
#function for aggregating
aggfun <- function(values){
if (class(values)=="numeric"){
return(mean(values))
}
paste(unique(values),collapse=";")
}
```
```{r aggregate merged table, include = TRUE}
merged_coll = aggregate(merged_trinotate,by=list(merged_trinotate$orthogroup),aggfun)
```
```{r graphing, include = TRUE, warning= FALSE}
#combine the top20 abundant proteins from each species with the top20 orthologous proteins
top20iBAQ_SL <- head(arrange(merged_coll,desc(iBAQ_SL)),n=20)
top20iBAQ_SA <- head(arrange(merged_coll,desc(iBAQ_SA)),n=20)
top20 <- rbind(top20iBAQ_SA,top20iBAQ_SL)
top20 <- top20 %>% full_join(top20overall_edit_SA) %>% full_join(top20overall_edit_SL)
#write.csv(x = top20, file = "raw_data/top20.csv")
#read in table edited for description columns and log of iBAQ
top20_des <- read.table("raw_data/top20_description.csv",sep=',', header = TRUE)
top20_long <- top20_des %>% select(orthogroup,iBAQ_SL_LOG,iBAQ_SA_LOG,description) %>% gather(species,iBAQ,-orthogroup, -description)
top20_long$orthogroup <- factor(top20_long$orthogroup, levels = top20_long$orthogroup [order(top20_long$description)])
top20_long$species<- str_replace_all(top20_long$species,"iBAQ_SL_LOG", "S.lineolata")
top20_long$species <- str_replace_all(top20_long$species,"iBAQ_SA_LOG", "S.austrinum")
top20_long <- unique(top20_long)
```
```{r}
ggplot(top20_long,aes(x=orthogroup,y=iBAQ)) +
geom_point(size = 4,aes(colour = description),stat="identity") +
facet_wrap(~species,ncol = 1, nrow = 2) + ylab("log(iBAQ)") + xlab("Orthogroup") +
scale_colour_manual(values = c('Actin' = "lightslateblue", 'Enzymatic' = "yellowgreen", 'Histone' = "deeppink", 'Calmodulin' = "darkorchid", 'Other Cellular' = "dodgerblue", 'Intermediate Filament' = 'darkorange','Unknown' = "lightpink",'Annexin' = "darkblue", 'PPR3B' = "red", 'Ubiquitination' = "aquamarine", 'IMDH3' = "yellow", 'Transcription Regulation' = "palevioletred2", 'Hormone Processing' = "thistle4", 'MR30' = "black", 'Collagen' = "moccasin")) + theme(axis.text.x= element_text(angle=90)) + theme(legend.title=element_blank())
```