|
| 1 | +#' Layer for Protein Coverage Plot. |
| 2 | +#' |
| 3 | +#' @param coverage.file Exported protein coverage file, should be in excel. |
| 4 | +#' @param fasta.file Input reference protein fasta file. |
| 5 | +#' @param protein.id The protein ID of exported coverage file. This should be unique and in \code{fasta.file}. |
| 6 | +#' @param XCorr.threshold The cross-correlation threshold. Default: 2. |
| 7 | +#' @param confidence The confidence level. Default: High. |
| 8 | +#' @param contaminant Whether to remove contaminant peptides. Default: NULL (not remove). |
| 9 | +#' @param remove.na Logical value, whether to remove NA value in Abundance column. Default: TRUE. |
| 10 | +#' @param color The fill color of coverage plot. Default: grey. |
| 11 | +#' @param mark.bare Logical value, whether to mark region where Abundance is zero or NA. Default: TRUE. |
| 12 | +#' @param mark.color The color used for the marked region. Default: red. |
| 13 | +#' @param mark.alpha The transparency used for the marked region. Default: 0.5. |
| 14 | +#' @param show.table Logical value, whether to show coverage summary table. Default: TRUE. |
| 15 | +#' @param table.position The position of the coverage summary table, choose from right_top, left_top, left_bottom, right_bottom. |
| 16 | +#' Default: right_top. |
| 17 | +#' @param table.size The font size of coverage summary table. Default: 4. |
| 18 | +#' @param table.color The font color of coverage summary table. Default: black. |
| 19 | +#' @param range.size The label size of range text, used when \code{range.position} is in. Default: 3. |
| 20 | +#' @param range.position The position of y axis range, chosen from in (move y axis in the plot) and |
| 21 | +#' out (normal y axis). Default: in. |
| 22 | +#' |
| 23 | +#' @return A ggplot2 object. |
| 24 | +#' @importFrom openxlsx read.xlsx |
| 25 | +#' @importFrom magrittr %>% |
| 26 | +#' @importFrom dplyr filter group_by summarise arrange |
| 27 | +#' @importFrom rlang .data |
| 28 | +#' @importFrom Biostrings readAAStringSet |
| 29 | +#' @importFrom stringr str_locate |
| 30 | +#' @importFrom GenomicRanges reduce GRanges setdiff |
| 31 | +#' @importFrom IRanges IRanges |
| 32 | +#' @importFrom ggplot2 ggplot geom_rect geom_text aes aes_string scale_x_continuous |
| 33 | +#' @importFrom ggpp annotate |
| 34 | +#' @importFrom scales scientific |
| 35 | +#' @export |
| 36 | +#' |
| 37 | +#' @examples |
| 38 | +#' # library(ggplot2) |
| 39 | +#' # library(ggcoverage) |
| 40 | +#' # coverage.file <- system.file("extdata", "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage") |
| 41 | +#' # fasta.file <- system.file("extdata", "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage") |
| 42 | +#' # protein.id = "sp| |
| 43 | +#' # ggplot() + |
| 44 | +#' # geom_peptide(coverage.file = coverage.file, fasta.file = fasta.file, protein.id = protein.id) |
| 45 | +geom_protein = function(coverage.file, fasta.file, protein.id, XCorr.threshold = 2, |
| 46 | + confidence = "High", contaminant = NULL, remove.na = TRUE, |
| 47 | + color = "grey", mark.bare = TRUE, mark.color = "red", mark.alpha = 0.5, |
| 48 | + show.table = TRUE, table.position = c("right_top", "left_top", "left_bottom", "right_bottom"), |
| 49 | + table.size = 4, table.color = "black", range.size = 3, range.position = c("in", "out")){ |
| 50 | + # check parameters |
| 51 | + table.position <- match.arg(arg = table.position) |
| 52 | + range.position <- match.arg(arg = range.position) |
| 53 | + |
| 54 | + # load coverage dataframe |
| 55 | + coverage.df = openxlsx::read.xlsx(coverage.file) |
| 56 | + # remove suffix and prefix string |
| 57 | + coverage.df$Annotated.Sequence = gsub(pattern = ".*\\.(.*)\\..*", replacement = "\\1", x = coverage.df$Annotated.Sequence) |
| 58 | + # filter converge according to confidence |
| 59 | + if(!is.null(confidence)){ |
| 60 | + coverage.df = coverage.df[coverage.df[, "Confidence"] == confidence, ] |
| 61 | + } |
| 62 | + # filter converge according to contaminant |
| 63 | + if(!is.null(contaminant)){ |
| 64 | + coverage.df = coverage.df[coverage.df[, "Contaminant"] == contaminant, ] |
| 65 | + } |
| 66 | + # filter converge according to cross-correlation |
| 67 | + if(!is.null(XCorr.threshold)){ |
| 68 | + xcorr.index = grep(pattern = "XCorr", x = colnames(coverage.df)) |
| 69 | + coverage.df = coverage.df[coverage.df[, xcorr.index] >= XCorr.threshold, ] |
| 70 | + } |
| 71 | + # get abundance cols |
| 72 | + abundance.col = grep(pattern = "Abundance", x = colnames(coverage.df), value = TRUE) |
| 73 | + # remove na abundance |
| 74 | + if(remove.na){ |
| 75 | + coverage.df = coverage.df %>% dplyr::filter(!is.na(.data[[abundance.col]])) |
| 76 | + } |
| 77 | + # sum abundance of duplicated Annotated.Sequence |
| 78 | + coverage.df = coverage.df %>% |
| 79 | + dplyr::group_by(.data[["Annotated.Sequence"]]) %>% |
| 80 | + dplyr::summarise(Abundance = sum(.data[[abundance.col]])) %>% |
| 81 | + as.data.frame() |
| 82 | + colnames(coverage.df) = c("peptide", "abundance") |
| 83 | + # check the coverage dataframe |
| 84 | + if(nrow(coverage.df) == 0){ |
| 85 | + stop("There is no valid peptide, please check!") |
| 86 | + } |
| 87 | + |
| 88 | + # load genome fasta |
| 89 | + aa.set = Biostrings::readAAStringSet(fasta.file) |
| 90 | + protein.index = which(names(aa.set) == protein.id) |
| 91 | + if(length(protein.index) == 1){ |
| 92 | + aa.set.used = aa.set[protein.index] |
| 93 | + aa.seq.used = paste(aa.set.used) |
| 94 | + }else if(length(protein.index) > 1){ |
| 95 | + stop("Please check the protein.id you provided, there is more than one in provided fasta file!") |
| 96 | + }else{ |
| 97 | + stop("Please check the protein.id you provided, it can't be found in provided fasta file!") |
| 98 | + } |
| 99 | + |
| 100 | + # get the region |
| 101 | + aa.anno.region = sapply(coverage.df$peptide, function(x){ |
| 102 | + stringr::str_locate(pattern =x, aa.seq.used) |
| 103 | + }) %>% t() %>% as.data.frame() |
| 104 | + colnames(aa.anno.region) = c("start", "end") |
| 105 | + |
| 106 | + # merge |
| 107 | + coverage.final = merge(coverage.df, aa.anno.region, by.x = "peptide", by.y = 0, all.x = TRUE) |
| 108 | + coverage.final = coverage.final %>% dplyr::arrange(.data[["start"]], .data[["end"]]) |
| 109 | + |
| 110 | + # get coverage positions |
| 111 | + coverage.pos = |
| 112 | + GenomicRanges::reduce(GenomicRanges::GRanges(protein.id, IRanges::IRanges(coverage.final$start, coverage.final$end))) %>% |
| 113 | + as.data.frame() |
| 114 | + coverage.pos$strand = NULL |
| 115 | + colnames(coverage.pos) = c("ProteinID", "start", "end", "width") |
| 116 | + coverage.pos$Type = "covered" |
| 117 | + # get coverage rate |
| 118 | + coverage.rate = round(sum(coverage.pos$width)*100/nchar(aa.seq.used), 2) |
| 119 | + # non-cover position |
| 120 | + non.coverage.pos = |
| 121 | + GenomicRanges::setdiff(GenomicRanges::GRanges(protein.id, IRanges::IRanges(1, nchar(aa.seq.used))), |
| 122 | + GenomicRanges::GRanges(protein.id, IRanges::IRanges(coverage.final$start, coverage.final$end))) %>% |
| 123 | + as.data.frame() |
| 124 | + non.coverage.pos$strand = NULL |
| 125 | + colnames(non.coverage.pos) = c("ProteinID", "start", "end", "width") |
| 126 | + non.coverage.pos$Type = "bare" |
| 127 | + # coverage summary |
| 128 | + coverage.summary = rbind(coverage.pos, non.coverage.pos) %>% as.data.frame() |
| 129 | + |
| 130 | + # coverage rect |
| 131 | + coverage.rect = geom_rect(data = coverage.final, mapping = aes_string(xmin = "start", xmax = "end", |
| 132 | + ymin = "0", ymax = "abundance"), |
| 133 | + show.legend = FALSE, fill = color) |
| 134 | + plot.ele <- list(coverage.rect) |
| 135 | + # mark bare |
| 136 | + if(mark.bare){ |
| 137 | + bare.rect = geom_rect(data = non.coverage.pos, mapping = aes_string(xmin = "start", xmax = "end", |
| 138 | + ymin = "0", ymax = "Inf"), |
| 139 | + show.legend = F, fill = mark.color, alpha = mark.alpha) |
| 140 | + plot.ele <- append(plot.ele, bare.rect) |
| 141 | + } |
| 142 | + # summary table |
| 143 | + if(show.table){ |
| 144 | + # table position |
| 145 | + if(table.position == "left_top"){ |
| 146 | + table.x = 0 |
| 147 | + table.y = max(coverage.final[ , "abundance"]) |
| 148 | + }else if(table.position == "right_top"){ |
| 149 | + table.x = nchar(aa.seq.used) |
| 150 | + table.y = max(coverage.final[ , "abundance"]) |
| 151 | + }else if(table.position == "left_bottom"){ |
| 152 | + table.x = 0 |
| 153 | + table.y = 0 |
| 154 | + }else if(table.position == "right_bottom"){ |
| 155 | + table.x = nchar(aa.seq.used) |
| 156 | + table.y = 0 |
| 157 | + } |
| 158 | + summary.table = ggpp::annotate(geom = "table", label = list(coverage.summary), x= table.x, y=table.y, |
| 159 | + color = table.color, size = table.size) |
| 160 | + plot.ele <- append(plot.ele, summary.table) |
| 161 | + } |
| 162 | + # range position |
| 163 | + if (range.position == "in") { |
| 164 | + # prepare range |
| 165 | + max.abundance = CeilingNumber(max(coverage.final$abundance)) |
| 166 | + abundance.range = data.frame(label = paste0("[0, ", scales::scientific(max.abundance, digits = 2), "]")) |
| 167 | + range.text = geom_text( |
| 168 | + data = abundance.range, |
| 169 | + mapping = aes(x = -Inf, y = Inf, label = label), |
| 170 | + hjust = 0, |
| 171 | + vjust = 1.5, |
| 172 | + size = range.size |
| 173 | + ) |
| 174 | + plot.ele <- append(plot.ele, range.text) |
| 175 | + } |
| 176 | + # change x scale |
| 177 | + plot.ele <- append(plot.ele, scale_x_continuous(limits = c(1, nchar(aa.seq.used)), expand = c(0, 0))) |
| 178 | + return(plot.ele) |
| 179 | +} |
0 commit comments