-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathCNV_segmentation_by_DNAcopy.R
executable file
·80 lines (67 loc) · 4.41 KB
/
CNV_segmentation_by_DNAcopy.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env Rscript
##############################################################
# script: CNV_segmentation_by_DNAcopy.R
# author: Jia-Xing Yue (GitHub ID: yjx1217)
# last edited: 2019.09.10
# description: run CNV segmentation by DNAcopy
# example: Rscript --vanilla --slave CNV_segmentation_by_DNAcopy.R --input input.bam_ratio.txt --prefix output_prefix --window 250 --ploidy 1 --genome_fai ref.genom.fai
##############################################################
library("optparse")
library("DNAcopy")
option_list <- list(
make_option(c("--input"), type = "character", default = NULL,
help = "name of the bam_ratio.txt file generated by FREEC", metavar = "character"),
make_option(c("--window"), type = "character", default = NULL,
help = "window size used for the input ratio file", metavar = "character"),
make_option(c("--prefix"), type = "character", default = NULL,
help = "the output file prefix", metavar = "character"),
make_option(c("--ploidy"), type = "character", default = NULL,
help = "sample ploidy", metavar = "character"),
make_option(c("--genome_fai"), type = "character", default = NULL,
help = "path to the genome.fai file", metavar = "character")
);
opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)
ratio_data <- read.table(opt$input, header = TRUE, sep = "\t")
ratio_data$Ratio[which(ratio_data$Ratio == -1)] <- NA
ratio_data$MedianRatio[which(ratio_data$MedianRatio == -1)] <- NA
ratio_data$MedianRatio[which(is.na(ratio_data$Ratio))] <- NA
log_transformation_offset <- 0.001
ratio_data$Ratio <- ratio_data$Ratio + log_transformation_offset
ratio_data$MedianRatio <- ratio_data$MedianRatio + log_transformation_offset
window <- opt$window
ploidy <- opt$ploidy
genome_fai <- read.table(opt$genome_fai, header = FALSE, sep = "\t")
colnames(genome_fai) <- c("chr", "length", "offset", "linebases", "linewidth")
# re-order the chromosomes
chr_list <- as.vector(unique(genome_fai$chr))
chr_list_intersect <- intersect(chr_list, ratio_data$Chromosome)
ratio_data$Chromosome <- factor(ratio_data$Chromosome, levels = chr_list_intersect)
if (nrow(ratio_data) > 0) {
ratio_data_sorted <- ratio_data[order(ratio_data$Chromosome),]
write.table(ratio_data_sorted, paste0(opt$prefix, ".FREEC.bam_ratio.sorted.txt"), sep = "\t", row.names = FALSE, quote = FALSE)
ratio_data_filtered <- subset(ratio_data_sorted, !is.na(ratio_data$MedianRatio))
ratio_data_filtered$logMedianRatio <- log(ratio_data_filtered$MedianRatio, 2)
ratio_data_filtered_CNA <- CNA(ratio_data_filtered$logMedianRatio, ratio_data_filtered$Chromosome, ratio_data_filtered$Start, data.type = "logratio", presorted = TRUE)
smoothed.ratio_data_filtered_CNA <- smooth.CNA(ratio_data_filtered_CNA)
segment.smoothed.ratio_data_filtered_CNA <- segment(smoothed.ratio_data_filtered_CNA, undo.splits = "sdundo", undo.SD = 3, verbose = 1)
segment_data <- segment.smoothed.ratio_data_filtered_CNA$output
segment_data$loc.end <- segment_data$loc.end + as.numeric(window) - 1
segment_data$DNAcopy_ratio <- (2 ^ segment_data$seg.mean - log_transformation_offset) * as.numeric(ploidy)
segment_data$ID <- NULL
segment_data$DNAcopy_copy_number <- round(segment_data$DNAcopy_ratio)
write.table(segment_data, paste0(opt$prefix, ".FREEC.bam_ratio.sorted.resegmented.raw.txt"), sep = "\t", row.names = FALSE, quote = FALSE)
segment_data_lite <- subset(segment_data, select = -c(num.mark, seg.mean))
colnames(segment_data_lite) <- c("Chromosome", "Start", "End", "Ratio", "CopyNumber")
write.table(segment_data_lite, paste0(opt$prefix, ".FREEC.bam_ratio.sorted.resegmented.lite.txt"), sep = "\t", row.names = FALSE, quote = FALSE)
CNV_data <- subset(segment_data, DNAcopy_copy_number != ploidy)
if (nrow(CNV_data) > 0) {
CNV_data$annotation <- NA
CNV_data$annotation[which(CNV_data$DNAcopy_copy_number < as.numeric(ploidy))] <- "loss"
CNV_data$annotation[which(CNV_data$DNAcopy_copy_number > as.numeric(ploidy))] <- "gain"
CNV_data_lite <- subset(CNV_data, select = c(chrom, loc.start, loc.end, DNAcopy_copy_number, annotation))
write.table(CNV_data_lite, paste0(opt$prefix, ".FREEC.bam_ratio.sorted.resegmented.CNVs.txt"), sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)
} else {
file.create(paste0(opt$prefix, ".FREEC.bam_ratio.sorted.resegmented.CNVs.txt"))
}
}