Skip to content

Commit 7ab9196

Browse files
committed
Merge branch 'master' of https://github.com/vastgroup/vast-tools
2 parents 72fd500 + fa636f1 commit 7ab9196

File tree

5 files changed

+310
-221
lines changed

5 files changed

+310
-221
lines changed

R/Rlib/include_diff.R

+187-114
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,23 @@
11
#!/usr/bin/env Rscript
22
#
3-
# Author: Tim Sterne-Weiler, 2014
4-
5-
# Modifications by Ulrich Braunschweig 2018
3+
# Author: Tim Sterne-Weiler, Ulrich Braunschweig 2014-2024
4+
65

7-
# This function takes a qual and returns c(post_alpha, post_beta)
8-
# Increments by prior alpha and prior distribution beta, uniform by default
6+
## Check that columns of INCLUSION... table are what we think they are
7+
checkHeader <- function(x, replicateA, replicateB) {
8+
reps <- c(replicateA, replicateB)
9+
sampInd <- unlist(sapply(reps, FUN=function(y) {which(x == y)}))
10+
if (length(sampInd) != length(reps)) {
11+
stop("[vast diff error]: Not all replicate names found in input header!\n")
12+
}
13+
namesOK <- all(paste0(reps, "-Q") == x[sampInd + 1])
14+
if (!namesOK | !all(x[1:3] == c("GENE", "EVENT", "COORD"))) {
15+
stop("[vast diff error]: Input table does not have expected format!\n")
16+
}
17+
}
18+
19+
## This function takes a qual and returns c(post_alpha, post_beta)
20+
## Increments by prior alpha and prior distribution beta, uniform by default
921
parseQual <- function(qual, prior_alpha=1, prior_beta=1) {
1022
res1 <- as.numeric(sub("[^@]*@([^,]*),.*", "\\1", qual))
1123
res2 <- as.numeric(sub("[^@]*@[^,]*,(.*)", "\\1", qual))
@@ -19,44 +31,43 @@ parseQual <- function(qual, prior_alpha=1, prior_beta=1) {
1931
cbind(res1, res2)
2032
}
2133

22-
# calculate the probability that the first dist is > than second
23-
# P(psi1 > psi2) when alpha=0; more generally we are determining the probability
24-
# that Psi1 is greater than Psi2 by alpha.. eg. P((psi1 - psi2) > alpha) =
34+
## Calculate the probability that the first dist is > than second
35+
## P(psi1 > psi2) when alpha=0; more generally we are determining the probability
36+
## that psi1 is greater than psi2 by alpha.. eg. P((psi1 - psi2) > alpha) =
2537
## IMPORTANT: run this function as sample(firstDist, length(firstDist)) AND
26-
## sample(secondDist, length(secondDist))
38+
## sample(secondDist, length(secondDist))
2739
## UNLESS you have paired data, then don't sample.
2840
pDiff <- function(firstDist, secondDist, alpha=0.15) {
29-
N <- length(firstDist)
30-
pass <- length( which( (firstDist - secondDist) > alpha ) )
41+
N <- length(firstDist)
42+
pass <- length(which((firstDist - secondDist) > alpha))
3143
pass / N
3244
}
3345

34-
# This function finds the maximum difference between psi1 and psi2 given
35-
# at least some acceptable probability for difference. Defaults to 0.8
36-
# distributions where no diff value exists with a probability > 0.8 are given
37-
# maxDiff of 0.
38-
maxDiff <- function(firstDist, secondDist, acceptProb=0.9) {
39-
alphaSet <- seq(0,1,0.01) #make this global?
40-
probs <- unlist(lapply(alphaSet, function(x) { pDiff(firstDist, secondDist, x) }))
46+
## This function finds the maximum difference between psi1 and psi2 given
47+
## at least some acceptable probability for difference. Defaults to 0.8
48+
## distributions where no diff value exists with a probability > 0.8 are given
49+
## maxDiff of 0.
50+
maxDiff <- function(firstDist, secondDist, acceptProb=0.9, alphaSet) {
51+
probs <- unlist(lapply(alphaSet, pDiff,
52+
firstDist=firstDist, secondDist=secondDist))
4153
ind <- max(c(which(probs > acceptProb), 1))
4254
alphaSet[ind]
4355
}
4456

45-
#
46-
# return the beta variance
57+
## Return the beta variance
4758
betaVar <- function(alpha, beta) {
48-
var <- alpha*beta / (
59+
var <- alpha * beta / (
4960
((alpha + beta) ** 2) * (alpha + beta + 1)
5061
)
5162
var
5263
}
5364

54-
#
65+
## Return a confidence interval
5566
betaCI <- function(betaDist, percentile = c(0.05, 0.95)) {
5667
quantile(betaDist, p=percentile, na.rm = T)
5768
}
5869

59-
# Extension of betaCI function that includes the sampling step
70+
## Extension of betaCI function that includes the sampling step
6071
betaCISample <- function(alpha, beta, n = 5000) {
6172
if (is.na(alpha) || is.na(beta)) {
6273
sample <- NA
@@ -66,43 +77,44 @@ betaCISample <- function(alpha, beta, n = 5000) {
6677
return(betaCI(sample))
6778
}
6879

69-
70-
### MAKE VISUAL OUTPUT
71-
plotDiff <- function(inpOne, inpTwo, expOne, expTwo, maxD, medOne, medTwo, sampOneName, sampTwoName, rever) {
80+
plotDiff <- function(inpOne, inpTwo, expOne, expTwo, maxD, medOne, medTwo,
81+
sampOneName, sampTwoName, alphaSet, rever) {
82+
### Make visual output
7283

7384
if(rever) { #write this better. ;-)
7485
curCol <- cbb[3:2]
7586
} else {
7687
curCol <- cbb[2:3]
7788
}
7889

79-
if(length(expOne) == 0 || length(expTwo) == 0) { return(NULL) }
90+
if (length(expOne) == 0 || length(expTwo) == 0) {return(NULL)}
8091
one <- data.frame(x=expOne, y=-0.5)
8192
two <- data.frame(x=expTwo, y=-0.5)
8293

83-
distPlot <- ggplot(melt(as.data.frame(
84-
do.call(cbind,list(inpOne, inpTwo))
85-
), measure.vars=c("V1","V2")), aes(fill=variable, x=value))+
86-
geom_histogram(aes(y=..density..), binwidth=0.03333,alpha=0.5, col="grey", position="identity")+
87-
theme_bw()+xlim(c(0,1))+xlab(expression(hat(Psi)))+
88-
scale_fill_manual(values=curCol, labels=c(sampOneName, sampTwoName), name="Samples")+
89-
geom_point(data=one, mapping=aes(x=x, y=y), col=cbb[2], fill=cbb[2], alpha=0.85, inherit.aes = FALSE)+
90-
geom_point(data=two, mapping=aes(x=x, y=y), col=cbb[3], fill=cbb[3], alpha=0.85, inherit.aes = FALSE)
94+
distPlot <- ggplot(melt(as.data.frame(do.call(cbind, list(inpOne, inpTwo))),
95+
measure.vars=c("V1","V2")),
96+
aes(fill=variable, x=value)) +
97+
geom_histogram(aes(y=after_stat(density)), binwidth=0.03333, alpha=0.5, col="grey", position="identity") +
98+
theme_bw() + scale_x_continuous(limits = c(0, 1), oob = scales::oob_keep) + xlab(expression(hat(Psi))) +
99+
scale_fill_manual(values=curCol, labels=c(sampOneName, sampTwoName), name="Samples") +
100+
geom_point(data=one, mapping=aes(x=x, y=y), col=cbb[2], fill=cbb[2], alpha=0.50, inherit.aes = FALSE) +
101+
geom_point(data=two, mapping=aes(x=x, y=y), col=cbb[3], fill=cbb[3], alpha=0.50, inherit.aes = FALSE)
91102

92103
probPlot <- ggplot(as.data.frame(cbind(seq(0,1,0.01),
93-
unlist(lapply(alphaList, function(x) {
104+
unlist(lapply(alphaSet, function(x) {
94105
pDiff(inpOne, inpTwo, x)
95-
})))), aes(x=V1, y=V2))+
96-
geom_line()+theme_bw()+
97-
geom_vline(xintercept=maxD, lty="dashed", col=cbb[7])+
98-
ylab(expression(P((hat(Psi)[1]-hat(Psi)[2]) > x)))+
99-
xlab(expression(x))+ylim(c(0,1))+
100-
annotate("text", x=(maxD+0.08), y=0.05, label=maxD, col=cbb[7])
101-
102-
return(list(distPlot, probPlot))
106+
})))), aes(x=V1, y=V2)) +
107+
geom_line() + theme_bw() +
108+
geom_vline(xintercept=maxD, lty="dashed", col=cbb[7]) +
109+
ylab(expression(P((hat(Psi)[1] - hat(Psi)[2]) > x))) +
110+
xlab(expression(x)) + scale_y_continuous(limits = c(0, 1), oob = scales::oob_keep) +
111+
annotate("text", x=(maxD + 0.08), y=0.05, label=maxD, col=cbb[7])
112+
113+
return(list(distPlot, probPlot))#
103114
}
104115

105116
plotPrint <- function(plotTitle, plotCoord, plotList) {
117+
### Print saved plotting information to output file
106118
grid.newpage()
107119
pushViewport(viewport(layout = grid.layout(3, 2, widths = unit(c(5, 4), "null"), heights = unit(c(1, 0.5, 5), "null"))))
108120
grid.text(as.character(plotTitle), check.overlap=TRUE, gp=gpar(font=2), draw=T, vp = viewport(layout.pos.row = 1, layout.pos.col = 1:2))
@@ -117,106 +129,167 @@ diffBeta <- function(i, lines, opt,
117129
totalFirst, totalSecond,
118130
shapeFirstAve, shapeSecondAve,
119131
expFirst, expSecond,
120-
repA.qualInd, repB.qualInd) {
121-
### Main diff functionality; fit beta distributions to sample groups for one event and compare
122-
### Is applied to each line of the current nLines of the INCLUSION... table
123-
124-
## if no data, next; # adapted from @lpantano's fork 7/22/2015
125-
if( (sum(totalFirst[i,] > (opt$minReads + opt$alpha + opt$beta)) < opt$minSamples) ||
126-
(sum(totalSecond[i,] > (opt$minReads + opt$alpha + opt$beta)) < opt$minSamples) ) {
127-
return(NULL)
128-
}
132+
repA.qualInd, repB.qualInd,
133+
okFirst, okSecond, skip,
134+
alphaSet) {
135+
## Main diff functionality; fit beta distributions to sample groups for one event and compare
136+
## Is applied to each line of the current nLines of the INCLUSION... table
129137

130138
## Sample Posterior Distributions
131139
psiFirst <- lapply(1:(dim(shapeFirst)[2]), function(x) {
132-
#sample here from rbeta(N, alpha, beta) if > -e
133-
if(totalFirst[i,x] < opt$minReads) { return(NULL) }
134-
rbeta(opt$size, shape1=shapeFirst[i,x,1], shape2=shapeFirst[i,x,2])
140+
## sample here from rbeta(N, alpha, beta) if > -e
141+
if (!okFirst[i,x]) {return(NULL)}
142+
rbeta(opt$size, shape1=shapeFirst[i,x,1], shape2=shapeFirst[i,x,2])
135143
})
136144

137145
psiSecond <- lapply(1:(dim(shapeSecond)[2]), function(x) {
138-
#sample here from rbeta(N, alpha, beta) if > -e
139-
if(totalSecond[i,x] < opt$minReads) { return(NULL) }
140-
rbeta(opt$size, shape1=shapeSecond[i,x,1], shape2=shapeSecond[i,x,2])
146+
## sample here from rbeta(N, alpha, beta) if > -e
147+
if (!okSecond[i,x]) {return(NULL)}
148+
rbeta(opt$size, shape1=shapeSecond[i,x,1], shape2=shapeSecond[i,x,2])
141149
})
142150

143-
if(opt$paired) { #make sure both samples have a non-NULL replicate
144-
for(lstInd in 1:length(psiFirst)) {
145-
if(is.null(psiFirst[[lstInd]]) || is.null(psiSecond[[lstInd]])) {
146-
psiFirst[[lstInd]] <- NULL
147-
psiSecond[[lstInd]] <- NULL
148-
}
149-
}
150-
}
151-
152151
## Create non-parametric Joint Distributions
153152
psiFirstComb <- do.call(c, psiFirst)
154153
psiSecondComb <- do.call(c, psiSecond)
155154

156-
if( length(psiFirstComb) <= 0 || length(psiSecondComb) <= 0 ) { return(NULL) }
157-
158-
## if they aren't paired, then shuffle the joint distributions...
159-
if( !opt$paired ) {
160-
paramFirst <- try (suppressWarnings(
161-
fitdistr(psiFirstComb,
162-
"beta",
163-
list( shape1=shapeFirstAve[i,1], shape2=shapeFirstAve[i,2])
164-
)$estimate ), TRUE )
165-
paramSecond <- try (suppressWarnings(
166-
fitdistr(psiSecondComb,
167-
"beta",
168-
list( shape1=shapeSecondAve[i,1], shape2=shapeSecondAve[i,2])
169-
)$estimate ), TRUE )
170-
## if optimization fails its because the distribution is too narrow
171-
## in which case our starting shapes should already be good enough
172-
if(class(paramFirst) != "try-error") {
173-
psiFirstComb <- rbeta(opt$size, shape1=paramFirst[1], shape2=paramFirst[2])
174-
}
175-
if(class(paramSecond) != "try-error") {
176-
psiSecondComb <- rbeta(opt$size, shape1=paramSecond[1], shape2=paramSecond[2])
177-
}
155+
## Try to to fit a beta distribution on the replicates' parameter estimates.
156+
if (!opt$paired) {
157+
if (length(psiFirstComb) > 0) {
158+
paramFirst <- try(suppressWarnings(
159+
fitdistr(psiFirstComb,
160+
"beta",
161+
list(shape1=shapeFirstAve[i,1], shape2=shapeFirstAve[i,2])
162+
)$estimate ), TRUE )
163+
}
164+
if (length(psiSecondComb)) {
165+
paramSecond <- try(suppressWarnings(
166+
fitdistr(psiSecondComb,
167+
"beta",
168+
list(shape1=shapeSecondAve[i,1], shape2=shapeSecondAve[i,2])
169+
)$estimate ), TRUE )
170+
}
171+
## If optimization fails it's because the distribution is too narrow
172+
## in which case our starting shapes should already be good enough.
173+
## Shuffle since not paired, and downsample to size of smaller number
174+
## but don't downsize if the other sample type is absent anyway.
175+
minSample <- min(c(length(psiFirstComb), length(psiSecondComb)))
176+
177+
if (length(psiFirstComb) > 0) {
178+
if (!inherits(paramFirst, "try-error")) {
179+
psiFirstComb <- rbeta(opt$size, shape1=paramFirst[1], shape2=paramFirst[2])
180+
} else {
181+
if (length(psiSecondComb) > 0) {
182+
psiFirstComb <- sample(psiFirstComb, minSample)
183+
}
184+
}
185+
}
186+
if (length(psiSecondComb) > 0) {
187+
if (!inherits(paramSecond, "try-error")) {
188+
psiSecondComb <- rbeta(opt$size, shape1=paramSecond[1], shape2=paramSecond[2])
189+
} else {
190+
if (length(psiFirstComb) > 0) {
191+
psiSecondComb <- sample(psiSecondComb, minSample)
192+
}
193+
}
194+
}
178195
}
179196

180197
## get empirical posterior median of psi
181198
medOne <- median(psiFirstComb)
182199
medTwo <- median(psiSecondComb)
183200

184201
## look for a max difference given prob cutoff...
185-
if(medOne > medTwo) {
186-
max <- maxDiff(psiFirstComb, psiSecondComb, opt$prob)
202+
if (skip[i]) {
203+
if (is.null(medOne)) {medOne <- NA}
204+
if (is.null(medTwo)) {medTwo <- NA}
205+
max <- NA
187206
} else {
188-
max <- maxDiff(psiSecondComb, psiFirstComb, opt$prob)
207+
if (medOne > medTwo) {
208+
max <- maxDiff(psiFirstComb, psiSecondComb, opt$prob, alphaSet)
209+
} else {
210+
max <- maxDiff(psiSecondComb, psiFirstComb, opt$prob, alphaSet)
211+
}
189212
}
190213

191-
## SIGNIFICANT from here on out:
192-
193-
filtOut <- sprintf("%s\t%s\t%f\t%f\t%f\t%s", lines[[i]][1], lines[[i]][2], medOne, medTwo, medOne - medTwo, round(max,2))
214+
filtOut <- sprintf("%s\t%s\t%f\t%f\t%f\t%s",
215+
lines[[i]][1], lines[[i]][2], medOne, medTwo, medOne-medTwo, round(max,2))
194216

195217
## check for significant difference
196-
if(max < opt$minDiff) {
197-
## non-sig, return null plots and text output
198-
return(list(NULL, NULL, NULL, NULL, filtOut))
199-
} else {
200-
sigInd <- i
201-
202-
if (opt$noPDF) {
203-
eventTitle <- NULL
204-
eventCoord <- NULL
205-
retPlot <- NULL
206-
} else {
218+
if (opt$noPDF || (is.na(max) || max < opt$minDiff)) {
219+
## non-sig, return null plots and text output
220+
return(list(NULL, NULL, NULL, NULL, filtOut))
221+
} else {
222+
## significant; plot
223+
sigInd <- i
207224
eventTitle <- paste(c("Gene: ", lines[[i]][1], " Event: ", lines[[i]][2]), collapse="")
208225
eventCoord <- paste(c("Coordinates: ", lines[[i]][3]), collapse="")
209226

210-
## Print visual output to pdf;
211-
if( medOne > medTwo ) {
227+
## Store visual output to be printed to pdf
228+
if (medOne > medTwo) {
212229
retPlot <- plotDiff(psiFirstComb, psiSecondComb,
213-
expFirst[i,], expSecond[i,], max, medOne, medTwo, sampOneName, sampTwoName, FALSE)
230+
expFirst[i,][okFirst[i,]], expSecond[i,][okSecond[i,]],
231+
max, medOne, medTwo, sampOneName, sampTwoName, alphaSet, FALSE)
214232
} else {
215233
retPlot <- plotDiff(psiSecondComb, psiFirstComb,
216-
expFirst[i,], expSecond[i,], max, medTwo, medOne, sampTwoName, sampOneName, TRUE)
234+
expFirst[i,][okFirst[i,]], expSecond[i,][okSecond[i,]],
235+
max, medTwo, medOne, sampTwoName, sampOneName, alphaSet, TRUE)
217236
}
218-
}
219-
## sig event return
220-
return(list(retPlot, eventTitle, eventCoord, sigInd, filtOut)) #return of mclapply function
237+
238+
## sig event return
239+
return(list(retPlot, eventTitle, eventCoord, sigInd, filtOut))
221240
}
222241
}
242+
243+
writeLog <- function(vastPath, opt) {
244+
### Add a line with call information to the general vast-tools log file
245+
246+
logName <- file.path(opt$output, "VTS_LOG_commands.txt")
247+
vastVer <- scan(file.path(vastPath, "VERSION"), what="character", quiet = T)
248+
249+
## Get date, time and vast-tools version
250+
systime <- Sys.time()
251+
sdate <- paste(
252+
sub("([0-9]{4})-.+", "\\1", systime),
253+
sub("[0-9]{4}-[0-9]{2}-([0-9]{2}).+", "\\1", systime),
254+
sub("[0-9]{4}-([0-9]{2})-.+", "\\1", systime),
255+
sep="-")
256+
stime <- sub("[^ ]+ (.{5}):[0-9]{2}.*", "\\1", systime)
257+
258+
msg1 <- paste0(
259+
"[VAST-TOOLS v", vastVer, ", ",
260+
sdate, " (", stime, ")]"
261+
)
262+
263+
msg2 <- "vast-tools diff"
264+
265+
## Format input options
266+
msg3 <- paste0(
267+
"-a ", opt$replicateA, " ",
268+
"-b ", opt$replicateB, " ",
269+
"--sampleNameA ", opt$sampleNameA, " ",
270+
"--sampleNameB ", opt$sampleNameB, " ",
271+
"-i ", opt$input, " ",
272+
"-n ", opt$nLines, " ",
273+
"-p ", opt$paired, " ",
274+
"-d ", opt$baseName, " ",
275+
"-o ", opt$output, " ",
276+
"-r ", opt$prob, " ",
277+
"-m ", opt$minDiff, " ",
278+
"-e ", opt$minReads, " ",
279+
"-S ", opt$minSamples, " ",
280+
"--alpha ", opt$alpha, " ",
281+
"--beta ", opt$beta, " ",
282+
"-s ", opt$size, " ",
283+
"-z ", opt$seed
284+
)
285+
286+
msg <- paste(msg1, msg2, msg3, paste = " ")
287+
288+
try_error <- try(logFile <- file(logName, "a"), silent = TRUE)
289+
if (inherits(try_error, "try-error")) {
290+
cat("[vast diff warning]: Could not open log file\n")
291+
} else {
292+
write(msg, logFile)
293+
close(logFile)
294+
}
295+
}

0 commit comments

Comments
 (0)