1
1
# !/usr/bin/env Rscript
2
2
#
3
- # Author: Tim Sterne-Weiler, 2014
4
-
5
- # Modifications by Ulrich Braunschweig 2018
3
+ # Author: Tim Sterne-Weiler, Ulrich Braunschweig 2014-2024
4
+
6
5
7
- # This function takes a qual and returns c(post_alpha, post_beta)
8
- # Increments by prior alpha and prior distribution beta, uniform by default
6
+ # # Check that columns of INCLUSION... table are what we think they are
7
+ checkHeader <- function (x , replicateA , replicateB ) {
8
+ reps <- c(replicateA , replicateB )
9
+ sampInd <- unlist(sapply(reps , FUN = function (y ) {which(x == y )}))
10
+ if (length(sampInd ) != length(reps )) {
11
+ stop(" [vast diff error]: Not all replicate names found in input header!\n " )
12
+ }
13
+ namesOK <- all(paste0(reps , " -Q" ) == x [sampInd + 1 ])
14
+ if (! namesOK | ! all(x [1 : 3 ] == c(" GENE" , " EVENT" , " COORD" ))) {
15
+ stop(" [vast diff error]: Input table does not have expected format!\n " )
16
+ }
17
+ }
18
+
19
+ # # This function takes a qual and returns c(post_alpha, post_beta)
20
+ # # Increments by prior alpha and prior distribution beta, uniform by default
9
21
parseQual <- function (qual , prior_alpha = 1 , prior_beta = 1 ) {
10
22
res1 <- as.numeric(sub(" [^@]*@([^,]*),.*" , " \\ 1" , qual ))
11
23
res2 <- as.numeric(sub(" [^@]*@[^,]*,(.*)" , " \\ 1" , qual ))
@@ -19,44 +31,43 @@ parseQual <- function(qual, prior_alpha=1, prior_beta=1) {
19
31
cbind(res1 , res2 )
20
32
}
21
33
22
- # calculate the probability that the first dist is > than second
23
- # P(psi1 > psi2) when alpha=0; more generally we are determining the probability
24
- # that Psi1 is greater than Psi2 by alpha.. eg. P((psi1 - psi2) > alpha) =
34
+ # # Calculate the probability that the first dist is > than second
35
+ # # P(psi1 > psi2) when alpha=0; more generally we are determining the probability
36
+ # # that psi1 is greater than psi2 by alpha.. eg. P((psi1 - psi2) > alpha) =
25
37
# # IMPORTANT: run this function as sample(firstDist, length(firstDist)) AND
26
- # # sample(secondDist, length(secondDist))
38
+ # # sample(secondDist, length(secondDist))
27
39
# # UNLESS you have paired data, then don't sample.
28
40
pDiff <- function (firstDist , secondDist , alpha = 0.15 ) {
29
- N <- length(firstDist )
30
- pass <- length( which( (firstDist - secondDist ) > alpha ) )
41
+ N <- length(firstDist )
42
+ pass <- length(which((firstDist - secondDist ) > alpha ) )
31
43
pass / N
32
44
}
33
45
34
- # This function finds the maximum difference between psi1 and psi2 given
35
- # at least some acceptable probability for difference. Defaults to 0.8
36
- # distributions where no diff value exists with a probability > 0.8 are given
37
- # maxDiff of 0.
38
- maxDiff <- function (firstDist , secondDist , acceptProb = 0.9 ) {
39
- alphaSet <- seq( 0 , 1 , 0.01 ) # make this global?
40
- probs <- unlist(lapply( alphaSet , function ( x ) { pDiff( firstDist , secondDist , x ) } ))
46
+ # # This function finds the maximum difference between psi1 and psi2 given
47
+ # # at least some acceptable probability for difference. Defaults to 0.8
48
+ # # distributions where no diff value exists with a probability > 0.8 are given
49
+ # # maxDiff of 0.
50
+ maxDiff <- function (firstDist , secondDist , acceptProb = 0.9 , alphaSet ) {
51
+ probs <- unlist(lapply( alphaSet , pDiff ,
52
+ firstDist = firstDist , secondDist = secondDist ))
41
53
ind <- max(c(which(probs > acceptProb ), 1 ))
42
54
alphaSet [ind ]
43
55
}
44
56
45
- #
46
- # return the beta variance
57
+ # # Return the beta variance
47
58
betaVar <- function (alpha , beta ) {
48
- var <- alpha * beta / (
59
+ var <- alpha * beta / (
49
60
((alpha + beta ) ** 2 ) * (alpha + beta + 1 )
50
61
)
51
62
var
52
63
}
53
64
54
- #
65
+ # # Return a confidence interval
55
66
betaCI <- function (betaDist , percentile = c(0.05 , 0.95 )) {
56
67
quantile(betaDist , p = percentile , na.rm = T )
57
68
}
58
69
59
- # Extension of betaCI function that includes the sampling step
70
+ # # Extension of betaCI function that includes the sampling step
60
71
betaCISample <- function (alpha , beta , n = 5000 ) {
61
72
if (is.na(alpha ) || is.na(beta )) {
62
73
sample <- NA
@@ -66,43 +77,44 @@ betaCISample <- function(alpha, beta, n = 5000) {
66
77
return (betaCI(sample ))
67
78
}
68
79
69
-
70
- # ## MAKE VISUAL OUTPUT
71
- plotDiff <- function ( inpOne , inpTwo , expOne , expTwo , maxD , medOne , medTwo , sampOneName , sampTwoName , rever ) {
80
+ plotDiff <- function ( inpOne , inpTwo , expOne , expTwo , maxD , medOne , medTwo ,
81
+ sampOneName , sampTwoName , alphaSet , rever ) {
82
+ # ## Make visual output
72
83
73
84
if (rever ) { # write this better. ;-)
74
85
curCol <- cbb [3 : 2 ]
75
86
} else {
76
87
curCol <- cbb [2 : 3 ]
77
88
}
78
89
79
- if (length(expOne ) == 0 || length(expTwo ) == 0 ) { return (NULL ) }
90
+ if (length(expOne ) == 0 || length(expTwo ) == 0 ) {return (NULL )}
80
91
one <- data.frame (x = expOne , y = - 0.5 )
81
92
two <- data.frame (x = expTwo , y = - 0.5 )
82
93
83
- distPlot <- ggplot(melt(as.data.frame(
84
- do.call( cbind , list ( inpOne , inpTwo ))
85
- ), measure.vars = c( " V1 " , " V2 " )), aes(fill = variable , x = value ))+
86
- geom_histogram(aes(y = .. density.. ) , binwidth = 0.03333 ,alpha = 0.5 , col = " grey" , position = " identity" )+
87
- theme_bw()+ xlim( c(0 ,1 )) + xlab(expression(hat(Psi )))+
88
- scale_fill_manual(values = curCol , labels = c(sampOneName , sampTwoName ), name = " Samples" )+
89
- geom_point(data = one , mapping = aes(x = x , y = y ), col = cbb [2 ], fill = cbb [2 ], alpha = 0.85 , inherit.aes = FALSE )+
90
- geom_point(data = two , mapping = aes(x = x , y = y ), col = cbb [3 ], fill = cbb [3 ], alpha = 0.85 , inherit.aes = FALSE )
94
+ distPlot <- ggplot(melt(as.data.frame(do.call( cbind , list ( inpOne , inpTwo ))),
95
+ measure.vars = c( " V1 " , " V2 " )),
96
+ aes(fill = variable , x = value )) +
97
+ geom_histogram(aes(y = after_stat( density )) , binwidth = 0.03333 , alpha = 0.5 , col = " grey" , position = " identity" ) +
98
+ theme_bw() + scale_x_continuous( limits = c(0 , 1 ), oob = scales :: oob_keep ) + xlab(expression(hat(Psi ))) +
99
+ scale_fill_manual(values = curCol , labels = c(sampOneName , sampTwoName ), name = " Samples" ) +
100
+ geom_point(data = one , mapping = aes(x = x , y = y ), col = cbb [2 ], fill = cbb [2 ], alpha = 0.50 , inherit.aes = FALSE ) +
101
+ geom_point(data = two , mapping = aes(x = x , y = y ), col = cbb [3 ], fill = cbb [3 ], alpha = 0.50 , inherit.aes = FALSE )
91
102
92
103
probPlot <- ggplot(as.data.frame(cbind(seq(0 ,1 ,0.01 ),
93
- unlist(lapply(alphaList , function (x ) {
104
+ unlist(lapply(alphaSet , function (x ) {
94
105
pDiff(inpOne , inpTwo , x )
95
- })))), aes(x = V1 , y = V2 ))+
96
- geom_line()+ theme_bw()+
97
- geom_vline(xintercept = maxD , lty = " dashed" , col = cbb [7 ])+
98
- ylab(expression(P((hat(Psi )[1 ]- hat(Psi )[2 ]) > x )))+
99
- xlab(expression(x ))+ ylim( c(0 ,1 )) +
100
- annotate(" text" , x = (maxD + 0.08 ), y = 0.05 , label = maxD , col = cbb [7 ])
101
-
102
- return (list (distPlot , probPlot ))
106
+ })))), aes(x = V1 , y = V2 )) +
107
+ geom_line() + theme_bw() +
108
+ geom_vline(xintercept = maxD , lty = " dashed" , col = cbb [7 ]) +
109
+ ylab(expression(P((hat(Psi )[1 ] - hat(Psi )[2 ]) > x ))) +
110
+ xlab(expression(x )) + scale_y_continuous( limits = c(0 , 1 ), oob = scales :: oob_keep ) +
111
+ annotate(" text" , x = (maxD + 0.08 ), y = 0.05 , label = maxD , col = cbb [7 ])
112
+
113
+ return (list (distPlot , probPlot ))#
103
114
}
104
115
105
116
plotPrint <- function (plotTitle , plotCoord , plotList ) {
117
+ # ## Print saved plotting information to output file
106
118
grid.newpage()
107
119
pushViewport(viewport(layout = grid.layout(3 , 2 , widths = unit(c(5 , 4 ), " null" ), heights = unit(c(1 , 0.5 , 5 ), " null" ))))
108
120
grid.text(as.character(plotTitle ), check.overlap = TRUE , gp = gpar(font = 2 ), draw = T , vp = viewport(layout.pos.row = 1 , layout.pos.col = 1 : 2 ))
@@ -117,106 +129,167 @@ diffBeta <- function(i, lines, opt,
117
129
totalFirst , totalSecond ,
118
130
shapeFirstAve , shapeSecondAve ,
119
131
expFirst , expSecond ,
120
- repA.qualInd , repB.qualInd ) {
121
- # ## Main diff functionality; fit beta distributions to sample groups for one event and compare
122
- # ## Is applied to each line of the current nLines of the INCLUSION... table
123
-
124
- # # if no data, next; # adapted from @lpantano's fork 7/22/2015
125
- if ( (sum(totalFirst [i ,] > (opt $ minReads + opt $ alpha + opt $ beta )) < opt $ minSamples ) ||
126
- (sum(totalSecond [i ,] > (opt $ minReads + opt $ alpha + opt $ beta )) < opt $ minSamples ) ) {
127
- return (NULL )
128
- }
132
+ repA.qualInd , repB.qualInd ,
133
+ okFirst , okSecond , skip ,
134
+ alphaSet ) {
135
+ # # Main diff functionality; fit beta distributions to sample groups for one event and compare
136
+ # # Is applied to each line of the current nLines of the INCLUSION... table
129
137
130
138
# # Sample Posterior Distributions
131
139
psiFirst <- lapply(1 : (dim(shapeFirst )[2 ]), function (x ) {
132
- # sample here from rbeta(N, alpha, beta) if > -e
133
- if ( totalFirst [i ,x ] < opt $ minReads ) { return (NULL ) }
134
- rbeta(opt $ size , shape1 = shapeFirst [i ,x ,1 ], shape2 = shapeFirst [i ,x ,2 ])
140
+ # # sample here from rbeta(N, alpha, beta) if > -e
141
+ if ( ! okFirst [i ,x ]) {return (NULL )}
142
+ rbeta(opt $ size , shape1 = shapeFirst [i ,x ,1 ], shape2 = shapeFirst [i ,x ,2 ])
135
143
})
136
144
137
145
psiSecond <- lapply(1 : (dim(shapeSecond )[2 ]), function (x ) {
138
- # sample here from rbeta(N, alpha, beta) if > -e
139
- if ( totalSecond [i ,x ] < opt $ minReads ) { return (NULL ) }
140
- rbeta(opt $ size , shape1 = shapeSecond [i ,x ,1 ], shape2 = shapeSecond [i ,x ,2 ])
146
+ # # sample here from rbeta(N, alpha, beta) if > -e
147
+ if ( ! okSecond [i ,x ]) {return (NULL )}
148
+ rbeta(opt $ size , shape1 = shapeSecond [i ,x ,1 ], shape2 = shapeSecond [i ,x ,2 ])
141
149
})
142
150
143
- if (opt $ paired ) { # make sure both samples have a non-NULL replicate
144
- for (lstInd in 1 : length(psiFirst )) {
145
- if (is.null(psiFirst [[lstInd ]]) || is.null(psiSecond [[lstInd ]])) {
146
- psiFirst [[lstInd ]] <- NULL
147
- psiSecond [[lstInd ]] <- NULL
148
- }
149
- }
150
- }
151
-
152
151
# # Create non-parametric Joint Distributions
153
152
psiFirstComb <- do.call(c , psiFirst )
154
153
psiSecondComb <- do.call(c , psiSecond )
155
154
156
- if ( length(psiFirstComb ) < = 0 || length(psiSecondComb ) < = 0 ) { return (NULL ) }
157
-
158
- # # if they aren't paired, then shuffle the joint distributions...
159
- if ( ! opt $ paired ) {
160
- paramFirst <- try (suppressWarnings(
161
- fitdistr(psiFirstComb ,
162
- " beta" ,
163
- list ( shape1 = shapeFirstAve [i ,1 ], shape2 = shapeFirstAve [i ,2 ])
164
- )$ estimate ), TRUE )
165
- paramSecond <- try (suppressWarnings(
166
- fitdistr(psiSecondComb ,
167
- " beta" ,
168
- list ( shape1 = shapeSecondAve [i ,1 ], shape2 = shapeSecondAve [i ,2 ])
169
- )$ estimate ), TRUE )
170
- # # if optimization fails its because the distribution is too narrow
171
- # # in which case our starting shapes should already be good enough
172
- if (class(paramFirst ) != " try-error" ) {
173
- psiFirstComb <- rbeta(opt $ size , shape1 = paramFirst [1 ], shape2 = paramFirst [2 ])
174
- }
175
- if (class(paramSecond ) != " try-error" ) {
176
- psiSecondComb <- rbeta(opt $ size , shape1 = paramSecond [1 ], shape2 = paramSecond [2 ])
177
- }
155
+ # # Try to to fit a beta distribution on the replicates' parameter estimates.
156
+ if (! opt $ paired ) {
157
+ if (length(psiFirstComb ) > 0 ) {
158
+ paramFirst <- try(suppressWarnings(
159
+ fitdistr(psiFirstComb ,
160
+ " beta" ,
161
+ list (shape1 = shapeFirstAve [i ,1 ], shape2 = shapeFirstAve [i ,2 ])
162
+ )$ estimate ), TRUE )
163
+ }
164
+ if (length(psiSecondComb )) {
165
+ paramSecond <- try(suppressWarnings(
166
+ fitdistr(psiSecondComb ,
167
+ " beta" ,
168
+ list (shape1 = shapeSecondAve [i ,1 ], shape2 = shapeSecondAve [i ,2 ])
169
+ )$ estimate ), TRUE )
170
+ }
171
+ # # If optimization fails it's because the distribution is too narrow
172
+ # # in which case our starting shapes should already be good enough.
173
+ # # Shuffle since not paired, and downsample to size of smaller number
174
+ # # but don't downsize if the other sample type is absent anyway.
175
+ minSample <- min(c(length(psiFirstComb ), length(psiSecondComb )))
176
+
177
+ if (length(psiFirstComb ) > 0 ) {
178
+ if (! inherits(paramFirst , " try-error" )) {
179
+ psiFirstComb <- rbeta(opt $ size , shape1 = paramFirst [1 ], shape2 = paramFirst [2 ])
180
+ } else {
181
+ if (length(psiSecondComb ) > 0 ) {
182
+ psiFirstComb <- sample(psiFirstComb , minSample )
183
+ }
184
+ }
185
+ }
186
+ if (length(psiSecondComb ) > 0 ) {
187
+ if (! inherits(paramSecond , " try-error" )) {
188
+ psiSecondComb <- rbeta(opt $ size , shape1 = paramSecond [1 ], shape2 = paramSecond [2 ])
189
+ } else {
190
+ if (length(psiFirstComb ) > 0 ) {
191
+ psiSecondComb <- sample(psiSecondComb , minSample )
192
+ }
193
+ }
194
+ }
178
195
}
179
196
180
197
# # get empirical posterior median of psi
181
198
medOne <- median(psiFirstComb )
182
199
medTwo <- median(psiSecondComb )
183
200
184
201
# # look for a max difference given prob cutoff...
185
- if (medOne > medTwo ) {
186
- max <- maxDiff(psiFirstComb , psiSecondComb , opt $ prob )
202
+ if (skip [i ]) {
203
+ if (is.null(medOne )) {medOne <- NA }
204
+ if (is.null(medTwo )) {medTwo <- NA }
205
+ max <- NA
187
206
} else {
188
- max <- maxDiff(psiSecondComb , psiFirstComb , opt $ prob )
207
+ if (medOne > medTwo ) {
208
+ max <- maxDiff(psiFirstComb , psiSecondComb , opt $ prob , alphaSet )
209
+ } else {
210
+ max <- maxDiff(psiSecondComb , psiFirstComb , opt $ prob , alphaSet )
211
+ }
189
212
}
190
213
191
- # # SIGNIFICANT from here on out:
192
-
193
- filtOut <- sprintf(" %s\t %s\t %f\t %f\t %f\t %s" , lines [[i ]][1 ], lines [[i ]][2 ], medOne , medTwo , medOne - medTwo , round(max ,2 ))
214
+ filtOut <- sprintf(" %s\t %s\t %f\t %f\t %f\t %s" ,
215
+ lines [[i ]][1 ], lines [[i ]][2 ], medOne , medTwo , medOne - medTwo , round(max ,2 ))
194
216
195
217
# # check for significant difference
196
- if (max < opt $ minDiff ) {
197
- # # non-sig, return null plots and text output
198
- return (list (NULL , NULL , NULL , NULL , filtOut ))
199
- } else {
200
- sigInd <- i
201
-
202
- if (opt $ noPDF ) {
203
- eventTitle <- NULL
204
- eventCoord <- NULL
205
- retPlot <- NULL
206
- } else {
218
+ if (opt $ noPDF || (is.na(max ) || max < opt $ minDiff )) {
219
+ # # non-sig, return null plots and text output
220
+ return (list (NULL , NULL , NULL , NULL , filtOut ))
221
+ } else {
222
+ # # significant; plot
223
+ sigInd <- i
207
224
eventTitle <- paste(c(" Gene: " , lines [[i ]][1 ], " Event: " , lines [[i ]][2 ]), collapse = " " )
208
225
eventCoord <- paste(c(" Coordinates: " , lines [[i ]][3 ]), collapse = " " )
209
226
210
- # # Print visual output to pdf;
211
- if ( medOne > medTwo ) {
227
+ # # Store visual output to be printed to pdf
228
+ if ( medOne > medTwo ) {
212
229
retPlot <- plotDiff(psiFirstComb , psiSecondComb ,
213
- expFirst [i ,], expSecond [i ,], max , medOne , medTwo , sampOneName , sampTwoName , FALSE )
230
+ expFirst [i ,][okFirst [i ,]], expSecond [i ,][okSecond [i ,]],
231
+ max , medOne , medTwo , sampOneName , sampTwoName , alphaSet , FALSE )
214
232
} else {
215
233
retPlot <- plotDiff(psiSecondComb , psiFirstComb ,
216
- expFirst [i ,], expSecond [i ,], max , medTwo , medOne , sampTwoName , sampOneName , TRUE )
234
+ expFirst [i ,][okFirst [i ,]], expSecond [i ,][okSecond [i ,]],
235
+ max , medTwo , medOne , sampTwoName , sampOneName , alphaSet , TRUE )
217
236
}
218
- }
219
- # # sig event return
220
- return (list (retPlot , eventTitle , eventCoord , sigInd , filtOut )) # return of mclapply function
237
+
238
+ # # sig event return
239
+ return (list (retPlot , eventTitle , eventCoord , sigInd , filtOut ))
221
240
}
222
241
}
242
+
243
+ writeLog <- function (vastPath , opt ) {
244
+ # ## Add a line with call information to the general vast-tools log file
245
+
246
+ logName <- file.path(opt $ output , " VTS_LOG_commands.txt" )
247
+ vastVer <- scan(file.path(vastPath , " VERSION" ), what = " character" , quiet = T )
248
+
249
+ # # Get date, time and vast-tools version
250
+ systime <- Sys.time()
251
+ sdate <- paste(
252
+ sub(" ([0-9]{4})-.+" , " \\ 1" , systime ),
253
+ sub(" [0-9]{4}-[0-9]{2}-([0-9]{2}).+" , " \\ 1" , systime ),
254
+ sub(" [0-9]{4}-([0-9]{2})-.+" , " \\ 1" , systime ),
255
+ sep = " -" )
256
+ stime <- sub(" [^ ]+ (.{5}):[0-9]{2}.*" , " \\ 1" , systime )
257
+
258
+ msg1 <- paste0(
259
+ " [VAST-TOOLS v" , vastVer , " , " ,
260
+ sdate , " (" , stime , " )]"
261
+ )
262
+
263
+ msg2 <- " vast-tools diff"
264
+
265
+ # # Format input options
266
+ msg3 <- paste0(
267
+ " -a " , opt $ replicateA , " " ,
268
+ " -b " , opt $ replicateB , " " ,
269
+ " --sampleNameA " , opt $ sampleNameA , " " ,
270
+ " --sampleNameB " , opt $ sampleNameB , " " ,
271
+ " -i " , opt $ input , " " ,
272
+ " -n " , opt $ nLines , " " ,
273
+ " -p " , opt $ paired , " " ,
274
+ " -d " , opt $ baseName , " " ,
275
+ " -o " , opt $ output , " " ,
276
+ " -r " , opt $ prob , " " ,
277
+ " -m " , opt $ minDiff , " " ,
278
+ " -e " , opt $ minReads , " " ,
279
+ " -S " , opt $ minSamples , " " ,
280
+ " --alpha " , opt $ alpha , " " ,
281
+ " --beta " , opt $ beta , " " ,
282
+ " -s " , opt $ size , " " ,
283
+ " -z " , opt $ seed
284
+ )
285
+
286
+ msg <- paste(msg1 , msg2 , msg3 , paste = " " )
287
+
288
+ try_error <- try(logFile <- file(logName , " a" ), silent = TRUE )
289
+ if (inherits(try_error , " try-error" )) {
290
+ cat(" [vast diff warning]: Could not open log file\n " )
291
+ } else {
292
+ write(msg , logFile )
293
+ close(logFile )
294
+ }
295
+ }
0 commit comments