Skip to content

Commit 4914c6f

Browse files
akokateAlexander Yip
authored and
Alexander Yip
committed
bio-duplication-stats: use maxX and maxY computed by bio-mark-duplicates
Summary: This revision outputs the maximum X and Y coordinates computed by bio-mark-duplicates to a JSON file as the tile width and height, respectively. bio-duplication-stats uses them for global coordinate computations. Test Plan: Unit tests and tiny targeted test. Reviewers: ayip, escott, kshashidhar Reviewed By: ayip Maniphest Tasks: T35065 Differential Revision: https://phabricator.grailbio.com/D44038 fbshipit-source-id: d2eaf3d
1 parent e52fc2b commit 4914c6f

File tree

4 files changed

+35
-5
lines changed

4 files changed

+35
-5
lines changed

Diff for: main.go

+2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ var (
3939
format = flag.String("format", "bam", "Output format. Value is either 'bam' or 'pam'.")
4040
metricsFile = flag.String("metrics", "", "Output metrics file")
4141
highCovFile = flag.String("high-cov-regions", "", "Output high coverage regions file")
42+
tileSizeFile = flag.String("tile-size", "", "Output width and height of tile to file")
4243
scratchDir = flag.String("scratch-dir", "/tmp", "Directory to put scratch files")
4344
parallelism = flag.Int("parallelism", runtime.NumCPU(), "Number of parallel computations to run during the markdup phase")
4445
queueLength = flag.Int("queue-length", runtime.NumCPU()*5, "Number shards to queue while waiting for flush")
@@ -79,6 +80,7 @@ func main() {
7980
IndexFile: *indexFile,
8081
MetricsFile: *metricsFile,
8182
HighCoverageIntervalFile: *highCovFile,
83+
TileSizeFile: *tileSizeFile,
8284
Format: *format,
8385
CoverageMax: *maxDepth,
8486
ShardSize: *shardSize,

Diff for: markduplicates/mark_duplicates.go

+10-4
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,9 @@ type OpticalDetector interface {
6161
// influence optical detection.
6262
GetRecordProcessor() bampair.RecordProcessor
6363

64-
// RecordProcessorsDone should be called after the RecordProcessors
65-
// have seen all the input records.
66-
RecordProcessorsDone()
64+
// RecordProcessorsDone should return maximum X and Y co-ordinates and
65+
// be called after the RecordProcessors have seen all the input records.
66+
RecordProcessorsDone() (int, int)
6767

6868
// Detect identifies the optical duplicates in pairs and returns
6969
// their names in a slice. readGroupLibrary maps readGroup to
@@ -80,6 +80,7 @@ type Opts struct {
8080
IndexFile string
8181
MetricsFile string
8282
HighCoverageIntervalFile string
83+
TileSizeFile string
8384
Format string
8485
CoverageMax int
8586
ShardSize int
@@ -246,7 +247,7 @@ func (m *MarkDuplicates) Mark(shards []bam.Shard) (*MetricsCollection, error) {
246247
m.shardInfo = shardInfo
247248
m.globalMetrics.maxAlignDist = m.globalMaxAlignDist
248249
if m.Opts.OpticalDetector != nil {
249-
m.Opts.OpticalDetector.RecordProcessorsDone()
250+
m.globalMetrics.maxX, m.globalMetrics.maxY = m.Opts.OpticalDetector.RecordProcessorsDone()
250251
}
251252

252253
// Determine high coverage intervals if desired.
@@ -874,6 +875,11 @@ func SetupAndMark(ctx context.Context, provider bamprovider.Provider, opts *Opts
874875
return err
875876
}
876877
}
878+
if opts.TileSizeFile != "" {
879+
if err := writeTileSize(ctx, opts, globalMetrics); err != nil {
880+
return err
881+
}
882+
}
877883
if opts.OpticalHistogram != "" {
878884
if err := writeOpticalHistogram(ctx, opts, globalMetrics); err != nil {
879885
return err

Diff for: markduplicates/metrics.go

+21
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ package markduplicates
1515

1616
import (
1717
"context"
18+
"encoding/json"
1819
"fmt"
1920
"os"
2021
"sort"
@@ -94,6 +95,8 @@ func (m *Metrics) Add(other *Metrics) {
9495
type MetricsCollection struct {
9596
// Global metrics
9697
maxAlignDist int
98+
maxX int
99+
maxY int
97100

98101
// OpticalDistance stores the number of duplicate read pairs that
99102
// have the given Euclidean distance.
@@ -253,6 +256,24 @@ func writeHighCoverageIntervals(ctx context.Context, opts *Opts, header *sam.Hea
253256
return nil
254257
}
255258

259+
// writeTileSize creates (or truncates) the file named by opts.TileSizeFile
// and writes a single JSON object to it with two integer fields:
// "tileWidth", taken from globalMetrics.maxX, and "tileHeight", taken from
// globalMetrics.maxY.
//
// It returns a wrapped error if the file cannot be created, the encoder's
// error if writing the JSON fails, or the error from closing the file when
// everything before the close succeeded. The ctx parameter is not used in
// the body of this function.
func writeTileSize(ctx context.Context, opts *Opts, globalMetrics *MetricsCollection) (err error) {
260+
var f *os.File
261+
f, err = os.Create(opts.TileSizeFile)
262+
if err != nil {
263+
return errors.E(err, "Couldn't create tile size file:", opts.TileSizeFile)
264+
}
265+
// The deferred close assigns to the named result err, so a Close failure
// is reported to the caller, but only when no earlier error is already
// being returned.
defer func() {
266+
if err2 := f.Close(); err == nil && err2 != nil {
267+
err = err2
268+
}
269+
}()
270+
enc := json.NewEncoder(f)
271+
return enc.Encode(map[string]int{
272+
"tileWidth": globalMetrics.maxX,
273+
"tileHeight": globalMetrics.maxY,
274+
})
275+
}
276+
256277
func writeOpticalHistogram(ctx context.Context, opts *Opts, globalMetrics *MetricsCollection) (err error) {
257278
var f *os.File
258279
f, err = os.Create(opts.OpticalHistogram)

Diff for: markduplicates/optical_detector.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,8 @@ func (t *TileOpticalDetector) GetRecordProcessor() bampair.RecordProcessor {
9898
}
9999

100100
// RecordProcessorsDone implements OpticalDetector.
101-
func (t *TileOpticalDetector) RecordProcessorsDone() {
101+
func (t *TileOpticalDetector) RecordProcessorsDone() (int, int) {
102+
// This implementation does no work and always reports 0 for both
// returned values.
return 0, 0
102103
}
103104

104105
// Detect implements OpticalDetector.

0 commit comments

Comments
 (0)