Skip to content

Commit b2b17fa

Browse files
committed
WIP: Only stage necessary files
This branch is an attempt to only stage the necessary files. This would be very efficient on S3, but it's also rather complicated and involves a lot of hacks. It is also unnecessary on systems with a shared file system or Fusion.
1 parent 339fda2 commit b2b17fa

File tree

4 files changed

+50
-4
lines changed

4 files changed

+50
-4
lines changed

assets/schema_input.json

+5-1
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,17 @@
1313
"errorMessage": "Sample name must be provided and cannot contain spaces",
1414
"meta": ["id"]
1515
},
16+
"path": {
17+
"type": "string",
18+
"format": "path"
19+
},
1620
"checksums": {
1721
"type": "string",
1822
"format": "file-path",
1923
"exists": true,
2024
"errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
2125
}
2226
},
23-
"required": ["id", "checksums"]
27+
"required": ["id", "path", "checksums"]
2428
}
2529
}

modules/local/sha256sum/main.nf

+4-2
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,15 @@ process SHA256SUM_CHECK {
99
'nf-core/ubuntu:20.04' }"
1010

1111
input:
12-
tuple val(meta), path(checksum_file)
12+
tuple val(meta), path(checksum_file), path("rename.sh"), path("files/??????"),
1313

1414
output:
1515
tuple val(meta), path(report)
1616

1717
script:
1818
"""
19-
sha256sum -c ${checksum_file} > ${meta.id}.report.txt
19+
bash rename.sh
20+
21+
cd work && sha256sum -c ${checksum_file} > ../${meta.id}.report.txt
2022
"""
2123
}

subworkflows/local/checksum_verify.nf

+41
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,25 @@ include { methodsDescriptionText } from '../local/utils_nfcore_datasync_pipeline
1616
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1717
*/
1818

19+
def splitChecksumFile(f, batchsize) {
20+
lines = []
21+
f.eachLine { line, index ->
22+
def parts = line.split(/\s+/)
23+
lines.add([parts[0], parts[1], String.format('%06d', (index % batchsize) + 1)])
24+
}
25+
return lines.collate(batchsize)
26+
}
27+
28+
def makeRenameScript(batch) {
29+
script = []
30+
script.add("#!/bin/bash -euo pipefail")
31+
script.add("mkdir -p work")
32+
batch.each { checksum, filename, index ->
33+
script.add("mkdir -p work/\$(dirname '${filename}') && mv files/${index} 'work/${filename}'")
34+
}
35+
return script.join("\n")
36+
}
37+
1938
workflow CHECKSUM_VERIFY {
2039

2140
take:
@@ -26,6 +45,28 @@ workflow CHECKSUM_VERIFY {
2645
ch_versions = Channel.empty()
2746
ch_multiqc_files = Channel.empty()
2847

48+
ch_batches = ch_samplesheet.map{ meta, path, checksum_file ->
49+
splitChecksumFile(checksum_file, params.chunksize).withIndex().collect {
50+
chunk, index -> [meta, chunk]
51+
}
52+
}.flatMap { meta, chunk -> tuple(meta, chunk)}
53+
// ch_batches.view()
54+
ch_scripts = ch_batches.map{ meta, chunk -> [meta, makeRenameScript(chunk)] }
55+
// ch_scripts.view()
56+
ch_files = ch_batches.join(ch_samplesheet).map{ meta, chunk, path, checksum_file ->
57+
[meta, chunk.collect{ checksum, filename, idx -> file("${path}/${filename}")}]
58+
}
59+
ch_files.view()
60+
// ch_files = ch_batches.map{
61+
// meta, path, chunk -> chunk.each{
62+
// checksum, filename, numeric_id ->
63+
// files = []
64+
// files.add(file("${path}/${filename}", checkIfExists:true))
65+
// }
66+
// }
67+
// ch_files.view()
68+
// SHA256SUM_CHECK([[:], ["foo/test.txt", "foo/bar.txt"], [file("foo/test.txt"), file("foo/bar.txt")], []])
69+
2970
emit:
3071
versions = ch_versions // channel: [ path(versions.yml) ]
3172
multiqc_files = ch_multiqc_files

subworkflows/local/utils_nfcore_datasync_pipeline/main.nf

-1
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,6 @@ workflow PIPELINE_INITIALISATION {
7474

7575
Channel
7676
.fromList(samplesheetToList(params.input, "${projectDir}/assets/schema_input.json"))
77-
.groupTuple()
7877
.map { samplesheet ->
7978
validateInputSamplesheet(samplesheet)
8079
}

0 commit comments

Comments
 (0)