Skip to content

Commit b94be1f

Browse files
committed
kingfisher worked..
1 parent 24731d1 commit b94be1f

File tree

3 files changed

+71
-81
lines changed

3 files changed

+71
-81
lines changed

test/test_sra.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ echo "WD="$WD
1515

1616
atlas init-public PRJEB20796 -w $WD
1717

18-
echo "Run Atlas"
18+
echo "Dry run Atlas"
1919

2020
atlas run qc -w $WD --dry-run $@
2121

@@ -31,6 +31,8 @@ echo "WD="$WD
3131

3232
atlas init-public ERR2213683 -w $WD
3333

34+
echo "Dry run Atlas"
35+
3436
atlas run qc -w $WD --dry-run $@
3537

3638

@@ -60,7 +62,7 @@ sed -i.bak '/ILLUMINA/d' $WD/RunInfo.csv
6062
echo "Continue public init"
6163
atlas init-public continue -w $WD
6264

63-
echo "Run Atlas"
65+
echo "Dry run Atlas"
6466

6567
atlas run qc -w $WD --dry-run $@
6668

workflow/envs/kingfisher.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Conda environment for the `kingfisher_get` rule in workflow/rules/sra.smk,
# which runs `kingfisher get` to download the runs of a sample.
channels:
  - conda-forge
  - bioconda
  - defaults
dependencies:
  - kingfisher=0.4

workflow/rules/sra.smk

Lines changed: 61 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -3,97 +3,60 @@ wildcard_constraints:
33

44

55
localrules:
6-
prefetch,
6+
kingfisher_get,
7+
merge_runs_to_sample
78

89

910
SRA_read_fractions = ["_1", "_2"] if PAIRED_END else [""]
10-
SRA_SUBDIR_RUN = "SRA/Runs"
11+
SRA_SUBDIR_RUN = Path("SRA/Runs")
1112

1213

13-
rule prefetch:
14-
output:
15-
sra=temp(touch(SRA_SUBDIR_RUN + "/{sra_run}/{sra_run}_downloaded")),
16-
# not givins sra file as output allows for continue from the same download
17-
params:
18-
outdir=SRA_SUBDIR_RUN, # prefetch creates file in subfolder with run name automatically
19-
log:
20-
"logs/SRAdownload/prefetch/{sra_run}.log",
21-
benchmark:
22-
"logs/benchmarks/SRAdownload/prefetch/{sra_run}.tsv"
23-
threads: 1
24-
resources:
25-
mem_mb=1000,
26-
time_min=60 * int(config["runtime"]["simplejob"]),
27-
internet_connection=1,
28-
conda:
29-
"%s/sra.yaml" % CONDAENV
30-
shell:
31-
" mkdir -p {params.outdir} 2> {log} "
32-
" ; "
33-
" prefetch "
34-
" --output-directory {params.outdir} "
35-
" -X 999999999 "
36-
" --progress "
37-
" --log-level info "
38-
" {wildcards.sra_run} &>> {log} "
39-
" ; "
40-
" vdb-validate {params.outdir}/{wildcards.sra_run}/{wildcards.sra_run}.sra &>> {log} "
14+
# Module-level cache for the run-info table; filled on the first call to
# load_runtable() and reused afterwards.
RunTable = None


def load_runtable():
    """Return the run-info table, loading it on first use.

    The table is obtained from
    ``atlas.init.parse_sra.load_and_validate_runinfo_table()`` and stored in
    the module-level ``RunTable`` so that later calls return the same object
    without loading it again.
    """
    global RunTable

    if RunTable is not None:
        return RunTable

    # Imported here rather than at module level so that atlas is only
    # imported when the table is actually requested.
    from atlas.init import parse_sra

    RunTable = parse_sra.load_and_validate_runinfo_table()
    return RunTable
4121

22+
def get_run_ids_for_sample(wildcards):
    """Return the run identifiers belonging to ``wildcards.sample``.

    Used as a ``params`` function by the rules ``kingfisher_get`` and
    ``merge_runs_to_sample``. The lookup itself is delegated to
    ``atlas.init.parse_sra.get_run_ids_for_sample`` on the cached run-info
    table; the exact return type is whatever that helper returns
    (presumably a list of run accessions -- TODO confirm).
    """
    from atlas.init import parse_sra

    # Use a distinct local name so the module-level ``RunTable`` cache is
    # not shadowed inside this function.
    run_table = load_runtable()
    return parse_sra.get_run_ids_for_sample(run_table, wildcards.sample)
4226

43-
rule extract_run:
44-
input:
45-
flag=rules.prefetch.output,
27+
28+
rule kingfisher_get:
4629
output:
47-
temp(
48-
expand(
49-
SRA_SUBDIR_RUN + "/{{sra_run}}/{{sra_run}}{fraction}.fastq.gz",
50-
fraction=SRA_read_fractions,
51-
)
52-
),
30+
#dir = temp(directory("Reads/tmp/runs/{sample}")),
31+
flag = temp(touch("Reads/tmp/flags/{sample}.downloaded")),
5332
params:
54-
outdir=os.path.abspath(SRA_SUBDIR_RUN + "/{sra_run}"),
55-
sra_file=SRA_SUBDIR_RUN + "/{sra_run}/{sra_run}.sra",
33+
run_ids = get_run_ids_for_sample,
34+
download_methods="ena-ascp ena-ftp prefetch",
35+
output_dir= lambda wc: SRA_SUBDIR_RUN / wc.sample,
5636
log:
57-
"logs/SRAdownload/extract/{sra_run}.log",
58-
benchmark:
59-
"logs/benchmarks/SRAdownload/fasterqdump/{sra_run}.tsv"
60-
threads: config["simplejob_threads"]
37+
Path("log/download_reads/download/{sample}.log").resolve(),
38+
threads: config['threads'],
6139
resources:
62-
time_min=60 * int(config["runtime"]["simplejob"]),
63-
mem_mb=1000, #default 100Mb
40+
mem_mb=config['mem']*1000,
41+
time_min=config["runtime"]["long"]*60,
42+
ncbi_connection=1
6443
conda:
65-
"%s/sra.yaml" % CONDAENV
44+
"../envs/kingfisher.yaml"
6645
shell:
67-
" vdb-validate {params.sra_file} &>> {log} "
68-
" ; "
69-
" parallel-fastq-dump "
70-
" --threads {threads} "
71-
" --gzip --split-files "
72-
" --outdir {params.outdir} "
73-
" --tmpdir {resources.tmpdir} "
74-
" --skip-technical --split-3 "
75-
" -s {params.sra_file} &>> {log} "
46+
" mkdir -p {params.output_dir} ; "
47+
" cd {params.output_dir} "
7648
" ; "
77-
" rm -f {params.sra_file} 2>> {log} "
78-
79-
80-
49+
"kingfisher get --run-identifiers {params.run_ids} "
50+
" --download-threads 2 --extraction-threads {threads} "
51+
" --hide-download-progress "
52+
" --output-format-possibilities 'fastq.gz' "
53+
" --force --check-md5sums "
54+
" --download-methods {params.download_methods} "
55+
" -f fastq.gz &> {log}"
8156

8257

8358

84-
85-
RunTable = None
86-
def get_run_fastq_for_sample(wildcards):
87-
88-
from atlas.init.parse_sra import load_and_validate_runinfo_table, get_run_ids_for_sample
89-
90-
# load RunTable if not already loaded
91-
global RunTable
92-
if RunTable is None:
93-
94-
RunTable = load_and_validate_runinfo_table()
95-
96-
run_ids = get_run_ids_for_sample(RunTable,wildcards.sample)
59+
def get_run_fastq_for_sample(run_ids):
9760

9861
ReadFiles = {}
9962
for fraction in SRA_read_fractions:
@@ -103,7 +66,7 @@ def get_run_fastq_for_sample(wildcards):
10366
key = fraction
10467

10568
ReadFiles[key] = expand(
106-
SRA_SUBDIR_RUN + "/{sra_run}/{sra_run}{fraction}.fastq.gz",
69+
str(SRA_SUBDIR_RUN / "{sample}/{sra_run}{fraction}.fastq.gz"),
10770
fraction=fraction,
10871
sra_run=run_ids,
10972
)
@@ -113,20 +76,39 @@ def get_run_fastq_for_sample(wildcards):
11376

11477
rule merge_runs_to_sample:
11578
input:
116-
unpack(get_run_fastq_for_sample),
79+
flag= "Reads/tmp/flags/{sample}.downloaded"
11780
output:
11881
expand(
11982
"SRA/Samples/{{sample}}/{{sample}}{fraction}.fastq.gz",
12083
fraction=SRA_read_fractions,
12184
),
85+
params:
86+
run_ids = get_run_ids_for_sample,
12287
threads: 1
12388
run:
89+
90+
# print(list( (SRA_SUBDIR_RUN / wildcards.sample).iterdir()))
91+
12492
from utils import io
12593

94+
12695
for i, fraction in enumerate(SRA_read_fractions):
127-
if fraction == "":
128-
fraction = "se"
129-
io.cat_files(input[fraction], output[i])
96+
97+
run_fastqs= expand(
98+
SRA_SUBDIR_RUN / "{sample}/{sra_run}{fraction}.fastq.gz",
99+
fraction=fraction,
100+
sra_run=params.run_ids,
101+
sample=wildcards.sample
102+
)
103+
104+
assert all([Path(f).exists() for f in run_fastqs])," Not all fastq files exist. Expected: %s" % run_fastqs
105+
106+
io.cat_files(run_fastqs, output[i])
107+
108+
109+
110+
111+
130112

131113

132114
rule download_sra:

0 commit comments

Comments
 (0)