@@ -3,97 +3,60 @@ wildcard_constraints:
3
3
4
4
5
5
localrules :  # Snakemake directive: the rules listed here are run locally instead of being submitted as cluster jobs
6
- prefetch ,
6
+ kingfisher_get ,  # download rule (one job per sample, see rule kingfisher_get)
7
+ merge_runs_to_sample  # concatenates the per-run fastq files of a sample
7
8
8
9
9
10
SRA_read_fractions = ["_1" , "_2" ] if PAIRED_END else ["" ]  # fastq file-name suffixes: two mates when PAIRED_END is truthy, otherwise one unsuffixed file
10
- SRA_SUBDIR_RUN = "SRA/Runs"
11
+ SRA_SUBDIR_RUN = Path ( "SRA/Runs" )  # directory holding per-run downloads; a Path so it can be joined with `/` (wrap in str() where a plain string is required)
11
12
12
13
13
- rule prefetch :  # (removed in this change) downloaded a single SRA run with `prefetch` and checked it with `vdb-validate`
14
- output :
15
- sra = temp (touch (SRA_SUBDIR_RUN + "/{sra_run}/{sra_run}_downloaded" )),
16
- # not declaring the .sra file as an output allows a download to be continued from the same file
17
- params :
18
- outdir = SRA_SUBDIR_RUN , # prefetch automatically creates the file in a subfolder named after the run
19
- log :
20
- "logs/SRAdownload/prefetch/{sra_run}.log" ,
21
- benchmark :
22
- "logs/benchmarks/SRAdownload/prefetch/{sra_run}.tsv"
23
- threads : 1
24
- resources :
25
- mem_mb = 1000 ,
26
- time_min = 60 * int (config ["runtime" ]["simplejob" ]),
27
- internet_connection = 1 ,
28
- conda :
29
- "%s/sra.yaml" % CONDAENV
30
- shell :
31
- " mkdir -p {params.outdir} 2> {log} "  # `2>` starts the log file; the following commands append with `&>>`
32
- " ; "
33
- " prefetch "
34
- " --output-directory {params.outdir} "
35
- " -X 999999999 "  # presumably the maximum download size, set very high so large runs are not skipped — confirm in the sra-tools docs
36
- " --progress "
37
- " --log-level info "
38
- " {wildcards.sra_run} &>> {log} "
39
- " ; "
40
- " vdb-validate {params.outdir}/{wildcards.sra_run}/{wildcards.sra_run}.sra &>> {log} "
14
+ RunTable = None  # module-level cache for the run-info table; stays None until load_runtable() is first called
15
+ def load_runtable ():  # Return the run-info table, loading it through parse_sra on the first call and reusing the cached object afterwards.
16
+ global RunTable  # rebind the module-level cache rather than creating a local
17
+ if RunTable is None :  # load at most once per workflow invocation
18
+ from atlas .init import parse_sra  # imported lazily, only when the table really has to be loaded
19
+ RunTable = parse_sra .load_and_validate_runinfo_table ()  # called without arguments; where the table is read from is decided inside parse_sra (not visible here)
20
+ return RunTable
41
21
22
+ def get_run_ids_for_sample (wildcards ):  # Look up the run ids belonging to wildcards.sample; used as a `params` function by the rules below, so it receives the job's wildcards.
23
+ RunTable = load_runtable ()  # local name that shadows the module-level RunTable inside this function
24
+ from atlas .init import parse_sra
25
+ return parse_sra .get_run_ids_for_sample (RunTable , wildcards .sample )  # presumably the SRA run accessions of the sample — confirm in parse_sra
42
26
43
- rule extract_run :
44
- input :
45
- flag = rules .prefetch .output ,
27
+
28
+ rule kingfisher_get :  # Download all SRA runs of one sample with `kingfisher get` into SRA_SUBDIR_RUN/<sample>; only a flag file is declared as output.
46
29
output :
47
- temp (
48
- expand (
49
- SRA_SUBDIR_RUN + "/{{sra_run}}/{{sra_run}}{fraction}.fastq.gz" ,
50
- fraction = SRA_read_fractions ,
51
- )
52
- ),
30
+ #dir = temp(directory("Reads/tmp/runs/{sample}")),
31
+ flag = temp (touch ("Reads/tmp/flags/{sample}.downloaded" )),  # empty marker file; the downloaded fastq files themselves are not tracked as outputs
53
32
params :
54
- outdir = os .path .abspath (SRA_SUBDIR_RUN + "/{sra_run}" ),
55
- sra_file = SRA_SUBDIR_RUN + "/{sra_run}/{sra_run}.sra" ,
33
+ run_ids = get_run_ids_for_sample ,  # resolved per job from the run table for {sample}
34
+ download_methods = "ena-ascp ena-ftp prefetch" ,  # passed verbatim to --download-methods
35
+ output_dir = lambda wc : SRA_SUBDIR_RUN / wc .sample ,  # one download directory per sample
56
36
log :
57
- "logs/SRAdownload/extract/{sra_run}.log" ,
58
- benchmark :
59
- "logs/benchmarks/SRAdownload/fasterqdump/{sra_run}.tsv"
60
- threads : config ["simplejob_threads" ]
37
+ Path ("log/download_reads/download/{sample}.log" ).resolve (),  # absolute because the shell command cd's into output_dir before writing the log; NOTE(review): the removed rules logged under "logs/" — confirm that "log/" is intended
38
+ threads : config ['threads' ],
61
39
resources :
62
- time_min = 60 * int (config ["runtime" ]["simplejob" ]),
63
- mem_mb = 1000 , #default 100Mb
40
+ mem_mb = config ['mem' ]* 1000 ,  # assumes config['mem'] is given in GB — TODO confirm
41
+ time_min = config ["runtime" ]["long" ]* 60 ,  # assumes config["runtime"]["long"] is given in hours — TODO confirm
42
+ ncbi_connection = 1  # custom resource (the removed rules used internet_connection); presumably caps concurrent downloads via --resources — confirm
64
43
conda :
65
- "%s/sra .yaml" % CONDAENV
44
+ "../envs/kingfisher.yaml"
66
45
shell :
67
- " vdb-validate {params.sra_file} &>> {log} "
68
- " ; "
69
- " parallel-fastq-dump "
70
- " --threads {threads} "
71
- " --gzip --split-files "
72
- " --outdir {params.outdir} "
73
- " --tmpdir {resources.tmpdir} "
74
- " --skip-technical --split-3 "
75
- " -s {params.sra_file} &>> {log} "
46
+ " mkdir -p {params.output_dir} ; "
47
+ " cd {params.output_dir} "  # presumably because kingfisher writes its files into the working directory — confirm
76
48
" ; "
77
- " rm -f {params.sra_file} 2>> {log} "
78
-
79
-
80
-
49
+ "kingfisher get --run-identifiers {params.run_ids} "  # run_ids is a list; Snakemake renders it space-separated
50
+ " --download-threads 2 --extraction-threads {threads} "
51
+ " --hide-download-progress "
52
+ " --output-format-possibilities 'fastq.gz' "  # the output format is given once, in its long form
53
+ " --force --check-md5sums "
54
+ " --download-methods {params.download_methods} "
55
+ " &> {log}"  # stdout and stderr both go to the absolute log path
81
56
82
57
83
58
84
-
85
- RunTable = None
86
- def get_run_fastq_for_sample (wildcards ):
87
-
88
- from atlas .init .parse_sra import load_and_validate_runinfo_table , get_run_ids_for_sample
89
-
90
- # load RunTable if not already loaded
91
- global RunTable
92
- if RunTable is None :
93
-
94
- RunTable = load_and_validate_runinfo_table ()
95
-
96
- run_ids = get_run_ids_for_sample (RunTable ,wildcards .sample )
59
+ def get_run_fastq_for_sample (run_ids ):
97
60
98
61
ReadFiles = {}
99
62
for fraction in SRA_read_fractions :
@@ -103,7 +66,7 @@ def get_run_fastq_for_sample(wildcards):
103
66
key = fraction
104
67
105
68
ReadFiles [key ] = expand (
106
- SRA_SUBDIR_RUN + "/{sra_run }/{sra_run}{fraction}.fastq.gz" ,
69
+ str ( SRA_SUBDIR_RUN / "{sample }/{sra_run}{fraction}.fastq.gz") ,
107
70
fraction = fraction ,
108
71
sra_run = run_ids ,
109
72
)
@@ -113,20 +76,39 @@ def get_run_fastq_for_sample(wildcards):
113
76
114
77
rule merge_runs_to_sample :  # Concatenate the per-run fastq files of a sample into one file per read fraction under SRA/Samples/<sample>/.
115
78
input :
116
- unpack ( get_run_fastq_for_sample ),
79
+ flag = "Reads/tmp/flags/{sample}.downloaded"  # written by kingfisher_get; the run fastq files are not declared inputs, so they are located and checked inside run:
117
80
output :
118
81
expand (
119
82
"SRA/Samples/{{sample}}/{{sample}}{fraction}.fastq.gz" ,
120
83
fraction = SRA_read_fractions ,
121
84
),
85
+ params :
86
+ run_ids = get_run_ids_for_sample ,  # run ids of {sample}, resolved per job
122
87
threads : 1
123
88
run :
89
+
90
+ # print(list( (SRA_SUBDIR_RUN / wildcards.sample).iterdir()))
91
+
124
92
from utils import io
125
93
94
+
126
95
for i , fraction in enumerate (SRA_read_fractions ):  # output[i] is the merged file for the i-th read fraction
127
- if fraction == "" :
128
- fraction = "se"
129
- io .cat_files (input [fraction ], output [i ])
96
+
97
+ run_fastqs = expand (
98
+ str (SRA_SUBDIR_RUN / "{sample}/{sra_run}{fraction}.fastq.gz" ),  # pattern passed as a plain string, as in get_run_fastq_for_sample
99
+ fraction = fraction ,
100
+ sra_run = params .run_ids ,
101
+ sample = wildcards .sample
102
+ )
103
+ missing_fastqs = [f for f in run_fastqs if not Path (f ).exists ()]  # explicit check: the downloads are not tracked by Snakemake
104
+ if missing_fastqs : raise FileNotFoundError ("Not all fastq files exist. Missing: %s" % missing_fastqs )  # a real exception instead of assert, so the check survives `python -O`
105
+
106
+ io .cat_files (run_fastqs , output [i ])
107
+
108
+
109
+
110
+
111
+
130
112
131
113
132
114
rule download_sra :
0 commit comments