-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSnakefile.index
111 lines (96 loc) · 5.24 KB
/
Snakefile.index
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# Copyright (c) Tuukka Norri 2020
# Licenced under the MIT licence.
import functools
CHR_NAMES = [chr_name for chr_name, _ in config["input_a2m"]]
REF_INDICES = [1 + x for x in range(config["n_refs"])]
wildcard_constraints:
chr_id = "|".join(map(lambda chr_name: f"({re.escape(chr_name)})", CHR_NAMES))
rule all:
input:
std_ref = expand("{index_root}/std_ref.fa", index_root = config["index_root"]),
max_edit_distance_file = expand("{index_root}/max_edit_distance.txt", index_root = config["index_root"]),
max_read_len_file = expand("{index_root}/max_read_len.txt", index_root = config["index_root"]),
chr_list_file = expand("{index_root}/chr_list.txt", index_root = config["index_root"]),
len_files = expand("{index_root}/{chr_id}/recombinant.n{seq_idx}.gaps.len", index_root = config["index_root"], chr_id = CHR_NAMES, seq_idx = REF_INDICES),
sp_files = expand("{index_root}/{chr_id}/recombinant.n{seq_idx}.gaps.sp", index_root = config["index_root"], chr_id = CHR_NAMES, seq_idx = REF_INDICES),
bowtie_index = expand("{index_root}/recombinant.all.fa.x", index_root = config["index_root"])
run:
print(f"Index has been generated at {config['index_root']}.")
rule max_edit_distance_file:
output: expand("{index_root}/max_edit_distance.txt", index_root = config["index_root"])
run:
with open(f"{output}", "w") as f:
f.write(str(config["max_edit_distance"]))
rule max_read_len_file:
output: expand("{index_root}/max_read_len.txt", index_root = config["index_root"])
run:
with open(f"{output}", "w") as f:
f.write(str(config["max_read_length"]))
rule chr_list_file:
output: expand("{index_root}/chr_list.txt", index_root = config["index_root"])
run:
with open(f"{output}", "w") as f:
for chr_name, _ in config["input_a2m"]:
f.write(chr_name)
f.write("\n")
rule a2m_prepare:
conda: "workflow/envs/panvc.yaml"
message: "Handling A2M input"
output:
names_plain = expand("{index_root}/{{chr_id}}/names.plain", index_root = config["index_root"]),
n_refs = expand("{index_root}/{{chr_id}}/n_refs.txt", index_root = config["index_root"]),
msa_len = expand("{index_root}/{{chr_id}}/msa_len.txt", index_root = config["index_root"]),
gapped_recombinants = expand("{index_root}/{{chr_id}}/recombinant.n{seq_idx}.gapped", index_root = config["index_root"], seq_idx = REF_INDICES)
benchmark: f"{config['benchmark_dir']}/a2m_prepare/{{chr_id}}"
script: "workflow/scripts/panvc_a2m_prepare.py"
rule all_sequences_file:
# Actually more outputs.
message: "Combining input"
priority: 50
input:
names_plain = expand("{index_root}/{chr_id}/names.plain", index_root = config["index_root"], chr_id = CHR_NAMES),
n_refs = expand("{index_root}/{chr_id}/n_refs.txt", index_root = config["index_root"], chr_id = CHR_NAMES),
msa_len = expand("{index_root}/{chr_id}/msa_len.txt", index_root = config["index_root"], chr_id = CHR_NAMES),
gapped_recombinants = expand("{index_root}/{chr_id}/recombinant.n{seq_idx}.gapped", index_root = config["index_root"], chr_id = CHR_NAMES, seq_idx = REF_INDICES)
output: expand("{index_root}/recombinant.all.fa", index_root = config["index_root"])
benchmark: f"{config['benchmark_dir']}/all_sequences_file"
run:
for curr_ref in range(1, 1 + config["n_refs"]):
for chr_id, _ in config["input_a2m"]:
src_dir = f"{config['index_root']}/{chr_id}"
sequence_gapped_path = f"{src_dir}/recombinant.n{curr_ref}.gapped"
ref_name = f"pg_ref_{chr_id}_{curr_ref}"
shell(f"echo '>{ref_name}' >> {output[0]}")
shell(f"tr -d '-' < {sequence_gapped_path} >> {output[0]}")
rule gap_positions:
conda: "workflow/envs/panvc.yaml"
message: "Storing gap positions"
input:
gapped_recombinants = expand("{index_root}/{{chr_id}}/recombinant.n{{seq_idx}}.gapped", index_root = config["index_root"])
params:
gap_pos_prefix = expand("{index_root}/{{chr_id}}/recombinant.n{{seq_idx}}.gaps", index_root = config["index_root"])
output:
len_files = expand("{index_root}/{{chr_id}}/recombinant.n{{seq_idx}}.gaps.len", index_root = config["index_root"]),
sp_files = expand("{index_root}/{{chr_id}}/recombinant.n{{seq_idx}}.gaps.sp", index_root = config["index_root"])
benchmark: f"{config['benchmark_dir']}/gap_positions/{{chr_id}}-seq{{seq_idx}}"
shell: "panvc-store-gaps {input.gapped_recombinants} {params.gap_pos_prefix}"
rule chic_index:
# Actually more outputs.
conda: "workflow/envs/panvc.yaml"
message: "Indexing with CHIC"
threads: workflow.cores
priority: 50
input: expand("{index_root}/recombinant.all.fa", index_root = config["index_root"])
output: expand("{index_root}/recombinant.all.fa.x", index_root = config["index_root"])
resources:
mem_mb = config["max_memory_MB"]
params:
max_read_length = config["max_read_length"],
max_edit_distance = config["max_edit_distance"],
benchmark: f"{config['benchmark_dir']}/chic_index"
shell: "chic-index {input} {params.max_read_length} --kernel=BOWTIE2 --lz-parsing-method=RELZ --max-edit-distance={params.max_edit_distance} --mem={resources.mem_mb} --threads={threads} --verbose=2"
rule std_ref:
message: "Outputting standard reference"
conda: "workflow/envs/panvc.yaml"
output: expand("{index_root}/std_ref.fa", index_root = config["index_root"])
script: "workflow/scripts/panvc_a2m_to_std_ref.py"