-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathSnakefile
190 lines (158 loc) · 7.11 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
"""
Pipeline for the Mutational Landscapes/Amino Acids Subtypes Project
"""
# General todo list for pipeline
# TODO add automated download of some of the input data?
import os
from pathlib import Path
from collections import defaultdict
from ruamel.yaml import YAML
yaml = YAML(typ='safe')
import subtypes_utils as sutil
configfile: 'snakemake.yaml'
localrules:
all, quick_clean, full_clean, standardise_all_studies, all_sift_predictions,
make_gene_fasta, all_foldx_predictions, foldx_variants, foldx_split,
filter_pdb, naccess
# Hash of study IDs to their config
STUDIES = {s: yaml.load(Path(f'data/studies/{s}/{s}.yaml')) for
s in os.listdir('data/studies') if not s.startswith('.')}
UNFILTERED_STUDIES = [s for s,v in STUDIES.items() if not v['qc']['filter']]
# Hash linking genes to studies
GENES = defaultdict(list)
for study, conf in STUDIES.items():
GENES[sutil.gene_to_filename(conf['gene'])].append(study)
UNFILTERED_GENES = [g for g, v in GENES.items() if any(s in UNFILTERED_STUDIES for s in v)]
STRUCTURES = yaml.load(Path('meta/structures.yaml'))
AA_ALPHABET = 'ACDEFGHIKLMNPQRSTVWY'
STANDARD_CLUSTERINGS = [Path(x).stem for x in os.listdir('meta/subtypes')]
PDB_LANDSCAPE_FACTORS = ['PC1', 'PC2', 'PC3', 'PC4', 'total_energy', 'mean_sift', 'mean_score',
'tSNE1', 'tSNE2', 'umap1', 'umap2']
PDB_LANDSCAPE_FACTORS.extend(list(AA_ALPHABET))
FIGURES = list(range(1,7)) + [f'S{i}' for i in range (1, 30)] + 'S8_27'
#### Include subroutines ####
include: 'bin/pipeline/data_validation.smk'
include: 'bin/pipeline/standardisation.smk'
include: 'bin/pipeline/sequence_statistics.smk'
include: 'bin/pipeline/foldx.smk'
include: 'bin/pipeline/structure_statistics.smk'
include: 'bin/pipeline/landscape.smk'
include: 'bin/pipeline/subtypes.smk'
include: 'bin/pipeline/figures.smk'
#### Global rules ####
rule all:
"""
Run main pipeline, generating the subtypes and figures from the paper.
"""
input:
'data/combined_mutational_scans.tsv', # Covers standardisation, SIFT, FoldX + other stats
rules.summarise_study_set.output,
rules.summarise_standardised_data.output,
rules.compare_hclust_dynamic_deep_split.output,
rules.final_subtypes.output,
rules.sift_subtypes.output,
[f"figures/4_figures/figure{n}.pdf" for n in FIGURES]
rule full_analysis:
"""
Run complete pipeline, including suplementary anlyses
"""
input:
'data/combined_mutational_scans.tsv', # Covers standardisation, SIFT, FoldX + other stats
rules.summarise_study_set.output,
[r.output for r in VALIDATION_RULES],
rules.study_summary_plots.output,
rules.summarise_standardised_data.output,
rules.landscape_dimensionality_reduction.output,
[f'figures/1_landscape/pdb/{f}_colourbar.pdf' for f in PDB_LANDSCAPE_FACTORS],
[f'figures/1_landscape/pdb/{g}/{g}_{f}.png' for g in GENES for f in PDB_LANDSCAPE_FACTORS],
rules.evaluate_kmeans_k.output,
rules.compare_hclust_dynamic_deep_split.output,
rules.compare_subtypes.output,
[f'data/subtypes/{x}.tsv' for x in STANDARD_CLUSTERINGS],
[f'figures/2_subtypes/{x}/aa_profiles/A.pdf' for x in STANDARD_CLUSTERINGS],
rules.final_subtypes.output,
rules.all_position_subtypes.output,
rules.continuous_characterisation.output,
rules.sift_subtypes.output,
[f"figures/4_figures/figure{n}.pdf" for n in FIGURES]
# Only remove rapidly generated results
def quick_clean_files():
"""
Generate a list of rm strings identifying rapidly generated files that
can be safely deleted.
"""
output_files = [f'data/studies/{s}/{s}.tsv' for s in STUDIES.keys()]
output_files.append('-r figures/0_data/*')
output_files.append('-r figures/1_landscape/*')
output_files.append('-r figures/2_subtypes/*')
output_files.append('-r figures/3_continuous/*')
output_files.append('meta/study_summary.tsv')
output_files.append('meta/gene_summary.tsv')
output_files.append('meta/overall_summary')
output_files.append('data/subtypes/*')
output_files.append('data/backbone_angles/*')
output_files.append('data/chemical_environment/*')
return output_files
rule quick_clean:
"""
Delete all files that can be quickly regenerated
"""
run:
for i in quick_clean_files():
shell(f'rm {i} && echo "rm {i}" || true')
# Same as clean, but also remove slower to genreate Sift & FoldX results
rule full_clean:
"""
Delete all files generated by the workflow, including those that take a
long time to generate (SIFT, FoldX etc.)
"""
run:
output_files = quick_clean_files()
# Main data files
output_files.append('data/combined_mutational_scans.tsv')
output_files.append('data/long_combined_mutational_scans.tsv')
# Quick to generate but dependancy for SIFT/Porter5
output_files.append('data/fasta/*')
# SIFT results
output_files.extend([f'data/sift/{g}.*' for g in GENES.keys()])
# FoldX results
output_files.extend([f"-r data/foldx/{g}/*" for g in GENES.keys()])
# Porter5 results
output_files.extend([f'data/porter5/{g}.*' for g in GENES.keys()])
# Naccess results
output_files.extend([f'data/surface_accessibility/{g}.asa' for g in GENES.keys()])
output_files.extend([f'data/surface_accessibility/{g}.rsa' for g in GENES.keys()])
for i in output_files:
shell(f'rm {i} && echo "rm {i}" || true')
rule setup_directories:
"""
Setup initial project directory structure for all generated files. Assumes bin, src & docs
exist. Plus many of the others will often already exist for various reasons, but this rule
ensures everything is setup correctly
"""
run:
# data
shell('mkdir data && echo "mkdir data" || true')
dirs = ['backbone_angles', 'chemical_environment', 'clusterings',
'foldx', 'pdb', 'sift', 'studies', 'fasta', 'porter5',
'surface_accessibility', 'wip']
for d in dirs:
shell(f'mkdir data/{d} && echo "mkdir data/{d}" || true')
# figures
shell('mkdir figures && echo "mkdir figures" || true')
dirs = ['0_data', '1_landscape', '1_landscape/pdb', '2_subtypes',
'3_continuous', '4_figures']
for d in dirs:
shell(f'mkdir figures/{d} && echo "mkdir figures/{d}" || true')
# logs
shell('mkdir logs && echo "mkdir logs" || true')
dirs = ['calculate_backbone_angles', 'data_validation', 'dbscan_clustering',
'filter_pdb', 'foldx_combine', 'foldx_model', 'foldx_repair',
'foldx_split', 'foldx_variants', 'make_subtypes',
'make_gene_fasta', 'naccess', 'sift4g', 'standardise_study',
'within_a_profile', 'porter5', 'project_landscape', 'project_landscape_colourbar']
for d in dirs:
shell(f'mkdir logs/{d} && echo "mkdir logs/{d}" || true')
# meta
shell('mkdir meta && echo "mkdir meta" || true')
shell('mkdir meta/subtypes && echo "mkdir meta/subtypes" || true')