-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpropores_status.py
176 lines (147 loc) · 9.67 KB
/
propores_status.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import os
import yaml
import argparse as ap
from collections import defaultdict
from typing import Dict, Union, Tuple, List
# extract a value from a dictionary if the key exists, return a default value if it does not
def get_value(dictionary: Union[Dict, None], key: str) -> str:
    """Look up ``key`` and return its value as a string, or ``'?'`` if there is no such entry.

    Args:
        dictionary: dictionary to search; anything that is not a dictionary (for example the
            ``None`` that a YAML loader produces for a section without content) is treated
            as having no entries instead of raising an error
        key: key of the requested entry

    Returns:
        ``str(dictionary[key])`` if the key exists, otherwise the placeholder ``'?'``.
    """
    # guard against non-dictionaries first: "key in None" would raise a TypeError
    if not isinstance(dictionary, dict) or key not in dictionary:
        return '?'
    return str(dictionary[key])
# get the parameters and log dictionaries from a YAML configuration file
def get_log(file_path: str) -> Tuple[Union[Dict, str], Union[Dict, str], str]:
    """Load a YAML log file and split it into its 'parameters' and 'log' sections.

    Args:
        file_path: path to the YAML file

    Returns:
        A tuple ``(parameters, log, PDB name)``. A section that is missing, empty or not a
        mapping is returned as an empty dictionary. The PDB name is the 'PDB name' entry of
        the parameters, or ``'?'`` if that entry does not exist.
    """
    with open(file_path, 'r') as log_file:
        # the safe loader only builds plain Python objects, so the file cannot execute code
        d = yaml.load(log_file, Loader=yaml.SafeLoader)

    # an empty file is loaded as None, which does not support the membership tests below
    sections = d if isinstance(d, dict) else dict()  # type: Dict

    # a section key without content is loaded as None as well, so it is replaced by an empty
    # dictionary to keep the "key in section" checks of the callers working
    log_dict = sections.get('log')
    if not isinstance(log_dict, dict):
        log_dict = dict()
    param_dict = sections.get('parameters')
    if not isinstance(param_dict, dict):
        param_dict = dict()

    # extract the PDB name if the parameter log is given
    pdb_name = get_value(param_dict, 'PDB name')  # type: str
    return param_dict, log_dict, pdb_name
# compute the run status
def run_state(start: str, end: str, total_runtime: str) -> str:
    """Summarise the progress of a run in a single string.

    Args:
        start: start entry of the run, ``'?'`` if there is none
        end: end entry of the run, ``'?'`` if there is none
        total_runtime: runtime entry of the run

    Returns:
        ``'NOT STARTED'`` without a start entry, ``'IN PROGRESS'`` with a start but without
        an end entry, and ``total_runtime`` unchanged in every other case.
    """
    unknown = '?'
    # without a start entry the end entry is irrelevant
    if start == unknown:
        return 'NOT STARTED'
    return 'IN PROGRESS' if end == unknown else total_runtime
# overview columns of the pore identification, paired with their keys in the 'parameters' section
_ID_PARAMETER_COLUMNS = (
    ('ID.skip.H', 'skip hydrogen atoms'),
    ('ID.skip.HETATM', 'skip hetero atoms'),
    ('ID.skip.non.std.amino.acids', 'skip non-standard amino acids in ATOM records'),
    ('ID.keep.alternative.locations', 'keep alternative atom locations'),
    ('ID.run.axis.prep', 'run axes trace preparation'),
    ('ID.run.gate.prep', 'run gate open preparation'),
    ('ID.resolution', 'resolution'),
    ('ID.solvent.radius', 'solvent radius'),
    ('ID.probe.radius', 'probe radius'),
    ('ID.volume.threshold', 'volume threshold'),
    ('ID.selected.computation.mode', 'computation mode'),
    ('ID.filter', 'filter'),
)
# overview columns of the pore identification, paired with their keys in the 'log' section
_ID_LOG_COLUMNS = (
    ('ID.started', 'started'),
    ('ID.n.grid.boxes', 'number of grid boxes'),
    ('ID.atom.pairs', 'atom pairs'),
    ('ID.used.computation.mode', 'used computation mode'),
    ('ID.identified.pores', 'identified pores'),
    ('ID.finished', 'finished'),
    ('ID.total.runtime', 'total runtime'),
)
# the same pairing for the axes trace log
_AXIS_PARAMETER_COLUMNS = (
    ('axis.surface.patch.threshold', 'surface patch threshold'),
)
_AXIS_LOG_COLUMNS = (
    ('axis.started', 'started'),
    ('axis.finished', 'finished'),
    ('axis.total.runtime', 'total runtime'),
)
# the same pairing for the gate open log
_GATE_PARAMETER_COLUMNS = (
    ('gate.perturb.value', 'perturb value'),
    ('gate.clash.tolerance', 'clash tolerance'),
    ('gate.difficulty.threshold', 'gate difficulty threshold'),
    ('gate.re.estimate.difficulty', 're-estimate gate difficulty'),
)
_GATE_LOG_COLUMNS = (
    ('gate.started', 'started'),
    ('gate.gates', 'gates'),
    ('gate.finished', 'finished'),
    ('gate.total.runtime', 'total runtime'),
)


# store the entries that all three log types share in the record of a run
def _add_run_info(run: Dict[str, str], name: str, params: Dict, root: str) -> None:
    """Record the PDB name, the PDB path and the output directory of a run.

    Args:
        run: record of the run that is updated in place
        name: PDB name of the run
        params: 'parameters' section of the log file
        root: directory that contains the log file
    """
    run['PDB.name'] = name
    # only overwrite the path if the log provides one, so that a path that is already known
    # (from the input directory or another log of the same run) is not replaced by '?'
    if 'PDB path' in params:
        run['PDB.path'] = os.path.abspath(params['PDB path'])
    run['output.directory'] = os.path.abspath(root)


# copy entries of a log section into the record of a run
def _copy_entries(run: Dict[str, str], section: Dict, columns: Tuple[Tuple[str, str], ...]) -> None:
    """Store ``get_value(section, key)`` under ``column`` for each ``(column, key)`` pair.

    Args:
        run: record of the run that is updated in place
        section: 'parameters' or 'log' section of a log file
        columns: pairs of overview column name and key in the section
    """
    for column, key in columns:
        run[column] = get_value(section, key)


# the guard keeps the script from running when the file is imported as a module
# NOTE(review): the previous comment justified the guard with multiprocessing on Windows, but
# nothing in this script uses multiprocessing
if __name__ == '__main__':
    # COMMAND LINE PARSING
    parser = ap.ArgumentParser(description='Compute the run status (not yet started, in progress, finished) from '
                                           'an directory that contains the output of one or more PROPORES 2.0 runs. '
                                           'This includes start time, used parameters and runtime.')
    parser.add_argument('directory', type=str, help='path to a directory containing the output of one or more '
                                                    'PROPORES runs')
    parser.add_argument('overview', type=str, help='path to the file that is supposed to contain the status overview')
    parser.add_argument('-i', '--input-directory', type=str, default='',
                        help='(optional) path to the input directory that was used to determine which PROPORES runs '
                             'have not been started yet')
    args = parser.parse_args()

    # SCRIPT
    # records of all runs by PDB name; a column that was never filled in reads as '?'
    runs = defaultdict(lambda: defaultdict(lambda: '?'))

    # generate input file list if the input directory was given (and exists)
    if args.input_directory and os.path.isdir(args.input_directory):
        # iterate over all files and potential sub-directories in the PDB input directory
        for root, dirs, files in os.walk(args.input_directory):
            for file_name in files:
                # skip files that are not marked as PDB files
                if not file_name.lower().endswith('.pdb'):
                    continue
                # the run name is the file name without its last extension
                name = file_name.rsplit('.', 1)[0]
                runs[name]['PDB.path'] = os.path.join(root, file_name)
                runs[name]['PDB.name'] = name
        print('PDBs in the input directory: {0:,}'.format(len(runs)))

    run_logs = ('axes_trace_log.yaml', 'gate_open_log.yaml', 'pore_ID_log.yaml')
    for root, dirs, files in os.walk(args.directory):
        # skip sub-directory if it does not belong to a PROPORES run
        if not any(run_log in files for run_log in run_logs):
            continue

        # NOTE: a log without a PDB name is treated as a broken run and ends the processing of
        # the whole sub-directory, including logs that have not been looked at yet

        # extract pore ID information
        if 'pore_ID_log.yaml' in files:
            # load the parameter and log file into a dictionary
            params, log, name = get_log(os.path.join(root, 'pore_ID_log.yaml'))
            # there is some error with the run if the parameters are not given
            if name == '?':
                continue
            run = runs[name]
            _add_run_info(run, name, params, root)
            _copy_entries(run, params, _ID_PARAMETER_COLUMNS)
            _copy_entries(run, log, _ID_LOG_COLUMNS)
            if 'PDB parsing stats' in log:
                stats = log['PDB parsing stats']
                run['atoms'] = get_value(stats, 'atoms')
                run['removed.atoms'] = get_value(stats, 'total skipped atoms')

        # extract axis trace information
        if 'axes_trace_log.yaml' in files:
            # load the parameter and log file into a dictionary
            params, log, name = get_log(os.path.join(root, 'axes_trace_log.yaml'))
            # there is some error with the run if the parameters are not given
            if name == '?':
                continue
            run = runs[name]
            _add_run_info(run, name, params, root)
            run['axis.enabled'] = str(True)
            _copy_entries(run, params, _AXIS_PARAMETER_COLUMNS)
            _copy_entries(run, log, _AXIS_LOG_COLUMNS)
            # the pore count of the axes trace replaces the one of the pore identification, but
            # only if the log has one; a missing entry must not erase a count that is known
            pores = get_value(log, 'pores')
            if pores != '?':
                run['ID.identified.pores'] = pores

        # extract gate open information
        if 'gate_open_log.yaml' in files:
            # load the parameter and log file into a dictionary
            params, log, name = get_log(os.path.join(root, 'gate_open_log.yaml'))
            # there is some error with the run if the parameters are not given
            if name == '?':
                continue
            run = runs[name]
            _add_run_info(run, name, params, root)
            run['gate.enabled'] = str(True)
            _copy_entries(run, params, _GATE_PARAMETER_COLUMNS)
            # the log columns are all prefixed with 'gate.' except for the gate count
            for column, key in _GATE_LOG_COLUMNS:
                run['gates' if key == 'gates' else column] = get_value(log, key)

    # inform the users how many runs had at least some output
    start_columns = ('ID.started', 'axis.started', 'gate.started')
    started_runs = sum(1 for run in runs.values() if any(run[column] != '?' for column in start_columns))
    print('Runs in the output directory: {0:,}'.format(started_runs))

    # compute the status
    for run in runs.values():
        for step in ('ID', 'axis', 'gate'):
            run[step + '.status'] = run_state(run[step + '.started'], run[step + '.finished'],
                                              run[step + '.total.runtime'])

    # generate and output the overview
    # NOTE(review): some collected entries, e.g. 'gate.difficulty.threshold' and the start and
    # end times, are not part of the overview - confirm that this is intended
    header = ['PDB.name', 'ID.status', 'axis.status', 'gate.status', 'atoms', 'removed.atoms', 'ID.n.grid.boxes',
              'ID.atom.pairs', 'ID.used.computation.mode', 'ID.identified.pores', 'gates',
              'ID.resolution', 'ID.solvent.radius', 'ID.probe.radius', 'ID.volume.threshold',
              'ID.skip.H', 'ID.skip.HETATM', 'ID.skip.non.std.amino.acids', 'ID.keep.alternative.locations',
              'ID.selected.computation.mode', 'ID.filter',
              'axis.surface.patch.threshold', 'gate.perturb.value', 'gate.clash.tolerance',
              'gate.re.estimate.difficulty', 'PDB.path', 'output.directory']
    with open(args.overview, 'w') as overview_file:
        # tab-separated table: one header line, then one line per run sorted by PDB name
        overview_file.write('\t'.join(header) + '\n')
        for name in sorted(runs):
            run = runs[name]
            overview_file.write('\t'.join(str(run[column]) for column in header) + '\n')