-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_mhci_prediction.py
executable file
·167 lines (155 loc) · 7.66 KB
/
run_mhci_prediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/usr/bin/env python2.7
"""
Author : Arjun Arkal Rao
Affiliation : UCSC BME, UCSC Genomics Institute
File : docker_scripts/run_MHC_prediction.py
Program info can be found in the docstring of the main function.
Details can also be obtained by running the script with -h .
"""
from __future__ import division, print_function
from subprocess import call
from datetime import datetime as dt
from collections import Counter
import sys
import os
import prepare
import pi_errors
def process_parameters(params):
'''
This module conducts the error handling for all parmeters passed to the
program.
'''
# Is mhc_executable set up correctly?
if os.path.split(params.mhc_executable)[1] == 'predict_binding.py':
params.mhc_executable = pi_errors.test_param_value(
params.mhc_executable, 'predict_binding.py', '--mhc_predictor',
params.logfile)
else:
raise pi_errors.ParameterError(
dt.now().strftime('%I:%M %p %b %d, %Y') + \
': --mhc_predictor has to be predict_binding.py', params.logfile)
# List of acceptable prediction methods. Used to ensure the correct
# prediction method has been provided.
prediction_methods = set(["ann", "comblib_sidney2008", "consensus",
"IEDB_recommended", "netmhcpan", "smm",
"smmpmbec"])
if params.pred_meth not in prediction_methods:
raise pi_errors.ParameterError(
dt.now().strftime('%I:%M %p %b %d, %Y') + \
': --prediction_method has to be one of ' + \
''.join(prediction_methods), params.logfile)
# Test the value of peplen. For MHCI it can only be 9, 10 or 11.
if params.peplen == "mhc dependent":
params.peplen = [9, 10]
elif set([int(x) for x in params.peplen]).difference(set([9, 10, 11])):
# This is true iff params.peplen contains an item not in [9, 10, 11].
# The values are stored in params as strings and need to be converted
# to ints hence the list comprehension.
raise pi_errors.ParameterError(
dt.now().strftime('%I:%M %p %b %d, %Y') + \
': --peplen must be provided space separated values in [9, 10, 11]',
params.logfile)
else:
# This means the user has passed acceptable values to --peplen
params.peplen = [int(x) for x in params.peplen]
# Once we have the peptide lengths, we need to ensure the --file_prefix has
# been set up right
peplen_filenames = Counter() # The
if len(params.peplen) == 1 and params.file_prefix.endswith('.faa'):
# There is only 1 peplen and the full filename has been provided
peplen_filenames[params.peplen[0]] = pi_errors.test_param_value(
'/'.join(params.file_path, params.file_prefix]), 'Input file',
'--file_prefix', params.logfile)
elif not params.file_prefix.endswith('.faa')):
# 1 or more peptide lengths provided and file_prefix is a prefix and
# not a path to a .faa file
for peplen in params.peplen:
peplen_filenames[params.peplen[0]] = pi_errors.test_param_value(
''.join([params.file_path, '/', params.file_prefix, '_',
str(params.peplen), '_mer_snpeffed.faa']),
'Input file', '--file_prefix', params.logfile)
else:
# More that 1 peptide provided and file_prefix points to a single file
raise pi_errors.ParameterError(
dt.now().strftime('%I:%M %p %b %d, %Y') + \
': If more than 1 peplen is provided, --file_prefix must be a ' + \
'prefix to the individual .faa files and not the path to a ' + \
'single .faa file itself.', params.logfile)
# Test the input alleles for validity
# First, if an input .allele file is provided, parse it to obtain the list
# of alleles
if length(params.alleles) == 1 and params.alleles[0].endswith(".alleles"):
# This means the user has provided a .alleles file
pi_errors.test_param_value(params.alleles[0], params.alleles[0],
'--alleles', params.logfile)
with open(params.alleles[0], 'r') as allele_file:
params.alleles=[]
for line in allele_file:
params.alleles.append(line.strip())
# Once the .allele file has been parsed, params.alleles now either contains
# the parsed alleles, or the list of alleles provided by the user. Now we
# need to ensure that all alleles have been formatted in the required
# format Eg. HLA-A*01:04
MHCI = re.compile('HLA-[ABC]\*\d{2}:\d{2}')
for allele in params.alleles:
# For each allele in params.alleles, cehck if it matches the regex for
# a generic MHCI molecule
if not MHCI.match(allele):
return pi_errors.ParameterError(
dt.now().strftime('%I:%M %p %b %d, %Y') + \
': Alleles for mhci must be in the form HLA-[A/B/C]*XX:XX ' + \
' where X is in [0, 9]', params.logfile)
return peplen_filenames
def main():
"""
This wrapper script will run the IEDB tools within the cutadapt docker
container for the precision immuno project. The wrapper requires
1. IEDB tools for MHCI prediction - http://tools.iedb.org/mhci
2. python
This script requires an input file (--file_prefix) that contains
(2 * PEPLEN - 1)-mer
fasta records for analysis. If you are looking into multiple peplens,
--file_prefix should be expandable to:
FILE_PREFIX_PEPLEN_mer_snpeffed.faa
for each value of PEPLEN. If you specify only 1 peplen, if the value of
FILE_PREFIX ends with .faa then the value is used as is. If not, the prefix
needs to expand as above.
Unless specified, the program will look for default executables on $PATH.
The program DOES NOT look for jar files and they are required to be
passed during execution.
"""
# Parse the arguments using prepare.parse_args()
params = prepare.parse_args(main.__doc__, 'mhc', 'mhci_predictions')
# Params ERROR handling
# peplen_filenames is a dictionary with peptide length as key and the full
# path to the filename associated with the peplen as the value.
peplen_filenames = process_parameters(params)
# Move to working directory before doing I/O intensive work
os.chdir(params.working_dir)
# set up the strip allele regex
strip_allele_regex = re.compile('[\*:-]')
for allele in params.alleles:
for peptide_length in params.peplen:
# Setup the output file
strip_allele = re.sub(strip_allele_regex, '_', allele)
# Setup the call
mhc_i_call = ['python', params.mhc_executable] # base call
mhc_i_call.append(params.pred_meth) # prediction method
mhc_i_call.append(allele) # Allele
mhc_i_call.append(peptide_length)
mhc_i_call.append(peplen_filenames[peptide_length])
mhc_outfile_name = ''.join([params.out_prefix, '_', allele, '.tsv'])
with open(mhc_outfile_name, 'w') as mhc_outfile:
return_value = call(mhc_i_call, stdout=mhc_outfile,
stderr=params.logfile)
if return_value != 0:
raise pi_errors.MyRuntimeError(
dt.now().strftime('%I:%M %p %b %d, %Y') + \
': MHCI prediction failed.', params.logfile)
# Move files from temp directory to outdir
prepare.move_output(params)
print('RESULT ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': Process ' +
'completed', file=params.logfile)
params.logfile.close()
if __name__ == "__main__":
sys.exit(main())