-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_mhcii_prediction.py
166 lines (154 loc) · 7.64 KB
/
run_mhcii_prediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/env python2.7
"""
Author : Arjun Arkal Rao
Affiliation : UCSC BME, UCSC Genomics Institute
File : docker_scripts/run_MHC_prediction.py
Program info can be found in the docstring of the main function.
Details can also be obtained by running the script with -h .
"""
from __future__ import division, print_function
from subprocess import call
from datetime import datetime as dt
from collections import Counter
import re
import sys
import os
import prepare
import pi_errors
def process_parameters(params):
'''
This module conducts the error handling for all parmeters passed to the
program.
'''
# Are predictor executables set up correctly?
if os.path.split(params.mhc_executable)[1] == 'mhc_II_binding.py':
params.mhc_executable = pi_errors.test_param_value(
params.mhc_executable, 'mhc_II_binding.py', '--mhc_predictor',
params.logfile)
params.netmhciipan_executable = pi_errors.test_param_value(
params.netmhciipan_executable, 'netMHCIIpan', '--netMHCIIpan',
params.logfile)
else:
raise pi_errors.ParameterError(
dt.now().strftime('%I:%M %p %b %d, %Y') + \
': --mhc_predictor has to be predict_binding.py', params.logfile)
# List of acceptable prediction methods. Used to ensure the correct
# prediction method has been provided.
prediction_methods = set(['comblib', 'consensus3', 'IEDB_recommended',
'NetMHCIIpan', 'nn_align', 'smm_align',
'tepitope', ])
if params.pred_meth not in prediction_methods:
raise pi_errors.ParameterError(
dt.now().strftime('%I:%M %p %b %d, %Y') + \
': --prediction_method has to be one of ' + \
''.join(prediction_methods), params.logfile)
# For MHCII, peplen can only be 15.
params.peplen = [15]
# Ensure the --file_prefix has been set up right
if params.file_prefix.endswith('.faa'):
pepfilename = pi_errors.test_param_value(
'/'.join([params.file_path, params.file_prefix]), 'Input file',
'--file_prefix', params.logfile)
else:
# More that 1 peptide provided and file_prefix points to a single file
raise pi_errors.ParameterError(
dt.now().strftime('%I:%M %p %b %d, %Y') + \
': --file_prefix must point to a valid .faa file.', params.logfile)
# Test the input alleles for validity
# First, if an input .allele file is provided, parse it to obtain the list
# of alleles
if len(params.alleles) == 1 and params.alleles[0].endswith(".alleles"):
# This means the user has provided a .alleles file
pi_errors.test_param_value(params.alleles[0], params.alleles[0],
'--alleles', params.logfile)
with open(params.alleles[0], 'r') as allele_file:
params.alleles = []
for line in allele_file:
params.alleles.append(line.strip())
# Once the .allele file has been parsed, params.alleles now either contains
# the parsed alleles, or the list of alleles provided by the user. Now we
# need to ensure that all alleles have been formatted in the required
# format Eg. HLA-DRB1*01:04, HLA-DQA1*03:01/DQB1*03:02
# IMPORTANT: HLA-DP isn't implemented yet -- none of the predictors use it
MHCII = re.compile(
r'HLA-D((RB\d\*\d{2}:\d{2})|'
r'([PQ]A\d\*\d{2}:\d{2}/D[PQ]B\d\*\d{2}:\d{2}))')
for allele in params.alleles:
# For each allele in params.alleles, cehck if it matches the regex for
# a generic MHCI molecule
if not MHCII.match(allele):
return pi_errors.ParameterError(
dt.now().strftime('%I:%M %p %b %d, %Y') + \
': Alleles for mhcii must be in the form HLA-DRB1*XX:XX or' + \
' HLA-DQA1*XX:XX/DQB1*XX:XX where X is in [0, 9]',
params.logfile)
return pepfilename
def main():
"""
This wrapper script will run the IEDB tools within the cutadapt docker
container for the precision immuno project. The wrapper requires
1. IEDB tools for MHCI prediction - http://tools.iedb.org/mhci
2. netMHCIIpan - In case IEDB tools fails
3. python
This script requires an input file (--file_prefix) that contains
(2 * PEPLEN - 1)-mer
fasta records for analysis. FILE_PREFIX must be a .faa file.
Unless specified, the program will look for default executables on $PATH.
The program DOES NOT look for jar files and they are required to be
passed during execution.
"""
# Parse the arguments using prepare.parse_args()
params = prepare.parse_args(main.__doc__, 'mhc', 'mhci_predictions')
# Params ERROR handling
# peplen_filenames is a dictionary with peptide length as key and the full
# path to the filename associated with the peplen as the value.
pepilename = process_parameters(params)
# Move to working directory before doing I/O intensive work
os.chdir(params.working_dir)
# set up the different allele regexes
strip_allele_regex = re.compile(r'[\*:/-]') # For strip allele
dpqa_allele_regex_1 = re.compile(r'[\*:]') # For DPA and DQA if netMHCIIpan
dpqa_allele_regex_2 = re.compile(r'/') # is used
for allele in params.alleles:
for peptide_length in params.peplen:
# Setup the output file
# Strip allele converts HLA-DRB1*15:01 to HLA_DRB1_15_01 and
# HLA-DQA1*01:02/DQB1*03:02 to HLA_DQA1_01_02_DQB1_03_02
strip_allele = re.sub(strip_allele_regex, '_', allele)
# Setup the call
mhc_ii_call = ['python', params.mhc_executable] # base call
mhc_ii_call.append(params.pred_meth) # prediction method
mhc_ii_call.append(allele) # Allele
mhc_ii_call.append(pepfilename)
mhc_outfile_name = ''.join([params.out_prefix, '_', allele, '.tsv'])
with open(mhc_outfile_name, 'w') as mhc_outfile:
return_value = call(mhc_ii_call, stdout=mhc_outfile,
stderr=params.logfile)
if return_value != 0:
print('WARNING: IEDBtools failed. Attempting netMHCIIpan',
file=params.logfile)
# netmHCIIpan needs a different formatting for allele
# HLA-DQA1*01:02/DQB1*03:02 should be HLA-DQA10102-DQB10302
# HLA-DRB1*15:01 should be DRB1_1501. DP and DQ are similar
if allele.startswith('HLA-DQ') or allele.startswith('HLA-DP'):
allele = re.sub(dpqa_allele_regex_1, '', allele)
allele = re.sub(dpqa_allele_regex_2, '-', allele)
else:
allele = strip_allele[4:] # Easier than starting from allele
netmhc_ii_call = [params.netmhciipan_executable]
netmhc_ii_call.extend(['-a', allele]) # Allele
netmhc_ii_call.extend(['-xls', 1])
netmhc_ii_call.extend(['-xlsfile', mhc_outfile_name])
netmhc_ii_call.extend(['-f', pepfilename])
return_value = call(mhc_ii_call, stderr=params.logfile)
if return_value != 0:
raise pi_errors.MyRuntimeError(
dt.now().strftime('%I:%M %p %b %d, %Y') + \
': MHCII prediction failed.', params.logfile)
# Move files from temp directory to outdir
prepare.move_output(params)
print('RESULT ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': Process ' +
'completed', file=params.logfile)
params.logfile.close()
if __name__ == "__main__":
sys.exit(main())