-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdftofasta.py
executable file
·172 lines (140 loc) · 6 KB
/
pdftofasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/python
# Sebastian Raschka 2014
# Script that converts amino acid residues from a PDB file into a FASTA string
#############################
# Amino acid abbreviations
#############################
#
# Unusual/modified amino acids
#
# ASH (protonated ASP) = D
# CYX (disulfide-bonded CYS) = C
# GLH (protonated GLU) = E
# HID/HIE/HIP (different protonation states of HIS) = H
# HYP (hydroxyproline) = P
# MSE (selenomethionine) = M
# CSE (selenocysteine) = U
# LNT (N-((2S)-2-amino-1,1-dihydroxy-4-methylpentyl)-L-threonine)
# Ambiguous amino acids
#
# ASX (asparagine or aspartic acid) = B
# GLX (glutamine or glutamic acid = Z
AMINO_ACIDS_3TO1 = {'CYS': 'C', 'ASP': 'D', 'GLN': 'Q', 'ILE': 'I',
'ALA': 'A', 'TYR': 'Y', 'TRP': 'W', 'HIS': 'H',
'LEU': 'L', 'ARG': 'R', 'VAL': 'V', 'GLU': 'E',
'PHE': 'F', 'GLY': 'G', 'MET': 'M', 'ASN': 'N',
'PRO': 'P', 'SER': 'S', 'LYS': 'K', 'THR': 'T',
# extended set of amino acids:
'MSE': 'M', 'CSE': 'U', 'LNT': 'X', 'GLH': 'E',
'HID': 'H', 'HIE': 'H', 'HIP': 'H', 'HYP': 'P',
# ambigous amino acids:
'ASX': 'B', 'GLX': 'Z'
}
AMINO_ACIDS_1TO3 = {'A': 'ALA', 'C': 'CYS', 'D': 'ASP', 'E': 'GLU',
'F': 'PHE', 'G': 'GLY', 'H': 'HIS', 'I': 'ILE',
'K': 'LYS', 'L': 'LEU', 'M': 'MET', 'N': 'ASN',
'P': 'PRO', 'Q': 'GLN', 'R': 'ARG', 'S': 'SER',
'T': 'THR', 'V': 'VAL', 'W': 'TRP', 'Y': 'TYR',
# extended set of amino acids:
'U': 'CSE', 'X': 'LNT',
# ambigous amino acids:
'B': 'ASX', 'Z': 'GLX'
}
class Pdb(object):
""" Object that allows operations with protein files in PDB format. """
def __init__(self, file_cont = [], pdb_code = ""):
self.cont = []
self.chains = []
self.fileloc = ""
if isinstance(file_cont, list):
self.cont = file_cont[:]
elif isinstance(file_cont, str):
try:
with open(file_cont, 'r') as pdb_file:
self.cont = [row.strip() for row in pdb_file.read().split('\n') if row.strip()]
except FileNotFoundError as err:
print(err)
if self.cont:
self.chains = self._get_chains()
def _get_chains(self):
"""
Splits a PDB file into individual chains.
Returns a dictionary with the respective chains, where
the Chain IDs are the keys, and the lines of the chain
are the dictionary values as lists:
{'A':[chain A lines], 'B':[...], ...}
"""
chain_dict = dict()
for line in self.cont:
if line.startswith(('ATOM', 'HETATM', 'TER')):
if line[21:22] not in chain_dict:
chain_dict[line[21:22]] = []
chain_dict[line[21:22]].append(line)
return chain_dict
def to_fasta(self, hetatm=False):
"""
Converts the PDB protein atoms into a fasta string and
returns the results as a dictionary, where the keys are
chain IDs and the items a list of 1-letter amino acid
codes.
E.g., {'H': ['K'], 'L': ['D', 'I', 'V', 'M']}
Keyword arguments:
hetatm (bool): If True, also HETATM lines are considered.
Returns a dictionary with the protein chain letters A-Z as keys
and the FASTA sequence as values (as list of characters).
E.g.,
{'A': ['P', 'Q', 'I', ...], 'B': ['P', 'Q', 'I', ...], ...}
"""
prev_seq_num = 0
fasta_dict = dict()
if hetatm:
hetatm = "HETATM"
else:
hetatm = "ATOM"
for chain in self.chains.items():
fasta_sequence = []
for line in chain[1]:
if line.startswith(("ATOM", hetatm)):
try:
aa_3letter = line[17:20].strip()
aa_1letter = AMINO_ACIDS_3TO1[aa_3letter]
res_seqnumber = line[22:26].strip()
res_seqnumber = int(res_seqnumber)
if prev_seq_num != res_seqnumber:
fasta_sequence.append(aa_1letter)
prev_seq_num = res_seqnumber
except KeyError:
pass
if line.startswith("ATOM"):
print('Warning: Residue {} unknown.'.format(line))
fasta_dict[chain[0]] = fasta_sequence
return fasta_dict
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser(
description='Converts amino acid residues from PDB file into a FASTA string',
formatter_class=argparse.RawTextHelpFormatter
)
parser.add_argument('PDBfile')
parser.add_argument('-l', '--ligand', action='store_true', help='includes HETATM residues.')
parser.add_argument('-o', '--out', metavar='out.fasta', type=str,
help='writes FASTA strings to an output file instead of printing it to the screen')
parser.add_argument('-v', '--version', action='version', version='pdb_to_fasta v. 1.0')
args = parser.parse_args()
in_pdb = Pdb(args.PDBfile)
fastas = sorted(in_pdb.to_fasta(hetatm=args.ligand).items())
if args.out:
with open(args.out, 'w') as out:
for chain in fastas:
out.write('>Chain {}:\n'.format(chain[0]))
for amino_code in chain[1]:
out.write(amino_code)
out.write('\n\n')
else:
for chain in fastas:
print('>Chain {}:\n'.format(chain[0]))
amino_list = []
for amino_code in chain[1]:
amino_list.append(amino_code)
print("".join(amino_list))
print('\n\n')