forked from KevinMenden/TMHProjectFinal
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPDBextractor.py
133 lines (103 loc) · 4.24 KB
/
PDBextractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
__date__ = '26.04.2016'
import sys, os, glob
import operator
import matplotlib.pyplot as plt
from Bio.PDB.PDBParser import PDBParser
class SecondaryStructureCounter(object):
def __init__(self, dir):
self.dir = dir
self.totalLength = 0
self.alphaHelix = 0
self.helices = []
self.proteinHelixSequences = []
self.alphaHelixPositions = []
self.aminoAcidDict = {"ALA": "A", "GLY": "G", "PHE": "F", "ILE": "I", "MET": "M", "LEU": "L", "PRO": "P", "VAL": "V",
"ASP": "D", "GLU": "E", "LYS": "K", "ARG": "R", "SER": "S", "THR": "T", "TYR": "Y", "HIS": "H",
"CYS": "C", "ASN": "N", "GLN": "Q", "TRP": "W", "MSE": "M" }
self.sortedHelix = []
self.backboneDistances = []
self.backboneDistHelix = []
def getResiduesFromChain(self, chain, start, end):
"""
Extract the amino acids given a chain and the end and start positions
:param chain: the chain
:param start: start of the helix
:param end: end of the helix
:return: array containing all residues of the helix
"""
helix_residues = []
for residue in chain.get_residues():
if (residue.id[1] >= start and residue.id[1] <= end and residue.get_resname() in self.aminoAcidDict.keys()):
helix_residues.append(residue)
return helix_residues
def getResiduesFromList(self, residues, start, end):
"""
:param residues:
:param start:
:param end:
:return:
"""
helix_residues = []
for res in residues:
if (res.id[1] >= start and res.id[1] <= end and res.get_resname() in self.aminoAcidDict.keys()):
helix_residues.append(res)
return helix_residues
def parsePDBInformation(self, file):
"""
Parses a single pdb file and counts the residues in
helices, sheets and the total length of the protein
"""
helices = []
helixSequences = []
f = open(self.dir + "/" + file)
line = f.readline()
while line:
#If HELIX, check for type and add length and positions
if line.startswith("HELIX"):
start = int(line[21:25].replace(" " ,""))
end = int(line[33:37].replace(" ", ""))
type = int(line[39:40])
chain = line[19:20].replace(" ", "")
currentHelix = (start, end, chain)
if type == 1:
helices.append(currentHelix)
line = f.readline()
else:
line = f.readline()
f.close()
# Parse the structure with a PDBParser object
pdbParser = PDBParser()
structure = pdbParser.get_structure("currentFile", self.dir+"/"+file)
# For every helix tuple, extract the residues and store them in helixSequences
for helix in helices:
if helix[2] == "":
residues = structure.get_residues()
helixSequences.append(self.getResiduesFromList(residues, helix[0], helix[1]))
chains = structure.get_chains()
for chain in chains:
if (chain.get_id() == helix[2]):
helixSequences.append(self.getResiduesFromChain(chain, helix[0], helix[1]))
return helixSequences
def convertResiduesToLetters(self, residues):
"""
Converts an array of BioPython residues to their amino acid letters
:param residues:
:return: the simple amino acid letteres
"""
sequence = ""
for res in residues:
sequence += self.aminoAcidDict[res.get_resname()]
return sequence
def parse_all_pdb_files(self):
"""
Parse all pdb tmh_set in the given directory (self.dir)
"""
for subdir, dirs, files in os.walk(self.dir):
for file in files:
if file.endswith(".pdb"):
print("Parsing file: " + file)
self.proteinHelixSequences.append(self.parsePDBInformation(file))
if __name__ == "__main__":
directory = "test"
structureCounter = SecondaryStructureCounter(directory)
structureCounter.parse_all_pdb_files()