# XMLParser.py
# Forked from KevinMenden/TMHProjectFinal.
from xml.dom.minidom import parse
from Bio.PDB import *
def parseXML(file, limit=230):
    """Collect helix sequences from the alpha-chain entries of a PDBTM XML file.

    Every ``pdbtm`` element that has at least one ``CHAIN`` with
    ``TYPE="alpha"`` is passed once to ``searchHelices`` and the sequences
    it returns are accumulated.

    Args:
        file: File name or file object accepted by
            ``xml.dom.minidom.parse``.
        limit: Maximum number of alpha-chain entries to process. Defaults
            to 230, the cap that used to be hard-coded; ``None`` removes
            the cap.

    Returns:
        A flat list of the sequences returned by ``searchHelices``, in
        document order.
    """
    sequences = []
    processed = 0
    for pdbtm in parse(file).getElementsByTagName("pdbtm"):
        if limit is not None and processed >= limit:
            break
        chains = pdbtm.getElementsByTagName("CHAIN")
        # One alpha chain is enough to qualify the entry; searchHelices
        # iterates over all of the entry's alpha chains on its own.
        if any(chain.getAttribute("TYPE") == "alpha" for chain in chains):
            processed += 1
            sequences.extend(searchHelices(pdbtm))
    return sequences
def _findEnclosingHelix(position, helices):
    """Return the first helix whose start..end range contains position, else None."""
    for helix in helices:
        if helix["start"] <= position <= helix["end"]:
            return helix
    return None


def searchHelices(pdbtm):
    """Extract helix sequences for the alpha chains of one PDBTM entry.

    For every ``CHAIN`` with ``TYPE="alpha"``, each ``REGION`` of type
    ``"H"`` is matched by its ``pdb_beg`` attribute against the helices
    that ``getHelices`` reports for the same chain id. The first helix
    whose range contains ``pdb_beg`` is used to slice the chain's ``SEQ``
    text (spaces and newlines removed). Slices containing ``"U"`` are
    discarded.

    Args:
        pdbtm: A ``pdbtm`` DOM element; its ``ID`` attribute is handed to
            ``getHelices``.

    Returns:
        A list of sequence substrings. It is empty when the PDB file
        cannot be read, in which case ``"file not found"`` is printed.
    """
    results = []
    # Only the PDB lookup touches the file system, so only it is guarded.
    try:
        pdbHelices = getHelices(pdbtm.getAttribute("ID"))
    except IOError:
        print("file not found")
        return results

    for chain in pdbtm.getElementsByTagName("CHAIN"):
        if chain.getAttribute("TYPE") != "alpha":
            continue
        # Looked up once per chain: without HELIX records for this chain
        # id no region can match, so the chain is skipped entirely.
        helices = pdbHelices.get(chain.getAttribute("CHAINID"))
        if not helices:
            continue
        rawSequence = chain.getElementsByTagName("SEQ")[0].firstChild.nodeValue
        sequence = rawSequence.replace(" ", "").replace("\n", "")
        for region in chain.getElementsByTagName("REGION"):
            if region.getAttribute("type") != "H":
                continue
            start = int(region.getAttribute("pdb_beg"))
            helix = _findEnclosingHelix(start, helices)
            if helix is None:
                continue
            # NOTE(review): PDB residue numbers are used directly as
            # 0-based string offsets here, as in the original code;
            # confirm this is the intended alignment with SEQ.
            subseq = sequence[helix["start"]:helix["end"]]
            if "U" not in subseq:
                results.append(subseq)
    return results
def getHelices(file):
    """Collect the class-1 helices listed in a local PDB entry file.

    Reads ``tmh_set/pdb<file>.ent`` and scans its ``HELIX`` records,
    keeping those whose helix-class field ends in the digit ``1``
    (class 1 in the PDB format).

    Args:
        file: PDB identifier used to build the entry file name.

    Returns:
        A dict mapping the initial chain identifier to a list of
        ``{"start": int, "end": int}`` dicts, in file order, holding the
        initial and terminal residue sequence numbers of each helix.

    Raises:
        IOError: If the entry file cannot be opened.
        ValueError: If a residue-number field is not an integer.
    """
    helices = {}
    # The context manager closes the file even if a record fails to parse.
    with open("tmh_set/pdb" + file + ".ent", "r") as pdb:
        for line in pdb:
            if not line.startswith("HELIX"):
                continue
            # Column 40 is the units digit of the right-justified helix
            # class field (columns 39-40).
            if line[39:40] != "1":
                continue
            chain = line[19]
            # Residue numbers are 4 characters wide: columns 22-25 and
            # 34-37 (1-based). Reading all four keeps numbers >= 1000 intact.
            helices.setdefault(chain, []).append(
                {"start": int(line[21:25]), "end": int(line[33:37])}
            )
    return helices
if __name__ == "__main__":
    # Script entry point: print how many helix sequences were extracted
    # from the PDBTM dump in the working directory.
    print(len(parseXML("pdbtmall.xml")))