-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser.go
167 lines (149 loc) · 5.31 KB
/
parser.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
package dom
import (
"encoding/xml"
"fmt"
"io"
"strings"
)
// Parser is the entrypoint of the dom package to parse an XML tree from the given
// reader, into the doc attribute, using the decoder from encoding/xml.
type Parser struct {
reader io.Reader // Reader containing the XML document.
Configuration Configuration
}
// NewParser constructs a new Parser using the given reader. The reader is expected
// to contain the (...valid) XML tree. Namespace awareness will be set to true per default.
// Parser configuration will be set to a default one.
func NewParser(reader io.Reader) *Parser {
b := &Parser{}
b.reader = reader
b.Configuration = NewConfiguration()
return b
}
// Parse parses an XML Document contained within the reader attribute of the current Parser.
// A Document will be returned and a nil error if the parsing succeeded.
func (b *Parser) Parse() (Document, error) {
doc := NewDocument()
decoder := xml.NewDecoder(b.reader)
var curNode = Node(doc)
for {
token, err := decoder.Token()
if err == io.EOF {
// End of file, processed okay
return doc, nil
}
if err != nil {
// Other error, return that.
return nil, err
}
switch typ := token.(type) {
case xml.Comment:
// Skip comments?
if !b.Configuration.Comments {
continue
}
cmt, err := doc.CreateComment(string(typ))
if err != nil {
return nil, err
}
if err = curNode.AppendChild(cmt); err != nil {
return nil, err
}
case xml.ProcInst:
// Note: the Go default decoder regards the XML declaration as a processing
// instruction, even though it is not. Therefore, we handle this edge case
// to NOT include this as a valid child node.
if strings.ToLower(typ.Target) != "xml" {
pi, err := doc.CreateProcessingInstruction(typ.Target, string(typ.Inst))
if err != nil {
return nil, err
}
if err = doc.AppendChild(pi); err != nil {
return nil, err
}
}
case xml.StartElement:
// FIXME: The default encoding/xml.Decoder does fuck all about prefixes.
// That's not all: https://github.com/golang/go/issues/11735
// Therefore, we don't/can't set any kind of prefix ourselves. The
// NormalizeDocument() or Normalize() can be used to "fix" the namespaces.
namespace := ""
// Are we parsing namespaces? e.g. namespace awareness?
if b.Configuration.NamespaceDeclarations {
namespace = typ.Name.Space
}
elem, err := doc.CreateElementNS(namespace, typ.Name.Local)
if err != nil {
return nil, err
}
// Iterate over the element's attributes.
for _, a := range typ.Attr {
namespace = ""
// Are we parsing namespaces?
if b.Configuration.NamespaceDeclarations {
namespace = a.Name.Space
}
attrName := a.Name.Local
// OK: there is some weirdness going in with parsing attributes.
// For example, these have different outputs:
//
// xmlns:bla="what" : namespace = "xmlns", localname = "bla", value = "what".
// bla="what" : namespace = "" (unless bound), localname = "bla", value = "what"
// ns0:bla="what" : namespace = [whatever ns0 is bound to], localname = "bla"
//
// Therefore, we handle the xmlns edge case like this, so we can assign prefixes
// to elements. Looks extremely hacky, and it is. I wish it could behave more SAX like.
// fmt.Printf("%s, %s, %s\n", a.Name.Space, a.Name.Local, a.Value)
if strings.HasPrefix(a.Name.Space, "xmlns") {
continue
}
// Add all other (normal) attributes.
attr, err := doc.CreateAttributeNS(namespace, attrName)
if err != nil {
return nil, err
}
attr.SetValue(a.Value)
elem.GetAttributes().SetNamedItem(attr)
}
if err = curNode.AppendChild(elem); err != nil {
return nil, err
}
curNode = elem
case xml.EndElement:
curNode = curNode.GetParentNode()
case xml.CharData:
// If there is no document element yet, and the character data is found which is NOT whitespace,
// generate an error. No character data allowed before document element, but whitespaces
// are okay to parse. Don't add it as a child element though.
if doc.GetDocumentElement() == nil {
if strings.TrimSpace(string(typ)) != "" {
return nil, fmt.Errorf("%v: content is not allowed in prolog", ErrorHierarchyRequest)
}
// We got whitespace. Don't add it as a child, merely continue the next token
// parsing in the stream.
continue
}
// Likewise, character data may not occur after the document element in the trailing
// section, so check that as well. The Go decoder doesn't care so we handle this edge
// case as well.
if curNode == doc {
if strings.TrimSpace(string(typ)) != "" {
// We cannot append text/chardata to the document itself.
return nil, fmt.Errorf("%v: content is not allowed in trailing section", ErrorHierarchyRequest)
}
// We got whitespace. Don't add it as a child, merely continue the next token
// parsing in the stream. Same behaviour as above.
continue
}
text := doc.CreateText(string(typ))
// Should we ignore ignorable whitespaces, and the text content is whitespace?
if !b.Configuration.ElementContentWhitespace && text.IsElementContentWhitespace() {
continue
}
// In all other cases, create a text node and add it to the current node as a child.
if err := curNode.AppendChild(text); err != nil {
return nil, err
}
}
}
}