Skip to content

Commit

Permalink
internal/sanitizer: keep sanitized child nodes of rejected nodes
Browse files Browse the repository at this point in the history
Instead of throwing away the entire tree rooted at a node that we
reject, instead move the (sanitized) children of the rejected node up
by one level. There are some nodes that we still throw away the entire
body for, such as script nodes.

Also throw away <a> tags that have no attributes left (because we're not
throwing out the bodies anymore).

For #61399

Change-Id: Ie62d26f4efbcd95533a0aeb0716fce2a09aaa4c7
Reviewed-on: https://go-review.googlesource.com/c/pkgsite/+/545917
Reviewed-by: Jonathan Amsterdam <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
kokoro-CI: kokoro <[email protected]>
Reviewed-by: Roland Shoemaker <[email protected]>
  • Loading branch information
matloob committed Dec 5, 2023
1 parent 2c6cc9d commit cd7aa4a
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 24 deletions.
73 changes: 54 additions & 19 deletions internal/sanitizer/sanitizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,7 @@ func SanitizeBytes(b []byte) []byte {
if err != nil {
return []byte{}
}
var keepNodes []*html.Node
for _, n := range nodes {
if sanitize(n) {
keepNodes = append(keepNodes, n)
}
}
keepNodes := sanitizeNodes(nodes)
var buf bytes.Buffer
for _, n := range keepNodes {
html.Render(&buf, n)
Expand All @@ -48,22 +43,28 @@ func SanitizeBytes(b []byte) []byte {
}

// sanitize sanitizes the attributes and children of n.
// It returns false if the entire node should be cut out.
func sanitize(n *html.Node) bool {
// It returns false if the node should be cut out, and a list
// of parent-less nodes the node should be replaced with.
func sanitize(n *html.Node) ([]*html.Node, bool) {
switch n.Type {
case html.CommentNode:
return false
return nil, false
case html.DoctypeNode:
return false
return nil, false
case html.TextNode:
return true // Assume text nodes are safe
return nil, true // Assume text nodes are safe
case html.ElementNode:
if n.Namespace != "" {
return false
return nil, false
}
n.Data = strings.ToLower(n.Data)
if !allowElemsMap[n.Data] {
return false
switch n.Data {
case "frame", "frameset", "iframe", "noembed", "noframes", "noscript", "nostyle", "object", "script", "style", "title":
return nil, false
default:
return extractSanitizedChildren(n), false
}
}
keepAttr := []html.Attribute{}
for _, attr := range n.Attr {
Expand Down Expand Up @@ -93,22 +94,56 @@ func sanitize(n *html.Node) bool {
keepAttr = append(keepAttr, attr)
}
if n.Data == "a" {
if len(keepAttr) == 0 {
return extractSanitizedChildren(n), false
}
keepAttr = addRelNoFollow(keepAttr)
}
if n.Data == "img" {
if len(keepAttr) == 0 {
return nil, false
}
}
n.Attr = keepAttr
var removeChildren []*html.Node
replaceChildren := make(map[*html.Node][]*html.Node)
for child := n.FirstChild; child != nil; child = child.NextSibling {
if !sanitize(child) {
removeChildren = append(removeChildren, child)
if replace, ok := sanitize(child); !ok {
replaceChildren[child] = replace
}
}
for _, child := range removeChildren {
for child, replace := range replaceChildren {
for _, r := range replace {
n.InsertBefore(r, child)
}
n.RemoveChild(child)
}
return true
return nil, true
default:
return false
return extractSanitizedChildren(n), false
}
}

func extractSanitizedChildren(node *html.Node) []*html.Node {
var children []*html.Node
for child := node.FirstChild; child != nil; child = child.NextSibling {
children = append(children, child)
}
for _, child := range children {
node.RemoveChild(child)
}
return sanitizeNodes(children)
}

func sanitizeNodes(nodes []*html.Node) []*html.Node {
var keepNodes []*html.Node
for _, n := range nodes {
if replace, ok := sanitize(n); ok {
keepNodes = append(keepNodes, n)
} else {
keepNodes = append(keepNodes, replace...)
}
}
return keepNodes
}

func addRelNoFollow(attrs []html.Attribute) []html.Attribute {
Expand Down
18 changes: 13 additions & 5 deletions internal/sanitizer/sanitizer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,17 @@ func TestSanitizeBytes(t *testing.T) {
"<script>body</script>",
"",
},
{
"<script><tag>body</tag></script>",
"",
},
{
`<a href="%">body</a>`,
`<a>body</a>`,
`body`,
},
{
`<a href="unrecognized://foo">body</a>`,
`<a>body</a>`,
`body`,
},
{
`<p dir="RTL" lang="en" id="foo" title="a title"></p>`,
Expand Down Expand Up @@ -52,6 +56,8 @@ func TestSanitizeBytes(t *testing.T) {
<img src="file.jpeg" alt="alt text" usemap="#map" width="600" height="400"/>
`,
},
{
Expand All @@ -75,8 +81,8 @@ func TestSanitizeBytes(t *testing.T) {
`text<p></p>`,
},
{
`<article badattr="bad"> <ol> <script> <li>hello</li> </script> <li>thi<wbr/>ng</li> </ol> <ul> </ul> </article>`,
`<article> <ol> <li>thi<wbr/>ng</li> </ol> <ul> </ul> </article>`,
`<article badattr="bad"> <ol> <bad> <li>hello</li> </script> <li>thi<wbr/>ng</li> </ol> <ul> </ul> </article>`,
`<article> <ol> <li>hello</li> <li>thi<wbr/>ng</li> </ol> <ul> </ul> </article>`,
},
{
`<details open="closed"></details>`,
Expand All @@ -96,7 +102,7 @@ func TestSanitizeBytes(t *testing.T) {
},
{
`<p><bad>A</bad><bad>B</bad></p>`,
`<p></p>`,
`<p>AB</p>`,
},
{
`
Expand Down Expand Up @@ -135,6 +141,8 @@ func TestSanitizeBytes(t *testing.T) {
</table>
`,
},
{`<p><bad><bad2><bad3><bad4>hello<bad5><bad6><p> middle</p>goodbye`,
`<p>hello</p><p> middle</p>goodbye`},
}

for _, tc := range testCases {
Expand Down

0 comments on commit cd7aa4a

Please sign in to comment.