Skip to content

Commit c970840

Browse files
committed
Fix docs to include references and examples for xml2.
Also included tests for xml2 that are equivalent to XML tests.
1 parent 7182256 commit c970840

10 files changed

+570
-20
lines changed

man/css_to_xpath.Rd

+4-4
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66
\description{
77
This function aims to create an XPath expression equivalent to what
88
would be matched by the given CSS selector. The reason the translation
9-
is required is because the XML package, being a libxml2 wrapper, can
10-
only evaluate XPath expressions.
9+
is required is because the XML and xml2 packages, being a libxml2
10+
wrappers, can only evaluate XPath expressions.
1111

1212
Using this function, it is possible to search an XML tree without the
1313
prerequisite of knowing XPath.
@@ -55,8 +55,8 @@ css_to_xpath(selector,
5555
A character vector of XPath expressions.
5656
}
5757
\references{
58-
CSS3 Selectors \url{http://www.w3.org/TR/css3-selectors/}, XPath
59-
\url{http://www.w3.org/TR/xpath/}.
58+
CSS3 Selectors \url{https://www.w3.org/TR/css3-selectors/}, XPath
59+
\url{https://www.w3.org/TR/xpath/}.
6060
}
6161
\author{
6262
Simon Potter

man/querySelectorAll.Rd

+49-9
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,10 @@ querySelectorAllNS(doc, selector, ns,
5454
selector, they must have a namespace prefix, e.g. \code{"svg|g"}.
5555

5656
The namespace argument, \code{ns}, is simply passed on to
57-
\code{\link[XML]{getNodeSet}} if it is necessary to use a namespace
58-
present within the document. This can be ignored for content lacking a
59-
namespace, which is usually the case when using \code{querySelector}
60-
or \code{querySelectorAll}.
57+
\code{\link[XML]{getNodeSet}} or \code{\link[xml2]{xml_find_all}} if
58+
it is necessary to use a namespace present within the document. This
59+
can be ignored for content lacking a namespace, which is usually the
60+
case when using \code{querySelector} or \code{querySelectorAll}.
6161
}
6262
\value{
6363
For \code{querySelector}, the result is a single node that represents
@@ -71,17 +71,57 @@ querySelectorAllNS(doc, selector, ns,
7171
return the same type of content as their un-namespaced counterparts.
7272
}
7373
\references{
74-
CSS3 Selectors \url{http://www.w3.org/TR/css3-selectors/}, XPath
75-
\url{http://www.w3.org/TR/xpath/}, querySelectorAll
74+
CSS3 Selectors \url{https://www.w3.org/TR/css3-selectors/}, XPath
75+
\url{https://www.w3.org/TR/xpath/}, querySelectorAll
7676
\url{https://developer.mozilla.org/en-US/docs/DOM/Document.querySelectorAll}
7777
and \url{http://www.w3.org/TR/selectors-api/#interface-definitions}.
7878
}
7979
\author{
8080
Simon Potter
8181
}
8282
\examples{
83-
library(XML)
84-
exdoc <- xmlParse(c('<a><b class="aclass"/><c id="anid"/></a>'))
83+
hasXML <- require(XML)
84+
hasxml2 <- require(xml2)
85+
86+
if (!hasXML && !hasxml2)
87+
return() # can't demo without XML or xml2 packages present
88+
89+
parseFn <- if (hasXML) xmlParse else read_xml
90+
# Demo for working with the XML package (if present, otherwise xml2)
91+
exdoc <- parseFn('<a><b class="aclass"/><c id="anid"/></a>')
92+
querySelector(exdoc, "#anid") # Returns the matching node
93+
querySelector(exdoc, ".aclass") # Returns the matching node
94+
querySelector(exdoc, "b, c") # First match from grouped selection
95+
querySelectorAll(exdoc, "b, c") # Grouped selection
96+
querySelectorAll(exdoc, "b") # A list of length one
97+
querySelector(exdoc, "d") # No match
98+
querySelectorAll(exdoc, "d") # No match
99+
100+
# Read in a document where two namespaces are being set:
101+
# SVG and MathML
102+
svgdoc <- parseFn(system.file("demos/svg-mathml.svg",
103+
package = "selectr"))
104+
# Search for <script/> elements in the SVG namespace
105+
querySelectorNS(svgdoc, "svg|script",
106+
c(svg = "http://www.w3.org/2000/svg"))
107+
querySelectorAllNS(svgdoc, "svg|script",
108+
c(svg = "http://www.w3.org/2000/svg"))
109+
# MathML content is *within* SVG content,
110+
# search for <mtext> elements within the MathML namespace
111+
querySelectorNS(svgdoc, "math|mtext",
112+
c(math = "http://www.w3.org/1998/Math/MathML"))
113+
querySelectorAllNS(svgdoc, "math|mtext",
114+
c(math = "http://www.w3.org/1998/Math/MathML"))
115+
# Search for *both* SVG and MathML content
116+
querySelectorAllNS(svgdoc, "svg|script, math|mo",
117+
c(svg = "http://www.w3.org/2000/svg",
118+
math = "http://www.w3.org/1998/Math/MathML"))
119+
120+
if (!hasXML)
121+
return() # already demo'd xml2
122+
123+
# Demo for working with the xml2 package
124+
exdoc <- read_xml('<a><b class="aclass"/><c id="anid"/></a>')
85125
querySelector(exdoc, "#anid") # Returns the matching node
86126
querySelector(exdoc, ".aclass") # Returns the matching node
87127
querySelector(exdoc, "b, c") # First match from grouped selection
@@ -92,7 +132,7 @@ querySelectorAllNS(doc, selector, ns,
92132

93133
# Read in a document where two namespaces are being set:
94134
# SVG and MathML
95-
svgdoc <- xmlParse(system.file("demos/svg-mathml.svg",
135+
svgdoc <- read_xml(system.file("demos/svg-mathml.svg",
96136
package = "selectr"))
97137
# Search for <script/> elements in the SVG namespace
98138
querySelectorNS(svgdoc, "svg|script",

tests/testthat/test-querySelector.R tests/testthat/test-querySelector-XML.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
context("querySelector")
1+
context("querySelector-XML")
22

33
test_that("querySelector returns a single node or NULL", {
44
library(XML)
+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
context("querySelector-xml2")
2+
3+
test_that("querySelector returns a single node or NULL", {
4+
library(xml2)
5+
doc <- read_xml('<a><b id="#test"/><c class="ex"/><c class="xmp"/></a>')
6+
p <- function(x) {
7+
if (is.null(x)) x else as.character(x)
8+
}
9+
expect_that(p(querySelector(doc, "a")),
10+
equals(p(xml_find_first(doc, "//a"))))
11+
expect_that(p(querySelector(doc, "*", prefix = "")),
12+
equals(p(xml_find_first(doc, "*"))))
13+
expect_that(p(querySelector(doc, "d")), equals(NULL))
14+
expect_that(p(querySelector(doc, "c")), equals(p(xml_find_first(doc, "//c"))))
15+
})
16+
17+
test_that("querySelectorAll returns expected nodes", {
18+
library(XML)
19+
doc <- read_xml('<a><b id="#test"/><c class="ex"/><c class="xmp"/></a>')
20+
p <- function(x) {
21+
lapply(x, function(node) as.character(node))
22+
}
23+
expect_that(p(querySelectorAll(doc, "a")),
24+
equals(p(xml_find_all(doc, "//a"))))
25+
expect_that(p(querySelectorAll(doc, "*", prefix = "")),
26+
equals(p(xml_find_all(doc, "*"))))
27+
expect_that(p(querySelectorAll(doc, "c")),
28+
equals(p(xml_find_all(doc, "//c"))))
29+
})
30+
31+
test_that("querySelectorAll returns empty list for no match", {
32+
library(xml2)
33+
doc <- read_xml('<a><b id="#test"/><c class="ex"/><c class="xmp"/></a>')
34+
p <- function(x) {
35+
lapply(x, function(node) as.character(node))
36+
}
37+
expect_that(p(querySelectorAll(doc, "d")),
38+
equals(p(xml_find_all(doc, "//d"))))
39+
})
File renamed without changes.

tests/testthat/test-select-xml2.R

+158
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
context("large-test")
2+
3+
test_that("selection works correctly on a large barrage of tests", {
4+
HTML_IDS <- paste0(
5+
c("<html id=\"html\"><head>", " <link id=\"link-href\" href=\"foo\" />",
6+
" <link id=\"link-nohref\" />", "</head><body>", "<div id=\"outer-div\">",
7+
" <a id=\"name-anchor\" name=\"foo\"></a>", " <a id=\"tag-anchor\" rel=\"tag\" href=\"http://localhost/foo\">link</a>",
8+
" <a id=\"nofollow-anchor\" rel=\"nofollow\" href=\"https://example.org\">",
9+
" link</a>", " <ol id=\"first-ol\" class=\"a b c\">", " <li id=\"first-li\">content</li>",
10+
" <li id=\"second-li\" lang=\"En-us\">", " <div id=\"li-div\">",
11+
" </div>", " </li>", " <li id=\"third-li\" class=\"ab c\"></li>",
12+
" <li id=\"fourth-li\" class=\"ab", "c\"></li>", " <li id=\"fifth-li\"></li>",
13+
" <li id=\"sixth-li\"></li>", " <li id=\"seventh-li\"> </li>",
14+
" </ol>", " <p id=\"paragraph\">", " <b id=\"p-b\">hi</b> <em id=\"p-em\">there</em>",
15+
" <b id=\"p-b2\">guy</b>", " <input type=\"checkbox\" id=\"checkbox-unchecked\" />",
16+
" <input type=\"checkbox\" id=\"checkbox-disabled\" disabled=\"\" />",
17+
" <input type=\"text\" id=\"text-checked\" checked=\"checked\" />",
18+
" <input type=\"hidden\" />", " <input type=\"hidden\" disabled=\"disabled\" />",
19+
" <input type=\"checkbox\" id=\"checkbox-checked\" checked=\"checked\" />",
20+
" <input type=\"checkbox\" id=\"checkbox-disabled-checked\"",
21+
" disabled=\"disabled\" checked=\"checked\" />", " <fieldset id=\"fieldset\" disabled=\"disabled\">",
22+
" <input type=\"checkbox\" id=\"checkbox-fieldset-disabled\" />",
23+
" <input type=\"hidden\" />", " </fieldset>", " </p>",
24+
" <ol id=\"second-ol\">", " </ol>", " <map name=\"dummymap\">",
25+
" <area shape=\"circle\" coords=\"200,250,25\" href=\"foo.html\" id=\"area-href\" />",
26+
" <area shape=\"default\" id=\"area-nohref\" />", " </map>",
27+
"</div>", "<div id=\"foobar-div\" foobar=\"ab bc", "cde\"><span id=\"foobar-span\"></span></div>",
28+
"</body></html>"), collapse = "\n")
29+
30+
library(XML)
31+
document <- xmlRoot(xmlParse(HTML_IDS))
32+
gt <- GenericTranslator$new()
33+
ht <- HTMLTranslator$new()
34+
35+
select_ids <- function(selector, html_only) {
36+
if (html_only) {
37+
xpath <- ht$css_to_xpath(selector)
38+
items <- getNodeSet(document, xpath)
39+
} else {
40+
xpath <- gt$css_to_xpath(selector)
41+
items <- getNodeSet(document, xpath)
42+
}
43+
n <- length(items)
44+
if (! n)
45+
return(NULL)
46+
result <- character(n)
47+
for (i in seq_len(n)) {
48+
element <- items[[i]]
49+
tmp <- xmlAttrs(element)["id"]
50+
if (is.null(tmp))
51+
tmp <- "nil"
52+
result[i] <- tmp
53+
}
54+
result
55+
}
56+
57+
pcss <- function(main, selectors = NULL, html_only = FALSE) {
58+
result <- select_ids(main, html_only)
59+
if (! is.null(selectors) && length(selectors)) {
60+
n <- length(selectors)
61+
for (i in seq_len(n)) {
62+
tmp_res <- select_ids(selectors[i], html_only = html_only)
63+
if (! is.null(result) && ! is.null(tmp_res) &&
64+
tmp_res != result)
65+
stop("Difference between results of selectors")
66+
}
67+
}
68+
result
69+
}
70+
71+
all_ids <- pcss('*')
72+
expect_that(all_ids[1:6], equals(c('html', 'nil', 'link-href', 'link-nohref', 'nil', 'outer-div')))
73+
expect_that(tail(all_ids, 1), equals('foobar-span'))
74+
expect_that(pcss('div'), equals(c('outer-div', 'li-div', 'foobar-div')))
75+
expect_that(pcss('DIV', html_only=TRUE), equals(c('outer-div', 'li-div', 'foobar-div'))) # case-insensitive in HTML
76+
expect_that(pcss('div div'), equals('li-div'))
77+
expect_that(pcss('div, div div'), equals(c('outer-div', 'li-div', 'foobar-div')))
78+
expect_that(pcss('a[name]'), equals('name-anchor'))
79+
expect_that(pcss('a[NAme]', html_only=TRUE), equals('name-anchor')) # case-insensitive in HTML:
80+
expect_that(pcss('a[rel]'), equals(c('tag-anchor', 'nofollow-anchor')))
81+
expect_that(pcss('a[rel="tag"]'), equals('tag-anchor'))
82+
expect_that(pcss('a[href*="localhost"]'), equals('tag-anchor'))
83+
expect_that(pcss('a[href*=""]'), equals(NULL))
84+
expect_that(pcss('a[href^="http"]'), equals(c('tag-anchor', 'nofollow-anchor')))
85+
expect_that(pcss('a[href^="http:"]'), equals('tag-anchor'))
86+
expect_that(pcss('a[href^=""]'), equals(NULL))
87+
expect_that(pcss('a[href$="org"]'), equals('nofollow-anchor'))
88+
expect_that(pcss('a[href$=""]'), equals(NULL))
89+
expect_that(pcss('div[foobar~="bc"]', 'div[foobar~="cde"]'), equals('foobar-div'))
90+
expect_that(pcss('[foobar~="ab bc"]', c('[foobar~=""]', '[foobar~=" \t"]')), equals(NULL))
91+
expect_that(pcss('div[foobar~="cd"]'), equals(NULL))
92+
expect_that(pcss('*[lang|="En"]', '[lang|="En-us"]'), equals('second-li'))
93+
# Attribute values are case sensitive
94+
expect_that(pcss('*[lang|="en"]', '[lang|="en-US"]'), equals(NULL))
95+
expect_that(pcss('*[lang|="e"]'), equals(NULL))
96+
# ... :lang() is not.
97+
expect_that(pcss(':lang("EN")', '*:lang(en-US)', html_only=TRUE), equals(c('second-li', 'li-div')))
98+
expect_that(pcss(':lang("e")', html_only=TRUE), equals(NULL))
99+
expect_that(pcss('li:nth-child(-n)'), equals(NULL))
100+
expect_that(pcss('li:nth-child(n)'), equals(c('first-li', 'second-li', 'third-li', 'fourth-li', 'fifth-li', 'sixth-li', 'seventh-li')))
101+
expect_that(pcss('li:nth-child(3)'), equals('third-li'))
102+
expect_that(pcss('li:nth-child(10)'), equals(NULL))
103+
expect_that(pcss('li:nth-child(2n)', c('li:nth-child(even)', 'li:nth-child(2n+0)')), equals(c('second-li', 'fourth-li', 'sixth-li')))
104+
expect_that(pcss('li:nth-child(+2n+1)', 'li:nth-child(odd)'), equals(c('first-li', 'third-li', 'fifth-li', 'seventh-li')))
105+
expect_that(pcss('li:nth-child(2n+4)'), equals(c('fourth-li', 'sixth-li')))
106+
expect_that(pcss('li:nth-child(3n+1)'), equals(c('first-li', 'fourth-li', 'seventh-li')))
107+
expect_that(pcss('li:nth-child(-n+3)'), equals(c('first-li', 'second-li', 'third-li')))
108+
expect_that(pcss('li:nth-child(-2n+4)'), equals(c('second-li', 'fourth-li')))
109+
expect_that(pcss('li:nth-last-child(0)'), equals(NULL))
110+
expect_that(pcss('li:nth-last-child(1)'), equals('seventh-li'))
111+
expect_that(pcss('li:nth-last-child(2n)', 'li:nth-last-child(even)'), equals(c('second-li', 'fourth-li', 'sixth-li')))
112+
expect_that(pcss('li:nth-last-child(2n+2)'), equals(c('second-li', 'fourth-li', 'sixth-li')))
113+
expect_that(pcss('ol:first-of-type'), equals('first-ol'))
114+
expect_that(pcss('ol:nth-child(1)'), equals(NULL))
115+
expect_that(pcss('ol:nth-of-type(2)'), equals('second-ol'))
116+
expect_that(pcss('ol:nth-last-of-type(1)'), equals('second-ol'))
117+
expect_that(pcss('span:only-child'), equals('foobar-span'))
118+
expect_that(pcss('li div:only-child'), equals('li-div'))
119+
expect_that(pcss('div *:only-child'), equals(c('li-div', 'foobar-span')))
120+
#self.assertRaises(ExpressionError, pcss, 'p *:only-of-type')
121+
expect_that(pcss('p:only-of-type'), equals('paragraph'))
122+
expect_that(pcss('a:empty', 'a:EMpty'), equals('name-anchor'))
123+
expect_that(pcss('li:empty'), equals(c('third-li', 'fourth-li', 'fifth-li', 'sixth-li')))
124+
expect_that(pcss(':root', 'html:root'), equals('html'))
125+
expect_that(pcss('li:root', '* :root'), equals(NULL))
126+
expect_that(pcss('*:contains("link")', ':CONtains("link")'), equals(c('html', 'nil', 'outer-div', 'tag-anchor', 'nofollow-anchor')))
127+
expect_that(pcss('*:contains("LInk")'), equals(NULL)) # case sensitive
128+
expect_that(pcss('*:contains("e")'), equals(c('html', 'nil', 'outer-div', 'first-ol', 'first-li', 'paragraph', 'p-em')))
129+
expect_that(pcss('*:contains("E")'), equals(NULL)) # case-sensitive
130+
expect_that(pcss('.a', c('.b', '*.a', 'ol.a')), equals('first-ol'))
131+
expect_that(pcss('.c', '*.c'), equals(c('first-ol', 'third-li', 'fourth-li')))
132+
expect_that(pcss('ol *.c', c('ol li.c', 'li ~ li.c', 'ol > li.c')), equals(c('third-li', 'fourth-li')))
133+
expect_that(pcss('#first-li', c('li#first-li', '*#first-li')), equals('first-li'))
134+
expect_that(pcss('li div', c('li > div', 'div div')), equals('li-div'))
135+
expect_that(pcss('div > div'), equals(NULL))
136+
expect_that(pcss('div>.c', 'div > .c'), equals('first-ol'))
137+
expect_that(pcss('div + div'), equals('foobar-div'))
138+
expect_that(pcss('a ~ a'), equals(c('tag-anchor', 'nofollow-anchor')))
139+
expect_that(pcss('a[rel="tag"] ~ a'), equals('nofollow-anchor'))
140+
expect_that(pcss('ol#first-ol li:last-child'), equals('seventh-li'))
141+
expect_that(pcss('ol#first-ol *:last-child'), equals(c('li-div', 'seventh-li')))
142+
expect_that(pcss('#outer-div:first-child'), equals('outer-div'))
143+
expect_that(pcss('#outer-div :first-child'), equals(c('name-anchor', 'first-li', 'li-div', 'p-b', 'checkbox-fieldset-disabled', 'area-href')))
144+
expect_that(pcss('a[href]'), equals(c('tag-anchor', 'nofollow-anchor')))
145+
expect_that(pcss(':not(*)'), equals(NULL))
146+
expect_that(pcss('a:not([href])'), equals('name-anchor'))
147+
expect_that(pcss('ol :Not(li[class])'), equals(c('first-li', 'second-li', 'li-div', 'fifth-li', 'sixth-li', 'seventh-li')))
148+
# Invalid characters in XPath element names, should not crash
149+
expect_that(pcss('di\ua0v', 'div\\['), equals(NULL))
150+
expect_that(pcss('[h\ua0ref]', '[h\\]ref]'), equals(NULL))
151+
152+
## HTML-specific
153+
expect_that(pcss(':link', html_only=TRUE), equals(c('link-href', 'tag-anchor', 'nofollow-anchor', 'area-href')))
154+
expect_that(pcss(':visited', html_only=TRUE), equals(NULL))
155+
expect_that(pcss(':enabled', html_only=TRUE), equals(c('link-href', 'tag-anchor', 'nofollow-anchor', 'checkbox-unchecked', 'text-checked', 'checkbox-checked', 'area-href')))
156+
expect_that(pcss(':disabled', html_only=TRUE), equals(c('checkbox-disabled', 'checkbox-disabled-checked', 'fieldset', 'checkbox-fieldset-disabled')))
157+
expect_that(pcss(':checked', html_only=TRUE), equals(c('checkbox-checked', 'checkbox-disabled-checked')))
158+
})

tests/testthat/test-shakespeare.R tests/testthat/test-shakespeare-XML.R

+6-5
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
context("shakespeare-test")
1+
context("shakespeare-test-XML")
22

33
test_that("selection works correctly on a shakespearean document", {
4-
HTML_SHAKESPEARE <-
5-
c("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"",
6-
"\t\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">", "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\" debug=\"true\">",
4+
HTML_SHAKESPEARE <- paste(
5+
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"",
6+
"\t\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">",
7+
"<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\" debug=\"true\">",
78
"<head>", "\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"/>",
89
"</head>", "<body>", "\t<div id=\"test\">", "\t<div class=\"dialog\">",
910
"\t<h2>As You Like It</h2>", "\t<div id=\"playwright\">", "\t by William Shakespeare",
@@ -199,7 +200,7 @@ test_that("selection works correctly on a shakespearean document", {
199200
"\t <div id=\"scene1.3.137\">After my flight. Now go we in content</div>",
200201
"\t <div id=\"scene1.3.138\">To liberty and not to banishment.</div>",
201202
"\t <div class=\"direction\">Exeunt</div>", "\t </div>", "\t</div>",
202-
"\t</div>", "</div>", "</body>", "</html>", collapse = "\n")
203+
"\t</div>", "</div>", "</body>", "</html>", sep = "\n")
203204

204205
library(XML)
205206
document <- xmlRoot(htmlParse(HTML_SHAKESPEARE))

0 commit comments

Comments
 (0)