Skip to content

Commit 5df1ed5

Browse files
authored
Amend XML.write to respect xml:space="preserve" (#49)
* Address speed regression in #46 and fix remaining issues in #45. * Use @views more often for slices * undo skip orphan text nodes (cf. EzXML) * Amend `XML.write` to honour `xml:space="preserve"` * Update write to respect xml:space * Correct isspace test * Fix isspace * more isspace * Normalize_newlines * Normalize_newlines * Undo normalize newlines * Undo normalize newlines
1 parent 815d8bf commit 5df1ed5

File tree

3 files changed

+88
-68
lines changed

3 files changed

+88
-68
lines changed

src/XML.jl

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,8 @@ end
149149
function Node(o::Node, x...; kw...)
150150
attrs = !isnothing(kw) ?
151151
merge(
152-
OrderedDict(string(k) => string(v) for (k,v) in pairs(kw)),
153-
isnothing(o.attributes) ? OrderedDict{String, String}() : o.attributes
152+
OrderedDict(string(k) => string(v) for (k, v) in pairs(kw)),
153+
isnothing(o.attributes) ? OrderedDict{String,String}() : o.attributes
154154
) :
155155
o.attributes
156156
children = isempty(x) ? o.children : vcat(isnothing(o.children) ? [] : o.children, collect(x))
@@ -357,56 +357,69 @@ write(x; kw...) = (io = IOBuffer(); write(io, x; kw...); String(take!(io)))
357357

358358
# Write `x` as XML to the file at `filename`, opening it in "w" mode.
# Keyword arguments are forwarded unchanged to `write(io, x; kw...)`.
function write(filename::AbstractString, x; kw...)
    return open(filename, "w") do stream
        write(stream, x; kw...)
    end
end
359359

360-
"""
    write(io::IO, x, ctx::Vector{Bool}=[false]; indentsize::Int=2, depth::Int=1)

Serialize node `x` and, recursively, its children to `io` as XML text.

`ctx` is a stack of flags; its last entry tells whether pretty-printing whitespace
(indentation and newlines) must be suppressed at the current position. Every Element
pushes a copy of its parent's flag, lets `update_ctx!` adjust it for that element
(presumably from its `xml:space` attribute -- confirm against `update_ctx!`), and pops
it again once the element has been written.

# Keywords
- `indentsize`: number of spaces that make up one indentation level.
- `depth`: nesting level of `x`; the padding printed is `depth - 1` indentation levels.

# Throws
- An error if `nodetype(x)` is none of the node types handled below.
"""
function write(io::IO, x, ctx::Vector{Bool}=[false]; indentsize::Int=2, depth::Int=1)
    indent = ' ' ^ indentsize
    nodetype = XML.nodetype(x)
    tag = XML.tag(x)
    value = XML.value(x)
    children = XML.children(x)

    padding = indent ^ max(0, depth - 1)
    # The flag consulted here is still the parent's: no leading padding inside a
    # whitespace-preserving parent.
    !ctx[end] && print(io, padding)

    if nodetype === Text
        print(io, value)

    elseif nodetype === Element
        # Inherit the parent's flag, then let the element itself override it.
        push!(ctx, ctx[end])
        update_ctx!(ctx, x)
        print(io, '<', tag)
        _print_attrs(io, x)
        print(io, isempty(children) ? '/' : "", '>')
        if !isempty(children)
            if length(children) == 1 && XML.nodetype(only(children)) === Text
                # A lone text child is written inline: <tag>text</tag>
                write(io, only(children), ctx; indentsize=0)
                print(io, "</", tag, '>')
            else
                !ctx[end] && println(io)
                foreach(children) do child
                    write(io, child, ctx; indentsize, depth=depth + 1)
                    !ctx[end] && println(io)
                end
                print(io, !ctx[end] ? padding : "", "</", tag, '>')
            end
        end
        pop!(ctx)

    elseif nodetype === DTD
        print(io, "<!DOCTYPE ", value, '>')

    elseif nodetype === Declaration
        print(io, "<?xml")
        _print_attrs(io, x)
        print(io, "?>")

    elseif nodetype === ProcessingInstruction
        print(io, "<?", tag)
        _print_attrs(io, x)
        print(io, "?>")

    elseif nodetype === Comment
        print(io, "<!--", value, "-->")

    elseif nodetype === CData
        # The CDATA start delimiter is case-sensitive in XML: it must be exactly
        # "<![CDATA[", otherwise the output is not well-formed.
        print(io, "<![CDATA[", value, "]]>")

    elseif nodetype === Document
        foreach(children) do child
            write(io, child, ctx; indentsize)
            !ctx[end] && println(io)
        end

    else
        error("Unreachable case reached during XML.write")
    end

end
424+
425+
end # module XML

src/raw.jl

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,6 @@ function get_attributes(data, i, j)
146146
out = OrderedDict{String,String}()
147147
while !isnothing(i) && i < j
148148
key, i = get_name(data, i)
149-
#haskey(out, key) && error("Duplicate attribute name found: $key") # would this be useful?
150149
# get quotechar the value is wrapped in (either ' or ")
151150
i = findnext(x -> x === UInt8('"') || x === UInt8('''), data, i + 1)
152151
quotechar = data[i]
@@ -329,7 +328,11 @@ function parent(o::Raw)
329328
end
330329

331330
#-----------------------------------------------------------------------------# next Raw
332-
isspace(x::UInt8) = Base.isspace(Char(x))
331+
# isspace(x::UInt8) = Base.isspace(Char(x))
332+
333+
# XML whitespace per XML 1.0/1.1 production S:
334+
# S ::= (#x20 | #x9 | #xD | #xA)+
335+
# `true` only for the four XML whitespace bytes: space (0x20), tab (0x09),
# line feed (0x0A) and carriage return (0x0D).
@inline function xml_isspace(b::UInt8)::Bool
    return b == 0x20 || b == 0x09 || b == 0x0A || b == 0x0D
end
333336

334337
"""
335338
next(node) --> typeof(node) or Nothing
@@ -353,7 +356,7 @@ function next_xml_space(o::Raw)
353356
has_xml_space = o.has_xml_space
354357
ctx = copy(o.ctx)
355358
last_type = type
356-
k = findnext(!isspace, data, i)
359+
k = findnext(!xml_isspace, data, i)
357360
if isnothing(k)
358361
return nothing
359362
end
@@ -369,11 +372,11 @@ function next_xml_space(o::Raw)
369372
if c !== '<' || ctx[end] && c === '<' && b === ' ' && last_type === RawElementOpen && d === '/'
370373
type = RawText
371374
j = findnext(==(UInt8('<')), data, i) - 1
372-
j = ctx[end] ? j : findprev(!isspace, data, j) # preserving whitespace if needed
375+
j = ctx[end] ? j : findprev(!xml_isspace, data, j) # preserving whitespace if needed
373376
if last_type === RawElementClose || last_type === RawElementSelfClosed|| last_type === RawDocument
374377
# Maybe drop pure-whitespace inter-element text nodes?
375378
# (e.g. whitespace between a closing and an opening tag which would otherwise make an orphan text node)
376-
#if all(isspace, @view data[i:j]) && depth > 1
379+
#if all(xml_isspace, @view data[i:j]) && depth > 1
377380
# return next(Raw(type, depth, j, 0, data, ctx, has_xml_space))
378381
#end
379382
end
@@ -421,15 +424,15 @@ function next_xml_space(o::Raw)
421424
end
422425
return Raw(type, depth, i, j - i, data, ctx, has_xml_space)
423426
end
424-
#
427+
425428
function next_no_xml_space(o::Raw) # same as v0.3.5
426429
i = o.pos + o.len + 1
427430
depth = o.depth
428431
data = o.data
429432
type = o.type
430433
has_xml_space = o.has_xml_space
431434
ctx = [false]
432-
i = findnext(!isspace, data, i)
435+
i = findnext(!xml_isspace, data, i)
433436
if isnothing(i)
434437
return nothing
435438
end
@@ -441,7 +444,7 @@ function next_no_xml_space(o::Raw) # same as v0.3.5
441444
if c !== '<'
442445
type = RawText
443446
j = findnext(==(UInt8('<')), data, i) - 1
444-
j = findprev(!isspace, data, j) # "rstrip"
447+
j = findprev(!xml_isspace, data, j) # "rstrip"
445448
elseif c === '<'
446449
c2 = Char(o.data[i+1])
447450
if c2 === '!'
@@ -514,7 +517,7 @@ function prev_no_xml_space(o::Raw) # same as v0.3.5
514517
ctx = has_xml_space ? copy(o.ctx) : [false]
515518
type === RawDocument && return nothing
516519
j = o.pos - 1
517-
j = findprev(!isspace, data, j)
520+
j = findprev(!xml_isspace, data, j)
518521
if isnothing(j)
519522
return Raw(data, has_xml_space, ctx) # RawDocument
520523
end
@@ -523,7 +526,7 @@ function prev_no_xml_space(o::Raw) # same as v0.3.5
523526
if c !== '>' # text
524527
type = RawText
525528
i = findprev(==(UInt8('>')), data, j) + 1
526-
i = findnext(!isspace, data, i) # "lstrip"
529+
i = findnext(!xml_isspace, data, i) # "lstrip"
527530
elseif c === '>'
528531
c2 = Char(o.data[j-1])
529532
if c2 === '-'
@@ -562,3 +565,4 @@ function prev_no_xml_space(o::Raw) # same as v0.3.5
562565
end
563566
return Raw(type, depth, i, j - i, data, ctx, has_xml_space)
564567
end
568+

test/runtests.jl

Lines changed: 49 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -147,13 +147,13 @@ end
147147
@test text_content == "hello"
148148
n=XML.next(n)
149149
text_content = XML.write(n)
150-
@test text_content == "<text3 xml:space=\"preserve\">\n hello \n <text3b> preserve </text3b>\n</text3>"
150+
@test text_content == "<text3 xml:space=\"preserve\"> hello <text3b> preserve </text3b></text3>"
151151
n=XML.prev(n)
152152
text_content = XML.write(n)
153153
@test text_content == "hello"
154154
n=XML.next(n)
155155
text_content = XML.write(n)
156-
@test text_content == "<text3 xml:space=\"preserve\">\n hello \n <text3b> preserve </text3b>\n</text3>"
156+
@test text_content == "<text3 xml:space=\"preserve\"> hello <text3b> preserve </text3b></text3>"
157157
n=XML.next(n)
158158
text_content = XML.write(n)
159159
@test text_content == " hello "
@@ -183,13 +183,13 @@ end
183183
@test text_content == " hello "
184184
n=XML.prev(n)
185185
text_content = XML.write(n)
186-
@test text_content == "<text3 xml:space=\"preserve\">\n hello \n <text3b> preserve </text3b>\n</text3>"
186+
@test text_content == "<text3 xml:space=\"preserve\"> hello <text3b> preserve </text3b></text3>"
187187
n=XML.next(n)
188188
text_content = XML.write(n)
189189
@test text_content == " hello "
190190
n=XML.prev(n)
191191
text_content = XML.write(n)
192-
@test text_content == "<text3 xml:space=\"preserve\">\n hello \n <text3b> preserve </text3b>\n</text3>"
192+
@test text_content == "<text3 xml:space=\"preserve\"> hello <text3b> preserve </text3b></text3>"
193193
n=XML.prev(n)
194194
text_content = XML.write(n)
195195
@test text_content == "hello"
@@ -201,7 +201,7 @@ end
201201
@test text_content == "<text/>"
202202
n=XML.prev(n)
203203
text_content = XML.write(n)
204-
@test text_content == "<root>\n <text/>\n <text2>hello</text2>\n <text3 xml:space=\"preserve\">\n hello \n <text3b> preserve </text3b>\n </text3>\n <text4 xml:space=\"preserve\"/>\n <text5/>\n</root>"
204+
@test text_content == "<root>\n <text/>\n <text2>hello</text2>\n <text3 xml:space=\"preserve\"> hello <text3b> preserve </text3b></text3>\n <text4 xml:space=\"preserve\"/>\n <text5/>\n</root>"
205205
end
206206

207207
@testset "depth and parent" begin
@@ -428,19 +428,18 @@ end
428428
@test XML.value(d2[1][6][1]) == " after default gap "
429429
@test XML.value(d2[1][7]) == "\n"
430430
end
431-
432-
# @testset "XML whitespace vs Unicode whitespace" begin
433-
# nbsp = "\u00A0"
434-
# s = """<root>
435-
# <a> x\t\n </a>
436-
# <b>$(nbsp) y $(nbsp)</b>
437-
# <c xml:space="default">$(nbsp) z $(nbsp)</c>
438-
# </root>"""
439-
# d = XML.parse(XML.Node, s)
440-
# @test XML.value(d[1][1][1]) == "x"
441-
# @test XML.value(d[1][2][1]) == "$(nbsp) y $(nbsp)"
442-
# @test XML.value(d[1][3][1]) == "$(nbsp) z $(nbsp)"
443-
# end
431+
@testset "XML whitespace vs Unicode whitespace" begin
432+
nbsp = "\u00A0"
433+
s = """<root>
434+
<a> x\t\n </a>
435+
<b>$(nbsp) y $(nbsp)</b>
436+
<c xml:space="default">$(nbsp) z $(nbsp)</c>
437+
</root>"""
438+
d = XML.parse(XML.Node, s)
439+
@test XML.value(d[1][1][1]) == "x"
440+
@test XML.value(d[1][2][1]) == "$(nbsp) y $(nbsp)"
441+
@test XML.value(d[1][3][1]) == "$(nbsp) z $(nbsp)"
442+
end
444443

445444
@testset "CDATA/Comment/PI boundaries" begin
446445
s = """<root>
@@ -485,17 +484,21 @@ end
485484
@test XML.value(d[1][1]) == "a"
486485
end
487486

488-
# @testset "entities expanding to whitespace" begin
489-
# s = """<root>
490-
# <a> &#x20; a &#x0A; </a>
491-
# <b xml:space="preserve">&#x20; b &#x0A;</b>
492-
# <c>&#xA0;c&#xA0;</c>
493-
# </root>"""
494-
# d = XML.parse(XML.Node, s)
495-
# @test XML.value(d[1][1][1]) == "a"
496-
# @test XML.value(d[1][2][1]) == " b \n"
497-
# @test XML.value(d[1][3][1]) == "\u00A0c\u00A0"
498-
# end
487+
@testset "entities expanding to whitespace" begin
488+
chr1="\u0020"
489+
chr2="\u000A"
490+
chr3="\u00A0"
491+
492+
s = """<root>
493+
<a> $(chr1) a $(chr2) </a>
494+
<b xml:space="preserve">$(chr1) b $(chr2)</b>
495+
<c>$(chr3)c$(chr3)</c>
496+
</root>"""
497+
d = XML.parse(XML.Node, s)
498+
@test XML.value(d[1][1][1]) == "a"
499+
@test XML.value(d[1][2][1]) == " b \n"
500+
@test XML.value(d[1][3][1]) == "$(chr3)c$(chr3)"
501+
end
499502

500503
@testset "invalid values and placement" begin
501504
s_bad = """<root><x xml:space="weird"> t </x></root>"""
@@ -534,23 +537,22 @@ end
534537
@test reverse(back)[2:end] == toks[1:end-1]
535538
end
536539

537-
# @testset "write/read roundtrip extremes" begin
538-
# XML.write doesn't respect xml:space="preserve" in the current implementation so roundtrip isn't possible.
539-
# xml = """<root>
540-
# <p xml:space="preserve"> </p>
541-
# <q> </q>
542-
# <r xml:space="default"> r </r>
543-
# <s xml:space="preserve"> pre <t/> post </s>
544-
# </root>"""
545-
# n = XML.parse(XML.Node, xml)
546-
# io = IOBuffer(); XML.write(io, n)
547-
# n2 = XML.parse(XML.Node, String(take!(io)))
548-
# @test n == n2
549-
# @test XML.write(n2[1][1]) == "<p xml:space=\"preserve\"> </p>"
550-
# @test XML.write(n2[1][2]) == "<q/>"
551-
# @test XML.value(n2[1][3][1]) == "r"
552-
# @test XML.write(n2[1][4]) == "<s xml:space=\"preserve\"> pre <t/> post </s>"
553-
# end
540+
@testset "write/read roundtrip extremes" begin
541+
xml = """<root>
542+
<p xml:space="preserve"> </p>
543+
<q> </q>
544+
<r xml:space="default"> r </r>
545+
<s xml:space="preserve"> pre <t/> post </s>
546+
</root>"""
547+
n = XML.parse(XML.Node, xml)
548+
io = IOBuffer(); XML.write(io, n)
549+
n2 = XML.parse(XML.Node, String(take!(io)))
550+
@test n == n2
551+
@test XML.write(n2[1][1]) == "<p xml:space=\"preserve\"> </p>"
552+
@test XML.write(n2[1][2]) == "<q/>"
553+
@test XML.value(n2[1][3][1]) == "r"
554+
@test XML.write(n2[1][4]) == "<s xml:space=\"preserve\"> pre <t/> post </s>"
555+
end
554556

555557
@testset "self-closing/empty/whitespace-only children" begin
556558
s = """<root>
@@ -641,3 +643,4 @@ end
641643
xyz = XML.Element("point"; kw...)
642644
@test collect(keys(attributes(xyz))) == string.(collect('a':'z'))
643645
end
646+

0 commit comments

Comments
 (0)