Skip to content

Respect xml:space="preserve" (#43) #45

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
192 changes: 116 additions & 76 deletions src/raw.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@
x === RawDocument ? Document :
nothing

#struct XMLSpaceContext
# preserve_space::Vector{Bool} # Stack to track xml:space state
#end
#XMLSpaceContext() = XMLSpaceContext([false]) # Default is not preserving

#-----------------------------------------------------------------------------# Raw
"""
Raw(filename::String)
Expand Down Expand Up @@ -64,8 +69,10 @@ struct Raw
pos::Int
len::Int
data::Vector{UInt8}
ctx::Vector{Bool} # Context for xml:space (Vector so mutable)
end
Raw(data::Vector{UInt8}) = Raw(RawDocument, 0, 0, 0, data)
Raw(data::Vector{UInt8}, ctx=[false]) = Raw(RawDocument, 0, 0, 0, data, ctx)


Base.read(filename::String, ::Type{Raw}) = isfile(filename) ?
Raw(Mmap.mmap(filename)) :
Expand Down Expand Up @@ -117,7 +124,7 @@ end
# starting at position i, return attributes up until the next '>' or '?' (DTD)
function get_attributes(data, i, j)
i = name_start(data, i)
i > j && return nothing
(isnothing(j) || isnothing(i) || i > j) && return nothing
out = OrderedDict{String, String}()
while !isnothing(i) && i < j
key, i = get_name(data, i)
Expand Down Expand Up @@ -161,7 +168,18 @@ function attributes(o::Raw)
i = o.pos
i = name_start(o.data, i)
i = name_stop(o.data, i)
get_attributes(o.data, i + 1, o.pos + o.len)
out=get_attributes(o.data, i + 1, o.pos + o.len)
if o.type === RawElementOpen && !isnothing(out) && haskey(out, "xml:space")
# xml:space is present: "preserve" turns whitespace preservation on, "default" turns it off, any other value is an error
if out["xml:space"] == "preserve"
o.ctx[1]= true
elseif out["xml:space"] == "default"
o.ctx[1] = false
else
error("Invalid value for xml:space attribute: $(out["xml:space"]). Must be 'preserve' or 'default'.")
end
end
out
elseif o.type === RawDeclaration
get_attributes(o.data, o.pos + 6, o.pos + o.len)
else
Expand Down Expand Up @@ -198,7 +216,11 @@ function children(o::Raw)
depth = o.depth
out = Raw[]
for item in xml_nodes(o)
item.depth == depth + 1 && push!(out, item)
if item.depth == depth + 1
item.ctx[1] = o.ctx[1] # inherit the context
o.type==RawElementOpen && attributes(item)
push!(out, item)
end
item.depth == depth && break
o.type === RawDocument && item.depth == 2 && break # break if we've seen the doc root
end
Expand Down Expand Up @@ -247,55 +269,64 @@ function next(o::Raw)
depth = o.depth
data = o.data
type = o.type
i = findnext(!isspace, data, i) # skip insignificant whitespace
isnothing(i) && return nothing
ctx = o.ctx
k = findnext(!isspace, data, i)
if (isnothing(k) || length(String(o.data[o.pos + o.len + 1:end]))==0)
return nothing
end
i = (ctx[1]) ? i : k
j = i + 1
c = Char(o.data[k])
d = Char(o.data[k+1])
if type === RawElementOpen || type === RawDocument
depth += 1
end
c = Char(o.data[i])
j = i + 1
if c !== '<'
if c !== '<' || type === RawElementOpen && d === '/' && (ctx[1])
type = RawText
j = findnext(==(UInt8('<')), data, i) - 1
j = findprev(!isspace, data, j) # "rstrip"
elseif c === '<'
c2 = Char(o.data[i + 1])
if c2 === '!'
c3 = Char(o.data[i + 2])
if c3 === '-'
type = RawComment
j = findnext(Vector{UInt8}("-->"), data, i)[end]
elseif c3 === '['
type = RawCData
j = findnext(Vector{UInt8}("]]>"), data, i)[end]
elseif c3 === 'D' || c3 == 'd'
type = RawDTD
j = findnext(==(UInt8('>')), data, i)
while sum(==(UInt8('>')), data[i:j]) != sum(==(UInt8('<')), data[i:j])
j = findnext(==(UInt8('>')), data, j + 1)
j = (ctx[1]) ? j : findprev(!isspace, data, j) # preserving whitespace if needed
else
i=k
j=k+1
if c === '<'
c2 = Char(o.data[i + 1])
if c2 === '!'
c3 = Char(o.data[i + 2])
if c3 === '-'
type = RawComment
j = findnext(Vector{UInt8}("-->"), data, i)[end]
elseif c3 === '['
type = RawCData
j = findnext(Vector{UInt8}("]]>"), data, i)[end]
elseif c3 === 'D' || c3 == 'd'
type = RawDTD
j = findnext(==(UInt8('>')), data, i)
while sum(==(UInt8('>')), data[k:j]) != sum(==(UInt8('<')), data[i:j])
j = findnext(==(UInt8('>')), data, j + 1)
end
end
end
elseif c2 === '?'
if get_name(data, i + 2)[1] == "xml"
type = RawDeclaration
else
type = RawProcessingInstruction
end
j = findnext(Vector{UInt8}("?>"), data, i)[end]
elseif c2 === '/'
type = RawElementClose
depth -= 1
j = findnext(==(UInt8('>')), data, i)
else
j = findnext(==(UInt8('>')), data, i)
if data[j-1] === UInt8('/')
type = RawElementSelfClosed
elseif c2 === '?'
if get_name(data, i + 2)[1] == "xml"
type = RawDeclaration
else
type = RawProcessingInstruction
end
j = findnext(Vector{UInt8}("?>"), data, i)[end]
elseif c2 === '/'
type = RawElementClose
depth -= 1
j = findnext(==(UInt8('>')), data, i)
else
type = RawElementOpen
j = findnext(==(UInt8('>')), data, i)
if data[j-1] === UInt8('/')
type = RawElementSelfClosed
else
type = RawElementOpen
end
end
end
end
return Raw(type, depth, i, j - i, data)
return Raw(type, depth, i, j - i, data, ctx)
end

#-----------------------------------------------------------------------------# prev Raw
Expand All @@ -308,52 +339,61 @@ function prev(o::Raw)
depth = o.depth
data = o.data
type = o.type
ctx = o.ctx
type === RawDocument && return nothing
j = o.pos - 1
j = findprev(!isspace, data, j) # skip insignificant whitespace
isnothing(j) && return Raw(data) # RawDocument
k = findprev(!isspace, data, j)
if isnothing(k) || length(String(o.data[o.pos + o.len + 1:end]))==0
return Raw(data, ctx) # RawDocument
end
j = (ctx[1]) ? j : k
c = Char(o.data[j])
d = Char(data[findprev(==(UInt8('<')), data, j)+1])
i = j - 1
next_type = type
if c !== '>' # text
if c !== '>' || type === RawElementClose && d !== '/' && (ctx[1]) # text or empty whitespace
type = RawText
i = findprev(==(UInt8('>')), data, j) + 1
i = findnext(!isspace, data, i) # "lstrip"
elseif c === '>'
c2 = Char(o.data[j - 1])
if c2 === '-'
type = RawComment
i = findprev(Vector{UInt8}("<--"), data, j)[1]
elseif c2 === ']'
type = RawCData
i = findprev(Vector{UInt8}("<![CData["), data, j)[1]
elseif c2 === '?'
i = findprev(Vector{UInt8}("<?"), data, j)[1]
if get_name(data, i + 2)[1] == "xml"
type = RawDeclaration
i=findprev(==(UInt8('>')), data, j) + 1
i = (ctx[1]) ? i : findprev(!isspace, data, i) # If preserving whitespace, retain leading and trailing whitespace
else
j=k
i=k-1
if c === '>'
c2 = Char(o.data[j - 1])
if c2 === '-'
type = RawComment
i = findprev(Vector{UInt8}("<--"), data, j)[1]
elseif c2 === ']'
type = RawCData
i = findprev(Vector{UInt8}("<![CData["), data, j)[1]
elseif c2 === '?'
i = findprev(Vector{UInt8}("<?"), data, j)[1]
if get_name(data, i + 2)[1] == "xml"
type = RawDeclaration
else
type = RawProcessingInstruction
end
else
type = RawProcessingInstruction
end
i = findprev(==(UInt8('<')), data, j)
char = Char(data[i+1])
if char === '/'
type = RawElementClose
elseif char === '!'
type = DTD
elseif isletter(char) || char === '_'
type = Char(o.data[j - 2]) === '/' ? RawElementSelfClosed : RawElementOpen
else
error("Should be unreachable. Unexpected data: <$char ... $c3$c2$c1>.")
end
end
else
i = findprev(==(UInt8('<')), data, j)
char = Char(data[i+1])
if char === '/'
type = RawElementClose
elseif char === '!'
type = DTD
elseif isletter(char) || char === '_'
type = Char(o.data[j - 2]) === '/' ? RawElementSelfClosed : RawElementOpen
else
error("Should be unreachable. Unexpected data: <$char ... $c3$c2$c1>.")
end
error("Unreachable reached in XML.prev")
end
else
error("Unreachable reached in XML.prev")
end
if type !== RawElementOpen && next_type === RawElementClose
depth += 1
elseif type == RawElementOpen && next_type !== RawElementClose
depth -= 1
end
return Raw(type, depth, i, j - i, data)
return Raw(type, depth, i, j - i, data, ctx)
end
108 changes: 108 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,114 @@ end
end
end

#-----------------------------------------------------------------------------# Preserve whitespace
# Tests for xml:space handling: text inside an element marked xml:space="preserve"
# is expected to keep its whitespace, while text under xml:space="default" (or with
# no attribute at all) is expected to come back stripped.
# NOTE(review): the chained indexing below (doc[1][1][1] etc.) presumably walks
# document -> root element -> child element -> text node; confirm against XML.Node.
# NOTE(review): every assertion depends on the exact whitespace inside the string
# literals; confirm the fixtures' spacing/indentation is intact before trusting a failure.
@testset "xml:space" begin
@testset "Basic xml:space functionality" begin

# Test 1: with xml:space="preserve", whitespace-only content is kept as a text value
xml1 = """<root><text xml:space="preserve"> </text></root>"""
doc1 = parse(XML.Node, xml1)
text_content = XML.value(doc1[1][1][1])
@test text_content == " "

# Test 2: with xml:space="preserve", leading and trailing whitespace is kept
xml2 = """<root><text xml:space="preserve"> leading and trailing spaces </text></root>"""
doc2 = parse(XML.Node, xml2)
text_content = XML.value(doc2[1][1][1])
@test text_content == " leading and trailing spaces "

# Test 3: without xml:space, a whitespace-only element is written back as a self-closing tag
xml3 = """<root><text> </text></root>"""
doc3 = XML.parse(XML.Node, xml3)
text_content = XML.write(doc3[1][1])
@test text_content == "<text/>"

# Test 4: without xml:space, surrounding whitespace is stripped from the text value
xml4 = """<root><text> gets normalized </text></root>"""
doc4 = XML.parse(XML.Node, xml4)
text_content = XML.value(doc4[1][1][1])
@test text_content == "gets normalized"

# Test 5: an explicit xml:space="default" strips surrounding whitespace, same as no attribute
xml5 = """<root><text xml:space="default"> gets normalized </text></root>"""
doc5 = XML.parse(XML.Node, xml5)
text_content = XML.value(doc5[1][1][1])
@test text_content == "gets normalized"
end

@testset "xml:space inheritance" begin
# Test 6: Children inherit parent's xml:space="preserve"
xml6 = """<root xml:space="preserve">
<parent> parent text
<child> child text </child>
</parent>
</root>"""
doc6 = XML.parse(XML.Node, xml6)
# Both parent and child should preserve whitespace.
# The parent check uses `contains` rather than `==`, so only the trailing
# space + newline after "parent text" is pinned, not the full text value.
@test contains(XML.value(doc6[1][1][1]), "parent text \n")
@test XML.value(doc6[1][1][2][1]) == " child text "

# Test 7: xml:space="default" overrides parent's "preserve"
xml7 = """<root xml:space="preserve">
<child xml:space="default"> normalized despite parent </child>
</root>"""
doc7 = XML.parse(XML.Node, xml7)
@test XML.value(doc7[1][1][1]) == "normalized despite parent"
end

@testset "Nesting scenarios" begin
# Test 8: Multiple levels of xml:space changes
xml8 = """<root xml:space="preserve">
<level1> preserved
<level2 xml:space="default"> normalized
<level3 xml:space="preserve"> preserved again </level3>
</level2>
</level1>
</root>"""
doc8 = XML.parse(XML.Node, xml8)

# level1 should preserve (inherits from root)
# NOTE(review): the expected value has whitespace after the newline, so the fixture
# presumably indents the <level2> line — confirm the literal's indentation is intact.
level1_text = XML.value(doc8[1][1][1])
@test level1_text == " preserved \n "

# level2 should normalize (explicit xml:space="default")
level2_text = XML.value(doc8[1][1][2][1])
@test level2_text == "normalized"

# level3 should preserve (explicit xml:space="preserve")
level3_text = XML.value(doc8[1][1][2][2][1])
@test level3_text == " preserved again "

# Test 9: the same nesting repeated in a second sibling subtree; only the second
# subtree (level1b/level2b/level3b) is asserted on below
xml9 = """<root xml:space="preserve">
<level1> preserved
<level2 xml:space="default"> normalized
<level3 xml:space="preserve"> preserved again </level3>
</level2>
</level1>
<level1b> preserved b
<level2b xml:space="default"> normalized b
<level3b xml:space="preserve"> preserved again b </level3b>
</level2b>
</level1b>
</root>"""
doc9 = XML.parse(XML.Node, xml9)

# level1b should preserve (inherits from root)
# NOTE(review): doc9[1][2] is presumably level1b; with preserved whitespace the text
# between </level1> and <level1b> might also be a child of root — confirm the index.
level1b_text = XML.value(doc9[1][2][1])
@test level1b_text == " preserved b \n "

# level2b should normalize (explicit xml:space="default")
level2b_text = XML.value(doc9[1][2][2][1])
@test level2b_text == "normalized b"

# level3b should preserve (explicit xml:space="preserve")
level3b_text = XML.value(doc9[1][2][2][2][1])
@test level3b_text == " preserved again b "

end
end

#-----------------------------------------------------------------------------# roundtrip
@testset "read/write/read roundtrip" begin
for path in all_files
Expand Down
Loading