Skip to content

Commit ee7a6bb

Browse files
committed
Improve iterate for UTF8Str
1 parent a1b29b5 commit ee7a6bb

File tree

2 files changed

+14
-14
lines changed

2 files changed

+14
-14
lines changed

src/support.jl

+3-3
Original file line number | Diff line number | Diff line change
@@ -948,15 +948,15 @@ function repeat(ch::C, cnt::Integer) where {C<:Union{UCS2Chr,UTF32Chr}}
948948
cnt < 0 && repeaterr(cnt)
949949
if ch%UInt32 <= 0xff
950950
buf, pnt = _allocate(UInt8, cnt)
951-
cnt == 1 && set_codepoint!(pnt, ch%UInt8) : _memset(pnt, ch%UInt8, cnt)
951+
cnt == 1 && set_codeunit!(pnt, ch%UInt8) : _memset(pnt, ch%UInt8, cnt)
952952
Str(ifelse(ch%UInt8 <= 0x7f, ASCIICSE, LatinCSE), buf)
953953
elseif C == UCS2Chr || ch%UInt32 <= 0xffff
954954
buf, pnt = _allocate(UInt16, cnt)
955-
cnt == 1 && set_codepoint!(pnt, ch%UInt16) : _aligned_set(pnt, ch%UInt16, cnt)
955+
cnt == 1 && set_codeunit!(pnt, ch%UInt16) : _aligned_set(pnt, ch%UInt16, cnt)
956956
Str(UCS2CSE, buf)
957957
else
958958
buf, pnt = _allocate(UInt32, cnt)
959-
cnt == 1 && set_codepoint!(pnt, ch%UInt32) : _aligned_set(pnt, ch%UInt32, cnt)
959+
cnt == 1 && set_codeunit!(pnt, ch%UInt32) : _aligned_set(pnt, ch%UInt32, cnt)
960960
Str(UTF32CSE, buf)
961961
end
962962
end

src/utf8.jl

+11-11
Original file line number | Diff line number | Diff line change
@@ -351,23 +351,23 @@ end
351351
end
352352
=#
353353

354+
# Slow path of `iterate` for a lead code unit `ch` that is not handled by the
# caller's single-code-unit fast path.
#
# Arguments:
#   ch  - the code unit already read by the caller
#   str - the string being iterated (passed to `index_error` only)
#   pnt - pointer from which `ch` was read
#   pos - index of `ch` within `str`
#
# Returns a `(character, next_pos)` tuple, where `next_pos` is `pos` advanced
# by 2, 3 or 4 depending on which range `ch` falls in.
function _iterate_utf8(ch, str, pnt, pos)
    if ch < 0xe0
        # Below 0xc0 the code unit cannot start a sequence, so the error helper
        # is invoked instead of decoding.
        # NOTE(review): presumably `index_error` throws and never returns -
        # confirm against its definition.
        chr = ch < 0xc0 ? index_error(str, pos) :
                          UTF32Chr(get_utf8_2byte(pnt + 1, ch))
        return (chr, pos + 2)
    end
    if ch < 0xf0
        # 0xe0:0xef - the helper is handed the pointer to the last of 3 code units.
        return (UTF32Chr(get_utf8_3byte(pnt + 2, ch)), pos + 3)
    end
    # 0xf0 and above - the helper is handed the pointer to the last of 4 code units.
    return (UTF32Chr(get_utf8_4byte(pnt + 3, ch)), pos + 4)
end
363+
354364
# Iteration step for `MS_UTF8` strings.
#
# Returns `nothing` once `pos` is past the last code unit of `str`; otherwise
# returns a `(character, next_pos)` tuple. A non-positive `pos` is reported via
# `boundserr` unless bounds checking has been elided by the caller.
#
# A code unit of 0x7f or less is converted directly and advances by one; any
# other lead code unit is handed off to `_iterate_utf8`.
@propagate_inbounds function iterate(str::MS_UTF8, pos::Integer=1)
    if pos > ncodeunits(str)
        return nothing
    end
    @boundscheck pos <= 0 && boundserr(str, pos)
    # Keep `str` rooted while its memory is accessed through a raw pointer.
    @preserve str begin
        ptr = pointer(str) + pos - 1
        lead = get_codeunit(ptr)
        if lead <= 0x7f
            (UTF32Chr(lead), pos + 1)
        else
            _iterate_utf8(lead, str, ptr, pos)
        end
    end
end
373373

0 commit comments

Comments
 (0)