Skip to content

Commit 3163363

Browse files
committed
Update to use tables for case
1 parent bc78697 commit 3163363

File tree

8 files changed

+648
-147
lines changed

8 files changed

+648
-147
lines changed

src/StrBase.jl

+1
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ include("types.jl")
4848
@static V6_COMPAT && include("compat.jl")
4949
@static NEW_ITERATE && include("fixparse.jl")
5050
include("chars.jl")
51+
include("charcase.jl")
5152
include("access.jl")
5253
include("traits.jl")
5354
include("utf8proc.jl")

src/casefold.jl

+81-49
Original file line numberDiff line numberDiff line change
@@ -5,31 +5,16 @@ Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones
55
Licensed under MIT License, see LICENSE.md
66
=#
77

8-
_wide_lower_l(c) = ifelse(c > (V6_COMPAT ? 0xdf : 0xde), c != 0xf7, c == 0xb5)
9-
10-
@inline _wide_lower_ch(ch) =
11-
ch <= 0x7f ? _islower_a(ch) : (ch > 0xff ? _islower_u(ch) : _wide_lower_l(ch))
12-
13-
@inline _isupper_ch(ch) =
14-
ch <= 0x7f ? _isupper_a(ch) : (ch > 0xff ? _isupper_u(ch) : _isupper_l(ch))
15-
16-
_wide_lower_latin(ch) = (ch == 0xb5) | (ch == 0xff) | (!V6_COMPAT && (ch == 0xdf))
17-
18-
_wide_out_upper(ch) =
19-
ifelse(ch == 0xb5, 0x39c,
20-
ifelse(ch == 0xff, 0x178, ifelse(!V6_COMPAT && ch == 0xdf, 0x1e9e, ch%UInt16)))
21-
22-
238
function uppercase_first(str::MaybeSub{S}) where {C<:ASCIICSE,S<:Str{C}}
249
(len = ncodeunits(str)) == 0 && return str
2510
@preserve str begin
2611
pnt = pointer(str)
2712
ch = get_codeunit(pnt)
2813
_islower_a(ch) || return str
29-
out = _allocate(len)
14+
buf, out = _allocate(UInt8, len)
3015
unsafe_copyto!(out, pnt, len)
3116
set_codeunit!(out, ch - 0x20)
32-
Str(C, out)
17+
Str(C, buf)
3318
end
3419
end
3520

@@ -39,10 +24,10 @@ function lowercase_first(str::MaybeSub{S}) where {C<:ASCIICSE,S<:Str{C}}
3924
pnt = pointer(str)
4025
ch = get_codeunit(pnt)
4126
_isupper_a(ch) || return str
42-
out = _allocate(len)
27+
buf, out = _allocate(UInt8, len)
4328
unsafe_copyto!(out, pnt, len)
4429
set_codeunit!(out, ch + 0x20)
45-
Str(C, out)
30+
Str(C, buf)
4631
end
4732
end
4833

@@ -119,7 +104,7 @@ function uppercase_first(str::MaybeSub{S}) where {C<:LatinCSE,S<:Str{C}}
119104
_can_upper(ch) || return str
120105
buf, out = _allocate(UInt8, len)
121106
set_codeunit!(out, ch - 0x20)
122-
len > 1 && unsafe_copyto!(out, pnt+1, len-1)
107+
len > 1 && unsafe_copyto!(out + 1, pnt+1, len-1)
123108
Str(C, buf)
124109
end
125110
end
@@ -154,10 +139,10 @@ function lowercase_first(str::MaybeSub{S}) where {C<:Latin_CSEs,S<:Str{C}}
154139
@preserve str begin
155140
pnt = pointer(str)
156141
ch = get_codeunit(pnt)
157-
_isupper(ch) || return str
142+
_isupper_al(ch) || return str
158143
buf, out = _allocate(UInt8, len)
159144
set_codeunit!(out, ch + 0x20)
160-
len > 1 && unsafe_copyto!(out, pnt+1, len-1)
145+
len > 1 && unsafe_copyto!(out+1, pnt+1, len-1)
161146
Str(C, buf)
162147
end
163148
end
@@ -176,7 +161,7 @@ function _upper(::Type{C}, beg::Ptr{UInt8}, off, len) where {C<:_LatinCSE}
176161
out += off
177162
while out < fin
178163
ch = get_codeunit(out)
179-
_can_upper(ch) && set_codeunit!(out, ch - 0x20)
164+
_islower(ch) && set_codeunit!(out, ch - 0x20)
180165
out += 1
181166
end
182167
Str(C, buf)
@@ -264,7 +249,7 @@ end
264249
# result must have at least one character > 0xff, so if the only character(s)
265250
# > 0xff became <= 0xff, then the result may need to be narrowed and returned as _LatinStr
266251

267-
function _lower(::Type{C}, beg, off, len) where {C<:_UCS2CSE}
252+
function _lower(::Type{C}, beg, off, len) where {C<:Union{_UCS2CSE}}
268253
CU = codeunit(C)
269254
buf, out = _allocate(CU, len)
270255
unsafe_copyto!(out, beg, len)
@@ -277,18 +262,20 @@ function _lower(::Type{C}, beg, off, len) where {C<:_UCS2CSE}
277262
_isupper_a(ch) && set_codeunit!(out, ch += 0x20)
278263
elseif ch <= 0xff
279264
_isupper_l(ch) && set_codeunit!(out, ch += 0x20)
280-
elseif _isupper_u(ch)
281-
ch = _lowercase_u(ch)
282-
flg = ch <= 0xff
283-
set_codeunit!(out, ch)
265+
elseif ch <= 0xffff
266+
if _can_lower_bmp(ch)
267+
ch = _lower_bmp(ch)
268+
flg = ch <= 0xff
269+
set_codeunit!(out, ch)
270+
end
284271
end
285272
out += sizeof(CU)
286273
end
287274
if flg && is_latin(buf)
288275
out = pointer(buf)
289-
buf = _allocate(len)
290-
_narrow!(pointer(buf), out, out + len)
291-
Str(_LatinCSE, buf)
276+
buf8 = _allocate(len)
277+
_narrow!(pointer(buf8), out, out + len)
278+
Str(_LatinCSE, buf8)
292279
else
293280
Str(C, buf)
294281
end
@@ -302,25 +289,74 @@ function _lower(::Type{C}, beg, off, len) where {C<:Union{UCS2CSE,UTF32_CSEs}}
302289
out += off
303290
while out < fin
304291
ch = get_codeunit(out)
305-
if ch <= 0x7f
306-
_isupper_a(ch) && set_codeunit!(out, ch += 0x20)
307-
elseif ch <= 0xff
308-
_isupper_l(ch) && set_codeunit!(out, ch += 0x20)
309-
elseif _isupper_u(ch)
310-
set_codeunit!(out, _lowercase_u(ch))
292+
if ch <= 0xff
293+
_isupper_al(ch) && set_codeunit!(out, ch += 0x20)
294+
elseif ch <= 0xffff
295+
_can_lower_bmp(ch) && set_codeunit!(out, _lower_bmp(ch))
296+
elseif ch <= 0x1ffff
297+
_can_lower_slp(ch) && set_codeunit!(out, _lower_slp(ch))
311298
end
312299
out += sizeof(CU)
313300
end
314301
Str(C, buf)
315302
end
316303

304+
# Return `str` with its first code unit lowercased (method for `_UCS2CSE` strings).
# `str` itself is returned when it is empty or when its first code unit has no
# lowercase mapping.  If the first code unit was above 0xff, its lowercase form is
# <= 0xff, and the `_check_mask_ul` test on the remainder succeeds, the result is
# built as a `_LatinCSE` string; otherwise it stays a `C` string.
#
# NOTE(review): Julia `Ptr` arithmetic is in bytes, and the loops elsewhere in this
# file advance by `sizeof(CU)`.  If `pointer(str)` is a `Ptr{UInt16}` here, `src + 1`
# and `src + len - 1` do not address whole code units -- confirm against the
# contracts of `_check_mask_ul` and `_narrow!`.
function lowercase_first(str::MaybeSub{S}) where {C<:_UCS2CSE,S<:Str{C}}
    len = ncodeunits(str)
    len == 0 && return str
    @preserve str begin
        src = pointer(str)
        ch = get_codeunit(src)
        lowerable =
            if ch <= 0xff
                _isupper_al(ch)
            elseif ch <= 0xffff
                _can_lower_bmp(ch)
            else
                ch <= 0x1ffff && _can_lower_slp(ch)
            end
        lowerable || return str
        cl = _lower_ch(ch)
        if ch > 0xff && cl <= 0xff && _check_mask_ul(src + 1, len - 1, _latin_mask(UInt16))
            # Narrow path: emit 8-bit code units
            buf8, dst8 = _allocate(UInt8, len)
            if len > 1
                _narrow!(dst8 + 1, src + 1, src + len - 1)
            end
            set_codeunit!(dst8, cl)
            Str(_LatinCSE, buf8)
        else
            # Same-width path: copy everything, then overwrite the first code unit
            buf, dst = _allocate(codeunit(C), len)
            if len > 1
                unsafe_copyto!(dst, src, len)
            end
            set_codeunit!(dst, cl)
            Str(C, buf)
        end
    end
end
326+
327+
# Return `str` with its first code unit replaced by `_title_ch` of that code unit
# (method for UCS2 and UTF-32 encoded strings).  `str` itself is returned when it is
# empty or when `_can_title_ch` rejects the first code unit.
function uppercase_first(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:Str{C}}
    len = ncodeunits(str)
    len == 0 && return str
    @preserve str begin
        src = pointer(str)
        first_cu = get_codeunit(src)
        if !_can_title_ch(first_cu)
            return str
        end
        buf, dst = _allocate(codeunit(C), len)
        # Copy the whole string; the first code unit is overwritten just below
        if len > 1
            unsafe_copyto!(dst, src, len)
        end
        set_codeunit!(dst, _title_ch(first_cu))
        Str(C, buf)
    end
end
339+
340+
# Return `str` with its first code unit replaced by `_lower_ch` of that code unit
# (method for `UCS2CSE` and UTF-32 encoded strings).  `str` itself is returned when
# it is empty or when `_can_lower_ch` rejects the first code unit.
function lowercase_first(str::MaybeSub{S}) where {C<:Union{UCS2CSE,UTF32_CSEs},S<:Str{C}}
    len = ncodeunits(str)
    len == 0 && return str
    @preserve str begin
        src = pointer(str)
        first_cu = get_codeunit(src)
        if !_can_lower_ch(first_cu)
            return str
        end
        buf, dst = _allocate(codeunit(C), len)
        # Copy the whole string; the first code unit is overwritten just below
        if len > 1
            unsafe_copyto!(dst, src, len)
        end
        set_codeunit!(dst, _lower_ch(first_cu))
        Str(C, buf)
    end
end
352+
317353
function lowercase(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:Str{C}}
318354
@preserve str begin
319355
CU = codeunit(C)
320356
pnt = beg = pointer(str)
321357
fin = beg + sizeof(str)
322358
while pnt < fin
323-
_isupper_ch(get_codeunit(pnt)) && return _lower(C, beg, pnt-beg, ncodeunits(str))
359+
_can_lower_ch(get_codeunit(pnt)) && return _lower(C, beg, pnt-beg, ncodeunits(str))
324360
pnt += sizeof(CU)
325361
end
326362
end
@@ -337,16 +373,12 @@ function _upper(::Type{C}, beg, off, len) where {C<:Union{UCS2_CSEs,UTF32_CSEs}}
337373
ch = get_codeunit(out)
338374
if ch <= 0x7f
339375
_islower_a(ch) && set_codeunit!(out, ch -= 0x20)
340-
elseif ch > 0xff
341-
_islower_u(ch) && set_codeunit!(out, _uppercase_u(ch))
342-
elseif _can_upper(ch)
343-
set_codeunit!(out, ch -= 0x20)
344-
elseif ch == 0xb5
345-
set_codeunit!(out, 0x39c)
346-
elseif ch == 0xff
347-
set_codeunit!(out, 0x178)
348-
elseif !V6_COMPAT && ch == 0xdf
349-
set_codeunit!(out, 0x1e9e)
376+
elseif ch <= 0xff
377+
set_codeunit!(out, _uppercase_l(ch))
378+
elseif ch <= 0xffff
379+
_can_upper_bmp(ch) && set_codeunit!(out, _upper_bmp(ch))
380+
elseif ch <= 0x1ffff
381+
_can_upper_slp(ch) && set_codeunit!(out, _upper_slp(ch))
350382
end
351383
out += sizeof(CU)
352384
end
@@ -359,7 +391,7 @@ function uppercase(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:St
359391
pnt = beg = pointer(str)
360392
fin = beg + sizeof(str)
361393
while pnt < fin
362-
_wide_lower_ch(get_codeunit(pnt)) && return _upper(C, beg, pnt-beg, ncodeunits(str))
394+
_can_upper_ch(get_codeunit(pnt)) && return _upper(C, beg, pnt-beg, ncodeunits(str))
363395
pnt += sizeof(CU)
364396
end
365397
str

src/charcase.jl

+130
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
#=
2+
Case folding for Unicode characters
3+
4+
Copyright 2018 Gandalf Software, Inc., Scott P. Jones
5+
Licensed under MIT License, see LICENSE.md
6+
=#
7+
8+
module CaseTables
9+
include("maketables.jl")
10+
11+
const ct, tupvec, offvec, bitvec, sizvecl, sizvecu = case_tables()
12+
end # module CaseTables
13+
14+
using .CaseTables
15+
16+
const ct = CaseTables.ct
17+
18+
using ModuleInterfaceTools
19+
@api extend ChrBase
20+
21+
# Latin-1 test used by `_can_upper_ch` for code points in 0x80-0xff.
# Above the threshold (0xdf when V6_COMPAT is set, otherwise 0xde) every value
# except 0xf7 qualifies; at or below the threshold only 0xb5 does.
function _can_upper_lat(c)
    threshold = V6_COMPAT ? 0xdf : 0xde
    return ifelse(c > threshold, c != 0xf7, c == 0xb5)
end
22+
23+
# True for 0xb5, 0xff and (unless V6_COMPAT is set) 0xdf -- the same three values
# that `_wide_out_upper` maps to results above 0xff.
function _wide_lower_latin(ch)
    sharp_s = !V6_COMPAT && (ch == 0xdf)
    return (ch == 0xb5) | (ch == 0xff) | sharp_s
end
24+
25+
# Map 0xb5 -> 0x39c, 0xff -> 0x178 and (unless V6_COMPAT is set) 0xdf -> 0x1e9e;
# any other value is returned converted to UInt16 with `%`.
function _wide_out_upper(ch)
    ch == 0xb5 && return 0x39c
    ch == 0xff && return 0x178
    (!V6_COMPAT && ch == 0xdf) && return 0x1e9e
    return ch % UInt16
end
28+
29+
# Test one bit in `CaseTables.bitvec[off]`.  `off == 0` yields `false`.
# Bits 5-8 of `ch` pick the word (1-based), bits 0-4 pick the bit inside that word.
function _check_tab(off, ch)
    off == 0 && return false
    word = CaseTables.bitvec[off][((ch >>> 5) & 0xf) + 1]
    bit = UInt32(1) << (ch & 0x1f)
    return (word & bit) != 0
end
31+
32+
# Two-level lookup: `CaseTables.offvec[off]` (indexed by bits 5-9 of `ch`) yields an
# index into `CaseTables.tupvec`, whose entry is indexed by bits 0-4 of `ch`.
# A zero at either level means "no entry", in which case `ch` is returned unchanged.
@inline function _get_tab(off, ch)
    off == 0 && return ch
    row = CaseTables.offvec[off][((ch >>> 5) & 0x1f) + 1]
    row == 0 && return ch
    return CaseTables.tupvec[row][(ch & 0x1f) + 1]
end
35+
36+
# Look `ch` up through the first entry of `ct.u_tab`.
@inline function _upper_lat(ch)
    return _get_tab(ct.u_tab[1], ch)
end
37+
38+
# Look `ch` up in `ct.u_tab`.  Bit `ch >>> 9` of `ct.can_u` gates the lookup: when
# it is clear, `ch` is returned unchanged.
@inline function _upper_bmp(ch)
    blk = ch >>> 9
    ((ct.can_u >>> blk) & 1) == 0 && return ch
    return _get_tab(ct.u_tab[(blk >> 1) + 1], ch)
end
40+
41+
# Look `ch` up in `ct.l_tab`.  Bit `ch >>> 9` of `ct.can_l` gates the lookup: when
# it is clear, `ch` is returned unchanged.
@inline function _lower_bmp(ch)
    blk = ch >>> 9
    ((ct.can_l >>> blk) & 1) == 0 && return ch
    return _get_tab(ct.l_tab[(blk >> 1) + 1], ch)
end
43+
44+
# Look `ch` up in `ct.t_tab`.  The gate is the same `ct.can_u` bit mask that
# `_upper_bmp` uses: when bit `ch >>> 9` is clear, `ch` is returned unchanged.
@inline function _title_bmp(ch)
    blk = ch >>> 9
    ((ct.can_u >>> blk) & 1) == 0 && return ch
    return _get_tab(ct.t_tab[(blk >> 1) + 1], ch)
end
46+
47+
# Look `ch` up in `ct.u_tab`.  The gate bit is taken from `ct.s_can_u`, at the
# position given by the low 7 bits of `ch >>> 9`; when it is clear, `ch` is
# returned unchanged.
@inline function _upper_slp(ch)
    blk = ch >>> 9
    ((ct.s_can_u >>> (blk & 0x7f)) & 1) == 0 && return ch
    return _get_tab(ct.u_tab[(blk >> 1) + 1], ch)
end
49+
50+
# Look `ch` up in `ct.l_tab`.  The gate bit is taken from `ct.s_can_l`, at the
# position given by the low 7 bits of `ch >>> 9`; when it is clear, `ch` is
# returned unchanged.
@inline function _lower_slp(ch)
    blk = ch >>> 9
    ((ct.s_can_l >>> (blk & 0x7f)) & 1) == 0 && return ch
    return _get_tab(ct.l_tab[(blk >> 1) + 1], ch)
end
52+
53+
# Range 0x0000-0xffff: true when bit `ch >>> 9` of `ct.can_l` is set and the
# bitmap selected through `ct.can_l_tab` has the bit for `ch` set.
@inline function _can_lower_bmp(ch)
    blk = ch >>> 9
    ((ct.can_l >>> blk) & 1) == 0 && return false
    return _check_tab(ct.can_l_tab[blk + 1], ch)
end
56+
57+
# Range 0x10000-0x1ffff: true when the `ct.s_can_l` bit selected by the low 7 bits
# of `ch >>> 9` is set and the bitmap selected through `ct.can_l_tab` has the bit
# for `ch` set.
@inline function _can_lower_slp(ch)
    blk = ch >>> 9
    ((ct.s_can_l >>> (blk & 0x7f)) & 1) == 0 && return false
    return _check_tab(ct.can_l_tab[blk + 1], ch)
end
60+
61+
# Range 0x0000-0xffff: true when bit `ch >>> 9` of `ct.can_u` is set and the
# bitmap selected through `ct.can_u_tab` has the bit for `ch` set.
@inline function _can_upper_bmp(ch)
    blk = ch >>> 9
    ((ct.can_u >>> blk) & 1) == 0 && return false
    return _check_tab(ct.can_u_tab[blk + 1], ch)
end
64+
65+
# Range 0x10000-0x1ffff: true when the `ct.s_can_u` bit selected by the low 7 bits
# of `ch >>> 9` is set and the bitmap selected through `ct.can_u_tab` has the bit
# for `ch` set.
@inline function _can_upper_slp(ch)
    blk = ch >>> 9
    ((ct.s_can_u >>> (blk & 0x7f)) & 1) == 0 && return false
    return _check_tab(ct.can_u_tab[blk + 1], ch)
end
68+
69+
#=
70+
# Handle range 0x0000-0xffff
71+
@inline _can_title_bmp(ch) =
72+
(t = (ch >>> 9); ((ct.can_t >>> t) & 1) != 0 && _check_tab(ct.can_t_tab[t+1], ch))
73+
=#
74+
const _can_title_bmp = _can_upper_bmp
75+
76+
# Range 0x0000-0xffff: true when bit `ch >>> 9` of `ct.can_l` is set and the
# bitmap selected through `ct.can_l_tab` has the bit for `ch` set.
# NOTE(review): this is the same test as `_can_lower_bmp`, yet for ASCII
# `_is_lower_ch` uses `_islower_a` while `_can_lower_ch` uses `_isupper_a` --
# confirm that the `can_l` tables are the intended ones here.
@inline function _is_lower_bmp(ch)
    blk = ch >>> 9
    ((ct.can_l >>> blk) & 1) == 0 && return false
    return _check_tab(ct.can_l_tab[blk + 1], ch)
end
79+
80+
# Range 0x10000-0x1ffff: true when the `ct.s_can_l` bit selected by the low 7 bits
# of `ch >>> 9` is set and the bitmap selected through `ct.can_l_tab` has the bit
# for `ch` set.
# NOTE(review): this is the same test as `_can_lower_slp` -- confirm that the
# `can_l` tables are the intended ones for an "is lowercase" predicate.
@inline function _is_lower_slp(ch)
    blk = ch >>> 9
    ((ct.s_can_l >>> (blk & 0x7f)) & 1) == 0 && return false
    return _check_tab(ct.can_l_tab[blk + 1], ch)
end
83+
84+
# Range 0x0000-0xffff: true when bit `ch >>> 9` of `ct.can_u` is set and the
# bitmap selected through `ct.can_u_tab` has the bit for `ch` set.
# NOTE(review): this is the same test as `_can_upper_bmp`, yet for ASCII
# `_is_upper_ch` uses `_isupper_a` while `_can_upper_ch` uses `_islower_a` --
# confirm that the `can_u` tables are the intended ones here.
@inline function _is_upper_bmp(ch)
    blk = ch >>> 9
    ((ct.can_u >>> blk) & 1) == 0 && return false
    return _check_tab(ct.can_u_tab[blk + 1], ch)
end
87+
88+
# Dispatch on the range of `ch`: `_islower_a` up to 0x7f, `_islower_l` up to 0xff,
# `_is_lower_bmp` up to 0xffff, `_is_lower_slp` up to 0x1ffff; `false` above that.
@inline function _is_lower_ch(ch)
    if ch <= 0x7f
        return _islower_a(ch)
    elseif ch <= 0xff
        return _islower_l(ch)
    elseif ch <= 0xffff
        return _is_lower_bmp(ch)
    elseif ch <= 0x1ffff
        return _is_lower_slp(ch)
    else
        return false
    end
end
93+
94+
# Range 0x10000-0x1ffff counterpart of `_is_upper_bmp`, built the same way that
# `_is_lower_slp` parallels `_is_lower_bmp`: the gate bit comes from `ct.s_can_u`
# (low 7 bits of `ch >>> 9`) and the per-character bit from `ct.can_u_tab`.
# `_is_upper_ch` calls this helper, which this file otherwise never defines, so any
# code point above 0xffff would raise an UndefVarError.
# NOTE(review): like `_is_upper_bmp`, this consults the `can_u` tables -- confirm
# they are the intended ones for an "is uppercase" predicate.
@inline _is_upper_slp(ch) =
    (t = (ch >>> 9);
     ((ct.s_can_u >>> (t & 0x7f)) & 1) != 0 && _check_tab(ct.can_u_tab[t+1], ch))

# Dispatch on the range of `ch`: `_isupper_a` up to 0x7f, `_isupper_l` up to 0xff,
# `_is_upper_bmp` up to 0xffff, `_is_upper_slp` up to 0x1ffff; `false` above that.
@inline _is_upper_ch(ch) =
    ch <= 0x7f ? _isupper_a(ch) :
    ch <= 0xff ? _isupper_l(ch) :
    ch <= 0xffff ? _is_upper_bmp(ch) :
    ch <= 0x1ffff ? _is_upper_slp(ch) : false
99+
100+
# Dispatch on the range of `ch`: `_isupper_a` up to 0x7f, `_isupper_l` up to 0xff,
# `_can_lower_bmp` up to 0xffff, `_can_lower_slp` up to 0x1ffff; `false` above that.
@inline function _can_lower_ch(ch)
    if ch <= 0x7f
        return _isupper_a(ch)
    elseif ch <= 0xff
        return _isupper_l(ch)
    elseif ch <= 0xffff
        return _can_lower_bmp(ch)
    elseif ch <= 0x1ffff
        return _can_lower_slp(ch)
    else
        return false
    end
end
105+
106+
# Dispatch on the range of `ch`: `_islower_a` up to 0x7f, `_can_upper_lat` up to
# 0xff, `_can_upper_bmp` up to 0xffff, `_can_upper_slp` up to 0x1ffff; `false`
# above that.
@inline function _can_upper_ch(ch)
    if ch <= 0x7f
        return _islower_a(ch)
    elseif ch <= 0xff
        return _can_upper_lat(ch)
    elseif ch <= 0xffff
        return _can_upper_bmp(ch)
    elseif ch <= 0x1ffff
        return _can_upper_slp(ch)
    else
        return false
    end
end
111+
112+
const _can_title_ch = _can_upper_ch
113+
114+
# Lowercase a single code point, dispatching on its range:
#   <= 0x7f     add 0x20 when `_isupper_a(ch)`, otherwise unchanged
#   <= 0xff     add 0x20 when `_isupper_l(ch)`, otherwise unchanged
#   <= 0xffff   `_lower_bmp(ch)`
#   <= 0x1ffff  `_lower_slp(ch)`
# Anything larger is returned unchanged.
# The Latin-1 branch must use `cond ? a : b`; writing `:` in place of the `?` is a
# syntax error that prevents the whole file from loading.
@inline _lower_ch(ch) =
    ch <= 0x7f ? (_isupper_a(ch) ? ch + 0x20 : ch) :
    ch <= 0xff ? (_isupper_l(ch) ? ch + 0x20 : ch) :
    ch <= 0xffff ? _lower_bmp(ch) :
    ch <= 0x1ffff ? _lower_slp(ch) : ch
119+
120+
# Uppercase a single code point, dispatching on its range:
# subtract 0x20 when `_islower_a(ch)` (<= 0x7f), `_upper_lat` (<= 0xff),
# `_upper_bmp` (<= 0xffff), `_upper_slp` (<= 0x1ffff); larger values are returned
# unchanged.
@inline function _upper_ch(ch)
    if ch <= 0x7f
        return _islower_a(ch) ? ch - 0x20 : ch
    elseif ch <= 0xff
        return _upper_lat(ch)
    elseif ch <= 0xffff
        return _upper_bmp(ch)
    elseif ch <= 0x1ffff
        return _upper_slp(ch)
    else
        return ch
    end
end
125+
126+
# Titlecase a single code point, dispatching on its range:
# subtract 0x20 when `_islower_a(ch)` (<= 0x7f), `_upper_lat` (<= 0xff),
# `_title_bmp` (<= 0xffff), `_upper_slp` (<= 0x1ffff); larger values are returned
# unchanged.  Only the 0x100-0xffff range goes through a title-specific table.
@inline function _title_ch(ch)
    if ch <= 0x7f
        return _islower_a(ch) ? ch - 0x20 : ch
    elseif ch <= 0xff
        return _upper_lat(ch)
    elseif ch <= 0xffff
        return _title_bmp(ch)
    elseif ch <= 0x1ffff
        return _upper_slp(ch)
    else
        return ch
    end
end

0 commit comments

Comments
 (0)