Skip to content

Commit e797807

Browse files
committed
Change to use GitHub Actions
1 parent 98fedf9 commit e797807

File tree

7 files changed

+191
-116
lines changed

7 files changed

+191
-116
lines changed

.travis.yml

Lines changed: 0 additions & 34 deletions
This file was deleted.

src/ascii.jl

Lines changed: 1 addition & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,12 @@
11
#=
22
ASCIIStr type
33
4-
Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones,
4+
Copyright 2017-2020 Gandalf Software, Inc., Scott P. Jones,
55
and other contributors to the Julia language
66
Licensed under MIT License, see LICENSE.md
77
Based in part on code for ASCIIString that used to be in Julia
88
=#
99

10-
## overload methods for efficiency ##
11-
12-
function _string(coll)
13-
n = 0
14-
for str in coll
15-
n += ncodeunits(str)
16-
end
17-
buf, out = _allocate(UInt8, n)
18-
for str in coll
19-
@preserve str begin
20-
len = ncodeunits(str)
21-
unsafe_copyto!(out, pointer(str), len)
22-
out += len
23-
end
24-
end
25-
buf
26-
end
27-
28-
string(c::MaybeSub{<:Str{ASCIICSE}}...) = length(c) == 1 ? c[1] : Str(ASCIICSE, _string(c))
29-
3010
## transcoding to ASCII ##
3111

3212
function convert(::Type{<:Str{ASCIICSE}}, str::AbstractString)

src/latin.jl

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
#=
22
LatinStr/_LatinStr type (ISO Latin1 8-bit subset of Unicode)
33
4-
Copyright 2017 Gandalf Software, Inc., Scott P. Jones, and other contributors to the Julia language
4+
Copyright 2017, 2020 Gandalf Software, Inc., Scott P. Jones,
5+
and other contributors to the Julia language
56
Licensed under MIT License, see LICENSE.md
67
Based in part on code for ASCIIString that used to be in Julia
78
=#
@@ -13,23 +14,6 @@ is_latin(str::MaybeSub{<:Str{<:LatinCSE}}) = true
1314
is_bmp(str::MS_Latin) = true
1415
is_unicode(str::MS_Latin) = true
1516

16-
const MS_ASCIILatin = MaybeSub{<:Str{<:Union{ASCIICSE, Latin_CSEs}}}
17-
18-
function string(collection::MS_ASCIILatin...)
19-
length(collection) == 1 && return collection[1]
20-
len = 0
21-
@inbounds for str in collection
22-
len += ncodeunits(str)
23-
end
24-
buf, pnt = _allocate(len)
25-
@inbounds for str in collection
26-
len = ncodeunits(str)
27-
_memcpy(pnt, pointer(str), len)
28-
pnt += len
29-
end
30-
Str(LatinCSE, buf)
31-
end
32-
3317
## transcoding to Latin1 ##
3418

3519
function convert(::Type{<:Str{C}}, str::AbstractString) where {C<:Latin_CSEs}

src/utf16.jl

Lines changed: 25 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -11,41 +11,47 @@ const _trail_mask = CHUNKSZ == 4 ? 0xdc00_dc00 : 0xdc00_dc00_dc00_dc00
1111
const _hi_bit_16 = CHUNKSZ == 4 ? 0x8000_8000 : 0x8000_8000_8000_8000
1212

1313
const _big_trail_mask = _widen_mask(_trail_mask)
14-
const _big_hi_bit_16 = _widen_mask(_big_hi_bit_16)
14+
const _big_hi_bit_16 = _widen_mask(_hi_bit_16)
1515

1616
@inline _mask_surr(v, msk) = xor((v | v<<1 | v<<2 | v<<3 | v<<4 | v<<5) & msk, msk)
1717

18-
@inline _get_masked(v::UInt) = _mask_surr(xor(v, _trail_mask))
19-
@inline _get_masked(v::BigChunk) = _mask_surr(xor(v, _big_trail_mask))
18+
@inline _get_masked(v::UInt) = _mask_surr(xor(v, _trail_mask), _hi_bit_16)
19+
@inline _get_masked(v::BigChunk) = _mask_surr(xor(v, _big_trail_mask), _big_hi_bit_16)
2020
@inline _get_masked(qpnt::Ptr) = _get_masked(unsafe_load(qpnt))
2121

2222
@inline _get_lead(qpnt::Ptr{UInt}) = xor(_get_masked(qpnt), _hi_bit_16)
2323
@inline _get_lead(qpnt::Ptr{BigChunk}) = xor(_get_masked(qpnt), _big_hi_bit_16)
2424

25-
@inline function _length_al(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int)
26-
# First check very frequent cases of short strings
27-
# (on 64-bit machines, 1-8 bytes, 9-16 bytes, and 17-24)
28-
# taking advantage of the knowledge of how String types are stored in Julia,
29-
# i.e. UInt length, immediately followed by the string data, aligned on sizeof(UInt)*2
30-
cnt <<= 1
31-
if cnt <= BIGCHUNKSZ
32-
return (cnt <= CHUNKSZ
33-
? count_ones(_mask_bytes(_get_lead(_pntchunk(beg), cnt))
34-
: count_ones(_mask_bytes(_get_lead(_pntbigchunk(beg), cnt))
35-
end
25+
## overload methods for efficiency ##
26+
27+
function _length_utf16_al(beg::Ptr{UInt16}, cnt::Int)
3628
len = count_ones(_get_lead(_pntchunk(beg)))
3729
cnt -= CHUNKSZ
3830
pnt = _pntbigchunk(beg + CHUNKSZ)
3931
v = _get_lead(pnt)
40-
cnt <= BIGCHUNKSZ && return len + count_ones(_mask_bytes(v, cnt))
41-
fin = pnt + cnt
42-
while (pnt += BIGCHUNKSZ) < fin
43-
len += count_ones(v)
44-
v = _get_lead(pnt)
32+
if cnt > BIGCHUNKSZ
33+
fin = pnt + cnt
34+
while (pnt += BIGCHUNKSZ) < fin
35+
len += count_ones(v)
36+
v = _get_lead(pnt)
37+
end
4538
end
4639
len + count_ones(_mask_bytes(v, cnt))
4740
end
4841

42+
function _length_al(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int)
43+
# First check very frequent cases of short strings
44+
# (on 64-bit machines, 1-8 bytes, 9-16 bytes, and 17-24)
45+
# taking advantage of the knowledge of how String types are stored in Julia,
46+
# i.e. UInt length, immediately followed by the string data, aligned on sizeof(UInt)*2
47+
cnt <<= 1
48+
(cnt <= BIGCHUNKSZ
49+
? (cnt <= CHUNKSZ
50+
? count_ones(_mask_bytes(_get_lead(_pntchunk(beg), cnt)))
51+
: count_ones(_mask_bytes(_get_lead(_pntbigchunk(beg), cnt))))
52+
: _length_utf16_al(beg, cnt))
53+
end
54+
4955
function _length_ul(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int)
5056
align = reinterpret(UInt, beg)
5157
pnt = reinterpret(Ptr{BigChunk}, align & ~BIGCHUNKMSK)
@@ -104,20 +110,6 @@ function _prevind(::MultiCU, str::MS_UTF16, pos::Int, nchar::Int)
104110
end
105111

106112
# Check for any surrogate characters
107-
function is_bmp(str::MS_UTF16)
108-
(siz = sizeof(str)) == 0 && return true
109-
# Todo: handle unaligned for ARM32
110-
@preserve str begin
111-
siz < CHUNKSZ && return (_get_masked(_pntchunk(str)) & _mask_bytes(siz)) == 0
112-
113-
pnt, fin = _calcpnt(str, siz)
114-
while (pnt += CHUNKSZ) <= fin
115-
_get_masked(pnt) == 0 || return false
116-
end
117-
pnt - CHUNKSZ == fin || (_get_masked(pnt) & _mask_bytes(siz)) == 0
118-
end
119-
end
120-
121113
@inline function _check_bmp_utf16_al(beg, cnt)
122114
cnt <= CHUNKSZ && return _mask_bytes(_get_masked(_pntchunk(beg)), cnt) == 0
123115
cnt <= BIGCHUNKSZ && return _mask_bytes(_get_masked(_pntbigchunk(beg)), cnt) == 0

src/utf8.jl

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,6 @@ _all_latin(val) =
226226

227227
@inline function _check_latin_utf8_al(beg, cnt)
228228
cnt <= CHUNKSZ && return _all_latin(_mask_bytes(unsafe_load(_pntchunk(ptr)), cnt))
229-
bigmsk = _widen_mask(msk)
230229
cnt <= BIGCHUNKSZ && return _all_latin(_mask_bytes(unsafe_load(_pntbigchunk(ptr)), cnt))
231230
_all_latin(unsafe_load(_pntchunk(ptr))) || return false
232231
cnt -= CHUNKSZ
@@ -601,15 +600,6 @@ _prevind(::MultiCU, str::Str{RawUTF8CSE}, pos::Int, nchar::Int) =
601600
_prevind(::MultiCU, str::Str{RawUTF8CSE}, pos::Int) =
602601
prevind(str.data, pos)
603602

604-
#=
605-
const _ByteStr = Union{Str{ASCIICSE}, SubString{<:Str{ASCIICSE}},
606-
Str{UTF8CSE}, SubString{<:Str{UTF8CSE}}}
607-
608-
string(s::_ByteStr) = s
609-
string(s::_ByteStr, c::_ByteStr...) = UTF8Str(_string(c))
610-
# ^^ at least one must be UTF-8 or the ASCII-only method would get called
611-
=#
612-
613603
function _reverse(::MultiCU, ::Type{UTF8CSE}, len, pnt::Ptr{T}) where {T<:CodeUnitTypes}
614604
buf, beg = _allocate(T, len)
615605
out = beg + len

src/util.jl

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,144 @@ Licensed under MIT License, see LICENSE.md
77
Based initially on julia/test/strings/util.jl
88
=#
99

10+
function _concat(T, a, b)
11+
la = ncodeunits(a)
12+
lb = ncodeunits(b)
13+
buf, out = _allocate(T, la + lb)
14+
@preserve a unsafe_copyto!(out, pointer(a), la)
15+
@preserve b unsafe_copyto!(out + la, pointer(b), lb)
16+
buf
17+
end
18+
19+
function _string(T, a, b, rest)
20+
la = ncodeunits(a)
21+
lb = ncodeunits(b)
22+
len = la + lb
23+
@inbounds for str in rest
24+
len += ncodeunits(str)
25+
end
26+
buf, out = _allocate(T, len)
27+
@preserve a unsafe_copyto!(out, pointer(a), la)
28+
out += la
29+
@preserve b unsafe_copyto!(out, pointer(b), lb)
30+
out += lb
31+
@inbounds for str in rest
32+
len = ncodeunits(str)
33+
@preserve str unsafe_copyto!(out, pointer(str), len)
34+
out += len
35+
end
36+
buf
37+
end
38+
39+
function _string(T, coll)
40+
len = 0
41+
@inbounds for str in coll
42+
len += ncodeunits(str)
43+
end
44+
buf, out = _allocate(T, len)
45+
@inbounds for str in coll
46+
len = ncodeunits(str)
47+
@preserve str unsafe_copyto!(out, pointer(str), len)
48+
out += len
49+
end
50+
buf
51+
end
52+
53+
# Handle concatenation where all strings share the same CSE and all characters the same character set
54+
#=
55+
"""
56+
WIP: this is rather tricky.
57+
It really should handle any type of Chr / Str / CSE, not just the ones defined
58+
in CharSetEncodings, ChrBase and StrBase
59+
Ideally, it could also handle mixes with String and Char (or other AbstractString / AbstractChar
60+
types).
61+
It may need to do two or even three passes, one to determine the correct type to be output,
62+
another to determine the output length, and finally another to copy the strings / characters into
63+
the buffer.
64+
The result type should be based on promotion rules, i.e. outputting UCS2Str if only ASCII, Latin, UCS2 characters and strings are in the list.
65+
This is difficult to do in a way that will still be type stable.
66+
"""
67+
68+
function _string_chr(a::Union{<:Chr{CS,T}, <:Str{C}, SubString{<:Str{C}}}...
69+
) where {CS<:CharSet,T,C<:CSE{CS}}
70+
len = 0
71+
for v in a
72+
if v isa Chr
73+
len += 1
74+
else
75+
len += ncodeunits(v)
76+
end
77+
end
78+
buf, out = _allocate(T, len)
79+
for v in a
80+
len = ncodeunits(str)
81+
@preserve str unsafe_copyto!(out, pointer(str), len)
82+
out += len
83+
end
84+
buf
85+
end
86+
=#
87+
88+
string(c::MaybeSub{<:Str}) = c
89+
string(c::MaybeSub{<:Str{<:Union{ASCIICSE,Latin_CSEs}}}...) = Str(LatinCSE, _string(UInt8, c))
90+
string(c::MaybeSub{<:Str{<:Union{ASCIICSE,UTF8CSE}}}...) = Str(UTF8CSE, _string(UInt8, c))
91+
string(c::MaybeSub{<:Str{<:UCS2_CSEs}}...) = Str(UCS2CSE, _string(UInt16, c))
92+
string(c::MaybeSub{<:Str{<:Union{UCS2_CSEs,UTF16CSE}}}...) = Str(UTF16CSE, _string(UInt16, c))
93+
string(c::MaybeSub{<:Str{<:UTF32_CSEs}}...) = Str(UTF32CSE, _string(UInt32, c))
94+
95+
#=
96+
const MS_Str{C} = MaybeSub{<:Str{C}}
97+
string(a::MS_Str{C}, b::MS_Str{C}) where {C<:CSE} = Str(C, _concat(codeunit(C), a, b))
98+
string(a::MS_Str{C}, b::MS_Str{C}, c::MS_Str{C}...) where {C<:CSE} =
99+
Str(C, _string(codeunit(C), a, b, c))
100+
101+
string(a::T, b::T) where {T<:MS_Str{ASCIICSE}} = string(ASCIICSE, _concat(UInt8, a, b))
102+
string(a::T, b::T) where {T<:MS_Str{ASCIICSE}} = string(ASCIICSE, _concat(UInt8, a, b))
103+
string(a::T, b::T) where {T<:MS_Str{ASCIICSE}} = string(ASCIICSE, _concat(UInt8, a, b))
104+
105+
const MS_AL = MS_Str{<:Union{ASCIICSE,Latin_CSEs}}
106+
string(a::MS_AL, b::MS_AL) = Str(LatinCSE, _concat(UInt8, a, b))
107+
string(a::MS_AL, b::MS_AL, c::MS_AL...) = Str(LatinCSE, _string(UInt8, a, b, c))
108+
109+
const MS_AU = MS_Str{<:Union{ASCIICSE,UTF8CSE}}
110+
string(a::MS_AU, b::MS_AU) = Str(UTF8CSE, _concat(UInt8, a, b))
111+
string(a::MS_AU, b::MS_AU, c::MS_AU...) = Str(UTF8CSE, _string(UInt8, a, b, c))
112+
113+
const MS_U2 = MS_Str{<:UCS2_CSEs}
114+
string(a::MS_U2, b::MS_U2) = Str(UCS2CSE, _concat(UInt16, a, b))
115+
string(a::MS_U2, b::MS_U2, c::MS_U2...) = Str(UCS2CSE, _string(UInt16, a, b, c))
116+
117+
const MS_UT = MS_Str{<:Union{UCS2_CSEs,UTF16CSE}}
118+
string(a::MS_UT, b::MS_UT) = Str(UTF16CSE, _concat(UInt16, a, b))
119+
string(a::MS_UT, b::MS_UT, c::MS_UT...) = Str(UTF16CSE, _string(UInt16, a, b, c))
120+
121+
const MS_U4 = MS_Str{<:UTF32_CSEs}
122+
string(a::MS_U4, b::MS_U4) = Str(UTF32CSE, _concat(UInt32, a, b))
123+
string(a::MS_U4, b::MS_U4, c::MS_U4...) = Str(UTF32CSE, _string(UInt32, a, b, c))
124+
=#
125+
126+
#=
127+
string(c::MaybeSub{<:Str{<:Union{ASCIICSE,Latin_CSEs}}}...) =
128+
length(c) == 1 ? c[1] : Str(LatinCSE, _string(UInt8, c))
129+
130+
string(c::MaybeSub{<:Str{<:Union{ASCIICSE,UTF8CSE}}}...) =
131+
length(c) == 1 ? c[1] : Str(UTF8CSE, _string(UInt8, c))
132+
133+
string(c::MaybeSub{<:Str{<:UCS2_CSEs}}...) =
134+
length(c) == 1 ? c[1] : Str(UCS2CSE, _string(UInt16, c))
135+
136+
string(c::MaybeSub{<:Str{<:Union{UCS2_CSEs,UTF16CSE}}}...) =
137+
length(c) == 1 ? c[1] : Str(UTF16CSE, _string(UInt16, c))
138+
139+
string(c::MaybeSub{<:Str{<:UTF32_CSEs}}...) =
140+
length(c) == 1 ? c[1] : Str(UTF32CSE, _string(UInt32, c))
141+
=#
142+
string(c::MaybeSub{<:Str{<:Union{ASCIICSE,Latin_CSEs}}}...) = Str(LatinCSE, _string(UInt8, c))
143+
string(c::MaybeSub{<:Str{<:Union{ASCIICSE,UTF8CSE}}}...) = Str(UTF8CSE, _string(UInt8, c))
144+
string(c::MaybeSub{<:Str{<:UCS2_CSEs}}...) = Str(UCS2CSE, _string(UInt16, c))
145+
string(c::MaybeSub{<:Str{<:Union{UCS2_CSEs,UTF16CSE}}}...) = Str(UTF16CSE, _string(UInt16, c))
146+
string(c::MaybeSub{<:Str{<:UTF32_CSEs}}...) = Str(UTF32CSE, _string(UInt32, c))
147+
10148
# starts with and ends with predicates
11149

12150
starts_with(a::MaybeSub{<:Str{C}}, b::MaybeSub{<:Str{C}}) where {C<:CSE} =

test/util.jl

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,31 @@
307307
#non-hex characters
308308
@test_throws ArgumentError hex2bytes(b"0123456789abcdefABCDEFGH")
309309
end
310+
311+
@testset "Concatenation" begin
312+
asc = ASCIIStr("foo")
313+
lat = LatinStr("bar")
314+
ucs = UCS2Str("baz")
315+
u32 = UTF32Str("silly")
316+
ut8 = UTF8Str("test")
317+
ut16 = UTF16Str("ugly")
318+
haslat = _LatinStr("você")
319+
hasucs = _UCS2Str("")
320+
hasu32 = _UTF32Str("\U1f596")
321+
@test typeof(asc * asc) == ASCIIStr
322+
@test typeof(asc * lat) == LatinStr
323+
@test typeof(asc * ut8) == UTF8Str
324+
@test typeof(asc * haslat) == LatinStr
325+
@test typeof(lat * lat) == LatinStr
326+
@test typeof(haslat * haslat) == _LatinStr
327+
@test typeof(lat * haslat) == LatinStr
328+
@test typeof(ucs * ucs) == UCS2Str
329+
@test typeof(hasucs * hasucs) == _UCS2Str
330+
@test typeof(ucs * hasucs) == UCS2Str
331+
@test typeof(u32 * u32) == UTF32Str
332+
@test typeof(hasu32 * hasu32) == _UTF32Str
333+
@test typeof(u32 * hasu32) == UTF32Str
334+
end
310335
end
311336

312337
# b"" should be immutable

0 commit comments

Comments
 (0)