Change to use GitHub Actions

ScottPJones · ScottPJones · commit e79780795220 · 2021-01-17T10:24:01.000-05:00
diff --git a/.travis.yml b/.travis.yml
diff --git a/src/ascii.jl b/src/ascii.jl
@@ -1,32 +1,12 @@
 #=
 ASCIIStr type
 
-Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones,
+Copyright 2017-2020 Gandalf Software, Inc., Scott P. Jones,
 and other contributors to the Julia language
 Licensed under MIT License, see LICENSE.md
 Based in part on code for ASCIIString that used to be in Julia
 =#
 
-## overload methods for efficiency ##
-
-function _string(coll)
-    n = 0
-    for str in coll
-        n += ncodeunits(str)
-    end
-    buf, out = _allocate(UInt8, n)
-    for str in coll
-        @preserve str begin
-            len = ncodeunits(str)
-            unsafe_copyto!(out, pointer(str), len)
-            out += len
-        end
-    end
-    buf
-end
-
-string(c::MaybeSub{<:Str{ASCIICSE}}...) = length(c) == 1 ? c[1] : Str(ASCIICSE, _string(c))
-
 ## transcoding to ASCII ##
 
 function convert(::Type{<:Str{ASCIICSE}}, str::AbstractString)
diff --git a/src/latin.jl b/src/latin.jl
@@ -1,7 +1,8 @@
 #=
 LatinStr/_LatinStr type (ISO Latin1 8-bit subset of Unicode)
 
-Copyright 2017 Gandalf Software, Inc., Scott P. Jones, and other contributors to the Julia language
+Copyright 2017, 2020 Gandalf Software, Inc., Scott P. Jones,
+and other contributors to the Julia language
 Licensed under MIT License, see LICENSE.md
 Based in part on code for ASCIIString that used to be in Julia
 =#
@@ -13,23 +14,6 @@ is_latin(str::MaybeSub{<:Str{<:LatinCSE}}) = true
 is_bmp(str::MS_Latin) = true
 is_unicode(str::MS_Latin) = true
 
-const MS_ASCIILatin = MaybeSub{<:Str{<:Union{ASCIICSE, Latin_CSEs}}}
-
-function string(collection::MS_ASCIILatin...)
-    length(collection) == 1 && return collection[1]
-    len = 0
-    @inbounds for str in collection
-        len += ncodeunits(str)
-    end
-    buf, pnt = _allocate(len)
-    @inbounds for str in collection
-        len = ncodeunits(str)
-        _memcpy(pnt, pointer(str), len)
-        pnt += len
-    end
-    Str(LatinCSE, buf)
-end
-
 ## transcoding to Latin1 ##
 
 function convert(::Type{<:Str{C}}, str::AbstractString) where {C<:Latin_CSEs}
diff --git a/src/utf16.jl b/src/utf16.jl
@@ -11,41 +11,47 @@ const _trail_mask = CHUNKSZ == 4 ? 0xdc00_dc00 : 0xdc00_dc00_dc00_dc00
 const _hi_bit_16  = CHUNKSZ == 4 ? 0x8000_8000 : 0x8000_8000_8000_8000
 
 const _big_trail_mask = _widen_mask(_trail_mask)
-const _big_hi_bit_16  = _widen_mask(_big_hi_bit_16)
+const _big_hi_bit_16  = _widen_mask(_hi_bit_16)
 
 @inline _mask_surr(v, msk)  = xor((v | v<<1 | v<<2 | v<<3 | v<<4 | v<<5) & msk, msk)
 
-@inline _get_masked(v::UInt) = _mask_surr(xor(v, _trail_mask))
-@inline _get_masked(v::BigChunk) = _mask_surr(xor(v, _big_trail_mask))
+@inline _get_masked(v::UInt) = _mask_surr(xor(v, _trail_mask), _hi_bit_16)
+@inline _get_masked(v::BigChunk) = _mask_surr(xor(v, _big_trail_mask), _big_hi_bit_16)
 @inline _get_masked(qpnt::Ptr) = _get_masked(unsafe_load(qpnt))
 
 @inline _get_lead(qpnt::Ptr{UInt}) = xor(_get_masked(qpnt), _hi_bit_16)
 @inline _get_lead(qpnt::Ptr{BigChunk}) = xor(_get_masked(qpnt), _big_hi_bit_16)
 
-@inline function _length_al(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int)
-    # First check very frequent cases of short strings
-    # (on 64-bit machines, 1-8 bytes, 9-16 bytes, and 17-24)
-    # taking advantage of the knowledge of how String types are stored in Julia,
-    # i.e. UInt length, immediate followed by the string data, aligned on sizeof(UInt)*2
-    cnt <<= 1
-    if cnt <= BIGCHUNKSZ
-        return (cnt <= CHUNKSZ
-                ? count_ones(_mask_bytes(_get_lead(_pntchunk(beg), cnt))
-                : count_ones(_mask_bytes(_get_lead(_pntbigchunk(beg), cnt))
-    end
+## overload methods for efficiency ##
+
+function _length_utf16_al(beg::Ptr{UInt16}, cnt::Int)
     len = count_ones(_get_lead(_pntchunk(beg)))
     cnt -= CHUNKSZ
     pnt = _pntbigchunk(beg + CHUNKSZ)
     v = _get_lead(pnt)
-    cnt <= BIGCHUNKSZ && return len + count_ones(_mask_bytes(v, cnt))
-    fin = pnt + cnt
-    while (pnt += BIGCHUNKSZ) < fin
-        len += count_ones(v)
-        v = _get_lead(pnt)
+    if cnt > BIGCHUNKSZ
+        fin = pnt + cnt
+        while (pnt += BIGCHUNKSZ) < fin
+            len += count_ones(v)
+            v = _get_lead(pnt)
+        end
     end
     len + count_ones(_mask_bytes(v, cnt))
 end
 
+function _length_al(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int)
+    # First check very frequent cases of short strings (on 64-bit machines, 1-8 bytes,
+    # 9-16 bytes, and 17-24), taking advantage of the knowledge of how String types are
+    # stored in Julia, i.e. UInt length, immediately followed by the string data, aligned
+    # on sizeof(UInt)*2.  `_get_lead` takes only the pointer; `_mask_bytes` gets `cnt`.
+    cnt <<= 1 # doubled before the chunk-size comparisons (UInt16 units -> bytes, presumably)
+    (cnt <= BIGCHUNKSZ
+     ? (cnt <= CHUNKSZ
+        ? count_ones(_mask_bytes(_get_lead(_pntchunk(beg)), cnt))
+        : count_ones(_mask_bytes(_get_lead(_pntbigchunk(beg)), cnt)))
+     : _length_utf16_al(beg, cnt))
+end
+
 function _length_ul(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int)
     align = reinterpret(UInt, beg)
     pnt = reinterpret(Ptr{BigChunk}, align & ~BIGCHUNKMSK)
@@ -104,20 +110,6 @@ function _prevind(::MultiCU, str::MS_UTF16, pos::Int, nchar::Int)
 end
 
 # Check for any surrogate characters
-function is_bmp(str::MS_UTF16)
-    (siz = sizeof(str)) == 0 && return true
-    # Todo: handle unaligned for ARM32
-    @preserve str begin
-        siz < CHUNKSZ && return (_get_masked(_pntchunk(str)) & _mask_bytes(siz)) == 0
-
-        pnt, fin = _calcpnt(str, siz)
-        while (pnt += CHUNKSZ) <= fin
-            _get_masked(pnt) == 0 || return false
-        end
-        pnt - CHUNKSZ == fin || (_get_masked(pnt) & _mask_bytes(siz)) == 0
-    end
-end
-
 @inline function _check_bmp_utf16_al(beg, cnt)
     cnt <= CHUNKSZ && return _mask_bytes(_get_masked(_pntchunk(beg)), cnt) == 0
     cnt <= BIGCHUNKSZ && return _mask_bytes(_get_masked(_pntbigchunk(beg)), cnt) == 0
diff --git a/src/utf8.jl b/src/utf8.jl
@@ -226,7 +226,6 @@ _all_latin(val) =
 
 @inline function _check_latin_utf8_al(beg, cnt)
     cnt <= CHUNKSZ && return _all_latin(_mask_bytes(unsafe_load(_pntchunk(ptr)), cnt))
-    bigmsk = _widen_mask(msk)
     cnt <= BIGCHUNKSZ && return _all_latin(_mask_bytes(unsafe_load(_pntbigchunk(ptr)), cnt))
     _all_latin(unsafe_load(_pntchunk(ptr))) || return false
     cnt -= CHUNKSZ
@@ -601,15 +600,6 @@ _prevind(::MultiCU, str::Str{RawUTF8CSE}, pos::Int, nchar::Int) =
 _prevind(::MultiCU, str::Str{RawUTF8CSE}, pos::Int) =
     prevind(str.data, pos)
 
-#=
-const _ByteStr = Union{Str{ASCIICSE}, SubString{<:Str{ASCIICSE}},
-                       Str{UTF8CSE},  SubString{<:Str{UTF8CSE}}}
-
-string(s::_ByteStr) = s
-string(s::_ByteStr, c::_ByteStr...) = UTF8Str(_string(c))
-    # ^^ at least one must be UTF-8 or the ASCII-only method would get called
-=#
-
 function _reverse(::MultiCU, ::Type{UTF8CSE}, len, pnt::Ptr{T}) where {T<:CodeUnitTypes}
     buf, beg = _allocate(T, len)
     out = beg + len
diff --git a/src/util.jl b/src/util.jl
@@ -7,6 +7,144 @@ Licensed under MIT License, see LICENSE.md
 Based initially on julia/test/strings/util.jl
 =#
 
+function _concat(T, a, b) # returns the `_allocate` buffer filled with `a` followed by `b`
+    la, lb = ncodeunits(a), ncodeunits(b)
+    buf, out = _allocate(T, la + lb)
+    @preserve a unsafe_copyto!(out, pointer(a), la)
+    # Ptr arithmetic counts bytes, so the code unit offset must be scaled by sizeof(T)
+    @preserve b unsafe_copyto!(out + la * sizeof(T), pointer(b), lb)
+    buf
+end
+
+function _string(T, a, b, rest)
+    # Concatenate `a`, `b` and every element of `rest` into one buffer of `T` code units
+    la, lb = ncodeunits(a), ncodeunits(b)
+    total = la + lb
+    @inbounds for str in rest
+        total += ncodeunits(str)
+    end
+    buf, out = _allocate(T, total)
+    @preserve a unsafe_copyto!(out, pointer(a), la)
+    out += la * sizeof(T) # Ptr arithmetic is in bytes, lengths are in code units
+    @preserve b unsafe_copyto!(out, pointer(b), lb)
+    out += lb * sizeof(T)
+    @inbounds for str in rest
+        n = ncodeunits(str)
+        @preserve str unsafe_copyto!(out, pointer(str), n)
+        out += n * sizeof(T)
+    end
+    buf
+end
+
+function _string(T, coll) # concatenate all strings in `coll` into one buffer of `T` units
+    total = 0
+    @inbounds for str in coll
+        total += ncodeunits(str)
+    end
+    buf, out = _allocate(T, total)
+    @inbounds for str in coll
+        n = ncodeunits(str)
+        @preserve str unsafe_copyto!(out, pointer(str), n)
+        out += n * sizeof(T) # Ptr arithmetic is in bytes, `n` is in code units
+    end
+    buf
+end
+
+# Handle concatenation where all strings share the same CSE and all characters the same character set
+#=
+"""
+WIP: this is rather tricky.
+It really should handle any type of Chr / Str / CSE, not just the ones defined
+in CharSetEncodings, ChrBase and StrBase
+Ideally, it could also handle mixes with String and Char (or other AbstractString /
+AbstractChar types).
+It may need to do two or even three passes, one to determine the correct type to be output,
+another to determine the output length, and finally another to copy the strings / characters into
+the buffer.
+The result type should be based on promotion rules, e.g. outputting UCS2Str if only ASCII, Latin, and UCS2 characters and strings are in the list.
+This is difficult to do in a way that will still be type stable.
+"""
+
+function _string_chr(a::Union{<:Chr{CS,T}, <:Str{C}, SubString{<:Str{C}}}...
+                     ) where {CS<:CharSet,T,C<:CSE{CS}}
+    len = 0
+    for v in a
+        if v isa Chr
+            len += 1
+        else
+            len += ncodeunits(v)
+        end
+    end
+    buf, out = _allocate(T, len)
+    for v in a
+        len = ncodeunits(str)
+        @preserve str unsafe_copyto!(out, pointer(str), len)
+        out += len
+    end
+    buf
+end
+=#
+
+string(c::MaybeSub{<:Str}) = c
+string(c::MaybeSub{<:Str{<:Union{ASCIICSE,Latin_CSEs}}}...) = Str(LatinCSE, _string(UInt8, c))
+string(c::MaybeSub{<:Str{<:Union{ASCIICSE,UTF8CSE}}}...) = Str(UTF8CSE, _string(UInt8, c))
+string(c::MaybeSub{<:Str{<:UCS2_CSEs}}...) = Str(UCS2CSE, _string(UInt16, c))
+string(c::MaybeSub{<:Str{<:Union{UCS2_CSEs,UTF16CSE}}}...) = Str(UTF16CSE, _string(UInt16, c))
+string(c::MaybeSub{<:Str{<:UTF32_CSEs}}...) = Str(UTF32CSE, _string(UInt32, c))
+
+#=
+const MS_Str{C} = MaybeSub{<:Str{C}}
+string(a::MS_Str{C}, b::MS_Str{C}) where {C<:CSE} = Str(C, _concat(codeunit(C), a, b))
+string(a::MS_Str{C}, b::MS_Str{C}, c::MS_Str{C}...) where {C<:CSE} =
+    Str(C, _string(codeunit(C), a, b, c))
+
+string(a::T, b::T) where {T<:MS_Str{ASCIICSE}} = string(ASCIICSE, _concat(UInt8, a, b))
+string(a::T, b::T) where {T<:MS_Str{ASCIICSE}} = string(ASCIICSE, _concat(UInt8, a, b))
+string(a::T, b::T) where {T<:MS_Str{ASCIICSE}} = string(ASCIICSE, _concat(UInt8, a, b))
+
+const MS_AL = MS_Str{<:Union{ASCIICSE,Latin_CSEs}}
+string(a::MS_AL, b::MS_AL) = Str(LatinCSE, _concat(UInt8, a, b))
+string(a::MS_AL, b::MS_AL, c::MS_AL...) = Str(LatinCSE, _string(UInt8, a, b, c))
+
+const MS_AU = MS_Str{<:Union{ASCIICSE,UTF8CSE}}
+string(a::MS_AU, b::MS_AU) = Str(UTF8CSE, _concat(UInt8, a, b))
+string(a::MS_AU, b::MS_AU, c::MS_AU...) = Str(UTF8CSE, _string(UInt8, a, b, c))
+
+const MS_U2 = MS_Str{<:UCS2_CSEs}
+string(a::MS_U2, b::MS_U2) = Str(UCS2CSE, _concat(UInt16, a, b))
+string(a::MS_U2, b::MS_U2, c::MS_U2...) = Str(UCS2CSE, _string(UInt16, a, b, c))
+
+const MS_UT = MS_Str{<:Union{UCS2_CSEs,UTF16CSE}}
+string(a::MS_UT, b::MS_UT) = Str(UTF16CSE, _concat(UInt16, a, b))
+string(a::MS_UT, b::MS_UT, c::MS_UT...) = Str(UTF16CSE, _string(UInt16, a, b, c))
+
+const MS_U4 = MS_Str{<:UTF32_CSEs}
+string(a::MS_U4, b::MS_U4) = Str(UTF32CSE, _concat(UInt32, a, b))
+string(a::MS_U4, b::MS_U4, c::MS_U4...) = Str(UTF32CSE, _string(UInt32, a, b, c))
+=#
+
+#=
+string(c::MaybeSub{<:Str{<:Union{ASCIICSE,Latin_CSEs}}}...) =
+    length(c) == 1 ? c[1] : Str(LatinCSE, _string(UInt8, c))
+
+string(c::MaybeSub{<:Str{<:Union{ASCIICSE,UTF8CSE}}}...) =
+    length(c) == 1 ? c[1] : Str(UTF8CSE, _string(UInt8, c))
+
+string(c::MaybeSub{<:Str{<:UCS2_CSEs}}...) =
+    length(c) == 1 ? c[1] : Str(UCS2CSE, _string(UInt16, c))
+
+string(c::MaybeSub{<:Str{<:Union{UCS2_CSEs,UTF16CSE}}}...) =
+    length(c) == 1 ? c[1] : Str(UTF16CSE, _string(UInt16, c))
+
+string(c::MaybeSub{<:Str{<:UTF32_CSEs}}...) =
+    length(c) == 1 ? c[1] : Str(UTF32CSE, _string(UInt32, c))
+=#
+# NOTE: the `string` methods for these five argument groups (ASCII/Latin, ASCII/UTF-8,
+# UCS2, UCS2/UTF-16 and UTF-32) are already defined once earlier in this file, right
+# after the WIP `_string_chr` block.  They were repeated here verbatim; a second
+# definition with an identical signature only overwrites the first one and makes Julia
+# report "Method definition ... overwritten" when the module is loaded or precompiled.
+
 # starts with and ends with predicates
 
 starts_with(a::MaybeSub{<:Str{C}}, b::MaybeSub{<:Str{C}}) where {C<:CSE} =
diff --git a/test/util.jl b/test/util.jl
@@ -307,6 +307,31 @@
             #non-hex characters
             @test_throws ArgumentError hex2bytes(b"0123456789abcdefABCDEFGH")
         end
+
+        @testset "Concatenation" begin
+            # One sample string per type; the last three hold a character outside ASCII,
+            # outside Latin-1 and outside the BMP respectively
+            asc, lat, ucs = ASCIIStr("foo"), LatinStr("bar"), UCS2Str("baz")
+            u32, ut8, ut16 = UTF32Str("silly"), UTF8Str("test"), UTF16Str("ugly")
+            haslat, hasucs, hasu32 = _LatinStr("você"), _UCS2Str("†"), _UTF32Str("\U1f596")
+            # (left operand, right operand, expected type of `left * right`)
+            cases = ((asc, asc, ASCIIStr),
+                     (asc, lat, LatinStr),
+                     (asc, ut8, UTF8Str),
+                     (asc, haslat, LatinStr),
+                     (lat, lat, LatinStr),
+                     (haslat, haslat, _LatinStr),
+                     (lat, haslat, LatinStr),
+                     (ucs, ucs, UCS2Str),
+                     (hasucs, hasucs, _UCS2Str),
+                     (ucs, hasucs, UCS2Str),
+                     (u32, u32, UTF32Str),
+                     (hasu32, hasu32, _UTF32Str),
+                     (u32, hasu32, UTF32Str))
+            for (lhs, rhs, expected) in cases
+                @test typeof(lhs * rhs) == expected
+            end
+        end
     end
 
     # b"" should be immutable