Skip to content

Commit 2aa1f22

Browse files
authored
Merge pull request #26 from ScottPJones/spj/fasthash
Use MurmurHash3 to allow for fast in-memory hashing with no conversion
2 parents 9d345ef + 6059c89 commit 2aa1f22

File tree

4 files changed

+35
-23
lines changed

4 files changed

+35
-23
lines changed

Project.toml

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
11
name = "ShortStrings"
22
uuid = "63221d1c-8677-4ff0-9126-0ff0817b4975"
33
authors = ["Dai ZJ <[email protected]>"]
4-
version = "0.2.6"
4+
version = "0.2.7"
55

66
[deps]
77
BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1"
88
SortingAlgorithms = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
9+
MurmurHash3 = "b10b62ed-fbae-5ea5-b934-abaf0477b71d"
910

1011
[compat]
12+
MurmurHash3 = "1.1"
1113
BitIntegers = "0.2"
1214
SortingAlgorithms = "0.3"
1315
julia = "1"

src/ShortStrings.jl

+1-2
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,11 @@ module ShortStrings
22

33
using BitIntegers
44
using SortingAlgorithms
5+
56
export fsort, fsort!, ShortString,
67
ShortString3, ShortString7, ShortString15, ShortString30, ShortString62, ShortString126,
78
@ss3_str, @ss7_str, @ss15_str, @ss30_str, @ss62_str, @ss126_str
89

9-
export hash # from hash.jl
10-
1110
include("base.jl")
1211
include("hash.jl")
1312

src/base.jl

+26-16
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import Base:unsafe_getindex, ==, show, promote_rule
44

5-
struct ShortString{T} <: AbstractString where T
5+
struct ShortString{T} <: AbstractString where {T}
66
size_content::T
77
end
88

@@ -14,18 +14,24 @@ function check_size(T, sz)
1414
end
1515
end
1616

17-
function ShortString{T}(s::Union{String, SubString{String}}) where T
17+
function ShortString{T}(s::Union{String, SubString{String}}) where {T}
1818
sz = sizeof(s)
1919
check_size(T, sz)
2020
bits_to_wipe = 8(sizeof(T) - sz)
21+
22+
# Warning: if a SubString is at the very end of a string, which is at the end of allocated
23+
# memory, this can cause an access violation by trying to access past the end
24+
# (for example, reading a 1-byte substring at the end of a length-119 string could go past
25+
# the end)
26+
2127
# TODO: sometimes this can throw errors for longish strings
2228
# Exception: EXCEPTION_ACCESS_VIOLATION at 0x1e0b7afd -- bswap at C:\Users\RTX2080\.julia\packages\BitIntegers\xU40U\src\BitIntegers.jl:332 [inlined]
2329
# ntoh at .\io.jl:541 [inlined]
2430
content = (T(s |> pointer |> Ptr{T} |> Base.unsafe_load |> ntoh) >> bits_to_wipe) << bits_to_wipe
2531
ShortString{T}(content | T(sz))
2632
end
2733

28-
ShortString{T}(s::ShortString{T}) where T = s
34+
ShortString{T}(s::ShortString{T}) where {T} = s
2935
function ShortString{T}(s::ShortString{S}) where {T, S}
3036
sz = sizeof(s)
3137
check_size(T, sz)
@@ -44,27 +50,30 @@ Base.codeunit(s::ShortString) = UInt8
4450
Base.codeunit(s::ShortString, i) = codeunits(String(s), i)
4551
Base.codeunit(s::ShortString, i::Integer) = codeunit(String(s), i)
4652
Base.codeunits(s::ShortString) = codeunits(String(s))
47-
Base.convert(::ShortString{T}, s::String) where T = ShortString{T}(s)
53+
54+
Base.convert(::ShortString{T}, s::String) where {T} = ShortString{T}(s)
4855
Base.convert(::String, ss::ShortString) = String(ss)
49-
Base.display(s::ShortString) = display(String(s))
56+
57+
Base.sizeof(s::ShortString{T}) where {T} = Int(s.size_content & (size_mask(s) % UInt))
5058
Base.firstindex(::ShortString) = 1
5159
Base.isvalid(s::ShortString, i::Integer) = isvalid(String(s), i)
5260
Base.iterate(s::ShortString) = iterate(String(s))
5361
Base.iterate(s::ShortString, i::Integer) = iterate(String(s), i)
5462
Base.lastindex(s::ShortString) = sizeof(s)
5563
Base.ncodeunits(s::ShortString) = sizeof(s)
64+
65+
Base.display(s::ShortString) = display(String(s))
5666
Base.print(s::ShortString) = print(String(s))
5767
Base.show(io::IO, str::ShortString) = show(io, String(str))
58-
Base.sizeof(s::ShortString{T}) where T = Int(s.size_content & (size_mask(s) % UInt))
5968

6069
size_nibbles(::Type{<:Union{UInt16, UInt32, UInt64, UInt128}}) = 1
6170
size_nibbles(::Type{<:Union{Int16, Int32, Int64, Int128}}) = 1
6271
size_nibbles(::Type{<:Union{UInt256, UInt512, UInt1024}}) = 2
6372
size_nibbles(::Type{<:Union{Int256, Int512, Int1024}}) = 2
64-
size_nibbles(::Type{T}) where T = ceil(log2(sizeof(T))/4)
73+
size_nibbles(::Type{T}) where {T} = ceil(log2(sizeof(T))/4)
6574

6675
size_mask(T) = T(exp2(4*size_nibbles(T)) - 1)
67-
size_mask(s::ShortString{T}) where T = size_mask(T)
76+
size_mask(s::ShortString{T}) where {T} = size_mask(T)
6877

6978

7079
# function Base.getindex(s::ShortString, i::Integer)
@@ -77,7 +86,7 @@ size_mask(s::ShortString{T}) where T = size_mask(T)
7786

7887
Base.collect(s::ShortString) = collect(String(s))
7988

80-
function ==(s::ShortString{S}, b::Union{String, SubString{String}}) where S
89+
function ==(s::ShortString{S}, b::Union{String, SubString{String}}) where {S}
8190
ncodeunits(b) == ncodeunits(s) || return false
8291
return s == ShortString{S}(b)
8392
end
@@ -88,7 +97,7 @@ function ==(s::ShortString, b::AbstractString)
8897
end
8998

9099
==(a::AbstractString, b::ShortString) = b == a
91-
function ==(a::ShortString{S}, b::ShortString{S}) where S
100+
function ==(a::ShortString{S}, b::ShortString{S}) where {S}
92101
return a.size_content == b.size_content
93102
end
94103
function ==(a::ShortString{A}, b::ShortString{B}) where {A,B}
@@ -98,12 +107,11 @@ function ==(a::ShortString{A}, b::ShortString{B}) where {A,B}
98107
ntoh(a.size_content & ~size_mask(A)) == ntoh(b.size_content & ~size_mask(B))
99108
end
100109

101-
102-
function Base.cmp(a::ShortString{S}, b::ShortString{S}) where S
110+
function Base.cmp(a::ShortString{S}, b::ShortString{S}) where {S}
103111
return cmp(a.size_content, b.size_content)
104112
end
105113

106-
promote_rule(::Type{String}, ::Type{ShortString{S}}) where S = String
114+
promote_rule(::Type{String}, ::Type{ShortString{S}}) where {S} = String
107115

108116
function promote_rule(::Type{ShortString{T}}, ::Type{ShortString{S}}) where {T,S}
109117
if sizeof(T) >= sizeof(S)
@@ -126,7 +134,9 @@ for T in (UInt1024, UInt512, UInt256, UInt128, UInt64, UInt32)
126134
end
127135
end
128136

129-
fsort(v::Vector{ShortString{T}}; rev = false) where T = sort(v, rev = rev, by = size_content, alg = RadixSort)
130-
fsort!(v::Vector{ShortString{T}}; rev = false) where T = sort!(v, rev = rev, by = size_content, alg = RadixSort)
137+
fsort(v::Vector{ShortString{T}}; rev = false) where {T} =
138+
sort(v, rev = rev, by = size_content, alg = RadixSort)
139+
fsort!(v::Vector{ShortString{T}}; rev = false) where {T} =
140+
sort!(v, rev = rev, by = size_content, alg = RadixSort)
131141

132-
fsortperm(v::Vector{ShortString{T}}; rev = false) where T = sortperm(v, rev = rev)
142+
fsortperm(v::Vector{ShortString{T}}; rev = false) where {T} = sortperm(v, rev = rev)

src/hash.jl

+5-4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
export hash
1+
using MurmurHash3: mmhash128_a
22

3-
import Base.hash
4-
5-
Base.hash(x::ShortString, h::UInt) = hash(String(x), h)
3+
function Base.hash(x::ShortString, h::UInt)
4+
h += Base.memhash_seed
5+
last(mmhash128_a(sizeof(x), bswap(x.size_content), h%UInt32)) + h
6+
end

0 commit comments

Comments
 (0)