Skip to content

Commit 4864e0e

Browse files
committed
Efficient trie lookup for boolean Unicode properties
Replace binary search of ranges with trie lookup using leaves of 64-bit bitmap chunks. Benchmarks suggest this is approximately 10x faster than the bsearch approach.
1 parent c2aaad4 commit 4864e0e

File tree

2 files changed

+1368
-1134
lines changed

2 files changed

+1368
-1134
lines changed

src/etc/unicode.py

Lines changed: 107 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -307,12 +307,114 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
307307
format_table_content(f, data, 8)
308308
f.write("\n ];\n\n")
309309

310+
def emit_trie_lookup_range_table(f):
    """Write the Rust `BoolTrie` struct and its lookup functions to `f`.

    The emitted Rust consists of:
      * `BoolTrie`: 64-bit bitmap leaves split over three code point
        ranges (0..0x800, 0x800..0x10000, 0x10000..0x110000), with one
        or two levels of `u8` index tables for the upper two ranges.
      * `trie_range_leaf`: tests bit `c & 63` of one bitmap leaf.
      * `trie_lookup_range_table`: selects the leaf for a `char` and
        tests it.

    Args:
        f: writable text stream that receives the Rust source.
    """
    # `r1` holds 32 leaves of 64 bits each for 0..0x800, so the leaf index
    # is `c >> 6`. (It was `c >> 8`, which selected the wrong leaf for every
    # code point from 0x40 upwards in that range.)
    # NOTE(review): the Rust text carries no leading indentation here; that
    # only affects the layout of the generated file.
    f.write("""
pub struct BoolTrie {
// 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
r1: [u64; 32], // leaves

// 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
r2: [u8; 1024], // first level
r3: &'static [u64], // leaves

// 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
r4: [u8; 272], // first level
r5: &'static [u8], // second level
r6: &'static [u64], // leaves
}

fn trie_range_leaf(c: usize, bitmap_chunk: u64) -> bool {
((bitmap_chunk >> (c & 63)) & 1) != 0
}

fn trie_lookup_range_table(c: char, r: &'static BoolTrie) -> bool {
let c = c as usize;
if c < 0x800 {
trie_range_leaf(c, r.r1[c >> 6])
} else if c < 0x10000 {
let child = r.r2[c >> 6];
trie_range_leaf(c, r.r3[child as usize])
} else {
let child = r.r4[c >> 12];
let leaf = r.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)];
trie_range_leaf(c, r.r6[leaf as usize])
}
}\n
""")
344+
345+
def compute_trie(rawdata, chunksize):
    """Build one trie level by de-duplicating fixed-size chunks of `rawdata`.

    `rawdata` is cut into consecutive chunks of `chunksize` items. Each
    distinct chunk is stored once, in order of first appearance, and every
    chunk position is replaced by the index of its stored chunk.

    Args:
        rawdata: sequence of items to split (sliceable, with a length).
        chunksize: number of items per chunk.

    Returns:
        A `(root, child_data)` tuple. `root[i]` is the index of the distinct
        chunk used at position `i`; `child_data` is the concatenation of the
        distinct chunks, so chunk `k` occupies
        `child_data[k * chunksize:(k + 1) * chunksize]`.

    Trailing items that do not fill a whole chunk are ignored.
    """
    root = []
    childmap = {}
    child_data = []
    # Floor division keeps the chunk count an integer on Python 2 and 3;
    # plain `/` yields a float on Python 3, which range() rejects.
    for i in range(len(rawdata) // chunksize):
        data = rawdata[i * chunksize:(i + 1) * chunksize]
        # Chunks are identified by the joined string form of their items.
        child = '|'.join(map(str, data))
        if child not in childmap:
            childmap[child] = len(childmap)
            child_data.extend(data)
        root.append(childmap[child])
    return (root, child_data)
357+
358+
def emit_bool_trie(f, name, t_data, is_pub=True):
    """Write a Rust `BoolTrie` constant for the code points in `t_data`.

    Code points are packed into 64-bit bitmap chunks (bit `cp % 64` of chunk
    `cp // 64`). Chunks for 0..0x800 are written directly as `r1`. Chunks for
    0x800..0x10000 are de-duplicated into `r2` (indices) and `r3` (leaves).
    Chunks for 0x10000..0x110000 are de-duplicated twice into `r4`, `r5`
    (index levels) and `r6` (leaves).

    Args:
        f: writable text stream that receives the Rust source.
        name: identifier of the emitted constant.
        t_data: iterable of inclusive `(lo, hi)` code point ranges.
        is_pub: when true, the constant is declared `pub`.

    Raises:
        ValueError: if an index table would need a value above 255, which
            does not fit the `u8` element type that `BoolTrie` declares for
            `r2`, `r4` and `r5`.
    """
    CHUNK = 64

    # Build the bitmap chunks straight from the ranges. `//` keeps sizes and
    # slice bounds integral on both Python 2 and Python 3.
    chunks = [0] * (0x110000 // CHUNK)
    for (lo, hi) in t_data:
        for cp in range(lo, hi + 1):
            chunks[cp // CHUNK] |= 1 << (cp % CHUNK)

    r1 = chunks[0:0x800 // CHUNK]

    # 0x800..0x10000 trie
    (r2, r3) = compute_trie(chunks[0x800 // CHUNK:0x10000 // CHUNK], 64 // CHUNK)

    # 0x10000..0x110000 trie
    (mid, r6) = compute_trie(chunks[0x10000 // CHUNK:0x110000 // CHUNK], 64 // CHUNK)
    (r4, r5) = compute_trie(mid, 64)

    # Validate before anything is written so a failure leaves no partial
    # output behind.
    for (label, table) in (("r2", r2), ("r4", r4), ("r5", r5)):
        if table and max(table) > 255:
            raise ValueError("%s: index table %s needs a value above 255 (u8)"
                             % (name, label))

    pub_string = "pub " if is_pub else ""
    f.write(" %sconst %s: &'static super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name))

    # r2 and r4 are front-padded with 255 so that the emitted lookup can
    # index them with `c >> 6` and `c >> 12` without subtracting the start
    # of their range; the padding covers indices the lookup never reaches.
    fields = (
        (" r1: [\n", ','.join('0x%016x' % chunk for chunk in r1)),
        (" r2: [\n", ','.join(str(node) for node in [255] * 32 + r2)),
        (" r3: &[\n", ','.join('0x%016x' % chunk for chunk in r3)),
        (" r4: [\n", ','.join(str(node) for node in [255] * 16 + r4)),
        (" r5: &[\n", ','.join(str(node) for node in r5)),
        (" r6: &[\n", ','.join('0x%016x' % chunk for chunk in r6)),
    )
    for (header, data) in fields:
        f.write(header)
        # NOTE(review): format_table_content is defined elsewhere in this
        # file; presumably it writes `data` wrapped at the given indent.
        format_table_content(f, data, 12)
        f.write("\n ],\n")

    f.write(" };\n\n")
411+
310412
def emit_property_module(f, mod, tbl, emit):
# Writes a Rust `pub mod <mod> { ... }` block to `f`.
# For each category name in sorted(emit) it emits a table named
# "<cat>_table" built from tbl[cat], followed by a
# `pub fn <cat>(c: char) -> bool` that looks `c` up in that table.
# This span is a diff hunk: lines marked `-` are the previous version
# (emit_table + super::bsearch_range_table) and lines marked `+` are their
# replacements (emit_bool_trie + super::trie_lookup_range_table).
311413
f.write("pub mod %s {\n" % mod)
312414
for cat in sorted(emit):
313-
emit_table(f, "%s_table" % cat, tbl[cat])
415+
emit_bool_trie(f, "%s_table" % cat, tbl[cat])
314416
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
315-
f.write(" super::bsearch_range_table(c, %s_table)\n" % cat)
417+
f.write(" super::trie_lookup_range_table(c, %s_table)\n" % cat)
316418
f.write(" }\n\n")
317419
f.write("}\n\n")
318420

@@ -402,8 +504,9 @@ def emit_norm_module(f, canon, compat, combine, norm_props):
402504
norm_props = load_properties("DerivedNormalizationProps.txt",
403505
["Full_Composition_Exclusion"])
404506

405-
# bsearch_range_table is used in all the property modules below
406-
emit_bsearch_range_table(rf)
507+
# trie_lookup_range_table is used in all the property modules below
508+
emit_trie_lookup_range_table(rf)
509+
# emit_bsearch_range_table(rf)
407510

408511
# category tables
409512
for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \

0 commit comments

Comments
 (0)