Skip to content

Add interner lifetime and improve documentation #76

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ criterion_main!(bench_get_or_intern, bench_resolve, bench_get, bench_iter);

fn bench_get_or_intern_static(c: &mut Criterion) {
let mut g = c.benchmark_group("get_or_intern_static");
fn bench_for_backend<BB: BackendBenchmark>(g: &mut BenchmarkGroup<WallTime>) {
fn bench_for_backend<'i, BB: BackendBenchmark<'i>>(g: &mut BenchmarkGroup<WallTime>) {
#[rustfmt::skip]
let static_strings = &[
"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
Expand Down Expand Up @@ -87,7 +87,7 @@ fn bench_get_or_intern_static(c: &mut Criterion) {
fn bench_get_or_intern_fill_with_capacity(c: &mut Criterion) {
let mut g = c.benchmark_group("get_or_intern/fill-empty/with_capacity");
g.throughput(Throughput::Elements(BENCH_LEN_STRINGS as u64));
fn bench_for_backend<BB: BackendBenchmark>(g: &mut BenchmarkGroup<WallTime>) {
fn bench_for_backend<'i, BB: BackendBenchmark<'i>>(g: &mut BenchmarkGroup<WallTime>) {
g.bench_with_input(
BB::NAME,
&(BENCH_LEN_STRINGS, BENCH_STRING_LEN),
Expand All @@ -113,7 +113,7 @@ fn bench_get_or_intern_fill_with_capacity(c: &mut Criterion) {
fn bench_get_or_intern_fill(c: &mut Criterion) {
let mut g = c.benchmark_group("get_or_intern/fill-empty/new");
g.throughput(Throughput::Elements(BENCH_LEN_STRINGS as u64));
fn bench_for_backend<BB: BackendBenchmark>(g: &mut BenchmarkGroup<WallTime>) {
fn bench_for_backend<'i, BB: BackendBenchmark<'i>>(g: &mut BenchmarkGroup<WallTime>) {
g.bench_with_input(
BB::NAME,
&(BENCH_LEN_STRINGS, BENCH_STRING_LEN),
Expand All @@ -139,7 +139,7 @@ fn bench_get_or_intern_fill(c: &mut Criterion) {
fn bench_get_or_intern_already_filled(c: &mut Criterion) {
let mut g = c.benchmark_group("get_or_intern/already-filled");
g.throughput(Throughput::Elements(BENCH_LEN_STRINGS as u64));
fn bench_for_backend<BB: BackendBenchmark>(g: &mut BenchmarkGroup<WallTime>) {
fn bench_for_backend<'i, BB: BackendBenchmark<'i>>(g: &mut BenchmarkGroup<WallTime>) {
g.bench_with_input(
BB::NAME,
&(BENCH_LEN_STRINGS, BENCH_STRING_LEN),
Expand All @@ -165,7 +165,7 @@ fn bench_get_or_intern_already_filled(c: &mut Criterion) {
fn bench_resolve_already_filled(c: &mut Criterion) {
let mut g = c.benchmark_group("resolve/already-filled");
g.throughput(Throughput::Elements(BENCH_LEN_STRINGS as u64));
fn bench_for_backend<BB: BackendBenchmark>(g: &mut BenchmarkGroup<WallTime>) {
fn bench_for_backend<'i, BB: BackendBenchmark<'i>>(g: &mut BenchmarkGroup<WallTime>) {
g.bench_with_input(
BB::NAME,
&(BENCH_LEN_STRINGS, BENCH_STRING_LEN),
Expand All @@ -191,7 +191,7 @@ fn bench_resolve_already_filled(c: &mut Criterion) {
fn bench_resolve_unchecked_already_filled(c: &mut Criterion) {
let mut g = c.benchmark_group("resolve_unchecked/already-filled");
g.throughput(Throughput::Elements(BENCH_LEN_STRINGS as u64));
fn bench_for_backend<BB: BackendBenchmark>(g: &mut BenchmarkGroup<WallTime>) {
fn bench_for_backend<'i, BB: BackendBenchmark<'i>>(g: &mut BenchmarkGroup<WallTime>) {
g.bench_with_input(
BB::NAME,
&(BENCH_LEN_STRINGS, BENCH_STRING_LEN),
Expand Down Expand Up @@ -220,7 +220,7 @@ fn bench_resolve_unchecked_already_filled(c: &mut Criterion) {
fn bench_get_already_filled(c: &mut Criterion) {
let mut g = c.benchmark_group("get/already-filled");
g.throughput(Throughput::Elements(BENCH_LEN_STRINGS as u64));
fn bench_for_backend<BB: BackendBenchmark>(g: &mut BenchmarkGroup<WallTime>) {
fn bench_for_backend<'i, BB: BackendBenchmark<'i>>(g: &mut BenchmarkGroup<WallTime>) {
g.bench_with_input(
BB::NAME,
&(BENCH_LEN_STRINGS, BENCH_STRING_LEN),
Expand All @@ -246,11 +246,11 @@ fn bench_get_already_filled(c: &mut Criterion) {
fn bench_iter_already_filled(c: &mut Criterion) {
let mut g = c.benchmark_group("iter/already-filled");
g.throughput(Throughput::Elements(BENCH_LEN_STRINGS as u64));
fn bench_for_backend<BB: BackendBenchmark>(g: &mut BenchmarkGroup<WallTime>)
fn bench_for_backend<'i, BB: BackendBenchmark<'i>>(g: &mut BenchmarkGroup<WallTime>)
where
for<'a> &'a <BB as BackendBenchmark>::Backend: IntoIterator<
for<'a> &'a <BB as BackendBenchmark<'i>>::Backend: IntoIterator<
Item = (
<<BB as BackendBenchmark>::Backend as Backend>::Symbol,
<<BB as BackendBenchmark<'i>>::Backend as Backend<'i>>::Symbol,
&'a str,
),
>,
Expand Down
56 changes: 33 additions & 23 deletions benches/setup.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use string_interner::{
backend::{Backend, BucketBackend, BufferBackend, StringBackend},
DefaultSymbol,
StringInterner,
DefaultSymbol, StringInterner,
};

/// Alphabet containing all characters that may be put into a benchmark string.
Expand Down Expand Up @@ -79,53 +78,64 @@ pub const BENCH_LEN_STRINGS: usize = 100_000;
pub const BENCH_STRING_LEN: usize = 5;

type FxBuildHasher = fxhash::FxBuildHasher;
type StringInternerWith<B> = StringInterner<B, FxBuildHasher>;
type StringInternerWith<'i, B> = StringInterner<'i, B, FxBuildHasher>;

pub trait BackendBenchmark {
pub trait BackendBenchmark<'i> {
const NAME: &'static str;
type Backend: Backend;
type Backend: Backend<'i>;

fn setup() -> StringInternerWith<Self::Backend> {
fn setup() -> StringInternerWith<'i, Self::Backend> {
<StringInternerWith<Self::Backend>>::new()
}

fn setup_with_capacity(cap: usize) -> StringInternerWith<Self::Backend> {
fn setup_with_capacity(cap: usize) -> StringInternerWith<'i, Self::Backend> {
<StringInternerWith<Self::Backend>>::with_capacity(cap)
}

fn setup_filled(words: &[String]) -> StringInternerWith<Self::Backend> {
words.iter().collect::<StringInternerWith<Self::Backend>>()
fn setup_filled<I, S>(words: I) -> StringInternerWith<'i, Self::Backend>
where
I: IntoIterator<Item = S>,
S: AsRef<str>,
{
words
.into_iter()
.map(|it| it.as_ref().to_string())
.collect::<StringInternerWith<Self::Backend>>()
}

fn setup_filled_with_ids(
words: &[String],
fn setup_filled_with_ids<I, S>(
words: I,
) -> (
StringInternerWith<Self::Backend>,
Vec<<Self::Backend as Backend>::Symbol>,
) {
let mut interner = <StringInternerWith<Self::Backend>>::new();
StringInternerWith<'i, Self::Backend>,
Vec<<Self::Backend as Backend<'i>>::Symbol>,
)
where
I: IntoIterator<Item = S>,
S: AsRef<str>,
{
let mut interner = <StringInternerWith<'i, Self::Backend>>::new();
let word_ids = words
.iter()
.map(|word| interner.get_or_intern(word))
.into_iter()
.map(|word| interner.get_or_intern(word.as_ref()))
.collect::<Vec<_>>();
(interner, word_ids)
}
}

pub struct BenchBucket;
impl BackendBenchmark for BenchBucket {
impl<'i> BackendBenchmark<'i> for BenchBucket {
const NAME: &'static str = "BucketBackend";
type Backend = BucketBackend<DefaultSymbol>;
type Backend = BucketBackend<'i, DefaultSymbol>;
}

pub struct BenchString;
impl BackendBenchmark for BenchString {
impl<'i> BackendBenchmark<'i> for BenchString {
const NAME: &'static str = "StringBackend";
type Backend = StringBackend<DefaultSymbol>;
type Backend = StringBackend<'i, DefaultSymbol>;
}

pub struct BenchBuffer;
impl BackendBenchmark for BenchBuffer {
impl<'i> BackendBenchmark<'i> for BenchBuffer {
const NAME: &'static str = "BufferBackend";
type Backend = BufferBackend<DefaultSymbol>;
type Backend = BufferBackend<'i, DefaultSymbol>;
}
104 changes: 51 additions & 53 deletions src/backend/bucket/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,67 +4,61 @@ mod fixed_str;
mod interned_str;

use self::{fixed_str::FixedString, interned_str::InternedStr};
use super::Backend;
use super::{Backend, PhantomBackend};
use crate::{symbol::expect_valid_symbol, DefaultSymbol, Symbol};
use alloc::{string::String, vec::Vec};
use core::{iter::Enumerate, marker::PhantomData, slice};

/// An interner backend that reduces memory allocations by using string buckets.
///
/// # Note
///
/// Implementation inspired by matklad's blog post that can be found here:
/// <https://matklad.github.io/2020/03/22/fast-simple-rust-interner.html>
///
/// # Usage Hint
///
/// Use when deallocations or copy overhead is costly or when
/// interning of static strings is especially common.
///
/// # Usage
///
/// - **Fill:** Efficiency of filling an empty string interner.
/// - **Resolve:** Efficiency of interned string look-up given a symbol.
/// - **Allocations:** The number of allocations performed by the backend.
/// - **Footprint:** The total heap memory consumed by the backend.
/// - **Contiguous:** True if the returned symbols have contiguous values.
/// - **Iteration:** Efficiency of iterating over the interned strings.
///
/// Rating varies between **bad**, **ok**, **good** and **best**.
///
/// | Scenario | Rating |
/// |:------------|:--------:|
/// | Fill | **good** |
/// | Resolve | **best** |
/// | Allocations | **good** |
/// | Footprint | **ok** |
/// | Supports `get_or_intern_static` | **yes** |
/// | `Send` + `Sync` | **yes** |
/// | Contiguous | **yes** |
/// | Iteration | **best** |
/// An interner backend that reduces memory allocations by using buckets.
///
/// # Overview
/// This interner uses fixed-size buckets to store interned strings. Each bucket is
/// allocated once and holds a set number of strings. When a bucket becomes full, a new
/// bucket is allocated to hold more strings. Buckets are never deallocated, which reduces
/// the overhead of frequent memory allocations and copying.
///
/// ## Trade-offs
/// - **Advantages:**
/// - Strings in already used buckets remain valid and accessible even as new strings
/// are added.
/// - **Disadvantages:**
/// - Slightly slower access times due to double indirection (looking up the string
/// involves an extra level of lookup through the bucket).
/// - Memory may be used inefficiently if many buckets are allocated but only partially
/// filled because of large strings.
///
/// ## Use Cases
/// This backend is ideal when interned strings must remain valid even after new ones are
/// added.
///
/// Refer to the [comparison table][crate::_docs::comparison_table] for comparison with
/// other backends.
///
/// The implementation is inspired by [matklad's blog post].
///
/// [matklad's blog post]:
/// https://matklad.github.io/2020/03/22/fast-simple-rust-interner.html
#[derive(Debug)]
pub struct BucketBackend<S = DefaultSymbol> {
pub struct BucketBackend<'i, S: Symbol = DefaultSymbol> {
spans: Vec<InternedStr>,
head: FixedString,
full: Vec<String>,
marker: PhantomData<fn() -> S>,
marker: PhantomBackend<'i, Self>,
}

/// # Safety
///
/// The bucket backend requires a manual [`Send`] impl because it is self
/// referential. When cloning a bucket backend a deep clone is performed and
/// all references to itself are updated for the clone.
unsafe impl<S> Send for BucketBackend<S> where S: Symbol {}
unsafe impl<'i, S> Send for BucketBackend<'i, S> where S: Symbol {}

/// # Safety
///
/// The bucket backend requires a manual [`Sync`] impl because it is self
/// referential. Those references won't escape its own scope and also
/// the bucket backend has no interior mutability.
unsafe impl<S> Sync for BucketBackend<S> where S: Symbol {}
unsafe impl<'i, S> Sync for BucketBackend<'i, S> where S: Symbol {}

impl<S> Default for BucketBackend<S> {
impl<'i, S: Symbol> Default for BucketBackend<'i, S> {
#[cfg_attr(feature = "inline-more", inline)]
fn default() -> Self {
Self {
Expand All @@ -76,10 +70,14 @@ impl<S> Default for BucketBackend<S> {
}
}

impl<S> Backend for BucketBackend<S>
impl<'i, S> Backend<'i> for BucketBackend<'i, S>
where
S: Symbol,
{
type Access<'local> = &'local str
where
Self: 'local,
'i: 'local;
type Symbol = S;
type Iter<'a>
= Iter<'a, S>
Expand Down Expand Up @@ -136,7 +134,7 @@ where
}
}

impl<S> BucketBackend<S>
impl<'i, S> BucketBackend<'i, S>
where
S: Symbol,
{
Expand Down Expand Up @@ -167,7 +165,7 @@ where
}
}

impl<S> Clone for BucketBackend<S> {
impl<'i, S: Symbol> Clone for BucketBackend<'i, S> {
fn clone(&self) -> Self {
// For performance reasons we copy all cloned strings into a single cloned
// head string leaving the cloned `full` empty.
Expand All @@ -191,9 +189,9 @@ impl<S> Clone for BucketBackend<S> {
}
}

impl<S> Eq for BucketBackend<S> where S: Symbol {}
impl<'i, S> Eq for BucketBackend<'i, S> where S: Symbol {}

impl<S> PartialEq for BucketBackend<S>
impl<'i, S> PartialEq for BucketBackend<'i, S>
where
S: Symbol,
{
Expand All @@ -203,39 +201,39 @@ where
}
}

impl<'a, S> IntoIterator for &'a BucketBackend<S>
impl<'i, 'l, S> IntoIterator for &'l BucketBackend<'i, S>
where
S: Symbol,
{
type Item = (S, &'a str);
type IntoIter = Iter<'a, S>;
type Item = (S, &'l str);
type IntoIter = Iter<'l, S>;

#[cfg_attr(feature = "inline-more", inline)]
fn into_iter(self) -> Self::IntoIter {
self.iter()
}
}

pub struct Iter<'a, S> {
iter: Enumerate<slice::Iter<'a, InternedStr>>,
pub struct Iter<'l, S> {
iter: Enumerate<slice::Iter<'l, InternedStr>>,
symbol_marker: PhantomData<fn() -> S>,
}

impl<'a, S> Iter<'a, S> {
impl<'i, 'l, S: Symbol> Iter<'l, S> {
#[cfg_attr(feature = "inline-more", inline)]
pub fn new(backend: &'a BucketBackend<S>) -> Self {
pub fn new(backend: &'l BucketBackend<'i, S>) -> Self {
Self {
iter: backend.spans.iter().enumerate(),
symbol_marker: Default::default(),
}
}
}

impl<'a, S> Iterator for Iter<'a, S>
impl<'l, S> Iterator for Iter<'l, S>
where
S: Symbol,
{
type Item = (S, &'a str);
type Item = (S, &'l str);

#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
Expand Down
Loading