Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,6 @@ geo-traits = "0.3.0"
geo-types = "0.7.16"
http = "1.1.0"
humantime = "2.2.0"
hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }
itertools = "0.13"
jieba-rs = { version = "0.8.1", default-features = false }
jsonb = { version = "0.5.3", default-features = false, features = ["databend"] }
Expand Down
1 change: 0 additions & 1 deletion java/lance-jni/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion python/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion rust/lance-encoding/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ itertools.workspace = true
log.workspace = true
num-traits.workspace = true
prost.workspace = true
hyperloglogplus.workspace = true
prost-types.workspace = true
rand.workspace = true
snafu.workspace = true
Expand Down
4 changes: 2 additions & 2 deletions rust/lance-encoding/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ impl AllNullDataBlock {

use std::collections::HashMap;

// `BlockInfo` stores the statistics of this `DataBlock`, such as `NullCount` for `NullableDataBlock`,
// `BitWidth` for `FixedWidthDataBlock`, `Cardinality` for all `DataBlock`
// `BlockInfo` stores the statistics of this `DataBlock`, such as `NullCount` for `NullableDataBlock`
// and `BitWidth` for `FixedWidthDataBlock`.
//
// Each statistic is held as an `Arc<dyn Array>` keyed by its `Stat`. The map is wrapped in
// `Arc<RwLock<..>>`, so cloning a `BlockInfo` clones the `Arc` and every clone shares the
// same underlying map.
#[derive(Debug, Clone)]
pub struct BlockInfo(pub Arc<RwLock<HashMap<Stat, Arc<dyn Array>>>>);

Expand Down
36 changes: 20 additions & 16 deletions rust/lance-encoding/src/previous/encoder.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

use std::{collections::HashMap, env, hash::RandomState, sync::Arc};
use std::{
collections::{HashMap, HashSet},
env,
sync::Arc,
};

use arrow_array::{cast::AsArray, ArrayRef, UInt8Array};
use arrow_schema::DataType;
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
use snafu::location;

use crate::{
Expand Down Expand Up @@ -509,29 +512,30 @@ fn get_dict_encoding_threshold() -> u64 {
.unwrap_or(100)
}

// check whether we want to use dictionary encoding or not
// by applying a threshold on cardinality
// returns true if cardinality < threshold but false if the total number of rows is less than the threshold
// The choice to use 100 is just a heuristic for now
// hyperloglog is used for cardinality estimation
// error rate = 1.04 / sqrt(2^p), where p is the precision
// and error rate is 1.04 / sqrt(2^12) = 1.56%
// Check whether dictionary encoding is worthwhile for legacy UTF8 pages.
//
// We track exact unique values until `threshold` and bail out early once we hit
// the limit. This avoids building a full set for high-cardinality inputs while
// keeping the decision deterministic.
//
// Returns true only when:
// 1. total row count is at least `threshold`, and
// 2. exact distinct count is strictly less than `threshold`.
fn check_dict_encoding(arrays: &[ArrayRef], threshold: u64) -> bool {
let num_total_rows = arrays.iter().map(|arr| arr.len()).sum::<usize>();
if num_total_rows < threshold as usize {
return false;
}
const PRECISION: u8 = 12;

let mut hll: HyperLogLogPlus<String, RandomState> =
HyperLogLogPlus::new(PRECISION, RandomState::new()).unwrap();
let threshold = usize::try_from(threshold).unwrap_or(usize::MAX);
let mut unique_values = HashSet::with_capacity(threshold.min(1024));

for arr in arrays {
let string_array = arrow_array::cast::as_string_array(arr);
for value in string_array.iter().flatten() {
hll.insert(value);
let estimated_cardinality = hll.count() as u64;
if estimated_cardinality >= threshold {
if !unique_values.contains(value) {
unique_values.insert(value.to_string());
}
if unique_values.len() >= threshold {
return false;
}
}
Expand Down
Loading
Loading