Skip to content

Commit ef734e0

Browse files
committed
tmp
Signed-off-by: Kould <[email protected]>
1 parent 6ed13ee commit ef734e0

File tree

16 files changed, with 456 additions (+456) and 420 deletions (-420).

src/query/functions/src/scalars/binary.rs

+3 -5
Original file line number | Diff line number | Diff line change
@@ -279,11 +279,9 @@ fn eval_binary_to_string(val: Value<BinaryType>, ctx: &mut EvalContext) -> Value
279279
vectorize_binary_to_string(
280280
|col| col.total_bytes_len(),
281281
|val, output, ctx| {
282-
if let Ok(val) = simdutf8::basic::from_utf8(val) {
283-
output.put_str(val);
284-
} else {
285-
ctx.set_error(output.len(), "invalid utf8 sequence");
286-
}
282+
// FIXME
283+
let cow = String::from_utf8_lossy(val);
284+
output.put_str(cow.as_ref());
287285
output.commit_row();
288286
},
289287
)(val, ctx)

src/query/storages/common/cache/src/caches.rs

+5 -6
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,6 @@ use std::time::Instant;
1818

1919
use arrow::array::ArrayRef;
2020
use databend_common_cache::MemSized;
21-
use databend_storages_common_index::filters::FilterImpl;
2221

2322
pub use crate::cache_items::*;
2423
use crate::manager::CacheManager;
@@ -46,7 +45,7 @@ pub type TableSnapshotCache = InMemoryLruCache<TableSnapshot>;
4645
pub type TableSnapshotStatisticCache = InMemoryLruCache<TableSnapshotStatistics>;
4746
/// In memory object cache of bloom filter.
4847
/// For each indexed data block, the bloom xor8 filter of column is cached individually
49-
pub type BloomIndexFilterCache = HybridCache<FilterImpl>;
48+
pub type BloomIndexFilterCache = HybridCache<Xor8Filter>;
5049
/// In memory object cache of parquet FileMetaData of bloom index data
5150
pub type BloomIndexMetaCache = HybridCache<BloomIndexMeta>;
5251

@@ -124,7 +123,7 @@ impl CachedObject<(PartStatistics, Partitions)> for (PartStatistics, Partitions)
124123
}
125124
}
126125

127-
impl CachedObject<FilterImpl> for FilterImpl {
126+
impl CachedObject<Xor8Filter> for Xor8Filter {
128127
type Cache = BloomIndexFilterCache;
129128
fn cache() -> Option<Self::Cache> {
130129
CacheManager::instance().get_bloom_index_filter_cache()
@@ -236,10 +235,10 @@ impl From<TableSnapshotStatistics> for CacheValue<TableSnapshotStatistics> {
236235
}
237236
}
238237

239-
impl From<FilterImpl> for CacheValue<FilterImpl> {
240-
fn from(value: FilterImpl) -> Self {
238+
impl From<Xor8Filter> for CacheValue<Xor8Filter> {
239+
fn from(value: Xor8Filter) -> Self {
241240
CacheValue {
242-
mem_bytes: value.mem_bytes(),
241+
mem_bytes: std::mem::size_of::<Xor8Filter>() + value.filter.finger_prints.len(),
243242
inner: Arc::new(value),
244243
}
245244
}

src/query/storages/common/index/src/bloom_index.rs

+85 -57
Original file line number | Diff line number | Diff line change
@@ -74,18 +74,16 @@ use super::eliminate_cast::is_injective_cast;
7474
use crate::eliminate_cast::cast_const;
7575
use crate::filters::BlockBloomFilterIndexVersion;
7676
use crate::filters::BlockFilter;
77-
use crate::filters::BloomBuilder;
78-
use crate::filters::BloomFilter;
7977
use crate::filters::Filter;
8078
use crate::filters::FilterBuilder;
81-
use crate::filters::FilterImpl;
82-
use crate::filters::FilterImplBuilder;
8379
use crate::filters::V2BloomBlock;
8480
use crate::filters::Xor8Builder;
8581
use crate::filters::Xor8Filter;
8682
use crate::statistics_to_domain;
8783
use crate::Index;
8884

85+
const NGRAM_HASH_SEED: u64 = 1575457558;
86+
8987
#[derive(Clone, Serialize, Deserialize)]
9088
pub struct BloomIndexMeta {
9189
pub columns: Vec<(String, SingleColumnMeta)>,
@@ -187,7 +185,7 @@ pub struct BloomIndex {
187185
pub version: u64,
188186

189187
/// filters.
190-
pub filters: Vec<Arc<FilterImpl>>,
188+
pub filters: Vec<Arc<Xor8Filter>>,
191189

192190
/// Approximate distinct count of columns generated by xor hash function.
193191
pub column_distinct_count: HashMap<ColumnId, usize>,
@@ -218,7 +216,7 @@ impl BloomIndex {
218216
pub fn from_filter_block(
219217
func_ctx: FunctionContext,
220218
filter_schema: TableSchemaRef,
221-
filters: Vec<Arc<FilterImpl>>,
219+
filters: Vec<Arc<Xor8Filter>>,
222220
version: u64,
223221
) -> Result<Self> {
224222
Ok(Self {
@@ -370,10 +368,11 @@ impl BloomIndex {
370368
pub fn calculate_ngram_nullable_column<'a, F, T>(
371369
arg: Value<AnyType>,
372370
gram_size: usize,
371+
bitmap_size: usize,
373372
fn_call: F,
374373
) -> impl Iterator<Item = Vec<T>> + 'a
375374
where
376-
F: Fn(&str) -> T + 'a,
375+
F: Fn(&str, usize) -> T + 'a,
377376
{
378377
(0..arg.len()).filter_map(move |i| {
379378
arg.index(i).and_then(|scalar| {
@@ -387,7 +386,7 @@ impl BloomIndex {
387386
let char_count = indices.len();
388387

389388
if gram_size > char_count {
390-
return Some(vec![fn_call(&text)]);
389+
return None;
391390
}
392391

393392
let times = char_count - gram_size + 1;
@@ -399,18 +398,23 @@ impl BloomIndex {
399398
} else {
400399
text.len()
401400
};
402-
words.push(fn_call(&text[start..end]));
401+
words.push(fn_call(&text[start..end], bitmap_size));
403402
}
404403
Some(words)
405404
})
406405
})
407406
})
408407
}
409408

410-
pub fn ngram_hash(s: &str) -> u64 {
411-
let mut hasher = CityHasher64::with_seed(1575457558);
409+
pub fn ngram_hash(s: &str, bitmap_size: usize) -> u64 {
410+
let mut hasher = CityHasher64::with_seed(NGRAM_HASH_SEED);
412411
DFHash::hash(s, &mut hasher);
413-
hasher.finish()
412+
413+
if bitmap_size > 0 {
414+
hasher.finish() % bitmap_size as u64
415+
} else {
416+
hasher.finish()
417+
}
414418
}
415419

416420
/// calculate digest for constant scalar
@@ -433,7 +437,6 @@ impl BloomIndex {
433437
}
434438

435439
/// Find all columns that can be use for index in the expression.
436-
#[expect(clippy::type_complexity)]
437440
pub fn filter_index_field(
438441
expr: &Expr<String>,
439442
bloom_fields: Vec<TableField>,
@@ -478,45 +481,56 @@ impl BloomIndex {
478481
}
479482
}
480483

481-
pub fn build_filter_ngram_name(field: &TableField) -> String {
482-
format!("Ngram({})", field.column_id())
484+
pub fn build_filter_ngram_name(field: &TableField, gram_size: usize, bitmap_size: usize) -> String {
485+
format!("Ngram({}, gram_size: {gram_size}, bitmap_size: {bitmap_size})", field.column_id())
483486
}
484487

485488
fn find(
486489
&self,
487-
filter_column: &str,
490+
table_field: &TableField,
488491
target: &Scalar,
489492
ty: &DataType,
490493
scalar_map: &HashMap<Scalar, u64>,
491494
ngram_args: &[NgramArgs],
492-
is_ngram: bool,
495+
is_like: bool,
493496
) -> Result<FilterEvalResult> {
494-
if !self.filter_schema.has_field(filter_column)
497+
let (filter_column, ngram_arg) = if is_like {
498+
let Some(ngram_arg) = ngram_args.iter().find(|arg| &arg.field == table_field) else {
499+
// The column doesn't have a Ngram Arg.
500+
return Ok(FilterEvalResult::Uncertain);
501+
};
502+
(BloomIndex::build_filter_ngram_name(table_field, ngram_arg.gram_size, ngram_arg.bitmap_size), Some(ngram_arg))
503+
} else {
504+
(BloomIndex::build_filter_bloom_name(self.version, table_field)?, None)
505+
};
506+
507+
if !self.filter_schema.has_field(&filter_column)
495508
|| !Xor8Filter::supported_type(ty)
496509
|| target.is_null()
497510
{
498511
// The column doesn't have a filter.
499512
return Ok(FilterEvalResult::Uncertain);
500513
}
501514

502-
let idx = self.filter_schema.index_of(filter_column)?;
515+
let idx = self.filter_schema.index_of(&filter_column)?;
503516
let filter = &self.filters[idx];
504517

505518
let contains = if self.version == V2BloomBlock::VERSION {
506519
let data_value = scalar_to_datavalue(target);
507520
filter.contains(&data_value)
508-
} else if is_ngram && !ngram_args.is_empty() {
509-
// NgramFilter is always placed after BloomFilter
510-
let offset = self.filters.len() - ngram_args.len();
511-
let arg = &ngram_args[idx - offset];
521+
} else if let Some(arg) = ngram_arg {
512522
let Some(words) = BloomIndex::calculate_ngram_nullable_column(
513523
Value::Scalar(target.clone()),
514524
arg.gram_size,
515-
|text| text.to_string(),
525+
arg.bitmap_size,
526+
|text, _| text.to_string(),
516527
)
517528
.next() else {
518529
return Ok(FilterEvalResult::Uncertain);
519530
};
531+
if words.is_empty() {
532+
return Ok(FilterEvalResult::Uncertain);
533+
}
520534
!words.into_iter().any(|word| {
521535
scalar_map
522536
.get(&Scalar::String(word))
@@ -563,7 +577,8 @@ struct ColumnFilterBuilder {
563577
field: TableField,
564578
is_ngram: bool,
565579
gram_size: usize,
566-
builder: FilterImplBuilder,
580+
bitmap_size: usize,
581+
builder: Xor8Builder,
567582
}
568583

569584
#[derive(Clone)]
@@ -591,6 +606,10 @@ impl NgramArgs {
591606
pub fn gram_size(&self) -> usize {
592607
self.gram_size
593608
}
609+
610+
pub fn bitmap_size(&self) -> usize {
611+
self.bitmap_size
612+
}
594613
}
595614

596615
impl BloomIndexBuilder {
@@ -606,7 +625,8 @@ impl BloomIndexBuilder {
606625
field: field.clone(),
607626
is_ngram: false,
608627
gram_size: 0,
609-
builder: FilterImplBuilder::Xor(Xor8Builder::create()),
628+
bitmap_size: 0,
629+
builder: Xor8Builder::create(),
610630
});
611631
}
612632
for arg in ngram_args.iter() {
@@ -615,7 +635,8 @@ impl BloomIndexBuilder {
615635
field: arg.field.clone(),
616636
is_ngram: true,
617637
gram_size: arg.gram_size,
618-
builder: FilterImplBuilder::Ngram(BloomBuilder::create(arg.bitmap_size)?),
638+
bitmap_size: arg.bitmap_size,
639+
builder: Xor8Builder::create(),
619640
});
620641
}
621642

@@ -737,7 +758,7 @@ impl BloomIndexBuilder {
737758
}
738759
for (index, index_column) in ngram_iter {
739760
let field_type = &block.get_by_offset(index_column.index).data_type;
740-
if !BloomFilter::supported_type(field_type) {
761+
if !Xor8Filter::supported_type(field_type) {
741762
keys_to_remove.push(index);
742763
continue;
743764
}
@@ -753,7 +774,8 @@ impl BloomIndexBuilder {
753774
for digests in BloomIndex::calculate_ngram_nullable_column(
754775
Value::Column(column),
755776
index_column.gram_size,
756-
BloomIndex::ngram_hash,
777+
index_column.bitmap_size,
778+
BloomIndex::ngram_hash
757779
) {
758780
if digests.is_empty() {
759781
continue;
@@ -774,7 +796,7 @@ impl BloomIndexBuilder {
774796
for column in self.columns.iter_mut() {
775797
let filter = column.builder.build()?;
776798
let filter_name = if column.is_ngram {
777-
BloomIndex::build_filter_ngram_name(&column.field)
799+
BloomIndex::build_filter_ngram_name(&column.field, column.gram_size, column.bitmap_size)
778800
} else {
779801
if let Some(len) = filter.len() {
780802
if !matches!(
@@ -844,19 +866,32 @@ where T: EqVisitor
844866
..
845867
})] => {
846868
if let Some(pattern) = scalar.as_string() {
847-
if let LikePattern::SurroundByPercent(v) =
848-
generate_like_pattern(pattern.as_bytes(), 1)
849-
{
850-
let string = String::from_utf8_lossy(v.needle()).to_string();
851-
852-
result = self.0.enter_target(
853-
*span,
854-
id,
855-
&Scalar::String(string),
856-
column_type,
857-
return_type,
858-
true,
859-
)?;
869+
match generate_like_pattern(pattern.as_bytes(), 1) {
870+
LikePattern::StartOfPercent(v) | LikePattern::EndOfPercent(v) => {
871+
let string = String::from_utf8_lossy(v.as_ref()).to_string();
872+
873+
result = self.0.enter_target(
874+
*span,
875+
id,
876+
&Scalar::String(string),
877+
column_type,
878+
return_type,
879+
true,
880+
)?;
881+
}
882+
LikePattern::SurroundByPercent(v) => {
883+
let string = String::from_utf8_lossy(v.needle()).to_string();
884+
885+
result = self.0.enter_target(
886+
*span,
887+
id,
888+
&Scalar::String(string),
889+
column_type,
890+
return_type,
891+
true,
892+
)?;
893+
}
894+
_ => (),
860895
}
861896
}
862897
}
@@ -1099,16 +1134,11 @@ impl EqVisitor for RewriteVisitor<'_> {
10991134
is_like: bool,
11001135
) -> ResultRewrite {
11011136
let table_field = self.data_schema.field_with_name(col_name)?;
1102-
let filter_column = if is_like {
1103-
BloomIndex::build_filter_ngram_name(table_field)
1104-
} else {
1105-
BloomIndex::build_filter_bloom_name(self.index.version, table_field)?
1106-
};
11071137

11081138
// If the column doesn't contain the constant,
11091139
// we rewrite the expression to a new column with `false` domain.
11101140
if self.index.find(
1111-
&filter_column,
1141+
table_field,
11121142
scalar,
11131143
ty,
11141144
self.scalar_map,
@@ -1251,17 +1281,15 @@ impl EqVisitor for ShortListVisitor {
12511281
) -> ResultRewrite {
12521282
if is_like {
12531283
if let Some((i, v)) = Self::found_field(&self.ngram_fields, col_name) {
1254-
if !scalar.is_null() && BloomFilter::supported_type(ty) {
1284+
if !scalar.is_null() && Xor8Filter::supported_type(ty) {
12551285
self.ngram_founds.push(v.clone());
12561286
self.ngram_scalars.push((i, scalar.clone()));
12571287
}
12581288
}
1259-
} else {
1260-
if let Some((i, v)) = Self::found_field(&self.bloom_fields, col_name) {
1261-
if !scalar.is_null() && Xor8Filter::supported_type(ty) {
1262-
self.bloom_founds.push(v.clone());
1263-
self.bloom_scalars.push((i, scalar.clone(), ty.clone()));
1264-
}
1289+
} else if let Some((i, v)) = Self::found_field(&self.bloom_fields, col_name) {
1290+
if !scalar.is_null() && Xor8Filter::supported_type(ty) {
1291+
self.bloom_founds.push(v.clone());
1292+
self.bloom_scalars.push((i, scalar.clone(), ty.clone()));
12651293
}
12661294
}
12671295
Ok(ControlFlow::Break(None))
@@ -1292,7 +1320,7 @@ impl EqVisitor for ShortListVisitor {
12921320
let Some((i, field)) = Self::found_field(&self.ngram_fields, id) else {
12931321
return Ok(ControlFlow::Break(None));
12941322
};
1295-
if !BloomFilter::supported_type(src_type) || !is_injective_cast(src_type, dest_type) {
1323+
if !Xor8Filter::supported_type(src_type) || !is_injective_cast(src_type, dest_type) {
12961324
return Ok(ControlFlow::Break(None));
12971325
}
12981326

src/query/storages/common/index/src/filters/mod.rs

-6
Original file line number | Diff line number | Diff line change
@@ -21,12 +21,6 @@ pub use filter::Filter;
2121
pub use filter::FilterBuilder;
2222
pub use xor8::BlockBloomFilterIndexVersion;
2323
pub use xor8::BlockFilter;
24-
pub use xor8::BloomBuilder;
25-
pub use xor8::BloomBuildingError;
26-
pub use xor8::BloomCodecError;
27-
pub use xor8::BloomFilter;
28-
pub use xor8::FilterImpl;
29-
pub use xor8::FilterImplBuilder;
3024
pub use xor8::V2BloomBlock;
3125
pub use xor8::Xor8Builder;
3226
pub use xor8::Xor8BuildingError;

0 commit comments

Comments
 (0)