@@ -74,18 +74,16 @@ use super::eliminate_cast::is_injective_cast;
74
74
use crate :: eliminate_cast:: cast_const;
75
75
use crate :: filters:: BlockBloomFilterIndexVersion ;
76
76
use crate :: filters:: BlockFilter ;
77
- use crate :: filters:: BloomBuilder ;
78
- use crate :: filters:: BloomFilter ;
79
77
use crate :: filters:: Filter ;
80
78
use crate :: filters:: FilterBuilder ;
81
- use crate :: filters:: FilterImpl ;
82
- use crate :: filters:: FilterImplBuilder ;
83
79
use crate :: filters:: V2BloomBlock ;
84
80
use crate :: filters:: Xor8Builder ;
85
81
use crate :: filters:: Xor8Filter ;
86
82
use crate :: statistics_to_domain;
87
83
use crate :: Index ;
88
84
85
+ const NGRAM_HASH_SEED : u64 = 1575457558 ;
86
+
89
87
#[ derive( Clone , Serialize , Deserialize ) ]
90
88
pub struct BloomIndexMeta {
91
89
pub columns : Vec < ( String , SingleColumnMeta ) > ,
@@ -187,7 +185,7 @@ pub struct BloomIndex {
187
185
pub version : u64 ,
188
186
189
187
/// filters.
190
- pub filters : Vec < Arc < FilterImpl > > ,
188
+ pub filters : Vec < Arc < Xor8Filter > > ,
191
189
192
190
/// Approximate distinct count of columns generated by xor hash function.
193
191
pub column_distinct_count : HashMap < ColumnId , usize > ,
@@ -218,7 +216,7 @@ impl BloomIndex {
218
216
pub fn from_filter_block (
219
217
func_ctx : FunctionContext ,
220
218
filter_schema : TableSchemaRef ,
221
- filters : Vec < Arc < FilterImpl > > ,
219
+ filters : Vec < Arc < Xor8Filter > > ,
222
220
version : u64 ,
223
221
) -> Result < Self > {
224
222
Ok ( Self {
@@ -370,10 +368,11 @@ impl BloomIndex {
370
368
pub fn calculate_ngram_nullable_column < ' a , F , T > (
371
369
arg : Value < AnyType > ,
372
370
gram_size : usize ,
371
+ bitmap_size : usize ,
373
372
fn_call : F ,
374
373
) -> impl Iterator < Item = Vec < T > > + ' a
375
374
where
376
- F : Fn ( & str ) -> T + ' a ,
375
+ F : Fn ( & str , usize ) -> T + ' a ,
377
376
{
378
377
( 0 ..arg. len ( ) ) . filter_map ( move |i| {
379
378
arg. index ( i) . and_then ( |scalar| {
@@ -387,7 +386,7 @@ impl BloomIndex {
387
386
let char_count = indices. len ( ) ;
388
387
389
388
if gram_size > char_count {
390
- return Some ( vec ! [ fn_call ( & text ) ] ) ;
389
+ return None ;
391
390
}
392
391
393
392
let times = char_count - gram_size + 1 ;
@@ -399,18 +398,23 @@ impl BloomIndex {
399
398
} else {
400
399
text. len ( )
401
400
} ;
402
- words. push ( fn_call ( & text[ start..end] ) ) ;
401
+ words. push ( fn_call ( & text[ start..end] , bitmap_size ) ) ;
403
402
}
404
403
Some ( words)
405
404
} )
406
405
} )
407
406
} )
408
407
}
409
408
410
- pub fn ngram_hash ( s : & str ) -> u64 {
411
- let mut hasher = CityHasher64 :: with_seed ( 1575457558 ) ;
409
+ pub fn ngram_hash ( s : & str , bitmap_size : usize ) -> u64 {
410
+ let mut hasher = CityHasher64 :: with_seed ( NGRAM_HASH_SEED ) ;
412
411
DFHash :: hash ( s, & mut hasher) ;
413
- hasher. finish ( )
412
+
413
+ if bitmap_size > 0 {
414
+ hasher. finish ( ) % bitmap_size as u64
415
+ } else {
416
+ hasher. finish ( )
417
+ }
414
418
}
415
419
416
420
/// calculate digest for constant scalar
@@ -433,7 +437,6 @@ impl BloomIndex {
433
437
}
434
438
435
439
/// Find all columns that can be used for index in the expression.
436
- #[ expect( clippy:: type_complexity) ]
437
440
pub fn filter_index_field (
438
441
expr : & Expr < String > ,
439
442
bloom_fields : Vec < TableField > ,
@@ -478,45 +481,56 @@ impl BloomIndex {
478
481
}
479
482
}
480
483
481
- pub fn build_filter_ngram_name ( field : & TableField ) -> String {
482
- format ! ( "Ngram({})" , field. column_id( ) )
484
+ pub fn build_filter_ngram_name ( field : & TableField , gram_size : usize , bitmap_size : usize ) -> String {
485
+ format ! ( "Ngram({}, gram_size: {gram_size}, bitmap_size: {bitmap_size} )" , field. column_id( ) )
483
486
}
484
487
485
488
fn find (
486
489
& self ,
487
- filter_column : & str ,
490
+ table_field : & TableField ,
488
491
target : & Scalar ,
489
492
ty : & DataType ,
490
493
scalar_map : & HashMap < Scalar , u64 > ,
491
494
ngram_args : & [ NgramArgs ] ,
492
- is_ngram : bool ,
495
+ is_like : bool ,
493
496
) -> Result < FilterEvalResult > {
494
- if !self . filter_schema . has_field ( filter_column)
497
+ let ( filter_column, ngram_arg) = if is_like {
498
+ let Some ( ngram_arg) = ngram_args. iter ( ) . find ( |arg| & arg. field == table_field) else {
499
+ // The column doesn't have an ngram argument.
500
+ return Ok ( FilterEvalResult :: Uncertain ) ;
501
+ } ;
502
+ ( BloomIndex :: build_filter_ngram_name ( table_field, ngram_arg. gram_size , ngram_arg. bitmap_size ) , Some ( ngram_arg) )
503
+ } else {
504
+ ( BloomIndex :: build_filter_bloom_name ( self . version , table_field) ?, None )
505
+ } ;
506
+
507
+ if !self . filter_schema . has_field ( & filter_column)
495
508
|| !Xor8Filter :: supported_type ( ty)
496
509
|| target. is_null ( )
497
510
{
498
511
// The column doesn't have a filter.
499
512
return Ok ( FilterEvalResult :: Uncertain ) ;
500
513
}
501
514
502
- let idx = self . filter_schema . index_of ( filter_column) ?;
515
+ let idx = self . filter_schema . index_of ( & filter_column) ?;
503
516
let filter = & self . filters [ idx] ;
504
517
505
518
let contains = if self . version == V2BloomBlock :: VERSION {
506
519
let data_value = scalar_to_datavalue ( target) ;
507
520
filter. contains ( & data_value)
508
- } else if is_ngram && !ngram_args. is_empty ( ) {
509
- // NgramFilter is always placed after BloomFilter
510
- let offset = self . filters . len ( ) - ngram_args. len ( ) ;
511
- let arg = & ngram_args[ idx - offset] ;
521
+ } else if let Some ( arg) = ngram_arg {
512
522
let Some ( words) = BloomIndex :: calculate_ngram_nullable_column (
513
523
Value :: Scalar ( target. clone ( ) ) ,
514
524
arg. gram_size ,
515
- |text| text. to_string ( ) ,
525
+ arg. bitmap_size ,
526
+ |text, _| text. to_string ( ) ,
516
527
)
517
528
. next ( ) else {
518
529
return Ok ( FilterEvalResult :: Uncertain ) ;
519
530
} ;
531
+ if words. is_empty ( ) {
532
+ return Ok ( FilterEvalResult :: Uncertain ) ;
533
+ }
520
534
!words. into_iter ( ) . any ( |word| {
521
535
scalar_map
522
536
. get ( & Scalar :: String ( word) )
@@ -563,7 +577,8 @@ struct ColumnFilterBuilder {
563
577
field : TableField ,
564
578
is_ngram : bool ,
565
579
gram_size : usize ,
566
- builder : FilterImplBuilder ,
580
+ bitmap_size : usize ,
581
+ builder : Xor8Builder ,
567
582
}
568
583
569
584
#[ derive( Clone ) ]
@@ -591,6 +606,10 @@ impl NgramArgs {
591
606
pub fn gram_size ( & self ) -> usize {
592
607
self . gram_size
593
608
}
609
+
610
+ pub fn bitmap_size ( & self ) -> usize {
611
+ self . bitmap_size
612
+ }
594
613
}
595
614
596
615
impl BloomIndexBuilder {
@@ -606,7 +625,8 @@ impl BloomIndexBuilder {
606
625
field : field. clone ( ) ,
607
626
is_ngram : false ,
608
627
gram_size : 0 ,
609
- builder : FilterImplBuilder :: Xor ( Xor8Builder :: create ( ) ) ,
628
+ bitmap_size : 0 ,
629
+ builder : Xor8Builder :: create ( ) ,
610
630
} ) ;
611
631
}
612
632
for arg in ngram_args. iter ( ) {
@@ -615,7 +635,8 @@ impl BloomIndexBuilder {
615
635
field : arg. field . clone ( ) ,
616
636
is_ngram : true ,
617
637
gram_size : arg. gram_size ,
618
- builder : FilterImplBuilder :: Ngram ( BloomBuilder :: create ( arg. bitmap_size ) ?) ,
638
+ bitmap_size : arg. bitmap_size ,
639
+ builder : Xor8Builder :: create ( ) ,
619
640
} ) ;
620
641
}
621
642
@@ -737,7 +758,7 @@ impl BloomIndexBuilder {
737
758
}
738
759
for ( index, index_column) in ngram_iter {
739
760
let field_type = & block. get_by_offset ( index_column. index ) . data_type ;
740
- if !BloomFilter :: supported_type ( field_type) {
761
+ if !Xor8Filter :: supported_type ( field_type) {
741
762
keys_to_remove. push ( index) ;
742
763
continue ;
743
764
}
@@ -753,7 +774,8 @@ impl BloomIndexBuilder {
753
774
for digests in BloomIndex :: calculate_ngram_nullable_column (
754
775
Value :: Column ( column) ,
755
776
index_column. gram_size ,
756
- BloomIndex :: ngram_hash,
777
+ index_column. bitmap_size ,
778
+ BloomIndex :: ngram_hash
757
779
) {
758
780
if digests. is_empty ( ) {
759
781
continue ;
@@ -774,7 +796,7 @@ impl BloomIndexBuilder {
774
796
for column in self . columns . iter_mut ( ) {
775
797
let filter = column. builder . build ( ) ?;
776
798
let filter_name = if column. is_ngram {
777
- BloomIndex :: build_filter_ngram_name ( & column. field )
799
+ BloomIndex :: build_filter_ngram_name ( & column. field , column . gram_size , column . bitmap_size )
778
800
} else {
779
801
if let Some ( len) = filter. len ( ) {
780
802
if !matches ! (
@@ -844,19 +866,32 @@ where T: EqVisitor
844
866
..
845
867
} ) ] => {
846
868
if let Some ( pattern) = scalar. as_string ( ) {
847
- if let LikePattern :: SurroundByPercent ( v) =
848
- generate_like_pattern ( pattern. as_bytes ( ) , 1 )
849
- {
850
- let string = String :: from_utf8_lossy ( v. needle ( ) ) . to_string ( ) ;
851
-
852
- result = self . 0 . enter_target (
853
- * span,
854
- id,
855
- & Scalar :: String ( string) ,
856
- column_type,
857
- return_type,
858
- true ,
859
- ) ?;
869
+ match generate_like_pattern ( pattern. as_bytes ( ) , 1 ) {
870
+ LikePattern :: StartOfPercent ( v) | LikePattern :: EndOfPercent ( v) => {
871
+ let string = String :: from_utf8_lossy ( v. as_ref ( ) ) . to_string ( ) ;
872
+
873
+ result = self . 0 . enter_target (
874
+ * span,
875
+ id,
876
+ & Scalar :: String ( string) ,
877
+ column_type,
878
+ return_type,
879
+ true ,
880
+ ) ?;
881
+ }
882
+ LikePattern :: SurroundByPercent ( v) => {
883
+ let string = String :: from_utf8_lossy ( v. needle ( ) ) . to_string ( ) ;
884
+
885
+ result = self . 0 . enter_target (
886
+ * span,
887
+ id,
888
+ & Scalar :: String ( string) ,
889
+ column_type,
890
+ return_type,
891
+ true ,
892
+ ) ?;
893
+ }
894
+ _ => ( ) ,
860
895
}
861
896
}
862
897
}
@@ -1099,16 +1134,11 @@ impl EqVisitor for RewriteVisitor<'_> {
1099
1134
is_like : bool ,
1100
1135
) -> ResultRewrite {
1101
1136
let table_field = self . data_schema . field_with_name ( col_name) ?;
1102
- let filter_column = if is_like {
1103
- BloomIndex :: build_filter_ngram_name ( table_field)
1104
- } else {
1105
- BloomIndex :: build_filter_bloom_name ( self . index . version , table_field) ?
1106
- } ;
1107
1137
1108
1138
// If the column doesn't contain the constant,
1109
1139
// we rewrite the expression to a new column with `false` domain.
1110
1140
if self . index . find (
1111
- & filter_column ,
1141
+ table_field ,
1112
1142
scalar,
1113
1143
ty,
1114
1144
self . scalar_map ,
@@ -1251,17 +1281,15 @@ impl EqVisitor for ShortListVisitor {
1251
1281
) -> ResultRewrite {
1252
1282
if is_like {
1253
1283
if let Some ( ( i, v) ) = Self :: found_field ( & self . ngram_fields , col_name) {
1254
- if !scalar. is_null ( ) && BloomFilter :: supported_type ( ty) {
1284
+ if !scalar. is_null ( ) && Xor8Filter :: supported_type ( ty) {
1255
1285
self . ngram_founds . push ( v. clone ( ) ) ;
1256
1286
self . ngram_scalars . push ( ( i, scalar. clone ( ) ) ) ;
1257
1287
}
1258
1288
}
1259
- } else {
1260
- if let Some ( ( i, v) ) = Self :: found_field ( & self . bloom_fields , col_name) {
1261
- if !scalar. is_null ( ) && Xor8Filter :: supported_type ( ty) {
1262
- self . bloom_founds . push ( v. clone ( ) ) ;
1263
- self . bloom_scalars . push ( ( i, scalar. clone ( ) , ty. clone ( ) ) ) ;
1264
- }
1289
+ } else if let Some ( ( i, v) ) = Self :: found_field ( & self . bloom_fields , col_name) {
1290
+ if !scalar. is_null ( ) && Xor8Filter :: supported_type ( ty) {
1291
+ self . bloom_founds . push ( v. clone ( ) ) ;
1292
+ self . bloom_scalars . push ( ( i, scalar. clone ( ) , ty. clone ( ) ) ) ;
1265
1293
}
1266
1294
}
1267
1295
Ok ( ControlFlow :: Break ( None ) )
@@ -1292,7 +1320,7 @@ impl EqVisitor for ShortListVisitor {
1292
1320
let Some ( ( i, field) ) = Self :: found_field ( & self . ngram_fields , id) else {
1293
1321
return Ok ( ControlFlow :: Break ( None ) ) ;
1294
1322
} ;
1295
- if !BloomFilter :: supported_type ( src_type) || !is_injective_cast ( src_type, dest_type) {
1323
+ if !Xor8Filter :: supported_type ( src_type) || !is_injective_cast ( src_type, dest_type) {
1296
1324
return Ok ( ControlFlow :: Break ( None ) ) ;
1297
1325
}
1298
1326
0 commit comments