@@ -25,6 +25,7 @@ use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
25
25
use arrow_data:: { ArrayData , ArrayDataBuilder , ByteView } ;
26
26
use arrow_schema:: { ArrowError , DataType } ;
27
27
use std:: any:: Any ;
28
+ use std:: collections:: { BTreeMap , BTreeSet } ;
28
29
use std:: fmt:: Debug ;
29
30
use std:: marker:: PhantomData ;
30
31
use std:: sync:: Arc ;
@@ -265,6 +266,51 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
265
266
phantom : Default :: default ( ) ,
266
267
}
267
268
}
269
+
270
+ /// Returns whether this array is a compact view
271
+ pub ( self ) fn is_compact_view ( & self ) -> bool {
272
+ todo ! ( )
273
+ }
274
+
275
+ /// Returns a compact version of this array
276
+ ///
277
+ /// # Compaction
278
+ ///
279
+ /// before compaction:
280
+ /// ```text
281
+ /// ┌──────┐
282
+ /// │......│
283
+ /// │......│
284
+ /// ┌────────────────────┐ ┌ ─ ─ ─ ▶ │Data1 │ Large buffer
285
+ /// │ View 1 │─ ─ ─ ─ │......│ with data that
286
+ /// ├────────────────────┤ │......│ is not referred
287
+ /// │ View 2 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
288
+ /// └────────────────────┘ │......│ View 2
289
+ /// │......│
290
+ /// 2 views, refer to │......│
291
+ /// small portions of a └──────┘
292
+ /// large buffer
293
+ /// ```
294
+ ///
295
+ /// after compaction:
296
+ ///
297
+ /// ```text
298
+ /// ┌────────────────────┐ ┌─────┐ After gc, only
299
+ /// │ View 1 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│ data that is
300
+ /// ├────────────────────┤ ┌ ─ ─ ─ ▶ │Data2│ pointed to by
301
+ /// │ View 2 │─ ─ ─ ─ └─────┘ the views is
302
+ /// └────────────────────┘ left
303
+ ///
304
+ ///
305
+ /// 2 views
306
+ /// ```
307
+ /// this method will compact the data buffers to only include the data
308
+ /// that is pointed to by the views
309
+ /// and return a new array with the compacted data buffers.
310
+ /// the original array will be left as is.
311
+ pub fn compact ( & self ) -> Self {
312
+ todo ! ( )
313
+ }
268
314
}
269
315
270
316
impl < T : ByteViewType + ?Sized > Debug for GenericByteViewArray < T > {
@@ -482,6 +528,60 @@ impl From<Vec<Option<String>>> for StringViewArray {
482
528
}
483
529
}
484
530
531
/// A helper struct that is used to check whether an array is a compact view.
///
/// Callers register every byte range that is referenced via [`accumulate`]
/// and then ask [`finish`] whether those ranges, taken together, cover the
/// whole `[0, length)` span without gaps.
///
/// # Note
///
/// The checker is lazy and does not check anything until `finish` is called.
///
/// This is based on the assumption that the array is most likely not compact,
/// so the entire array is likely to be scanned anyway. It is then better to do
/// the check once, rather than on every accumulate operation.
///
/// [`accumulate`]: CompactChecker::accumulate
/// [`finish`]: CompactChecker::finish
#[derive(Debug)]
struct CompactChecker {
    /// Total number of bytes that must be covered.
    length: usize,
    /// Maps an interval start to the furthest (exclusive) end seen for it.
    coverage: BTreeMap<usize, usize>,
}

impl CompactChecker {
    /// Creates a checker for a span of `length` bytes with nothing covered yet.
    pub fn new(length: usize) -> Self {
        Self {
            length,
            coverage: BTreeMap::new(),
        }
    }

    /// Accumulates a new covered interval `[offset, offset + length)`.
    ///
    /// Empty intervals are ignored. When several intervals share the same
    /// `offset`, only the one reaching furthest is kept.
    pub fn accumulate(&mut self, offset: usize, length: usize) {
        if length == 0 {
            return;
        }
        let end = offset + length;
        self.coverage
            .entry(offset)
            .and_modify(|furthest| *furthest = (*furthest).max(end))
            .or_insert(end);
    }

    /// Returns `true` if the accumulated intervals cover `[0, length)` exactly:
    /// no gaps, and nothing reaching past `length`.
    pub fn finish(&self) -> bool {
        // The map iterates in ascending start order, so a single sweep that
        // tracks the furthest covered position is enough to detect a gap.
        let mut covered_until = 0;
        for (&start, &end) in &self.coverage {
            if start > covered_until {
                return false;
            }
            // An interval nested inside an earlier, longer one must not pull
            // the frontier backwards, hence `max` rather than assignment.
            covered_until = covered_until.max(end);
        }

        covered_until == self.length
    }
}
584
+
485
585
#[ cfg( test) ]
486
586
mod tests {
487
587
use crate :: builder:: { BinaryViewBuilder , StringViewBuilder } ;
@@ -645,4 +745,64 @@ mod tests {
645
745
646
746
StringViewArray :: new ( views, buffers, None ) ;
647
747
}
748
+
749
#[test]
fn test_compact_checker() {
    use super::CompactChecker;

    // Feeds every `(offset, length)` span into a fresh checker sized for
    // `total` bytes and reports whether the checker considers it compact.
    let is_compact = |total: usize, spans: &[(usize, usize)]| -> bool {
        let mut checker = CompactChecker::new(total);
        for &(offset, length) in spans {
            checker.accumulate(offset, length);
        }
        checker.finish()
    };

    // single coverage, full
    assert!(is_compact(10, &[(0, 10)]));
    // single coverage, partial
    assert!(!is_compact(10, &[(0, 5)]));

    // multiple coverage, no overlapping, partial
    assert!(!is_compact(10, &[(0, 5), (5, 4)]));
    // multiple coverage, no overlapping, full
    assert!(is_compact(10, &[(0, 5), (5, 5)]));

    // multiple coverage, overlapping, partial
    assert!(!is_compact(10, &[(0, 5), (4, 5)]));
    // multiple coverage, overlapping, full
    assert!(is_compact(10, &[(0, 5), (4, 6)]));

    // multiple coverage, no overlapping, full, out of order
    assert!(is_compact(10, &[(4, 6), (0, 4)]));
    // multiple coverage, overlapping, full, out of order
    // (the spans share byte 4, unlike the non-overlapping case above)
    assert!(is_compact(10, &[(4, 6), (0, 5)]));

    // multiple coverage, no overlapping, full, containing null
    assert!(is_compact(10, &[(0, 5), (5, 0), (5, 5)]));
    // multiple coverage, overlapping, full, containing null
    assert!(is_compact(10, &[(0, 5), (5, 0), (4, 6), (5, 5)]));
}
648
808
}
0 commit comments