Skip to content

Commit 25197a6

Browse files
committed
feat: support gc for view_arrays
part 1: implement checker to check if current buffer is compact Signed-off-by: 蔡略 <[email protected]>
1 parent 6348dc3 commit 25197a6

File tree

1 file changed

+160
-0
lines changed

1 file changed

+160
-0
lines changed

arrow-array/src/array/byte_view_array.rs

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
2525
use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
2626
use arrow_schema::{ArrowError, DataType};
2727
use std::any::Any;
28+
use std::collections::{BTreeMap, BTreeSet};
2829
use std::fmt::Debug;
2930
use std::marker::PhantomData;
3031
use std::sync::Arc;
@@ -265,6 +266,51 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
265266
phantom: Default::default(),
266267
}
267268
}
269+
270+
/// Returns whether this array is a compact view
271+
pub(self) fn is_compact_view(&self) -> bool {
272+
todo!()
273+
}
274+
275+
/// Returns a compact version of this array
276+
///
277+
/// # Compaction
278+
///
279+
/// before compaction:
280+
/// ```text
281+
/// ┌──────┐
282+
/// │......│
283+
/// │......│
284+
/// ┌────────────────────┐ ┌ ─ ─ ─ ▶ │Data1 │ Large buffer
285+
/// │ View 1 │─ ─ ─ ─ │......│ with data that
286+
/// ├────────────────────┤ │......│ is not referred
287+
/// │ View 2 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
288+
/// └────────────────────┘ │......│ View 2
289+
/// │......│
290+
/// 2 views, refer to │......│
291+
/// small portions of a └──────┘
292+
/// large buffer
293+
/// ```
294+
///
295+
/// after compaction:
296+
///
297+
/// ```text
298+
/// ┌────────────────────┐ ┌─────┐ After gc, only
299+
/// │ View 1 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│ data that is
300+
/// ├────────────────────┤ ┌ ─ ─ ─ ▶ │Data2│ pointed to by
301+
/// │ View 2 │─ ─ ─ ─ └─────┘ the views is
302+
/// └────────────────────┘ left
303+
///
304+
///
305+
/// 2 views
306+
/// ```
307+
/// this method will compact the data buffers to only include the data
308+
/// that is pointed to by the views
309+
/// and return a new array with the compacted data buffers.
310+
/// the original array will be left as is.
311+
pub fn compact(&self) -> Self {
312+
todo!()
313+
}
268314
}
269315

270316
impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> {
@@ -482,6 +528,60 @@ impl From<Vec<Option<String>>> for StringViewArray {
482528
}
483529
}
484530

531+
/// A helper struct used to check whether a set of accumulated intervals
/// fully covers the range `0..length`, i.e. whether the array is a compact
/// view.
///
/// # Note
///
/// The checker is lazy: [`accumulate`](Self::accumulate) only records the
/// interval, and nothing is checked until [`finish`](Self::finish) is called.
///
/// This is based on the assumption that the array will most likely not be
/// compact, so the entire array is likely to be scanned anyway. It is then
/// better to do the check once at the end, rather than on every accumulate
/// operation.
struct CompactChecker {
    /// Total length that has to be covered for `finish` to return `true`.
    length: usize,
    /// Recorded intervals: start offset -> largest exclusive end offset seen
    /// for that start. The `BTreeMap` keeps them ordered by start offset.
    coverage: BTreeMap<usize, usize>,
}

impl CompactChecker {
    /// Creates a checker for a range of `length` with no coverage recorded.
    pub fn new(length: usize) -> Self {
        Self {
            length,
            coverage: BTreeMap::new(),
        }
    }

    /// Accumulate a new covered interval `offset..offset + length` to the checker.
    ///
    /// Zero-length intervals cover nothing and are ignored.
    pub fn accumulate(&mut self, offset: usize, length: usize) {
        if length == 0 {
            return;
        }
        let end = offset + length;
        // Several intervals may share a start offset; only the longest matters.
        self.coverage
            .entry(offset)
            .and_modify(|known_end| *known_end = (*known_end).max(end))
            .or_insert(end);
    }

    /// Check if the checker is fully covered.
    ///
    /// Returns `true` when the accumulated intervals leave no gap, starting
    /// from offset 0, and the furthest covered end equals the checker's
    /// length.
    pub fn finish(&self) -> bool {
        // Furthest exclusive end covered so far; intervals arrive sorted by start.
        let mut last_end = 0;
        for (&start, &end) in &self.coverage {
            if start > last_end {
                // Nothing recorded so far reaches this interval: there is a gap.
                return false;
            }
            // An interval nested inside an earlier, longer one must not
            // shrink the covered range, so keep the maximum end.
            last_end = last_end.max(end);
        }

        last_end == self.length
    }
}
584+
485585
#[cfg(test)]
486586
mod tests {
487587
use crate::builder::{BinaryViewBuilder, StringViewBuilder};
@@ -645,4 +745,64 @@ mod tests {
645745

646746
StringViewArray::new(views, buffers, None);
647747
}
748+
749+
#[test]
fn test_compact_checker() {
    use super::CompactChecker;

    // Feeds every `(offset, length)` interval into a fresh checker that must
    // cover `0..length`, and reports whether it ends up fully covered.
    let is_compact = |length: usize, intervals: &[(usize, usize)]| -> bool {
        let mut checker = CompactChecker::new(length);
        for &(offset, len) in intervals {
            checker.accumulate(offset, len);
        }
        checker.finish()
    };

    // single coverage, full
    assert!(is_compact(10, &[(0, 10)]));
    // single coverage, partial
    assert!(!is_compact(10, &[(0, 5)]));

    // multiple coverage, no overlapping, partial
    assert!(!is_compact(10, &[(0, 5), (5, 4)]));
    // multiple coverage, no overlapping, full
    assert!(is_compact(10, &[(0, 5), (5, 5)]));

    // multiple coverage, overlapping, partial
    assert!(!is_compact(10, &[(0, 5), (4, 5)]));
    // multiple coverage, overlapping, full
    assert!(is_compact(10, &[(0, 5), (4, 6)]));

    // multiple coverage, no overlapping, full, out of order
    assert!(is_compact(10, &[(4, 6), (0, 4)]));
    // multiple coverage, overlapping, full, out of order
    assert!(is_compact(10, &[(4, 6), (0, 5)]));

    // multiple coverage, no overlapping, full, containing null
    assert!(is_compact(10, &[(0, 5), (5, 0), (5, 5)]));
    // multiple coverage, overlapping, full, containing null
    assert!(is_compact(10, &[(0, 5), (5, 0), (4, 6), (5, 5)]));
}
648808
}

0 commit comments

Comments
 (0)