@@ -54,7 +54,7 @@ static const auto LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY =
54
54
" \x04\x07\x08\x04 " sv;
55
55
56
56
template <typename Generator>
57
- auto build_vector (size_t items, Generator && gen) {
57
+ auto GenerateVector (size_t items, Generator && gen) {
58
58
std::vector<std::result_of_t <Generator (size_t )>> result;
59
59
result.reserve (items);
60
60
for (size_t i = 0 ; i < items; ++i) {
@@ -64,7 +64,7 @@ auto build_vector(size_t items, Generator && gen) {
64
64
return result;
65
65
}
66
66
67
- std::string foobar (size_t i) {
67
+ std::string FooBarSeq (size_t i) {
68
68
std::string result;
69
69
if (i % 3 == 0 )
70
70
result += " Foo" ;
@@ -76,6 +76,34 @@ std::string foobar(size_t i) {
76
76
return result;
77
77
}
78
78
79
+ template <typename T, typename U = T>
80
+ auto SameValueSeq (const U & value) {
81
+ return [&value](size_t ) -> T {
82
+ return value;
83
+ };
84
+ }
85
+
86
+ template <typename ResultType, typename Generator1, typename Generator2>
87
+ auto AlternateGenerators (Generator1 && gen1, Generator2 && gen2) {
88
+ return [&gen1, &gen2](size_t i) -> ResultType {
89
+ if (i % 2 == 0 )
90
+ return gen1 (i/2 );
91
+ else
92
+ return gen2 (i/2 );
93
+ };
94
+ }
95
+
96
+ template <typename T>
97
+ std::vector<T> ConcatSequences (std::vector<T> && vec1, std::vector<T> && vec2)
98
+ {
99
+ std::vector<T> result (vec1);
100
+
101
+ result.reserve (vec1.size () + vec2.size ());
102
+ result.insert (result.end (), vec2.begin (), vec2.end ());
103
+
104
+ return result;
105
+ }
106
+
79
107
static std::vector<Int64> MakeDateTime64s () {
80
108
static const auto seconds_multiplier = 1'000'000 ;
81
109
static const auto year = 86400ull * 365 * seconds_multiplier; // ~approx, but this doesn't matter here.
@@ -85,7 +113,7 @@ static std::vector<Int64> MakeDateTime64s() {
85
113
// Please note there are values outside of DateTime (32-bit) range that might
86
114
// not have correct string representation in CH yet,
87
115
// but still are supported as Int64 values.
88
- return build_vector (200 ,
116
+ return GenerateVector (200 ,
89
117
[] (size_t i )-> Int64 {
90
118
return (i - 100 ) * year * 2 + (i * 10 ) * seconds_multiplier + i;
91
119
});
@@ -375,26 +403,26 @@ TEST(ColumnsCase, UUIDSlice) {
375
403
ASSERT_EQ (sub->At (1 ), UInt128 (0x3507213c178649f9llu, 0x9faf035d662f60aellu));
376
404
}
377
405
378
- TEST (ColumnsCase, LowCardinalityWrapperString_Append_and_Read ) {
406
+ TEST (ColumnsCase, ColumnLowCardinalityString_Append_and_Read ) {
379
407
const size_t items_count = 11 ;
380
408
ColumnLowCardinalityT<ColumnString> col;
381
- for (const auto & item : build_vector (items_count, &foobar )) {
409
+ for (const auto & item : GenerateVector (items_count, &FooBarSeq )) {
382
410
col.Append (item);
383
411
}
384
412
385
413
ASSERT_EQ (col.Size (), items_count);
386
414
ASSERT_EQ (col.GetDictionarySize (), 8u + 1 ); // 8 unique items from sequence + 1 null-item
387
415
388
416
for (size_t i = 0 ; i < items_count; ++i) {
389
- ASSERT_EQ (col.At (i), foobar (i)) << " at pos: " << i;
390
- ASSERT_EQ (col[i], foobar (i)) << " at pos: " << i;
417
+ ASSERT_EQ (col.At (i), FooBarSeq (i)) << " at pos: " << i;
418
+ ASSERT_EQ (col[i], FooBarSeq (i)) << " at pos: " << i;
391
419
}
392
420
}
393
421
394
- TEST (ColumnsCase, ColumnLowCardinalityT_String_Clear_and_Append ) {
422
+ TEST (ColumnsCase, ColumnLowCardinalityString_Clear_and_Append ) {
395
423
const size_t items_count = 11 ;
396
424
ColumnLowCardinalityT<ColumnString> col;
397
- for (const auto & item : build_vector (items_count, &foobar ))
425
+ for (const auto & item : GenerateVector (items_count, &FooBarSeq ))
398
426
{
399
427
col.Append (item);
400
428
}
@@ -403,7 +431,7 @@ TEST(ColumnsCase, ColumnLowCardinalityT_String_Clear_and_Append) {
403
431
ASSERT_EQ (col.Size (), 0u );
404
432
ASSERT_EQ (col.GetDictionarySize (), 1u ); // null-item
405
433
406
- for (const auto & item : build_vector (items_count, &foobar ))
434
+ for (const auto & item : GenerateVector (items_count, &FooBarSeq ))
407
435
{
408
436
col.Append (item);
409
437
}
@@ -412,7 +440,7 @@ TEST(ColumnsCase, ColumnLowCardinalityT_String_Clear_and_Append) {
412
440
ASSERT_EQ (col.GetDictionarySize (), 8u + 1 ); // 8 unique items from sequence + 1 null-item
413
441
}
414
442
415
- TEST (ColumnsCase, LowCardinalityString_Load ) {
443
+ TEST (ColumnsCase, ColumnLowCardinalityString_Load ) {
416
444
const size_t items_count = 10 ;
417
445
ColumnLowCardinalityT<ColumnString> col;
418
446
@@ -423,25 +451,116 @@ TEST(ColumnsCase, LowCardinalityString_Load) {
423
451
EXPECT_TRUE (col.Load (&stream, items_count));
424
452
425
453
for (size_t i = 0 ; i < items_count; ++i) {
426
- EXPECT_EQ (col.At (i), foobar (i)) << " at pos: " << i;
454
+ EXPECT_EQ (col.At (i), FooBarSeq (i)) << " at pos: " << i;
427
455
}
428
456
}
429
457
430
- TEST (ColumnsCase, LowCardinalityString_Save) {
458
+ // This is temporary diabled since we are not 100% compatitable with ClickHouse
459
+ // on how we serailize LC columns, but we check interoperability in other tests (see client_ut.cpp)
460
+ TEST (ColumnsCase, DISABLED_ColumnLowCardinalityString_Save) {
431
461
const size_t items_count = 10 ;
432
462
ColumnLowCardinalityT<ColumnString> col;
433
- for (const auto & item : build_vector (items_count, &foobar )) {
463
+ for (const auto & item : GenerateVector (items_count, &FooBarSeq )) {
434
464
col.Append (item);
435
465
}
436
466
437
- const auto & data = LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY;
438
- ArrayInput buffer (data.data (), data.size ());
439
- CodedInputStream stream (&buffer);
467
+ ArrayOutput output (0 , 0 );
468
+ CodedOutputStream output_stream (&output);
440
469
441
- EXPECT_TRUE (col.Load (&stream, items_count));
470
+ const size_t expected_output_size = LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY.size ();
471
+ // Enough space to account for possible overflow from both right and left sides.
472
+ char buffer[expected_output_size * 10 ] = {' \0 ' };
473
+ const char margin_content[sizeof (buffer)] = {' \0 ' };
442
474
443
- for (size_t i = 0 ; i < items_count; ++i) {
444
- EXPECT_EQ (col.At (i), foobar (i)) << " at pos: " << i;
475
+ const size_t left_margin_size = 10 ;
476
+ const size_t right_margin_size = sizeof (buffer) - left_margin_size - expected_output_size;
477
+
478
+ // Since overflow from left side is less likely to happen, leave only tiny margin there.
479
+ auto write_pos = buffer + left_margin_size;
480
+ const auto left_margin = buffer;
481
+ const auto right_margin = write_pos + expected_output_size;
482
+
483
+ output.Reset (write_pos, expected_output_size);
484
+
485
+ EXPECT_NO_THROW (col.Save (&output_stream));
486
+
487
+ // Left margin should be blank
488
+ EXPECT_EQ (std::string_view (margin_content, left_margin_size), std::string_view (left_margin, left_margin_size));
489
+ // Right margin should be blank too
490
+ EXPECT_EQ (std::string_view (margin_content, right_margin_size), std::string_view (right_margin, right_margin_size));
491
+
492
+ // TODO: right now LC columns do not write indexes in the most compact way possible, so binary representation is a bit different
493
+ // (there might be other inconsistances too)
494
+ EXPECT_EQ (LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY, std::string_view (write_pos, expected_output_size));
495
+ }
496
+
497
+ TEST (ColumnsCase, ColumnLowCardinalityString_SaveAndLoad) {
498
+ // Verify that we can load binary representation back
499
+ ColumnLowCardinalityT<ColumnString> col;
500
+
501
+ const auto items = GenerateVector (10 , &FooBarSeq);
502
+ for (const auto & item : items) {
503
+ col.Append (item);
504
+ }
505
+
506
+ char buffer[256 ] = {' \0 ' }; // about 3 times more space than needed for this set of values.
507
+ {
508
+ ArrayOutput output (buffer, sizeof (buffer));
509
+ CodedOutputStream output_stream (&output);
510
+ EXPECT_NO_THROW (col.Save (&output_stream));
511
+ }
512
+
513
+ col.Clear ();
514
+
515
+ {
516
+ // Load the data back
517
+ ArrayInput input (buffer, sizeof (buffer));
518
+ CodedInputStream input_stream (&input);
519
+ EXPECT_TRUE (col.Load (&input_stream, items.size ()));
520
+ }
521
+
522
+ for (size_t i = 0 ; i < items.size (); ++i) {
523
+ EXPECT_EQ (col.At (i), items[i]) << " at pos: " << i;
524
+ }
525
+ }
526
+
527
+ TEST (ColumnsCase, ColumnLowCardinalityString_WithEmptyString_1) {
528
+ // Verify that when empty string is added to a LC column it can be retrieved back as empty string.
529
+ ColumnLowCardinalityT<ColumnString> col;
530
+ const auto values = GenerateVector (10 , AlternateGenerators<std::string>(SameValueSeq<std::string>(" " ), FooBarSeq));
531
+ for (const auto & item : values) {
532
+ col.Append (item);
533
+ }
534
+
535
+ for (size_t i = 0 ; i < values.size (); ++i) {
536
+ EXPECT_EQ (values[i], col.At (i)) << " at pos: " << i;
537
+ }
538
+ }
539
+
540
+ TEST (ColumnsCase, ColumnLowCardinalityString_WithEmptyString_2) {
541
+ // Verify that when empty string is added to a LC column it can be retrieved back as empty string.
542
+ // (Ver2): Make sure that outcome doesn't depend if empty values are on odd positions
543
+ ColumnLowCardinalityT<ColumnString> col;
544
+ const auto values = GenerateVector (10 , AlternateGenerators<std::string>(FooBarSeq, SameValueSeq<std::string>(" " )));
545
+ for (const auto & item : values) {
546
+ col.Append (item);
547
+ }
548
+
549
+ for (size_t i = 0 ; i < values.size (); ++i) {
550
+ EXPECT_EQ (values[i], col.At (i)) << " at pos: " << i;
551
+ }
552
+ }
553
+
554
+ TEST (ColumnsCase, ColumnLowCardinalityString_WithEmptyString_3) {
555
+ // When we have many leading empty strings and some non-empty values.
556
+ ColumnLowCardinalityT<ColumnString> col;
557
+ const auto values = ConcatSequences (GenerateVector (100 , SameValueSeq<std::string>(" " )), GenerateVector (5 , FooBarSeq));
558
+ for (const auto & item : values) {
559
+ col.Append (item);
560
+ }
561
+
562
+ for (size_t i = 0 ; i < values.size (); ++i) {
563
+ EXPECT_EQ (values[i], col.At (i)) << " at pos: " << i;
445
564
}
446
565
}
447
566
0 commit comments