Skip to content

Commit 04a4dce

Browse files
authored
Merge pull request #53 from Enmk/fix_LowCardinality_Empty_value
Fix of LowCardinality bug: empty string converted to other value on INSERT
2 parents 077fe6f + 6691e63 commit 04a4dce

File tree

2 files changed

+147
-28
lines changed

2 files changed

+147
-28
lines changed

clickhouse/columns/lowcardinality.cpp

+8-8
Original file line numberDiff line numberDiff line change
@@ -125,18 +125,18 @@ ColumnLowCardinality::ColumnLowCardinality(ColumnRef dictionary_column)
125125
dictionary_column_(dictionary_column->Slice(0, 0)), // safe way to get an column of the same type.
126126
index_column_(std::make_shared<ColumnUInt32>())
127127
{
128-
if (dictionary_column->Size() != 0) {
129-
AppendNullItemToEmptyColumn();
128+
AppendNullItemToEmptyColumn();
130129

130+
if (dictionary_column->Size() != 0) {
131131
// Add values, updating index_column_ and unique_items_map_.
132+
133+
// TODO: it would be possible to eliminate copying
134+
// by adding InsertUnsafe(pos, ItemView) method to a Column
135+
// (to insert null-item at pos 0),
136+
// but that is too much work for now.
132137
for (size_t i = 0; i < dictionary_column->Size(); ++i) {
133-
// TODO: it would be possible to eliminate copying
134-
// by adding InsertUnsafe(pos, ItemView) method to a Column,
135-
// but that is too much work for now.
136138
AppendUnsafe(dictionary_column->GetItem(i));
137139
}
138-
} else {
139-
AppendNullItemToEmptyColumn();
140140
}
141141
}
142142

@@ -365,7 +365,7 @@ void ColumnLowCardinality::AppendNullItemToEmptyColumn()
365365

366366
const auto null_item = GetNullItemForDictionary(dictionary_column_);
367367
AppendToDictionary(*dictionary_column_, null_item);
368-
unique_items_map_.emplace(computeHashKey(null_item), dictionary_column_->Size());
368+
unique_items_map_.emplace(computeHashKey(null_item), 0);
369369
}
370370

371371
size_t ColumnLowCardinality::GetDictionarySize() const {

ut/columns_ut.cpp

+139-20
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ static const auto LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY =
5454
"\x04\x07\x08\x04"sv;
5555

5656
template <typename Generator>
57-
auto build_vector(size_t items, Generator && gen) {
57+
auto GenerateVector(size_t items, Generator && gen) {
5858
std::vector<std::result_of_t<Generator(size_t)>> result;
5959
result.reserve(items);
6060
for (size_t i = 0; i < items; ++i) {
@@ -64,7 +64,7 @@ auto build_vector(size_t items, Generator && gen) {
6464
return result;
6565
}
6666

67-
std::string foobar(size_t i) {
67+
std::string FooBarSeq(size_t i) {
6868
std::string result;
6969
if (i % 3 == 0)
7070
result += "Foo";
@@ -76,6 +76,34 @@ std::string foobar(size_t i) {
7676
return result;
7777
}
7878

79+
template <typename T, typename U = T>
80+
auto SameValueSeq(const U & value) {
81+
return [&value](size_t) -> T {
82+
return value;
83+
};
84+
}
85+
86+
template <typename ResultType, typename Generator1, typename Generator2>
87+
auto AlternateGenerators(Generator1 && gen1, Generator2 && gen2) {
88+
return [&gen1, &gen2](size_t i) -> ResultType {
89+
if (i % 2 == 0)
90+
return gen1(i/2);
91+
else
92+
return gen2(i/2);
93+
};
94+
}
95+
96+
template <typename T>
97+
std::vector<T> ConcatSequences(std::vector<T> && vec1, std::vector<T> && vec2)
98+
{
99+
std::vector<T> result(vec1);
100+
101+
result.reserve(vec1.size() + vec2.size());
102+
result.insert(result.end(), vec2.begin(), vec2.end());
103+
104+
return result;
105+
}
106+
79107
static std::vector<Int64> MakeDateTime64s() {
80108
static const auto seconds_multiplier = 1'000'000;
81109
static const auto year = 86400ull * 365 * seconds_multiplier; // ~approx, but this doesn't matter here.
@@ -85,7 +113,7 @@ static std::vector<Int64> MakeDateTime64s() {
85113
// Please note there are values outside of DateTime (32-bit) range that might
86114
// not have correct string representation in CH yet,
87115
// but still are supported as Int64 values.
88-
return build_vector(200,
116+
return GenerateVector(200,
89117
[] (size_t i )-> Int64 {
90118
return (i - 100) * year * 2 + (i * 10) * seconds_multiplier + i;
91119
});
@@ -375,26 +403,26 @@ TEST(ColumnsCase, UUIDSlice) {
375403
ASSERT_EQ(sub->At(1), UInt128(0x3507213c178649f9llu, 0x9faf035d662f60aellu));
376404
}
377405

378-
TEST(ColumnsCase, LowCardinalityWrapperString_Append_and_Read) {
406+
TEST(ColumnsCase, ColumnLowCardinalityString_Append_and_Read) {
379407
const size_t items_count = 11;
380408
ColumnLowCardinalityT<ColumnString> col;
381-
for (const auto & item : build_vector(items_count, &foobar)) {
409+
for (const auto & item : GenerateVector(items_count, &FooBarSeq)) {
382410
col.Append(item);
383411
}
384412

385413
ASSERT_EQ(col.Size(), items_count);
386414
ASSERT_EQ(col.GetDictionarySize(), 8u + 1); // 8 unique items from sequence + 1 null-item
387415

388416
for (size_t i = 0; i < items_count; ++i) {
389-
ASSERT_EQ(col.At(i), foobar(i)) << " at pos: " << i;
390-
ASSERT_EQ(col[i], foobar(i)) << " at pos: " << i;
417+
ASSERT_EQ(col.At(i), FooBarSeq(i)) << " at pos: " << i;
418+
ASSERT_EQ(col[i], FooBarSeq(i)) << " at pos: " << i;
391419
}
392420
}
393421

394-
TEST(ColumnsCase, ColumnLowCardinalityT_String_Clear_and_Append) {
422+
TEST(ColumnsCase, ColumnLowCardinalityString_Clear_and_Append) {
395423
const size_t items_count = 11;
396424
ColumnLowCardinalityT<ColumnString> col;
397-
for (const auto & item : build_vector(items_count, &foobar))
425+
for (const auto & item : GenerateVector(items_count, &FooBarSeq))
398426
{
399427
col.Append(item);
400428
}
@@ -403,7 +431,7 @@ TEST(ColumnsCase, ColumnLowCardinalityT_String_Clear_and_Append) {
403431
ASSERT_EQ(col.Size(), 0u);
404432
ASSERT_EQ(col.GetDictionarySize(), 1u); // null-item
405433

406-
for (const auto & item : build_vector(items_count, &foobar))
434+
for (const auto & item : GenerateVector(items_count, &FooBarSeq))
407435
{
408436
col.Append(item);
409437
}
@@ -412,7 +440,7 @@ TEST(ColumnsCase, ColumnLowCardinalityT_String_Clear_and_Append) {
412440
ASSERT_EQ(col.GetDictionarySize(), 8u + 1); // 8 unique items from sequence + 1 null-item
413441
}
414442

415-
TEST(ColumnsCase, LowCardinalityString_Load) {
443+
TEST(ColumnsCase, ColumnLowCardinalityString_Load) {
416444
const size_t items_count = 10;
417445
ColumnLowCardinalityT<ColumnString> col;
418446

@@ -423,25 +451,116 @@ TEST(ColumnsCase, LowCardinalityString_Load) {
423451
EXPECT_TRUE(col.Load(&stream, items_count));
424452

425453
for (size_t i = 0; i < items_count; ++i) {
426-
EXPECT_EQ(col.At(i), foobar(i)) << " at pos: " << i;
454+
EXPECT_EQ(col.At(i), FooBarSeq(i)) << " at pos: " << i;
427455
}
428456
}
429457

430-
TEST(ColumnsCase, LowCardinalityString_Save) {
458+
// This is temporary diabled since we are not 100% compatitable with ClickHouse
459+
// on how we serailize LC columns, but we check interoperability in other tests (see client_ut.cpp)
460+
TEST(ColumnsCase, DISABLED_ColumnLowCardinalityString_Save) {
431461
const size_t items_count = 10;
432462
ColumnLowCardinalityT<ColumnString> col;
433-
for (const auto & item : build_vector(items_count, &foobar)) {
463+
for (const auto & item : GenerateVector(items_count, &FooBarSeq)) {
434464
col.Append(item);
435465
}
436466

437-
const auto & data = LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY;
438-
ArrayInput buffer(data.data(), data.size());
439-
CodedInputStream stream(&buffer);
467+
ArrayOutput output(0, 0);
468+
CodedOutputStream output_stream(&output);
440469

441-
EXPECT_TRUE(col.Load(&stream, items_count));
470+
const size_t expected_output_size = LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY.size();
471+
// Enough space to account for possible overflow from both right and left sides.
472+
char buffer[expected_output_size * 10] = {'\0'};
473+
const char margin_content[sizeof(buffer)] = {'\0'};
442474

443-
for (size_t i = 0; i < items_count; ++i) {
444-
EXPECT_EQ(col.At(i), foobar(i)) << " at pos: " << i;
475+
const size_t left_margin_size = 10;
476+
const size_t right_margin_size = sizeof(buffer) - left_margin_size - expected_output_size;
477+
478+
// Since overflow from left side is less likely to happen, leave only tiny margin there.
479+
auto write_pos = buffer + left_margin_size;
480+
const auto left_margin = buffer;
481+
const auto right_margin = write_pos + expected_output_size;
482+
483+
output.Reset(write_pos, expected_output_size);
484+
485+
EXPECT_NO_THROW(col.Save(&output_stream));
486+
487+
// Left margin should be blank
488+
EXPECT_EQ(std::string_view(margin_content, left_margin_size), std::string_view(left_margin, left_margin_size));
489+
// Right margin should be blank too
490+
EXPECT_EQ(std::string_view(margin_content, right_margin_size), std::string_view(right_margin, right_margin_size));
491+
492+
// TODO: right now LC columns do not write indexes in the most compact way possible, so binary representation is a bit different
493+
// (there might be other inconsistances too)
494+
EXPECT_EQ(LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY, std::string_view(write_pos, expected_output_size));
495+
}
496+
497+
TEST(ColumnsCase, ColumnLowCardinalityString_SaveAndLoad) {
498+
// Verify that we can load binary representation back
499+
ColumnLowCardinalityT<ColumnString> col;
500+
501+
const auto items = GenerateVector(10, &FooBarSeq);
502+
for (const auto & item : items) {
503+
col.Append(item);
504+
}
505+
506+
char buffer[256] = {'\0'}; // about 3 times more space than needed for this set of values.
507+
{
508+
ArrayOutput output(buffer, sizeof(buffer));
509+
CodedOutputStream output_stream(&output);
510+
EXPECT_NO_THROW(col.Save(&output_stream));
511+
}
512+
513+
col.Clear();
514+
515+
{
516+
// Load the data back
517+
ArrayInput input(buffer, sizeof(buffer));
518+
CodedInputStream input_stream(&input);
519+
EXPECT_TRUE(col.Load(&input_stream, items.size()));
520+
}
521+
522+
for (size_t i = 0; i < items.size(); ++i) {
523+
EXPECT_EQ(col.At(i), items[i]) << " at pos: " << i;
524+
}
525+
}
526+
527+
TEST(ColumnsCase, ColumnLowCardinalityString_WithEmptyString_1) {
528+
// Verify that when empty string is added to a LC column it can be retrieved back as empty string.
529+
ColumnLowCardinalityT<ColumnString> col;
530+
const auto values = GenerateVector(10, AlternateGenerators<std::string>(SameValueSeq<std::string>(""), FooBarSeq));
531+
for (const auto & item : values) {
532+
col.Append(item);
533+
}
534+
535+
for (size_t i = 0; i < values.size(); ++i) {
536+
EXPECT_EQ(values[i], col.At(i)) << " at pos: " << i;
537+
}
538+
}
539+
540+
TEST(ColumnsCase, ColumnLowCardinalityString_WithEmptyString_2) {
541+
// Verify that when empty string is added to a LC column it can be retrieved back as empty string.
542+
// (Ver2): Make sure that outcome doesn't depend if empty values are on odd positions
543+
ColumnLowCardinalityT<ColumnString> col;
544+
const auto values = GenerateVector(10, AlternateGenerators<std::string>(FooBarSeq, SameValueSeq<std::string>("")));
545+
for (const auto & item : values) {
546+
col.Append(item);
547+
}
548+
549+
for (size_t i = 0; i < values.size(); ++i) {
550+
EXPECT_EQ(values[i], col.At(i)) << " at pos: " << i;
551+
}
552+
}
553+
554+
TEST(ColumnsCase, ColumnLowCardinalityString_WithEmptyString_3) {
555+
// When we have many leading empty strings and some non-empty values.
556+
ColumnLowCardinalityT<ColumnString> col;
557+
const auto values = ConcatSequences(GenerateVector(100, SameValueSeq<std::string>("")), GenerateVector(5, FooBarSeq));
558+
for (const auto & item : values) {
559+
col.Append(item);
560+
}
561+
562+
for (size_t i = 0; i < values.size(); ++i) {
563+
EXPECT_EQ(values[i], col.At(i)) << " at pos: " << i;
445564
}
446565
}
447566

0 commit comments

Comments
 (0)