Skip to content

Commit fcb50cd

Browse files
committed
Squashed into single commit relative to next-major
1 parent a6bb5e9 commit fcb50cd

31 files changed

+1725
-57
lines changed

src/realm/CMakeLists.txt

+4
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ set(REALM_SOURCES
6262
table.cpp
6363
table_ref.cpp
6464
obj_list.cpp
65+
string_interner.cpp
66+
string_compressor.cpp
6567
object_id.cpp
6668
table_view.cpp
6769
tokenizer.cpp
@@ -178,6 +180,8 @@ set(REALM_INSTALL_HEADERS
178180
null.hpp
179181
obj.hpp
180182
obj_list.hpp
183+
string_interner.hpp
184+
string_compressor.hpp
181185
object_id.hpp
182186
path.hpp
183187
owned_data.hpp

src/realm/array.cpp

+10-2
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ void Array::set_type(Type type)
294294
set_hasrefs_in_header(init_has_refs, header);
295295
}
296296

297-
void Array::destroy_children(size_t offset) noexcept
297+
void Array::destroy_children(size_t offset, bool ro_only) noexcept
298298
{
299299
for (size_t i = offset; i != m_size; ++i) {
300300
int64_t value = get(i);
@@ -310,7 +310,7 @@ void Array::destroy_children(size_t offset) noexcept
310310
continue;
311311

312312
ref_type ref = to_ref(value);
313-
destroy_deep(ref, m_alloc);
313+
destroy_deep(ref, m_alloc, ro_only);
314314
}
315315
}
316316

@@ -607,6 +607,14 @@ void Array::do_ensure_minimum_width(int_fast64_t value)
607607
}
608608
}
609609

610+
size_t Array::size() const noexcept
611+
{
612+
// Return the cached element count (m_size) instead of reading the size from the
613+
// array header, because a header read will very likely result in a cache miss.
614+
// NOTE(review): for compressed arrays m_size is expected to be kept up to date by init_from_mem - confirm.
615+
return m_size;
616+
}
617+
610618
bool Array::compress_array(Array& arr) const
611619
{
612620
if (m_integer_compressor.get_encoding() == NodeHeader::Encoding::WTypBits) {

src/realm/array.hpp

+18-13
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ class Array : public Node, public ArrayParent {
117117
/// pointer.
118118
void init_from_mem(MemRef) noexcept;
119119

120-
/// Same as `init_from_ref(get_ref_from_parent())`.
120+
/// Same as `init_from_ref(get_ref_from_parent())`.
121121
void init_from_parent() noexcept
122122
{
123123
ref_type ref = get_ref_from_parent();
@@ -210,6 +210,8 @@ class Array : public Node, public ArrayParent {
210210
update_width_cache_from_header();
211211
}
212212

213+
size_t size() const noexcept;
214+
213215
bool is_empty() const noexcept
214216
{
215217
return size() == 0;
@@ -362,7 +364,8 @@ class Array : public Node, public ArrayParent {
362364
/// state (as if calling detach()), then free the allocated memory. If this
363365
/// accessor is already in the detached state, this function has no effect
364366
/// (idempotency).
365-
void destroy_deep() noexcept;
367+
/// If 'ro_only', only free space in read-only memory (the file)
368+
void destroy_deep(bool ro_only = false) noexcept;
366369

367370
/// check if the array is encoded (in B format)
368371
inline bool is_compressed() const;
@@ -377,13 +380,13 @@ class Array : public Node, public ArrayParent {
377380
bool try_decompress();
378381

379382
/// Shorthand for `destroy_deep(MemRef(ref, alloc), alloc, ro_only)`.
380-
static void destroy_deep(ref_type ref, Allocator& alloc) noexcept;
383+
static void destroy_deep(ref_type ref, Allocator& alloc, bool ro_only = false) noexcept;
381384

382385
/// Destroy the specified array node and all of its children, recursively.
383386
///
384387
/// This is done by freeing the specified array node after calling
385388
/// destroy_deep() for every contained 'ref' element.
386-
static void destroy_deep(MemRef, Allocator&) noexcept;
389+
static void destroy_deep(MemRef, Allocator&, bool ro_only = false) noexcept;
387390

388391
// Clone deep
389392
static MemRef clone(MemRef, Allocator& from_alloc, Allocator& target_alloc);
@@ -540,7 +543,7 @@ class Array : public Node, public ArrayParent {
540543
// Overriding method in ArrayParent
541544
ref_type get_child_ref(size_t) const noexcept override;
542545

543-
void destroy_children(size_t offset = 0) noexcept;
546+
void destroy_children(size_t offset = 0, bool ro_only = false) noexcept;
544547

545548
protected:
546549
// Getters and Setters for adaptive-packed arrays
@@ -912,16 +915,17 @@ inline void Array::set_context_flag(bool value) noexcept
912915
}
913916
}
914917

915-
inline void Array::destroy_deep() noexcept
918+
inline void Array::destroy_deep(bool ro_only) noexcept
916919
{
917920
if (!is_attached())
918921
return;
919922

920923
if (m_has_refs)
921-
destroy_children();
924+
destroy_children(0, ro_only);
922925

923926
char* header = get_header_from_data(m_data);
924-
m_alloc.free_(m_ref, header);
927+
if (!ro_only || is_read_only())
928+
m_alloc.free_(m_ref, header);
925929
m_data = nullptr;
926930
}
927931

@@ -964,20 +968,21 @@ inline void Array::clear_and_destroy_children()
964968
truncate_and_destroy_children(0);
965969
}
966970

967-
inline void Array::destroy_deep(ref_type ref, Allocator& alloc) noexcept
971+
inline void Array::destroy_deep(ref_type ref, Allocator& alloc, bool ro_only) noexcept
968972
{
969-
destroy_deep(MemRef(ref, alloc), alloc);
973+
destroy_deep(MemRef(ref, alloc), alloc, ro_only);
970974
}
971975

972-
inline void Array::destroy_deep(MemRef mem, Allocator& alloc) noexcept
976+
inline void Array::destroy_deep(MemRef mem, Allocator& alloc, bool ro_only) noexcept
973977
{
974978
if (!get_hasrefs_from_header(mem.get_addr())) {
975-
alloc.free_(mem);
979+
if (!ro_only || alloc.is_read_only(mem.get_ref()))
980+
alloc.free_(mem);
976981
return;
977982
}
978983
Array array(alloc);
979984
array.init_from_mem(mem);
980-
array.destroy_deep();
985+
array.destroy_deep(ro_only);
981986
}
982987

983988

src/realm/array_integer.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
#include <realm/impl/destroy_guard.hpp>
2323
#include <realm/column_integer.hpp>
2424

25+
#include <iostream>
26+
2527
using namespace realm;
2628

2729
ArrayInteger::ArrayInteger(Allocator& allocator) noexcept

src/realm/array_integer.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ inline ArrayIntNull::~ArrayIntNull() noexcept {}
174174

175175
inline size_t ArrayIntNull::size() const noexcept
176176
{
177+
// FIXME(review): if Array::size() is 0, the unsigned subtraction below wraps around to SIZE_MAX. Presumably the underlying array always holds at least one element - confirm.
177178
return Array::size() - 1;
178179
}
179180

src/realm/array_string.cpp

+84-11
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
**************************************************************************/
1818

1919
#include <realm/array_string.hpp>
20+
#include <realm/impl/array_writer.hpp>
2021
#include <realm/spec.hpp>
2122
#include <realm/mixed.hpp>
2223

@@ -52,14 +53,24 @@ void ArrayString::init_from_mem(MemRef mem) noexcept
5253
else {
5354
auto arr = new (&m_storage) Array(m_alloc);
5455
arr->init_from_mem(mem);
55-
m_string_enum_values = std::make_unique<ArrayString>(m_alloc);
56-
ArrayParent* p;
57-
REALM_ASSERT(m_spec != nullptr);
58-
REALM_ASSERT(m_col_ndx != realm::npos);
59-
ref_type r = m_spec->get_enumkeys_ref(m_col_ndx, p);
60-
m_string_enum_values->init_from_ref(r);
61-
m_string_enum_values->set_parent(p, m_col_ndx);
62-
m_type = Type::enum_strings;
56+
// The context flag is used to indicate interned strings vs old enum strings
57+
// (in conjunction with has_refs() == false)
58+
if (arr->get_context_flag_from_header(arr->get_header())) {
59+
// init for new interned strings (replacing old enum strings)
60+
m_type = Type::interned_strings;
61+
// consider if we want this invariant: REALM_ASSERT_DEBUG(m_string_interner);
62+
}
63+
else {
64+
// init for old enum strings
65+
m_string_enum_values = std::make_unique<ArrayString>(m_alloc);
66+
ArrayParent* p;
67+
REALM_ASSERT(m_spec != nullptr);
68+
REALM_ASSERT(m_col_ndx != realm::npos);
69+
ref_type r = m_spec->get_enumkeys_ref(m_col_ndx, p);
70+
m_string_enum_values->init_from_ref(r);
71+
m_string_enum_values->set_parent(p, m_col_ndx);
72+
m_type = Type::enum_strings;
73+
}
6374
}
6475
}
6576
else {
@@ -111,6 +122,7 @@ size_t ArrayString::size() const
111122
case Type::big_strings:
112123
return static_cast<ArrayBigBlobs*>(m_arr)->size();
113124
case Type::enum_strings:
125+
case Type::interned_strings:
114126
return static_cast<Array*>(m_arr)->size();
115127
}
116128
return {};
@@ -128,7 +140,8 @@ void ArrayString::add(StringData value)
128140
case Type::big_strings:
129141
static_cast<ArrayBigBlobs*>(m_arr)->add_string(value);
130142
break;
131-
case Type::enum_strings: {
143+
case Type::enum_strings:
144+
case Type::interned_strings: {
132145
auto a = static_cast<Array*>(m_arr);
133146
size_t ndx = a->size();
134147
a->add(0);
@@ -150,6 +163,11 @@ void ArrayString::set(size_t ndx, StringData value)
150163
case Type::big_strings:
151164
static_cast<ArrayBigBlobs*>(m_arr)->set_string(ndx, value);
152165
break;
166+
case Type::interned_strings: {
167+
auto id = m_string_interner->intern(value);
168+
static_cast<Array*>(m_arr)->set(ndx, id);
169+
break;
170+
}
153171
case Type::enum_strings: {
154172
size_t sz = m_string_enum_values->size();
155173
size_t res = m_string_enum_values->find_first(value, 0, sz);
@@ -178,6 +196,12 @@ void ArrayString::insert(size_t ndx, StringData value)
178196
case Type::enum_strings: {
179197
static_cast<Array*>(m_arr)->insert(ndx, 0);
180198
set(ndx, value);
199+
break;
200+
}
201+
case Type::interned_strings: {
202+
static_cast<Array*>(m_arr)->insert(ndx, 0);
203+
set(ndx, value);
204+
break;
181205
}
182206
}
183207
}
@@ -195,6 +219,10 @@ StringData ArrayString::get(size_t ndx) const
195219
size_t index = size_t(static_cast<Array*>(m_arr)->get(ndx));
196220
return m_string_enum_values->get(index);
197221
}
222+
case Type::interned_strings: {
223+
size_t id = size_t(static_cast<Array*>(m_arr)->get(ndx));
224+
return m_string_interner->get(id);
225+
}
198226
}
199227
return {};
200228
}
@@ -212,6 +240,10 @@ StringData ArrayString::get_legacy(size_t ndx) const
212240
size_t index = size_t(static_cast<Array*>(m_arr)->get(ndx));
213241
return m_string_enum_values->get(index);
214242
}
243+
case Type::interned_strings: {
244+
size_t id = size_t(static_cast<Array*>(m_arr)->get(ndx));
245+
return m_string_interner->get(id);
246+
}
215247
}
216248
return {};
217249
}
@@ -231,8 +263,12 @@ bool ArrayString::is_null(size_t ndx) const
231263
case Type::big_strings:
232264
return static_cast<ArrayBigBlobs*>(m_arr)->is_null(ndx);
233265
case Type::enum_strings: {
234-
size_t index = size_t(static_cast<Array*>(m_arr)->get(ndx));
235-
return m_string_enum_values->is_null(index);
266+
size_t id = size_t(static_cast<Array*>(m_arr)->get(ndx));
267+
return m_string_enum_values->is_null(id);
268+
}
269+
case Type::interned_strings: {
270+
size_t id = size_t(static_cast<Array*>(m_arr)->get(ndx));
271+
return id == 0;
236272
}
237273
}
238274
return {};
@@ -250,6 +286,7 @@ void ArrayString::erase(size_t ndx)
250286
case Type::big_strings:
251287
static_cast<ArrayBigBlobs*>(m_arr)->erase(ndx);
252288
break;
289+
case Type::interned_strings:
253290
case Type::enum_strings:
254291
static_cast<Array*>(m_arr)->erase(ndx);
255292
break;
@@ -277,6 +314,9 @@ void ArrayString::move(ArrayString& dst, size_t ndx)
277314
// this operation will never be called for enumerated columns
278315
REALM_UNREACHABLE();
279316
break;
317+
case Type::interned_strings:
318+
m_arr->truncate(ndx);
319+
break;
280320
}
281321
}
282322

@@ -293,6 +333,7 @@ void ArrayString::clear()
293333
static_cast<ArrayBigBlobs*>(m_arr)->clear();
294334
break;
295335
case Type::enum_strings:
336+
case Type::interned_strings:
296337
static_cast<Array*>(m_arr)->clear();
297338
break;
298339
}
@@ -321,6 +362,15 @@ size_t ArrayString::find_first(StringData value, size_t begin, size_t end) const
321362
}
322363
break;
323364
}
365+
case Type::interned_strings: {
366+
// TODO: avoid repeating this lookup for every leaf array; it should be done once,
367+
// higher up the call stack, and the resulting id passed down.
368+
auto id = m_string_interner->lookup(value);
369+
if (id) {
370+
return static_cast<Array*>(m_arr)->find_first(*id, begin, end);
371+
}
372+
break;
373+
}
324374
}
325375
return not_found;
326376
}
@@ -371,6 +421,9 @@ size_t ArrayString::lower_bound(StringData value)
371421
return lower_bound_string(static_cast<ArrayBigBlobs*>(m_arr), value);
372422
case Type::enum_strings:
373423
break;
424+
case Type::interned_strings:
425+
REALM_UNREACHABLE();
426+
break;
374427
}
375428
return realm::npos;
376429
}
@@ -383,6 +436,9 @@ ArrayString::Type ArrayString::upgrade_leaf(size_t value_size)
383436
if (m_type == Type::enum_strings)
384437
return Type::enum_strings;
385438

439+
if (m_type == Type::interned_strings)
440+
return Type::interned_strings;
441+
386442
if (m_type == Type::medium_strings) {
387443
if (value_size <= medium_string_max_size)
388444
return Type::medium_strings;
@@ -473,8 +529,25 @@ void ArrayString::verify() const
473529
static_cast<ArrayBigBlobs*>(m_arr)->verify();
474530
break;
475531
case Type::enum_strings:
532+
case Type::interned_strings:
476533
static_cast<Array*>(m_arr)->verify();
477534
break;
478535
}
479536
#endif
480537
}
538+
539+
// Writes this leaf out as an array of interned string ids.
// For each element, get(i) is interned through `interner` and the returned id is stored
// in a temporary Array created in the default allocator; that array is written to `out`,
// destroyed, and the ref returned by Array::write() is handed back to the caller.
// `interner` must be non-null (asserted).
// NOTE(review): the `true` passed to create() is presumably the context flag that marks
// interned-string leaves - confirm against Array::create().
ref_type ArrayString::write(_impl::ArrayWriterBase& out, StringInterner* interner)
540+
{
541+
REALM_ASSERT(interner);
542+
// Every element is written out, whether modified or not, to match the total cleanup.
543+
Array interned(Allocator::get_default());
544+
auto sz = size();
545+
interned.create(NodeHeader::type_Normal, true, sz);
546+
for (size_t i = 0; i < sz; ++i) {
547+
interned.set(i, interner->intern(get(i)));
548+
}
549+
auto retval = interned.write(out, false, false, out.compress);
550+
interned.destroy();
551+
return retval;
552+
// Disabled alternative (unreachable after the return above): return m_arr->write(out, true, false, false);
553+
}

0 commit comments

Comments
 (0)