man-group · JohanMabille · Apr 4, 2025
@@ -199,7 +199,6 @@ set(arcticdb_srcs
         column_store/column_data.hpp
         column_store/column_data_random_accessor.hpp
         column_store/column.hpp
-        column_store/column_utils.hpp
         column_store/key_segment.hpp
         column_store/memory_segment.hpp
         column_store/memory_segment_impl.hpp
@@ -345,6 +344,7 @@ set(arcticdb_srcs
         util/buffer.hpp
         util/buffer_pool.hpp
         util/clock.hpp
+        util/concepts.hpp
         util/configs_map.hpp
         util/constants.hpp
         util/constructors.hpp

@@ -15,6 +15,7 @@
 #include <arcticdb/entity/performance_tracing.hpp>
 #include <arcticdb/entity/types.hpp>
 #include <arcticdb/util/bitset.hpp>
+#include <arcticdb/util/concepts.hpp>
 #include <arcticdb/util/cursored_buffer.hpp>
 #include <arcticdb/util/flatten_utils.hpp>
 #include <arcticdb/util/preconditions.hpp>
@@ -26,23 +27,14 @@
 #include <cstdio>
 #endif
 #include <folly/Function.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/numpy.h>
 
 #include <concepts>
 #include <numeric>
 #include <optional>
 
-namespace py = pybind11;
-
 namespace arcticdb {
 
-// this is needed to make templates of templates work
-// since py::array_t has more than one template parameter
-// (the rest are defaulted)
-template< class T>
-using py_array_t = py::array_t<T>;
-
+using util::arithmetic_tensor;
 using namespace arcticdb::entity;
 
 // N.B. this will not catch all the things that C++ considers to be narrowing conversions, because
@@ -449,10 +441,9 @@ class Column {
         return std::move(shapes_.buffer());
     }
 
-    template<class T, template<class> class Tensor, std::enable_if_t<
-            std::is_integral_v<T> || std::is_floating_point_v<T>,
-            int> = 0>
-    void set_array(ssize_t row_offset, Tensor<T> &val) {
+    template<arithmetic_tensor TensorType>
+    void set_array(ssize_t row_offset, TensorType& val) {
+        using value_type = typename TensorType::value_type;
         ARCTICDB_SAMPLE(ColumnSetArray, RMTSF_Aggregate)
         magic_.check();
         util::check_arg(last_logical_row_ + 1 == row_offset, "set_array expected row {}, actual {} ", last_logical_row_ + 1, row_offset);
@@ -461,26 +452,8 @@ class Column {
         memcpy(shapes_.cursor(), val.shape(), val.ndim() * sizeof(shape_t));
         auto info = val.request();
         util::FlattenHelper flatten(val);
-        auto data_ptr = reinterpret_cast<T*>(data_.cursor());
-        flatten.flatten(data_ptr, reinterpret_cast<const T *>(info.ptr));
-        update_offsets(val.nbytes());
-        data_.commit();
-        shapes_.commit();
-        ++last_logical_row_;
-    }
-
-    template<class T, std::enable_if_t< std::is_integral_v<T> || std::is_floating_point_v<T>, int> = 0>
-    void set_array(ssize_t row_offset, py::array_t<T>& val) {
-        ARCTICDB_SAMPLE(ColumnSetArray, RMTSF_Aggregate)
-        magic_.check();
-        util::check_arg(last_logical_row_ + 1 == row_offset, "set_array expected row {}, actual {} ", last_logical_row_ + 1, row_offset);
-        data_.ensure_bytes(val.nbytes());
-        shapes_.ensure<shape_t>(val.ndim());
-        memcpy(shapes_.cursor(), val.shape(), val.ndim() * sizeof(shape_t));
-        auto info = val.request();
-        util::FlattenHelper<T, py_array_t> flatten(val);
-        auto data_ptr = reinterpret_cast<T*>(data_.cursor());
-        flatten.flatten(data_ptr, reinterpret_cast<const T*>(info.ptr));
+        auto data_ptr = reinterpret_cast<value_type*>(data_.cursor());
+        flatten.flatten(data_ptr, reinterpret_cast<const value_type*>(info.ptr));
         update_offsets(val.nbytes());
         data_.commit();
         shapes_.commit();

@@ -126,15 +126,8 @@ class SegmentInMemory {
         impl_->init_column_map();
     }
 
-    template<class T, template<class> class Tensor>
-    requires std::integral<T> || std::floating_point<T>
-    void set_array(position_t pos, Tensor<T> &val) {
-        impl_->set_array(pos, val);
-    }
-
-    template<class T>
-    requires std::integral<T> || std::floating_point<T>
-    void set_array(position_t pos, py::array_t<T>& val) {
+    // Forwards `val` unchanged to impl_->set_array for the column at position `pos`.
+    // TensorType is constrained by `arithmetic_tensor`; the exact requirements of that
+    // constraint are not visible in this hunk (NOTE(review): confirm against util/concepts.hpp).
+    template<arithmetic_tensor TensorType>
+    void set_array(position_t pos, TensorType &val) {
         impl_->set_array(pos, val);
     }
 

@@ -9,6 +9,7 @@
 
 #include <arcticdb/entity/types.hpp>
 #include <arcticdb/column_store/column.hpp>
+#include <arcticdb/util/concepts.hpp>
 #include <arcticdb/util/offset_string.hpp>
 #include <arcticdb/util/preconditions.hpp>
 
@@ -506,17 +507,8 @@ class SegmentInMemoryImpl {
         set_string(idx, val);
     }
 
-    template<class T, template<class> class Tensor>
-    requires std::integral<T> || std::floating_point<T>
-    void set_array(position_t pos, Tensor<T> &val) {
-        magic_.check();
-        ARCTICDB_SAMPLE(MemorySegmentSetArray, 0)
-        column_unchecked(pos).set_array(row_id_ + 1, val);
-    }
-
-    template<class T>
-    requires std::integral<T> || std::floating_point<T>
-    void set_array(position_t pos, py::array_t<T>& val) {
+    template<arithmetic_tensor TensorType>
+    void set_array(position_t pos, TensorType &val) {
         magic_.check();
         ARCTICDB_SAMPLE(MemorySegmentSetArray, 0)
         column_unchecked(pos).set_array(row_id_ + 1, val);

@@ -28,7 +28,15 @@ void register_column_store(py::module &m) {
     py::class_<StringPool>(m, "StringPool")
         .def(py::init())
         .def_property_readonly("nbytes", &StringPool::size)
-        .def("as_buffer_info", &StringPool::as_buffer_info);
+        .def("as_buffer_info", [](const StringPool& s) {
+                return py::buffer_info{
+                    (void *) s.get_const_view(0).data(),
+                    1,
+                    py::format_descriptor<char>::format(),
+                    ssize_t(s.get_const_view(0).size())
+
+                };
+            });
 }
 
 } // namespace arcticc::column_store

@@ -10,8 +10,6 @@
 #include <arcticdb/column_store/segment_utils.hpp>
 #include <ankerl/unordered_dense.h>
 
-#include <pybind11/pybind11.h>
-
 namespace arcticdb {
 
 /*****************
@@ -199,15 +197,6 @@ size_t StringPool::size() const {
     return block_.size();
 }
 
-py::buffer_info StringPool::as_buffer_info() const {
-    return py::buffer_info{
-        (void *) block_.at(0).data(),
-        1,
-        py::format_descriptor<char>::format(),
-        ssize_t(block_.at(0).size())
-    };
-}
-
 std::optional<position_t> StringPool::get_offset_for_column(std::string_view string, const Column& column) {
     auto unique_values = unique_values_for_string_column(column);
     remove_nones_and_nans(unique_values);

@@ -18,12 +18,6 @@
 #include <arcticdb/column_store/chunked_buffer.hpp>
 #include <arcticdb/column_store/column_data.hpp>
 
-namespace pybind11 {
-    struct buffer_info;
-}
-
-namespace py = pybind11;
-
 #include <ankerl/unordered_dense.h>
 
 namespace arcticdb {
@@ -172,8 +166,6 @@ class StringPool {
 
     [[nodiscard]] size_t num_blocks() const;
 
-    py::buffer_info as_buffer_info() const;
-
     std::optional<position_t> get_offset_for_column(std::string_view str, const Column& column);
     ankerl::unordered_dense::set<position_t> get_offsets_for_column(const std::shared_ptr<std::unordered_set<std::string>>& strings, const Column& column);
   private:

@@ -14,10 +14,6 @@
 // for std::accumulate
 #include <numeric>
 
-#include <pybind11/numpy.h>
-
-namespace py = pybind11;
-
 namespace arcticdb::entity {
 
 inline ssize_t calc_elements(const shape_t* shape, ssize_t ndim) {
@@ -166,6 +162,7 @@ ssize_t byte_offset_impl(const stride_t* strides, ssize_t i, Ix... index) {
 //TODO is the conversion to a typed tensor really necessary for the codec part?
 template<typename T>
 struct TypedTensor : public NativeTensor {
+    using value_type = T;
     static size_t itemsize() { return sizeof(T); }
 
     std::array<stride_t, 2> f_style_strides() {
@@ -255,12 +252,8 @@ struct TypedTensor : public NativeTensor {
         }  
     }
 };
-template<typename T>
-py::array to_py_array(const TypedTensor<T>& tensor) {
-    return py::array({tensor.shape(), tensor.shape() + tensor.ndim()}, reinterpret_cast<const T*>(tensor.data()));
-}
 
 template<typename T>
 using TensorType = TypedTensor<T>;
 
-}//namespace arcticdb
+}//namespace arcticdb
@@ -8,7 +8,152 @@
 #include <arcticdb/pipeline/pandas_output_frame.hpp>
 #include <arcticdb/entity/performance_tracing.hpp>
 #include <arcticdb/util/memory_tracing.hpp>
-#include <arcticdb/column_store/column_utils.hpp>
+//#include <arcticdb/column_store/column_utils.hpp>
+
+#include <pybind11/pybind11.h>
+namespace py = pybind11;
+
+namespace arcticdb::detail {
+
+// Builds the numpy array that exposes column `col_pos` of `frame` to Python.
+//
+// frame:         segment holding the column.
+// col_pos:       position of the column (and of its field) within `frame`.
+// output_format: forwarded to data_type_size() when sizing the elements of object-like columns.
+// anchor:        Python object passed as the base of the returned array; for array-typed columns it is
+//                also installed as the base object of every nested array that is not None.
+//
+// Returns a zero-length array carrying only the dtype and element stride when the frame is empty,
+// otherwise an array of frame.row_count() elements constructed over the column buffer.
+// Raises (via util::raise_rte) for a two dimensional type that no earlier branch handles.
+inline py::array array_at(const SegmentInMemory& frame, std::size_t col_pos, OutputFormat output_format, py::object &anchor) {
+    ARCTICDB_SAMPLE_DEFAULT(PythonOutputFrameArrayAt)
+    if (frame.empty()) {
+        // No data to expose: only the dtype and the element stride have to be worked out.
+        return visit_field(frame.field(col_pos), [output_format] (auto tag) {
+            using TypeTag = std::decay_t<decltype(tag)>;
+            constexpr auto data_type = TypeTag::DataTypeTag::data_type;
+            std::string dtype;
+            ssize_t esize = is_sequence_type(data_type) && is_fixed_string_type(data_type) ? 1 : get_type_size(data_type);
+            if constexpr (is_sequence_type(data_type)) {
+                if constexpr (is_fixed_string_type(data_type)) {
+                    dtype = data_type == DataType::ASCII_FIXED64 ? "<S0" : "<U0";
+                } else {
+                    dtype = "O";
+                }
+            } else if constexpr((is_numeric_type(data_type) || is_bool_type(data_type)) && tag.dimension() == Dimension::Dim0) {
+                constexpr auto dim = TypeTag::DimensionTag::value;
+                util::check(dim == Dimension::Dim0, "Only scalars supported, {}", data_type);
+                if constexpr (data_type == DataType::NANOSECONDS_UTC64) {
+                    // NOTE: this is safe as of Pandas < 2.0 because `datetime64` _always_ has been using nanosecond resolution,
+                    // i.e. Pandas < 2.0 _always_ provides `datetime64[ns]` and ignores any other resolution.
+                    // Yet, this has changed in Pandas 2.0 and other resolutions can be used,
+                    // i.e. Pandas >= 2.0 will also provide `datetime64[us]`, `datetime64[ms]` and `datetime64[s]`.
+                    // See: https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
+                    // TODO: for the support of Pandas>=2.0, convert any `datetime` to `datetime64[ns]` before-hand and do not
+                    // rely uniquely on the resolution-less 'M' specifier if this is doable.
+                    dtype = "datetime64[ns]";
+                } else {
+                    dtype = fmt::format("{}{:d}", get_dtype_specifier(data_type), esize);
+                }
+            } else if constexpr (is_empty_type(data_type) || is_bool_object_type(data_type) || is_array_type(TypeDescriptor(tag))) {
+                dtype = "O";
+                // The python representation of multidimensional columns differs from the in-memory/on-storage. In memory,
+                // we hold all scalars in a contiguous buffer with the shapes buffer telling us how many elements are there
+                // per array. Each element is of size sizeof(DataTypeTag::raw_type). For the python representation the column
+                // is represented as an array of (numpy) arrays. Each nested array is represented as a pointer to the
+                // (numpy) array, thus the size of the element is not the size of the raw type, but the size of a pointer.
+                // This also affects how we allocate columns. Check cpp/arcticdb/column_store/column.hpp::Column and
+                // cpp/arcticdb/pipeline/column_mapping.hpp::external_datatype_size
+                esize = data_type_size(TypeDescriptor{tag}, output_format, DataTypeMode::EXTERNAL);
+            } else if constexpr(tag.dimension() == Dimension::Dim2) {
+                util::raise_rte("Read resulted in two dimensional type. This is not supported.");
+            } else {
+                static_assert(!sizeof(data_type), "Unhandled data type");
+            }
+            return py::array{py::dtype{dtype}, py::array::ShapeContainer{0}, py::array::StridesContainer{esize}};
+        });
+    }
+    // The visitor is invoked before visit_field returns (its result is returned directly below), so
+    // capturing by reference is sufficient and avoids copying the segment handle for every column.
+    return visit_field(frame.field(col_pos), [&] (auto tag) {
+        using TypeTag = std::decay_t<decltype(tag)>;
+        constexpr auto data_type = TypeTag::DataTypeTag::data_type;
+        auto column_data = frame.column(col_pos).data();
+        const auto& buffer = column_data.buffer();
+        std::string dtype;
+        ssize_t esize = get_type_size(data_type);
+        if constexpr (is_sequence_type(data_type)) {
+            // data_type is a compile-time constant, so select the branch at compile time
+            // exactly as the empty-frame path above does.
+            if constexpr (is_fixed_string_type(data_type)) {
+                // Fixed-width strings: the element size is derived from the buffer size and the row count.
+                esize = buffer.bytes() / frame.row_count();
+                auto char_count = esize;
+                if (data_type == DataType::UTF_FIXED64) {
+                    char_count /= UNICODE_WIDTH;
+                }
+                dtype = fmt::format((data_type == DataType::ASCII_FIXED64 ? "<S{:d}" : "<U{:d}"), char_count);
+            } else {
+                dtype = "O";
+            }
+        } else if constexpr((is_numeric_type(data_type) || is_bool_type(data_type)) && tag.dimension() == Dimension::Dim0) {
+            constexpr auto dim = TypeTag::DimensionTag::value;
+            util::check(dim == Dimension::Dim0, "Only scalars supported, {}", frame.field(col_pos));
+            if constexpr (data_type == DataType::NANOSECONDS_UTC64) {
+                // NOTE: this is safe as of Pandas < 2.0 because `datetime64` _always_ has been using nanosecond resolution,
+                // i.e. Pandas < 2.0 _always_ provides `datetime64[ns]` and ignores any other resolution.
+                // Yet, this has changed in Pandas 2.0 and other resolutions can be used,
+                // i.e. Pandas >= 2.0 will also provide `datetime64[us]`, `datetime64[ms]` and `datetime64[s]`.
+                // See: https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
+                // TODO: for the support of Pandas>=2.0, convert any `datetime` to `datetime64[ns]` before-hand and do not
+                // rely uniquely on the resolution-less 'M' specifier if this is doable.
+                dtype = "datetime64[ns]";
+            } else {
+                dtype = fmt::format("{}{:d}", get_dtype_specifier(data_type), esize);
+            }
+        } else if constexpr (is_empty_type(data_type) || is_bool_object_type(data_type)) {
+            dtype = "O";
+            // The python representation of multidimensional columns differs from the in-memory/on-storage. In memory,
+            // we hold all scalars in a contiguous buffer with the shapes buffer telling us how many elements are there
+            // per array. Each element is of size sizeof(DataTypeTag::raw_type). For the python representation the column
+            // is represented as an array of (numpy) arrays. Each nested array is represented as a pointer to the
+            // (numpy) array, thus the size of the element is not the size of the raw type, but the size of a pointer.
+            // This also affects how we allocate columns. Check cpp/arcticdb/column_store/column.hpp::Column and
+            // cpp/arcticdb/pipeline/column_mapping.hpp::datatype_size
+            esize = data_type_size(TypeDescriptor{tag}, output_format, DataTypeMode::EXTERNAL);
+        } else if constexpr (is_array_type(TypeDescriptor(tag))) {
+            dtype = "O";
+            // The python representation of multidimensional columns differs from the in-memory/on-storage. In memory,
+            // we hold all scalars in a contiguous buffer with the shapes buffer telling us how many elements are there
+            // per array. Each element is of size sizeof(DataTypeTag::raw_type). For the python representation the column
+            // is represented as an array of (numpy) arrays. Each nested array is represented as a pointer to the
+            // (numpy) array, thus the size of the element is not the size of the raw type, but the size of a pointer.
+            // This also affects how we allocate columns. Check cpp/arcticdb/column_store/column.hpp::Column and
+            // cpp/arcticdb/pipeline/column_mapping.hpp::external_datatype_size
+            auto none = py::none();
+            auto &api = py::detail::npy_api::get();
+            auto it = column_data.buffer().iterator(sizeof(PyObject*));
+            while(!it.finished()) {
+                auto* ptr = reinterpret_cast<PyObject*>(it.value());
+                util::check(ptr != nullptr, "Can't set base object on null item");
+                // PyArray_SetBaseObject steals a reference to the base, hence the explicit inc_ref on the anchor.
+                if(ptr != none.ptr())
+                    api.PyArray_SetBaseObject_(ptr, anchor.inc_ref().ptr());
+
+                it.next();
+            }
+            esize = data_type_size(TypeDescriptor{tag}, output_format, DataTypeMode::EXTERNAL);
+        } else if constexpr(tag.dimension() == Dimension::Dim2) {
+            util::raise_rte("Read resulted in two dimensional type. This is not supported.");
+        } else {
+            static_assert(!sizeof(data_type), "Unhandled data type");
+        }
+        // Note how base is passed to the array to register the data owner.
+        // It's especially important to keep the frame data object alive for as long as the array is alive
+        // so that regular python ref counting logic handles the liveness
+        return py::array(py::dtype{dtype}, {frame.row_count()}, {esize}, buffer.data(), anchor);
+    });
+}
+
+// Creates a FrameDataWrapper constructed with the number of fields in `frame` and assigns to each
+// entry of its `data_` the array produced by array_at() for the column at the same position,
+// passing `ref` through as the anchor object. util::print_total_mem_usage is called once before
+// and once after the arrays are built.
+inline std::shared_ptr<pipelines::FrameDataWrapper> initialize_array(
+        const SegmentInMemory& frame,
+        OutputFormat output_format,
+        py::object &ref) {
+    const auto field_count = frame.fields().size();
+    auto wrapper = std::make_shared<pipelines::FrameDataWrapper>(field_count);
+    ARCTICDB_SAMPLE(InitializeArrays, 0);
+    ARCTICDB_DEBUG(log::memory(), "Initializing arrays");
+    util::print_total_mem_usage(__FILE__, __LINE__, __FUNCTION__);
+    const auto column_count = static_cast<std::size_t>(field_count);
+    for (std::size_t column_index = 0; column_index < column_count; ++column_index) {
+        wrapper->data_[column_index] = array_at(frame, column_index, output_format, ref);
+    }
+    util::print_total_mem_usage(__FILE__, __LINE__, __FUNCTION__);
+    return wrapper;
+}
+
+}
 
 namespace arcticdb::pipelines {