Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/arcticdb/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,6 @@ set(arcticdb_srcs
column_store/column_data.hpp
column_store/column_data_random_accessor.hpp
column_store/column.hpp
column_store/column_utils.hpp
column_store/key_segment.hpp
column_store/memory_segment.hpp
column_store/memory_segment_impl.hpp
Expand Down Expand Up @@ -345,6 +344,7 @@ set(arcticdb_srcs
util/buffer.hpp
util/buffer_pool.hpp
util/clock.hpp
util/concepts.hpp
util/configs_map.hpp
util/constants.hpp
util/constructors.hpp
Expand Down
41 changes: 7 additions & 34 deletions cpp/arcticdb/column_store/column.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <arcticdb/entity/performance_tracing.hpp>
#include <arcticdb/entity/types.hpp>
#include <arcticdb/util/bitset.hpp>
#include <arcticdb/util/concepts.hpp>
#include <arcticdb/util/cursored_buffer.hpp>
#include <arcticdb/util/flatten_utils.hpp>
#include <arcticdb/util/preconditions.hpp>
Expand All @@ -26,23 +27,14 @@
#include <cstdio>
#endif
#include <folly/Function.h>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>

#include <concepts>
#include <numeric>
#include <optional>

namespace py = pybind11;

namespace arcticdb {

// this is needed to make templates of templates work
// since py::array_t has more than one template parameter
// (the rest are defaulted)
template< class T>
using py_array_t = py::array_t<T>;

using util::arithmetic_tensor;
using namespace arcticdb::entity;

// N.B. this will not catch all the things that C++ considers to be narrowing conversions, because
Expand Down Expand Up @@ -449,10 +441,9 @@ class Column {
return std::move(shapes_.buffer());
}

template<class T, template<class> class Tensor, std::enable_if_t<
std::is_integral_v<T> || std::is_floating_point_v<T>,
int> = 0>
void set_array(ssize_t row_offset, Tensor<T> &val) {
template<arithmetic_tensor TensorType>
void set_array(ssize_t row_offset, TensorType& val) {
using value_type = typename TensorType::value_type;
ARCTICDB_SAMPLE(ColumnSetArray, RMTSF_Aggregate)
magic_.check();
util::check_arg(last_logical_row_ + 1 == row_offset, "set_array expected row {}, actual {} ", last_logical_row_ + 1, row_offset);
Expand All @@ -461,26 +452,8 @@ class Column {
memcpy(shapes_.cursor(), val.shape(), val.ndim() * sizeof(shape_t));
auto info = val.request();
util::FlattenHelper flatten(val);
auto data_ptr = reinterpret_cast<T*>(data_.cursor());
flatten.flatten(data_ptr, reinterpret_cast<const T *>(info.ptr));
update_offsets(val.nbytes());
data_.commit();
shapes_.commit();
++last_logical_row_;
}

template<class T, std::enable_if_t< std::is_integral_v<T> || std::is_floating_point_v<T>, int> = 0>
void set_array(ssize_t row_offset, py::array_t<T>& val) {
ARCTICDB_SAMPLE(ColumnSetArray, RMTSF_Aggregate)
magic_.check();
util::check_arg(last_logical_row_ + 1 == row_offset, "set_array expected row {}, actual {} ", last_logical_row_ + 1, row_offset);
data_.ensure_bytes(val.nbytes());
shapes_.ensure<shape_t>(val.ndim());
memcpy(shapes_.cursor(), val.shape(), val.ndim() * sizeof(shape_t));
auto info = val.request();
util::FlattenHelper<T, py_array_t> flatten(val);
auto data_ptr = reinterpret_cast<T*>(data_.cursor());
flatten.flatten(data_ptr, reinterpret_cast<const T*>(info.ptr));
auto data_ptr = reinterpret_cast<value_type*>(data_.cursor());
flatten.flatten(data_ptr, reinterpret_cast<const value_type*>(info.ptr));
update_offsets(val.nbytes());
data_.commit();
shapes_.commit();
Expand Down
11 changes: 2 additions & 9 deletions cpp/arcticdb/column_store/memory_segment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,15 +126,8 @@ class SegmentInMemory {
impl_->init_column_map();
}

// Forwards `val` to impl_->set_array(pos, val) for the column at position `pos`.
// Accepts any single-parameter tensor template Tensor<T> whose element type T is
// integral or floating point (enforced by the requires-clause).
template<class T, template<class> class Tensor>
requires std::integral<T> || std::floating_point<T>
void set_array(position_t pos, Tensor<T> &val) {
impl_->set_array(pos, val);
}

template<class T>
requires std::integral<T> || std::floating_point<T>
void set_array(position_t pos, py::array_t<T>& val) {
// Forwards `val` to impl_->set_array(pos, val) for the column at position `pos`.
// TensorType must satisfy the arithmetic_tensor concept.
// NOTE(review): the concept is presumably declared in arcticdb/util/concepts.hpp — confirm.
template<arithmetic_tensor TensorType>
void set_array(position_t pos, TensorType &val) {
impl_->set_array(pos, val);
}

Expand Down
14 changes: 3 additions & 11 deletions cpp/arcticdb/column_store/memory_segment_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

#include <arcticdb/entity/types.hpp>
#include <arcticdb/column_store/column.hpp>
#include <arcticdb/util/concepts.hpp>
#include <arcticdb/util/offset_string.hpp>
#include <arcticdb/util/preconditions.hpp>

Expand Down Expand Up @@ -506,17 +507,8 @@ class SegmentInMemoryImpl {
set_string(idx, val);
}

// Writes the tensor `val` into the column at position `pos`.
// Runs the magic_ check, opens the MemorySegmentSetArray sampling scope, then calls
// set_array on column_unchecked(pos), passing row_id_ + 1 as the row offset.
// Accepts any single-parameter tensor template Tensor<T> whose element type T is
// integral or floating point (enforced by the requires-clause).
template<class T, template<class> class Tensor>
requires std::integral<T> || std::floating_point<T>
void set_array(position_t pos, Tensor<T> &val) {
magic_.check();
ARCTICDB_SAMPLE(MemorySegmentSetArray, 0)
column_unchecked(pos).set_array(row_id_ + 1, val);
}

template<class T>
requires std::integral<T> || std::floating_point<T>
void set_array(position_t pos, py::array_t<T>& val) {
template<arithmetic_tensor TensorType>
void set_array(position_t pos, TensorType &val) {
magic_.check();
ARCTICDB_SAMPLE(MemorySegmentSetArray, 0)
column_unchecked(pos).set_array(row_id_ + 1, val);
Expand Down
10 changes: 9 additions & 1 deletion cpp/arcticdb/column_store/python_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,15 @@ void register_column_store(py::module &m) {
py::class_<StringPool>(m, "StringPool")
.def(py::init())
.def_property_readonly("nbytes", &StringPool::size)
.def("as_buffer_info", &StringPool::as_buffer_info);
.def("as_buffer_info", [](const StringPool& s) {
return py::buffer_info{
(void *) s.get_const_view(0).data(),
1,
py::format_descriptor<char>::format(),
ssize_t(s.get_const_view(0).size())

};
});
}

} // namespace arcticc::column_store
Expand Down
11 changes: 0 additions & 11 deletions cpp/arcticdb/column_store/string_pool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
#include <arcticdb/column_store/segment_utils.hpp>
#include <ankerl/unordered_dense.h>

#include <pybind11/pybind11.h>

namespace arcticdb {

/*****************
Expand Down Expand Up @@ -199,15 +197,6 @@ size_t StringPool::size() const {
return block_.size();
}

// Describes the string pool's storage as a pybind11 buffer_info.
// Only the region returned by block_.at(0) is described: the pointer is its data(),
// the item size is 1, the format is that of `char`, and the element count is its size().
// NOTE(review): the C-style cast to void* may drop constness of data() inside this const
// method — confirm that consumers treat the buffer as read-only.
py::buffer_info StringPool::as_buffer_info() const {
return py::buffer_info{
(void *) block_.at(0).data(),
1,
py::format_descriptor<char>::format(),
ssize_t(block_.at(0).size())
};
}

std::optional<position_t> StringPool::get_offset_for_column(std::string_view string, const Column& column) {
auto unique_values = unique_values_for_string_column(column);
remove_nones_and_nans(unique_values);
Expand Down
8 changes: 0 additions & 8 deletions cpp/arcticdb/column_store/string_pool.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,6 @@
#include <arcticdb/column_store/chunked_buffer.hpp>
#include <arcticdb/column_store/column_data.hpp>

namespace pybind11 {
struct buffer_info;
}

namespace py = pybind11;

#include <ankerl/unordered_dense.h>

namespace arcticdb {
Expand Down Expand Up @@ -172,8 +166,6 @@ class StringPool {

[[nodiscard]] size_t num_blocks() const;

py::buffer_info as_buffer_info() const;

std::optional<position_t> get_offset_for_column(std::string_view str, const Column& column);
ankerl::unordered_dense::set<position_t> get_offsets_for_column(const std::shared_ptr<std::unordered_set<std::string>>& strings, const Column& column);
private:
Expand Down
11 changes: 2 additions & 9 deletions cpp/arcticdb/entity/native_tensor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,6 @@
// for std::accumulate
#include <numeric>

#include <pybind11/numpy.h>

namespace py = pybind11;

namespace arcticdb::entity {

inline ssize_t calc_elements(const shape_t* shape, ssize_t ndim) {
Expand Down Expand Up @@ -166,6 +162,7 @@ ssize_t byte_offset_impl(const stride_t* strides, ssize_t i, Ix... index) {
//TODO is the conversion to a typed tensor really necessary for the codec part?
template<typename T>
struct TypedTensor : public NativeTensor {
using value_type = T;
static size_t itemsize() { return sizeof(T); }

std::array<stride_t, 2> f_style_strides() {
Expand Down Expand Up @@ -255,12 +252,8 @@ struct TypedTensor : public NativeTensor {
}
}
};
// Builds a py::array from a TypedTensor<T>.
// The shape is taken from the range [tensor.shape(), tensor.shape() + tensor.ndim()) and the
// element pointer is tensor.data() reinterpreted as const T*.
// NOTE(review): no base/owner handle is passed, so pybind11 is expected to copy the data
// rather than reference the tensor's memory — confirm against the pybind11 numpy docs.
template<typename T>
py::array to_py_array(const TypedTensor<T>& tensor) {
return py::array({tensor.shape(), tensor.shape() + tensor.ndim()}, reinterpret_cast<const T*>(tensor.data()));
}

template<typename T>
using TensorType = TypedTensor<T>;

}//namespace arcticdb
}//namespace arcticdb
147 changes: 146 additions & 1 deletion cpp/arcticdb/pipeline/pandas_output_frame.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,152 @@
#include <arcticdb/pipeline/pandas_output_frame.hpp>
#include <arcticdb/entity/performance_tracing.hpp>
#include <arcticdb/util/memory_tracing.hpp>
#include <arcticdb/column_store/column_utils.hpp>
//#include <arcticdb/column_store/column_utils.hpp>

#include <pybind11/pybind11.h>
namespace py = pybind11;

namespace arcticdb::detail {

// Builds the numpy array that exposes column `col_pos` of `frame` to Python.
//
// frame          segment whose column is being exposed
// col_pos        index of the field/column inside `frame`
// output_format  forwarded to data_type_size() where the element size is that of the external representation
// anchor         Python object registered as base (owner) of the returned array and, for array-typed
//                columns, of every nested array stored in the column
//
// For an empty frame a zero-length array with the dtype matching the field type is returned.
// Otherwise the returned array has frame.row_count() elements with stride `esize`, points at the
// column buffer's data() and has `anchor` as its base.
// Fields that fall through to the Dim2 branch raise via util::raise_rte.
inline py::array array_at(const SegmentInMemory& frame, std::size_t col_pos, OutputFormat output_format, py::object &anchor) {
ARCTICDB_SAMPLE_DEFAULT(PythonOutputFrameArrayAt)
// Empty frame: there is no buffer to inspect, so only the dtype and the element stride are computed.
if (frame.empty()) {
return visit_field(frame.field(col_pos), [output_format] (auto tag) {
using TypeTag = std::decay_t<decltype(tag)>;
constexpr auto data_type = TypeTag::DataTypeTag::data_type;
std::string dtype;
// Fixed-width string types use an element size of 1 here; every other type starts from get_type_size().
ssize_t esize = is_sequence_type(data_type) && is_fixed_string_type(data_type) ? 1 : get_type_size(data_type);
if constexpr (is_sequence_type(data_type)) {
if constexpr (is_fixed_string_type(data_type)) {
// Zero-width fixed string dtype: bytes ("S") for ASCII_FIXED64, unicode ("U") otherwise.
dtype = data_type == DataType::ASCII_FIXED64 ? "<S0" : "<U0";
} else {
dtype = "O";
}
} else if constexpr((is_numeric_type(data_type) || is_bool_type(data_type)) && tag.dimension() == Dimension::Dim0) {
constexpr auto dim = TypeTag::DimensionTag::value;
util::check(dim == Dimension::Dim0, "Only scalars supported, {}", data_type);
if constexpr (data_type == DataType::NANOSECONDS_UTC64) {
// NOTE: this is safe as of Pandas < 2.0 because `datetime64` has _always_ been using nanosecond resolution,
// i.e. Pandas < 2.0 _always_ provides `datetime64[ns]` and ignores any other resolution.
// Yet, this has changed in Pandas 2.0 and other resolutions can be used,
// i.e. Pandas >= 2.0 will also provide `datetime64[us]`, `datetime64[ms]` and `datetime64[s]`.
// See: https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
// TODO: for the support of Pandas>=2.0, convert any `datetime` to `datetime64[ns]` beforehand and do not
// rely uniquely on the resolution-less 'M' specifier if this is doable.
dtype = "datetime64[ns]";
} else {
// dtype string is the type specifier followed by the element size in bytes.
dtype = fmt::format("{}{:d}", get_dtype_specifier(data_type), esize);
}
} else if constexpr (is_empty_type(data_type) || is_bool_object_type(data_type) || is_array_type(TypeDescriptor(tag))) {
dtype= "O";
// The python representation of multidimensional columns differs from the in-memory/on-storage one. In memory,
// we hold all scalars in a contiguous buffer with the shapes buffer telling us how many elements there are
// per array. Each element is of size sizeof(DataTypeTag::raw_type). For the python representation the column
// is represented as an array of (numpy) arrays. Each nested array is represented as a pointer to the
// (numpy) array, thus the size of the element is not the size of the raw type, but the size of a pointer.
// This also affects how we allocate columns. Check cpp/arcticdb/column_store/column.hpp::Column and
// cpp/arcticdb/pipeline/column_mapping.hpp::external_datatype_size
esize = data_type_size(TypeDescriptor{tag}, output_format, DataTypeMode::EXTERNAL);
} else if constexpr(tag.dimension() == Dimension::Dim2) {
util::raise_rte("Read resulted in two dimensional type. This is not supported.");
} else {
// Dependent-false static_assert: fails compilation if a data type reaches this branch.
static_assert(!sizeof(data_type), "Unhandled data type");
}
// Zero-length array: shape {0}, stride {esize}.
return py::array{py::dtype{dtype}, py::array::ShapeContainer{0}, py::array::StridesContainer{esize}};
});
}
// Non-empty frame: the array is created over the column's buffer.
return visit_field(frame.field(col_pos), [&, frame=frame, col_pos=col_pos, output_format] (auto tag) {
using TypeTag = std::decay_t<decltype(tag)>;
constexpr auto data_type = TypeTag::DataTypeTag::data_type;
auto column_data = frame.column(col_pos).data();
const auto& buffer = column_data.buffer();
std::string dtype;
ssize_t esize = get_type_size(data_type);
if constexpr (is_sequence_type(data_type)) {
if (is_fixed_string_type(data_type)) {
// Per-row width in bytes is derived from the total buffer size divided by the row count.
esize = buffer.bytes() / frame.row_count();
auto char_count = esize;
if (data_type == DataType::UTF_FIXED64) {
// The dtype width is expressed in characters, so the byte width is divided by UNICODE_WIDTH.
char_count /= UNICODE_WIDTH;
}
dtype = fmt::format((data_type == DataType::ASCII_FIXED64 ? "<S{:d}" : "<U{:d}"), char_count);
} else {
dtype = "O";
}
} else if constexpr((is_numeric_type(data_type) || is_bool_type(data_type)) && tag.dimension() == Dimension::Dim0) {
constexpr auto dim = TypeTag::DimensionTag::value;
util::check(dim == Dimension::Dim0, "Only scalars supported, {}", frame.field(col_pos));
if constexpr (data_type == DataType::NANOSECONDS_UTC64) {
// NOTE: this is safe as of Pandas < 2.0 because `datetime64` has _always_ been using nanosecond resolution,
// i.e. Pandas < 2.0 _always_ provides `datetime64[ns]` and ignores any other resolution.
// Yet, this has changed in Pandas 2.0 and other resolutions can be used,
// i.e. Pandas >= 2.0 will also provide `datetime64[us]`, `datetime64[ms]` and `datetime64[s]`.
// See: https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
// TODO: for the support of Pandas>=2.0, convert any `datetime` to `datetime64[ns]` beforehand and do not
// rely uniquely on the resolution-less 'M' specifier if this is doable.
dtype = "datetime64[ns]";
} else {
// dtype string is the type specifier followed by the element size in bytes.
dtype = fmt::format("{}{:d}", get_dtype_specifier(data_type), esize);
}
} else if constexpr (is_empty_type(data_type) || is_bool_object_type(data_type)) {
dtype = "O";
// The python representation of multidimensional columns differs from the in-memory/on-storage one. In memory,
// we hold all scalars in a contiguous buffer with the shapes buffer telling us how many elements there are
// per array. Each element is of size sizeof(DataTypeTag::raw_type). For the python representation the column
// is represented as an array of (numpy) arrays. Each nested array is represented as a pointer to the
// (numpy) array, thus the size of the element is not the size of the raw type, but the size of a pointer.
// This also affects how we allocate columns. Check cpp/arcticdb/column_store/column.hpp::Column and
// cpp/arcticdb/pipeline/column_mapping.hpp::datatype_size
esize = data_type_size(TypeDescriptor{tag}, output_format, DataTypeMode::EXTERNAL);
} else if constexpr (is_array_type(TypeDescriptor(tag))) {
dtype= "O";
// The python representation of multidimensional columns differs from the in-memory/on-storage one. In memory,
// we hold all scalars in a contiguous buffer with the shapes buffer telling us how many elements there are
// per array. Each element is of size sizeof(DataTypeTag::raw_type). For the python representation the column
// is represented as an array of (numpy) arrays. Each nested array is represented as a pointer to the
// (numpy) array, thus the size of the element is not the size of the raw type, but the size of a pointer.
// This also affects how we allocate columns. Check cpp/arcticdb/column_store/column.hpp::Column and
// cpp/arcticdb/pipeline/column_mapping.hpp::external_datatype_size
auto none = py::none();
auto &api = py::detail::npy_api::get();
// Walk the buffer one PyObject* at a time and register `anchor` as base of every nested array
// that is not None.
auto it = column_data.buffer().iterator(sizeof(PyObject*));
while(!it.finished()) {
auto* ptr = reinterpret_cast<PyObject*>(it.value());
util::check(ptr != nullptr, "Can't set base object on null item");
if(ptr != none.ptr())
// anchor's refcount is incremented for each nested array before the pointer is handed over.
// NOTE(review): this matches PyArray_SetBaseObject stealing a reference — confirm against the NumPy C-API docs.
api.PyArray_SetBaseObject_(ptr, anchor.inc_ref().ptr());

it.next();
}
esize = data_type_size(TypeDescriptor{tag}, output_format, DataTypeMode::EXTERNAL);
} else if constexpr(tag.dimension() == Dimension::Dim2) {
util::raise_rte("Read resulted in two dimensional type. This is not supported.");
} else {
// Dependent-false static_assert: fails compilation if a data type reaches this branch.
static_assert(!sizeof(data_type), "Unhandled data type");
}
// Note how base is passed to the array to register the data owner.
// It's especially important to keep the frame data object alive for as long as the array is alive
// so that regular python ref counting logic handles the liveness
return py::array(py::dtype{dtype}, {frame.row_count()}, {esize}, buffer.data(), anchor);
});
}

// Creates the FrameDataWrapper for `frame` and fills it with one numpy array per field.
//
// frame          segment whose columns are exposed
// output_format  passed unchanged to array_at for every column
// ref            Python object passed to array_at as the anchor for every column
//
// Total memory usage is reported before and after the arrays are created.
inline std::shared_ptr<pipelines::FrameDataWrapper> initialize_array(
        const SegmentInMemory& frame,
        OutputFormat output_format,
        py::object &ref) {
    auto wrapper = std::make_shared<pipelines::FrameDataWrapper>(frame.fields().size());
    ARCTICDB_SAMPLE(InitializeArrays, 0);
    ARCTICDB_DEBUG(log::memory(), "Initializing arrays");
    util::print_total_mem_usage(__FILE__, __LINE__, __FUNCTION__);

    // One array per field, stored at the field's own position.
    const auto field_count = static_cast<size_t>(frame.fields().size());
    std::size_t col_idx = 0;
    while (col_idx < field_count) {
        wrapper->data_[col_idx] = array_at(frame, col_idx, output_format, ref);
        ++col_idx;
    }

    util::print_total_mem_usage(__FILE__, __LINE__, __FUNCTION__);
    return wrapper;
}

}

namespace arcticdb::pipelines {

Expand Down
Loading
Loading