Skip to content
Open
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ if(PYBIND11_MASTER_PROJECT)
endif()

set(PYBIND11_HEADERS
include/pybind11/detail/argument_vector.h
include/pybind11/detail/class.h
include/pybind11/detail/common.h
include/pybind11/detail/cpp_conduit.h
Expand Down
9 changes: 7 additions & 2 deletions include/pybind11/cast.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#pragma once

#include "detail/argument_vector.h"
#include "detail/common.h"
#include "detail/descr.h"
#include "detail/native_enum_data.h"
Expand Down Expand Up @@ -2037,6 +2038,8 @@ using is_pos_only = std::is_same<intrinsic_t<T>, pos_only>;
// forward declaration (definition in attr.h)
struct function_record;

constexpr std::size_t arg_vector_small_size = 6;

/// Internal data associated with a single function call
struct function_call {
function_call(const function_record &f, handle p); // Implementation in attr.h
Expand All @@ -2045,10 +2048,12 @@ struct function_call {
const function_record &func;

/// Arguments passed to the function:
std::vector<handle> args;
/// (Inline size chosen mostly arbitrarily; 6 should pad function_call out to two cache lines
/// (16 pointers) in size.)
argument_vector<arg_vector_small_size> args;

/// The `convert` value the arguments should be loaded with
std::vector<bool> args_convert;
args_convert_vector<arg_vector_small_size> args_convert;

/// Extra references for the optional `py::args` and/or `py::kwargs` arguments (which, if
/// present, are also in `args` but without a reference).
Expand Down
321 changes: 321 additions & 0 deletions include/pybind11/detail/argument_vector.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,321 @@
/*
pybind11/detail/argument_vector.h: small_vector-like containers to
avoid heap allocation of arguments during function call dispatch.

Copyright (c) Meta Platforms, Inc. and affiliates.

All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/

#pragma once

#include <pybind11/pytypes.h>

#include "common.h"

#include <algorithm>
#include <array>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <utility>
#include <vector>

PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)

PYBIND11_WARNING_DISABLE_MSVC(4127)

PYBIND11_NAMESPACE_BEGIN(detail)

// Shared implementation utility for our small_vector-like containers.
// We support C++11 and C++14, so we cannot use
// std::variant. Union with the tag packed next to the inline
// array's size is smaller anyway, allowing 1 extra handle of
// inline storage for free. Compare the layouts (1 line per
// size_t/void*):
// With variant, total is N + 2 for N >= 2:
// - variant tag (cannot be packed with the array size)
// - array size (or first pointer of 3 in std::vector)
// - N pointers of inline storage (or 2 remaining pointers of std::vector)
// Custom union, total is N + 1 for N >= 3:
// - variant tag & array size if applicable
// - N pointers of inline storage (or 3 pointers of std::vector)
//
// NOTE: this is a low-level representational convenience; the two
// use cases of this union are materially different and in particular
// have different semantics for inline_array::size. All that is being
// shared is the memory management behavior.
template <typename ArrayT, std::size_t InlineSize, typename VectorT = ArrayT>
union inline_array_or_vector {
struct inline_array {
bool is_inline = true;
std::uint32_t size = 0;
std::array<ArrayT, InlineSize> arr;
};
struct heap_vector {
bool is_inline = false;
std::vector<VectorT> vec;

heap_vector() = default;
heap_vector(std::size_t count, VectorT value) : vec(count, value) {}
};

inline_array array;
heap_vector vector;

static_assert(std::is_trivially_move_constructible<ArrayT>::value,
"ArrayT must be trivially move constructible");
static_assert(std::is_trivially_destructible<ArrayT>::value,
"ArrayT must be trivially destructible");

inline_array_or_vector() : array() {}
~inline_array_or_vector() {
if (!is_inline()) {
vector.~heap_vector();
}
}
inline_array_or_vector(const inline_array_or_vector &) = delete;
inline_array_or_vector &operator=(const inline_array_or_vector &) = delete;

inline_array_or_vector(inline_array_or_vector &&rhs) noexcept {
if (rhs.is_inline()) {
std::memcpy(&array, &rhs.array, sizeof(array));
} else {
new (&vector) heap_vector(std::move(rhs.vector));
}
assert(is_inline() == rhs.is_inline());
}

inline_array_or_vector &operator=(inline_array_or_vector &&rhs) noexcept {
if (this == &rhs) {
return *this;
}

if (rhs.is_inline()) {
if (!is_inline()) {
vector.~heap_vector();
}
std::memcpy(&array, &rhs.array, sizeof(array));
} else {
if (is_inline()) {
new (&vector) heap_vector(std::move(rhs.vector));
} else {
vector = std::move(rhs.vector);
}
}
return *this;
}

bool is_inline() const {
// It is undefined behavior to access the inactive member of a
// union directly. However, it is well-defined to reinterpret_cast any
// pointer into a pointer to char and examine it as an array
// of bytes. See
// https://dev-discuss.pytorch.org/t/unionizing-for-profit-how-to-exploit-the-power-of-unions-in-c/444#the-memcpy-loophole-4
bool result = false;
std::memcpy(&result, reinterpret_cast<const char *>(this), sizeof(bool));
return result;
}
};

// small_vector-like container to avoid heap allocation for N or fewer
// arguments.
template <std::size_t N>
struct argument_vector {
public:
argument_vector() = default;

argument_vector(const argument_vector &) = delete;
argument_vector &operator=(const argument_vector &) = delete;
argument_vector(argument_vector &&) noexcept = default;
argument_vector &operator=(argument_vector &&) noexcept = default;

std::size_t size() const {
if (is_inline()) {
return m_repr.array.size;
}
return m_repr.vector.vec.size();
}

handle &operator[](std::size_t idx) {
assert(idx < size());
if (is_inline()) {
return m_repr.array.arr[idx];
}
return m_repr.vector.vec[idx];
}

handle operator[](std::size_t idx) const {
assert(idx < size());
if (is_inline()) {
return m_repr.array.arr[idx];
}
return m_repr.vector.vec[idx];
}

void push_back(handle x) {
if (is_inline()) {
auto &ha = m_repr.array;
if (ha.size == N) {
move_to_vector_with_reserved_size(N + 1);
push_back_slow_path(x);
} else {
ha.arr[ha.size++] = x;
}
} else {
push_back_slow_path(x);
}
}

template <typename Arg>
void emplace_back(Arg &&x) {
push_back(handle(x));
}

void reserve(std::size_t sz) {
if (is_inline()) {
if (sz > N) {
move_to_vector_with_reserved_size(sz);
}
} else {
reserve_slow_path(sz);
}
}

private:
using repr_type = inline_array_or_vector<handle, N>;
repr_type m_repr;

PYBIND11_NOINLINE void move_to_vector_with_reserved_size(std::size_t reserved_size) {
assert(is_inline());
auto &ha = m_repr.array;
using heap_vector = typename repr_type::heap_vector;
heap_vector hv;
hv.vec.reserve(reserved_size);
std::copy(ha.arr.begin(), ha.arr.begin() + ha.size, std::back_inserter(hv.vec));
new (&m_repr.vector) heap_vector(std::move(hv));
}

PYBIND11_NOINLINE void push_back_slow_path(handle x) { m_repr.vector.vec.push_back(x); }

PYBIND11_NOINLINE void reserve_slow_path(std::size_t sz) { m_repr.vector.vec.reserve(sz); }

bool is_inline() const { return m_repr.is_inline(); }
};

// small_vector-like container to avoid heap allocation for N or fewer
// arguments.
template <std::size_t kRequestedInlineSize>
struct args_convert_vector {
private:
public:
args_convert_vector() = default;

args_convert_vector(const args_convert_vector &) = delete;
args_convert_vector &operator=(const args_convert_vector &) = delete;
args_convert_vector(args_convert_vector &&) noexcept = default;
args_convert_vector &operator=(args_convert_vector &&) noexcept = default;

args_convert_vector(std::size_t count, bool value) {
if (count > kInlineSize) {
new (&m_repr.vector) typename repr_type::heap_vector(count, value);
} else {
auto &inline_arr = m_repr.array;
inline_arr.arr.fill(value ? std::size_t(-1) : 0);
inline_arr.size = static_cast<decltype(inline_arr.size)>(count);
}
}

std::size_t size() const {
if (is_inline()) {
return m_repr.array.size;
}
return m_repr.vector.vec.size();
}

void reserve(std::size_t sz) {
if (is_inline()) {
if (sz > kInlineSize) {
move_to_vector_with_reserved_size(sz);
}
} else {
m_repr.vector.vec.reserve(sz);
}
}

bool operator[](std::size_t idx) const {
if (is_inline()) {
return inline_index(idx);
}
assert(idx < m_repr.vector.vec.size());
return m_repr.vector.vec[idx];
}

void push_back(bool b) {
if (is_inline()) {
auto &ha = m_repr.array;
if (ha.size == kInlineSize) {
move_to_vector_with_reserved_size(kInlineSize + 1);
push_back_slow_path(b);
} else {
assert(ha.size < kInlineSize);
const auto wbi = word_and_bit_index(ha.size++);
assert(wbi.word < kWords);
assert(wbi.bit < kBitsPerWord);
if (b) {
ha.arr[wbi.word] |= (std::size_t(1) << wbi.bit);
} else {
ha.arr[wbi.word] &= ~(std::size_t(1) << wbi.bit);
}
assert(operator[](ha.size - 1) == b);
}
} else {
push_back_slow_path(b);
}
}

void swap(args_convert_vector &rhs) noexcept { std::swap(m_repr, rhs.m_repr); }

private:
struct WordAndBitIndex {
std::size_t word;
std::size_t bit;
};

static WordAndBitIndex word_and_bit_index(std::size_t idx) {
return WordAndBitIndex{idx / kBitsPerWord, idx % kBitsPerWord};
}

bool inline_index(std::size_t idx) const {
const auto wbi = word_and_bit_index(idx);
assert(wbi.word < kWords);
assert(wbi.bit < kBitsPerWord);
return m_repr.array.arr[wbi.word] & (std::size_t(1) << wbi.bit);
}

PYBIND11_NOINLINE void move_to_vector_with_reserved_size(std::size_t reserved_size) {
auto &inline_arr = m_repr.array;
using heap_vector = typename repr_type::heap_vector;
heap_vector hv;
hv.vec.reserve(reserved_size);
for (std::size_t ii = 0; ii < inline_arr.size; ++ii) {
hv.vec.push_back(inline_index(ii));
}
new (&m_repr.vector) heap_vector(std::move(hv));
}

PYBIND11_NOINLINE void push_back_slow_path(bool b) { m_repr.vector.vec.push_back(b); }

static constexpr auto kBitsPerWord = 8 * sizeof(std::size_t);
static constexpr auto kWords = (kRequestedInlineSize + kBitsPerWord - 1) / kBitsPerWord;
static constexpr auto kInlineSize = kWords * kBitsPerWord;

using repr_type = inline_array_or_vector<std::size_t, kWords, bool>;
repr_type m_repr;

bool is_inline() const { return m_repr.is_inline(); }
};

PYBIND11_NAMESPACE_END(detail)
PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
7 changes: 4 additions & 3 deletions include/pybind11/pybind11.h
Original file line number Diff line number Diff line change
Expand Up @@ -1048,13 +1048,14 @@ class cpp_function : public function {
}
#endif

std::vector<bool> second_pass_convert;
args_convert_vector<arg_vector_small_size> second_pass_convert;
if (overloaded) {
// We're in the first no-convert pass, so swap out the conversion flags for a
// set of all-false flags. If the call fails, we'll swap the flags back in for
// the conversion-allowed call below.
second_pass_convert.resize(func.nargs, false);
call.args_convert.swap(second_pass_convert);
second_pass_convert = std::move(call.args_convert);
call.args_convert
= args_convert_vector<arg_vector_small_size>(func.nargs, false);
}

// 6. Call the function.
Expand Down
4 changes: 2 additions & 2 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -647,8 +647,8 @@ if(NOT PYBIND11_CUDA_TESTS)
# Test pure C++ code (not depending on Python). Provides the `test_pure_cpp` target.
add_subdirectory(pure_cpp)

# Test embedding the interpreter. Provides the `cpptest` target.
add_subdirectory(test_embed)
# Test C++ code that depends on Python, such as embedding the interpreter. Provides the `cpptest` target.
add_subdirectory(test_with_catch)

# Test CMake build using functions and targets from subdirectory or installed location
add_subdirectory(test_cmake_build)
Expand Down
1 change: 1 addition & 0 deletions tests/extra_python_package/test_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
}

detail_headers = {
"include/pybind11/detail/argument_vector.h",
"include/pybind11/detail/class.h",
"include/pybind11/detail/common.h",
"include/pybind11/detail/cpp_conduit.h",
Expand Down
Loading
Loading