-
Notifications
You must be signed in to change notification settings - Fork 2.2k
Support for sub-interpreters #5564
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
756feb0
b761a1f
aea87f8
63a614d
c496c00
71f8370
6aa619a
431ec00
ebb2082
83bafeb
0387086
bbda5eb
1f21a65
da7f2bc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,7 @@ | |
|
||
#include "common.h" | ||
|
||
#include <atomic> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note to self since I checked: we are already using atomic in object.h (using atomic requires linking to libatomic on some platforms, like armv7l). I think we are actually missing that currently (https://github.com/scikit-hep/boost-histogram/blob/38ae735c07a9bbbbc80ca5ad9b57af106f61ef43/CMakeLists.txt#L91-L93 for example), but this isn't a new issue. |
||
#include <exception> | ||
#include <mutex> | ||
#include <thread> | ||
|
@@ -53,6 +54,7 @@ constexpr const char *internals_function_record_capsule_name = "pybind11_functio | |
inline PyTypeObject *make_static_property_type(); | ||
inline PyTypeObject *make_default_metaclass(); | ||
inline PyObject *make_object_base_type(PyTypeObject *metaclass); | ||
inline void translate_exception(std::exception_ptr p); | ||
|
||
// The old Python Thread Local Storage (TLS) API is deprecated in Python 3.7 in favor of the new | ||
// Thread Specific Storage (TSS) API. | ||
|
@@ -149,6 +151,20 @@ struct instance_map_shard { | |
|
||
static_assert(sizeof(instance_map_shard) % 64 == 0, | ||
"instance_map_shard size is not a multiple of 64 bytes"); | ||
|
||
inline uint64_t round_up_to_next_pow2(uint64_t x) { | ||
// Round-up to the next power of two. | ||
// See https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 | ||
x--; | ||
x |= (x >> 1); | ||
x |= (x >> 2); | ||
x |= (x >> 4); | ||
x |= (x >> 8); | ||
x |= (x >> 16); | ||
x |= (x >> 32); | ||
x++; | ||
return x; | ||
} | ||
#endif | ||
|
||
/// Internal data structure used to track registered instances and types. | ||
|
@@ -178,9 +194,9 @@ struct internals { | |
// extensions | ||
std::forward_list<std::string> static_strings; // Stores the std::strings backing | ||
// detail::c_str() | ||
PyTypeObject *static_property_type; | ||
PyTypeObject *default_metaclass; | ||
PyObject *instance_base; | ||
PyTypeObject *static_property_type = nullptr; | ||
PyTypeObject *default_metaclass = nullptr; | ||
PyObject *instance_base = nullptr; | ||
// Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined: | ||
PYBIND11_TLS_KEY_INIT(tstate) | ||
PYBIND11_TLS_KEY_INIT(loader_life_support_tls_key) | ||
|
@@ -189,7 +205,36 @@ struct internals { | |
|
||
type_map<PyObject *> native_enum_type_map; | ||
|
||
internals() = default; | ||
internals() { | ||
PyThreadState *curtstate = PyThreadState_Get(); | ||
// NOLINTNEXTLINE(bugprone-assignment-in-if-condition) | ||
if (!PYBIND11_TLS_KEY_CREATE(tstate)) { | ||
pybind11_fail( | ||
"internals constructor: could not successfully initialize the tstate TSS key!"); | ||
} | ||
PYBIND11_TLS_REPLACE_VALUE(tstate, curtstate); | ||
|
||
// NOLINTNEXTLINE(bugprone-assignment-in-if-condition) | ||
if (!PYBIND11_TLS_KEY_CREATE(loader_life_support_tls_key)) { | ||
pybind11_fail("internals constructor: could not successfully initialize the " | ||
"loader_life_support TSS key!"); | ||
} | ||
|
||
istate = curtstate->interp; | ||
registered_exception_translators.push_front(&translate_exception); | ||
static_property_type = make_static_property_type(); | ||
default_metaclass = make_default_metaclass(); | ||
#ifdef Py_GIL_DISABLED | ||
// Scale proportional to the number of cores. 2x is a heuristic to reduce contention. | ||
auto num_shards | ||
= static_cast<size_t>(round_up_to_next_pow2(2 * std::thread::hardware_concurrency())); | ||
if (num_shards == 0) { | ||
num_shards = 1; | ||
} | ||
instance_shards.reset(new instance_map_shard[num_shards]); | ||
instance_shards_mask = num_shards - 1; | ||
#endif | ||
} | ||
internals(const internals &other) = delete; | ||
internals &operator=(const internals &other) = delete; | ||
~internals() { | ||
|
@@ -252,11 +297,49 @@ struct type_info { | |
"__pybind11_module_local_v" PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) \ | ||
PYBIND11_COMPILER_TYPE_LEADING_UNDERSCORE PYBIND11_PLATFORM_ABI_ID "__" | ||
|
||
inline PyThreadState *get_thread_state_unchecked() { | ||
#if defined(PYPY_VERSION) || defined(GRAALVM_PYTHON) | ||
return PyThreadState_GET(); | ||
#elif PY_VERSION_HEX < 0x030D0000 | ||
return _PyThreadState_UncheckedGet(); | ||
#else | ||
return PyThreadState_GetUnchecked(); | ||
#endif | ||
} | ||
|
||
/// We use this counter to figure out if there are or have been multiple subinterpreters active at | ||
/// any point. This must never decrease while any interpreter may be running in any thread! | ||
inline std::atomic<int> &get_interpreter_counter() { | ||
static std::atomic<int> counter(0); | ||
return counter; | ||
} | ||
|
||
/// Each module locally stores a pointer to the `internals` data. The data | ||
/// itself is shared among modules with the same `PYBIND11_INTERNALS_ID`. | ||
inline internals **&get_internals_pp() { | ||
static internals **internals_pp = nullptr; | ||
return internals_pp; | ||
#ifdef PYBIND11_SUBINTERPRETER_SUPPORT | ||
if (get_interpreter_counter() > 1) { | ||
// Internals is one per interpreter. When multiple interpreters are alive in different | ||
// threads we have to allow them to have different internals, so we need a thread_local. | ||
static thread_local internals **t_internals_pp = nullptr; | ||
static thread_local PyInterpreterState *istate_cached = nullptr; | ||
// Whenever the interpreter changes on the current thread we need to invalidate the | ||
// internals_pp so that it can be pulled from the interpreter's state dict. That is slow, | ||
// so we use the current PyThreadState to check if it is necessary. The caller will see a | ||
// null return and do the fetch from the state dict or create a new one (as needed). | ||
auto *tstate = get_thread_state_unchecked(); | ||
if (!tstate) { | ||
istate_cached = nullptr; | ||
t_internals_pp = nullptr; | ||
} else if (tstate->interp != istate_cached) { | ||
istate_cached = tstate->interp; | ||
t_internals_pp = nullptr; | ||
} | ||
return t_internals_pp; | ||
} | ||
#endif | ||
static internals **s_internals_pp = nullptr; | ||
return s_internals_pp; | ||
} | ||
|
||
// forward decl | ||
|
@@ -387,49 +470,46 @@ inline object get_python_state_dict() { | |
return state_dict; | ||
} | ||
|
||
inline object get_internals_obj_from_state_dict(handle state_dict) { | ||
return reinterpret_steal<object>( | ||
dict_getitemstringref(state_dict.ptr(), PYBIND11_INTERNALS_ID)); | ||
} | ||
|
||
inline internals **get_internals_pp_from_capsule(handle obj) { | ||
void *raw_ptr = PyCapsule_GetPointer(obj.ptr(), /*name=*/nullptr); | ||
if (raw_ptr == nullptr) { | ||
raise_from(PyExc_SystemError, "pybind11::detail::get_internals_pp_from_capsule() FAILED"); | ||
throw error_already_set(); | ||
template <typename InternalsType> | ||
inline InternalsType **get_internals_pp_from_capsule_in_state_dict(dict &state_dict, | ||
char const *state_dict_key) { | ||
auto internals_obj | ||
= reinterpret_steal<object>(dict_getitemstringref(state_dict.ptr(), state_dict_key)); | ||
if (internals_obj) { | ||
void *raw_ptr = PyCapsule_GetPointer(internals_obj.ptr(), /*name=*/nullptr); | ||
if (!raw_ptr) { | ||
raise_from(PyExc_SystemError, | ||
"pybind11::detail::get_internals_pp_from_capsule_in_state_dict() FAILED"); | ||
throw error_already_set(); | ||
} | ||
return reinterpret_cast<InternalsType **>(raw_ptr); | ||
} | ||
return static_cast<internals **>(raw_ptr); | ||
} | ||
|
||
inline uint64_t round_up_to_next_pow2(uint64_t x) { | ||
// Round-up to the next power of two. | ||
// See https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 | ||
x--; | ||
x |= (x >> 1); | ||
x |= (x >> 2); | ||
x |= (x >> 4); | ||
x |= (x >> 8); | ||
x |= (x >> 16); | ||
x |= (x >> 32); | ||
x++; | ||
return x; | ||
return nullptr; | ||
} | ||
|
||
/// Return a reference to the current `internals` data | ||
PYBIND11_NOINLINE internals &get_internals() { | ||
auto **&internals_pp = get_internals_pp(); | ||
b-pass marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if (internals_pp && *internals_pp) { | ||
// This is the fast path, everything is already setup, just return it | ||
return **internals_pp; | ||
} | ||
|
||
// Slow path, something needs fetched from the state dict or created | ||
|
||
// Cannot use py::gil_scoped_acquire inside get_internals since that calls get_internals. | ||
gil_scoped_acquire_simple gil; | ||
error_scope err_scope; | ||
|
||
dict state_dict = get_python_state_dict(); | ||
if (object internals_obj = get_internals_obj_from_state_dict(state_dict)) { | ||
internals_pp = get_internals_pp_from_capsule(internals_obj); | ||
internals_pp = get_internals_pp_from_capsule_in_state_dict<internals>(state_dict, | ||
PYBIND11_INTERNALS_ID); | ||
if (!internals_pp) { | ||
internals_pp = new internals *(nullptr); | ||
state_dict[PYBIND11_INTERNALS_ID] = capsule(reinterpret_cast<void *>(internals_pp)); | ||
} | ||
if (internals_pp && *internals_pp) { | ||
|
||
if (*internals_pp) { | ||
// We loaded the internals through `state_dict`, which means that our `error_already_set` | ||
// and `builtin_exception` may be different local classes than the ones set up in the | ||
// initial exception translator, below, so add another for our local exception classes. | ||
|
@@ -438,45 +518,24 @@ PYBIND11_NOINLINE internals &get_internals() { | |
// libc++ with CPython doesn't require this (types are explicitly exported) | ||
// libc++ with PyPy still need it, awaiting further investigation | ||
#if !defined(__GLIBCXX__) | ||
(*internals_pp)->registered_exception_translators.push_front(&translate_local_exception); | ||
if ((*internals_pp)->registered_exception_translators.empty() | ||
|| (*internals_pp)->registered_exception_translators.front() | ||
!= &translate_local_exception) { | ||
(*internals_pp) | ||
->registered_exception_translators.push_front(&translate_local_exception); | ||
} | ||
#endif | ||
} else { | ||
if (!internals_pp) { | ||
internals_pp = new internals *(); | ||
} | ||
auto *&internals_ptr = *internals_pp; | ||
auto &internals_ptr = *internals_pp; | ||
internals_ptr = new internals(); | ||
|
||
PyThreadState *tstate = PyThreadState_Get(); | ||
// NOLINTNEXTLINE(bugprone-assignment-in-if-condition) | ||
if (!PYBIND11_TLS_KEY_CREATE(internals_ptr->tstate)) { | ||
pybind11_fail("get_internals: could not successfully initialize the tstate TSS key!"); | ||
} | ||
PYBIND11_TLS_REPLACE_VALUE(internals_ptr->tstate, tstate); | ||
|
||
// NOLINTNEXTLINE(bugprone-assignment-in-if-condition) | ||
if (!PYBIND11_TLS_KEY_CREATE(internals_ptr->loader_life_support_tls_key)) { | ||
pybind11_fail("get_internals: could not successfully initialize the " | ||
"loader_life_support TSS key!"); | ||
if (!internals_ptr->instance_base) { | ||
// This calls get_internals, so cannot be called from within the internals constructor | ||
// called above because internals_ptr must be set before get_internals is called again | ||
internals_ptr->instance_base = make_object_base_type(internals_ptr->default_metaclass); | ||
} | ||
|
||
internals_ptr->istate = tstate->interp; | ||
state_dict[PYBIND11_INTERNALS_ID] = capsule(reinterpret_cast<void *>(internals_pp)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wow. I think we should break out this fix (moving it up) into a separate PR: See end of this ChatGPT conversation: https://chatgpt.com/share/67fc091a-2f00-8008-bb9e-4e3437fa3c80 I'll stop reviewing this PR for now. Let's do two PRs in preparation:
On the back of my mind: I had a lot of trouble with I believe that's still a problem, although I haven't tried in a while. Maybe that'll be fixed if we follow the ChatGPT suggestions? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think ChatGPT got confused here (maybe because of a couple of these ifdef blocks). All this code was (and still is) part of a block that only runs when The push_front which is only for PyPy, I do not understand why that code is necessary enough to remove it. I added the check to make sure that the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, you're right, this can be cleaned up at least so far as moving most of that code into the constructor where it belongs. I've done that in this PR. One line could not be moved because it calls |
||
internals_ptr->registered_exception_translators.push_front(&translate_exception); | ||
internals_ptr->static_property_type = make_static_property_type(); | ||
internals_ptr->default_metaclass = make_default_metaclass(); | ||
internals_ptr->instance_base = make_object_base_type(internals_ptr->default_metaclass); | ||
#ifdef Py_GIL_DISABLED | ||
// Scale proportional to the number of cores. 2x is a heuristic to reduce contention. | ||
auto num_shards | ||
= static_cast<size_t>(round_up_to_next_pow2(2 * std::thread::hardware_concurrency())); | ||
if (num_shards == 0) { | ||
num_shards = 1; | ||
} | ||
internals_ptr->instance_shards.reset(new instance_map_shard[num_shards]); | ||
internals_ptr->instance_shards_mask = num_shards - 1; | ||
#endif // Py_GIL_DISABLED | ||
} | ||
|
||
return **internals_pp; | ||
} | ||
|
||
|
@@ -491,15 +550,66 @@ struct local_internals { | |
std::forward_list<ExceptionTranslator> registered_exception_translators; | ||
}; | ||
|
||
inline local_internals **&get_local_internals_pp() { | ||
#ifdef PYBIND11_SUBINTERPRETER_SUPPORT | ||
if (get_interpreter_counter() > 1) { | ||
// Internals is one per interpreter. When multiple interpreters are alive in different | ||
// threads we have to allow them to have different internals, so we need a thread_local. | ||
static thread_local local_internals **t_internals_pp = nullptr; | ||
static thread_local PyInterpreterState *istate_cached = nullptr; | ||
// Whenever the interpreter changes on the current thread we need to invalidate the | ||
// internals_pp so that it can be pulled from the interpreter's state dict. That is slow, | ||
// so we use the current PyThreadState to check if it is necessary. The caller will see a | ||
// null return and do the fetch from the state dict or create a new one (as needed). | ||
auto *tstate = get_thread_state_unchecked(); | ||
if (!tstate) { | ||
istate_cached = nullptr; | ||
t_internals_pp = nullptr; | ||
} else if (tstate->interp != istate_cached) { | ||
istate_cached = tstate->interp; | ||
t_internals_pp = nullptr; | ||
} | ||
return t_internals_pp; | ||
} | ||
#endif | ||
static local_internals **s_internals_pp = nullptr; | ||
return s_internals_pp; | ||
} | ||
|
||
/// A string key uniquely describing this module | ||
inline char const *get_local_internals_id() { | ||
// Use the address of this static itself as part of the key, so that the value is uniquely tied | ||
// to where the module is loaded in memory | ||
static const std::string this_module_idstr | ||
= PYBIND11_MODULE_LOCAL_ID | ||
+ std::to_string(reinterpret_cast<uintptr_t>(&this_module_idstr)); | ||
return this_module_idstr.c_str(); | ||
} | ||
|
||
/// Works like `get_internals`, but for things which are locally registered. | ||
inline local_internals &get_local_internals() { | ||
// Current static can be created in the interpreter finalization routine. If the later will be | ||
// destroyed in another static variable destructor, creation of this static there will cause | ||
// static deinitialization fiasco. In order to avoid it we avoid destruction of the | ||
// local_internals static. One can read more about the problem and current solution here: | ||
// https://google.github.io/styleguide/cppguide.html#Static_and_Global_Variables | ||
static auto *locals = new local_internals(); | ||
return *locals; | ||
auto **&local_internals_pp = get_local_internals_pp(); | ||
if (local_internals_pp && *local_internals_pp) { | ||
return **local_internals_pp; | ||
} | ||
|
||
// Cannot use py::gil_scoped_acquire inside get_internals since that calls get_internals. | ||
gil_scoped_acquire_simple gil; | ||
error_scope err_scope; | ||
|
||
dict state_dict = get_python_state_dict(); | ||
local_internals_pp = get_internals_pp_from_capsule_in_state_dict<local_internals>( | ||
state_dict, get_local_internals_id()); | ||
if (!local_internals_pp) { | ||
local_internals_pp = new local_internals *(nullptr); | ||
state_dict[get_local_internals_id()] | ||
= capsule(reinterpret_cast<void *>(local_internals_pp)); | ||
} | ||
if (!*local_internals_pp) { | ||
*local_internals_pp = new local_internals(); | ||
} | ||
|
||
return **local_internals_pp; | ||
} | ||
|
||
#ifdef Py_GIL_DISABLED | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hm ... what are the pros-and-cons of forced-in (what you have here) vs opt-in (my suggestion from a few days ago)?
How widely used and mature is the subinterpreter feature in general?
Also, is there someone you know who might be available to fully review this PR? It looks like a lot of work, at least for me. I don't know anything specific about subinterpreters.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The initial version of this PR had some potential performance problems, but they're eliminated now IMO so it doesn't seem necessary to complicate the feature with cmake options and compile defines. These preprocessor lines just consolidate the different circumstances where sub-interpreters are not supported into one place... could be removed as the resulting define here is only used in 2 other places.
Sub-interpreters have been around for a long time but not widely used. Python 3.12 introduced per-interpreter GIL support into the stable API (PEP 684). It is stable in 3.12, while free threading is still experimental in 3.13 and requires opt-in flags in CPython. If someone is writing "production" code in 2025 they should not be using free threading, but they could consider using sub-interpreters with per-interpreter GIL to get some benefits of parallelism.
I don't know anyone else who specifically knows a lot about sub-interpreters. The main things to know about working with sub-interpreters:
internals
andlocal_internals
are global state which really need to be per-interpreter state.)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I feel strongly it should stay here, for clarity, and future maintainability (including refactoring).
I'm still on the fence, force-in vs opt-in. @henryiii do you have an opinion?