Skip to content

Commit 4e15b22

Browse files
authored
ThreadPool: Spend less time busy waiting. (#21545)
The purpose of the patch is primarily to save power, but it also has nice perf benefits (mostly from allowing the system to better distribute power to cores doing meaningful work). Changes are twofold: 1) Decrease the WorkerLoop spin count dramatically, ~10^6 -> ~10^4. The reality is that after ~10^4 spins, if no new work has been added, it's unlikely any new work is imminent, so sleep to preserve power. This aligns more closely with upstream EigenV3. 2) Use exponential backoff for waiting on memory. This saves a bit more power and, importantly, increases the time between iterations in WorkerLoop to help accommodate the dramatically lowered spin counts. Since the tuning for both the iteration counts / backoff counts is dramatically different for hybrid/non-hybrid systems, this patch templates the affected functions and dynamically chooses based on `CPUIDInfo::IsHybrid()`. This seemed like the "lightest weight" way of getting the change in, although it's likely we could incur less dynamic overhead if we added the template argument to the entirety of `ThreadPoolTempl`. Measured performance on an [Intel Meteor Lake CPU](https://www.intel.com/content/www/us/en/products/sku/237329/intel-core-ultra-7-processor-165u-12m-cache-up-to-4-90-ghz/specifications.html) across a range of models. Below are the results of 3 runs, with each metric being the value-before-patch / value-after-patch (so for something like inference time, lower is better). 
<div align="center"> <table> <tr> <th>Session creation time cost</th> <td>0.7179</td> </tr> <tr> <th>First inference time cost</th> <td>0.7156</td> </tr> <tr> <th>Total inference time cost</th> <td>1.0146</td> </tr> <tr> <th>Total inference requests</th> <td>0.8874</td> </tr> <tr> <th>Average inference time cost</th> <td>0.8800</td> </tr> <tr> <th>Total inference run time</th> <td>1.0146</td> </tr> <tr> <th>Number of inferences per second</th> <td>0.8955</td> </tr> <tr> <th>Avg CPU usage</th> <td>0.9462</td> </tr> <tr> <th>Peak working set size</th> <td>0.9922</td> </tr> <tr> <th>Runs</th> <td>1.1552</td> </tr> <tr> <th>Min Latency</th> <td>0.7283</td> </tr> <tr> <th>Max Latency</th> <td>0.9258</td> </tr> <tr> <th>P50 Latency</th> <td>0.9534</td> </tr> <tr> <th>P90 Latency</th> <td>0.9639</td> </tr> <tr> <th>P95 Latency</th> <td>0.9659</td> </tr> <tr> <th>P99 Latency</th> <td>0.9640</td> </tr> </table> </div> So the net result is a 1.16x improvement in throughput and between 1.08-1.37x improvement in latency.
1 parent 14d1bfc commit 4e15b22

File tree

3 files changed

+83
-25
lines changed

3 files changed

+83
-25
lines changed

include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h

+55-10
Original file line numberDiff line numberDiff line change
@@ -695,7 +695,7 @@ class RunQueue {
695695

696696
static std::atomic<uint32_t> next_tag{1};
697697

698-
template <typename Environment>
698+
template <typename Environment, bool kIsHybrid>
699699
class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInterface {
700700
private:
701701
struct PerThread;
@@ -767,6 +767,29 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
767767
typedef std::function<void()> Task;
768768
typedef RunQueue<Task, Tag, 1024> Queue;
769769

770+
// Class for waiting w/ exponential backoff.
771+
// Template argument is maximum number of spins in backoff loop.
772+
template <unsigned kMaxBackoff>
773+
class ThreadPoolWaiter {
774+
// Current number if spins in backoff loop
775+
unsigned pause_time_;
776+
777+
public:
778+
void wait() {
779+
// If kMaxBackoff is zero don't do any pausing.
780+
if constexpr (kMaxBackoff == 1) {
781+
onnxruntime::concurrency::SpinPause();
782+
} else if constexpr (kMaxBackoff > 1) {
783+
// Exponential backoff
784+
unsigned pause_time = pause_time_ + 1U;
785+
for (unsigned i = 0; i < pause_time; ++i) {
786+
onnxruntime::concurrency::SpinPause();
787+
}
788+
pause_time_ = (pause_time * 2U) % kMaxBackoff;
789+
}
790+
}
791+
};
792+
770793
ThreadPoolTempl(const CHAR_TYPE* name, int num_threads, bool allow_spinning, Environment& env,
771794
const ThreadOptions& thread_options)
772795
: profiler_(num_threads, name),
@@ -908,8 +931,9 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
908931
// finish dispatch work. This avoids new tasks being started
909932
// concurrently with us attempting to end the parallel section.
910933
if (ps.dispatch_q_idx != -1) {
934+
ThreadPoolWaiter<4> waiter{};
911935
while (!ps.dispatch_done.load(std::memory_order_acquire)) {
912-
onnxruntime::concurrency::SpinPause();
936+
waiter.wait();
913937
}
914938
}
915939

@@ -931,15 +955,17 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
931955

932956
// Wait for the dispatch task's own work...
933957
if (ps.dispatch_q_idx > -1) {
958+
ThreadPoolWaiter<kIsHybrid ? 0 : 1> waiter{};
934959
while (!ps.work_done.load(std::memory_order_acquire)) {
935-
onnxruntime::concurrency::SpinPause();
960+
waiter.wait();
936961
}
937962
}
938963

939964
// ...and wait for any other tasks not revoked to finish their work
940965
auto tasks_to_wait_for = tasks_started - ps.tasks_revoked;
966+
ThreadPoolWaiter<kIsHybrid ? 0 : 1> waiter{};
941967
while (ps.tasks_finished < tasks_to_wait_for) {
942-
onnxruntime::concurrency::SpinPause();
968+
waiter.wait();
943969
}
944970

945971
// Clear status to allow the ThreadPoolParallelSection to be
@@ -1257,9 +1283,10 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
12571283
// Increase the worker count if needed. Each worker will pick up
12581284
// loops to execute from the current parallel section.
12591285
std::function<void(unsigned)> worker_fn = [&ps](unsigned par_idx) {
1286+
ThreadPoolWaiter<kIsHybrid ? 4 : 0> waiter{};
12601287
while (ps.active) {
12611288
if (ps.current_loop.load() == nullptr) {
1262-
onnxruntime::concurrency::SpinPause();
1289+
waiter.wait();
12631290
} else {
12641291
ps.workers_in_loop++;
12651292
ThreadPoolLoop* work_item = ps.current_loop;
@@ -1280,8 +1307,9 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
12801307

12811308
// Wait for workers to exit the loop
12821309
ps.current_loop = 0;
1310+
ThreadPoolWaiter<kIsHybrid ? 1 : 4> waiter{};
12831311
while (ps.workers_in_loop) {
1284-
onnxruntime::concurrency::SpinPause();
1312+
waiter.wait();
12851313
}
12861314
profiler_.LogEnd(ThreadPoolProfiler::WAIT);
12871315
}
@@ -1532,13 +1560,30 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
15321560

15331561
assert(td.GetStatus() == WorkerData::ThreadStatus::Spinning);
15341562

1535-
constexpr int log2_spin = 20;
1536-
const int spin_count = allow_spinning_ ? (1ull << log2_spin) : 0;
1537-
const int steal_count = spin_count / 100;
1563+
// The exact value of spin_count and steal_count are arbitrary and
1564+
// were experimentally determined. These numbers yielded the best
1565+
// performance across a range of workloads and
1566+
// machines. Generally, the goal of tuning spin_count is to make
1567+
// the number as small as possible while ensuring there is enough
1568+
// slack so that if each core is doing the same amount of work it
1569+
// won't sleep before they have all finished. The idea here is
1570+
// that in pipelined workloads, it won't sleep during each stage
1571+
// if it's done a bit faster than its neighbors, but that if there
1572+
// are non-equal sizes of work distributed, it won't take too long
1573+
// to reach sleep giving power (and thus frequency/performance) to
1574+
// its neighbors. Since hybrid has P/E cores, a lower value is
1575+
// chosen. On hybrid systems, even with equal sized workloads
1576+
// distributed the compute time won't stay synced. Typically in
1577+
// the hybrid case the P cores finish first (and are thus waiting)
1578+
// which is essentially a priority inversion.
1579+
constexpr int pref_spin_count = kIsHybrid ? 5000 : 10000;
1580+
const int spin_count = allow_spinning_ ? pref_spin_count : 0;
1581+
constexpr int steal_count = pref_spin_count / (kIsHybrid ? 25 : 100);
15381582

15391583
SetDenormalAsZero(set_denormal_as_zero_);
15401584
profiler_.LogThreadId(thread_id);
15411585

1586+
ThreadPoolWaiter<kIsHybrid ? 1 : 8> waiter{};
15421587
while (!should_exit) {
15431588
Task t = q.PopFront();
15441589
if (!t) {
@@ -1554,7 +1599,7 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
15541599
if (spin_loop_status_.load(std::memory_order_relaxed) == SpinLoopStatus::kIdle) {
15551600
break;
15561601
}
1557-
onnxruntime::concurrency::SpinPause();
1602+
waiter.wait();
15581603
}
15591604

15601605
// Attempt to block

include/onnxruntime/core/platform/threadpool.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ struct TensorOpCost {
129129

130130
namespace concurrency {
131131

132-
template <typename Environment>
132+
template <typename Environment, bool kIsHybrid>
133133
class ThreadPoolTempl;
134134

135135
class ExtendedThreadPoolInterface;
@@ -424,7 +424,8 @@ class ThreadPool {
424424
ExtendedThreadPoolInterface* underlying_threadpool_ = nullptr;
425425

426426
// If used, underlying_threadpool_ is instantiated and owned by the ThreadPool.
427-
std::unique_ptr<ThreadPoolTempl<Env> > extended_eigen_threadpool_;
427+
std::unique_ptr<ThreadPoolTempl<Env, true>> extended_eigen_hybrid_threadpool_;
428+
std::unique_ptr<ThreadPoolTempl<Env, false>> extended_eigen_normal_threadpool_;
428429

429430
// Force the thread pool to run in hybrid mode on a normal cpu.
430431
bool force_hybrid_ = false;

onnxruntime/core/common/threadpool.cc

+25-13
Original file line numberDiff line numberDiff line change
@@ -389,13 +389,23 @@ ThreadPool::ThreadPool(Env* env,
389389
assert(thread_options_.affinities.size() >= size_t(threads_to_create));
390390
}
391391

392-
extended_eigen_threadpool_ =
393-
std::make_unique<ThreadPoolTempl<Env> >(name,
394-
threads_to_create,
395-
low_latency_hint,
396-
*env,
397-
thread_options_);
398-
underlying_threadpool_ = extended_eigen_threadpool_.get();
392+
if (force_hybrid_) {
393+
extended_eigen_hybrid_threadpool_ =
394+
std::make_unique<ThreadPoolTempl<Env, true> >(name,
395+
threads_to_create,
396+
low_latency_hint,
397+
*env,
398+
thread_options_);
399+
underlying_threadpool_ = extended_eigen_hybrid_threadpool_.get();
400+
} else {
401+
extended_eigen_normal_threadpool_ =
402+
std::make_unique<ThreadPoolTempl<Env, false> >(name,
403+
threads_to_create,
404+
low_latency_hint,
405+
*env,
406+
thread_options_);
407+
underlying_threadpool_ = extended_eigen_normal_threadpool_.get();
408+
}
399409
}
400410
}
401411

@@ -664,15 +674,17 @@ std::string ThreadPool::StopProfiling(concurrency::ThreadPool* tp) {
664674
}
665675

666676
void ThreadPool::EnableSpinning() {
667-
if (extended_eigen_threadpool_) {
668-
extended_eigen_threadpool_->EnableSpinning();
669-
}
677+
if (extended_eigen_hybrid_threadpool_)
678+
extended_eigen_hybrid_threadpool_->EnableSpinning();
679+
else if (extended_eigen_normal_threadpool_)
680+
extended_eigen_normal_threadpool_->EnableSpinning();
670681
}
671682

672683
void ThreadPool::DisableSpinning() {
673-
if (extended_eigen_threadpool_) {
674-
extended_eigen_threadpool_->DisableSpinning();
675-
}
684+
if (extended_eigen_hybrid_threadpool_)
685+
extended_eigen_hybrid_threadpool_->DisableSpinning();
686+
else if (extended_eigen_normal_threadpool_)
687+
extended_eigen_normal_threadpool_->DisableSpinning();
676688
}
677689

678690
// Return the number of threads created by the pool.

0 commit comments

Comments
 (0)