Commit c415991
Revert "ThreadPool: Spend less time busy waiting. (#21545)" (#22350)
This reverts commit 4e15b22.

### Description
We are seeing an increase in the number of deadlocks (#22315) after this PR. We have a release coming up next week and do not have enough time to investigate the root cause, hence we are reverting this PR temporarily. Moreover, the change was causing an increase in the binary size.

### Motivation and Context
See above.
1 parent: c5d28ca

3 files changed: +25 −83 lines
include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h (+10 −55)
@@ -695,7 +695,7 @@ class RunQueue {
 
 static std::atomic<uint32_t> next_tag{1};
 
-template <typename Environment, bool kIsHybrid>
+template <typename Environment>
 class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInterface {
  private:
   struct PerThread;
@@ -767,29 +767,6 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInterface {
   typedef std::function<void()> Task;
   typedef RunQueue<Task, Tag, 1024> Queue;
 
-  // Class for waiting w/ exponential backoff.
-  // Template argument is maximum number of spins in backoff loop.
-  template <unsigned kMaxBackoff>
-  class ThreadPoolWaiter {
-    // Current number if spins in backoff loop
-    unsigned pause_time_;
-
-   public:
-    void wait() {
-      // If kMaxBackoff is zero don't do any pausing.
-      if constexpr (kMaxBackoff == 1) {
-        onnxruntime::concurrency::SpinPause();
-      } else if constexpr (kMaxBackoff > 1) {
-        // Exponential backoff
-        unsigned pause_time = pause_time_ + 1U;
-        for (unsigned i = 0; i < pause_time; ++i) {
-          onnxruntime::concurrency::SpinPause();
-        }
-        pause_time_ = (pause_time * 2U) % kMaxBackoff;
-      }
-    }
-  };
-
   ThreadPoolTempl(const CHAR_TYPE* name, int num_threads, bool allow_spinning, Environment& env,
                   const ThreadOptions& thread_options)
       : profiler_(num_threads, name),
@@ -931,9 +908,8 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInterface {
     // finish dispatch work. This avoids new tasks being started
     // concurrently with us attempting to end the parallel section.
     if (ps.dispatch_q_idx != -1) {
-      ThreadPoolWaiter<4> waiter{};
       while (!ps.dispatch_done.load(std::memory_order_acquire)) {
-        waiter.wait();
+        onnxruntime::concurrency::SpinPause();
       }
     }
 
@@ -955,17 +931,15 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInterface {
 
     // Wait for the dispatch task's own work...
     if (ps.dispatch_q_idx > -1) {
-      ThreadPoolWaiter<kIsHybrid ? 0 : 1> waiter{};
       while (!ps.work_done.load(std::memory_order_acquire)) {
-        waiter.wait();
+        onnxruntime::concurrency::SpinPause();
       }
     }
 
     // ...and wait for any other tasks not revoked to finish their work
     auto tasks_to_wait_for = tasks_started - ps.tasks_revoked;
-    ThreadPoolWaiter<kIsHybrid ? 0 : 1> waiter{};
     while (ps.tasks_finished < tasks_to_wait_for) {
-      waiter.wait();
+      onnxruntime::concurrency::SpinPause();
     }
 
     // Clear status to allow the ThreadPoolParallelSection to be
@@ -1283,10 +1257,9 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInterface {
     // Increase the worker count if needed. Each worker will pick up
     // loops to execute from the current parallel section.
     std::function<void(unsigned)> worker_fn = [&ps](unsigned par_idx) {
-      ThreadPoolWaiter<kIsHybrid ? 4 : 0> waiter{};
       while (ps.active) {
         if (ps.current_loop.load() == nullptr) {
-          waiter.wait();
+          onnxruntime::concurrency::SpinPause();
         } else {
           ps.workers_in_loop++;
           ThreadPoolLoop* work_item = ps.current_loop;
@@ -1307,9 +1280,8 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInterface {
 
     // Wait for workers to exit the loop
     ps.current_loop = 0;
-    ThreadPoolWaiter<kIsHybrid ? 1 : 4> waiter{};
     while (ps.workers_in_loop) {
-      waiter.wait();
+      onnxruntime::concurrency::SpinPause();
     }
     profiler_.LogEnd(ThreadPoolProfiler::WAIT);
   }
@@ -1560,30 +1532,13 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInterface {
 
     assert(td.GetStatus() == WorkerData::ThreadStatus::Spinning);
 
-    // The exact value of spin_count and steal_count are arbitrary and
-    // were experimentally determined. These numbers yielded the best
-    // performance across a range of workloads and
-    // machines. Generally, the goal of tuning spin_count is to make
-    // the number as small as possible while ensuring there is enough
-    // slack so that if each core is doing the same amount of work it
-    // won't sleep before they have all finished. The idea here is
-    // that in pipelined workloads, it won't sleep during each stage
-    // if it's done a bit faster than its neighbors, but that if there
-    // are non-equal sizes of work distributed, it won't take too long
-    // to reach sleep giving power (and thus frequency/performance) to
-    // its neighbors. Since hybrid has P/E cores, a lower value is
-    // chosen. On hybrid systems, even with equal sized workloads
-    // distributed the compute time won't stay synced. Typically in
-    // the hybrid case the P cores finish first (and are thus waiting)
-    // which is essentially a priority inversion.
-    constexpr int pref_spin_count = kIsHybrid ? 5000 : 10000;
-    const int spin_count = allow_spinning_ ? pref_spin_count : 0;
-    constexpr int steal_count = pref_spin_count / (kIsHybrid ? 25 : 100);
+    constexpr int log2_spin = 20;
+    const int spin_count = allow_spinning_ ? (1ull << log2_spin) : 0;
+    const int steal_count = spin_count / 100;
 
     SetDenormalAsZero(set_denormal_as_zero_);
     profiler_.LogThreadId(thread_id);
 
-    ThreadPoolWaiter<kIsHybrid ? 1 : 8> waiter{};
     while (!should_exit) {
       Task t = q.PopFront();
       if (!t) {
@@ -1599,7 +1554,7 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInterface {
         if (spin_loop_status_.load(std::memory_order_relaxed) == SpinLoopStatus::kIdle) {
           break;
         }
-        waiter.wait();
+        onnxruntime::concurrency::SpinPause();
       }
 
       // Attempt to block
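For context, the `ThreadPoolWaiter` removed above replaced a fixed one-pause-per-check loop with exponential backoff: each unsuccessful poll roughly doubles the number of pause instructions issued, capped by the `kMaxBackoff` template argument (0 disables pausing entirely, 1 degenerates to a single pause per call). A minimal standalone sketch of the same pattern follows; `spin_pause`, `BackoffWaiter`, and the polled `done` flag are illustrative stand-ins, not onnxruntime API.

#include <atomic>
#include <thread>

// Stand-in for onnxruntime::concurrency::SpinPause(); on x86 this would
// issue _mm_pause() to yield pipeline resources to a sibling hyperthread.
inline void spin_pause() {}

// Exponential-backoff waiter mirroring the reverted ThreadPoolWaiter:
// kMaxBackoff == 0 never pauses, == 1 does a single pause per wait(),
// and > 1 roughly doubles the pause count on each call, modulo the cap.
template <unsigned kMaxBackoff>
class BackoffWaiter {
  unsigned pause_time_ = 0;  // current number of spins in the backoff loop

 public:
  void wait() {
    if constexpr (kMaxBackoff == 1) {
      spin_pause();
    } else if constexpr (kMaxBackoff > 1) {
      unsigned pause_time = pause_time_ + 1U;
      for (unsigned i = 0; i < pause_time; ++i) {
        spin_pause();
      }
      pause_time_ = (pause_time * 2U) % kMaxBackoff;
    }
  }
};

int main() {
  std::atomic<bool> done{false};
  std::thread worker([&] { done.store(true, std::memory_order_release); });
  BackoffWaiter<4> waiter{};  // same cap the dispatch wait above used
  while (!done.load(std::memory_order_acquire)) {
    waiter.wait();  // backs off instead of hammering the flag
  }
  worker.join();
  return 0;
}

Separately, the last two hunks restore the pre-#21545 spin budget: spin_count returns to 1 << 20 = 1,048,576 pop attempts before a worker blocks, roughly 100x the reverted PR's tuned 10000 (non-hybrid) and 5000 (hybrid) values. That larger budget is the busy waiting the reverted PR had set out to reduce.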

include/onnxruntime/core/platform/threadpool.h (+2 −3)
@@ -129,7 +129,7 @@ struct TensorOpCost {
 
 namespace concurrency {
 
-template <typename Environment, bool kIsHybrid>
+template <typename Environment>
 class ThreadPoolTempl;
 
 class ExtendedThreadPoolInterface;
@@ -424,8 +424,7 @@ class ThreadPool {
   ExtendedThreadPoolInterface* underlying_threadpool_ = nullptr;
 
   // If used, underlying_threadpool_ is instantiated and owned by the ThreadPool.
-  std::unique_ptr<ThreadPoolTempl<Env, true>> extended_eigen_hybrid_threadpool_;
-  std::unique_ptr<ThreadPoolTempl<Env, false>> extended_eigen_normal_threadpool_;
+  std::unique_ptr<ThreadPoolTempl<Env> > extended_eigen_threadpool_;
 
   // Force the thread pool to run in hybrid mode on a normal cpu.
   bool force_hybrid_ = false;
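This hunk is the likely source of the binary-size remark in the commit message: a `bool` non-type template parameter makes `ThreadPoolTempl<Env, true>` and `ThreadPoolTempl<Env, false>` unrelated types, so the compiler emits two full copies of the pool's code and the owner needs two `unique_ptr` slots, whereas the runtime `force_hybrid_` flag needs one. A minimal sketch of the trade-off under illustrative names (`PoolCT` and `PoolRT` are not onnxruntime types; the 5000/10000 spin values are taken from the header diff above):

#include <memory>

// Compile-time variant: each distinct kIsHybrid value stamps out a
// separate instantiation of every member function (bigger binary,
// no runtime branch).
template <bool kIsHybrid>
struct PoolCT {
  int spin_count() const { return kIsHybrid ? 5000 : 10000; }
};

// Runtime variant: a single instantiation; behavior hangs off a flag,
// which is what the revert goes back to (cf. force_hybrid_).
struct PoolRT {
  bool hybrid = false;
  int spin_count() const { return hybrid ? 5000 : 10000; }
};

int main() {
  // PoolCT<true> and PoolCT<false> are unrelated types, hence the two
  // unique_ptr members the reverted threadpool.h needed:
  std::unique_ptr<PoolCT<true>> hybrid_pool = std::make_unique<PoolCT<true>>();
  std::unique_ptr<PoolCT<false>> normal_pool = std::make_unique<PoolCT<false>>();

  // One slot suffices for the runtime variant:
  auto pool = std::make_unique<PoolRT>();
  pool->hybrid = true;

  return (pool->spin_count() == hybrid_pool->spin_count() &&
          normal_pool->spin_count() == 10000)
             ? 0
             : 1;
}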

onnxruntime/core/common/threadpool.cc (+13 −25)
@@ -389,23 +389,13 @@ ThreadPool::ThreadPool(Env* env,
     assert(thread_options_.affinities.size() >= size_t(threads_to_create));
   }
 
-  if (force_hybrid_) {
-    extended_eigen_hybrid_threadpool_ =
-        std::make_unique<ThreadPoolTempl<Env, true> >(name,
-                                                      threads_to_create,
-                                                      low_latency_hint,
-                                                      *env,
-                                                      thread_options_);
-    underlying_threadpool_ = extended_eigen_hybrid_threadpool_.get();
-  } else {
-    extended_eigen_normal_threadpool_ =
-        std::make_unique<ThreadPoolTempl<Env, false> >(name,
-                                                       threads_to_create,
-                                                       low_latency_hint,
-                                                       *env,
-                                                       thread_options_);
-    underlying_threadpool_ = extended_eigen_normal_threadpool_.get();
-  }
+  extended_eigen_threadpool_ =
+      std::make_unique<ThreadPoolTempl<Env> >(name,
+                                              threads_to_create,
+                                              low_latency_hint,
+                                              *env,
+                                              thread_options_);
+  underlying_threadpool_ = extended_eigen_threadpool_.get();
   }
 }
 
@@ -674,17 +664,15 @@ std::string ThreadPool::StopProfiling(concurrency::ThreadPool* tp) {
 }
 
 void ThreadPool::EnableSpinning() {
-  if (extended_eigen_hybrid_threadpool_)
-    extended_eigen_hybrid_threadpool_->EnableSpinning();
-  else if (extended_eigen_normal_threadpool_)
-    extended_eigen_normal_threadpool_->EnableSpinning();
+  if (extended_eigen_threadpool_) {
+    extended_eigen_threadpool_->EnableSpinning();
+  }
 }
 
 void ThreadPool::DisableSpinning() {
-  if (extended_eigen_hybrid_threadpool_)
-    extended_eigen_hybrid_threadpool_->DisableSpinning();
-  else if (extended_eigen_normal_threadpool_)
-    extended_eigen_normal_threadpool_->DisableSpinning();
+  if (extended_eigen_threadpool_) {
+    extended_eigen_threadpool_->DisableSpinning();
+  }
 }
 
 // Return the number of threads created by the pool.
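The .cc hunks restore the matching construction pattern: a single owned instantiation plus a non-owning `underlying_threadpool_` alias, with null-guarded delegation in the spin-control methods. A compact sketch of that ownership pattern, with placeholder types (`Interface`, `PoolImpl`, `DefaultEnv`) standing in for onnxruntime's; this is a sketch of the idiom, not the library's implementation:

#include <memory>

// Placeholder for ExtendedThreadPoolInterface.
struct Interface {
  virtual ~Interface() = default;
  virtual void EnableSpinning() = 0;
  virtual void DisableSpinning() = 0;
};

// Placeholder for ThreadPoolTempl<Env>.
template <typename Env>
struct PoolImpl : Interface {
  void EnableSpinning() override {}
  void DisableSpinning() override {}
};

struct DefaultEnv {};

class Pool {
 public:
  Pool() {
    // The pool owns exactly one concrete instantiation; the raw pointer
    // is a non-owning alias used by generic code paths (mirrors
    // underlying_threadpool_ = extended_eigen_threadpool_.get()).
    extended_eigen_threadpool_ = std::make_unique<PoolImpl<DefaultEnv>>();
    underlying_threadpool_ = extended_eigen_threadpool_.get();
  }

  void EnableSpinning() {
    // Null guard, as in the restored code: the member may be empty when
    // no underlying pool was created.
    if (extended_eigen_threadpool_) {
      extended_eigen_threadpool_->EnableSpinning();
    }
  }

 private:
  Interface* underlying_threadpool_ = nullptr;
  std::unique_ptr<PoolImpl<DefaultEnv>> extended_eigen_threadpool_;
};

int main() {
  Pool p;
  p.EnableSpinning();
  return 0;
}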
