|
| 1 | +sched/fair: Fix CPU bandwidth limit bypass during CPU hotplug |
| 2 | + |
| 3 | +jira LE-3255 |
| 4 | +Rebuild_History Non-Buildable kernel-4.18.0-553.56.1.el8_10 |
| 5 | +commit-author Vishal Chourasia < [email protected]> |
| 6 | +commit af98d8a36a963e758e84266d152b92c7b51d4ecb |
| 7 | +Empty-Commit: Cherry-Pick Conflicts during history rebuild. |
| 8 | +Will be included in final tarball splat. Ref for failed cherry-pick at: |
| 9 | +ciq/ciq_backports/kernel-4.18.0-553.56.1.el8_10/af98d8a3.failed |
| 10 | + |
| 11 | +CPU controller limits are not properly enforced during CPU hotplug |
| 12 | +operations, particularly during CPU offline. When a CPU goes offline, |
| 13 | +throttled processes are unintentionally being unthrottled across all CPUs |
| 14 | +in the system, allowing them to exceed their assigned quota limits. |
| 15 | + |
| 16 | +Consider below for an example, |
| 17 | + |
| 18 | +Assigning 6.25% bandwidth limit to a cgroup |
| 19 | +in a 8 CPU system, where, workload is running 8 threads for 20 seconds at |
| 20 | +100% CPU utilization, expected (user+sys) time = 10 seconds. |
| 21 | + |
| 22 | +$ cat /sys/fs/cgroup/test/cpu.max |
| 23 | +50000 100000 |
| 24 | + |
| 25 | +$ ./ebizzy -t 8 -S 20 // non-hotplug case |
| 26 | +real 20.00 s |
| 27 | +user 10.81 s // intended behaviour |
| 28 | +sys 0.00 s |
| 29 | + |
| 30 | +$ ./ebizzy -t 8 -S 20 // hotplug case |
| 31 | +real 20.00 s |
| 32 | +user 14.43 s // Workload is able to run for 14 secs |
| 33 | +sys 0.00 s // when it should have only run for 10 secs |
| 34 | + |
| 35 | +During CPU hotplug, scheduler domains are rebuilt and cpu_attach_domain |
| 36 | +is called for every active CPU to update the root domain. That ends up |
| 37 | +calling rq_offline_fair which un-throttles any throttled hierarchies. |
| 38 | + |
| 39 | +Unthrottling should only occur for the CPU being hotplugged to allow its |
| 40 | +throttled processes to become runnable and get migrated to other CPUs. |
| 41 | + |
| 42 | +With current patch applied, |
| 43 | +$ ./ebizzy -t 8 -S 20 // hotplug case |
| 44 | +real 21.00 s |
| 45 | +user 10.16 s // intended behaviour |
| 46 | +sys 0.00 s |
| 47 | + |
| 48 | +This also has another symptom, when a CPU goes offline, and if the cfs_rq |
| 49 | +is not in throttled state and the runtime_remaining still had plenty |
| 50 | +remaining, it gets reset to 1 here, causing the runtime_remaining of |
| 51 | +cfs_rq to be quickly depleted. |
| 52 | + |
| 53 | +Note: hotplug operation (online, offline) was performed in while(1) loop |
| 54 | + |
| 55 | +v3: https://lore.kernel.org/all/ [email protected] |
| 56 | +v2: https://lore.kernel.org/all/ [email protected] |
| 57 | +v1: https://lore.kernel.org/all/ [email protected] |
| 58 | + Suggested-by: Zhang Qiao < [email protected]> |
| 59 | + Signed-off-by: Vishal Chourasia < [email protected]> |
| 60 | + Signed-off-by: Peter Zijlstra (Intel) < [email protected]> |
| 61 | + Acked-by: Vincent Guittot < [email protected]> |
| 62 | + Tested-by: Madadi Vineeth Reddy < [email protected]> |
| 63 | + Tested-by: Samir Mulani < [email protected]> |
| 64 | +Link: https://lore.kernel.org/r/ [email protected] |
| 65 | +(cherry picked from commit af98d8a36a963e758e84266d152b92c7b51d4ecb) |
| 66 | + Signed-off-by: Jonathan Maple < [email protected]> |
| 67 | + |
| 68 | +# Conflicts: |
| 69 | +# kernel/sched/fair.c |
| 70 | +diff --cc kernel/sched/fair.c |
| 71 | +index b6174edca38c,8f641c9e74a8..000000000000 |
| 72 | +--- a/kernel/sched/fair.c |
| 73 | ++++ b/kernel/sched/fair.c |
| 74 | +@@@ -5512,7 -6694,18 +5512,22 @@@ static void __maybe_unused unthrottle_o |
| 75 | + { |
| 76 | + struct task_group *tg; |
| 77 | + |
| 78 | +++<<<<<<< HEAD |
| 79 | + + lockdep_assert_held(&rq->lock); |
| 80 | +++======= |
| 81 | ++ lockdep_assert_rq_held(rq); |
| 82 | ++ |
| 83 | ++ // Do not unthrottle for an active CPU |
| 84 | ++ if (cpumask_test_cpu(cpu_of(rq), cpu_active_mask)) |
| 85 | ++ return; |
| 86 | ++ |
| 87 | ++ /* |
| 88 | ++ * The rq clock has already been updated in the |
| 89 | ++ * set_rq_offline(), so we should skip updating |
| 90 | ++ * the rq clock again in unthrottle_cfs_rq(). |
| 91 | ++ */ |
| 92 | ++ rq_clock_start_loop_update(rq); |
| 93 | +++>>>>>>> af98d8a36a96 (sched/fair: Fix CPU bandwidth limit bypass during CPU hotplug) |
| 94 | + |
| 95 | + rcu_read_lock(); |
| 96 | + list_for_each_entry_rcu(tg, &task_groups, list) { |
| 97 | +@@@ -5532,10 -6720,19 +5542,17 @@@ |
| 98 | + */ |
| 99 | + cfs_rq->runtime_enabled = 0; |
| 100 | + |
| 101 | +- if (cfs_rq_throttled(cfs_rq)) |
| 102 | +- unthrottle_cfs_rq(cfs_rq); |
| 103 | ++ if (!cfs_rq_throttled(cfs_rq)) |
| 104 | ++ continue; |
| 105 | ++ |
| 106 | ++ /* |
| 107 | ++ * clock_task is not advancing so we just need to make sure |
| 108 | ++ * there's some valid quota amount |
| 109 | ++ */ |
| 110 | ++ cfs_rq->runtime_remaining = 1; |
| 111 | ++ unthrottle_cfs_rq(cfs_rq); |
| 112 | + } |
| 113 | + rcu_read_unlock(); |
| 114 | + - |
| 115 | + - rq_clock_stop_loop_update(rq); |
| 116 | + } |
| 117 | + |
| 118 | + bool cfs_task_bw_constrained(struct task_struct *p) |
| 119 | +* Unmerged path kernel/sched/fair.c |
0 commit comments