apache · mwkang · May 7, 2026 · Jun 5, 2026
diff --git a/conf/defaults.yaml b/conf/defaults.yaml
@@ -88,6 +88,9 @@ topology.max.replication.wait.time.sec: 60
 nimbus.credential.renewers.freq.secs: 600
 nimbus.queue.size: 100000
 scheduler.display.resource: false
+nimbus.even.rebalance.idle.supervisor.enabled: false
+nimbus.even.rebalance.max.free.per.topology: 0
+nimbus.even.rebalance.idle.supervisor.min.stable.rounds: 3
 nimbus.local.assignments.backend.class: "org.apache.storm.assignments.InMemoryAssignmentBackend"
 nimbus.assignments.service.threads: 10
 nimbus.assignments.service.thread.queue.size: 100

diff --git a/storm-server/src/main/java/org/apache/storm/DaemonConfig.java b/storm-server/src/main/java/org/apache/storm/DaemonConfig.java
@@ -175,6 +175,37 @@ public class DaemonConfig implements Validated {
     @IsBoolean
     public static final String SCHEDULER_DISPLAY_RESOURCE = "scheduler.display.resource";
 
+    /**
+     * If true, {@link org.apache.storm.scheduler.EvenScheduler} may move already-assigned workers onto non-blacklisted supervisors
+     * with no slot in use. This lets a freshly returned supervisor pick up workers instead of staying idle. The number of workers
+     * freed per topology in a single scheduling round is capped by {@link #NIMBUS_EVEN_REBALANCE_MAX_FREE_PER_TOPOLOGY}, so even
+     * distribution is approached gradually rather than rebuilt from scratch.
+     */
+    @IsBoolean
+    public static final String NIMBUS_EVEN_REBALANCE_ON_IDLE_SUPERVISOR_ENABLED
+            = "nimbus.even.rebalance.idle.supervisor.enabled";
+
+    /**
+     * Optional upper bound on the number of currently-assigned workers a single topology may release in one scheduling round
+     * when the idle-supervisor rebalance defined by {@link #NIMBUS_EVEN_REBALANCE_ON_IDLE_SUPERVISOR_ENABLED} kicks in. The
+     * default budget already targets an even per-supervisor distribution (idle supervisors absorb roughly {@code numWorkers /
+     * numSupervisors} workers each in one round), capped by the idle side's free slot capacity. Setting this to a positive
+     * value tightens that budget; setting it to {@code 0} or a negative value applies no extra cap on top of that budget.
+     */
+    @IsInteger
+    public static final String NIMBUS_EVEN_REBALANCE_MAX_FREE_PER_TOPOLOGY
+            = "nimbus.even.rebalance.max.free.per.topology";
+
+    /**
+     * Minimum uptime, expressed as a number of supervisor monitor rounds, that a fully-idle supervisor must report before
+     * {@link org.apache.storm.scheduler.EvenScheduler} can relocate workers onto it. The required uptime in seconds is this value
+     * times {@link #SUPERVISOR_MONITOR_FREQUENCY_SECS}. A positive value avoids moving workers onto a supervisor that has only just
+     * returned and may still be flapping. Setting this to {@code 0} or a negative value disables the uptime guard.
+     */
+    @IsInteger
+    public static final String NIMBUS_EVEN_REBALANCE_IDLE_SUPERVISOR_MIN_STABLE_ROUNDS
+            = "nimbus.even.rebalance.idle.supervisor.min.stable.rounds";
+
     /**
      * The directory where storm's health scripts go.
      */

diff --git a/storm-server/src/main/java/org/apache/storm/daemon/nimbus/Nimbus.java b/storm-server/src/main/java/org/apache/storm/daemon/nimbus/Nimbus.java
@@ -985,11 +985,16 @@ private static Map<String, SupervisorDetails> basicSupervisorDetailsMap(IStormCl
             String id = entry.getKey();
             SupervisorInfo info = entry.getValue();
             ret.put(id, new SupervisorDetails(id, info.get_server_port(), info.get_hostname(),
-                                              info.get_scheduler_meta(), null, info.get_resources_map()));
+                                              info.get_scheduler_meta(), null, info.get_resources_map(),
+                                              supervisorUptimeSecs(info)));
         }
         return ret;
     }
 
+    private static long supervisorUptimeSecs(SupervisorInfo info) {
+        return !info.is_set_uptime_secs() ? 0L : info.get_uptime_secs(); // an unreported uptime counts as zero seconds
+    }
+
     /**
      * NOTE: this can return false when a topology has just been activated.  The topology may still be
      * in the STORMS_SUBTREE.
@@ -2273,7 +2278,8 @@ private Map<String, SupervisorDetails> readAllSupervisorDetails(Map<String, Set<
         List<SupervisorDetails> superDetails = new ArrayList<>();
         for (Entry<String, SupervisorInfo> entry : superInfos.entrySet()) {
             SupervisorInfo info = entry.getValue();
-            superDetails.add(new SupervisorDetails(entry.getKey(), info.get_meta(), info.get_resources_map()));
+            superDetails.add(new SupervisorDetails(entry.getKey(), info.get_meta(), info.get_resources_map(),
+                                                   supervisorUptimeSecs(info)));
         }
         // Note that allSlotsAvailableForScheduling
         // only uses the supervisor-details. The rest of the arguments
@@ -2306,7 +2312,7 @@ private Map<String, SupervisorDetails> readAllSupervisorDetails(Map<String, Set<
                 allPorts.removeAll(deadPorts);
             }
             ret.put(superId, new SupervisorDetails(superId, hostname, info.get_scheduler_meta(),
-                                                   allPorts, info.get_resources_map()));
+                                                   allPorts, info.get_resources_map(), supervisorUptimeSecs(info)));
         }
         return ret;
     }
@@ -5526,4 +5532,3 @@ public void run() {
         }
     }
 }
-
diff --git a/storm-server/src/main/java/org/apache/storm/scheduler/Cluster.java b/storm-server/src/main/java/org/apache/storm/scheduler/Cluster.java
@@ -363,6 +363,64 @@ public boolean needsScheduling(TopologyDetails topology) {
         return desiredNumWorkers > assignedNumWorkers || getUnassignedExecutors(topology).size() > 0;
     }
 
+    /**
+     * Tells whether the idle-supervisor rebalance has somewhere to move this topology's workers: at least one stable,
+     * non-blacklisted supervisor whose slots are all free and which hosts none of the topology's workers yet. Gated by
+     * {@link DaemonConfig#NIMBUS_EVEN_REBALANCE_ON_IDLE_SUPERVISOR_ENABLED}; always false when that flag is off.
+     *
+     * <p>The check is binary by design -- a supervisor either has zero used slots or it does not -- so it never fires for
+     * "almost balanced" clusters. Topologies that cannot benefit from a move (e.g. only a single worker assigned) are
+     * filtered later by the drain-budget computation in {@link EvenScheduler}, which evaluates to zero whenever
+     * {@code floor(numWorkers / nonBlacklistedSupervisorCount)} is zero.
+     *
+     * @param topology the topology whose currently used slots are compared against the idle supervisors
+     * @return true if an eligible idle supervisor exists that the topology does not already occupy
+     */
+    public boolean hasIdleSupervisorReusableBy(TopologyDetails topology) {
+        boolean enabled = ObjectReader.getBoolean(conf.get(DaemonConfig.NIMBUS_EVEN_REBALANCE_ON_IDLE_SUPERVISOR_ENABLED), false);
+        if (!enabled) {
+            return false;
+        }
+        Set<String> occupiedNodeIds = new HashSet<>();
+        for (WorkerSlot usedSlot : getUsedSlotsByTopologyId(topology.getId())) {
+            occupiedNodeIds.add(usedSlot.getNodeId());
+        }
+        for (SupervisorDetails candidate : supervisors.values()) {
+            String candidateId = candidate.getId();
+            if (isIdleSupervisorAvailableForEvenRebalance(candidate) && !occupiedNodeIds.contains(candidateId)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Decides whether a supervisor is a valid target for the idle-supervisor even rebalance: it must be non-null, not
+     * blacklisted, expose at least one port, have no port in use, and pass the minimum-stability (uptime) guard.
+     *
+     * @param supervisor the supervisor to examine; {@code null} yields false
+     * @return true if every one of the conditions above holds
+     */
+    public boolean isIdleSupervisorAvailableForEvenRebalance(SupervisorDetails supervisor) {
+        if (supervisor == null || isBlackListed(supervisor.getId())) {
+            return false;
+        }
+        boolean hasPorts = !supervisor.getAllPorts().isEmpty();
+        boolean noneUsed = hasPorts && getUsedPorts(supervisor).isEmpty();
+        return noneUsed && hasMinimumIdleSupervisorStability(supervisor);
+    }
+
+    /** Uptime guard: the supervisor must have been up for the configured number of monitor rounds (each at least 1s). */
+    private boolean hasMinimumIdleSupervisorStability(SupervisorDetails supervisor) {
+        int requiredRounds = ObjectReader.getInt(
+                conf.get(DaemonConfig.NIMBUS_EVEN_REBALANCE_IDLE_SUPERVISOR_MIN_STABLE_ROUNDS), 3);
+        if (requiredRounds <= 0) {
+            return true;
+        }
+        int roundSecs = Math.max(1, ObjectReader.getInt(conf.get(DaemonConfig.SUPERVISOR_MONITOR_FREQUENCY_SECS), 3));
+        return supervisor.getUptimeSecs() >= (long) requiredRounds * roundSecs;
+    }
+
     @Override
     public boolean needsSchedulingRas(TopologyDetails topology) {
         return getUnassignedExecutors(topology).size() > 0;

diff --git a/storm-server/src/main/java/org/apache/storm/scheduler/DefaultScheduler.java b/storm-server/src/main/java/org/apache/storm/scheduler/DefaultScheduler.java
@@ -72,7 +72,16 @@ public static Set<WorkerSlot> slotsCanReassign(Cluster cluster, Set<WorkerSlot>
     }
 
     public static void defaultSchedule(Topologies topologies, Cluster cluster) {
+        EvenScheduler.redistributeOntoIdleSupervisors(topologies, cluster);
         for (TopologyDetails topology : cluster.needsSchedulingTopologies()) {
+            // needsSchedulingTopologies() returns the cluster's full topology set, but this run is scoped to the
+            // topologies passed in: DefaultScheduler.schedule passes the full set (so the guard is a no-op), while
+            // IsolationScheduler delegates only its leftover, non-isolated topologies here. redistributeOntoIdleSupervisors
+            // above acted only on that passed-in set too. Skip topologies outside it so the leftover path never schedules
+            // one the caller excluded -- e.g. a down isolated topology on a reserved host.
+            if (topologies.getById(topology.getId()) == null) {
+                continue;
+            }
             List<WorkerSlot> availableSlots = cluster.getAvailableSlots();
             Set<ExecutorDetails> allExecutors = topology.getExecutors();