Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions conf/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ topology.max.replication.wait.time.sec: 60
nimbus.credential.renewers.freq.secs: 600
nimbus.queue.size: 100000
scheduler.display.resource: false
nimbus.even.rebalance.idle.supervisor.enabled: false
nimbus.even.rebalance.max.free.per.topology: 0
nimbus.even.rebalance.idle.supervisor.min.stable.rounds: 3
nimbus.local.assignments.backend.class: "org.apache.storm.assignments.InMemoryAssignmentBackend"
nimbus.assignments.service.threads: 10
nimbus.assignments.service.thread.queue.size: 100
Expand Down
31 changes: 31 additions & 0 deletions storm-server/src/main/java/org/apache/storm/DaemonConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,37 @@ public class DaemonConfig implements Validated {
@IsBoolean
public static final String SCHEDULER_DISPLAY_RESOURCE = "scheduler.display.resource";

/**
 * If true, {@link org.apache.storm.scheduler.EvenScheduler} may move already-assigned workers onto non-blacklisted supervisors
 * with no slot in use. This lets a freshly returned supervisor pick up workers instead of staying idle. The number of workers
 * freed per topology in a single scheduling round is capped by {@link #NIMBUS_EVEN_REBALANCE_MAX_FREE_PER_TOPOLOGY}, so even
 * distribution is approached gradually rather than rebuilt from scratch.
 *
 * <p>Defaults to {@code false}: {@code conf/defaults.yaml} ships the feature disabled, and
 * {@code Cluster#hasIdleSupervisorReusableBy} falls back to {@code false} when the key is absent from the configuration.
 */
@IsBoolean
public static final String NIMBUS_EVEN_REBALANCE_ON_IDLE_SUPERVISOR_ENABLED
= "nimbus.even.rebalance.idle.supervisor.enabled";

/**
 * Optional upper bound on the number of currently-assigned workers a single topology may release in one scheduling round
 * when the idle-supervisor rebalance defined by {@link #NIMBUS_EVEN_REBALANCE_ON_IDLE_SUPERVISOR_ENABLED} kicks in. The
 * default budget already targets an even per-supervisor distribution (idle supervisors absorb roughly {@code numWorkers /
 * numSupervisors} workers each in one round), capped by the idle side's free slot capacity. Setting this to a positive
 * value tightens that budget; setting it to {@code 0} or a negative value applies no additional cap on top of the
 * even-distribution budget.
 *
 * <p>{@code conf/defaults.yaml} ships this as {@code 0}, i.e. no additional cap.
 */
@IsInteger
public static final String NIMBUS_EVEN_REBALANCE_MAX_FREE_PER_TOPOLOGY
= "nimbus.even.rebalance.max.free.per.topology";

/**
 * Minimum number of consecutive supervisor monitor rounds that a fully-idle supervisor must have been alive before
 * {@link org.apache.storm.scheduler.EvenScheduler} can relocate workers onto it. A positive value avoids moving workers onto a
 * supervisor that has only just returned and may still be flapping. Setting this to {@code 0} or a negative value disables the
 * uptime guard.
 *
 * <p>The guard is evaluated in {@code Cluster} as an uptime threshold rather than by counting rounds: the supervisor's
 * reported uptime in seconds must be at least this value multiplied by {@link #SUPERVISOR_MONITOR_FREQUENCY_SECS}
 * (treated as at least {@code 1}). Nimbus passes an uptime of {@code 0} for a supervisor whose uptime field is not set,
 * so such a supervisor does not pass a positive threshold.
 *
 * <p>Defaults to {@code 3}, both in {@code conf/defaults.yaml} and as the fallback when the key is absent.
 */
@IsInteger
public static final String NIMBUS_EVEN_REBALANCE_IDLE_SUPERVISOR_MIN_STABLE_ROUNDS
= "nimbus.even.rebalance.idle.supervisor.min.stable.rounds";

/**
* The directory where storm's health scripts go.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -985,11 +985,16 @@ private static Map<String, SupervisorDetails> basicSupervisorDetailsMap(IStormCl
String id = entry.getKey();
SupervisorInfo info = entry.getValue();
ret.put(id, new SupervisorDetails(id, info.get_server_port(), info.get_hostname(),
info.get_scheduler_meta(), null, info.get_resources_map()));
info.get_scheduler_meta(), null, info.get_resources_map(),
supervisorUptimeSecs(info)));
}
return ret;
}

/**
 * Reads the uptime carried by a supervisor info record, tolerating records where the field was never set.
 *
 * @param info supervisor info record to read from
 * @return the recorded uptime when the field is set, otherwise {@code 0}
 */
private static long supervisorUptimeSecs(SupervisorInfo info) {
    if (!info.is_set_uptime_secs()) {
        // No uptime recorded: report zero rather than reading an unset field.
        return 0L;
    }
    return info.get_uptime_secs();
}

/**
* NOTE: this can return false when a topology has just been activated. The topology may still be
* in the STORMS_SUBTREE.
Expand Down Expand Up @@ -2273,7 +2278,8 @@ private Map<String, SupervisorDetails> readAllSupervisorDetails(Map<String, Set<
List<SupervisorDetails> superDetails = new ArrayList<>();
for (Entry<String, SupervisorInfo> entry : superInfos.entrySet()) {
SupervisorInfo info = entry.getValue();
superDetails.add(new SupervisorDetails(entry.getKey(), info.get_meta(), info.get_resources_map()));
superDetails.add(new SupervisorDetails(entry.getKey(), info.get_meta(), info.get_resources_map(),
supervisorUptimeSecs(info)));
}
// Note that allSlotsAvailableForScheduling
// only uses the supervisor-details. The rest of the arguments
Expand Down Expand Up @@ -2306,7 +2312,7 @@ private Map<String, SupervisorDetails> readAllSupervisorDetails(Map<String, Set<
allPorts.removeAll(deadPorts);
}
ret.put(superId, new SupervisorDetails(superId, hostname, info.get_scheduler_meta(),
allPorts, info.get_resources_map()));
allPorts, info.get_resources_map(), supervisorUptimeSecs(info)));
}
return ret;
}
Expand Down Expand Up @@ -5526,4 +5532,3 @@ public void run() {
}
}
}

58 changes: 58 additions & 0 deletions storm-server/src/main/java/org/apache/storm/scheduler/Cluster.java
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,64 @@ public boolean needsScheduling(TopologyDetails topology) {
return desiredNumWorkers > assignedNumWorkers || getUnassignedExecutors(topology).size() > 0;
}

/**
 * Reports whether some supervisor could take over workers of the given topology as part of the idle-supervisor rebalance.
 *
 * <p>The answer is always {@code false} unless {@link DaemonConfig#NIMBUS_EVEN_REBALANCE_ON_IDLE_SUPERVISOR_ENABLED} is
 * turned on. When it is, a supervisor counts if it passes {@link #isIdleSupervisorAvailableForEvenRebalance} (not
 * blacklisted, has ports, none of them used, and up long enough) and the topology has no slot on it yet. The test is
 * deliberately all-or-nothing per supervisor -- zero used slots or not -- so an "almost balanced" cluster never triggers it.
 * Topologies that would gain nothing from a move (e.g. a single assigned worker) are weeded out later by the drain-budget
 * computation in {@link EvenScheduler}, which is zero whenever {@code floor(numWorkers / nonBlacklistedSupervisorCount)}
 * is zero.
 *
 * @param topology the topology whose current slot usage is compared against the idle supervisors
 * @return {@code true} if at least one qualifying idle supervisor is not already used by the topology
 */
public boolean hasIdleSupervisorReusableBy(TopologyDetails topology) {
    Object enabledSetting = conf.get(DaemonConfig.NIMBUS_EVEN_REBALANCE_ON_IDLE_SUPERVISOR_ENABLED);
    boolean rebalanceEnabled = ObjectReader.getBoolean(enabledSetting, false);
    if (!rebalanceEnabled) {
        return false;
    }

    // Collect the ids of every node this topology already occupies.
    Set<String> occupiedNodeIds = new HashSet<>();
    for (WorkerSlot usedSlot : getUsedSlotsByTopologyId(topology.getId())) {
        occupiedNodeIds.add(usedSlot.getNodeId());
    }

    for (SupervisorDetails candidate : supervisors.values()) {
        String candidateId = candidate.getId();
        if (isIdleSupervisorAvailableForEvenRebalance(candidate) && !occupiedNodeIds.contains(candidateId)) {
            return true;
        }
    }
    return false;
}

/**
 * Decides whether a supervisor qualifies as an idle target for the even rebalance.
 *
 * <p>A supervisor qualifies only when it is non-null, not blacklisted, exposes at least one port, has no port in use,
 * and satisfies the minimum-stability check. The conditions are evaluated in that order and stop at the first failure.
 *
 * @param supervisor the supervisor to inspect; {@code null} is accepted and yields {@code false}
 * @return {@code true} if every condition above holds
 */
public boolean isIdleSupervisorAvailableForEvenRebalance(SupervisorDetails supervisor) {
    boolean fullyIdleAndUsable = supervisor != null
        && !isBlackListed(supervisor.getId())
        && !supervisor.getAllPorts().isEmpty()
        && getUsedPorts(supervisor).isEmpty();
    return fullyIdleAndUsable && hasMinimumIdleSupervisorStability(supervisor);
}

/**
 * Checks the uptime guard configured through {@link DaemonConfig#NIMBUS_EVEN_REBALANCE_IDLE_SUPERVISOR_MIN_STABLE_ROUNDS}.
 *
 * <p>The configured round count (fallback {@code 3}) is converted into seconds by multiplying it with
 * {@link DaemonConfig#SUPERVISOR_MONITOR_FREQUENCY_SECS} (fallback {@code 3}, floored at {@code 1}). A round count of
 * zero or less switches the guard off.
 *
 * @param supervisor the supervisor whose uptime is compared against the threshold
 * @return {@code true} if the guard is disabled or the supervisor's uptime reaches the threshold
 */
private boolean hasMinimumIdleSupervisorStability(SupervisorDetails supervisor) {
    int stableRounds = ObjectReader.getInt(
        conf.get(DaemonConfig.NIMBUS_EVEN_REBALANCE_IDLE_SUPERVISOR_MIN_STABLE_ROUNDS), 3);
    if (stableRounds > 0) {
        int monitorPeriodSecs = ObjectReader.getInt(conf.get(DaemonConfig.SUPERVISOR_MONITOR_FREQUENCY_SECS), 3);
        int effectivePeriodSecs = Math.max(1, monitorPeriodSecs);
        // Widen before multiplying so large settings cannot overflow int arithmetic.
        long uptimeThresholdSecs = (long) stableRounds * effectivePeriodSecs;
        return supervisor.getUptimeSecs() >= uptimeThresholdSecs;
    }
    // Guard disabled: every supervisor counts as stable.
    return true;
}

@Override
public boolean needsSchedulingRas(TopologyDetails topology) {
return getUnassignedExecutors(topology).size() > 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,16 @@ public static Set<WorkerSlot> slotsCanReassign(Cluster cluster, Set<WorkerSlot>
}

public static void defaultSchedule(Topologies topologies, Cluster cluster) {
EvenScheduler.redistributeOntoIdleSupervisors(topologies, cluster);
for (TopologyDetails topology : cluster.needsSchedulingTopologies()) {
// needsSchedulingTopologies() returns the cluster's full topology set, but this run is scoped to the
// topologies passed in: DefaultScheduler.schedule passes the full set (so the guard is a no-op), while
// IsolationScheduler delegates only its leftover, non-isolated topologies here. redistributeOntoIdleSupervisors
// above acted only on that passed-in set too. Skip topologies outside it so the leftover path never schedules
// one the caller excluded -- e.g. a down isolated topology on a reserved host.
if (topologies.getById(topology.getId()) == null) {
continue;
}
List<WorkerSlot> availableSlots = cluster.getAvailableSlots();
Set<ExecutorDetails> allExecutors = topology.getExecutors();

Expand Down
Loading