Skip to content

Commit e385ee3

Browse files
committed
#7 Reset idle timeout when other nodes are being provisioned
1 parent 63209a6 commit e385ee3

File tree

1 file changed

+18
-8
lines changed

1 file changed

+18
-8
lines changed

hpc_cluster_manager.py

+18-8
Original file line numberDiff line numberDiff line change
@@ -305,16 +305,26 @@ def _check_runaway_and_idle_compute_nodes(self):
305305
self.logger.warn("Missing nodes in running state:{}".format(missing_nodes))
306306
self._set_nodes_closed(missing_nodes)
307307

308-
# Update running node names to remove closed nodes
309-
running_node_names = [name for name in running_node_names if
310-
(name not in unapproved_nodes and name not in missing_nodes)]
311-
idle_nodes = self._hpc_client.check_nodes_idle(running_node_names)
312-
self.logger.info("Get idle_nodes:{}".format(str(idle_nodes)))
313-
idle_timeout_nodes = self._check_node_idle_timeout([node.node_name for node in idle_nodes])
314-
self.logger.info("Get idle_timeout_nodes:{}".format(str(idle_timeout_nodes)))
315-
# If there is still node growing, we won't shrink at the same time
316308
if self.get_cores_in_provisioning() <= 0.0:
309+
# Update running node names to remove closed nodes
310+
running_node_names = [name for name in running_node_names if
311+
(name not in unapproved_nodes and name not in missing_nodes)]
312+
idle_nodes = self._hpc_client.check_nodes_idle(running_node_names)
313+
self.logger.info("Get idle_nodes:{}".format(str(idle_nodes)))
314+
idle_timeout_nodes = self._check_node_idle_timeout([node.node_name for node in idle_nodes])
315+
self.logger.info("Get idle_timeout_nodes:{}".format(str(idle_timeout_nodes)))
317316
self._set_nodes_draining(idle_timeout_nodes)
317+
else:
318+
# If there is still node growing, we won't shrink at the same time
319+
self._reset_node_idle_check()
320+
self.logger.info(
321+
"Reset node idle timeout as there are nodes being provisioning. Cores in provisioning: {}".format(
322+
self.get_cores_in_provisioning()))
323+
324+
def _reset_node_idle_check(self):
325+
# type: () -> None
# Discard all idle-tracking state by emptying both `_node_idle_check_table`
# and `_removed_nodes` in place. `_check_runaway_and_idle_compute_nodes`
# calls this while cores are still being provisioned, the branch in which
# it does not drain idle-timeout nodes.
326+
# NOTE(review): presumably this table records when each node was first seen idle; confirm against `_check_node_idle_timeout`.
self._node_idle_check_table.clear()
327+
# NOTE(review): the role of `_removed_nodes` is not visible here; confirm it is safe to forget it on every reset.
self._removed_nodes.clear()
318328

319329
def _check_node_idle_timeout(self, node_names, now=None):
320330
# type: (Iterable[str], datetime) -> [str]

0 commit comments

Comments
 (0)