Skip to content

Commit d0c8f21

Browse files
committed
allow remote execution of queue and submit commands
1 parent a0887c9 commit d0c8f21

File tree

3 files changed

+28
-8
lines changed

3 files changed

+28
-8
lines changed

adaptive_scheduler/server_support.py

+18-5
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ async def manage_jobs(
155155
run_script="run_learner.py",
156156
python_executable=None,
157157
interval=30,
158+
remote=None,
158159
*,
159160
max_simultaneous_jobs=5000,
160161
max_fails_per_job=100,
@@ -164,7 +165,7 @@ async def manage_jobs(
164165
with concurrent.futures.ProcessPoolExecutor() as ex:
165166
while True:
166167
try:
167-
running = queue()
168+
running = queue(remote=remote)
168169
_update_db(db_fname, running) # in case some jobs died
169170
queued = {j["name"] for j in running.values() if j["name"] in job_names}
170171
not_queued = set(job_names) - queued
@@ -236,6 +237,7 @@ def start_job_manager(
236237
run_script="run_learner.py",
237238
python_executable=None,
238239
interval=30,
240+
remote=None,
239241
*,
240242
max_simultaneous_jobs=5000,
241243
max_fails_per_job=40,
@@ -250,6 +252,7 @@ def start_job_manager(
250252
run_script,
251253
python_executable,
252254
interval,
255+
remote,
253256
max_simultaneous_jobs=max_simultaneous_jobs,
254257
max_fails_per_job=max_fails_per_job,
255258
)
@@ -356,6 +359,7 @@ async def manage_killer(
356359
interval: int = 600,
357360
max_cancel_tries: int = 5,
358361
move_to: Optional[str] = None,
362+
remote: Optional[str] = None,
359363
) -> Coroutine:
360364
# It seems like tasks that print the error message do not always stop working
361365
# I think it only stops working when the error happens on a node where the logger runs.
@@ -371,7 +375,7 @@ async def manage_killer(
371375
to_delete = []
372376

373377
# cancel/delete only the processes/logs that are running now
374-
for job_id, info in queue().items():
378+
for job_id, info in queue(remote=remote).items():
375379
job_name = info["name"]
376380
if job_id in failed_jobs.get(job_name, []):
377381
to_cancel.append(job_name)
@@ -427,9 +431,10 @@ def start_kill_manager(
427431
interval: int = 600,
428432
max_cancel_tries: int = 5,
429433
move_to: Optional[str] = None,
434+
remote: Optional[str] = None,
430435
) -> asyncio.Task:
431436
ioloop = asyncio.get_event_loop()
432-
coro = manage_killer(job_names, error, interval, max_cancel_tries, move_to)
437+
coro = manage_killer(job_names, error, interval, max_cancel_tries, move_to, remote)
433438
return ioloop.create_task(coro)
434439

435440

@@ -667,6 +672,7 @@ def __init__(
667672
overwrite_db: bool = True,
668673
start_job_manager_kwargs: Optional[dict] = None,
669674
start_kill_manager_kwargs: Optional[dict] = None,
675+
remote: Optional[str] = None,
670676
):
671677
# Set from arguments
672678
self.run_script = run_script
@@ -688,6 +694,7 @@ def __init__(
688694
self.overwrite_db = overwrite_db
689695
self.start_job_manager_kwargs = start_job_manager_kwargs or {}
690696
self.start_kill_manager_kwargs = start_kill_manager_kwargs or {}
697+
self.remote = remote
691698

692699
# Set in methods
693700
self.job_task = None
@@ -794,6 +801,7 @@ def _start_job_manager(self) -> None:
794801
interval=self.job_manager_interval,
795802
run_script=self.run_script,
796803
job_script_function=self.job_script_function,
804+
remote=self.remote,
797805
**self.start_job_manager_kwargs,
798806
)
799807

@@ -808,6 +816,7 @@ def _start_kill_manager(self) -> None:
808816
error=self.kill_on_error,
809817
interval=self.kill_interval,
810818
move_to=self.move_old_logs_to,
819+
remote=self.remote,
811820
**self.start_kill_manager_kwargs,
812821
)
813822

@@ -818,7 +827,7 @@ def cancel(self):
818827
self.database_task.cancel()
819828
if self.kill_task is not None:
820829
self.kill_task.cancel()
821-
return cancel(self.job_names)
830+
return cancel(self.job_names, remote=self.remote)
822831

823832
def cleanup(self):
824833
"""Cleanup the log and batch files.
@@ -953,7 +962,11 @@ def cleanup(_):
953962
)
954963

955964
def _info_html(self):
956-
jobs = [job for job in queue().values() if job["name"] in self.job_names]
965+
jobs = [
966+
job
967+
for job in queue(remote=self.remote).values()
968+
if job["name"] in self.job_names
969+
]
957970
n_running = sum(job["state"] in ("RUNNING", "R") for job in jobs)
958971
n_pending = sum(job["state"] in ("PENDING", "Q") for job in jobs)
959972
n_done = sum(job["is_done"] for job in self.get_database())

adaptive_scheduler/slurm.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -133,13 +133,15 @@ def make_job_script(
133133
return job_script
134134

135135

136-
def queue(me_only=True):
136+
def queue(me_only=True, remote=None):
137137
"""Get the current running and pending jobs.
138138
139139
Parameters
140140
----------
141141
me_only : bool, default: True
142142
Only see your jobs.
143+
remote : str, optional
144+
Remote hostname, to run over ssh.
143145
144146
Returns
145147
-------
@@ -164,6 +166,8 @@ def queue(me_only=True):
164166
if me_only:
165167
username = getpass.getuser()
166168
cmd.append(f"--user={username}")
169+
if remote is not None:
170+
cmd = ["ssh", remote] + cmd
167171
proc = subprocess.run(cmd, text=True, capture_output=True)
168172
output = proc.stdout
169173

adaptive_scheduler/utils.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,10 @@ def _progress(seq: Sequence, with_progress_bar: bool = True, desc: str = ""):
8181

8282
def _cancel_function(cancel_cmd: str, queue_function: Callable) -> Callable:
8383
def cancel(
84-
job_names: List[str], with_progress_bar: bool = True, max_tries: int = 5
84+
job_names: List[str],
85+
with_progress_bar: bool = True,
86+
max_tries: int = 5,
87+
remote: Optional[str] = None,
8588
) -> Callable:
8689
"""Cancel all jobs in `job_names`.
8790
@@ -98,7 +101,7 @@ def cancel(
98101
def to_cancel(job_names):
99102
return [
100103
job_id
101-
for job_id, info in queue_function().items()
104+
for job_id, info in queue_function(remote=remote).items()
102105
if info["name"] in job_names
103106
]
104107

0 commit comments

Comments (0)