Skip to content

Commit 447323c

Browse files
Use suggested idling interval returned by the workflows api backend (#12)
* Idle for duration suggested by workflows API backend * Make lease extension a debug log message * Prepare release v0.41
1 parent f54c238 commit 447323c

File tree

12 files changed

+216
-122
lines changed

12 files changed

+216
-122
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ repos:
66
- id: end-of-file-fixer
77
- repo: https://github.com/charliermarsh/ruff-pre-commit
88
# keep the version here in sync with the version in uv.lock
9-
rev: "v0.12.2"
9+
rev: "v0.12.7"
1010
hooks:
1111
- id: ruff-check
1212
args: [--fix, --exit-non-zero-on-fix]

CHANGELOG.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
## [0.41.0] - 2025-08-01
11+
12+
### Added
13+
14+
- `tilebox-workflows`: Task runners now support receiving a suggested idling duration from the workflows API
15+
16+
### Fixed
17+
18+
- `tilebox-workflows`: Change task lease extension logging message to `DEBUG` level
19+
1020
## [0.40.0] - 2025-07-29
1121

1222
### Added
@@ -223,7 +233,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
223233
- Released packages: `tilebox-datasets`, `tilebox-workflows`, `tilebox-storage`, `tilebox-grpc`
224234

225235

226-
[Unreleased]: https://github.com/tilebox/tilebox-python/compare/v0.40.0...HEAD
236+
[Unreleased]: https://github.com/tilebox/tilebox-python/compare/v0.41.0...HEAD
237+
[0.41.0]: https://github.com/tilebox/tilebox-python/compare/v0.40.0...v0.41.0
227238
[0.40.0]: https://github.com/tilebox/tilebox-python/compare/v0.39.0...v0.40.0
228239
[0.39.0]: https://github.com/tilebox/tilebox-python/compare/v0.38.0...v0.39.0
229240
[0.38.0]: https://github.com/tilebox/tilebox-python/compare/v0.37.1...v0.38.0
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:8b2943e452addd5eedf0549a582b97b01ca746133a0f8171baa184e7c4dc0edf
3-
size 9244
2+
oid sha256:193198a63389117759a7dd8166ef13cd6c69395e35577d36b281d75a46c7c475
3+
size 9328
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:375301ced6092e5ed6b1e40307fdaa14ddb34aa5637847750935e8d307a64050
3-
size 5489
2+
oid sha256:5a75c0c3867517e5ee1f5f9da1a4321b5ac71f4e6d2fdff41580714722941c35
3+
size 5609

tilebox-workflows/tests/tasks_data.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import json
66
import string
7+
from datetime import timedelta
78

89
from hypothesis.strategies import (
910
DrawFn,
@@ -26,6 +27,7 @@
2627
Cluster,
2728
ComputedTask,
2829
CronTrigger,
30+
Idling,
2931
Job,
3032
JobState,
3133
StorageEventTrigger,
@@ -71,6 +73,17 @@ def tasks(draw: DrawFn) -> Task:
7173
return Task(task_id, identifier, state, task_input, display, job, parent_id, depends_on, lease, retry_count)
7274

7375

76+
@composite
77+
def idling_responses(draw: DrawFn) -> Idling:
78+
"""A hypothesis strategy for generating random idling_responses"""
79+
return Idling(
80+
timedelta(
81+
seconds=draw(integers(min_value=0, max_value=60 * 60)),
82+
milliseconds=draw(integers(min_value=0, max_value=1000)),
83+
)
84+
)
85+
86+
7487
@composite
7588
def task_identifiers(draw: DrawFn) -> TaskIdentifier:
7689
"""A hypothesis strategy for generating random task_identifiers"""

tilebox-workflows/tests/test_data.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
automations,
55
clusters,
66
computed_tasks,
7+
idling_responses,
78
jobs,
89
storage_locations,
910
task_identifiers,
@@ -15,6 +16,7 @@
1516
AutomationPrototype,
1617
Cluster,
1718
ComputedTask,
19+
Idling,
1820
Job,
1921
StorageLocation,
2022
Task,
@@ -34,6 +36,11 @@ def test_tasks_to_message_and_back(task: Task) -> None:
3436
assert Task.from_message(task.to_message()) == task
3537

3638

39+
@given(idling_responses())
40+
def test_idling_responses_to_message_and_back(idling: Idling) -> None:
41+
assert Idling.from_message(idling.to_message()) == idling
42+
43+
3744
@given(jobs())
3845
def test_jobs_to_message_and_back(job: Job) -> None:
3946
assert Job.from_message(job.to_message()) == job

tilebox-workflows/tilebox/workflows/data.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import re
22
import warnings
33
from dataclasses import dataclass, field
4-
from datetime import datetime
4+
from datetime import datetime, timedelta
55
from enum import Enum
66
from functools import lru_cache
77
from pathlib import Path
@@ -14,7 +14,13 @@
1414

1515
from tilebox.datasets.query.id_interval import IDInterval
1616
from tilebox.datasets.query.pagination import Pagination
17-
from tilebox.datasets.query.time_interval import TimeInterval, datetime_to_timestamp, timestamp_to_datetime
17+
from tilebox.datasets.query.time_interval import (
18+
TimeInterval,
19+
datetime_to_timestamp,
20+
duration_to_timedelta,
21+
timedelta_to_duration,
22+
timestamp_to_datetime,
23+
)
1824
from tilebox.datasets.uuid import uuid_message_to_optional_uuid, uuid_message_to_uuid, uuid_to_uuid_message
1925

2026
try:
@@ -149,6 +155,20 @@ def to_message(self) -> core_pb2.Task:
149155
)
150156

151157

158+
@dataclass(order=True)
159+
class Idling:
160+
suggested_idling_duration: timedelta
161+
162+
@classmethod
163+
def from_message(cls, idling: task_pb2.IdlingResponse) -> "Idling":
164+
"""Convert an IdlingResponse protobuf message to an Idling object."""
165+
return cls(suggested_idling_duration=duration_to_timedelta(idling.suggested_idling_duration))
166+
167+
def to_message(self) -> task_pb2.IdlingResponse:
168+
"""Convert an Idling object to an IdlingResponse protobuf message."""
169+
return task_pb2.IdlingResponse(suggested_idling_duration=timedelta_to_duration(self.suggested_idling_duration))
170+
171+
152172
class JobState(Enum):
153173
UNSPECIFIED = 0
154174
QUEUED = 1

tilebox-workflows/tilebox/workflows/runner/task_runner.py

Lines changed: 49 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from _tilebox.grpc.error import InternalServerError
2929
from tilebox.datasets.sync.dataset import DatasetClient
3030
from tilebox.workflows.cache import JobCache
31-
from tilebox.workflows.data import ComputedTask, NextTaskToRun, Task, TaskLease
31+
from tilebox.workflows.data import ComputedTask, Idling, NextTaskToRun, Task, TaskLease
3232
from tilebox.workflows.interceptors import Interceptor, InterceptorType
3333
from tilebox.workflows.observability.logging import get_logger
3434
from tilebox.workflows.observability.tracing import WorkflowTracer
@@ -37,13 +37,24 @@
3737
from tilebox.workflows.task import FutureTask, RunnerContext, TaskMeta
3838
from tilebox.workflows.task import Task as TaskInstance
3939

40-
# In seconds
40+
# The time we give a task to finish its execution when a runner shutdown is requested before we forcefully stop it
4141
_SHUTDOWN_GRACE_PERIOD = timedelta(seconds=2)
42-
_POLL_INTERVAL = timedelta(seconds=5)
43-
_JITTER_INTERVAL = timedelta(seconds=5)
42+
43+
# Retry configuration for retrying failed requests to the workflows API
4444
_INITIAL_RETRY_BACKOFF = timedelta(seconds=5)
4545
_MAX_RETRY_BACKOFF = timedelta(hours=1) # 1 hour
4646

47+
# A maximum idling duration, as a safeguard to avoid way too long sleep times in case the suggested idling duration is
48+
# ever too long. 5 minutes should be plenty of time to wait.
49+
_MAX_IDLING_DURATION = timedelta(minutes=5)
50+
# A minimum idling duration, as a safeguard to avoid too short sleep times in case the suggested idling duration is
51+
# ever too short.
52+
_MIN_IDLING_DURATION = timedelta(milliseconds=1)
53+
54+
# Fallback polling interval and jitter in case the workflows API fails to respond with a suggested idling duration
55+
_FALLBACK_POLL_INTERVAL = timedelta(seconds=5)
56+
_FALLBACK_JITTER_INTERVAL = timedelta(seconds=5)
57+
4758
WrappedFnReturnT = TypeVar("WrappedFnReturnT")
4859

4960

@@ -96,14 +107,14 @@ def _extend_lease_while_task_is_running(
96107

97108
break
98109

99-
logger.info(f"Extending task lease for {task_id=}, {task_lease=}")
110+
logger.debug(f"Extending task lease for {task_id=}, {task_lease=}")
100111
try:
101112
# The first time we call the function, we pass the argument we received
102113
# After that, we call it with the result of the previous call
103114
task_lease = service.extend_task_lease(task_id, 2 * task_lease.lease)
104115
if task_lease.lease == 0:
105116
# The server did not return a lease extension, which means there is no point in trying to extend the lease again
106-
logger.info(f"task lease extension not granted for task {task_id}")
117+
logger.debug(f"task lease extension not granted for task {task_id}")
107118
# even though we failed to extend the lease, let's still wait till the task is done
108119
# otherwise we might end up with a mismatch between the task currently being executed and the task
109120
# that we extend leases for (and the runner can anyways only execute one task at a time)
@@ -331,41 +342,59 @@ def run_all(self) -> None:
331342
"""
332343
self._run(stop_when_idling=True)
333344

334-
def _run(self, stop_when_idling: bool = True) -> None:
345+
def _run(self, stop_when_idling: bool = True) -> None: # noqa: C901
335346
"""
336347
Run the task runner forever. This will poll for new tasks and execute them as they come in.
337348
If no tasks are available, it will sleep for a short time and then try again.
338349
"""
339-
task: Task | None = None
350+
work: Task | Idling | None = None
340351

341352
# capture interrupt signals and delay them by a grace period in order to shut down gracefully
342353
with _GracefulShutdown(_SHUTDOWN_GRACE_PERIOD, self._service) as shutdown_context:
343354
while True:
344-
if task is None: # if we don't have a task right now, let's try to work-steal one
345-
if shutdown_context.is_shutting_down():
355+
if not isinstance(work, Task): # if we don't have a task right now, let's try to work-steal one
356+
if shutdown_context.is_shutting_down(): # unless we received an interrupt, then we shut down
346357
return
347358
try:
348-
task = self._service.next_task(task_to_run=self.tasks_to_run, computed_task=None)
359+
work = self._service.next_task(task_to_run=self.tasks_to_run, computed_task=None)
349360
except InternalServerError as e:
350361
# We do not need to retry here, since the task runner will sleep for a while and then anyways request this again.
351362
self.logger.error(f"Failed to get next task with error {e}")
352363

353-
if task is not None: # we have a task to execute
364+
if isinstance(work, Task): # we received a task to execute
365+
task = work
354366
if task.retry_count > 0:
355367
self.logger.debug(f"Retrying task {task.id} that failed {task.retry_count} times")
356-
task = self._execute(task, shutdown_context) # submitting the task gives us the next one
357-
else: # if we didn't get a task, let's sleep for a bit and try work-stealing again
358-
self.logger.debug("No task to run")
368+
work = self._execute(task, shutdown_context) # submitting the task gives us the next work item
369+
elif isinstance(work, Idling): # we received an idling response, so let's sleep for a bit
370+
self.logger.debug("No task to run, idling")
359371
if stop_when_idling: # if stop_when_idling is set, we can just return
360372
return
373+
361374
# now sleep for a bit and then try again, unless we receive an interrupt
362-
shutdown_context.sleep(
363-
_POLL_INTERVAL.total_seconds() + random.uniform(0, _JITTER_INTERVAL.total_seconds()) # noqa: S311
364-
)
375+
idling_duration = work.suggested_idling_duration
376+
idling_duration = min(idling_duration, _MAX_IDLING_DURATION)
377+
idling_duration = max(idling_duration, _MIN_IDLING_DURATION)
378+
shutdown_context.sleep(idling_duration.total_seconds())
365379
if shutdown_context.is_shutting_down():
366380
return
381+
else: # work is None
382+
# we didn't receive an idling response, but also not a task. This is expected if we didn't request
383+
# a task to run because we are shutting down; otherwise we fall back to a default idling period below.
384+
if shutdown_context.is_shutting_down():
385+
return
386+
387+
fallback_interval = _FALLBACK_POLL_INTERVAL.total_seconds() + random.uniform( # noqa: S311
388+
0, _FALLBACK_JITTER_INTERVAL.total_seconds()
389+
)
390+
self.logger.debug(
391+
f"Didn't receive a task to run, nor an idling response, but runner is not shutting down. "
392+
f"Falling back to a default idling period of {fallback_interval:.2f}s"
393+
)
394+
395+
shutdown_context.sleep(fallback_interval)
367396

368-
def _execute(self, task: Task, shutdown_context: _GracefulShutdown) -> Task | None:
397+
def _execute(self, task: Task, shutdown_context: _GracefulShutdown) -> Task | Idling | None:
369398
try:
370399
return self._try_execute(task, shutdown_context)
371400
except Exception as e:
@@ -380,7 +409,7 @@ def _execute(self, task: Task, shutdown_context: _GracefulShutdown) -> Task | No
380409
task_failed_retry(task, e)
381410
return None
382411

383-
def _try_execute(self, task: Task, shutdown_context: _GracefulShutdown) -> Task | None:
412+
def _try_execute(self, task: Task, shutdown_context: _GracefulShutdown) -> Task | Idling | None:
384413
if task.job is None:
385414
raise ValueError(f"Task {task.id} has no job associated with it.")
386415

tilebox-workflows/tilebox/workflows/runner/task_service.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from _tilebox.grpc.error import with_pythonic_errors
77
from tilebox.workflows.data import (
88
ComputedTask,
9+
Idling,
910
NextTaskToRun,
1011
Task,
1112
TaskLease,
@@ -32,18 +33,19 @@ def __init__(self, channel: Channel) -> None:
3233
"""
3334
self.service = with_pythonic_errors(TaskServiceStub(channel))
3435

35-
def next_task(self, task_to_run: NextTaskToRun | None, computed_task: ComputedTask | None) -> Task | None:
36+
def next_task(self, task_to_run: NextTaskToRun | None, computed_task: ComputedTask | None) -> Task | Idling | None:
3637
computed_task_message = None if computed_task is None else computed_task.to_message()
3738
task_to_run_message = None if task_to_run is None else task_to_run.to_message()
3839

3940
response: NextTaskResponse = self.service.NextTask(
4041
NextTaskRequest(computed_task=computed_task_message, next_task_to_run=task_to_run_message)
4142
)
42-
return (
43-
Task.from_message(response.next_task)
44-
if response.next_task is not None and response.next_task.id.uuid
45-
else None
46-
)
43+
44+
if response.next_task is not None and response.next_task.id.uuid:
45+
return Task.from_message(response.next_task)
46+
if response.idling is not None:
47+
return Idling.from_message(response.idling)
48+
return None
4749

4850
def task_failed(self, task: Task, error: Exception, cancel_job: bool = True) -> None:
4951
# job output is limited to 1KB, so truncate the error message if necessary

0 commit comments

Comments
 (0)