Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
de85427
feat(client): accumulate per-connection round-trip time in send_recv
spMohanty Jun 6, 2026
c5e1622
feat(client): add pure timing-decomposition helpers
spMohanty Jun 6, 2026
7a67e80
feat(client): expose timing-split properties on BudgetContext
spMohanty Jun 6, 2026
971dca4
feat(client): populate BudgetContext timing split from server compute…
spMohanty Jun 6, 2026
221e87f
test(server): pin that close() surfaces recorded compute time
spMohanty Jun 6, 2026
2590e8b
docs: changelog for client timing split
spMohanty Jun 6, 2026
bc0d5d5
feat(server): report compute_time as pure numpy-kernel time
spMohanty Jun 6, 2026
d12c65b
feat(client): add dispatch-time accumulator with baseline/delta nesting
spMohanty Jun 6, 2026
09609f5
feat(client): time full dispatch at every op entry point
spMohanty Jun 6, 2026
c9f4a71
feat(client): decompose timing from full dispatch (overhead = dispatc…
spMohanty Jun 6, 2026
733821e
test(client): rewrite timing integration suite for pure-isolation sem…
spMohanty Jun 6, 2026
ff376f0
fix(client): count fetch reconstruction (tolist/repr/scalar) as overh…
spMohanty Jun 6, 2026
830a484
docs: changelog for precise timing split
spMohanty Jun 6, 2026
9f42ada
fix(client): count budget-response parsing as overhead, not residual
spMohanty Jun 6, 2026
06857ed
fix(build): emit timed_dispatch in generated stats/linalg client proxies
spMohanty Jun 6, 2026
2e32237
test: preload client _dispatch module before _remote_array in parity …
spMohanty Jun 6, 2026
43cdd36
style: ruff-format and import-sort the new timing tests
spMohanty Jun 6, 2026
6f7c9b0
fix(client): count flops cost-query round-trips as overhead, not resi…
spMohanty Jun 6, 2026
e6ca3b3
fix(client): refresh flops_used from server on BudgetContext close
spMohanty Jun 6, 2026
b0d69b9
feat(client): show backend/overhead/residual timing in budget summary
spMohanty Jun 6, 2026
3709dcd
test: relax flaky cold-call benchmark budget to 500ms; fix ms formatting
spMohanty Jun 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,28 @@ See the [README](README.md) for the API overview and the
full API reference.

## Unreleased

### Fix

- **flopscope timing split**: `BudgetContext` now reports a precise client/server
decomposition — `backend` = pure server numpy kernel, `overhead` = all flopscope
machinery (client dispatch + wire + server marshaling), `residual` = the
participant's own Python only. The server reports `compute_time` as kernel-only;
the client times the full op dispatch (including result reconstruction). Fixes
the prior all-zero (and the later round-trip-approximation) behavior on the
server-backed path.
- **flopscope cost queries**: `flops.einsum_cost` / `flops.svd_cost` now count their
server round-trip as `overhead` (via `@timed_dispatch`) instead of leaking it into
the billed `residual` bucket when a participant calls them inside a budget context.
The implicit global-default budget-open round-trip is likewise wrapped defensively.
- **flopscope flops_used cache**: `BudgetContext.__exit__` now refreshes `flops_used`
from the `budget_breakdown` in the server's `budget_close` response (there it is nested
at `result.budget_breakdown`, one level deeper than in the `budget_status` response,
which exposes `flops_used` directly under `result`). A plain `with` block now reports
the authoritative server count — and `render_budget_summary()` shows it — without the
prior workaround of making a separate `summary()` call.

### Added

- **budget summary timing**: `render_budget_summary()` now shows a session
wall-time breakdown (backend / overhead / residual) beneath FLOP usage, in both
the Rich and plain-text renderers, surfacing the per-context timing split.
5 changes: 4 additions & 1 deletion flopscope-client/src/flopscope/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
budget,
budget_summary_dict,
)
from flopscope._dispatch import timed_dispatch # noqa: E402
from flopscope._display import budget_live, budget_summary # noqa: E402
from flopscope._math_compat import e, inf, nan, pi # noqa: E402
from flopscope._perm_group import SymmetryGroup # noqa: E402
Expand Down Expand Up @@ -136,7 +137,7 @@ def proxy(*args: Any, **kwargs: Any):

proxy.__name__ = op_name
proxy.__qualname__ = op_name
return proxy
return timed_dispatch(proxy)


# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -182,6 +183,7 @@ def _infer_dtype(values):
return "float64" # mixed or float values


@timed_dispatch
def array(object, dtype=None, **kwargs):
"""Create a remote array from a Python list, tuple, or existing RemoteArray.

Expand Down Expand Up @@ -272,6 +274,7 @@ def array(object, dtype=None, **kwargs):
# ---------------------------------------------------------------------------


@timed_dispatch
def einsum(subscripts, *operands, **kwargs):
"""Einstein summation on remote arrays.

Expand Down
175 changes: 160 additions & 15 deletions flopscope-client/src/flopscope/_budget.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@

from __future__ import annotations

import time

from flopscope._connection import get_connection
from flopscope._dispatch import dispatch_span, total_dispatch_ns
from flopscope._protocol import (
encode_budget_close,
encode_budget_open,
Expand All @@ -13,6 +16,67 @@
_active_context = None


def _extract_compute_ns(close_response: object) -> int:
"""Pull total server compute (ns) out of a ``budget_close`` response.

Returns 0 if the ``result.comms_summary.total_compute_time_ns`` path is
absent or unparseable (defensive; the version handshake makes it present in
practice).
"""
if not isinstance(close_response, dict):
return 0
result = close_response.get("result")
if not isinstance(result, dict):
return 0
comms = result.get("comms_summary")
if not isinstance(comms, dict):
return 0
try:
return int(comms.get("total_compute_time_ns", 0))
except (TypeError, ValueError):
return 0


def _extract_close_budget(close_response: object) -> dict:
"""Pull the ``budget_breakdown`` dict (which carries ``flops_used``) out of a
``budget_close`` response.

The authoritative FLOP count is nested at ``result.budget_breakdown`` —
unlike ``budget_status``, which exposes ``flops_used`` directly under
``result``. Returns ``{}`` if the path is absent (defensive; the version
handshake makes it present in practice).
"""
if not isinstance(close_response, dict):
return {}
result = close_response.get("result")
if not isinstance(result, dict):
return {}
breakdown = result.get("budget_breakdown")
return breakdown if isinstance(breakdown, dict) else {}


def _decompose_timing(
wall_ns: int, dispatch_ns: int, kernel_ns: int
) -> tuple[float, float, float, float]:
"""Decompose context wall into (wall, backend, overhead, residual) seconds.

- backend = pure server numpy kernel (``kernel_ns``)
- overhead = all other flopscope machinery: client dispatch + wire +
server marshaling/store/framing = ``dispatch − kernel``; not billed
- residual = the participant's own Python = ``wall − dispatch``; billed

Each is clamped to >= 0 for cross-clock skew. In the normal regime
(``kernel <= dispatch <= wall``) no clamp fires and
``wall == backend + overhead + residual`` exactly.
"""
wall_s = wall_ns / 1e9
backend_s = max(0, kernel_ns) / 1e9
dispatch_s = dispatch_ns / 1e9
overhead_s = max(0.0, dispatch_s - backend_s)
residual_s = max(0.0, wall_s - dispatch_s)
return wall_s, backend_s, overhead_s, residual_s


class OpRecord:
"""Record of a single operation's FLOP cost.

Expand Down Expand Up @@ -75,6 +139,14 @@ def __init__(
self._close_summary: str | None = None
self._is_open: bool = False
self._previous_context = None
# Timing split — populated on __exit__. None until then for wall/residual,
# 0.0 for backend/overhead, mirroring the in-process flopscope contract.
self._wall_time_s: float | None = None
self._flopscope_backend_time: float = 0.0
self._flopscope_overhead_time: float = 0.0
self._residual_wall_time: float | None = None
self._wall_start_ns: int | None = None
self._dispatch_baseline_ns: int = 0

# ------------------------------------------------------------------
# Properties
Expand Down Expand Up @@ -110,6 +182,28 @@ def namespace(self) -> str | None:
"""Optional namespace label for this context."""
return self._namespace

@property
def wall_time_s(self) -> float | None:
    """Total wall-clock seconds spanned by the context.

    ``None`` until the context has been closed; ``__exit__`` fills it in
    from the ``perf_counter_ns`` reading taken in ``__enter__``.
    """
    return self._wall_time_s

@property
def flopscope_backend_time(self) -> float:
    """Seconds of op compute reported by the server (0.0 until closed).

    Populated on close from ``result.comms_summary.total_compute_time_ns``
    in the ``budget_close`` response.
    """
    return self._flopscope_backend_time

@property
def flopscope_overhead_time(self) -> float:
    """Seconds of flopscope machinery other than server compute (0.0 until
    closed). Not billed.

    Computed on close as client dispatch time minus backend time, clamped
    at zero — i.e. client dispatch, wire transfer and server-side
    marshaling, not transport alone.
    """
    return self._flopscope_overhead_time

@property
def residual_wall_time(self) -> float | None:
    """Seconds of participant Python outside flopscope calls (``None`` until
    closed). The billed bucket: C_m = F_m + lambda * residual.

    Computed on close as wall time minus client dispatch time, clamped at
    zero.
    """
    return self._residual_wall_time

# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------
Expand Down Expand Up @@ -141,7 +235,8 @@ def summary(self) -> str:
Human-readable summary of budget usage.
"""
conn = get_connection()
response = conn.send_recv(encode_budget_status())
with dispatch_span():
response = conn.send_recv(encode_budget_status())
# Budget status is nested inside "result" key
result = response.get("result", {})
self._update_budget(result)
Expand Down Expand Up @@ -179,21 +274,44 @@ def __enter__(self) -> BudgetContext:
)
self._previous_context = _active_context
conn = get_connection()
response = conn.send_recv(
encode_budget_open(self._flop_budget, self._flop_multiplier)
)
self._update_budget(response)
self._wall_start_ns = time.perf_counter_ns()
self._dispatch_baseline_ns = total_dispatch_ns()
with dispatch_span():
response = conn.send_recv(
encode_budget_open(self._flop_budget, self._flop_multiplier)
)
self._update_budget(response)
self._is_open = True
_active_context = self
return self

def __exit__(self, *args: object) -> None:
"""Close the budget on the server and store the close summary."""
"""Close the budget, compute the timing split, store summary."""
global _active_context
if self._is_open:
conn = get_connection()
response = conn.send_recv(encode_budget_close())
self._update_budget(response)
with dispatch_span():
response = conn.send_recv(encode_budget_close())
# flops_used is nested at result.budget_breakdown in the close
# response; refresh the cache from there so a plain `with` block
# reports the server's count without a separate summary() call.
self._update_budget(_extract_close_budget(response))
# _wall_start_ns is always set by __enter__ before _is_open=True; the
# fallback only guards the never-exercised "exit without enter" path.
start_ns = (
self._wall_start_ns
if self._wall_start_ns is not None
else time.perf_counter_ns()
)
wall_ns = time.perf_counter_ns() - start_ns
dispatch_ns = total_dispatch_ns() - self._dispatch_baseline_ns
kernel_ns = _extract_compute_ns(response)
(
self._wall_time_s,
self._flopscope_backend_time,
self._flopscope_overhead_time,
self._residual_wall_time,
) = _decompose_timing(wall_ns, dispatch_ns, kernel_ns)
self._close_summary = (
f"BudgetContext closed: {self._flops_used}/{self._flop_budget} "
f"FLOPs used"
Expand All @@ -218,10 +336,23 @@ def __repr__(self) -> str: # pragma: no cover
class NamespaceRecord:
"""Snapshot of a BudgetContext's state at close time."""

def __init__(self, namespace, flop_budget, flops_used):
def __init__(
self,
namespace,
flop_budget,
flops_used,
wall_time_s=0.0,
backend_time_s=0.0,
overhead_time_s=0.0,
residual_time_s=0.0,
):
self.namespace = namespace
self.flop_budget = flop_budget
self.flops_used = flops_used
self.wall_time_s = wall_time_s
self.backend_time_s = backend_time_s
self.overhead_time_s = overhead_time_s
self.residual_time_s = residual_time_s


class BudgetAccumulator:
Expand All @@ -231,11 +362,17 @@ def __init__(self):
self._records = []

def record(self, ctx):
# `or 0.0` coerces the Optional timing fields (wall_time_s /
# residual_wall_time are None on a never-closed context) to 0.0.
self._records.append(
NamespaceRecord(
namespace=ctx.namespace,
flop_budget=ctx.flop_budget,
flops_used=ctx.flops_used,
wall_time_s=getattr(ctx, "wall_time_s", 0.0) or 0.0,
backend_time_s=getattr(ctx, "flopscope_backend_time", 0.0) or 0.0,
overhead_time_s=getattr(ctx, "flopscope_overhead_time", 0.0) or 0.0,
residual_time_s=getattr(ctx, "residual_wall_time", 0.0) or 0.0,
)
)

Expand All @@ -247,6 +384,10 @@ def get_data(self, by_namespace=False):
"flops_used": total_used,
"flops_remaining": total_budget - total_used,
"operations": {},
"wall_time_s": sum(r.wall_time_s for r in self._records),
"flopscope_backend_time_s": sum(r.backend_time_s for r in self._records),
"flopscope_overhead_time_s": sum(r.overhead_time_s for r in self._records),
"residual_wall_time_s": sum(r.residual_time_s for r in self._records),
}
if by_namespace:
by_ns = {}
Expand Down Expand Up @@ -304,14 +445,18 @@ def _get_global_default():
quiet=True,
namespace=None,
)
# Open it on the server
# Open it on the server. Defensive: keep the round-trip inside a
# dispatch span so it counts as overhead (never billed residual) if this
# implicit global-default path is ever wired up. It is currently
# unreferenced, but the invariant is "every send_recv lives in a span".
conn = get_connection()
response = conn.send_recv(
encode_budget_open(
_global_default._flop_budget, _global_default._flop_multiplier
with dispatch_span():
response = conn.send_recv(
encode_budget_open(
_global_default._flop_budget, _global_default._flop_multiplier
)
)
)
_global_default._update_budget(response)
_global_default._update_budget(response)
_global_default._is_open = True
_active_context = _global_default
return _global_default
Expand Down
65 changes: 65 additions & 0 deletions flopscope-client/src/flopscope/_dispatch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
"""Client-side dispatch-time accumulator.

Times the *full* client dispatch of each flopscope op (request encode → socket
send/recv → response decode → result reconstruction) and accumulates it into a
single per-process counter. ``BudgetContext`` snapshots a baseline on enter and
reads the delta on exit; ``overhead = dispatch - backend`` and
``residual = wall - dispatch``.

Nesting is handled with a baseline/delta trick (ported from the in-process
``_counted_wrapper``): each span adds only its own remainder
(``wall - inner_already_counted``), so a span that contains nested spans counts
each op's wall exactly once. Single participant process, single-threaded (the
server allows one session at a time), so a module-level counter is safe.
"""

from __future__ import annotations

import functools
import time
from contextlib import contextmanager

_total_dispatch_ns: int = 0


def _now_ns() -> int:
    """Return the current ``time.perf_counter_ns()`` reading (monotonic, ns).

    Kept as its own module-level function so tests can substitute a fake
    clock.
    """
    return time.perf_counter_ns()


def total_dispatch_ns() -> int:
    """Return the client dispatch nanoseconds accumulated so far this process.

    This is the running module-level counter, not a per-context figure;
    callers that want a per-context value take a baseline and subtract.
    """
    return _total_dispatch_ns


def reset_dispatch() -> None:
    """Zero the module-level dispatch accumulator (tests only).

    Any baseline captured before the reset no longer matches the counter.
    """
    global _total_dispatch_ns
    _total_dispatch_ns = 0


@contextmanager
def dispatch_span():
    """Time one op dispatch and credit only this span's own share.

    On exit the span adds ``elapsed − nested`` to the module-level
    accumulator, where ``nested`` is whatever inner spans already added while
    this one was open. Wall time under nested spans is therefore counted
    once. The accounting runs in ``finally``, so a dispatch that raises is
    still timed.
    """
    global _total_dispatch_ns
    started_ns = _now_ns()
    counted_before = _total_dispatch_ns
    try:
        yield
    finally:
        elapsed_ns = _now_ns() - started_ns
        nested_ns = _total_dispatch_ns - counted_before
        own_ns = elapsed_ns - nested_ns
        # A negative remainder should not occur with a monotonic clock; skip
        # it so clock skew or a faked clock can never shrink the total.
        if own_ns > 0:
            _total_dispatch_ns += own_ns


def timed_dispatch(fn):
    """Wrap *fn* so every call runs inside a ``dispatch_span``.

    The wrapper forwards all positional and keyword arguments, returns
    whatever *fn* returns, and carries *fn*'s metadata (name, docstring,
    ``__wrapped__``) via ``functools.update_wrapper``.
    """

    def _timed(*args, **kwargs):
        with dispatch_span():
            result = fn(*args, **kwargs)
        return result

    return functools.update_wrapper(_timed, fn)
Loading
Loading