Skip to content

Commit ec4f211

Browse files
committed
feat: add total_token_throughput metric
1 parent 669fb62 commit ec4f211

File tree

3 files changed

+125
-10
lines changed

3 files changed

+125
-10
lines changed

src/aiperf/metrics/types/prefill_throughput.py renamed to src/aiperf/metrics/types/prefill_throughput_per_user.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,21 +10,21 @@
1010
from aiperf.metrics.types.ttft_metric import TTFTMetric
1111

1212

13-
class PrefillThroughputMetric(BaseRecordMetric[float]):
13+
class PrefillThroughputPerUserMetric(BaseRecordMetric[float]):
1414
"""
15-
Post-processor for calculating Prefill Throughput metrics from records. This is only applicable to streaming responses.
15+
Post-processor for calculating Prefill Throughput Per User metrics from records. This is only applicable to streaming responses.
1616
1717
Formula:
18-
Prefill Throughput = Prefill Sequence Length / Time to First Token (seconds)
18+
Prefill Throughput Per User = Prefill Sequence Length / Time to First Token (seconds)
1919
"""
2020

21-
tag = "prefill_throughput"
22-
header = "Prefill Throughput"
23-
short_header = "Prefill TPS"
21+
tag = "prefill_throughput_per_user"
22+
header = "Prefill Throughput Per User"
23+
short_header = "Prefill TPS/User"
2424
short_header_hide_unit = True
25-
unit = MetricOverTimeUnit.TOKENS_PER_SECOND
25+
unit = MetricOverTimeUnit.TOKENS_PER_SECOND_PER_USER
2626
flags = (
27-
MetricFlags.STREAMING_ONLY
27+
MetricFlags.STREAMING_TOKENS_ONLY
2828
| MetricFlags.TOKENIZES_INPUT_ONLY
2929
| MetricFlags.LARGER_IS_BETTER
3030
| MetricFlags.NO_CONSOLE
@@ -39,7 +39,7 @@ def _parse_record(
3939
record: ParsedResponseRecord,
4040
record_metrics: MetricRecordDict,
4141
) -> float:
42-
"""This method calculates the prefill throughput by dividing the input sequence length by the TTFT."""
42+
"""This method calculates the prefill throughput per user by dividing the input sequence length by the TTFT."""
4343

4444
isl = record_metrics.get_or_raise(InputSequenceLengthMetric)
4545
converted_ttft = record_metrics.get_converted_or_raise(
@@ -48,6 +48,6 @@ def _parse_record(
4848
)
4949
if converted_ttft == 0:
5050
raise NoMetricValue(
51-
"TTFT is zero, cannot calculate prefill throughput metric"
51+
"TTFT is zero, cannot calculate prefill throughput per user metric"
5252
)
5353
return isl / converted_ttft # type: ignore
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from aiperf.common.enums import MetricFlags, MetricOverTimeUnit
5+
from aiperf.common.exceptions import NoMetricValue
6+
from aiperf.metrics import BaseDerivedMetric
7+
from aiperf.metrics.metric_dicts import MetricResultsDict
8+
from aiperf.metrics.types.benchmark_duration_metric import BenchmarkDurationMetric
9+
from aiperf.metrics.types.input_sequence_length_metric import (
10+
TotalInputSequenceLengthMetric,
11+
)
12+
from aiperf.metrics.types.output_sequence_length_metric import (
13+
TotalOutputSequenceLengthMetric,
14+
)
15+
16+
17+
class TotalTokenThroughputMetric(BaseDerivedMetric[float]):
    """Derived metric: overall token throughput for the whole benchmark run.

    Formula:
        Total Token Throughput = (Total Input Tokens + Total Output Tokens) / Benchmark Duration (seconds)
    """

    tag = "total_token_throughput"
    header = "Total Token Throughput"
    short_header = "Total TPS"
    short_header_hide_unit = True
    unit = MetricOverTimeUnit.TOKENS_PER_SECOND
    flags = (
        MetricFlags.PRODUCES_TOKENS_ONLY
        | MetricFlags.LARGER_IS_BETTER
        | MetricFlags.NO_CONSOLE
    )
    required_metrics = {
        TotalInputSequenceLengthMetric.tag,
        TotalOutputSequenceLengthMetric.tag,
        BenchmarkDurationMetric.tag,
    }

    def _derive_value(self, metric_results: MetricResultsDict) -> float:
        """Return (total input tokens + total output tokens) / duration.

        Raises:
            NoMetricValue: if the benchmark duration is zero (the
            ``*_or_raise`` lookups also raise when a required metric
            is unavailable).
        """
        total_in = metric_results.get_or_raise(TotalInputSequenceLengthMetric)
        total_out = metric_results.get_or_raise(TotalOutputSequenceLengthMetric)
        # Convert the duration into this metric's time unit (seconds) so the
        # division yields tokens per second.
        duration = metric_results.get_converted_or_raise(
            BenchmarkDurationMetric,
            self.unit.time_unit,  # type: ignore
        )
        if duration == 0:
            raise NoMetricValue(
                "Benchmark duration is zero, cannot calculate total token throughput metric"
            )
        return (total_in + total_out) / duration  # type: ignore
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
import pytest
5+
6+
from aiperf.common.constants import NANOS_PER_SECOND
7+
from aiperf.common.exceptions import NoMetricValue
8+
from aiperf.metrics.metric_dicts import MetricResultsDict
9+
from aiperf.metrics.types.benchmark_duration_metric import BenchmarkDurationMetric
10+
from aiperf.metrics.types.input_sequence_length_metric import (
11+
TotalInputSequenceLengthMetric,
12+
)
13+
from aiperf.metrics.types.output_sequence_length_metric import (
14+
TotalOutputSequenceLengthMetric,
15+
)
16+
from aiperf.metrics.types.total_token_throughput import TotalTokenThroughputMetric
17+
18+
19+
class TestTotalTokenThroughputMetric:
    """Unit tests for TotalTokenThroughputMetric.derive_value."""

    @pytest.mark.parametrize(
        "input_tokens,output_tokens,duration,expected",
        [
            (600, 400, 2, 500.0),  # basic: (600+400) / 2s
            (500, 250, 1.5, 500.0),  # fractional duration: (500+250) / 1.5s
            (0, 0, 1, 0.0),  # zero tokens
            (1000, 0, 1, 1000.0),  # only input tokens
            (0, 1000, 1, 1000.0),  # only output tokens
            (1, 1, 1, 2.0),  # minimal tokens
            (500_000, 500_000, 1, 1_000_000.0),  # large token counts
            (50, 50, 0.1, 1000.0),  # small duration: (50+50) / 0.1s
        ],
    )  # fmt: skip
    def test_total_token_throughput_calculation(
        self, input_tokens: int, output_tokens: int, duration: float, expected: float
    ):
        """Test throughput calculation: (input_tokens + output_tokens) / duration"""
        metric = TotalTokenThroughputMetric()

        metric_results = MetricResultsDict()
        metric_results[TotalInputSequenceLengthMetric.tag] = input_tokens
        metric_results[TotalOutputSequenceLengthMetric.tag] = output_tokens
        # The pipeline stores benchmark duration in nanoseconds.
        metric_results[BenchmarkDurationMetric.tag] = duration * NANOS_PER_SECOND

        result = metric.derive_value(metric_results)
        assert result == pytest.approx(expected)

    @pytest.mark.parametrize("duration", [0, 0.0, None])
    def test_total_token_throughput_invalid_duration_raises(
        self, duration: float | None
    ):
        """Test error when benchmark duration is zero or None"""
        metric = TotalTokenThroughputMetric()

        metric_results = MetricResultsDict()
        metric_results[TotalInputSequenceLengthMetric.tag] = 600
        metric_results[TotalOutputSequenceLengthMetric.tag] = 400
        # BUGFIX: the original did `duration * NANOS_PER_SECOND` unconditionally,
        # so the duration=None case raised TypeError during setup and never
        # reached derive_value. Store None as-is so the lookup sees a missing
        # value (presumably raising NoMetricValue — matches the *_or_raise
        # semantics; confirm against MetricResultsDict).
        metric_results[BenchmarkDurationMetric.tag] = (
            None if duration is None else duration * NANOS_PER_SECOND
        )

        with pytest.raises(NoMetricValue):
            metric.derive_value(metric_results)

0 commit comments

Comments
 (0)