Add build_info metric and use it in generated queries (#35)

brettimus · Brett Beutell · web-flow · commit 1f0570618378 · 2023-05-11T14:02:41.000+02:00
* Add blurb to readme about identifying commits * Remove "coming soon" from readme item on adding links to live Prom charts * Initialize Prometheus Gauge for build_info * Add updown counter for build info to otel tracker * Implement set_build_info for OTEL and Prom, and call when we set the default tracker * Move set_build_info call into create_tracker * Update prometheus queries * Update prometheus URL tests * Add test for build_info gauge for prometheus tracker (skipped test for otel tracker) * Update otel tracker and tracker tests after finding otel prometheus bug * Ensure set_build_info is only called once * Update changelog * Add set_build_info to the TrackMetrics Protocol * Fix build_info query based off of autometrics-dev/autometrics-shared#8 * Rename create_tracker to init_tracker * Update pyright * Update README to mention OpenTelemetry tracker does not work with build_info --------- Co-authored-by: Brett Beutell <brett@fiberplane.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
+- Support for build_info metrics in Prometheus (#35)
 - OpenTelemetry Support (#28)
 - Fly.io example (#26)
 - Django example (#22)
diff --git a/README.md b/README.md
@@ -17,7 +17,8 @@ See [Why Autometrics?](https://github.com/autometrics-dev#why-autometrics) for m
   most useful metrics
 - 💡 Writes Prometheus queries so you can understand the data generated without
   knowing PromQL
-- 🔗 Create links to live Prometheus charts directly into each functions docstrings (with tooltips coming soon!)
+- 🔗 Adds links to live Prometheus charts directly to each function's docstring
+- [🔍 Identify commits](#identifying-commits-that-introduced-problems) that introduced errors or increased latency
 - [🚨 Define alerts](#alerts--slos) using SLO best practices directly in your source code
 - [📊 Grafana dashboards](#dashboards) work out of the box to visualize the performance of instrumented functions & SLOs
 - [⚙️ Configurable](#metrics-libraries) metric collection library (`opentelemetry`, `prometheus`, or `metrics`)
@@ -112,7 +113,22 @@ def api_handler():
 Configure the crate that autometrics will use to produce metrics by using one of the following feature flags:
 
 - `opentelemetry` - (enabled by default, can also be explicitly set using the AUTOMETRICS_TRACKER="OPEN_TELEMETERY" env var) uses
-- `prometheus` -(using the AUTOMETRICS_TRACKER env var set to "PROMETHEUS")
+- `prometheus` - (using the AUTOMETRICS_TRACKER env var set to "PROMETHEUS")
+
+## Identifying commits that introduced problems
+
+> **NOTE** - At the time of writing, `build_info` does not work correctly when using the default tracker (`AUTOMETRICS_TRACKER=OPEN_TELEMETRY`).
+> This will be fixed once the following PR is merged on the opentelemetry-python project: https://github.com/open-telemetry/opentelemetry-python/pull/3306
+>
+> Progress on `build_info` support for the OpenTelemetry tracker in autometrics-py is tracked in #38.
+
+Autometrics makes it easy to identify if a specific version or commit introduced errors or increased latencies.
+
+It uses a separate metric (`build_info`) to track the version and, optionally, git commit of your service. It then writes queries that group metrics by the `version` and `commit` labels so you can spot correlations between those and potential issues.
+
+The `version` label is read from the `AUTOMETRICS_VERSION` environment variable, and the `commit` label from `AUTOMETRICS_COMMIT`. If either variable is unset or empty, the corresponding label is an empty string.
+
+This follows the method outlined in [Exposing the software version to Prometheus](https://www.robustperception.io/exposing-the-software-version-to-prometheus/).
 
 ## Development of the package
 
@@ -149,4 +165,6 @@ poetry run black .
 poetry run pyright
 # Run the tests using pytest
 poetry run pytest
+# Run only the tests matching a keyword (here `test_tracker`), and clear the cache
+poetry run pytest --cache-clear -k test_tracker
 ```
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -27,7 +27,7 @@ typing-extensions = "^4.5.0"
 optional = true
 
 [tool.poetry.group.dev.dependencies]
-pyright = "^1.1.302"
+pyright = "^1.1.307"
 pytest = "^7.3.0"
 pytest-asyncio = "^0.21.0"
 black = "^23.3.0"
diff --git a/src/autometrics/constants.py b/src/autometrics/constants.py
@@ -2,17 +2,24 @@
 
 COUNTER_NAME = "function.calls.count"
 HISTOGRAM_NAME = "function.calls.duration"
+# NOTE - The Rust implementation does not use `build.info`; it opts for just `build_info` instead
+BUILD_INFO_NAME = "build_info"
 
 COUNTER_NAME_PROMETHEUS = COUNTER_NAME.replace(".", "_")
 HISTOGRAM_NAME_PROMETHEUS = HISTOGRAM_NAME.replace(".", "_")
 
 COUNTER_DESCRIPTION = "Autometrics counter for tracking function calls"
 HISTOGRAM_DESCRIPTION = "Autometrics histogram for tracking function call duration"
+BUILD_INFO_DESCRIPTION = (
+    "Autometrics info metric for tracking software version and build details"
+)
 
 # The following constants are used to create the labels
 OBJECTIVE_NAME = "objective.name"
 OBJECTIVE_PERCENTILE = "objective.percentile"
 OBJECTIVE_LATENCY_THRESHOLD = "objective.latency_threshold"
+VERSION_KEY = "version"
+COMMIT_KEY = "commit"
 
 # The values are updated to use underscores instead of periods to avoid issues with prometheus.
 # A similar thing is done in the rust library, which supports multiple exporters
diff --git a/src/autometrics/prometheus_url.py b/src/autometrics/prometheus_url.py
@@ -3,6 +3,8 @@
 from typing import Optional
 from dotenv import load_dotenv
 
+ADD_BUILD_INFO_LABELS = "* on (instance, job) group_left(version, commit) (last_over_time(build_info[1s]) or on (instance, job) up)"
+
 
 def cleanup_url(url: str) -> str:
     """Remove the trailing slash if there is one."""
@@ -26,9 +28,9 @@ def __init__(
 
     def create_urls(self):
         """Create the prometheus query urls for the function and module."""
-        request_rate_query = f'sum by (function, module) (rate (function_calls_count_total{{function="{self.function_name}",module="{self.module_name}"}}[5m]))'
-        latency_query = f'sum by (le, function, module) (rate(function_calls_duration_bucket{{function="{self.function_name}",module="{self.module_name}"}}[5m]))'
-        error_ratio_query = f'sum by (function, module) (rate (function_calls_count_total{{function="{self.function_name}",module="{self.module_name}", result="error"}}[5m])) / {request_rate_query}'
+        request_rate_query = f'sum by (function, module, commit, version) (rate (function_calls_count_total{{function="{self.function_name}",module="{self.module_name}"}}[5m]) {ADD_BUILD_INFO_LABELS})'
+        latency_query = f'sum by (le, function, module, commit, version) (rate(function_calls_duration_bucket{{function="{self.function_name}",module="{self.module_name}"}}[5m]) {ADD_BUILD_INFO_LABELS})'
+        error_ratio_query = f'sum by (function, module, commit, version) (rate (function_calls_count_total{{function="{self.function_name}",module="{self.module_name}", result="error"}}[5m]) {ADD_BUILD_INFO_LABELS}) / {request_rate_query}'
 
         queries = {
             "Request rate URL": request_rate_query,
diff --git a/src/autometrics/test_prometheus_url.py b/src/autometrics/test_prometheus_url.py
@@ -24,11 +24,12 @@ def test_create_prometheus_url_with_default_url(default_url_generator: Generator
 def test_create_urls_with_default_url(default_url_generator: Generator):
     urls = default_url_generator.create_urls()
 
-    # print(urls.keys())
+    print(urls)
+
     result = {
-        "Request rate URL": "http://localhost:9090/graph?g0.expr=sum%20by%20%28function%2C%20module%29%20%28rate%20%28function_calls_count_total%7Bfunction%3D%22myFunction%22%2Cmodule%3D%22myModule%22%7D%5B5m%5D%29%29&g0.tab=0",
-        "Latency URL": "http://localhost:9090/graph?g0.expr=sum%20by%20%28le%2C%20function%2C%20module%29%20%28rate%28function_calls_duration_bucket%7Bfunction%3D%22myFunction%22%2Cmodule%3D%22myModule%22%7D%5B5m%5D%29%29&g0.tab=0",
-        "Error Ratio URL": "http://localhost:9090/graph?g0.expr=sum%20by%20%28function%2C%20module%29%20%28rate%20%28function_calls_count_total%7Bfunction%3D%22myFunction%22%2Cmodule%3D%22myModule%22%2C%20result%3D%22error%22%7D%5B5m%5D%29%29%20/%20sum%20by%20%28function%2C%20module%29%20%28rate%20%28function_calls_count_total%7Bfunction%3D%22myFunction%22%2Cmodule%3D%22myModule%22%7D%5B5m%5D%29%29&g0.tab=0",
+        "Request rate URL": "http://localhost:9090/graph?g0.expr=sum%20by%20%28function%2C%20module%2C%20commit%2C%20version%29%20%28rate%20%28function_calls_count_total%7Bfunction%3D%22myFunction%22%2Cmodule%3D%22myModule%22%7D%5B5m%5D%29%20%2A%20on%20%28instance%2C%20job%29%20group_left%28version%2C%20commit%29%20%28last_over_time%28build_info%5B1s%5D%29%20or%20on%20%28instance%2C%20job%29%20up%29%29&g0.tab=0",
+        "Latency URL": "http://localhost:9090/graph?g0.expr=sum%20by%20%28le%2C%20function%2C%20module%2C%20commit%2C%20version%29%20%28rate%28function_calls_duration_bucket%7Bfunction%3D%22myFunction%22%2Cmodule%3D%22myModule%22%7D%5B5m%5D%29%20%2A%20on%20%28instance%2C%20job%29%20group_left%28version%2C%20commit%29%20%28last_over_time%28build_info%5B1s%5D%29%20or%20on%20%28instance%2C%20job%29%20up%29%29&g0.tab=0",
+        "Error Ratio URL": "http://localhost:9090/graph?g0.expr=sum%20by%20%28function%2C%20module%2C%20commit%2C%20version%29%20%28rate%20%28function_calls_count_total%7Bfunction%3D%22myFunction%22%2Cmodule%3D%22myModule%22%2C%20result%3D%22error%22%7D%5B5m%5D%29%20%2A%20on%20%28instance%2C%20job%29%20group_left%28version%2C%20commit%29%20%28last_over_time%28build_info%5B1s%5D%29%20or%20on%20%28instance%2C%20job%29%20up%29%29%20/%20sum%20by%20%28function%2C%20module%2C%20commit%2C%20version%29%20%28rate%20%28function_calls_count_total%7Bfunction%3D%22myFunction%22%2Cmodule%3D%22myModule%22%7D%5B5m%5D%29%20%2A%20on%20%28instance%2C%20job%29%20group_left%28version%2C%20commit%29%20%28last_over_time%28build_info%5B1s%5D%29%20or%20on%20%28instance%2C%20job%29%20up%29%29&g0.tab=0",
     }
     assert result == urls
 
diff --git a/src/autometrics/tracker/opentelemetry.py b/src/autometrics/tracker/opentelemetry.py
@@ -4,6 +4,7 @@
     Meter,
     Counter,
     Histogram,
+    UpDownCounter,
     set_meter_provider,
 )
 
@@ -21,6 +22,8 @@
     COUNTER_NAME,
     HISTOGRAM_DESCRIPTION,
     HISTOGRAM_NAME,
+    BUILD_INFO_NAME,
+    BUILD_INFO_DESCRIPTION,
     OBJECTIVE_NAME,
     OBJECTIVE_PERCENTILE,
     OBJECTIVE_LATENCY_THRESHOLD,
@@ -39,6 +42,7 @@ class OpenTelemetryTracker:
 
     __counter_instance: Counter
     __histogram_instance: Histogram
+    __up_down_counter_instance: UpDownCounter
 
     def __init__(self):
         exporter = PrometheusMetricReader("")
@@ -60,6 +64,11 @@ def __init__(self):
             name=HISTOGRAM_NAME,
             description=HISTOGRAM_DESCRIPTION,
         )
+        self.__up_down_counter_instance = meter.create_up_down_counter(
+            name=BUILD_INFO_NAME,
+            description=BUILD_INFO_DESCRIPTION,
+        )
+        self._has_set_build_info = False
 
     def __count(
         self,
@@ -116,6 +125,17 @@ def __histogram(
             },
         )
 
+    def set_build_info(self, commit: str, version: str):
+        if not self._has_set_build_info:
+            self._has_set_build_info = True
+            self.__up_down_counter_instance.add(
+                1.0,
+                attributes={
+                    "commit": commit,
+                    "version": version,
+                },
+            )
+
     def finish(
         self,
         start_time: float,
diff --git a/src/autometrics/tracker/prometheus.py b/src/autometrics/tracker/prometheus.py
@@ -1,16 +1,20 @@
 import time
 from typing import Optional
-from prometheus_client import Counter, Histogram
+from prometheus_client import Counter, Histogram, Gauge
 from .tracker import Result
 
 from ..constants import (
     COUNTER_NAME_PROMETHEUS,
     HISTOGRAM_NAME_PROMETHEUS,
+    BUILD_INFO_NAME,
     COUNTER_DESCRIPTION,
     HISTOGRAM_DESCRIPTION,
+    BUILD_INFO_DESCRIPTION,
     OBJECTIVE_NAME_PROMETHEUS,
     OBJECTIVE_PERCENTILE_PROMETHEUS,
     OBJECTIVE_LATENCY_THRESHOLD_PROMETHEUS,
+    COMMIT_KEY,
+    VERSION_KEY,
 )
 from ..objectives import Objective
 
@@ -41,6 +45,12 @@ class PrometheusTracker:
             OBJECTIVE_LATENCY_THRESHOLD_PROMETHEUS,
         ],
     )
+    prom_gauge = Gauge(
+        BUILD_INFO_NAME, BUILD_INFO_DESCRIPTION, [COMMIT_KEY, VERSION_KEY]
+    )
+
+    def __init__(self) -> None:
+        self._has_set_build_info = False
 
     def _count(
         self,
@@ -93,6 +103,11 @@ def _histogram(
             threshold,
         ).observe(duration)
 
+    def set_build_info(self, commit: str, version: str):
+        if not self._has_set_build_info:
+            self._has_set_build_info = True
+            self.prom_gauge.labels(commit, version).set(1)
+
     # def start(self, function: str = None, module: str = None):
     #     """Start tracking metrics for a function call."""
     #     pass
diff --git a/src/autometrics/tracker/test_tracker.py b/src/autometrics/tracker/test_tracker.py
@@ -1,7 +1,10 @@
+from prometheus_client.exposition import generate_latest
+import pytest
+
 from .opentelemetry import OpenTelemetryTracker
 from .prometheus import PrometheusTracker
 
-from .tracker import default_tracker
+from .tracker import default_tracker, init_tracker, TrackerType
 
 
 def test_default_tracker(monkeypatch):
@@ -22,3 +25,55 @@ def test_default_tracker(monkeypatch):
     monkeypatch.setenv("AUTOMETRICS_TRACKER", "something_else")
     tracker = default_tracker()
     assert isinstance(tracker, OpenTelemetryTracker)
+
+
+def test_init_prometheus_tracker_set_build_info(monkeypatch):
+    """Test that init_tracker (for a Prometheus tracker) calls set_build_info using env vars."""
+
+    commit = "d6abce3"
+    version = "1.0.1"
+
+    monkeypatch.setenv("AUTOMETRICS_COMMIT", commit)
+    monkeypatch.setenv("AUTOMETRICS_VERSION", version)
+
+    prom_tracker = init_tracker(TrackerType.PROMETHEUS)
+    assert isinstance(prom_tracker, PrometheusTracker)
+
+    blob = generate_latest()
+    assert blob is not None
+    data = blob.decode("utf-8")
+
+    prom_build_info = f"""build_info{{commit="{commit}",version="{version}"}} 1.0"""
+    assert prom_build_info in data
+
+    monkeypatch.delenv("AUTOMETRICS_VERSION", raising=False)
+    monkeypatch.delenv("AUTOMETRICS_COMMIT", raising=False)
+
+
+def test_init_otel_tracker_set_build_info(monkeypatch):
+    """
+    Test that init_tracker (for an OTEL tracker) calls set_build_info using env vars.
+    Note that the OTEL collector translates metrics to Prometheus.
+    """
+    pytest.skip(
+        "Skipping test because OTEL collector does not create a gauge when it translates UpDownCounter to Prometheus"
+    )
+
+    commit = "a29a178"
+    version = "0.0.1"
+
+    monkeypatch.setenv("AUTOMETRICS_COMMIT", commit)
+    monkeypatch.setenv("AUTOMETRICS_VERSION", version)
+
+    otel_tracker = init_tracker(TrackerType.OPENTELEMETRY)
+    assert isinstance(otel_tracker, OpenTelemetryTracker)
+
+    blob = generate_latest()
+    assert blob is not None
+    data = blob.decode("utf-8")
+
+    prom_build_info = f"""build_info{{commit="{commit}",version="{version}"}} 1.0"""
+    assert prom_build_info in data
+
+    monkeypatch.delenv("AUTOMETRICS_VERSION", raising=False)
+    monkeypatch.delenv("AUTOMETRICS_COMMIT", raising=False)
diff --git a/src/autometrics/tracker/tracker.py b/src/autometrics/tracker/tracker.py
@@ -16,6 +16,9 @@ class Result(Enum):
 class TrackMetrics(Protocol):
     """Protocol for tracking metrics."""
 
+    def set_build_info(self, commit: str, version: str):
+        """Observe the build info. Should only be called once per tracker instance"""
+
     def finish(
         self,
         start_time: float,
@@ -35,18 +38,28 @@ class TrackerType(Enum):
     PROMETHEUS = "prometheus"
 
 
-def create_tracker(tracker_type: TrackerType) -> TrackMetrics:
+def init_tracker(tracker_type: TrackerType) -> TrackMetrics:
     """Create a tracker"""
+
+    tracker_instance: TrackMetrics
     if tracker_type == TrackerType.OPENTELEMETRY:
         # pylint: disable=import-outside-toplevel
         from .opentelemetry import OpenTelemetryTracker
 
-        return OpenTelemetryTracker()
+        tracker_instance = OpenTelemetryTracker()
     elif tracker_type == TrackerType.PROMETHEUS:
         # pylint: disable=import-outside-toplevel
         from .prometheus import PrometheusTracker
 
-        return PrometheusTracker()
+        tracker_instance = PrometheusTracker()
+
+    # NOTE - Only set the build info when the tracker is initialized
+    tracker_instance.set_build_info(
+        commit=os.getenv("AUTOMETRICS_COMMIT") or "",
+        version=os.getenv("AUTOMETRICS_VERSION") or "",
+    )
+
+    return tracker_instance
 
 
 def get_tracker_type() -> TrackerType:
@@ -60,7 +73,7 @@ def get_tracker_type() -> TrackerType:
 def default_tracker():
     """Setup the default tracker."""
     preferred_tracker = get_tracker_type()
-    return create_tracker(preferred_tracker)
+    return init_tracker(preferred_tracker)
 
 
 tracker: TrackMetrics = default_tracker()
@@ -74,4 +87,4 @@ def get_tracker() -> TrackMetrics:
 def set_tracker(tracker_type: TrackerType):
     """Set the tracker type."""
     global tracker
-    tracker = create_tracker(tracker_type)
+    tracker = init_tracker(tracker_type)