Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions .github/workflows/registry-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,11 @@ jobs:
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
# Tags drive the phantom-version filter in extract_metadata.py
# (only versions with a real `providers-<id>/<ver>` tag are
# treated as released). Without this, the filter silently
# falls back to `versions[0]` and ships phantom versions.
fetch-tags: true

- name: "Prepare breeze & CI image"
uses: ./.github/actions/prepare_breeze_and_image
Expand Down Expand Up @@ -176,11 +181,20 @@ jobs:
- name: "Extract registry data (breeze)"
env:
PROVIDER: ${{ inputs.provider }}
DESTINATION: ${{ inputs.destination }}
run: |
# Staging dispatches preview unreleased providers (maintainers want to
# verify newly-bumped versions look right before tagging). Live builds
# filter them so the production registry never ships pointers to
# non-existent PyPI releases / GitHub tags / docs pages.
ALLOW_UNRELEASED=""
if [[ "${DESTINATION}" == "staging" ]]; then
ALLOW_UNRELEASED="--allow-unreleased"
fi
if [[ -n "${PROVIDER}" ]]; then
breeze registry extract-data --python 3.12 --provider "${PROVIDER}"
breeze registry extract-data --python 3.12 --provider "${PROVIDER}" ${ALLOW_UNRELEASED}
else
breeze registry extract-data --python 3.12
breeze registry extract-data --python 3.12 ${ALLOW_UNRELEASED}
fi

# --- Incremental: merge new data with existing ---
Expand Down
37 changes: 25 additions & 12 deletions dev/breeze/doc/images/output_registry_extract-data.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion dev/breeze/doc/images/output_registry_extract-data.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
602ea508f9bcf0d5c2f97a220f5ee6d2
5ece18e98af19619094e0ee3c439b73b
19 changes: 17 additions & 2 deletions dev/breeze/src/airflow_breeze/commands/registry_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,20 @@ def registry_group():
default=None,
help="Extract only this provider ID (e.g. 'amazon'). Omit for full build.",
)
@click.option(
"--allow-unreleased",
is_flag=True,
default=False,
help=(
"Include providers and versions that don't have a matching "
"providers-<id>/<ver> git tag. Use for staging builds and local dev "
"where maintainers want to preview unreleased provider pages before "
"the tag lands. Forwarded to extract_metadata.py."
),
)
@option_verbose
@option_dry_run
def extract_data(python: str, provider: str | None):
def extract_data(python: str, provider: str | None, allow_unreleased: bool):
unique_project_name = f"breeze-registry-{uuid.uuid4().hex[:8]}"

shell_params = ShellParams(
Expand All @@ -88,9 +99,13 @@ def extract_data(python: str, provider: str | None):
install_cmd = f"pip install --quiet {' '.join(suspended_packages)} && " if suspended_packages else ""

provider_flag = f" --provider '{provider}'" if provider else ""
# --allow-unreleased only applies to extract_metadata.py (which owns the
# version filter). The other two scripts read from providers.json and
# don't need it.
metadata_extra = " --allow-unreleased" if allow_unreleased else ""
command = (
f"{install_cmd}"
f"python dev/registry/extract_metadata.py{provider_flag} && "
f"python dev/registry/extract_metadata.py{provider_flag}{metadata_extra} && "
f"python dev/registry/extract_parameters.py{provider_flag} && "
f"python dev/registry/extract_connections.py{provider_flag}"
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
"options": [
"--python",
"--provider",
"--allow-unreleased",
],
},
],
Expand Down
109 changes: 106 additions & 3 deletions dev/registry/extract_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import json
import re
import shutil
import subprocess
import urllib.request
import zlib
from dataclasses import asdict, dataclass, field
Expand Down Expand Up @@ -363,6 +364,48 @@ def find_related_providers(provider_id: str, all_provider_yamls: dict[str, dict]
return related[:5] # Limit to 5 related providers


def load_release_tags() -> set[str]:
"""Return all ``providers-<id>/<version>`` git tags as a set for fast lookup.

Used to filter ``provider.yaml`` ``versions:`` lists to only entries that
correspond to a real release (excludes phantom version bumps where the
next-version entry was prepended to ``versions:`` before the tag landed,
or pre-release-only versions like ``providers-celery/3.19.0rc1`` where the
``rc1`` exists but the final does not).

Returns an empty set if the ``git`` command fails (e.g., outside a checkout);
callers can decide whether to fall back to the unfiltered top entry.
"""
try:
result = subprocess.run(
["git", "tag", "--list", "providers-*"],
capture_output=True,
text=True,
cwd=AIRFLOW_ROOT,
check=True,
)
except (subprocess.CalledProcessError, FileNotFoundError):
return set()
return {line.strip() for line in result.stdout.splitlines() if line.strip()}


def find_latest_released_version(
provider_id: str,
versions_list: list[str],
release_tags: set[str],
) -> str | None:
"""Walk ``versions_list`` newest-first, return the first version with a real release tag.

Returns ``None`` when no entry in ``versions_list`` has a corresponding
``providers-<id>/<version>`` tag, indicating the provider is unreleased
(brand-new in-tree, no tags yet) or in an inconsistent state.
"""
for version in versions_list:
if f"providers-{provider_id}/{version}" in release_tags:
return version
return None


def main():
"""Main extraction function."""
import argparse
Expand All @@ -373,6 +416,17 @@ def main():
default=None,
help="Extract only this provider ID (e.g. 'amazon'). Omit for full build.",
)
parser.add_argument(
"--allow-unreleased",
action="store_true",
help=(
"Include providers and versions that don't have a matching "
"providers-<id>/<ver> git tag. Use for staging builds and local dev "
"where maintainers want to preview unreleased provider pages before "
"the tag lands. Default is to filter unreleased entries so live "
"builds don't ship phantom pointers."
),
)
args = parser.parse_args()

print("Airflow Registry Metadata Extractor")
Expand All @@ -382,6 +436,8 @@ def main():
print(f"Incremental mode: extracting provider(s) {requested_providers}")
else:
requested_providers = None
if args.allow_unreleased:
print("Unreleased providers: INCLUDED (--allow-unreleased)")

# Ensure output directory exists
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
Expand Down Expand Up @@ -419,6 +475,27 @@ def main():
else:
extraction_ids = set(all_provider_yamls.keys())

# Load all release tags once. Used below to filter `provider.yaml`'s
# `versions:` to only entries that have a real `providers-<id>/<ver>`
# git tag, avoiding phantom-version leaks (next-release bumps prepended
# to `versions:` before the tag lands, RC-only releases, brand-new
# providers with no tags yet).
#
# Skipped entirely when --allow-unreleased is set: staging builds and
# local dev want to preview unreleased provider pages so maintainers
# can verify them before tagging.
if args.allow_unreleased:
release_tags: set[str] = set()
else:
release_tags = load_release_tags()
if not release_tags:
print(
" Warning: no providers-* git tags found; "
"phantom version filter is disabled (falling back to versions[0]). "
"If this is a CI run, ensure the checkout step uses fetch-tags: true."
)
skipped_unreleased: list[str] = []

# Second pass: Extract full metadata (only for providers in extraction_ids)
for provider_id in extraction_ids:
provider_yaml = all_provider_yamls[provider_id]
Expand Down Expand Up @@ -451,9 +528,29 @@ def main():
if len(description) > 200:
description = description[:197] + "..."

# Get versions
versions = provider_yaml.get("versions", [])
version = versions[0] if versions else "0.0.0"
# Get versions, filtering to entries that have a real release tag.
# Provider release prep prepends the next version to `versions:` BEFORE
# the tag lands, and pre-release-only versions match `versions:` but
# have no final tag. Without filtering, `version` (the latest pointer)
# AND the `versions` list both leak phantoms downstream -- the latter
# is consumed by extract_versions.py's backfill, which would try to
# `git show` from a non-existent tag.
raw_versions = provider_yaml.get("versions", [])
if release_tags:
versions = [v for v in raw_versions if f"providers-{provider_id}/{v}" in release_tags]
version = find_latest_released_version(provider_id, raw_versions, release_tags)
if version is None:
skipped_unreleased.append(provider_id)
print(
f" Skipping {provider_id}: no released version found in "
f"versions list {raw_versions} "
f"(no matching providers-{provider_id}/<ver> tag)"
)
continue
else:
# No tag information available -- fall back to old behaviour.
versions = list(raw_versions)
version = versions[0] if versions else "0.0.0"

# Extract categories from integrations
categories = extract_integrations_as_categories(provider_yaml)
Expand Down Expand Up @@ -607,6 +704,12 @@ def main():
all_providers.append(provider)
print(f" {provider_id}: {len(categories)} categories")

if skipped_unreleased:
print(
f"\nSkipped {len(skipped_unreleased)} unreleased provider(s) "
f"(no matching git tag): {sorted(skipped_unreleased)}"
)

# Find related providers
for provider in all_providers:
provider.related_providers = find_related_providers(provider.id, all_provider_yamls)
Expand Down
144 changes: 144 additions & 0 deletions dev/registry/tests/test_extract_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@
fetch_provider_inventory,
fetch_pypi_dates,
fetch_pypi_downloads,
find_latest_released_version,
find_related_providers,
load_release_tags,
module_path_to_file_path,
parse_pyproject_toml,
read_connection_urls,
Expand Down Expand Up @@ -568,3 +570,145 @@ def test_tableau_resolves(self):
conn_map = {"tableau": "connections/tableau.html"}
url = resolve_connection_docs_url("tableau", conn_map, self.BASE)
assert url == f"{self.BASE}/connections/tableau.html"


# ---------------------------------------------------------------------------
# load_release_tags
# ---------------------------------------------------------------------------
class TestLoadReleaseTags:
def test_parses_subprocess_output(self):
from unittest.mock import MagicMock, patch

mock_result = MagicMock()
mock_result.stdout = (
"providers-amazon/9.25.0\n"
"providers-amazon/9.26.0\n"
"providers-celery/3.18.0\n"
"providers-celery/3.19.0rc1\n"
"\n" # blank line
" providers-google/21.2.0 \n" # whitespace tolerated
)
with patch("extract_metadata.subprocess.run", return_value=mock_result) as mock_run:
tags = load_release_tags()

assert tags == {
"providers-amazon/9.25.0",
"providers-amazon/9.26.0",
"providers-celery/3.18.0",
"providers-celery/3.19.0rc1",
"providers-google/21.2.0",
}
# The git command runs against the providers-* glob
cmd = mock_run.call_args[0][0]
assert cmd[:3] == ["git", "tag", "--list"]
assert cmd[3] == "providers-*"

def test_returns_empty_set_on_subprocess_failure(self):
from subprocess import CalledProcessError
from unittest.mock import patch

with patch(
"extract_metadata.subprocess.run",
side_effect=CalledProcessError(1, ["git", "tag", "--list"]),
):
tags = load_release_tags()
assert tags == set()

def test_returns_empty_set_when_git_not_installed(self):
from unittest.mock import patch

with patch("extract_metadata.subprocess.run", side_effect=FileNotFoundError):
tags = load_release_tags()
assert tags == set()


# ---------------------------------------------------------------------------
# find_latest_released_version
# ---------------------------------------------------------------------------
class TestFindLatestReleasedVersion:
def test_returns_top_when_top_has_tag(self):
tags = {"providers-amazon/9.26.0", "providers-amazon/9.25.0"}
assert find_latest_released_version("amazon", ["9.26.0", "9.25.0"], tags) == "9.26.0"

def test_walks_past_phantom_top(self):
# celery 3.19.0 is in versions: but no final tag -- only rc1.
tags = {
"providers-celery/3.19.0rc1",
"providers-celery/3.18.0",
"providers-celery/3.17.2",
}
result = find_latest_released_version("celery", ["3.19.0", "3.18.0", "3.17.2"], tags)
assert result == "3.18.0"

def test_returns_none_when_no_versions_have_tags(self):
# akeyless: brand-new provider, listed in versions: but never tagged.
tags = {"providers-amazon/9.26.0"} # different provider
result = find_latest_released_version("akeyless", ["1.0.0"], tags)
assert result is None

def test_returns_none_for_empty_versions_list(self):
result = find_latest_released_version("amazon", [], {"providers-amazon/9.26.0"})
assert result is None

def test_rc_only_treated_as_phantom(self):
# Final 3.19.0 is missing; rc1/rc2 exist. Final must match exactly.
tags = {"providers-celery/3.19.0rc1", "providers-celery/3.19.0rc2", "providers-celery/3.18.0"}
# versions: list contains only the would-be final
result = find_latest_released_version("celery", ["3.19.0"], tags)
assert result is None
# When fallback also exists in versions, returns the fallback
result = find_latest_released_version("celery", ["3.19.0", "3.18.0"], tags)
assert result == "3.18.0"

def test_does_not_match_other_providers_tags(self):
# provider id is part of the tag prefix; pure version coincidence shouldn't match
tags = {"providers-google/9.26.0"}
result = find_latest_released_version("amazon", ["9.26.0"], tags)
assert result is None


# ---------------------------------------------------------------------------
# Filter behaviour applied to the `versions` list (not just the latest pointer)
# ---------------------------------------------------------------------------
class TestVersionsListFiltering:
"""Regression test for the bug where Provider.version (singular) was
filtered to a real release but Provider.versions (list) still contained
phantom entries. Downstream consumers like extract_versions.py read the
list and would chase non-existent backfill tags.
"""

def test_filter_drops_phantom_top_from_list(self):
# This mirrors the in-loop logic. We don't have to test main()
# end-to-end -- the filter is a single comprehension that we can
# exercise directly to lock in the contract.
provider_id = "celery"
raw_versions = ["3.19.0", "3.18.0", "3.17.2"]
release_tags = {
"providers-celery/3.19.0rc1", # not the final
"providers-celery/3.18.0",
"providers-celery/3.17.2",
}
filtered = [v for v in raw_versions if f"providers-{provider_id}/{v}" in release_tags]
assert filtered == ["3.18.0", "3.17.2"]
# And the latest pointer agrees
assert find_latest_released_version(provider_id, raw_versions, release_tags) == "3.18.0"

def test_filter_drops_unreleased_provider(self):
provider_id = "akeyless"
raw_versions = ["1.0.0"]
release_tags = {"providers-amazon/9.26.0"} # different provider
filtered = [v for v in raw_versions if f"providers-{provider_id}/{v}" in release_tags]
assert filtered == []
assert find_latest_released_version(provider_id, raw_versions, release_tags) is None

def test_filter_preserves_order(self):
provider_id = "amazon"
raw_versions = ["9.27.0", "9.26.0", "9.25.0", "9.24.0"] # 9.27.0 phantom
release_tags = {
"providers-amazon/9.26.0",
"providers-amazon/9.25.0",
"providers-amazon/9.24.0",
}
filtered = [v for v in raw_versions if f"providers-{provider_id}/{v}" in release_tags]
# Order from raw_versions is preserved; only the phantom is dropped
assert filtered == ["9.26.0", "9.25.0", "9.24.0"]
Loading