Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: allow merging multiple simpleapi indexes #2642

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 31 additions & 1 deletion python/private/pypi/extension.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,7 @@ You cannot use both the additive_build_content and additive_build_content_file a
index_url = pip_attr.experimental_index_url,
extra_index_urls = pip_attr.experimental_extra_index_urls or [],
index_url_overrides = pip_attr.experimental_index_url_overrides or {},
index_strategy = pip_attr.index_strategy,
sources = distributions,
envsubst = pip_attr.envsubst,
# Auth related info
Expand Down Expand Up @@ -681,27 +682,41 @@ stable.

This is equivalent to `--index-url` `pip` option.

:::{warning}
`rules_python` will fall back to using `pip` to download wheels if the requirements
files do not have hashes.
:::

:::{versionchanged} 0.37.0
If {attr}`download_only` is set, then `sdist` archives will be discarded and `pip.parse` will
operate in wheel-only mode.
:::
""",
),
"experimental_index_url_overrides": attr.string_dict(
# TODO @aignas 2025-03-01: consider using string_list_dict so that
# we could have index_url_overrides per package for different
# platforms like what `uv` has.
# See https://docs.astral.sh/uv/configuration/indexes/#-index-url-and-extra-index-url
doc = """\
The index URL overrides for each package to use for downloading wheels using
bazel downloader. This value is going to be subject to `envsubst` substitutions
if necessary.

The key is the package name (will be normalized before usage) and the value is the
index URL.
index URLs separated with `,`.

This design pattern has been chosen in order to be fully deterministic about which
packages come from which source. We want to avoid issues similar to what happened in
https://pytorch.org/blog/compromised-nightly-dependency/.

The indexes must support Simple API as described here:
https://packaging.python.org/en/latest/specifications/simple-repository-api/

:::{versionchanged} VERSION_NEXT_PATCH
This can contain comma separated values per package to allow `torch` to be
resolved from multiple sources.
:::
""",
),
"hub_name": attr.string(
Expand All @@ -724,6 +739,21 @@ is not required. Each hub is a separate resolution of pip dependencies. This
means if different programs need different versions of some library, separate
hubs can be created, and each program can use its respective hub's targets.
Targets from different hubs should not be used together.
""",
),
"index_strategy": attr.string(
default = "first-index",
values = ["first-index", "unsafe"],
doc = """\
The strategy used when fetching package locations from indexes. This is to allow fetching
`torch` from both the `torch`-maintained index and PyPI, so that on different platforms
users can have different torch versions (e.g. gpu accelerated on linux and cpu on the
rest of the platforms).

See https://docs.astral.sh/uv/configuration/indexes/#searching-across-multiple-indexes.

:::{versionadded} VERSION_NEXT_PATCH
:::
""",
),
"parallel_download": attr.bool(
Expand Down
117 changes: 82 additions & 35 deletions python/private/pypi/simpleapi_download.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def simpleapi_download(
parallel_download = True,
read_simpleapi = None,
get_auth = None,
_print = print,
_fail = fail):
"""Download Simple API HTML.

Expand All @@ -43,6 +44,8 @@ def simpleapi_download(
separate packages.
* extra_index_urls: Extra index URLs that will be looked up after
the main is looked up.
* index_strategy: The string identifier representing the strategy
used here. Can be either "first-index" or "unsafe".
* sources: list[str], the sources to download things for. Each value is
the contents of requirements files.
* envsubst: list[str], the envsubst vars for performing substitution in index url.
Expand All @@ -61,6 +64,7 @@ def simpleapi_download(
read_simpleapi: a function for reading and parsing of the SimpleAPI contents.
Used in tests.
get_auth: A function to get auth information passed to read_simpleapi. Used in tests.
_print: a function to print. Used in tests.
_fail: a function to print a failure. Used in tests.

Returns:
Expand All @@ -71,6 +75,9 @@ def simpleapi_download(
for p, i in (attr.index_url_overrides or {}).items()
}

if attr.index_strategy not in ["unsafe", "first-index"]:
fail("TODO")

download_kwargs = {}
if bazel_features.external_deps.download_has_block_param:
download_kwargs["block"] = not parallel_download
Expand All @@ -80,68 +87,108 @@ def simpleapi_download(
contents = {}
index_urls = [attr.index_url] + attr.extra_index_urls
read_simpleapi = read_simpleapi or _read_simpleapi
sources = {
pkg: normalize_name(pkg)
for pkg in attr.sources
}

found_on_index = {}
found_on_indexes = {}
warn_overrides = False
for i, index_url in enumerate(index_urls):
if i != 0:
# Warn the user about a potential fix for the overrides
warn_overrides = True

async_downloads = {}
sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
for pkg in sources:
for pkg, pkg_normalized in sources.items():
if pkg not in found_on_indexes:
# We have not found the pkg yet, let's search for it
pass
elif "first-index" == attr.index_strategy and pkg in found_on_indexes:
# We have found it and we are using a safe strategy, let's not
# search anymore.
continue
elif pkg in found_on_indexes and pkg_normalized in index_url_overrides:
# This pkg has been overridden, be strict and use `first-index` strategy
# implicitly.
continue
elif "unsafe" in attr.index_strategy:
# We can search for the packages
pass
else:
fail("BUG: Unknown state of searching of packages")

pkg_normalized = normalize_name(pkg)
result = read_simpleapi(
ctx = ctx,
url = "{}/{}/".format(
index_url_overrides.get(pkg_normalized, index_url).rstrip("/"),
pkg,
),
attr = attr,
cache = cache,
get_auth = get_auth,
**download_kwargs
)
if hasattr(result, "wait"):
# We will process it in a separate loop:
async_downloads[pkg] = struct(
pkg_normalized = pkg_normalized,
wait = result.wait,
override_urls = index_url_overrides.get(pkg_normalized, index_url)
for url in override_urls.split(","):
result = read_simpleapi(
ctx = ctx,
url = "{}/{}/".format(
url.rstrip("/"),
pkg,
),
attr = attr,
cache = cache,
get_auth = get_auth,
**download_kwargs
)
elif result.success:
contents[pkg_normalized] = result.output
found_on_index[pkg] = index_url
if hasattr(result, "wait"):
# We will process it in a separate loop:
async_downloads.setdefault(pkg, []).append(
struct(
pkg_normalized = pkg_normalized,
wait = result.wait,
),
)
elif result.success:
current = contents.get(
pkg_normalized,
struct(sdists = {}, whls = {}),
)
contents[pkg_normalized] = struct(
# Always prefer the current values, so that the first index wins
sdists = result.output.sdists | current.sdists,
whls = result.output.whls | current.whls,
)
found_on_indexes.setdefault(pkg, []).append(url)

if not async_downloads:
continue

# If we use `block` == False, then we need to have a second loop that is
# collecting all of the results as they were being downloaded in parallel.
for pkg, download in async_downloads.items():
result = download.wait()

if result.success:
contents[download.pkg_normalized] = result.output
found_on_index[pkg] = index_url

failed_sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
for pkg, downloads in async_downloads.items():
for download in downloads:
result = download.wait()

if result.success:
current = contents.get(
download.pkg_normalized,
struct(sdists = {}, whls = {}),
)
contents[download.pkg_normalized] = struct(
# Always prefer the current values, so that the first index wins
sdists = result.output.sdists | current.sdists,
whls = result.output.whls | current.whls,
)
found_on_indexes.setdefault(pkg, []).append(index_url)

failed_sources = [pkg for pkg in attr.sources if pkg not in found_on_indexes]
if failed_sources:
_fail("Failed to download metadata for {} for from urls: {}".format(
_fail("Failed to download metadata for {} from urls: {}".format(
failed_sources,
index_urls,
))
return None

if warn_overrides:
index_url_overrides = {
pkg: found_on_index[pkg]
pkg: ",".join(found_on_indexes[pkg])
for pkg in attr.sources
if found_on_index[pkg] != attr.index_url
if found_on_indexes[pkg] != attr.index_url
}

# buildifier: disable=print
print("You can use the following `index_url_overrides` to avoid the 404 warnings:\n{}".format(
_print("You can use the following `index_url_overrides` to avoid the 404 warnings:\n{}".format(
render.dict(index_url_overrides),
))

Expand Down
Loading
Loading