# python/private/pypi/simpleapi_download.bzl

# Copyright 2024 The Bazel Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
A file that houses private functions used in the `bzlmod` extension with the same name.
"""

load("@bazel_features//:features.bzl", "bazel_features")
load("//python/private:auth.bzl", _get_auth = "get_auth")
load("//python/private:envsubst.bzl", "envsubst")
load("//python/private:normalize_name.bzl", "normalize_name")
load("//python/private:text_util.bzl", "render")
load(":parse_simpleapi_html.bzl", "parse_simpleapi_html")

def simpleapi_download(
        ctx,
        *,
        attr,
        cache,
        parallel_download = True,
        read_simpleapi = None,
        get_auth = None,
        _print = print,
        _fail = fail):
    """Download Simple API HTML.

    Args:
        ctx: The module_ctx or repository_ctx.
        attr: Contains the parameters for the download. They are grouped into a
          struct for better clarity. It must have attributes:
           * index_url: str, the index.
           * index_url_overrides: dict[str, str], the index overrides for
             separate packages. A value may hold several URLs separated by
             commas; each of them is queried.
           * extra_index_urls: Extra index URLs that will be looked up after
             the main is looked up.
           * index_strategy: The string identifier representing the strategy
             used here. Can be either "first-index" or "unsafe".
           * sources: list[str], the package names to download the metadata
             for. Each value is appended to the index URL as is.
           * envsubst: list[str], the envsubst vars for performing substitution in index url.
           * netrc: The netrc parameter for ctx.download, see http_file for docs.
           * auth_patterns: The auth_patterns parameter for ctx.download, see
               http_file for docs.
        cache: A dictionary that can be used as a cache between calls during a
            single evaluation of the extension. We use a dictionary as a cache
            so that we can reuse calls to the simple API when evaluating the
            extension. Using the canonical_id parameter of the module_ctx would
            deposit the simple API responses to the bazel cache and that is
            undesirable because additions to the PyPI index would not be
            reflected when re-evaluating the extension unless we do
            `bazel clean --expunge`.
        parallel_download: A boolean to enable usage of bazel 7.1 non-blocking downloads.
        read_simpleapi: a function for reading and parsing of the SimpleAPI contents.
            Used in tests.
        get_auth: A function to get auth information passed to read_simpleapi. Used in tests.
        _print: a function to print. Used in tests.
        _fail: a function to print a failure. Used in tests.

    Returns:
        dict of normalized pkg name to a struct with `sdists` and `whls` dicts
        merged from every index that was consulted for that package. `None`
        is returned if `_fail` was called and it did not stop the evaluation
        (this is only possible with a custom `_fail`).
    """
    index_url_overrides = {
        normalize_name(p): i
        for p, i in (attr.index_url_overrides or {}).items()
    }

    supported_strategies = ["unsafe", "first-index"]
    if attr.index_strategy not in supported_strategies:
        _fail("Unsupported index_strategy: '{}', expected one of: {}".format(
            attr.index_strategy,
            supported_strategies,
        ))
        return None

    download_kwargs = {}
    if bazel_features.external_deps.download_has_block_param:
        download_kwargs["block"] = not parallel_download

    # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
    # to replicate how `pip` would handle this case.
    contents = {}
    index_urls = [attr.index_url] + attr.extra_index_urls
    read_simpleapi = read_simpleapi or _read_simpleapi
    sources = {
        pkg: normalize_name(pkg)
        for pkg in attr.sources
    }

    # pkg (as spelled in attr.sources) -> list of index URLs that served it.
    found_on_indexes = {}

    # As soon as more than one index is consulted we can tell the user which
    # overrides would let them avoid the lookups that are bound to fail.
    warn_overrides = len(index_urls) > 1

    for index_url in index_urls:
        async_downloads = {}
        for pkg, pkg_normalized in sources.items():
            # A package that has already been found is searched for again
            # only with the "unsafe" strategy and only if it has no explicit
            # override - overridden packages implicitly use "first-index".
            if pkg in found_on_indexes and (
                attr.index_strategy == "first-index" or
                pkg_normalized in index_url_overrides
            ):
                continue

            override_urls = index_url_overrides.get(pkg_normalized, index_url)
            for url in override_urls.split(","):
                result = read_simpleapi(
                    ctx = ctx,
                    url = "{}/{}/".format(
                        url.rstrip("/"),
                        pkg,
                    ),
                    attr = attr,
                    cache = cache,
                    get_auth = get_auth,
                    **download_kwargs
                )
                if hasattr(result, "wait"):
                    # We will process it in a separate loop. Remember the URL
                    # that was actually queried so that the bookkeeping is
                    # the same as for blocking downloads.
                    async_downloads.setdefault(pkg, []).append(
                        struct(
                            pkg_normalized = pkg_normalized,
                            url = url,
                            wait = result.wait,
                        ),
                    )
                elif result.success:
                    _merge_index_result(
                        contents,
                        found_on_indexes,
                        pkg,
                        pkg_normalized,
                        url,
                        result.output,
                    )

        # If we use `block` == False, then we need to have a second loop that is
        # collecting all of the results as they were being downloaded in parallel.
        for pkg, downloads in async_downloads.items():
            for download in downloads:
                result = download.wait()
                if result.success:
                    _merge_index_result(
                        contents,
                        found_on_indexes,
                        pkg,
                        download.pkg_normalized,
                        download.url,
                        result.output,
                    )

    failed_sources = [pkg for pkg in attr.sources if pkg not in found_on_indexes]
    if failed_sources:
        _fail("Failed to download metadata for {} from urls: {}".format(
            failed_sources,
            index_urls,
        ))
        return None

    if warn_overrides:
        # Only packages that were not served exclusively by the main index
        # need an override. `found_on_indexes` values are lists, hence the
        # comparison against a single-element list.
        suggested_overrides = {
            pkg: ",".join(found_on_indexes[pkg])
            for pkg in attr.sources
            if found_on_indexes[pkg] != [attr.index_url]
        }

        if suggested_overrides:
            _print("You can use the following `index_url_overrides` to avoid the 404 warnings:\n{}".format(
                render.dict(suggested_overrides),
            ))

    return contents

def _merge_index_result(contents, found_on_indexes, pkg, pkg_normalized, url, output):
    """Merge one successful index response into the accumulated results.

    Args:
        contents: dict, modified in place. Maps the normalized package name to
            a struct with `sdists` and `whls` dicts.
        found_on_indexes: dict, modified in place. Maps the package name to the
            list of index URLs that returned metadata for it.
        pkg: str, the package name used as the key in `found_on_indexes`.
        pkg_normalized: str, the normalized name used as the key in `contents`.
        url: str, the index URL that the response came from.
        output: the parsed response, must have `sdists` and `whls` dicts.
    """
    current = contents.get(
        pkg_normalized,
        struct(sdists = {}, whls = {}),
    )
    contents[pkg_normalized] = struct(
        # On key clashes the right-hand operand of `|` wins, so the values
        # that were recorded earlier are kept, i.e. the first index wins.
        sdists = output.sdists | current.sdists,
        whls = output.whls | current.whls,
    )
    found_on_indexes.setdefault(pkg, []).append(url)

def _read_simpleapi(ctx, url, attr, cache, get_auth = None, **download_kwargs):
    """Fetch and parse a single SimpleAPI page.

    Args:
        ctx: The module_ctx or repository_ctx.
        url: str, the url of the page; it may contain envsubst placeholders.
        attr: The attribute that contains necessary info for downloading. The
          following attributes must be present:
           * envsubst: The envsubst values for performing substitutions in the URL.
           * netrc: The netrc parameter for ctx.download, see http_file for docs.
           * auth_patterns: The auth_patterns parameter for ctx.download, see
               http_file for docs.
        cache: A dict for storing the parsed results, keyed by the substituted URL.
        get_auth: A function to get auth information. Used in tests.
        **download_kwargs: Any extra params to ctx.download.
            Note that output and auth will be passed for you.

    Returns:
        On a cache hit or for a blocking download: a struct with `success` and,
        when successful, `output` holding the parsed simple api contents.
        When `block = False` is passed: a struct with a `wait` function that
        returns such a struct once the download has finished.
    """
    # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for
    # the whl location and we cannot handle multiple URLs at once by passing
    # them to ctx.download if we want to correctly handle the relative URLs.
    # TODO: Add a test that env subbed index urls do not leak into the lock file.

    getenv = ctx.getenv if hasattr(ctx, "getenv") else ctx.os.environ.get
    real_url = strip_empty_path_segments(envsubst(url, attr.envsubst, getenv))

    cache_key = real_url
    if cache_key in cache:
        return struct(success = True, output = cache[cache_key])

    # Derive the output file name from the URL with the env var *names*
    # substituted instead of their values - this will be unique over the
    # lifetime of the execution of this function and `~` is used as the
    # separator to ensure that we don't get clashes.
    placeholders = {name: "~{}~".format(name) for name in attr.envsubst}
    filename = envsubst(url, attr.envsubst, placeholders.get)

    # Transform the URL into a valid filename
    for char in [".", ":", "/", "\\", "-"]:
        filename = filename.replace(char, "_")
    output = ctx.path(filename.strip("_").lower() + ".html")

    resolve_auth = get_auth or _get_auth

    # NOTE: this may have block = True or block = False in the download_kwargs
    download = ctx.download(
        url = [real_url],
        output = output,
        auth = resolve_auth(ctx, [real_url], ctx_attr = attr),
        allow_fail = True,
        **download_kwargs
    )

    if download_kwargs.get("block") != False:
        # Blocking download (or `block` is not supported): parse right away.
        return _read_index_result(ctx, download, output, real_url, cache, cache_key)

    # Simulate the same API as ctx.download has
    return struct(
        wait = lambda: _read_index_result(ctx, download.wait(), output, real_url, cache, cache_key),
    )

def strip_empty_path_segments(url):
    """Drops empty path segments (e.g. `//`) from the part of a URL after the scheme.

    Public only for testing.

    Args:
        url: The url to clean up.

    Returns:
        The url rebuilt as `<scheme>://<non-empty segments joined with "/">`,
        with a trailing slash appended if the input ended with one. If there
        is nothing after a `://` separator (which includes urls that have no
        scheme at all) the url is returned unchanged.
    """
    scheme, _, remainder = url.partition("://")
    if not remainder:
        return url

    segments = [segment for segment in remainder.split("/") if segment]
    cleaned = scheme + "://" + "/".join(segments)
    return cleaned + "/" if url.endswith("/") else cleaned

def _read_index_result(ctx, result, output, url, cache, cache_key):
    """Parse a finished SimpleAPI download and store the result in the cache.

    Args:
        ctx: The context that is used to read the downloaded file.
        result: The download result, must have a `success` attribute.
        output: The path that the page was downloaded to.
        url: str, the URL of the page, passed on to the HTML parser.
        cache: A dict for storing the parsed results.
        cache_key: The key to store the parsed result under.

    Returns:
        `struct(success = True, output = <parsed>, cache_key = cache_key)` if
        the download succeeded and parsing gave a truthy value, otherwise
        `struct(success = False)`.
    """
    if result.success:
        parsed = parse_simpleapi_html(url = url, content = ctx.read(output))
        if parsed:
            # Do not overwrite an entry that is already in the cache.
            cache.setdefault(cache_key, parsed)
            return struct(success = True, output = parsed, cache_key = cache_key)

    return struct(success = False)