-
-
Notifications
You must be signed in to change notification settings - Fork 575
/
Copy pathsimpleapi_download.bzl
299 lines (262 loc) · 11.9 KB
/
simpleapi_download.bzl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
# Copyright 2024 The Bazel Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
A file that houses private functions used in the `bzlmod` extension with the same name.
"""
load("@bazel_features//:features.bzl", "bazel_features")
load("//python/private:auth.bzl", _get_auth = "get_auth")
load("//python/private:envsubst.bzl", "envsubst")
load("//python/private:normalize_name.bzl", "normalize_name")
load("//python/private:text_util.bzl", "render")
load(":parse_simpleapi_html.bzl", "parse_simpleapi_html")
def simpleapi_download(
        ctx,
        *,
        attr,
        cache,
        parallel_download = True,
        read_simpleapi = None,
        get_auth = None,
        _print = print,
        _fail = fail):
    """Download Simple API HTML.

    Args:
        ctx: The module_ctx or repository_ctx.
        attr: Contains the parameters for the download. They are grouped into a
            struct for better clarity. It must have attributes:
             * index_url: str, the index.
             * index_url_overrides: dict[str, str], the index overrides for
               separate packages. A value may hold several URLs separated
               by `,`; all of them are queried for that package.
             * extra_index_urls: Extra index URLs that will be looked up after
               the main is looked up.
             * index_strategy: The string identifier representing the strategy
               used here. Can be either "first-index" or "unsafe". With
               "first-index" a package is no longer searched for once it has
               been found; with "unsafe" the remaining indexes are queried
               as well and their results are merged (packages that have an
               entry in `index_url_overrides` are never searched for again
               once found).
             * sources: list[str], the package names to look up. Each value is
               appended to the index URL as-is, its normalized form is used
               as the key in the returned dict.
             * envsubst: list[str], the envsubst vars for performing substitution in index url.
             * netrc: The netrc parameter for ctx.download, see http_file for docs.
             * auth_patterns: The auth_patterns parameter for ctx.download, see
               http_file for docs.
        cache: A dictionary that can be used as a cache between calls during a
            single evaluation of the extension. We use a dictionary as a cache
            so that we can reuse calls to the simple API when evaluating the
            extension. Using the canonical_id parameter of the module_ctx would
            deposit the simple API responses to the bazel cache and that is
            undesirable because additions to the PyPI index would not be
            reflected when re-evaluating the extension unless we do
            `bazel clean --expunge`.
        parallel_download: A boolean to enable usage of bazel 7.1 non-blocking downloads.
        read_simpleapi: a function for reading and parsing of the SimpleAPI contents.
            Used in tests.
        get_auth: A function to get auth information passed to read_simpleapi. Used in tests.
        _print: a function to print. Used in tests.
        _fail: a function to print a failure. Used in tests.

    Returns:
        dict of normalized pkg name to a struct with `sdists` and `whls`
        dicts merged from every index the package was found on, or None if
        `_fail` was called and returned.
    """
    supported_strategies = ["first-index", "unsafe"]
    if attr.index_strategy not in supported_strategies:
        _fail("Unsupported index_strategy '{}', expected one of: {}".format(
            attr.index_strategy,
            supported_strategies,
        ))
        return None

    index_url_overrides = {
        normalize_name(p): i
        for p, i in (attr.index_url_overrides or {}).items()
    }

    download_kwargs = {}
    if bazel_features.external_deps.download_has_block_param:
        download_kwargs["block"] = not parallel_download

    contents = {}
    index_urls = [attr.index_url] + attr.extra_index_urls
    read_simpleapi = read_simpleapi or _read_simpleapi

    # Original pkg name -> normalized name, computed once for all indexes.
    sources = {
        pkg: normalize_name(pkg)
        for pkg in attr.sources
    }

    # pkg -> list of URLs on which the package metadata has been found.
    found_on_indexes = {}

    # Every index is always visited, so having more than one index means that
    # some lookups may have resulted in 404s that overrides could avoid.
    warn_overrides = len(index_urls) > 1

    for index_url in index_urls:
        async_downloads = {}
        for pkg, pkg_normalized in sources.items():
            if pkg in found_on_indexes:
                if attr.index_strategy == "first-index":
                    # We have found it and we are using a safe strategy,
                    # let's not search anymore.
                    continue
                if pkg_normalized in index_url_overrides:
                    # This pkg has been overridden, be strict and use the
                    # `first-index` strategy implicitly.
                    continue

                # Otherwise the strategy is "unsafe" and we keep searching
                # the remaining indexes.

            override_urls = index_url_overrides.get(pkg_normalized, index_url)
            for url in override_urls.split(","):
                result = read_simpleapi(
                    ctx = ctx,
                    url = "{}/{}/".format(
                        url.rstrip("/"),
                        pkg,
                    ),
                    attr = attr,
                    cache = cache,
                    get_auth = get_auth,
                    **download_kwargs
                )
                if hasattr(result, "wait"):
                    # We will process it in a separate loop. Remember the URL
                    # that was queried so that it is the one recorded when
                    # the download succeeds.
                    async_downloads.setdefault(pkg, []).append(
                        struct(
                            pkg_normalized = pkg_normalized,
                            url = url,
                            wait = result.wait,
                        ),
                    )
                elif result.success:
                    _record_index_result(
                        contents,
                        found_on_indexes,
                        pkg,
                        pkg_normalized,
                        url,
                        result.output,
                    )

        # If we use `block` == False, then we need to have a second loop that is
        # collecting all of the results as they were being downloaded in parallel.
        for pkg, downloads in async_downloads.items():
            for download in downloads:
                result = download.wait()
                if result.success:
                    _record_index_result(
                        contents,
                        found_on_indexes,
                        pkg,
                        download.pkg_normalized,
                        download.url,
                        result.output,
                    )

    failed_sources = [pkg for pkg in attr.sources if pkg not in found_on_indexes]
    if failed_sources:
        _fail("Failed to download metadata for {} from urls: {}".format(
            failed_sources,
            index_urls,
        ))
        return None

    if warn_overrides:
        # `found_on_indexes` values are lists, hence the comparison with a
        # single-element list: packages found only on the main index do not
        # need an override.
        suggested_overrides = {
            pkg: ",".join(found_on_indexes[pkg])
            for pkg in attr.sources
            if found_on_indexes[pkg] != [attr.index_url]
        }

        # buildifier: disable=print
        _print("You can use the following `index_url_overrides` to avoid the 404 warnings:\n{}".format(
            render.dict(suggested_overrides),
        ))

    return contents

def _record_index_result(contents, found_on_indexes, pkg, pkg_normalized, url, output):
    """Merge one successful index response into the accumulated results.

    Args:
        contents: dict of normalized pkg name to struct(sdists, whls), mutated in place.
        found_on_indexes: dict of pkg name to the list of URLs it was found on,
            mutated in place.
        pkg: str, the package name as given in the sources.
        pkg_normalized: str, the normalized package name used as the key in `contents`.
        url: str, the index URL that has been queried for this response.
        output: the parsed response; its `sdists` and `whls` dicts are read.
    """
    current = contents.get(
        pkg_normalized,
        struct(sdists = {}, whls = {}),
    )
    contents[pkg_normalized] = struct(
        # The right-hand operand wins on key clashes, so the values that are
        # already recorded are preferred and the first index wins.
        sdists = output.sdists | current.sdists,
        whls = output.whls | current.whls,
    )
    found_on_indexes.setdefault(pkg, []).append(url)
def _read_simpleapi(ctx, url, attr, cache, get_auth = None, **download_kwargs):
    """Fetch a single SimpleAPI page and parse it, consulting the cache first.

    Args:
        ctx: The module_ctx or repository_ctx.
        url: str, the url of the index page. Env var references in it are
            substituted before it is passed to ctx.download.
        attr: The attribute that contains necessary info for downloading. The
            following attributes must be present:
             * envsubst: The envsubst values for performing substitutions in the URL.
             * netrc: The netrc parameter for ctx.download, see http_file for docs.
             * auth_patterns: The auth_patterns parameter for ctx.download, see
               http_file for docs.
        cache: A dict for storing the results, keyed by the substituted URL.
        get_auth: A function to get auth information. Used in tests.
        **download_kwargs: Any extra params to ctx.download.
            Note that output and auth will be passed for you.

    Returns:
        One of the following:
         * On a cache hit - struct(success = True, output = <cached value>).
         * If `block = False` is in download_kwargs - a struct with a `wait`
           function, which waits for the download and returns the same kind
           of struct as the blocking case below.
         * Otherwise - the struct returned by _read_index_result, where
           `result.output` holds the parsed simple api contents on success.
    """

    # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for
    # the whl location and we cannot handle multiple URLs at once by passing
    # them to ctx.download if we want to correctly handle the relative URLs.
    # TODO: Add a test that env subbed index urls do not leak into the lock file.

    getenv = ctx.getenv if hasattr(ctx, "getenv") else ctx.os.environ.get
    resolved_url = strip_empty_path_segments(envsubst(url, attr.envsubst, getenv))

    # The fully substituted URL doubles as the cache key.
    if resolved_url in cache:
        return struct(success = True, output = cache[resolved_url])

    # For the output file name substitute the env var *names* (wrapped in `~`
    # to avoid clashes) instead of their values - this will be unique over
    # the lifetime of the execution of this function.
    placeholders = {name: "~{}~".format(name) for name in attr.envsubst}
    filename = envsubst(url, attr.envsubst, placeholders.get)

    # Transform the URL into a valid filename
    for unsafe_char in [".", ":", "/", "\\", "-"]:
        filename = filename.replace(unsafe_char, "_")
    output = ctx.path(filename.strip("_").lower() + ".html")

    auth_fn = get_auth or _get_auth

    # NOTE: this may have block = True or block = False in the download_kwargs
    download = ctx.download(
        url = [resolved_url],
        output = output,
        auth = auth_fn(ctx, [resolved_url], ctx_attr = attr),
        allow_fail = True,
        **download_kwargs
    )

    if download_kwargs.get("block") != False:
        # Blocking download (or `block` is not supported): parse right away.
        return _read_index_result(ctx, download, output, resolved_url, cache, resolved_url)

    # Simulate the same API as ctx.download has
    return struct(
        wait = lambda: _read_index_result(ctx, download.wait(), output, resolved_url, cache, resolved_url),
    )
def strip_empty_path_segments(url):
    """Collapses empty path segments (repeated `/`) in a URL.

    Public only for testing.

    Args:
        url: The url to remove empty path segments from

    Returns:
        The url with empty path segments removed and a single trailing slash
        kept if the input ended with one. If the url has no `://` separator,
        or nothing follows it, the url is returned unchanged.
    """
    scheme, _, remainder = url.partition("://")
    if not remainder:
        return url

    segments = []
    for segment in remainder.split("/"):
        if segment:
            segments.append(segment)

    # Joining drops the trailing slash, so pick the template that restores it.
    template = "{}://{}/" if url.endswith("/") else "{}://{}"
    return template.format(scheme, "/".join(segments))
def _read_index_result(ctx, result, output, url, cache, cache_key):
    """Turn a finished download into a parsed SimpleAPI result.

    Args:
        ctx: The module_ctx or repository_ctx, used to read the downloaded file.
        result: The download result; only its `success` attribute is read.
        output: The path that the index page has been downloaded to.
        url: str, the url that has been downloaded, passed to the HTML parser.
        cache: A dict for storing the parsed results.
        cache_key: The key to store the parsed result under. An entry that is
            already present is not overwritten.

    Returns:
        struct(success = True, output = <parsed contents>, cache_key = cache_key)
        if the download succeeded and the parser returned a truthy value,
        struct(success = False) otherwise.
    """
    if not result.success:
        return struct(success = False)

    parsed = parse_simpleapi_html(url = url, content = ctx.read(output))
    if not parsed:
        return struct(success = False)

    cache.setdefault(cache_key, parsed)
    return struct(success = True, output = parsed, cache_key = cache_key)