From d4cea676e998b291961f0f0b953a7de7b7a35940 Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Tue, 10 Mar 2026 20:05:54 +0900
Subject: [PATCH 01/14] wip
wip
---
python/private/pypi/parse_simpleapi_html.bzl | 24 ++-
python/private/pypi/simpleapi_download.bzl | 175 +++++++++++--------
2 files changed, 123 insertions(+), 76 deletions(-)
diff --git a/python/private/pypi/parse_simpleapi_html.bzl b/python/private/pypi/parse_simpleapi_html.bzl
index 563130791e..78669d5ff8 100644
--- a/python/private/pypi/parse_simpleapi_html.bzl
+++ b/python/private/pypi/parse_simpleapi_html.bzl
@@ -16,16 +16,20 @@
Parse SimpleAPI HTML in Starlark.
"""
+load("//python/private:normalize_name.bzl", "normalize_name")
load(":version_from_filename.bzl", "version_from_filename")
-def parse_simpleapi_html(*, content):
+def parse_simpleapi_html(*, content, parse_index = False):
"""Get the package URLs for given shas by parsing the Simple API HTML.
Args:
- content(str): The Simple API HTML content.
+ content: {type}`str` The Simple API HTML content.
+ parse_index: {type}`bool` whether to parse the content as the index page of the PyPI index,
+ e.g. `https://pypi.org/simple/`. Such a page only has the URLs of the individual packages.
Returns:
- A list of structs with:
+ If it is the index page, return the map of package to URL it can be queried from.
+ Otherwise, a list of structs with:
* filename: {type}`str` The filename of the artifact.
* version: {type}`str` The version of the artifact.
* url: {type}`str` The URL to download the artifact.
@@ -59,6 +63,8 @@ def parse_simpleapi_html(*, content):
# https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
fail("Unsupported API version: {}".format(api_version))
+ packages = {}
+
# 2. Iterate using find() to avoid huge list allocations from .split("<a ")
+ # "<index_url>": {
+ # "<pkg>": "<url>",
+ # }
+ # }
+ # }
+ download = read_simpleapi(
+ ctx = ctx,
+ attr = attr,
+ url = urllib.strip_empty_path_segments("{index_url}/".format(
+ index_url = index_url,
+ )),
+ parse_index = True,
+ versions = None,
+ block = block,
+ allow_fail = False,
+ **kwargs
+ )
+ if hasattr(download, "wait"):
+ downloads[index_url] = download
+ else:
+ results[index_url] = download
+
+ for index_url, download in downloads.items():
+ results[index_url] = download.wait()
+
+ found_on_index = {}
+ for index_url, result in results.items():
+ sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
+
+ available_packages = result.output
+ sources = [pkg for pkg in sources if normalize_name(pkg) in available_packages]
+ found_on_index.update({
+ pkg: urllib.absolute_url(index_url, available_packages[normalize_name(pkg)])
+ for pkg in sources
+ })
+
+ failed_sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
if failed_sources:
pkg_index_urls = {
pkg: index_url_overrides.get(
@@ -148,7 +191,7 @@ def simpleapi_download(
_fail(
"""
-Failed to download metadata of the following packages from urls:
+Failed to find packages on PyPI of the following packages from urls:
{pkg_index_urls}
If you would like to skip downloading metadata for these packages please add 'simpleapi_skip={failed_sources}' to your 'pip.parse' call.
@@ -159,22 +202,9 @@ If you would like to skip downloading metadata for these packages please add 'si
)
return None
- if warn_overrides:
- index_url_overrides = {
- pkg: found_on_index[pkg]
- for pkg in attr.sources
- if found_on_index[pkg] != attr.index_url
- }
-
- if index_url_overrides:
- # buildifier: disable=print
- print("You can use the following `index_url_overrides` to avoid the 404 warnings:\n{}".format(
- render.dict(index_url_overrides),
- ))
-
- return contents
+ return {normalize_name(pkg): url for pkg, url in found_on_index.items()}
-def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, **download_kwargs):
+def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, parse_index = False, **download_kwargs):
"""Read SimpleAPI.
Args:
@@ -189,6 +219,7 @@ def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, **download
cache: {type}`struct` the `pypi_cache` instance.
versions: {type}`list[str] The versions that have been requested.
get_auth: A function to get auth information. Used in tests.
+ parse_index: {type}`bool` whether to parse the content as the index page that lists package URLs.
**download_kwargs: Any extra params to ctx.download.
Note that output and auth will be passed for you.
@@ -242,6 +273,7 @@ def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, **download
output = output,
cache = cache,
cache_key = cache_key,
+ parse_index = parse_index,
),
)
@@ -251,15 +283,16 @@ def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, **download
output = output,
cache = cache,
cache_key = cache_key,
+ parse_index = parse_index,
)
-def _read_index_result(ctx, *, result, output, cache, cache_key):
+def _read_index_result(ctx, *, result, output, cache, cache_key, parse_index):
if not result.success:
return struct(success = False)
content = ctx.read(output)
- output = parse_simpleapi_html(content = content)
+ output = parse_simpleapi_html(content = content, parse_index = parse_index)
if output:
cache.setdefault(cache_key, output)
return struct(success = True, output = output)
From 0107a54801377f16ea1caf6e206076d201d13aec Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Fri, 20 Mar 2026 13:09:51 +0900
Subject: [PATCH 02/14] add facts
---
python/private/pypi/pypi_cache.bzl | 51 +++++++++++++++
python/private/pypi/simpleapi_download.bzl | 73 ++++++++--------------
2 files changed, 76 insertions(+), 48 deletions(-)
diff --git a/python/private/pypi/pypi_cache.bzl b/python/private/pypi/pypi_cache.bzl
index 28c6cbeafb..747bf6a7a1 100644
--- a/python/private/pypi/pypi_cache.bzl
+++ b/python/private/pypi/pypi_cache.bzl
@@ -122,6 +122,15 @@ def _filter_packages(dists, requested_versions):
if dists == None or not requested_versions:
return dists
+ if type(dists) == "dict":
+ pkgs = requested_versions
+ filtered = {
+ pkg: url
+ for pkg, url in dists.items()
+ if pkg in pkgs
+ }
+ return filtered
+
sha256s_by_version = {}
whls = {}
sdists = {}
@@ -193,6 +202,12 @@ def _get_from_facts(facts, known_facts, index_url, requested_versions, facts_ver
# cannot trust known facts, different version that we know how to parse
return None
+ if type(requested_versions) == "dict":
+ return _filter_packages(
+ dists = known_facts.get("index_urls", {}).get(index_url, {}),
+ requested_versions = requested_versions,
+ )
+
known_sources = {}
root_url, _, distribution = index_url.rstrip("/").rpartition("/")
@@ -266,10 +281,46 @@ def _store_facts(facts, fact_version, index_url, value):
facts["fact_version"] = fact_version
+ if type(value) == "dict":
+ # facts: {
+ # "index_urls": {
+ # "<index_url>": {
+ # "<pkg>": "<url>",
+ # },
+ # },
+ # },
+ for pkg, url in value.items():
+ facts.setdefault("index_urls", {}).setdefault(index_url, {}).setdefault(pkg, url)
+ return value
+
root_url, _, distribution = index_url.rstrip("/").rpartition("/")
distribution = distribution.rstrip("/")
root_url = root_url.rstrip("/")
+ # The schema is
+ # facts: {
+ # "dist_hashes": {
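+ # "<root_url>": {
+ # "<distribution>": {
+ # "<url>": "<sha256>",
+ # },
+ # },
+ # },
+ # "dist_filenames": {
+ # "<root_url>": {
+ # "<distribution>": {
+ # "<url>": "<filename>", # if it is different from the URL
+ # },
+ # },
+ # },
+ # "dist_yanked": {
+ # "<root_url>": {
+ # "<distribution>": {
+ # "<sha256>": "<yanked_reason>", # if the package is yanked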
+ # },
+ # },
+ # },
+ # },
for sha256, d in (value.sdists | value.whls).items():
facts.setdefault("dist_hashes", {}).setdefault(root_url, {}).setdefault(distribution, {}).setdefault(d.url, sha256)
if not d.url.endswith(d.filename):
diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index f15d835a48..a1dd447e1e 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -75,6 +75,11 @@ def simpleapi_download(
for p, i in (attr.index_url_overrides or {}).items()
}
+ sources = {
+ normalize_name(pkg): versions
+ for pkg, versions in attr.sources.items()
+ }
+
# NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
# to replicate how `pip` would handle this case.
contents = {}
@@ -83,8 +88,9 @@ def simpleapi_download(
dist_urls = _get_dist_urls(
ctx,
- index_urls,
- index_url_overrides,
+ index_urls = index_urls,
+ index_url_overrides = index_url_overrides,
+ sources = sources,
read_simpleapi = read_simpleapi,
cache = cache,
get_auth = get_auth,
@@ -95,11 +101,6 @@ def simpleapi_download(
ctx.report_progress("Fetch package lists from PyPI index")
- sources = {
- normalize_name(pkg): versions
- for pkg, versions in attr.sources.items()
- }
-
downloads = {}
contents = {}
for pkg, url in dist_urls.items():
@@ -125,29 +126,10 @@ def simpleapi_download(
return contents
-def _get_dist_urls(ctx, index_urls, index_url_overrides, read_simpleapi, *, attr, block, _fail = fail, **kwargs):
- if index_url_overrides:
- first_index = index_urls[0]
- return {
- pkg: urllib.strip_empty_path_segments("{index_url}/{distribution}/".format(
- index_url = index_url_overrides.get(normalize_name(pkg), first_index).rstrip("/"),
- distribution = pkg,
- ))
- for pkg in attr.sources
- }
-
+def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simpleapi, attr, block, _fail = fail, **kwargs):
downloads = {}
results = {}
for index_url in index_urls:
- # TODO @aignas 2026-03-20: pull from the cache/facts
- # we can store the following schema:
- # facts: {
- # "index_urls": {
- # "": {
- # "": "",
- # }
- # }
- # }
download = read_simpleapi(
ctx = ctx,
attr = attr,
@@ -155,7 +137,7 @@ def _get_dist_urls(ctx, index_urls, index_url_overrides, read_simpleapi, *, attr
index_url = index_url,
)),
parse_index = True,
- versions = None,
+ versions = {pkg: None for pkg in sources},
block = block,
allow_fail = False,
**kwargs
@@ -170,25 +152,25 @@ def _get_dist_urls(ctx, index_urls, index_url_overrides, read_simpleapi, *, attr
found_on_index = {}
for index_url, result in results.items():
- sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
-
- available_packages = result.output
- sources = [pkg for pkg in sources if normalize_name(pkg) in available_packages]
+ # Filter out the things that we have already found
found_on_index.update({
- pkg: urllib.absolute_url(index_url, available_packages[normalize_name(pkg)])
+ pkg: urllib.absolute_url(index_url, result.output[pkg])
for pkg in sources
})
+ sources = [
+ pkg
+ for pkg in sources
+ if pkg not in found_on_index
+ ]
- failed_sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
- if failed_sources:
+ if sources:
pkg_index_urls = {
- pkg: index_url_overrides.get(
- normalize_name(pkg),
- index_urls,
- )
- for pkg in failed_sources
+ pkg: index_url_overrides.get(pkg, index_urls)
+ for pkg in sources
}
+ # TODO @aignas 2026-03-20: we haven't found these pkgs on the index, so we can
+ # print a warning, or we can fallback to PyPI. For now let's fail
_fail(
"""
Failed to find packages on PyPI of the following packages from urls:
@@ -196,13 +178,13 @@ Failed to find packages on PyPI of the following packages from urls:
If you would like to skip downloading metadata for these packages please add 'simpleapi_skip={failed_sources}' to your 'pip.parse' call.
""".format(
- pkg_index_urls = render.dict(pkg_index_urls),
- failed_sources = render.list(failed_sources),
+ pkg_index_urls = render.dict(dict(sorted(pkg_index_urls.items()))),
+ failed_sources = render.list(sources),
),
)
return None
- return {normalize_name(pkg): url for pkg, url in found_on_index.items()}
+ return found_on_index
def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, parse_index = False, **download_kwargs):
"""Read SimpleAPI.
@@ -227,11 +209,6 @@ def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, parse_inde
A similar object to what `download` would return except that in result.out
will be the parsed simple api contents.
"""
- # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for
- # the whl location and we cannot handle multiple URLs at once by passing
- # them to ctx.download if we want to correctly handle the relative URLs.
- # TODO: Add a test that env subbed index urls do not leak into the lock file.
-
real_url = urllib.strip_empty_path_segments(envsubst(url, attr.envsubst, ctx.getenv))
cache_key = (url, real_url, versions)
From 0492f31707349406229ffc12109dd6eab82bcea5 Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Fri, 20 Mar 2026 13:48:46 +0900
Subject: [PATCH 03/14] finish POC
---
python/private/pypi/pypi_cache.bzl | 11 ++++++-----
python/private/pypi/simpleapi_download.bzl | 4 +---
2 files changed, 7 insertions(+), 8 deletions(-)
diff --git a/python/private/pypi/pypi_cache.bzl b/python/private/pypi/pypi_cache.bzl
index 747bf6a7a1..bc92de0bde 100644
--- a/python/private/pypi/pypi_cache.bzl
+++ b/python/private/pypi/pypi_cache.bzl
@@ -89,6 +89,9 @@ def _pypi_cache_get(self, key):
if not cached and versions:
# Could not get from in-memory, read from lockfile facts
cached = self._facts.get(index_url, versions)
+ else:
+ # TODO @aignas 2026-03-20: add a test here
+ self._facts.setdefault(index_url, cached)
return cached
@@ -123,13 +126,11 @@ def _filter_packages(dists, requested_versions):
return dists
if type(dists) == "dict":
- pkgs = requested_versions
- filtered = {
+ return {
pkg: url
for pkg, url in dists.items()
- if pkg in pkgs
+ if pkg in requested_versions
}
- return filtered
sha256s_by_version = {}
whls = {}
@@ -290,7 +291,7 @@ def _store_facts(facts, fact_version, index_url, value):
# },
# },
for pkg, url in value.items():
- facts.setdefault("index_urls", {}).setdefault(index_url, {}).setdefault(pkg, url)
+ facts.setdefault("index_urls", {}).setdefault(index_url, {})[pkg] = url
return value
root_url, _, distribution = index_url.rstrip("/").rpartition("/")
diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index a1dd447e1e..b8caacef82 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -82,13 +82,11 @@ def simpleapi_download(
# NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
# to replicate how `pip` would handle this case.
- contents = {}
- index_urls = [attr.index_url] + attr.extra_index_urls
read_simpleapi = read_simpleapi or _read_simpleapi
dist_urls = _get_dist_urls(
ctx,
- index_urls = index_urls,
+ index_urls = [attr.index_url] + attr.extra_index_urls,
index_url_overrides = index_url_overrides,
sources = sources,
read_simpleapi = read_simpleapi,
From b77854d0e28543f54b3763e05d5ae059581f152f Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Fri, 20 Mar 2026 16:10:07 +0900
Subject: [PATCH 04/14] remove a warning
---
python/private/pypi/simpleapi_download.bzl | 31 +++++-----------------
1 file changed, 6 insertions(+), 25 deletions(-)
diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index b8caacef82..a73dc4538a 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -19,7 +19,6 @@ A file that houses private functions used in the `bzlmod` extension with the sam
load("//python/private:auth.bzl", _get_auth = "get_auth")
load("//python/private:envsubst.bzl", "envsubst")
load("//python/private:normalize_name.bzl", "normalize_name")
-load("//python/private:text_util.bzl", "render")
load(":parse_simpleapi_html.bzl", "parse_simpleapi_html")
load(":urllib.bzl", "urllib")
@@ -150,37 +149,19 @@ def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simple
found_on_index = {}
for index_url, result in results.items():
- # Filter out the things that we have already found
- found_on_index.update({
- pkg: urllib.absolute_url(index_url, result.output[pkg])
- for pkg in sources
- })
sources = [
pkg
for pkg in sources
if pkg not in found_on_index
]
- if sources:
- pkg_index_urls = {
- pkg: index_url_overrides.get(pkg, index_urls)
+ # Filter out the things that we have already found
+ found_on_index.update({
+ pkg: urllib.absolute_url(index_url, result.output[pkg])
for pkg in sources
- }
-
- # TODO @aignas 2026-03-20: we haven't found these pkgs on the index, so we can
- # print a warning, or we can fallback to PyPI. For now let's fail
- _fail(
- """
-Failed to find packages on PyPI of the following packages from urls:
-{pkg_index_urls}
-
-If you would like to skip downloading metadata for these packages please add 'simpleapi_skip={failed_sources}' to your 'pip.parse' call.
-""".format(
- pkg_index_urls = render.dict(dict(sorted(pkg_index_urls.items()))),
- failed_sources = render.list(sources),
- ),
- )
- return None
+ # TODO @aignas 2026-03-20: add a test here
+ if index_url_overrides.get(pkg, index_url)
+ })
return found_on_index
From cb97d74fd86804ec4ab629b90f92e212ac947f46 Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Fri, 20 Mar 2026 16:21:02 +0900
Subject: [PATCH 05/14] add a test
---
python/private/pypi/pypi_cache.bzl | 1 -
python/private/pypi/simpleapi_download.bzl | 9 +++---
tests/pypi/pypi_cache/pypi_cache_tests.bzl | 33 ++++++++++++++++++++++
3 files changed, 38 insertions(+), 5 deletions(-)
diff --git a/python/private/pypi/pypi_cache.bzl b/python/private/pypi/pypi_cache.bzl
index bc92de0bde..2d2418c6ae 100644
--- a/python/private/pypi/pypi_cache.bzl
+++ b/python/private/pypi/pypi_cache.bzl
@@ -90,7 +90,6 @@ def _pypi_cache_get(self, key):
# Could not get from in-memory, read from lockfile facts
cached = self._facts.get(index_url, versions)
else:
- # TODO @aignas 2026-03-20: add a test here
self._facts.setdefault(index_url, cached)
return cached
diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index a73dc4538a..de12b9d675 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -73,16 +73,17 @@ def simpleapi_download(
normalize_name(p): i
for p, i in (attr.index_url_overrides or {}).items()
}
-
sources = {
normalize_name(pkg): versions
for pkg, versions in attr.sources.items()
}
- # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
- # to replicate how `pip` would handle this case.
read_simpleapi = read_simpleapi or _read_simpleapi
+ ctx.report_progress("Fetch package lists from PyPI index")
+
+ # NOTE: we are not merging results from multiple indexes to replicate how `pip` would
+ # handle this case. Instead, we select a single index to download each package from.
dist_urls = _get_dist_urls(
ctx,
index_urls = [attr.index_url] + attr.extra_index_urls,
@@ -96,7 +97,7 @@ def simpleapi_download(
_fail = _fail,
)
- ctx.report_progress("Fetch package lists from PyPI index")
+ ctx.report_progress("Fetching package URLs from PyPI index")
downloads = {}
contents = {}
diff --git a/tests/pypi/pypi_cache/pypi_cache_tests.bzl b/tests/pypi/pypi_cache/pypi_cache_tests.bzl
index 7b6168ce7b..3cf01c7450 100644
--- a/tests/pypi/pypi_cache/pypi_cache_tests.bzl
+++ b/tests/pypi/pypi_cache/pypi_cache_tests.bzl
@@ -155,6 +155,39 @@ def _test_pypi_cache_writes_to_facts(env):
"fact_version": "v1", # Facts version
})
+ # When we get the other items cached in memory, they get written to facts
+ got = cache.get((key[0], key[1], ["1.1.0"]))
+ got.whls().contains_exactly({
+ "sha_whl_2": fake_result.whls["sha_whl_2"],
+ })
+ got.sdists().contains_exactly({})
+ got.sha256s_by_version().contains_exactly({
+ "1.1.0": fake_result.sha256s_by_version["1.1.0"],
+ })
+
+ # Then when we get facts at the end
+ cache.get_facts().contains_exactly({
+ "dist_hashes": {
+ # We are not using the real index URL, because we may have credentials in here
+ "https://{PYPI_INDEX_URL}": {
+ "pkg": {
+ "https://pypi.org/files/pkg-1.0.0-py3-none-any.whl": "sha_whl",
+ "https://pypi.org/files/pkg-1.0.0.tar.gz": "sha_sdist",
+ "https://pypi.org/files/pkg-1.1.0-py3-none-any.whl": "sha_whl_2",
+ },
+ },
+ },
+ "dist_yanked": {
+ "https://{PYPI_INDEX_URL}": {
+ "pkg": {
+ "sha_sdist": "",
+ "sha_whl": "Security issue",
+ },
+ },
+ },
+ "fact_version": "v1", # Facts version
+ })
+
_tests.append(_test_pypi_cache_writes_to_facts)
def _test_pypi_cache_reads_from_facts(env):
From 7e71a5898037d6aee581bf039f8019c11bcbc836 Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Fri, 20 Mar 2026 16:25:15 +0900
Subject: [PATCH 06/14] add a test
---
.../parse_simpleapi_html_tests.bzl | 23 +++++++++++++++++++
1 file changed, 23 insertions(+)
diff --git a/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl b/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl
index f72d61371c..933a0783f2 100644
--- a/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl
+++ b/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl
@@ -42,6 +42,29 @@ def _generate_html(*items):
]),
)
+def _test_index(env):
+ # buildifier: disable=unsorted-dict-items
+ tests = [
+ (
+ [
+ struct(attrs = ['href="/simple/foo/"'], filename = "foo"),
+ struct(attrs = ['href="./b-ar/"'], filename = "b-._.-aR"),
+ ],
+ {
+ "b_ar": "./b-ar/",
+ "foo": "/simple/foo/",
+ },
+ ),
+ ]
+
+ for (input, want) in tests:
+ html = _generate_html(*input)
+ got = parse_simpleapi_html(content = html, parse_index = True)
+
+ env.expect.that_dict(got).contains_exactly(want)
+
+_tests.append(_test_index)
+
def _test_sdist(env):
# buildifier: disable=unsorted-dict-items
tests = [
From 1cd90d76c1fac9758776899e119bd83a9b795c23 Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Sat, 21 Mar 2026 23:42:25 +0900
Subject: [PATCH 07/14] fix index override handling and fix a few tests
---
python/private/pypi/parse_simpleapi_html.bzl | 2 +-
python/private/pypi/simpleapi_download.bzl | 38 ++--
.../simpleapi_download_tests.bzl | 162 +++++++-----------
3 files changed, 82 insertions(+), 120 deletions(-)
diff --git a/python/private/pypi/parse_simpleapi_html.bzl b/python/private/pypi/parse_simpleapi_html.bzl
index 78669d5ff8..83b006ffd2 100644
--- a/python/private/pypi/parse_simpleapi_html.bzl
+++ b/python/private/pypi/parse_simpleapi_html.bzl
@@ -132,7 +132,7 @@ def parse_simpleapi_html(*, content, parse_index = False):
else:
sdists[sha256] = dist
- if packages:
+ if parse_index:
return packages
return struct(
diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index de12b9d675..55e11e6fd6 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -110,6 +110,7 @@ def simpleapi_download(
versions = sources[pkg],
get_auth = get_auth,
block = not parallel_download,
+ parse_index = False,
)
if hasattr(result, "wait"):
# We will process it in a separate loop:
@@ -127,6 +128,10 @@ def simpleapi_download(
def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simpleapi, attr, block, _fail = fail, **kwargs):
downloads = {}
results = {}
+ for extra in index_url_overrides.values():
+ if extra not in index_urls:
+ index_urls.append(extra)
+
for index_url in index_urls:
download = read_simpleapi(
ctx = ctx,
@@ -137,7 +142,6 @@ def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simple
parse_index = True,
versions = {pkg: None for pkg in sources},
block = block,
- allow_fail = False,
**kwargs
)
if hasattr(download, "wait"):
@@ -150,23 +154,27 @@ def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simple
found_on_index = {}
for index_url, result in results.items():
- sources = [
- pkg
- for pkg in sources
- if pkg not in found_on_index
- ]
-
- # Filter out the things that we have already found
- found_on_index.update({
- pkg: urllib.absolute_url(index_url, result.output[pkg])
- for pkg in sources
- # TODO @aignas 2026-03-20: add a test here
- if index_url_overrides.get(pkg, index_url)
- })
+ for pkg in sources:
+ if pkg in found_on_index:
+ # We have already found the package, skip
+ continue
+
+ if index_url_overrides.get(pkg, index_url) != index_url:
+ # we should not use this index for the package
+ continue
+
+ if not hasattr(result.output, "get"):
+ fail(result.output)
+
+ found = result.output.get(pkg)
+ if not found:
+ continue
+
+ found_on_index[pkg] = urllib.absolute_url(index_url, found)
return found_on_index
-def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, parse_index = False, **download_kwargs):
+def _read_simpleapi(ctx, url, attr, cache, versions, parse_index, get_auth = None, **download_kwargs):
"""Read SimpleAPI.
Args:
diff --git a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
index 9a6b7ca5af..8d8a26dd4e 100644
--- a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
+++ b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
@@ -23,26 +23,30 @@ _tests = []
def _test_simple(env):
calls = []
- def read_simpleapi(ctx, url, versions, attr, cache, get_auth, block, allow_fail):
- _ = ctx, attr, cache, get_auth, versions # buildifier: disable=unused-variable
- env.expect.that_bool(block).equals(False)
- env.expect.that_bool(allow_fail).equals(True)
- calls.append(url)
- if "foo" in url and "main" in url:
+ def read_simpleapi(ctx, url, versions, attr, cache, get_auth, block, parse_index):
+ if parse_index:
return struct(
- output = "",
- success = False,
- )
- else:
- return struct(
- output = struct(
- sdists = {"deadbeef": url.strip("/").split("/")[-1]},
- whls = {"deadb33f": url.strip("/").split("/")[-1]},
- sha256s_by_version = {"fizz": url.strip("/").split("/")[-1]},
- ),
success = True,
+ output = {
+ "bar": "/bar/",
+ "baz": "/baz/",
+ } if "main" in url else {
+ "foo": "/foo/",
+ },
)
+ _ = ctx, attr, cache, get_auth, versions # buildifier: disable=unused-variable
+ env.expect.that_bool(block).equals(False)
+ calls.append(url)
+ return struct(
+ output = struct(
+ sdists = {"deadbeef": url.strip("/").split("/")[-1]},
+ whls = {"deadb33f": url.strip("/").split("/")[-1]},
+ sha256s_by_version = {"fizz": url.strip("/").split("/")[-1]},
+ ),
+ success = True,
+ )
+
contents = simpleapi_download(
ctx = struct(
getenv = {}.get,
@@ -50,8 +54,8 @@ def _test_simple(env):
),
attr = struct(
index_url_overrides = {},
- index_url = "main",
- extra_index_urls = ["extra"],
+ index_url = "https://main.com",
+ extra_index_urls = ["https://extra.com"],
sources = {"bar": None, "baz": None, "foo": None},
envsubst = [],
),
@@ -61,26 +65,25 @@ def _test_simple(env):
)
env.expect.that_collection(calls).contains_exactly([
- "extra/foo/",
- "main/bar/",
- "main/baz/",
- "main/foo/",
+ "https://extra.com/foo/",
+ "https://main.com/bar/",
+ "https://main.com/baz/",
])
env.expect.that_dict(contents).contains_exactly({
"bar": struct(
- index_url = "main/bar/",
+ index_url = "https://main.com/bar/",
sdists = {"deadbeef": "bar"},
sha256s_by_version = {"fizz": "bar"},
whls = {"deadb33f": "bar"},
),
"baz": struct(
- index_url = "main/baz/",
+ index_url = "https://main.com/baz/",
sdists = {"deadbeef": "baz"},
sha256s_by_version = {"fizz": "baz"},
whls = {"deadb33f": "baz"},
),
"foo": struct(
- index_url = "extra/foo/",
+ index_url = "https://extra.com/foo/",
sdists = {"deadbeef": "foo"},
sha256s_by_version = {"fizz": "foo"},
whls = {"deadb33f": "foo"},
@@ -89,85 +92,25 @@ def _test_simple(env):
_tests.append(_test_simple)
-def _test_fail(env):
+def _test_index_overrides(env):
calls = []
fails = []
- def read_simpleapi(ctx, url, versions, attr, cache, get_auth, block, allow_fail):
- _ = ctx, attr, cache, get_auth, versions # buildifier: disable=unused-variable
- env.expect.that_bool(block).equals(False)
- env.expect.that_bool(allow_fail).equals(True)
- calls.append(url)
- if "foo" in url:
+ def read_simpleapi(ctx, *, url, versions, attr, cache, get_auth, block, parse_index):
+ if parse_index:
return struct(
- output = "",
- success = False,
- )
- if "bar" in url:
- return struct(
- output = "",
- success = False,
- )
- else:
- return struct(
- output = struct(
- sdists = {},
- whls = {},
- sha256s_by_version = {},
- ),
success = True,
+ output = {
+ "Baz": "/baz/", # let's test normalization
+ "bar": "/bar/",
+ "foo": "/foo-should-fail/",
+ } if "main" in url else {
+ "foo": "/foo/",
+ },
)
- simpleapi_download(
- ctx = struct(
- getenv = {}.get,
- report_progress = lambda _: None,
- ),
- attr = struct(
- index_url_overrides = {},
- index_url = "main",
- extra_index_urls = ["extra"],
- sources = {"bar": None, "baz": None, "foo": None},
- envsubst = [],
- ),
- cache = pypi_cache(),
- parallel_download = True,
- read_simpleapi = read_simpleapi,
- _fail = fails.append,
- )
-
- env.expect.that_collection(fails).contains_exactly([
- """
-Failed to download metadata of the following packages from urls:
-{
- "bar": ["main", "extra"],
- "foo": ["main", "extra"],
-}
-
-If you would like to skip downloading metadata for these packages please add 'simpleapi_skip=[
- "bar",
- "foo",
-]' to your 'pip.parse' call.
-""",
- ])
- env.expect.that_collection(calls).contains_exactly([
- "main/foo/",
- "main/bar/",
- "main/baz/",
- "extra/foo/",
- "extra/bar/",
- ])
-
-_tests.append(_test_fail)
-
-def _test_allow_fail_single_index(env):
- calls = []
- fails = []
-
- def read_simpleapi(ctx, *, url, versions, attr, cache, get_auth, block, allow_fail):
_ = ctx, attr, cache, get_auth, versions # buildifier: disable=unused-variable
env.expect.that_bool(block).equals(False)
- env.expect.that_bool(allow_fail).equals(False)
calls.append(url)
return struct(
output = struct(
@@ -185,9 +128,9 @@ def _test_allow_fail_single_index(env):
),
attr = struct(
index_url_overrides = {
- "foo": "extra",
+ "foo": "https://extra.com",
},
- index_url = "main",
+ index_url = "https://main.com",
extra_index_urls = [],
sources = {"bar": None, "baz": None, "foo": None},
envsubst = [],
@@ -200,32 +143,32 @@ def _test_allow_fail_single_index(env):
env.expect.that_collection(fails).contains_exactly([])
env.expect.that_collection(calls).contains_exactly([
- "main/bar/",
- "main/baz/",
- "extra/foo/",
+ "https://main.com/bar/",
+ "https://main.com/baz/",
+ "https://extra.com/foo/",
])
env.expect.that_dict(contents).contains_exactly({
"bar": struct(
- index_url = "main/bar/",
+ index_url = "https://main.com/bar/",
sdists = {"deadbeef": "bar"},
sha256s_by_version = {"fizz": "bar"},
whls = {"deadb33f": "bar"},
),
"baz": struct(
- index_url = "main/baz/",
+ index_url = "https://main.com/baz/",
sdists = {"deadbeef": "baz"},
sha256s_by_version = {"fizz": "baz"},
whls = {"deadb33f": "baz"},
),
"foo": struct(
- index_url = "extra/foo/",
+ index_url = "https://extra.com/foo/",
sdists = {"deadbeef": "foo"},
sha256s_by_version = {"fizz": "foo"},
whls = {"deadb33f": "foo"},
),
})
-_tests.append(_test_allow_fail_single_index)
+_tests.append(_test_index_overrides)
def _test_download_url(env):
downloads = {}
@@ -233,6 +176,17 @@ def _test_download_url(env):
def download(url, output, **kwargs):
_ = kwargs # buildifier: disable=unused-variable
downloads[url[0]] = output
+
+ if len(downloads) == 1:
+ return struct(
+ success = True,
+ output = """
+ bar
+ baz
+ foo
+ """,
+ )
+
return struct(success = True)
simpleapi_download(
From 3eb7adf3e592085f8e550db8c4ef64d5e0e12836 Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Sun, 22 Mar 2026 00:35:04 +0900
Subject: [PATCH 08/14] handle envsubst when reading the index
---
python/private/pypi/simpleapi_download.bzl | 12 ++--
python/private/pypi/urllib.bzl | 2 +-
.../simpleapi_download_tests.bzl | 67 ++++++++++++++-----
3 files changed, 59 insertions(+), 22 deletions(-)
diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index 55e11e6fd6..5a633e9915 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -163,14 +163,18 @@ def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simple
# we should not use this index for the package
continue
- if not hasattr(result.output, "get"):
- fail(result.output)
-
found = result.output.get(pkg)
if not found:
continue
- found_on_index[pkg] = urllib.absolute_url(index_url, found)
+ # The spec says that we should be able to reach the thing via `/`,
+ # so let's extract that
+ found, _, part = found.rpartition("/")
+ if not part:
+ _, _, part = found.rpartition("/")
+ found_on_index[pkg] = urllib.strip_empty_path_segments(
+ "{}/{}/".format(index_url, part),
+ )
return found_on_index
diff --git a/python/private/pypi/urllib.bzl b/python/private/pypi/urllib.bzl
index ca6ded76b1..ea4cd32cc9 100644
--- a/python/private/pypi/urllib.bzl
+++ b/python/private/pypi/urllib.bzl
@@ -3,7 +3,7 @@
def _get_root_directory(url):
scheme_end = url.find("://")
if scheme_end == -1:
- fail("Invalid URL format")
+ fail("Invalid URL format: '{}'".format(url))
scheme = url[:scheme_end]
host_end = url.find("/", scheme_end + 3)
diff --git a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
index 8d8a26dd4e..25494505ba 100644
--- a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
+++ b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
@@ -101,8 +101,8 @@ def _test_index_overrides(env):
return struct(
success = True,
output = {
- "Baz": "/baz/", # let's test normalization
"bar": "/bar/",
+ "baz": "/baz/",
"foo": "/foo-should-fail/",
} if "main" in url else {
"foo": "/foo/",
@@ -172,21 +172,21 @@ _tests.append(_test_index_overrides)
def _test_download_url(env):
downloads = {}
+ reads = [
+ # The first read is the index which seeds the downloads later
+ """
+ bar
+ baz
+ foo
+ """,
+ "",
+ "",
+ "",
+ ]
def download(url, output, **kwargs):
_ = kwargs # buildifier: disable=unused-variable
downloads[url[0]] = output
-
- if len(downloads) == 1:
- return struct(
- success = True,
- output = """
- bar
- baz
- foo
- """,
- )
-
return struct(success = True)
simpleapi_download(
@@ -194,14 +194,16 @@ def _test_download_url(env):
getenv = {}.get,
download = download,
report_progress = lambda _: None,
- read = lambda i: "contents of " + i,
+ # We will first add a download to the list, so this is a poor man's `next(foo)`
+ # implementation
+ read = lambda i: reads[len(downloads) - 1],
path = lambda i: "path/for/" + i,
),
attr = struct(
index_url_overrides = {},
index_url = "https://example.com/main/simple/",
extra_index_urls = [],
- sources = {"bar": None, "baz": None, "foo": None},
+ sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]},
envsubst = [],
),
cache = pypi_cache(),
@@ -210,6 +212,7 @@ def _test_download_url(env):
)
env.expect.that_dict(downloads).contains_exactly({
+ "https://example.com/main/simple/": "path/for/https___example_com_main_simple.html",
"https://example.com/main/simple/bar/": "path/for/https___example_com_main_simple_bar.html",
"https://example.com/main/simple/baz/": "path/for/https___example_com_main_simple_baz.html",
"https://example.com/main/simple/foo/": "path/for/https___example_com_main_simple_foo.html",
@@ -219,6 +222,18 @@ _tests.append(_test_download_url)
def _test_download_url_parallel(env):
downloads = {}
+ reads = [
+ # The first read is the index which seeds the downloads later
+ """
+ bar
+ baz
+ foo
+ """,
+ "",
+ "",
+ "",
+ "",
+ ]
def download(url, output, **kwargs):
_ = kwargs # buildifier: disable=unused-variable
@@ -230,13 +245,15 @@ def _test_download_url_parallel(env):
getenv = {}.get,
download = download,
report_progress = lambda _: None,
- read = lambda i: "contents of " + i,
+ # We will first add a download to the list, so this is a poor man's `next(foo)`
+ # implementation. We use 2 because we will enqueue 2 downloads in parallel.
+ read = lambda i: reads[len(downloads) - 2],
path = lambda i: "path/for/" + i,
),
attr = struct(
index_url_overrides = {},
index_url = "https://example.com/main/simple/",
- extra_index_urls = [],
+ extra_index_urls = ["https://example.com/extra/simple/"],
sources = {"bar": None, "baz": None, "foo": None},
envsubst = [],
),
@@ -246,6 +263,8 @@ def _test_download_url_parallel(env):
)
env.expect.that_dict(downloads).contains_exactly({
+ "https://example.com/extra/simple/": "path/for/https___example_com_extra_simple.html",
+ "https://example.com/main/simple/": "path/for/https___example_com_main_simple.html",
"https://example.com/main/simple/bar/": "path/for/https___example_com_main_simple_bar.html",
"https://example.com/main/simple/baz/": "path/for/https___example_com_main_simple_baz.html",
"https://example.com/main/simple/foo/": "path/for/https___example_com_main_simple_foo.html",
@@ -255,6 +274,17 @@ _tests.append(_test_download_url_parallel)
def _test_download_envsubst_url(env):
downloads = {}
+ reads = [
+ # The first read is the index which seeds the downloads later
+ """
+ bar
+ baz
+ foo
+ """,
+ "",
+ "",
+ "",
+ ]
def download(url, output, **kwargs):
_ = kwargs # buildifier: disable=unused-variable
@@ -266,7 +296,9 @@ def _test_download_envsubst_url(env):
getenv = {"INDEX_URL": "https://example.com/main/simple/"}.get,
download = download,
report_progress = lambda _: None,
- read = lambda i: "contents of " + i,
+ # We will first add a download to the list, so this is a poor man's `next(foo)`
+ # implementation
+ read = lambda i: reads[len(downloads) - 1],
path = lambda i: "path/for/" + i,
),
attr = struct(
@@ -282,6 +314,7 @@ def _test_download_envsubst_url(env):
)
env.expect.that_dict(downloads).contains_exactly({
+ "https://example.com/main/simple/": "path/for/~index_url~.html",
"https://example.com/main/simple/bar/": "path/for/~index_url~_bar.html",
"https://example.com/main/simple/baz/": "path/for/~index_url~_baz.html",
"https://example.com/main/simple/foo/": "path/for/~index_url~_foo.html",
From fdafe7d6af50bc6bf3fab251be61facc46fbde96 Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Sun, 22 Mar 2026 00:36:08 +0900
Subject: [PATCH 09/14] handle envsubst when reading the index
---
python/private/pypi/simpleapi_download.bzl | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index 5a633e9915..0f1ea9557d 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -169,12 +169,14 @@ def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simple
# The spec says that we should be able to reach the thing via `/`,
# so let's extract that
+ parts = [index_url]
found, _, part = found.rpartition("/")
+ parts.append(part)
if not part:
_, _, part = found.rpartition("/")
- found_on_index[pkg] = urllib.strip_empty_path_segments(
- "{}/{}/".format(index_url, part),
- )
+ parts.append(part)
+
+ found_on_index[pkg] = urllib.strip_empty_path_segments("/".join(parts))
return found_on_index
From c3b68993789c817d2491a655c94bbe732d8399ae Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Sun, 22 Mar 2026 00:47:44 +0900
Subject: [PATCH 10/14] Ensure the URL construction for dist is robust enough
---
python/private/pypi/simpleapi_download.bzl | 16 ++++++----------
.../simpleapi_download_tests.bzl | 19 ++++++++++---------
2 files changed, 16 insertions(+), 19 deletions(-)
diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index 0f1ea9557d..e559196638 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -167,16 +167,12 @@ def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simple
if not found:
continue
- # The spec says that we should be able to reach the thing via `/`,
- # so let's extract that
- parts = [index_url]
- found, _, part = found.rpartition("/")
- parts.append(part)
- if not part:
- _, _, part = found.rpartition("/")
- parts.append(part)
-
- found_on_index[pkg] = urllib.strip_empty_path_segments("/".join(parts))
+ # Ignore the URL here because we know how to construct it.
+
+ found_on_index[pkg] = urllib.strip_empty_path_segments("{}/{}/".format(
+ index_url,
+ pkg.replace("_", "-"), # Use the official normalization for URLs
+ ))
return found_on_index
diff --git a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
index 25494505ba..2ab4063952 100644
--- a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
+++ b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
@@ -101,8 +101,9 @@ def _test_index_overrides(env):
return struct(
success = True,
output = {
+ # normalized
+ "ba_z": "/ba-z/",
"bar": "/bar/",
- "baz": "/baz/",
"foo": "/foo-should-fail/",
} if "main" in url else {
"foo": "/foo/",
@@ -132,7 +133,7 @@ def _test_index_overrides(env):
},
index_url = "https://main.com",
extra_index_urls = [],
- sources = {"bar": None, "baz": None, "foo": None},
+ sources = {"ba_z": None, "bar": None, "foo": None},
envsubst = [],
),
cache = pypi_cache(),
@@ -144,22 +145,22 @@ def _test_index_overrides(env):
env.expect.that_collection(fails).contains_exactly([])
env.expect.that_collection(calls).contains_exactly([
"https://main.com/bar/",
- "https://main.com/baz/",
+ "https://main.com/ba-z/",
"https://extra.com/foo/",
])
env.expect.that_dict(contents).contains_exactly({
+ "ba_z": struct(
+ index_url = "https://main.com/ba-z/",
+ sdists = {"deadbeef": "ba-z"},
+ sha256s_by_version = {"fizz": "ba-z"},
+ whls = {"deadb33f": "ba-z"},
+ ),
"bar": struct(
index_url = "https://main.com/bar/",
sdists = {"deadbeef": "bar"},
sha256s_by_version = {"fizz": "bar"},
whls = {"deadb33f": "bar"},
),
- "baz": struct(
- index_url = "https://main.com/baz/",
- sdists = {"deadbeef": "baz"},
- sha256s_by_version = {"fizz": "baz"},
- whls = {"deadb33f": "baz"},
- ),
"foo": struct(
index_url = "https://extra.com/foo/",
sdists = {"deadbeef": "foo"},
From 26e56bb3bfa661594f7ea0413fd62feb0c13a142 Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Sun, 22 Mar 2026 01:01:06 +0900
Subject: [PATCH 11/14] fix more tests
---
tests/pypi/hub_builder/hub_builder_tests.bzl | 33 ++++++++++++++------
1 file changed, 24 insertions(+), 9 deletions(-)
diff --git a/tests/pypi/hub_builder/hub_builder_tests.bzl b/tests/pypi/hub_builder/hub_builder_tests.bzl
index 637c7881c2..31a41f6af5 100644
--- a/tests/pypi/hub_builder/hub_builder_tests.bzl
+++ b/tests/pypi/hub_builder/hub_builder_tests.bzl
@@ -247,12 +247,19 @@ def _test_simple_extras_vs_no_extras(env):
_tests.append(_test_simple_extras_vs_no_extras)
def _test_simple_extras_vs_no_extras_simpleapi(env):
- def mockread_simpleapi(*_, **__):
+ def mockread_simpleapi(*_, parse_index, **__):
+ if parse_index:
+ content = """\
+ simple-0.0.1-py3-none-any.whl
+"""
return struct(
output = parse_simpleapi_html(
- content = """\
- simple-0.0.1-py3-none-any.whl
-""",
+ content = content,
+ parse_index = parse_index,
),
success = True,
)
@@ -489,10 +496,13 @@ def _test_simple_with_markers(env):
_tests.append(_test_simple_with_markers)
def _test_torch_experimental_index_url(env):
- def mockread_simpleapi(*_, **__):
- return struct(
- output = parse_simpleapi_html(
- content = """\
+ def mockread_simpleapi(*_, parse_index, **__):
+ if parse_index:
+ content = """\
+ torch
+"""
+ else:
+ content = """\
torch-2.4.1+cpu-cp310-cp310-linux_x86_64.whl
torch-2.4.1+cpu-cp310-cp310-win_amd64.whl
torch-2.4.1+cpu-cp311-cp311-linux_x86_64.whl
@@ -513,7 +523,12 @@ def _test_torch_experimental_index_url(env):
torch-2.4.1-cp38-none-macosx_11_0_arm64.whl
torch-2.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
torch-2.4.1-cp39-none-macosx_11_0_arm64.whl
-""",
+"""
+
+ return struct(
+ output = parse_simpleapi_html(
+ content = content,
+ parse_index = parse_index,
),
success = True,
)
From 0f493abbce261a335f0c6590298fa6e87e7e7122 Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Sun, 22 Mar 2026 01:11:41 +0900
Subject: [PATCH 12/14] doc: self review
---
CHANGELOG.md | 9 +++++++--
python/private/pypi/BUILD.bazel | 3 +--
python/private/pypi/pypi_cache.bzl | 3 +++
python/private/pypi/simpleapi_download.bzl | 3 ++-
4 files changed, 13 insertions(+), 5 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 18be4def9c..ec9467e7a5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -67,8 +67,13 @@ END_UNRELEASED_TEMPLATE
Other changes:
* (pypi) Update dependencies used for `compile_pip_requirements`, building
sdists in the `whl_library` rule and fetching wheels using `pip`.
-* (pypi) We will set `allow_fail` to `False` if the {attr}`experimental_index_url_overrides` is set
- to a non-empty value. This means that failures will be no-longer cached in this particular case.
+* (pypi) Before using the bazel downloader to fetch the PyPI package metadata
+ we will from now on fetch the lists of available packages on each index. The
+ used package mappings will be written as facts to the `MODULE.bazel.lock` file
+ on supported bazel versions and it should be done at most once. As a result,
+ per-package {obj}`experimental_index_url_overrides` is no longer needed if the index URLs are
+ passed to the `pip.parse` via `experimental_index_url` and `experimental_extra_index_urls`.
+ Fixes
([#3260](https://github.com/bazel-contrib/rules_python/issues/3260) and
[#2632](https://github.com/bazel-contrib/rules_python/issues/2632))
diff --git a/python/private/pypi/BUILD.bazel b/python/private/pypi/BUILD.bazel
index 6b4822333c..869be4705a 100644
--- a/python/private/pypi/BUILD.bazel
+++ b/python/private/pypi/BUILD.bazel
@@ -244,6 +244,7 @@ bzl_library(
srcs = ["parse_simpleapi_html.bzl"],
deps = [
":version_from_filename_bzl",
+ "//python/private:normalize_name_bzl",
],
)
@@ -424,8 +425,6 @@ bzl_library(
":urllib_bzl",
"//python/private:auth_bzl",
"//python/private:normalize_name_bzl",
- "//python/private:text_util_bzl",
- "@bazel_features//:features",
],
)
diff --git a/python/private/pypi/pypi_cache.bzl b/python/private/pypi/pypi_cache.bzl
index 2d2418c6ae..7b24102263 100644
--- a/python/private/pypi/pypi_cache.bzl
+++ b/python/private/pypi/pypi_cache.bzl
@@ -90,6 +90,9 @@ def _pypi_cache_get(self, key):
# Could not get from in-memory, read from lockfile facts
cached = self._facts.get(index_url, versions)
else:
+ # We might be using something from memory that is not yet stored in facts (e.g. we processed
+ # the requirements.txt for one Python version and the deps got cached, but a new Python
+ # version means different deps, which may add extras).
self._facts.setdefault(index_url, cached)
return cached
diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index e559196638..3551f18a0c 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -191,7 +191,8 @@ def _read_simpleapi(ctx, url, attr, cache, versions, parse_index, get_auth = Non
cache: {type}`struct` the `pypi_cache` instance.
versions: {type}`list[str] The versions that have been requested.
get_auth: A function to get auth information. Used in tests.
- parse_index: TODO
+ parse_index: {type}`bool` Whether to parse the content as a root index page
+ (e.g. `/simple/`) instead of a package-specific page.
**download_kwargs: Any extra params to ctx.download.
Note that output and auth will be passed for you.
From 208ee55c9c2511348deec6ad2e57cf2e802a159d Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Mon, 23 Mar 2026 22:47:49 +0900
Subject: [PATCH 13/14] behave like uv does
---
CHANGELOG.md | 3 +++
python/private/pypi/simpleapi_download.bzl | 23 ++++++++++++++-----
.../simpleapi_download_tests.bzl | 10 ++++----
3 files changed, 25 insertions(+), 11 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 27a75cac94..8e7c152acd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -75,6 +75,9 @@ Other changes:
on supported bazel versions and it should be done at most once. As a result,
per-package {obj}`experimental_index_url_overrides` is no longer needed if the index URLs are
passed to the `pip.parse` via `experimental_index_url` and `experimental_extra_index_urls`.
+ What is more, we start implementing the flags for `--index-url` and `--extra-index-url` more in
+ line with how they are used in `uv` and `pip`, i.e. we default to `--index-url` if the package is
+ not found in `--extra-index-url`.
Fixes
([#3260](https://github.com/bazel-contrib/rules_python/issues/3260) and
[#2632](https://github.com/bazel-contrib/rules_python/issues/2632)).
diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index 3551f18a0c..2171e8b56a 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -42,11 +42,13 @@ def simpleapi_download(
ctx: The module_ctx or repository_ctx.
attr: Contains the parameters for the download. They are grouped into a
struct for better clarity. It must have attributes:
- * index_url: str, the index.
+ * index_url: str, the index, or if `extra_index_urls` are passed, the default index.
* index_url_overrides: dict[str, str], the index overrides for
separate packages.
- * extra_index_urls: Extra index URLs that will be looked up after
- the main is looked up.
+ * extra_index_urls: Will be looked at in the order they are defined and the first match
+ wins. This is similar to what uv does, see
+ https://docs.astral.sh/uv/concepts/indexes/#searching-across-multiple-indexes.
+ PRs for implementing other strategies are welcome.
* sources: list[str], the sources to download things for. Each value is
the contents of requirements files.
* envsubst: list[str], the envsubst vars for performing substitution in index url.
@@ -86,7 +88,8 @@ def simpleapi_download(
# handle this case. What we do is we select a particular index to download the packages
dist_urls = _get_dist_urls(
ctx,
- index_urls = [attr.index_url] + attr.extra_index_urls,
+ default_index = attr.index_url,
+ index_urls = attr.extra_index_urls,
index_url_overrides = index_url_overrides,
sources = sources,
read_simpleapi = read_simpleapi,
@@ -125,13 +128,17 @@ def simpleapi_download(
return contents
-def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simpleapi, attr, block, _fail = fail, **kwargs):
+def _get_dist_urls(ctx, *, default_index, index_urls, index_url_overrides, sources, read_simpleapi, attr, block, _fail = fail, **kwargs):
downloads = {}
results = {}
for extra in index_url_overrides.values():
if extra not in index_urls:
index_urls.append(extra)
+ index_urls = index_urls or []
+ if default_index not in index_urls:
+ index_urls.append(default_index)
+
for index_url in index_urls:
download = read_simpleapi(
ctx = ctx,
@@ -156,7 +163,11 @@ def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simple
for index_url, result in results.items():
for pkg in sources:
if pkg in found_on_index:
- # We have already found the package, skip
+ # We have already found the package, so skip searching for it in
+ # the remaining indexes (the first match wins).
+ #
+ # If we wanted to merge the results from all indexes, we would have to keep
+ # searching here instead of skipping, and merge the results in the outer function.
continue
if index_url_overrides.get(pkg, index_url) != index_url:
diff --git a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
index 2ab4063952..55439c2593 100644
--- a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
+++ b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
@@ -253,7 +253,7 @@ def _test_download_url_parallel(env):
),
attr = struct(
index_url_overrides = {},
- index_url = "https://example.com/main/simple/",
+ index_url = "https://example.com/default/simple/",
extra_index_urls = ["https://example.com/extra/simple/"],
sources = {"bar": None, "baz": None, "foo": None},
envsubst = [],
@@ -264,11 +264,11 @@ def _test_download_url_parallel(env):
)
env.expect.that_dict(downloads).contains_exactly({
+ "https://example.com/default/simple/": "path/for/https___example_com_default_simple.html",
"https://example.com/extra/simple/": "path/for/https___example_com_extra_simple.html",
- "https://example.com/main/simple/": "path/for/https___example_com_main_simple.html",
- "https://example.com/main/simple/bar/": "path/for/https___example_com_main_simple_bar.html",
- "https://example.com/main/simple/baz/": "path/for/https___example_com_main_simple_baz.html",
- "https://example.com/main/simple/foo/": "path/for/https___example_com_main_simple_foo.html",
+ "https://example.com/extra/simple/bar/": "path/for/https___example_com_extra_simple_bar.html",
+ "https://example.com/extra/simple/baz/": "path/for/https___example_com_extra_simple_baz.html",
+ "https://example.com/extra/simple/foo/": "path/for/https___example_com_extra_simple_foo.html",
})
_tests.append(_test_download_url_parallel)
From f950c32600231e73b0b0900229c67f1920c304c2 Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Tue, 24 Mar 2026 10:49:02 +0900
Subject: [PATCH 14/14] fix: support unescaped characters in strings
---
python/private/pypi/parse_simpleapi_html.bzl | 13 +++++++++----
.../parse_simpleapi_html_tests.bzl | 4 ++--
2 files changed, 11 insertions(+), 6 deletions(-)
diff --git a/python/private/pypi/parse_simpleapi_html.bzl b/python/private/pypi/parse_simpleapi_html.bzl
index 83b006ffd2..7f0d2776d7 100644
--- a/python/private/pypi/parse_simpleapi_html.bzl
+++ b/python/private/pypi/parse_simpleapi_html.bzl
@@ -72,11 +72,16 @@ def parse_simpleapi_html(*, content, parse_index = False):
if start_tag == -1:
break
- # Find the end of the opening tag and the closing
- tag_end = content.find(">", start_tag)
- end_tag = content.find("", tag_end)
- if tag_end == -1 or end_tag == -1:
+ # Find the closing tag first, then find the end of the opening
+ # tag using rfind. This correctly handles attributes that
+ # contain > characters, e.g. data-requires-python=">=3.6".
+ end_tag = content.find("", start_tag)
+ if end_tag == -1:
break
+ tag_end = content.rfind(">", start_tag, end_tag)
+ if tag_end == -1 or tag_end <= start_tag:
+ cursor = end_tag + 4
+ continue
# Extract only the necessary slices
filename = content[tag_end + 1:end_tag].strip()
diff --git a/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl b/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl
index 933a0783f2..c84140f459 100644
--- a/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl
+++ b/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl
@@ -88,7 +88,7 @@ def _test_sdist(env):
struct(
attrs = [
'href="https://example.org/full-url/foo-0.0.1.tar.gz#sha256=deadbeefasource"',
- 'data-requires-python=">=3.7"',
+ 'data-requires-python=">=3.7"',
"data-yanked",
],
filename = "foo-0.0.1.tar.gz",
@@ -105,7 +105,7 @@ def _test_sdist(env):
struct(
attrs = [
'href="https://example.org/full-url/foo-0.0.1.tar.gz#sha256=deadbeefasource"',
- 'data-requires-python=">=3.7"',
+ 'data-requires-python="<=3.7"',
"data-yanked=\"Something
with "quotes"
over two lines\"",
],
filename = "foo-0.0.1.tar.gz",