From d4cea676e998b291961f0f0b953a7de7b7a35940 Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Tue, 10 Mar 2026 20:05:54 +0900
Subject: [PATCH 01/14] wip

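Work in progress: add a `parse_index` mode to `parse_simpleapi_html` that
returns a map of normalized package name to URL, and split
`simpleapi_download` into an index lookup (`_get_dist_urls`) followed by the
per-package downloads.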
---
 python/private/pypi/parse_simpleapi_html.bzl |  24 ++-
 python/private/pypi/simpleapi_download.bzl   | 175 +++++++++++--------
 2 files changed, 123 insertions(+), 76 deletions(-)

diff --git a/python/private/pypi/parse_simpleapi_html.bzl b/python/private/pypi/parse_simpleapi_html.bzl
index 563130791e..78669d5ff8 100644
--- a/python/private/pypi/parse_simpleapi_html.bzl
+++ b/python/private/pypi/parse_simpleapi_html.bzl
@@ -16,16 +16,20 @@
 Parse SimpleAPI HTML in Starlark.
 """
 
+load("//python/private:normalize_name.bzl", "normalize_name")
 load(":version_from_filename.bzl", "version_from_filename")
 
-def parse_simpleapi_html(*, content):
+def parse_simpleapi_html(*, content, parse_index = False):
     """Get the package URLs for given shas by parsing the Simple API HTML.
 
     Args:
-        content(str): The Simple API HTML content.
+        content: {type}`str` The Simple API HTML content.
+        parse_index: {type}`bool` Whether to parse the content as the root page of a PyPI index,
+            e.g. `https://pypi.org/simple/`. That page only lists the URLs of the individual packages.
 
     Returns:
-        A list of structs with:
+        If it is the index page, return the map of package to URL it can be queried from.
+        Otherwise, a list of structs with:
           * filename: {type}`str` The filename of the artifact.
           * version: {type}`str` The version of the artifact.
           * url: {type}`str` The URL to download the artifact.
@@ -59,6 +63,8 @@ def parse_simpleapi_html(*, content):
         # https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
         fail("Unsupported API version: {}".format(api_version))
 
+    packages = {}
+
     # 2. Iterate using find() to avoid huge list allocations from .split("<a ")
     cursor = 0
     for _ in range(1000000):  # Safety break for Starlark
@@ -73,18 +79,23 @@ def parse_simpleapi_html(*, content):
             break
 
         # Extract only the necessary slices
-        attr_part = content[start_tag + 3:tag_end]
         filename = content[tag_end + 1:end_tag].strip()
+        attr_part = content[start_tag + 3:tag_end]
 
         # Update cursor for next iteration
         cursor = end_tag + 4
 
-        # 3. Efficient Attribute Parsing
         attrs = _parse_attrs(attr_part)
         href = attrs.get("href", "")
         if not href:
             continue
 
+        if parse_index:
+            pkg_name = filename
+            packages[normalize_name(pkg_name)] = href
+            continue
+
+        # 3. Efficient Attribute Parsing
         dist_url, _, sha256 = href.partition("#sha256=")
 
         # Handle Yanked status
@@ -121,6 +132,9 @@ def parse_simpleapi_html(*, content):
         else:
             sdists[sha256] = dist
 
+    if packages:
+        return packages
+
     return struct(
         sdists = sdists,
         whls = whls,
diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index 20d79ba9b4..f15d835a48 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -16,7 +16,6 @@
 A file that houses private functions used in the `bzlmod` extension with the same name.
 """
 
-load("@bazel_features//:features.bzl", "bazel_features")
 load("//python/private:auth.bzl", _get_auth = "get_auth")
 load("//python/private:envsubst.bzl", "envsubst")
 load("//python/private:normalize_name.bzl", "normalize_name")
@@ -35,6 +34,11 @@ def simpleapi_download(
         _fail = fail):
     """Download Simple API HTML.
 
+    First it queries all of the indexes for available packages and then it downloads the contents of
+    the per-package URLs and sha256 values. This is to enable us to use bazel_downloader with
+    `requirements.txt` files. As a side effect we also are able to "cross-compile" by fetching the
+    right wheel for the right target platform through the information that we retrieve here.
+
     Args:
         ctx: The module_ctx or repository_ctx.
         attr: Contains the parameters for the download. They are grouped into a
@@ -77,66 +81,105 @@ def simpleapi_download(
     index_urls = [attr.index_url] + attr.extra_index_urls
     read_simpleapi = read_simpleapi or _read_simpleapi
 
-    download_kwargs = {}
-    if bazel_features.external_deps.download_has_block_param:
-        download_kwargs["block"] = not parallel_download
+    dist_urls = _get_dist_urls(
+        ctx,
+        index_urls,
+        index_url_overrides,
+        read_simpleapi = read_simpleapi,
+        cache = cache,
+        get_auth = get_auth,
+        attr = attr,
+        block = not parallel_download,
+        _fail = _fail,
+    )
 
-    if len(index_urls) == 1 or index_url_overrides:
-        download_kwargs["allow_fail"] = False
-    else:
-        download_kwargs["allow_fail"] = True
+    ctx.report_progress("Fetch package lists from PyPI index")
 
-    input_sources = attr.sources
+    sources = {
+        normalize_name(pkg): versions
+        for pkg, versions in attr.sources.items()
+    }
 
-    found_on_index = {}
-    warn_overrides = False
-    ctx.report_progress("Fetch package lists from PyPI index")
-    for i, index_url in enumerate(index_urls):
-        if i != 0:
-            # Warn the user about a potential fix for the overrides
-            warn_overrides = True
-
-        async_downloads = {}
-        sources = {pkg: versions for pkg, versions in input_sources.items() if pkg not in found_on_index}
-        for pkg, versions in sources.items():
-            pkg_normalized = normalize_name(pkg)
-            url = urllib.strip_empty_path_segments("{index_url}/{distribution}/".format(
-                index_url = index_url_overrides.get(pkg_normalized, index_url).rstrip("/"),
-                distribution = pkg,
-            ))
-            result = read_simpleapi(
-                ctx = ctx,
-                attr = attr,
-                versions = versions,
-                url = url,
-                cache = cache,
-                get_auth = get_auth,
-                **download_kwargs
-            )
-            if hasattr(result, "wait"):
-                # We will process it in a separate loop:
-                async_downloads[pkg] = struct(
-                    pkg_normalized = pkg_normalized,
-                    wait = result.wait,
-                    url = url,
-                )
-            elif result.success:
-                contents[pkg_normalized] = _with_index_url(url, result.output)
-                found_on_index[pkg] = index_url
-
-        if not async_downloads:
-            continue
+    downloads = {}
+    contents = {}
+    for pkg, url in dist_urls.items():
+        result = read_simpleapi(
+            ctx = ctx,
+            attr = attr,
+            url = url,
+            cache = cache,
+            versions = sources[pkg],
+            get_auth = get_auth,
+            block = not parallel_download,
+        )
+        if hasattr(result, "wait"):
+            # We will process it in a separate loop:
+            downloads[pkg] = result
+        else:
+            contents[pkg] = _with_index_url(url, result.output)
 
+    for pkg, d in downloads.items():
         # If we use `block` == False, then we need to have a second loop that is
         # collecting all of the results as they were being downloaded in parallel.
-        for pkg, download in async_downloads.items():
-            result = download.wait()
+        contents[pkg] = _with_index_url(dist_urls[pkg], d.wait().output)
 
-            if result.success:
-                contents[download.pkg_normalized] = _with_index_url(download.url, result.output)
-                found_on_index[pkg] = index_url
+    return contents
 
-    failed_sources = [pkg for pkg in input_sources if pkg not in found_on_index]
+def _get_dist_urls(ctx, index_urls, index_url_overrides, read_simpleapi, *, attr, block, _fail = fail, **kwargs):
+    if index_url_overrides:
+        first_index = index_urls[0]
+        return {
+            pkg: urllib.strip_empty_path_segments("{index_url}/{distribution}/".format(
+                index_url = index_url_overrides.get(normalize_name(pkg), first_index).rstrip("/"),
+                distribution = pkg,
+            ))
+            for pkg in attr.sources
+        }
+
+    downloads = {}
+    results = {}
+    for index_url in index_urls:
+        # TODO @aignas 2026-03-20: pull from the cache/facts
+        # we can store the following schema:
+        # facts: {
+        #   "index_urls": {
+        #       "<index_url>": {
+        #           "<pkg_normalized>": "<dist_url>",
+        #       }
+        #   }
+        # }
+        download = read_simpleapi(
+            ctx = ctx,
+            attr = attr,
+            url = urllib.strip_empty_path_segments("{index_url}/".format(
+                index_url = index_url,
+            )),
+            parse_index = True,
+            versions = None,
+            block = block,
+            allow_fail = False,
+            **kwargs
+        )
+        if hasattr(download, "wait"):
+            downloads[index_url] = download
+        else:
+            results[index_url] = download
+
+    for index_url, download in downloads.items():
+        results[index_url] = download.wait()
+
+    found_on_index = {}
+    for index_url, result in results.items():
+        sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
+
+        available_packages = result.output
+        sources = [pkg for pkg in sources if normalize_name(pkg) in available_packages]
+        found_on_index.update({
+            pkg: urllib.absolute_url(index_url, available_packages[normalize_name(pkg)])
+            for pkg in sources
+        })
+
+    failed_sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
     if failed_sources:
         pkg_index_urls = {
             pkg: index_url_overrides.get(
@@ -148,7 +191,7 @@ def simpleapi_download(
 
         _fail(
             """
-Failed to download metadata of the following packages from urls:
+Failed to find packages on PyPI of the following packages from urls:
 {pkg_index_urls}
 
 If you would like to skip downloading metadata for these packages please add 'simpleapi_skip={failed_sources}' to your 'pip.parse' call.
@@ -159,22 +202,9 @@ If you would like to skip downloading metadata for these packages please add 'si
         )
         return None
 
-    if warn_overrides:
-        index_url_overrides = {
-            pkg: found_on_index[pkg]
-            for pkg in attr.sources
-            if found_on_index[pkg] != attr.index_url
-        }
-
-        if index_url_overrides:
-            # buildifier: disable=print
-            print("You can use the following `index_url_overrides` to avoid the 404 warnings:\n{}".format(
-                render.dict(index_url_overrides),
-            ))
-
-    return contents
+    return {normalize_name(pkg): url for pkg, url in found_on_index.items()}
 
-def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, **download_kwargs):
+def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, parse_index = False, **download_kwargs):
     """Read SimpleAPI.
 
     Args:
@@ -189,6 +219,7 @@ def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, **download
         cache: {type}`struct` the `pypi_cache` instance.
         versions: {type}`list[str] The versions that have been requested.
         get_auth: A function to get auth information. Used in tests.
+        parse_index: TODO
         **download_kwargs: Any extra params to ctx.download.
             Note that output and auth will be passed for you.
 
@@ -242,6 +273,7 @@ def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, **download
                 output = output,
                 cache = cache,
                 cache_key = cache_key,
+                parse_index = parse_index,
             ),
         )
 
@@ -251,15 +283,16 @@ def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, **download
         output = output,
         cache = cache,
         cache_key = cache_key,
+        parse_index = parse_index,
     )
 
-def _read_index_result(ctx, *, result, output, cache, cache_key):
+def _read_index_result(ctx, *, result, output, cache, cache_key, parse_index):
     if not result.success:
         return struct(success = False)
 
     content = ctx.read(output)
 
-    output = parse_simpleapi_html(content = content)
+    output = parse_simpleapi_html(content = content, parse_index = parse_index)
     if output:
         cache.setdefault(cache_key, output)
         return struct(success = True, output = output)

From 0107a54801377f16ea1caf6e206076d201d13aec Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Fri, 20 Mar 2026 13:09:51 +0900
Subject: [PATCH 02/14] add facts

---
 python/private/pypi/pypi_cache.bzl         | 51 +++++++++++++++
 python/private/pypi/simpleapi_download.bzl | 73 ++++++++--------------
 2 files changed, 76 insertions(+), 48 deletions(-)

diff --git a/python/private/pypi/pypi_cache.bzl b/python/private/pypi/pypi_cache.bzl
index 28c6cbeafb..747bf6a7a1 100644
--- a/python/private/pypi/pypi_cache.bzl
+++ b/python/private/pypi/pypi_cache.bzl
@@ -122,6 +122,15 @@ def _filter_packages(dists, requested_versions):
     if dists == None or not requested_versions:
         return dists
 
+    if type(dists) == "dict":
+        pkgs = requested_versions
+        filtered = {
+            pkg: url
+            for pkg, url in dists.items()
+            if pkg in pkgs
+        }
+        return filtered
+
     sha256s_by_version = {}
     whls = {}
     sdists = {}
@@ -193,6 +202,12 @@ def _get_from_facts(facts, known_facts, index_url, requested_versions, facts_ver
         # cannot trust known facts, different version that we know how to parse
         return None
 
+    if type(requested_versions) == "dict":
+        return _filter_packages(
+            dists = known_facts.get("index_urls", {}).get(index_url, {}),
+            requested_versions = requested_versions,
+        )
+
     known_sources = {}
 
     root_url, _, distribution = index_url.rstrip("/").rpartition("/")
@@ -266,10 +281,46 @@ def _store_facts(facts, fact_version, index_url, value):
 
     facts["fact_version"] = fact_version
 
+    if type(value) == "dict":
+        # facts: {
+        #   "index_urls": {
+        #     "<index_url>": {
+        #       "<pkg_normalized>": "<dist_url>",
+        #     },
+        #   },
+        # },
+        for pkg, url in value.items():
+            facts.setdefault("index_urls", {}).setdefault(index_url, {}).setdefault(pkg, url)
+        return value
+
     root_url, _, distribution = index_url.rstrip("/").rpartition("/")
     distribution = distribution.rstrip("/")
     root_url = root_url.rstrip("/")
 
+    # The schema is
+    # facts: {
+    #   "dist_hashes": {
+    #     "<index_url>": {
+    #       "<last segment>": {
+    #         "<dist url>": "<sha256>",
+    #       },
+    #     },
+    #   },
+    #   "dist_filenames": {
+    #     "<index_url>": {
+    #       "<last segment>": {
+    #         "<dist url>": "<filename>",   # if it is different from the URL
+    #       },
+    #     },
+    #   },
+    #   "dist_yanked": {
+    #     "<index_url>": {
+    #       "<last segment>": {
+    #         "<sha256>": "<reason>",   # if the package is yanked
+    #       },
+    #     },
+    #   },
+    # },
     for sha256, d in (value.sdists | value.whls).items():
         facts.setdefault("dist_hashes", {}).setdefault(root_url, {}).setdefault(distribution, {}).setdefault(d.url, sha256)
         if not d.url.endswith(d.filename):
diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index f15d835a48..a1dd447e1e 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -75,6 +75,11 @@ def simpleapi_download(
         for p, i in (attr.index_url_overrides or {}).items()
     }
 
+    sources = {
+        normalize_name(pkg): versions
+        for pkg, versions in attr.sources.items()
+    }
+
     # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
     # to replicate how `pip` would handle this case.
     contents = {}
@@ -83,8 +88,9 @@ def simpleapi_download(
 
     dist_urls = _get_dist_urls(
         ctx,
-        index_urls,
-        index_url_overrides,
+        index_urls = index_urls,
+        index_url_overrides = index_url_overrides,
+        sources = sources,
         read_simpleapi = read_simpleapi,
         cache = cache,
         get_auth = get_auth,
@@ -95,11 +101,6 @@ def simpleapi_download(
 
     ctx.report_progress("Fetch package lists from PyPI index")
 
-    sources = {
-        normalize_name(pkg): versions
-        for pkg, versions in attr.sources.items()
-    }
-
     downloads = {}
     contents = {}
     for pkg, url in dist_urls.items():
@@ -125,29 +126,10 @@ def simpleapi_download(
 
     return contents
 
-def _get_dist_urls(ctx, index_urls, index_url_overrides, read_simpleapi, *, attr, block, _fail = fail, **kwargs):
-    if index_url_overrides:
-        first_index = index_urls[0]
-        return {
-            pkg: urllib.strip_empty_path_segments("{index_url}/{distribution}/".format(
-                index_url = index_url_overrides.get(normalize_name(pkg), first_index).rstrip("/"),
-                distribution = pkg,
-            ))
-            for pkg in attr.sources
-        }
-
+def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simpleapi, attr, block, _fail = fail, **kwargs):
     downloads = {}
     results = {}
     for index_url in index_urls:
-        # TODO @aignas 2026-03-20: pull from the cache/facts
-        # we can store the following schema:
-        # facts: {
-        #   "index_urls": {
-        #       "<index_url>": {
-        #           "<pkg_normalized>": "<dist_url>",
-        #       }
-        #   }
-        # }
         download = read_simpleapi(
             ctx = ctx,
             attr = attr,
@@ -155,7 +137,7 @@ def _get_dist_urls(ctx, index_urls, index_url_overrides, read_simpleapi, *, attr
                 index_url = index_url,
             )),
             parse_index = True,
-            versions = None,
+            versions = {pkg: None for pkg in sources},
             block = block,
             allow_fail = False,
             **kwargs
@@ -170,25 +152,25 @@ def _get_dist_urls(ctx, index_urls, index_url_overrides, read_simpleapi, *, attr
 
     found_on_index = {}
     for index_url, result in results.items():
-        sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
-
-        available_packages = result.output
-        sources = [pkg for pkg in sources if normalize_name(pkg) in available_packages]
+        # Filter out the things that we have already found
         found_on_index.update({
-            pkg: urllib.absolute_url(index_url, available_packages[normalize_name(pkg)])
+            pkg: urllib.absolute_url(index_url, result.output[pkg])
             for pkg in sources
         })
+        sources = [
+            pkg
+            for pkg in sources
+            if pkg not in found_on_index
+        ]
 
-    failed_sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
-    if failed_sources:
+    if sources:
         pkg_index_urls = {
-            pkg: index_url_overrides.get(
-                normalize_name(pkg),
-                index_urls,
-            )
-            for pkg in failed_sources
+            pkg: index_url_overrides.get(pkg, index_urls)
+            for pkg in sources
         }
 
+        # TODO @aignas 2026-03-20: we haven't found these pkgs on the index, so we can
+        # print a warning, or we can fallback to PyPI. For now let's fail
         _fail(
             """
 Failed to find packages on PyPI of the following packages from urls:
@@ -196,13 +178,13 @@ Failed to find packages on PyPI of the following packages from urls:
 
 If you would like to skip downloading metadata for these packages please add 'simpleapi_skip={failed_sources}' to your 'pip.parse' call.
 """.format(
-                pkg_index_urls = render.dict(pkg_index_urls),
-                failed_sources = render.list(failed_sources),
+                pkg_index_urls = render.dict(dict(sorted(pkg_index_urls.items()))),
+                failed_sources = render.list(sources),
             ),
         )
         return None
 
-    return {normalize_name(pkg): url for pkg, url in found_on_index.items()}
+    return found_on_index
 
 def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, parse_index = False, **download_kwargs):
     """Read SimpleAPI.
@@ -227,11 +209,6 @@ def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, parse_inde
         A similar object to what `download` would return except that in result.out
         will be the parsed simple api contents.
     """
-    # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for
-    # the whl location and we cannot handle multiple URLs at once by passing
-    # them to ctx.download if we want to correctly handle the relative URLs.
-    # TODO: Add a test that env subbed index urls do not leak into the lock file.
-
     real_url = urllib.strip_empty_path_segments(envsubst(url, attr.envsubst, ctx.getenv))
 
     cache_key = (url, real_url, versions)

From 0492f31707349406229ffc12109dd6eab82bcea5 Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Fri, 20 Mar 2026 13:48:46 +0900
Subject: [PATCH 03/14] finish POC

---
 python/private/pypi/pypi_cache.bzl         | 11 ++++++-----
 python/private/pypi/simpleapi_download.bzl |  4 +---
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/python/private/pypi/pypi_cache.bzl b/python/private/pypi/pypi_cache.bzl
index 747bf6a7a1..bc92de0bde 100644
--- a/python/private/pypi/pypi_cache.bzl
+++ b/python/private/pypi/pypi_cache.bzl
@@ -89,6 +89,9 @@ def _pypi_cache_get(self, key):
     if not cached and versions:
         # Could not get from in-memory, read from lockfile facts
         cached = self._facts.get(index_url, versions)
+    else:
+        # TODO @aignas 2026-03-20: add a test here
+        self._facts.setdefault(index_url, cached)
 
     return cached
 
@@ -123,13 +126,11 @@ def _filter_packages(dists, requested_versions):
         return dists
 
     if type(dists) == "dict":
-        pkgs = requested_versions
-        filtered = {
+        return {
             pkg: url
             for pkg, url in dists.items()
-            if pkg in pkgs
+            if pkg in requested_versions
         }
-        return filtered
 
     sha256s_by_version = {}
     whls = {}
@@ -290,7 +291,7 @@ def _store_facts(facts, fact_version, index_url, value):
         #   },
         # },
         for pkg, url in value.items():
-            facts.setdefault("index_urls", {}).setdefault(index_url, {}).setdefault(pkg, url)
+            facts.setdefault("index_urls", {}).setdefault(index_url, {})[pkg] = url
         return value
 
     root_url, _, distribution = index_url.rstrip("/").rpartition("/")
diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index a1dd447e1e..b8caacef82 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -82,13 +82,11 @@ def simpleapi_download(
 
     # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
     # to replicate how `pip` would handle this case.
-    contents = {}
-    index_urls = [attr.index_url] + attr.extra_index_urls
     read_simpleapi = read_simpleapi or _read_simpleapi
 
     dist_urls = _get_dist_urls(
         ctx,
-        index_urls = index_urls,
+        index_urls = [attr.index_url] + attr.extra_index_urls,
         index_url_overrides = index_url_overrides,
         sources = sources,
         read_simpleapi = read_simpleapi,

From b77854d0e28543f54b3763e05d5ae059581f152f Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Fri, 20 Mar 2026 16:10:07 +0900
Subject: [PATCH 04/14] remove a warning

---
 python/private/pypi/simpleapi_download.bzl | 31 +++++-----------------
 1 file changed, 6 insertions(+), 25 deletions(-)

diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index b8caacef82..a73dc4538a 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -19,7 +19,6 @@ A file that houses private functions used in the `bzlmod` extension with the sam
 load("//python/private:auth.bzl", _get_auth = "get_auth")
 load("//python/private:envsubst.bzl", "envsubst")
 load("//python/private:normalize_name.bzl", "normalize_name")
-load("//python/private:text_util.bzl", "render")
 load(":parse_simpleapi_html.bzl", "parse_simpleapi_html")
 load(":urllib.bzl", "urllib")
 
@@ -150,37 +149,19 @@ def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simple
 
     found_on_index = {}
     for index_url, result in results.items():
-        # Filter out the things that we have already found
-        found_on_index.update({
-            pkg: urllib.absolute_url(index_url, result.output[pkg])
-            for pkg in sources
-        })
         sources = [
             pkg
             for pkg in sources
             if pkg not in found_on_index
         ]
 
-    if sources:
-        pkg_index_urls = {
-            pkg: index_url_overrides.get(pkg, index_urls)
+        # Filter out the things that we have already found
+        found_on_index.update({
+            pkg: urllib.absolute_url(index_url, result.output[pkg])
             for pkg in sources
-        }
-
-        # TODO @aignas 2026-03-20: we haven't found these pkgs on the index, so we can
-        # print a warning, or we can fallback to PyPI. For now let's fail
-        _fail(
-            """
-Failed to find packages on PyPI of the following packages from urls:
-{pkg_index_urls}
-
-If you would like to skip downloading metadata for these packages please add 'simpleapi_skip={failed_sources}' to your 'pip.parse' call.
-""".format(
-                pkg_index_urls = render.dict(dict(sorted(pkg_index_urls.items()))),
-                failed_sources = render.list(sources),
-            ),
-        )
-        return None
+            # TODO @aignas 2026-03-20: add a test here
+            if index_url_overrides.get(pkg, index_url)
+        })
 
     return found_on_index
 

From cb97d74fd86804ec4ab629b90f92e212ac947f46 Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Fri, 20 Mar 2026 16:21:02 +0900
Subject: [PATCH 05/14] add a test

---
 python/private/pypi/pypi_cache.bzl         |  1 -
 python/private/pypi/simpleapi_download.bzl |  9 +++---
 tests/pypi/pypi_cache/pypi_cache_tests.bzl | 33 ++++++++++++++++++++++
 3 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/python/private/pypi/pypi_cache.bzl b/python/private/pypi/pypi_cache.bzl
index bc92de0bde..2d2418c6ae 100644
--- a/python/private/pypi/pypi_cache.bzl
+++ b/python/private/pypi/pypi_cache.bzl
@@ -90,7 +90,6 @@ def _pypi_cache_get(self, key):
         # Could not get from in-memory, read from lockfile facts
         cached = self._facts.get(index_url, versions)
     else:
-        # TODO @aignas 2026-03-20: add a test here
         self._facts.setdefault(index_url, cached)
 
     return cached
diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index a73dc4538a..de12b9d675 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -73,16 +73,17 @@ def simpleapi_download(
         normalize_name(p): i
         for p, i in (attr.index_url_overrides or {}).items()
     }
-
     sources = {
         normalize_name(pkg): versions
         for pkg, versions in attr.sources.items()
     }
 
-    # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
-    # to replicate how `pip` would handle this case.
     read_simpleapi = read_simpleapi or _read_simpleapi
 
+    ctx.report_progress("Fetch package lists from PyPI index")
+
+    # NOTE: we are not merging results from multiple indexes to replicate how `pip` would
+    # handle this case. What we do is we select a particular index to download the packages
     dist_urls = _get_dist_urls(
         ctx,
         index_urls = [attr.index_url] + attr.extra_index_urls,
@@ -96,7 +97,7 @@ def simpleapi_download(
         _fail = _fail,
     )
 
-    ctx.report_progress("Fetch package lists from PyPI index")
+    ctx.report_progress("Fetching package URLs from PyPI index")
 
     downloads = {}
     contents = {}
diff --git a/tests/pypi/pypi_cache/pypi_cache_tests.bzl b/tests/pypi/pypi_cache/pypi_cache_tests.bzl
index 7b6168ce7b..3cf01c7450 100644
--- a/tests/pypi/pypi_cache/pypi_cache_tests.bzl
+++ b/tests/pypi/pypi_cache/pypi_cache_tests.bzl
@@ -155,6 +155,39 @@ def _test_pypi_cache_writes_to_facts(env):
         "fact_version": "v1",  # Facts version
     })
 
+    # When we get the other items cached in memory, they get written to facts
+    got = cache.get((key[0], key[1], ["1.1.0"]))
+    got.whls().contains_exactly({
+        "sha_whl_2": fake_result.whls["sha_whl_2"],
+    })
+    got.sdists().contains_exactly({})
+    got.sha256s_by_version().contains_exactly({
+        "1.1.0": fake_result.sha256s_by_version["1.1.0"],
+    })
+
+    # Then when we get facts at the end
+    cache.get_facts().contains_exactly({
+        "dist_hashes": {
+            # We are not using the real index URL, because we may have credentials in here
+            "https://{PYPI_INDEX_URL}": {
+                "pkg": {
+                    "https://pypi.org/files/pkg-1.0.0-py3-none-any.whl": "sha_whl",
+                    "https://pypi.org/files/pkg-1.0.0.tar.gz": "sha_sdist",
+                    "https://pypi.org/files/pkg-1.1.0-py3-none-any.whl": "sha_whl_2",
+                },
+            },
+        },
+        "dist_yanked": {
+            "https://{PYPI_INDEX_URL}": {
+                "pkg": {
+                    "sha_sdist": "",
+                    "sha_whl": "Security issue",
+                },
+            },
+        },
+        "fact_version": "v1",  # Facts version
+    })
+
 _tests.append(_test_pypi_cache_writes_to_facts)
 
 def _test_pypi_cache_reads_from_facts(env):

From 7e71a5898037d6aee581bf039f8019c11bcbc836 Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Fri, 20 Mar 2026 16:25:15 +0900
Subject: [PATCH 06/14] add a test

---
 .../parse_simpleapi_html_tests.bzl            | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl b/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl
index f72d61371c..933a0783f2 100644
--- a/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl
+++ b/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl
@@ -42,6 +42,29 @@ def _generate_html(*items):
         ]),
     )
 
+def _test_index(env):
+    # buildifier: disable=unsorted-dict-items
+    tests = [
+        (
+            [
+                struct(attrs = ['href="/simple/foo/"'], filename = "foo"),
+                struct(attrs = ['href="./b-ar/"'], filename = "b-._.-aR"),
+            ],
+            {
+                "b_ar": "./b-ar/",
+                "foo": "/simple/foo/",
+            },
+        ),
+    ]
+
+    for (input, want) in tests:
+        html = _generate_html(*input)
+        got = parse_simpleapi_html(content = html, parse_index = True)
+
+        env.expect.that_dict(got).contains_exactly(want)
+
+_tests.append(_test_index)
+
 def _test_sdist(env):
     # buildifier: disable=unsorted-dict-items
     tests = [

From 1cd90d76c1fac9758776899e119bd83a9b795c23 Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Sat, 21 Mar 2026 23:42:25 +0900
Subject: [PATCH 07/14] fix index override handling and fix a few tests

---
 python/private/pypi/parse_simpleapi_html.bzl  |   2 +-
 python/private/pypi/simpleapi_download.bzl    |  38 ++--
 .../simpleapi_download_tests.bzl              | 162 +++++++-----------
 3 files changed, 82 insertions(+), 120 deletions(-)

diff --git a/python/private/pypi/parse_simpleapi_html.bzl b/python/private/pypi/parse_simpleapi_html.bzl
index 78669d5ff8..83b006ffd2 100644
--- a/python/private/pypi/parse_simpleapi_html.bzl
+++ b/python/private/pypi/parse_simpleapi_html.bzl
@@ -132,7 +132,7 @@ def parse_simpleapi_html(*, content, parse_index = False):
         else:
             sdists[sha256] = dist
 
-    if packages:
+    if parse_index:
         return packages
 
     return struct(
diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index de12b9d675..55e11e6fd6 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -110,6 +110,7 @@ def simpleapi_download(
             versions = sources[pkg],
             get_auth = get_auth,
             block = not parallel_download,
+            parse_index = False,
         )
         if hasattr(result, "wait"):
             # We will process it in a separate loop:
@@ -127,6 +128,10 @@ def simpleapi_download(
 def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simpleapi, attr, block, _fail = fail, **kwargs):
     downloads = {}
     results = {}
+    for extra in index_url_overrides.values():
+        if extra not in index_urls:
+            index_urls.append(extra)
+
     for index_url in index_urls:
         download = read_simpleapi(
             ctx = ctx,
@@ -137,7 +142,6 @@ def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simple
             parse_index = True,
             versions = {pkg: None for pkg in sources},
             block = block,
-            allow_fail = False,
             **kwargs
         )
         if hasattr(download, "wait"):
@@ -150,23 +154,27 @@ def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simple
 
     found_on_index = {}
     for index_url, result in results.items():
-        sources = [
-            pkg
-            for pkg in sources
-            if pkg not in found_on_index
-        ]
-
-        # Filter out the things that we have already found
-        found_on_index.update({
-            pkg: urllib.absolute_url(index_url, result.output[pkg])
-            for pkg in sources
-            # TODO @aignas 2026-03-20: add a test here
-            if index_url_overrides.get(pkg, index_url)
-        })
+        for pkg in sources:
+            if pkg in found_on_index:
+                # We have already found the package, skip
+                continue
+
+            if index_url_overrides.get(pkg, index_url) != index_url:
+                # we should not use this index for the package
+                continue
+
+            if not hasattr(result.output, "get"):
+                fail(result.output)
+
+            found = result.output.get(pkg)
+            if not found:
+                continue
+
+            found_on_index[pkg] = urllib.absolute_url(index_url, found)
 
     return found_on_index
 
-def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, parse_index = False, **download_kwargs):
+def _read_simpleapi(ctx, url, attr, cache, versions, parse_index, get_auth = None, **download_kwargs):
     """Read SimpleAPI.
 
     Args:
diff --git a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
index 9a6b7ca5af..8d8a26dd4e 100644
--- a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
+++ b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
@@ -23,26 +23,30 @@ _tests = []
 def _test_simple(env):
     calls = []
 
-    def read_simpleapi(ctx, url, versions, attr, cache, get_auth, block, allow_fail):
-        _ = ctx, attr, cache, get_auth, versions  # buildifier: disable=unused-variable
-        env.expect.that_bool(block).equals(False)
-        env.expect.that_bool(allow_fail).equals(True)
-        calls.append(url)
-        if "foo" in url and "main" in url:
+    def read_simpleapi(ctx, url, versions, attr, cache, get_auth, block, parse_index):
+        if parse_index:
             return struct(
-                output = "",
-                success = False,
-            )
-        else:
-            return struct(
-                output = struct(
-                    sdists = {"deadbeef": url.strip("/").split("/")[-1]},
-                    whls = {"deadb33f": url.strip("/").split("/")[-1]},
-                    sha256s_by_version = {"fizz": url.strip("/").split("/")[-1]},
-                ),
                 success = True,
+                output = {
+                    "bar": "/bar/",
+                    "baz": "/baz/",
+                } if "main" in url else {
+                    "foo": "/foo/",
+                },
             )
 
+        _ = ctx, attr, cache, get_auth, versions  # buildifier: disable=unused-variable
+        env.expect.that_bool(block).equals(False)
+        calls.append(url)
+        return struct(
+            output = struct(
+                sdists = {"deadbeef": url.strip("/").split("/")[-1]},
+                whls = {"deadb33f": url.strip("/").split("/")[-1]},
+                sha256s_by_version = {"fizz": url.strip("/").split("/")[-1]},
+            ),
+            success = True,
+        )
+
     contents = simpleapi_download(
         ctx = struct(
             getenv = {}.get,
@@ -50,8 +54,8 @@ def _test_simple(env):
         ),
         attr = struct(
             index_url_overrides = {},
-            index_url = "main",
-            extra_index_urls = ["extra"],
+            index_url = "https://main.com",
+            extra_index_urls = ["https://extra.com"],
             sources = {"bar": None, "baz": None, "foo": None},
             envsubst = [],
         ),
@@ -61,26 +65,25 @@ def _test_simple(env):
     )
 
     env.expect.that_collection(calls).contains_exactly([
-        "extra/foo/",
-        "main/bar/",
-        "main/baz/",
-        "main/foo/",
+        "https://extra.com/foo/",
+        "https://main.com/bar/",
+        "https://main.com/baz/",
     ])
     env.expect.that_dict(contents).contains_exactly({
         "bar": struct(
-            index_url = "main/bar/",
+            index_url = "https://main.com/bar/",
             sdists = {"deadbeef": "bar"},
             sha256s_by_version = {"fizz": "bar"},
             whls = {"deadb33f": "bar"},
         ),
         "baz": struct(
-            index_url = "main/baz/",
+            index_url = "https://main.com/baz/",
             sdists = {"deadbeef": "baz"},
             sha256s_by_version = {"fizz": "baz"},
             whls = {"deadb33f": "baz"},
         ),
         "foo": struct(
-            index_url = "extra/foo/",
+            index_url = "https://extra.com/foo/",
             sdists = {"deadbeef": "foo"},
             sha256s_by_version = {"fizz": "foo"},
             whls = {"deadb33f": "foo"},
@@ -89,85 +92,25 @@ def _test_simple(env):
 
 _tests.append(_test_simple)
 
-def _test_fail(env):
+def _test_index_overrides(env):
     calls = []
     fails = []
 
-    def read_simpleapi(ctx, url, versions, attr, cache, get_auth, block, allow_fail):
-        _ = ctx, attr, cache, get_auth, versions  # buildifier: disable=unused-variable
-        env.expect.that_bool(block).equals(False)
-        env.expect.that_bool(allow_fail).equals(True)
-        calls.append(url)
-        if "foo" in url:
+    def read_simpleapi(ctx, *, url, versions, attr, cache, get_auth, block, parse_index):
+        if parse_index:
             return struct(
-                output = "",
-                success = False,
-            )
-        if "bar" in url:
-            return struct(
-                output = "",
-                success = False,
-            )
-        else:
-            return struct(
-                output = struct(
-                    sdists = {},
-                    whls = {},
-                    sha256s_by_version = {},
-                ),
                 success = True,
+                output = {
+                    "Baz": "/baz/",  # let's test normalization
+                    "bar": "/bar/",
+                    "foo": "/foo-should-fail/",
+                } if "main" in url else {
+                    "foo": "/foo/",
+                },
             )
 
-    simpleapi_download(
-        ctx = struct(
-            getenv = {}.get,
-            report_progress = lambda _: None,
-        ),
-        attr = struct(
-            index_url_overrides = {},
-            index_url = "main",
-            extra_index_urls = ["extra"],
-            sources = {"bar": None, "baz": None, "foo": None},
-            envsubst = [],
-        ),
-        cache = pypi_cache(),
-        parallel_download = True,
-        read_simpleapi = read_simpleapi,
-        _fail = fails.append,
-    )
-
-    env.expect.that_collection(fails).contains_exactly([
-        """
-Failed to download metadata of the following packages from urls:
-{
-    "bar": ["main", "extra"],
-    "foo": ["main", "extra"],
-}
-
-If you would like to skip downloading metadata for these packages please add 'simpleapi_skip=[
-    "bar",
-    "foo",
-]' to your 'pip.parse' call.
-""",
-    ])
-    env.expect.that_collection(calls).contains_exactly([
-        "main/foo/",
-        "main/bar/",
-        "main/baz/",
-        "extra/foo/",
-        "extra/bar/",
-    ])
-
-_tests.append(_test_fail)
-
-def _test_allow_fail_single_index(env):
-    calls = []
-    fails = []
-
-    def read_simpleapi(ctx, *, url, versions, attr, cache, get_auth, block, allow_fail):
         _ = ctx, attr, cache, get_auth, versions  # buildifier: disable=unused-variable
         env.expect.that_bool(block).equals(False)
-        env.expect.that_bool(allow_fail).equals(False)
         calls.append(url)
         return struct(
             output = struct(
@@ -185,9 +128,9 @@ def _test_allow_fail_single_index(env):
         ),
         attr = struct(
             index_url_overrides = {
-                "foo": "extra",
+                "foo": "https://extra.com",
             },
-            index_url = "main",
+            index_url = "https://main.com",
             extra_index_urls = [],
             sources = {"bar": None, "baz": None, "foo": None},
             envsubst = [],
@@ -200,32 +143,32 @@ def _test_allow_fail_single_index(env):
 
     env.expect.that_collection(fails).contains_exactly([])
     env.expect.that_collection(calls).contains_exactly([
-        "main/bar/",
-        "main/baz/",
-        "extra/foo/",
+        "https://main.com/bar/",
+        "https://main.com/baz/",
+        "https://extra.com/foo/",
     ])
     env.expect.that_dict(contents).contains_exactly({
         "bar": struct(
-            index_url = "main/bar/",
+            index_url = "https://main.com/bar/",
             sdists = {"deadbeef": "bar"},
             sha256s_by_version = {"fizz": "bar"},
             whls = {"deadb33f": "bar"},
         ),
         "baz": struct(
-            index_url = "main/baz/",
+            index_url = "https://main.com/baz/",
             sdists = {"deadbeef": "baz"},
             sha256s_by_version = {"fizz": "baz"},
             whls = {"deadb33f": "baz"},
         ),
         "foo": struct(
-            index_url = "extra/foo/",
+            index_url = "https://extra.com/foo/",
             sdists = {"deadbeef": "foo"},
             sha256s_by_version = {"fizz": "foo"},
             whls = {"deadb33f": "foo"},
         ),
     })
 
-_tests.append(_test_allow_fail_single_index)
+_tests.append(_test_index_overrides)
 
 def _test_download_url(env):
     downloads = {}
@@ -233,6 +176,17 @@ def _test_download_url(env):
     def download(url, output, **kwargs):
         _ = kwargs  # buildifier: disable=unused-variable
         downloads[url[0]] = output
+
+        if len(downloads) == 1:
+            return struct(
+                success = True,
+                output = """
+                <a href="/main/simple/bar/">bar</a>
+                <a href="/main/simple/baz/">baz</a>
+                <a href="/main/simple/foo/">foo</a>
+                """,
+            )
+
         return struct(success = True)
 
     simpleapi_download(

From 3eb7adf3e592085f8e550db8c4ef64d5e0e12836 Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Sun, 22 Mar 2026 00:35:04 +0900
Subject: [PATCH 08/14] handle envsubst when reading the index

---
 python/private/pypi/simpleapi_download.bzl    | 12 ++--
 python/private/pypi/urllib.bzl                |  2 +-
 .../simpleapi_download_tests.bzl              | 67 ++++++++++++++-----
 3 files changed, 59 insertions(+), 22 deletions(-)

diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index 55e11e6fd6..5a633e9915 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -163,14 +163,18 @@ def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simple
                 # we should not use this index for the package
                 continue
 
-            if not hasattr(result.output, "get"):
-                fail(result.output)
-
             found = result.output.get(pkg)
             if not found:
                 continue
 
-            found_on_index[pkg] = urllib.absolute_url(index_url, found)
+            # The spec says that we should be able to reach the thing via `<index_url>/<dist_name>`,
+            # so let's extract that
+            found, _, part = found.rpartition("/")
+            if not part:
+                _, _, part = found.rpartition("/")
+            found_on_index[pkg] = urllib.strip_empty_path_segments(
+                "{}/{}/".format(index_url, part),
+            )
 
     return found_on_index
 
diff --git a/python/private/pypi/urllib.bzl b/python/private/pypi/urllib.bzl
index ca6ded76b1..ea4cd32cc9 100644
--- a/python/private/pypi/urllib.bzl
+++ b/python/private/pypi/urllib.bzl
@@ -3,7 +3,7 @@
 def _get_root_directory(url):
     scheme_end = url.find("://")
     if scheme_end == -1:
-        fail("Invalid URL format")
+        fail("Invalid URL format: '{}'".format(url))
 
     scheme = url[:scheme_end]
     host_end = url.find("/", scheme_end + 3)
diff --git a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
index 8d8a26dd4e..25494505ba 100644
--- a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
+++ b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
@@ -101,8 +101,8 @@ def _test_index_overrides(env):
             return struct(
                 success = True,
                 output = {
-                    "Baz": "/baz/",  # let's test normalization
                     "bar": "/bar/",
+                    "baz": "/baz/",
                     "foo": "/foo-should-fail/",
                 } if "main" in url else {
                     "foo": "/foo/",
@@ -172,21 +172,21 @@ _tests.append(_test_index_overrides)
 
 def _test_download_url(env):
     downloads = {}
+    reads = [
+        # The first read is the index which seeds the downloads later
+        """
+        <a href="/main/simple/bar/">bar</a>
+        <a href="/main/simple/baz/">baz</a>
+        <a href="/main/simple/foo/">foo</a>
+        """,
+        "",
+        "",
+        "",
+    ]
 
     def download(url, output, **kwargs):
         _ = kwargs  # buildifier: disable=unused-variable
         downloads[url[0]] = output
-
-        if len(downloads) == 1:
-            return struct(
-                success = True,
-                output = """
-                <a href="/main/simple/bar/">bar</a>
-                <a href="/main/simple/baz/">baz</a>
-                <a href="/main/simple/foo/">foo</a>
-                """,
-            )
-
         return struct(success = True)
 
     simpleapi_download(
@@ -194,14 +194,16 @@ def _test_download_url(env):
             getenv = {}.get,
             download = download,
             report_progress = lambda _: None,
-            read = lambda i: "contents of " + i,
+            # We will first add a download to the list, so this is a poor man's `next(foo)`
+            # implementation
+            read = lambda i: reads[len(downloads) - 1],
             path = lambda i: "path/for/" + i,
         ),
         attr = struct(
             index_url_overrides = {},
             index_url = "https://example.com/main/simple/",
             extra_index_urls = [],
-            sources = {"bar": None, "baz": None, "foo": None},
+            sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]},
             envsubst = [],
         ),
         cache = pypi_cache(),
@@ -210,6 +212,7 @@ def _test_download_url(env):
     )
 
     env.expect.that_dict(downloads).contains_exactly({
+        "https://example.com/main/simple/": "path/for/https___example_com_main_simple.html",
         "https://example.com/main/simple/bar/": "path/for/https___example_com_main_simple_bar.html",
         "https://example.com/main/simple/baz/": "path/for/https___example_com_main_simple_baz.html",
         "https://example.com/main/simple/foo/": "path/for/https___example_com_main_simple_foo.html",
@@ -219,6 +222,18 @@ _tests.append(_test_download_url)
 
 def _test_download_url_parallel(env):
     downloads = {}
+    reads = [
+        # The first read is the index which seeds the downloads later
+        """
+        <a href="/main/simple/bar/">bar</a>
+        <a href="/main/simple/baz/">baz</a>
+        <a href="/main/simple/foo/">foo</a>
+        """,
+        "",
+        "",
+        "",
+        "",
+    ]
 
     def download(url, output, **kwargs):
         _ = kwargs  # buildifier: disable=unused-variable
@@ -230,13 +245,15 @@ def _test_download_url_parallel(env):
             getenv = {}.get,
             download = download,
             report_progress = lambda _: None,
-            read = lambda i: "contents of " + i,
+            # We will first add a download to the list, so this is a poor man's `next(foo)`
+            # implementation. We use 2 because we will enqueue 2 downloads in parallel.
+            read = lambda i: reads[len(downloads) - 2],
             path = lambda i: "path/for/" + i,
         ),
         attr = struct(
             index_url_overrides = {},
             index_url = "https://example.com/main/simple/",
-            extra_index_urls = [],
+            extra_index_urls = ["https://example.com/extra/simple/"],
             sources = {"bar": None, "baz": None, "foo": None},
             envsubst = [],
         ),
@@ -246,6 +263,8 @@ def _test_download_url_parallel(env):
     )
 
     env.expect.that_dict(downloads).contains_exactly({
+        "https://example.com/extra/simple/": "path/for/https___example_com_extra_simple.html",
+        "https://example.com/main/simple/": "path/for/https___example_com_main_simple.html",
         "https://example.com/main/simple/bar/": "path/for/https___example_com_main_simple_bar.html",
         "https://example.com/main/simple/baz/": "path/for/https___example_com_main_simple_baz.html",
         "https://example.com/main/simple/foo/": "path/for/https___example_com_main_simple_foo.html",
@@ -255,6 +274,17 @@ _tests.append(_test_download_url_parallel)
 
 def _test_download_envsubst_url(env):
     downloads = {}
+    reads = [
+        # The first read is the index which seeds the downloads later
+        """
+        <a href="/main/simple/bar/">bar</a>
+        <a href="/main/simple/baz/">baz</a>
+        <a href="/main/simple/foo/">foo</a>
+        """,
+        "",
+        "",
+        "",
+    ]
 
     def download(url, output, **kwargs):
         _ = kwargs  # buildifier: disable=unused-variable
@@ -266,7 +296,9 @@ def _test_download_envsubst_url(env):
             getenv = {"INDEX_URL": "https://example.com/main/simple/"}.get,
             download = download,
             report_progress = lambda _: None,
-            read = lambda i: "contents of " + i,
+            # We will first add a download to the list, so this is a poor man's `next(foo)`
+            # implementation
+            read = lambda i: reads[len(downloads) - 1],
             path = lambda i: "path/for/" + i,
         ),
         attr = struct(
@@ -282,6 +314,7 @@ def _test_download_envsubst_url(env):
     )
 
     env.expect.that_dict(downloads).contains_exactly({
+        "https://example.com/main/simple/": "path/for/~index_url~.html",
         "https://example.com/main/simple/bar/": "path/for/~index_url~_bar.html",
         "https://example.com/main/simple/baz/": "path/for/~index_url~_baz.html",
         "https://example.com/main/simple/foo/": "path/for/~index_url~_foo.html",

From fdafe7d6af50bc6bf3fab251be61facc46fbde96 Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Sun, 22 Mar 2026 00:36:08 +0900
Subject: [PATCH 09/14] handle envsubst when reading the index

---
 python/private/pypi/simpleapi_download.bzl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index 5a633e9915..0f1ea9557d 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -169,12 +169,14 @@ def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simple
 
             # The spec says that we should be able to reach the thing via `<index_url>/<dist_name>`,
             # so let's extract that
+            parts = [index_url]
             found, _, part = found.rpartition("/")
+            parts.append(part)
             if not part:
                 _, _, part = found.rpartition("/")
-            found_on_index[pkg] = urllib.strip_empty_path_segments(
-                "{}/{}/".format(index_url, part),
-            )
+                parts.append(part)
+
+            found_on_index[pkg] = urllib.strip_empty_path_segments("/".join(parts))
 
     return found_on_index
 

From c3b68993789c817d2491a655c94bbe732d8399ae Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Sun, 22 Mar 2026 00:47:44 +0900
Subject: [PATCH 10/14] Ensure the URL construction for dist is robust enough

---
 python/private/pypi/simpleapi_download.bzl    | 16 ++++++----------
 .../simpleapi_download_tests.bzl              | 19 ++++++++++---------
 2 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index 0f1ea9557d..e559196638 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -167,16 +167,12 @@ def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simple
             if not found:
                 continue
 
-            # The spec says that we should be able to reach the thing via `<index_url>/<dist_name>`,
-            # so let's extract that
-            parts = [index_url]
-            found, _, part = found.rpartition("/")
-            parts.append(part)
-            if not part:
-                _, _, part = found.rpartition("/")
-                parts.append(part)
-
-            found_on_index[pkg] = urllib.strip_empty_path_segments("/".join(parts))
+            # Ignore the URL here because we know how to construct it.
+
+            found_on_index[pkg] = urllib.strip_empty_path_segments("{}/{}/".format(
+                index_url,
+                pkg.replace("_", "-"),  # Use the official normalization for URLs
+            ))
 
     return found_on_index
 
diff --git a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
index 25494505ba..2ab4063952 100644
--- a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
+++ b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
@@ -101,8 +101,9 @@ def _test_index_overrides(env):
             return struct(
                 success = True,
                 output = {
+                    # normalized
+                    "ba_z": "/ba-z/",
                     "bar": "/bar/",
-                    "baz": "/baz/",
                     "foo": "/foo-should-fail/",
                 } if "main" in url else {
                     "foo": "/foo/",
@@ -132,7 +133,7 @@ def _test_index_overrides(env):
             },
             index_url = "https://main.com",
             extra_index_urls = [],
-            sources = {"bar": None, "baz": None, "foo": None},
+            sources = {"ba_z": None, "bar": None, "foo": None},
             envsubst = [],
         ),
         cache = pypi_cache(),
@@ -144,22 +145,22 @@ def _test_index_overrides(env):
     env.expect.that_collection(fails).contains_exactly([])
     env.expect.that_collection(calls).contains_exactly([
         "https://main.com/bar/",
-        "https://main.com/baz/",
+        "https://main.com/ba-z/",
         "https://extra.com/foo/",
     ])
     env.expect.that_dict(contents).contains_exactly({
+        "ba_z": struct(
+            index_url = "https://main.com/ba-z/",
+            sdists = {"deadbeef": "ba-z"},
+            sha256s_by_version = {"fizz": "ba-z"},
+            whls = {"deadb33f": "ba-z"},
+        ),
         "bar": struct(
             index_url = "https://main.com/bar/",
             sdists = {"deadbeef": "bar"},
             sha256s_by_version = {"fizz": "bar"},
             whls = {"deadb33f": "bar"},
         ),
-        "baz": struct(
-            index_url = "https://main.com/baz/",
-            sdists = {"deadbeef": "baz"},
-            sha256s_by_version = {"fizz": "baz"},
-            whls = {"deadb33f": "baz"},
-        ),
         "foo": struct(
             index_url = "https://extra.com/foo/",
             sdists = {"deadbeef": "foo"},

From 26e56bb3bfa661594f7ea0413fd62feb0c13a142 Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Sun, 22 Mar 2026 01:01:06 +0900
Subject: [PATCH 11/14] fix more tests

---
 tests/pypi/hub_builder/hub_builder_tests.bzl | 33 ++++++++++++++------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/tests/pypi/hub_builder/hub_builder_tests.bzl b/tests/pypi/hub_builder/hub_builder_tests.bzl
index 637c7881c2..31a41f6af5 100644
--- a/tests/pypi/hub_builder/hub_builder_tests.bzl
+++ b/tests/pypi/hub_builder/hub_builder_tests.bzl
@@ -247,12 +247,19 @@ def _test_simple_extras_vs_no_extras(env):
 _tests.append(_test_simple_extras_vs_no_extras)
 
 def _test_simple_extras_vs_no_extras_simpleapi(env):
-    def mockread_simpleapi(*_, **__):
+    def mockread_simpleapi(*_, parse_index, **__):
+        if parse_index:
+            content = """\
+    <a href="/simple/">simple</a><br/>
+"""
+        else:
+            content = """\
+    <a href="/simple-0.0.1-py3-none-any.whl#sha256=deadbeef">simple-0.0.1-py3-none-any.whl</a><br/>
+"""
         return struct(
             output = parse_simpleapi_html(
-                content = """\
-    <a href="/simple-0.0.1-py3-none-any.whl#sha256=deadbeef">simple-0.0.1-py3-none-any.whl</a><br/>
-""",
+                content = content,
+                parse_index = parse_index,
             ),
             success = True,
         )
@@ -489,10 +496,13 @@ def _test_simple_with_markers(env):
 _tests.append(_test_simple_with_markers)
 
 def _test_torch_experimental_index_url(env):
-    def mockread_simpleapi(*_, **__):
-        return struct(
-            output = parse_simpleapi_html(
-                content = """\
+    def mockread_simpleapi(*_, parse_index, **__):
+        if parse_index:
+            content = """\
+    <a href="/ignored/">torch</a>
+"""
+        else:
+            content = """\
     <a href="/whl/cpu/torch-2.4.1%2Bcpu-cp310-cp310-linux_x86_64.whl#sha256=833490a28ac156762ed6adaa7c695879564fa2fd0dc51bcf3fdb2c7b47dc55e6">torch-2.4.1+cpu-cp310-cp310-linux_x86_64.whl</a><br/>
     <a href="/whl/cpu/torch-2.4.1%2Bcpu-cp310-cp310-win_amd64.whl#sha256=1dd062d296fb78aa7cfab8690bf03704995a821b5ef69cfc807af5c0831b4202">torch-2.4.1+cpu-cp310-cp310-win_amd64.whl</a><br/>
     <a href="/whl/cpu/torch-2.4.1%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=2b03e20f37557d211d14e3fb3f71709325336402db132a1e0dd8b47392185baf">torch-2.4.1+cpu-cp311-cp311-linux_x86_64.whl</a><br/>
@@ -513,7 +523,12 @@ def _test_torch_experimental_index_url(env):
     <a href="/whl/cpu/torch-2.4.1-cp38-none-macosx_11_0_arm64.whl#sha256=5fc1d4d7ed265ef853579caf272686d1ed87cebdcd04f2a498f800ffc53dab71">torch-2.4.1-cp38-none-macosx_11_0_arm64.whl</a><br/>
     <a href="/whl/cpu/torch-2.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=1495132f30f722af1a091950088baea383fe39903db06b20e6936fd99402803e">torch-2.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl</a><br/>
     <a href="/whl/cpu/torch-2.4.1-cp39-none-macosx_11_0_arm64.whl#sha256=a38de2803ee6050309aac032676536c3d3b6a9804248537e38e098d0e14817ec">torch-2.4.1-cp39-none-macosx_11_0_arm64.whl</a><br/>
-""",
+"""
+
+        return struct(
+            output = parse_simpleapi_html(
+                content = content,
+                parse_index = parse_index,
             ),
             success = True,
         )

From 0f493abbce261a335f0c6590298fa6e87e7e7122 Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Sun, 22 Mar 2026 01:11:41 +0900
Subject: [PATCH 12/14] doc: self review

---
 CHANGELOG.md                               | 9 +++++++--
 python/private/pypi/BUILD.bazel            | 3 +--
 python/private/pypi/pypi_cache.bzl         | 3 +++
 python/private/pypi/simpleapi_download.bzl | 3 ++-
 4 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 18be4def9c..ec9467e7a5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -67,8 +67,13 @@ END_UNRELEASED_TEMPLATE
 Other changes:
 * (pypi) Update dependencies used for `compile_pip_requirements`, building
   sdists in the `whl_library` rule and fetching wheels using `pip`.
-* (pypi) We will set `allow_fail` to `False` if the {attr}`experimental_index_url_overrides` is set
-  to a non-empty value. This means that failures will be no-longer cached in this particular case.
+* (pypi) Before using the bazel downloader to fetch the PyPI package metadata,
+  we now first fetch the list of available packages from each index. The
+  package mappings that are used are written as facts to the `MODULE.bazel.lock` file
+  on supported bazel versions, so this should happen at most once. As a result,
+  per-package {obj}`experimental_index_url_overrides` is no longer needed if the index URLs are
+  passed to `pip.parse` via `experimental_index_url` and `experimental_extra_index_urls`.
+  Fixes
   ([#3260](https://github.com/bazel-contrib/rules_python/issues/3260) and 
   [#2632](https://github.com/bazel-contrib/rules_python/issues/2632))
 
diff --git a/python/private/pypi/BUILD.bazel b/python/private/pypi/BUILD.bazel
index 6b4822333c..869be4705a 100644
--- a/python/private/pypi/BUILD.bazel
+++ b/python/private/pypi/BUILD.bazel
@@ -244,6 +244,7 @@ bzl_library(
     srcs = ["parse_simpleapi_html.bzl"],
     deps = [
         ":version_from_filename_bzl",
+        "//python/private:normalize_name_bzl",
     ],
 )
 
@@ -424,8 +425,6 @@ bzl_library(
         ":urllib_bzl",
         "//python/private:auth_bzl",
         "//python/private:normalize_name_bzl",
-        "//python/private:text_util_bzl",
-        "@bazel_features//:features",
     ],
 )
 
diff --git a/python/private/pypi/pypi_cache.bzl b/python/private/pypi/pypi_cache.bzl
index 2d2418c6ae..7b24102263 100644
--- a/python/private/pypi/pypi_cache.bzl
+++ b/python/private/pypi/pypi_cache.bzl
@@ -90,6 +90,9 @@ def _pypi_cache_get(self, key):
         # Could not get from in-memory, read from lockfile facts
         cached = self._facts.get(index_url, versions)
     else:
+        # We might be using something from memory that is not yet stored in facts (e.g. we processed
+        # the requirements.txt for one Python version and the deps got cached, but a new Python
+        # version means different deps, which may add extras).
         self._facts.setdefault(index_url, cached)
 
     return cached
diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index e559196638..3551f18a0c 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -191,7 +191,8 @@ def _read_simpleapi(ctx, url, attr, cache, versions, parse_index, get_auth = Non
         cache: {type}`struct` the `pypi_cache` instance.
         versions: {type}`list[str] The versions that have been requested.
         get_auth: A function to get auth information. Used in tests.
-        parse_index: TODO
+        parse_index: {type}`bool` Whether to parse the content as a root index page
+            (e.g. `/simple/`) instead of a package-specific page.
         **download_kwargs: Any extra params to ctx.download.
             Note that output and auth will be passed for you.
 

From 208ee55c9c2511348deec6ad2e57cf2e802a159d Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Mon, 23 Mar 2026 22:47:49 +0900
Subject: [PATCH 13/14] behave like uv does

---
 CHANGELOG.md                                  |  3 +++
 python/private/pypi/simpleapi_download.bzl    | 23 ++++++++++++++-----
 .../simpleapi_download_tests.bzl              | 10 ++++----
 3 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 27a75cac94..8e7c152acd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -75,6 +75,9 @@ Other changes:
   on supported bazel versions and it should be done at most once. As a result,
   per-package {obj}`experimental_index_url_overrides` is no longer needed if the index URLs are
   passed to the `pip.parse` via `experimental_index_url` and `experimental_extra_index_urls`.
+  What is more, we start implementing the `--index-url` and `--extra-index-url` flags more in
+  line with how they are used in `uv` and `pip`, i.e. we fall back to `--index-url` if the package
+  is not found in `--extra-index-url`.
   Fixes
   ([#3260](https://github.com/bazel-contrib/rules_python/issues/3260) and 
   [#2632](https://github.com/bazel-contrib/rules_python/issues/2632)).
diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index 3551f18a0c..2171e8b56a 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -42,11 +42,13 @@ def simpleapi_download(
         ctx: The module_ctx or repository_ctx.
         attr: Contains the parameters for the download. They are grouped into a
           struct for better clarity. It must have attributes:
-           * index_url: str, the index.
+           * index_url: str, the index, or if `extra_index_urls` are passed, the default index.
            * index_url_overrides: dict[str, str], the index overrides for
              separate packages.
-           * extra_index_urls: Extra index URLs that will be looked up after
-             the main is looked up.
+           * extra_index_urls: Will be looked at in the order they are defined and the first match
+                wins. This is similar to what uv does, see
+                https://docs.astral.sh/uv/concepts/indexes/#searching-across-multiple-indexes.
+                PRs for implementing other strategies are welcome.
            * sources: list[str], the sources to download things for. Each value is
              the contents of requirements files.
            * envsubst: list[str], the envsubst vars for performing substitution in index url.
@@ -86,7 +88,8 @@ def simpleapi_download(
     # handle this case. What we do is we select a particular index to download the packages
     dist_urls = _get_dist_urls(
         ctx,
-        index_urls = [attr.index_url] + attr.extra_index_urls,
+        default_index = attr.index_url,
+        index_urls = attr.extra_index_urls,
         index_url_overrides = index_url_overrides,
         sources = sources,
         read_simpleapi = read_simpleapi,
@@ -125,13 +128,17 @@ def simpleapi_download(
 
     return contents
 
-def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simpleapi, attr, block, _fail = fail, **kwargs):
+def _get_dist_urls(ctx, *, default_index, index_urls, index_url_overrides, sources, read_simpleapi, attr, block, _fail = fail, **kwargs):
     downloads = {}
     results = {}
     for extra in index_url_overrides.values():
         if extra not in index_urls:
             index_urls.append(extra)
 
+    index_urls = index_urls or []
+    if default_index not in index_urls:
+        index_urls.append(default_index)
+
     for index_url in index_urls:
         download = read_simpleapi(
             ctx = ctx,
@@ -156,7 +163,11 @@ def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simple
     for index_url, result in results.items():
         for pkg in sources:
             if pkg in found_on_index:
-                # We have already found the package, skip
+                # We have already found the package, skip searching for it in
+                # other indexes.
+                #
+                # If we wanted to merge the results from all of the indexes, we would have to keep
+                # processing here instead of skipping, and merge the results in the outer function.
                 continue
 
             if index_url_overrides.get(pkg, index_url) != index_url:
diff --git a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
index 2ab4063952..55439c2593 100644
--- a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
+++ b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
@@ -253,7 +253,7 @@ def _test_download_url_parallel(env):
         ),
         attr = struct(
             index_url_overrides = {},
-            index_url = "https://example.com/main/simple/",
+            index_url = "https://example.com/default/simple/",
             extra_index_urls = ["https://example.com/extra/simple/"],
             sources = {"bar": None, "baz": None, "foo": None},
             envsubst = [],
@@ -264,11 +264,11 @@ def _test_download_url_parallel(env):
     )
 
     env.expect.that_dict(downloads).contains_exactly({
+        "https://example.com/default/simple/": "path/for/https___example_com_default_simple.html",
         "https://example.com/extra/simple/": "path/for/https___example_com_extra_simple.html",
-        "https://example.com/main/simple/": "path/for/https___example_com_main_simple.html",
-        "https://example.com/main/simple/bar/": "path/for/https___example_com_main_simple_bar.html",
-        "https://example.com/main/simple/baz/": "path/for/https___example_com_main_simple_baz.html",
-        "https://example.com/main/simple/foo/": "path/for/https___example_com_main_simple_foo.html",
+        "https://example.com/extra/simple/bar/": "path/for/https___example_com_extra_simple_bar.html",
+        "https://example.com/extra/simple/baz/": "path/for/https___example_com_extra_simple_baz.html",
+        "https://example.com/extra/simple/foo/": "path/for/https___example_com_extra_simple_foo.html",
     })
 
 _tests.append(_test_download_url_parallel)

From f950c32600231e73b0b0900229c67f1920c304c2 Mon Sep 17 00:00:00 2001
From: Ignas Anikevicius <240938+aignas@users.noreply.github.com>
Date: Tue, 24 Mar 2026 10:49:02 +0900
Subject: [PATCH 14/14] fix: support unescaped characters in strings

---
 python/private/pypi/parse_simpleapi_html.bzl        | 13 +++++++++----
 .../parse_simpleapi_html_tests.bzl                  |  4 ++--
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/python/private/pypi/parse_simpleapi_html.bzl b/python/private/pypi/parse_simpleapi_html.bzl
index 83b006ffd2..7f0d2776d7 100644
--- a/python/private/pypi/parse_simpleapi_html.bzl
+++ b/python/private/pypi/parse_simpleapi_html.bzl
@@ -72,11 +72,16 @@ def parse_simpleapi_html(*, content, parse_index = False):
         if start_tag == -1:
             break
 
-        # Find the end of the opening tag and the closing </a>
-        tag_end = content.find(">", start_tag)
-        end_tag = content.find("</a>", tag_end)
-        if tag_end == -1 or end_tag == -1:
+        # Find the closing </a> tag first, then find the end of the opening
+        # <a ...> tag using rfind. This correctly handles attributes that
+        # contain > characters, e.g. data-requires-python=">=3.6".
+        end_tag = content.find("</a>", start_tag)
+        if end_tag == -1:
             break
+        tag_end = content.rfind(">", start_tag, end_tag)
+        if tag_end == -1 or tag_end <= start_tag:
+            cursor = end_tag + 4
+            continue
 
         # Extract only the necessary slices
         filename = content[tag_end + 1:end_tag].strip()
diff --git a/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl b/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl
index 933a0783f2..c84140f459 100644
--- a/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl
+++ b/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl
@@ -88,7 +88,7 @@ def _test_sdist(env):
             struct(
                 attrs = [
                     'href="https://example.org/full-url/foo-0.0.1.tar.gz#sha256=deadbeefasource"',
-                    'data-requires-python="&gt;=3.7"',
+                    'data-requires-python=">=3.7"',
                     "data-yanked",
                 ],
                 filename = "foo-0.0.1.tar.gz",
@@ -105,7 +105,7 @@ def _test_sdist(env):
             struct(
                 attrs = [
                     'href="https://example.org/full-url/foo-0.0.1.tar.gz#sha256=deadbeefasource"',
-                    'data-requires-python="&gt;=3.7"',
+                    'data-requires-python="<=3.7"',
                     "data-yanked=\"Something &#10;with &quot;quotes&quot;&#10;over two lines\"",
                 ],
                 filename = "foo-0.0.1.tar.gz",