From b9e429f8d1e647e07e77450f0880d64456d9e871 Mon Sep 17 00:00:00 2001 From: Abderrahim Kitouni Date: Sat, 5 Apr 2025 08:40:45 +0100 Subject: [PATCH 1/4] scheduler: Stop scheduling from the imperative queue when asked to quit For instance, we shouldn't schedule new build jobs when asked to quit. However, not considering the build queue at all would miss pushing newly built elements. What this commit does is: * Pull elements forward through all queues (including the ones we no longer need) * Only schedule jobs after the imperative queue This means that elements in the build queue (or any imperative queue) will still get passed to the following queues, but no new build jobs get scheduled. Fixes https://github.com/apache/buildstream/issues/1787 --- src/buildstream/_scheduler/scheduler.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/buildstream/_scheduler/scheduler.py b/src/buildstream/_scheduler/scheduler.py index 5b475daea..f2ae736a2 100644 --- a/src/buildstream/_scheduler/scheduler.py +++ b/src/buildstream/_scheduler/scheduler.py @@ -353,25 +353,26 @@ def _sched_queue_jobs(self): if self._queue_jobs: # Scheduler.stop() was not called, consider all queues. - queues = self.queues + queues_to_process = self.queues else: # Limit processing to post-imperative queues for queue in self.queues: if queue.imperative: - # Here the `queues` list will consists of the imperative queue along with - # any subsequent queues, this means elements will be carried only from the - # imerative queue onwards, and only post-imperative jobs will be processed. - queues = self.queues[self.queues.index(queue) :] + # Here the `queues_to_process` list will consist of all the queues after + # the imperative queue. This means only post-imperative jobs will be processed. + queues_to_process = self.queues[self.queues.index(queue) + 1 :] break else: # No imperative queue was marked, stop queueing any jobs - queues = [] + queues_to_process = [] while process_queues: - # Pull elements forward through queues + # Pull elements forward through all queues, regardless of whether we're processing those + # queues. The main reason to do this is to ensure we propagate finished jobs from the + # imperative queue. elements = [] - for queue in queues: + for queue in self.queues: queue.enqueue(elements) elements = list(queue.dequeue()) @@ -386,13 +387,13 @@ def _sched_queue_jobs(self): # to fetch tasks for elements which failed to pull, and # thus need all the pulls to complete before ever starting # a build - ready.extend(chain.from_iterable(q.harvest_jobs() for q in reversed(queues))) + ready.extend(chain.from_iterable(q.harvest_jobs() for q in reversed(queues_to_process))) # harvest_jobs() may have decided to skip some jobs, making # them eligible for promotion to the next queue as a side effect. # # If that happens, do another round. - process_queues = any(q.dequeue_ready() for q in queues) + process_queues = any(q.dequeue_ready() for q in queues_to_process) # Start the jobs # From bde7d673b796f47b805b9fef34c06bfb19d057b8 Mon Sep 17 00:00:00 2001 From: Tristan van Berkom Date: Tue, 3 Jun 2025 18:54:54 +0900 Subject: [PATCH 2/4] _testing/runcli.py: Don't fail when removing non-existent artifacts We use remove_artifact_from_cache() in integration tests where the cache is shared and we just want to clean up some artifacts regardless of whether they have already been built or not. 
--- src/buildstream/_testing/runcli.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/buildstream/_testing/runcli.py b/src/buildstream/_testing/runcli.py index a06891ab1..0083e71f3 100644 --- a/src/buildstream/_testing/runcli.py +++ b/src/buildstream/_testing/runcli.py @@ -632,7 +632,14 @@ def remove_artifact_from_cache(self, cache_dir, element_name): normal_name = element_name.replace(os.sep, "-") cache_dir = os.path.splitext(os.path.join(cache_dir, "test", normal_name))[0] - shutil.rmtree(cache_dir) + + # Don't cause a failure here if the artifact is not there, we use this in integration + # tests where the cache is shared and we just want to clean up some artifacts + # regardless of whether they have already been built or not. + try: + shutil.rmtree(cache_dir) + except FileNotFoundError: + pass # is_cached(): # From 7dfb54177904bb77bc08cc882f0fe37da54d03bb Mon Sep 17 00:00:00 2001 From: Tristan van Berkom Date: Fri, 23 May 2025 14:40:06 +0900 Subject: [PATCH 3/4] tests/integration/cachedfail.py: Refactored to use its own individual project This approach has been applied mostly elsewhere but not yet in the integration tests. Since we are modifying integration tests, it is our responsibility to remodel the tests so that multiple tests do not share the same project directory data. --- .../cached-fail/elements/base-fail.bst | 9 ++ .../cached-fail/elements/base-success.bst | 8 + .../integration/cached-fail/elements/base.bst | 17 +++ .../depends-on-base-fail-expect-foo.bst | 9 ++ tests/integration/cached-fail/project.conf | 37 +++++ tests/integration/cachedfail.py | 138 +++--------------- 6 files changed, 103 insertions(+), 115 deletions(-) create mode 100644 tests/integration/cached-fail/elements/base-fail.bst create mode 100644 tests/integration/cached-fail/elements/base-success.bst create mode 100644 tests/integration/cached-fail/elements/base.bst create mode 100644 tests/integration/cached-fail/elements/depends-on-base-fail-expect-foo.bst create mode 100644 tests/integration/cached-fail/project.conf diff --git a/tests/integration/cached-fail/elements/base-fail.bst b/tests/integration/cached-fail/elements/base-fail.bst new file mode 100644 index 000000000..5b0248d12 --- /dev/null +++ b/tests/integration/cached-fail/elements/base-fail.bst @@ -0,0 +1,9 @@ +kind: script + +build-depends: +- base.bst + +config: + commands: + - touch %{install-root}/foo + - false diff --git a/tests/integration/cached-fail/elements/base-success.bst b/tests/integration/cached-fail/elements/base-success.bst new file mode 100644 index 000000000..2043911f6 --- /dev/null +++ b/tests/integration/cached-fail/elements/base-success.bst @@ -0,0 +1,8 @@ +kind: script + +build-depends: +- base.bst + +config: + commands: + - touch %{install-root}/foo diff --git a/tests/integration/cached-fail/elements/base.bst b/tests/integration/cached-fail/elements/base.bst new file mode 100644 index 000000000..c5833095d --- /dev/null +++ b/tests/integration/cached-fail/elements/base.bst @@ -0,0 +1,17 @@ +kind: import + +description: | + Alpine Linux base for tests + + Generated using the `tests/integration-tests/base/generate-base.sh` script. 
+ +sources: + - kind: tar + base-dir: '' + (?): + - arch == "x86-64": + ref: 3eb559250ba82b64a68d86d0636a6b127aa5f6d25d3601a79f79214dc9703639 + url: "alpine:integration-tests-base.v1.x86_64.tar.xz" + - arch == "aarch64": + ref: 431fb5362032ede6f172e70a3258354a8fd71fcbdeb1edebc0e20968c792329a + url: "alpine:integration-tests-base.v1.aarch64.tar.xz" diff --git a/tests/integration/cached-fail/elements/depends-on-base-fail-expect-foo.bst b/tests/integration/cached-fail/elements/depends-on-base-fail-expect-foo.bst new file mode 100644 index 000000000..792cf4c27 --- /dev/null +++ b/tests/integration/cached-fail/elements/depends-on-base-fail-expect-foo.bst @@ -0,0 +1,9 @@ +kind: script + +build-depends: +- base.bst +- base-fail.bst + +config: + commands: + - test -e /foo diff --git a/tests/integration/cached-fail/project.conf b/tests/integration/cached-fail/project.conf new file mode 100644 index 000000000..e8f7af487 --- /dev/null +++ b/tests/integration/cached-fail/project.conf @@ -0,0 +1,37 @@ +# Project config for frontend build test +name: test +min-version: 2.0 +element-path: elements + +aliases: + alpine: https://bst-integration-test-images.ams3.cdn.digitaloceanspaces.com/ + project_dir: file://{project_dir} + +plugins: +- origin: pip + package-name: sample-plugins + elements: + - autotools + +options: + linux: + type: bool + description: Whether to expect a linux platform + default: True + arch: + type: arch + description: Current architecture + variable: build_arch + values: + - x86-64 + - aarch64 + +environment: + TEST_VAR: pony + +split-rules: + test: + - | + /tests + - | + /tests/* diff --git a/tests/integration/cachedfail.py b/tests/integration/cachedfail.py index 2b5a86ba5..5d4fef3f2 100644 --- a/tests/integration/cachedfail.py +++ b/tests/integration/cachedfail.py @@ -30,43 +30,24 @@ pytestmark = pytest.mark.integration -DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "project") +DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "cached-fail") @pytest.mark.datafiles(DATA_DIR) @pytest.mark.skipif(not HAVE_SANDBOX, reason="Only available with a functioning sandbox") def test_build_checkout_cached_fail(cli, datafiles): project = str(datafiles) - element_path = os.path.join(project, "elements", "element.bst") checkout = os.path.join(cli.directory, "checkout") - # Write out our test target - element = { - "kind": "script", - "depends": [ - { - "filename": "base.bst", - "type": "build", - }, - ], - "config": { - "commands": [ - "touch %{install-root}/foo", - "false", - ], - }, - } - _yaml.roundtrip_dump(element, element_path) - # Try to build it, this should result in a failure that contains the content - result = cli.run(project=project, args=["build", "element.bst"]) + result = cli.run(project=project, args=["build", "base-fail.bst"]) result.assert_main_error(ErrorDomain.STREAM, None) # Assert that it's cached in a failed artifact - assert cli.get_element_state(project, "element.bst") == "failed" + assert cli.get_element_state(project, "base-fail.bst") == "failed" # Now check it out - result = cli.run(project=project, args=["artifact", "checkout", "element.bst", "--directory", checkout]) + result = cli.run(project=project, args=["artifact", "checkout", "base-fail.bst", "--directory", checkout]) result.assert_success() # Check that the checkout contains the file created before failure @@ -78,58 +59,20 @@ def test_build_checkout_cached_fail(cli, datafiles): @pytest.mark.skipif(not HAVE_SANDBOX, reason="Only available with a functioning sandbox") def 
test_build_depend_on_cached_fail(cli, datafiles): project = str(datafiles) - dep_path = os.path.join(project, "elements", "dep.bst") - target_path = os.path.join(project, "elements", "target.bst") - - dep = { - "kind": "script", - "depends": [ - { - "filename": "base.bst", - "type": "build", - }, - ], - "config": { - "commands": [ - "touch %{install-root}/foo", - "false", - ], - }, - } - _yaml.roundtrip_dump(dep, dep_path) - target = { - "kind": "script", - "depends": [ - { - "filename": "base.bst", - "type": "build", - }, - { - "filename": "dep.bst", - "type": "build", - }, - ], - "config": { - "commands": [ - "test -e /foo", - ], - }, - } - _yaml.roundtrip_dump(target, target_path) # Try to build it, this should result in caching a failure to build dep - result = cli.run(project=project, args=["build", "dep.bst"]) + result = cli.run(project=project, args=["build", "base-fail.bst"]) result.assert_main_error(ErrorDomain.STREAM, None) # Assert that it's cached in a failed artifact - assert cli.get_element_state(project, "dep.bst") == "failed" + assert cli.get_element_state(project, "base-fail.bst") == "failed" # Now we should fail because we've a cached fail of dep - result = cli.run(project=project, args=["build", "target.bst"]) + result = cli.run(project=project, args=["build", "depends-on-base-fail-expect-foo.bst"]) result.assert_main_error(ErrorDomain.STREAM, None) # Assert that it's not yet built, since one of its dependencies isn't ready. - assert cli.get_element_state(project, "target.bst") == "waiting" + assert cli.get_element_state(project, "depends-on-base-fail-expect-foo.bst") == "waiting" @pytest.mark.skipif(not HAVE_SANDBOX, reason="Only available with a functioning sandbox") @@ -218,38 +161,20 @@ def test_host_tools_errors_are_not_cached(cli, datafiles, tmp_path): os.symlink(utils._get_host_tool_internal("buildbox-casd", search_subprojects_dir="buildbox"), str(buildbox_casd)) project = str(datafiles) - element_path = os.path.join(project, "elements", "element.bst") - - # Write out our test target - element = { - "kind": "script", - "depends": [ - { - "filename": "base.bst", - "type": "build", - }, - ], - "config": { - "commands": [ - "true", - ], - }, - } - _yaml.roundtrip_dump(element, element_path) # Build without access to host tools, this will fail result1 = cli.run( project=project, - args=["build", "element.bst"], + args=["build", "base-success.bst"], env={"PATH": str(tmp_path.joinpath("bin"))}, ) result1.assert_task_error(ErrorDomain.SANDBOX, "unavailable-local-sandbox") - assert cli.get_element_state(project, "element.bst") == "buildable" + assert cli.get_element_state(project, "base-success.bst") == "buildable" # When rebuilding, this should work - result2 = cli.run(project=project, args=["build", "element.bst"]) + result2 = cli.run(project=project, args=["build", "base-success.bst"]) result2.assert_success() - assert cli.get_element_state(project, "element.bst") == "cached" + assert cli.get_element_state(project, "base-success.bst") == "cached" # Tests that failed builds will be retried if --retry-failed is specified @@ -261,7 +186,6 @@ def test_host_tools_errors_are_not_cached(cli, datafiles, tmp_path): @pytest.mark.parametrize("strict", (True, False), ids=["strict", "non-strict"]) def test_retry_failed(cli, tmpdir, datafiles, use_share, retry, strict): project = str(datafiles) - target_path = os.path.join(project, "elements", "target.bst") # Use separate cache directories for each iteration of this test # even though we're using cli_integration @@ -269,47 +193,31 
@@ def test_retry_failed(cli, tmpdir, datafiles, use_share, retry, strict): # Global nonstrict configuration ensures all commands will be non-strict cli.configure({"cachedir": cli.directory, "projects": {"test": {"strict": strict}}}) - def generate_target(): - return { - "kind": "manual", - "depends": [ - "base.bst", - ], - "config": { - "build-commands": [ - "test -e /foo", - ], - }, - } - with ExitStack() as stack: if use_share: share = stack.enter_context(create_artifact_share(os.path.join(str(tmpdir), "artifactshare"))) cli.configure({"artifacts": {"servers": [{"url": share.repo, "push": True}]}}) - target = generate_target() - _yaml.roundtrip_dump(target, target_path) - # Try to build it, this should result in caching a failure of the target - result = cli.run(project=project, args=["build", "target.bst"]) + result = cli.run(project=project, args=["build", "base-fail.bst"]) result.assert_main_error(ErrorDomain.STREAM, None) # Assert that it's cached in a failed artifact - assert cli.get_element_state(project, "target.bst") == "failed" + assert cli.get_element_state(project, "base-fail.bst") == "failed" if use_share: # Delete the local cache, provoke pulling of the failed build - cli.remove_artifact_from_cache(project, "target.bst") + cli.remove_artifact_from_cache(project, "base-fail.bst") # Assert that the failed build has been removed - assert cli.get_element_state(project, "target.bst") == "buildable" + assert cli.get_element_state(project, "base-fail.bst") == "buildable" # Even though we are in non-strict mode, the failed build should be retried if retry: - result = cli.run(project=project, args=["build", "--retry-failed", "target.bst"]) + result = cli.run(project=project, args=["build", "--retry-failed", "base-fail.bst"]) else: - result = cli.run(project=project, args=["build", "target.bst"]) + result = cli.run(project=project, args=["build", "base-fail.bst"]) # If we did not modify the cache key, we want to assert that we did not # in fact attempt to rebuild the failed artifact. @@ -319,17 +227,17 @@ def generate_target(): # result.assert_main_error(ErrorDomain.STREAM, None) if retry: - assert "target.bst" in result.get_built_elements() - assert "target.bst" in result.get_discarded_elements() + assert "base-fail.bst" in result.get_built_elements() + assert "base-fail.bst" in result.get_discarded_elements() else: - assert "target.bst" not in result.get_built_elements() - assert "target.bst" not in result.get_discarded_elements() + assert "base-fail.bst" not in result.get_built_elements() + assert "base-fail.bst" not in result.get_discarded_elements() if use_share: # Assert that we did indeed go through the motions of downloading the failed # build, and possibly discarded the failed artifact if the strong key did not match # - assert "target.bst" in result.get_pulled_elements() + assert "base-fail.bst" in result.get_pulled_elements() # Tests that failed builds will be retried in strict mode when dependencies have changed. From 8078dd78a2a25e1a3f31d3243c8664069d2aef5c Mon Sep 17 00:00:00 2001 From: Tristan van Berkom Date: Fri, 23 May 2025 14:42:48 +0900 Subject: [PATCH 4/4] tests/integration/cachedfail.py: Test scheduler behavior after failed builds Test that builds are no longer scheduled after a build fails with `--on-error quit` Also test that failed builds are *still* pushed after a build fails, even if no other builds are scheduled. 
This is a regression test for https://github.com/apache/buildstream/issues/1787 --- .../cached-fail/elements/base-also-fail.bst | 9 ++ .../elements/depends-on-two-failures.bst | 9 ++ tests/integration/cachedfail.py | 82 ++++++++++++++++++- 3 files changed, 98 insertions(+), 2 deletions(-) create mode 100644 tests/integration/cached-fail/elements/base-also-fail.bst create mode 100644 tests/integration/cached-fail/elements/depends-on-two-failures.bst diff --git a/tests/integration/cached-fail/elements/base-also-fail.bst b/tests/integration/cached-fail/elements/base-also-fail.bst new file mode 100644 index 000000000..5b0248d12 --- /dev/null +++ b/tests/integration/cached-fail/elements/base-also-fail.bst @@ -0,0 +1,9 @@ +kind: script + +build-depends: +- base.bst + +config: + commands: + - touch %{install-root}/foo + - false diff --git a/tests/integration/cached-fail/elements/depends-on-two-failures.bst b/tests/integration/cached-fail/elements/depends-on-two-failures.bst new file mode 100644 index 000000000..7dcea144c --- /dev/null +++ b/tests/integration/cached-fail/elements/depends-on-two-failures.bst @@ -0,0 +1,9 @@ +kind: script + +build-depends: +- base-fail.bst +- base-also-fail.bst + +config: + commands: + - test -e /foo diff --git a/tests/integration/cachedfail.py b/tests/integration/cachedfail.py index 5d4fef3f2..ea7708adf 100644 --- a/tests/integration/cachedfail.py +++ b/tests/integration/cachedfail.py @@ -24,8 +24,11 @@ from buildstream._testing import cli_integration as cli # pylint: disable=unused-import from buildstream._testing._utils.site import HAVE_SANDBOX -from tests.testutils import create_artifact_share - +from tests.testutils import ( + create_artifact_share, + assert_shared, + assert_not_shared, +) pytestmark = pytest.mark.integration @@ -422,3 +425,78 @@ def generate_target(): # Assert that it's no longer cached, and returns to a buildable state assert cli.get_element_state(project, "target.bst") == "buildable" + + +# Test that we do not keep scheduling builds after one build fails +# with `--builders 1` and `--on-error quit`. +# +@pytest.mark.datafiles(DATA_DIR) +@pytest.mark.skipif(not HAVE_SANDBOX, reason="Only available with a functioning sandbox") +def test_stop_building_after_failed(cli, datafiles): + project = str(datafiles) + + # Since integration tests share a local artifact cache (for test performance, avoiding + # downloading a runtime in every test), we reset the local cache state here by + # deleting the two failing artifacts we are testing with + # + cli.remove_artifact_from_cache(project, "base-fail.bst") + cli.remove_artifact_from_cache(project, "base-also-fail.bst") + + # Set only 1 builder, and explicitly configure `--on-error quit` + cli.configure({"scheduler": {"builders": 1, "on-error": "quit"}}) + + # Try to build it, this should result in only one failure + result = cli.run(project=project, args=["build", "depends-on-two-failures.bst"]) + result.assert_main_error(ErrorDomain.STREAM, None) + + # Assert that out of the two elements, only one of them failed, the other one is + # buildable because they both depend on base.bst which must have succeeded. + states = cli.get_element_states(project, ["base-fail.bst", "base-also-fail.bst"], deps="none") + assert "failed" in states.values() + assert "buildable" in states.values() + + +# Test that we do push the failed build artifact, but we do not keep scheduling +# builds after one build fails with `--builders 1` and `--on-error quit`. 
+# +@pytest.mark.skipif(not HAVE_SANDBOX, reason="Only available with a functioning sandbox") +@pytest.mark.datafiles(DATA_DIR) +def test_push_but_stop_building_after_failed(cli, tmpdir, datafiles): + project = str(datafiles) + + # Since integration tests share a local artifact cache (for test performance, avoiding + # downloading a runtime in every test), we reset the local cache state here by + # deleting the two failing artifacts we are testing with + # + cli.remove_artifact_from_cache(project, "base-fail.bst") + cli.remove_artifact_from_cache(project, "base-also-fail.bst") + + with create_artifact_share(os.path.join(str(tmpdir), "remote")) as share: + + # Set only 1 builder, and explicitly configure `--on-error quit` + cli.configure( + { + "scheduler": {"builders": 1, "on-error": "quit"}, + "artifacts": {"servers": [{"url": share.repo, "push": True}]}, + } + ) + + # Try to build it, this should result in only one failure + result = cli.run(project=project, args=["build", "depends-on-two-failures.bst"]) + result.assert_main_error(ErrorDomain.STREAM, None) + + # Assert that out of the two elements, only one of them failed, the other one is + # buildable because they both depend on base.bst which must have succeeded. + states = cli.get_element_states(project, ["base-fail.bst", "base-also-fail.bst"], deps="none") + assert "failed" in states.values() + assert "buildable" in states.values() + + # Assert that the failed build is cached in a failed artifact, and that the other build + # which would have failed, of course never made it to the artifact cache. + for element_name, state in states.items(): + if state == "buildable": + assert_not_shared(cli, share, project, element_name) + elif state == "failed": + assert_shared(cli, share, project, element_name) + else: + assert False, "Unreachable code reached !"
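The queue-selection behaviour described in PATCH 1/4 can be illustrated outside of BuildStream with a small, self-contained Python sketch. Everything below is a toy model, not part of the patches: `ToyQueue`, its `imperative` flag and the `enqueue`/`dequeue`/`harvest_jobs` methods are simplified stand-ins for the real scheduler API, and `select_queues_to_process()` only mirrors the selection logic added by the patch. It shows how elements keep flowing through every queue after a quit is requested, while jobs are only harvested from the queues after the imperative one; this is why a cached failed build can still reach the push queue, the behaviour exercised by the regression test in PATCH 4/4.

    # Illustrative sketch only -- a toy model of the queue-selection logic from
    # PATCH 1/4. The classes and helpers here are simplified assumptions and are
    # not the real BuildStream Queue/Scheduler API.
    from itertools import chain


    class ToyQueue:
        def __init__(self, name, imperative=False):
            self.name = name
            self.imperative = imperative
            self.accepted = []

        def enqueue(self, elements):
            # Accept elements handed over from the previous queue
            self.accepted.extend(elements)

        def dequeue(self):
            # In this toy model every accepted element is immediately ready to
            # move on to the next queue; the real scheduler is more selective.
            return list(self.accepted)

        def harvest_jobs(self):
            # The real scheduler returns ready Job objects; here we only report
            # which queue would have scheduled work for its accepted elements.
            return [f"{self.name}:{element}" for element in self.accepted]


    def select_queues_to_process(queues, queue_jobs):
        # Mirrors the selection logic added by PATCH 1/4
        if queue_jobs:
            # Scheduler.stop() was not called, consider all queues
            return queues
        for queue in queues:
            if queue.imperative:
                # Only the queues *after* the imperative one keep scheduling jobs
                return queues[queues.index(queue) + 1:]
        # No imperative queue was marked, stop queueing any jobs
        return []


    if __name__ == "__main__":
        queues = [ToyQueue("pull"), ToyQueue("build", imperative=True), ToyQueue("push")]
        queues_to_process = select_queues_to_process(queues, queue_jobs=False)

        # Elements are still pulled forward through *all* queues, so a finished
        # (e.g. failed but cached) build still reaches the push queue...
        elements = ["element.bst"]
        for queue in queues:
            queue.enqueue(elements)
            elements = list(queue.dequeue())

        # ...but jobs are only harvested from the post-imperative queues, so no
        # new build jobs are started.
        ready = list(chain.from_iterable(q.harvest_jobs() for q in reversed(queues_to_process)))
        print(ready)  # ['push:element.bst']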