From a79ead29d5a27a54a599d1dfe2d4358b7153d6e7 Mon Sep 17 00:00:00 2001
From: Yaroslav Halchenko
Date: Tue, 3 Feb 2026 14:14:40 -0500
Subject: [PATCH 1/2] Add retry logic for singularity pull failures

Retry singularity pull operations that fail with exit code 255, which
typically indicates network-related issues (unexpected EOF, timeouts,
connection failures). Uses exponential backoff with up to 3 attempts.
Cleans up partial downloads between retries and logs progress for
visibility.

Co-Authored-By: Claude Sonnet 4.5
---
 scripts/create_singularities | 57 +++++++++++++++++++++++++++++++-----
 1 file changed, 50 insertions(+), 7 deletions(-)

diff --git a/scripts/create_singularities b/scripts/create_singularities
index 83c22dcc9..b725ea5c1 100755
--- a/scripts/create_singularities
+++ b/scripts/create_singularities
@@ -386,13 +386,56 @@ class DockerImage:
         """
         if not self.imagefile.exists() and not self.imagefile.is_symlink():
             tmpfile = self.imagefile.with_suffix(".sif.tmp")
-            builder.runcmd(
-                "singularity",
-                "pull",
-                "--disable-cache",
-                str(tmpfile),
-                self.docker_uri,
-            )
+
+            # Retry singularity pull on fatal errors (exit 255) with exponential backoff
+            # These are typically network-related issues (EOF, timeouts, etc.)
+            # NOTE(review): "attempt %d/%d" below is only accurate if
+            # exp_wait(attempts=N) yields N-1 delays -- confirm against exp_wait.
+            max_attempts = 3
+            sleepiter = exp_wait(attempts=max_attempts)
+            attempt = 0
+            while True:
+                attempt += 1
+                try:
+                    builder.runcmd(
+                        "singularity",
+                        "pull",
+                        "--disable-cache",
+                        str(tmpfile),
+                        self.docker_uri,
+                    )
+                except subprocess.CalledProcessError as e:
+                    # Only exit code 255 (singularity's code for fatal errors,
+                    # often network-related: EOF, timeout, connection issues)
+                    # is retried; any other failure propagates immediately.
+                    if e.returncode != 255:
+                        raise
+
+                    # Clean up partial download if it exists
+                    tmpfile.unlink(missing_ok=True)
+
+                    if (wait := next(sleepiter, None)) is None:
+                        # Out of retries
+                        log.error(
+                            "Singularity pull of %s failed after %d attempts",
+                            self.docker_uri,
+                            attempt,
+                        )
+                        raise
+
+                    log.warning(
+                        "Singularity pull of %s failed on attempt %d/%d (exit %d); "
+                        "sleeping for %f seconds and retrying",
+                        self.docker_uri,
+                        attempt,
+                        max_attempts,
+                        e.returncode,
+                        wait,
+                    )
+                    sleep(wait)
+                else:
+                    break  # Success, exit retry loop
+
             # Use cp with reflink to avoid permission issues
             builder.runcmd("cp", "--reflink=auto", str(tmpfile), str(self.imagefile))
             tmpfile.unlink()

From 118a9937dd83dc79bec436b4491b8f892f64004e Mon Sep 17 00:00:00 2001
From: Yaroslav Halchenko
Date: Tue, 3 Feb 2026 15:01:30 -0500
Subject: [PATCH 2/2] Refactor check_images--help to use if/else instead of && ||

Replace complex chained && || construct with clear if/else statements for
better readability and maintainability.

Note that this also changes the visible output: the "$f: ok" message used to
be written into the pipe feeding `grep -q` and was therefore never shown; it
is now printed to stdout.

Co-Authored-By: Claude Sonnet 4.5
---
 scripts/check_images--help | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/scripts/check_images--help b/scripts/check_images--help
index 752ee571c..9527d33f1 100755
--- a/scripts/check_images--help
+++ b/scripts/check_images--help
@@ -5,5 +5,18 @@
 #
 git annex find --in here | grep '\.si*' | while read -r f; do
     echo "D: testing $f"
-    { singularity run "$f" --help 2>&1 && echo "$f: ok" || echo "$f: failed to execute --help" >&2; } | grep -q "no runscript" && echo "$f: no runscript" >&2 || :;
+
+    # Capture combined stdout/stderr and branch on the exit status directly;
+    # this also keeps working if the script runs under `set -e` (not visible here)
+    if output=$(singularity run "$f" --help 2>&1); then
+        echo "$f: ok"
+    else
+        echo "$f: failed to execute --help" >&2
+    fi
+
+    # Check for "no runscript" message; printf rather than echo so that output
+    # starting with a dash or containing backslashes is passed through verbatim
+    if printf '%s\n' "$output" | grep -q "no runscript"; then
+        echo "$f: no runscript" >&2
+    fi
 done