Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion scripts/check_images--help
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,20 @@
#
# Run `--help` in every locally present image and report the outcome:
#   stdout: "<image>: ok"                        when the run exits 0
#   stderr: "<image>: failed to execute --help"  when the run exits non-zero
#   stderr: "<image>: no runscript"              when the output mentions it
#
# The grep pattern is a regex, not a glob: '\.si' keeps paths containing
# ".si" (e.g. .sif/.simg/.sing), whereas '\.si*' would match any ".s".
git annex find --in here | grep '\.si' | while IFS= read -r f; do
    echo "D: testing $f"

    # Capture combined stdout/stderr and the exit code of a single run.
    # stdin is taken from /dev/null so the container cannot swallow the
    # remaining file names that the enclosing `while read` loop consumes.
    output=$(singularity run "$f" --help 2>&1 </dev/null)
    exit_code=$?

    # Check execution success
    if [ "$exit_code" -eq 0 ]; then
        echo "$f: ok"
    else
        echo "$f: failed to execute --help" >&2
    fi

    # Check for "no runscript" message (plain substring match; no need to
    # pipe the captured output through echo/grep)
    case "$output" in
        *"no runscript"*) echo "$f: no runscript" >&2 ;;
    esac
done
55 changes: 48 additions & 7 deletions scripts/create_singularities
Original file line number Diff line number Diff line change
Expand Up @@ -386,13 +386,54 @@ class DockerImage:
"""
if not self.imagefile.exists() and not self.imagefile.is_symlink():
tmpfile = self.imagefile.with_suffix(".sif.tmp")
builder.runcmd(
"singularity",
"pull",
"--disable-cache",
str(tmpfile),
self.docker_uri,
)

# Retry singularity pull on fatal errors (exit 255) with exponential backoff
# These are typically network-related issues (EOF, timeouts, etc.)
max_attempts = 3
sleepiter = exp_wait(attempts=max_attempts)
attempt = 0
while True:
attempt += 1
try:
builder.runcmd(
"singularity",
"pull",
"--disable-cache",
str(tmpfile),
self.docker_uri,
)
break # Success, exit retry loop
except subprocess.CalledProcessError as e:
# Retry on exit code 255 (singularity's code for fatal errors,
# often network-related: EOF, timeout, connection issues)
if e.returncode == 255:
# Clean up partial download if it exists
if tmpfile.exists():
tmpfile.unlink()

if (wait := next(sleepiter, None)) is not None:
log.warning(
"Singularity pull of %s failed on attempt %d/%d (exit %d); "
"sleeping for %f seconds and retrying",
self.docker_uri,
attempt,
max_attempts,
e.returncode,
wait,
)
sleep(wait)
else:
# Out of retries
log.error(
"Singularity pull of %s failed after %d attempts",
self.docker_uri,
attempt,
)
raise e
else:
# Non-255 exit code, don't retry
raise e

# Use cp with reflink to avoid permission issues
builder.runcmd("cp", "--reflink=auto", str(tmpfile), str(self.imagefile))
tmpfile.unlink()
Expand Down