Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
9b73f48
Bugfix: Prevent segfault when TMPDIR is not given
neicker Feb 19, 2024
ba3cbb2
Bugfix: Fix segfault when trying to bind unforeseen symbol
neicker Feb 19, 2024
177b48f
Bugfix: Actually use scontrol for host expansion if detected
neicker Feb 19, 2024
8412d4f
Bugfix: Fix argument propagation to Slurm SPANK plugin
nchaimov Nov 6, 2025
d696729
Bugfix: Let debug report the correct assignment
neicker Feb 19, 2024
3ec5ec6
Bugfix: Prevent memory leak
neicker Feb 19, 2024
ad21381
Actually debug print scontrol's command line
neicker Feb 19, 2024
d4726a2
Cleanup the unique_file helping to identify the local BE process
neicker Feb 19, 2024
fc68897
Do not write 'fepid' file that is never read again
neicker Feb 19, 2024
fa51280
Let spindle helper escape to own process group
neicker Feb 19, 2024
e82d89c
Remove combined_argc, combined_argv which is never used
neicker Feb 19, 2024
4e9eafe
Add detection of sinfo binary for use in SPANK plugin
nchaimov Nov 6, 2025
22c38f8
Translate Slurm NodeName to NodeAddr for server communication setup
neicker Feb 19, 2024
6586e56
Check for SPANK's spank_prepend_task_argv() in configure
nchaimov Nov 6, 2025
326944c
Use spank_prepend_task_argv() to tweak task's argv
neicker Feb 19, 2024
aad8df5
Only free env_value if it was actually allocated
nchaimov Nov 14, 2025
00b1868
Don't require rsh launch when SPANK plugin is built; ensure shutdown is
nchaimov Dec 31, 2025
aec4f50
Handle --level=off in SPANK plugin
nchaimov Jan 8, 2026
cada056
CI testing for SPANK plugin
nchaimov Jan 12, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -163,3 +163,61 @@ jobs:
cd containers/spindle-slurm-ubuntu/testing
docker compose down

# CI job for the Slurm SPANK-plugin flavour of Spindle: builds the test image
# with Docker Compose, brings the containers up, checks munge, runs the
# Spindle testsuite inside the slurm-plugin-head container, and always tears
# the containers down again.
spindle-slurm-plugin-ubuntu:
name: Testsuite (Slurm Plugin, Ubuntu)
environment: Spindle CI
runs-on: ubuntu-latest
timeout-minutes: 20
steps:
# Third-party actions below are pinned to full commit SHAs rather than tags.
- name: Check out Spindle
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8

- name: Setup Docker Compose
uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746
with:
version: latest

# Skipped whenever the ACT environment variable is set.
# NOTE(review): presumably ACT is set by the nektos/act local runner, where
# no registry credentials are available -- confirm.
- name: Login to GitHub Container Registry
if: ${{ !env.ACT }}
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

# Every compose-related step changes into the testing-plugin directory first,
# because each `run` block starts in the repository root.
- name: Generate MariaDB configuration
id: slurm-ubuntu-mariadb
run: |
cd containers/spindle-slurm-ubuntu/testing-plugin
./generate_config.sh

# --progress=plain keeps the build output as plain log lines.
- name: Build spindle-slurm-plugin-ubuntu image
id: slurm-ubuntu-build
run: |
cd containers/spindle-slurm-ubuntu/testing-plugin
docker compose --progress=plain build

# Start detached and block until the services report running/healthy, or
# fail once the 120 second wait timeout elapses.
- name: Bring spindle-slurm-plugin-ubuntu up
id: slurm-ubuntu-up
run: |
cd containers/spindle-slurm-ubuntu/testing-plugin
docker compose up -d --wait --wait-timeout 120

# Round-trips a credential through munge/unmunge inside the head container.
- name: Verify munge works in spindle-slurm-plugin-ubuntu
id: slurm-ubuntu-munge
run: |
docker exec slurm-plugin-head bash -c 'munge -n | unmunge'

# The command is single-quoted, so ${workers} is expanded by the bash running
# inside the container, not by the runner's shell.
# NOTE(review): workers is presumably the ENV set in the testing-plugin
# Dockerfile (ENV workers=${replicas}) -- confirm the head container is built
# from that image.
- name: Run spindle-slurm-plugin-ubuntu testsuite
id: slurm-ubuntu-testsuite
run: |
docker exec slurm-plugin-head bash -c 'cd Spindle-build/testsuite && salloc -n${workers} -N${workers} ./runTests ${workers}'

# Teardown runs even when earlier steps failed (always()), and a failure of
# the teardown itself does not fail the job (continue-on-error).
- name: Bring spindle-slurm-plugin-ubuntu down
id: slurm-ubuntu-down
if: ${{ always() }}
continue-on-error: true
run: |
cd containers/spindle-slurm-ubuntu/testing-plugin
docker compose down

1 change: 1 addition & 0 deletions Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,7 @@ SCONTROL_ABSPATH = @SCONTROL_ABSPATH@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
SINFO_ABSPATH = @SINFO_ABSPATH@
SRUN_PATH = @SRUN_PATH@
STATICFLAG = @STATICFLAG@
STRIP = @STRIP@
Expand Down
4 changes: 4 additions & 0 deletions config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@
/* Slurm support via plugin is enabled */
#undef ENABLE_SLURM_PLUGIN

/* Define to 1 if you have the declaration of `spank_prepend_task_argv', and
to 0 if you don't. */
#undef HAVE_DECL_SPANK_PREPEND_TASK_ARGV

/* Define to 1 if you have the <dlfcn.h> header file. */
#undef HAVE_DLFCN_H

Expand Down
119 changes: 118 additions & 1 deletion configure
Original file line number Diff line number Diff line change
Expand Up @@ -638,6 +638,7 @@ LIBOBJS
PKGSYSCONF_DIR
BLD_SLURMPLUGIN_FALSE
BLD_SLURMPLUGIN_TRUE
SINFO_ABSPATH
SCONTROL_ABSPATH
BE_host
BE_CXXCPP
Expand Down Expand Up @@ -2212,6 +2213,52 @@ fi
eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno

} # ac_fn_c_check_header_mongrel

# ac_fn_c_check_decl LINENO SYMBOL VAR INCLUDES
# ---------------------------------------------
# Tests whether SYMBOL is declared in INCLUDES, setting cache variable VAR
# accordingly.
#
# Arguments:
#   $1 LINENO   - caller's line number, used in the log messages
#   $2 SYMBOL   - identifier to test; may carry a parameter list, e.g. "f(int)"
#   $3 VAR      - name of the shell cache variable that receives "yes" or "no"
#   $4 INCLUDES - C source text (typically #include lines) placed ahead of the
#                 test program
# Outputs: a "checking whether ... is declared" message and the result,
#          written to both fd 5 and fd 6.
ac_fn_c_check_decl ()
{
# Keep an already-set as_lineno, otherwise take it from $1; the previous
# as_lineno_stack value is stored as an assignment string so the final eval
# can restore it.
as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
# Bare identifier: SYMBOL with everything from the first "(" onwards removed.
as_decl_name=`echo $2|sed 's/ *(.*//'`
# Call-style use of SYMBOL: each parameter type becomes a cast of 0, so
# "f(int, char)" turns into "f((int) 0, ( char) 0)". Only the __cplusplus
# branch of the test program below uses it.
as_decl_use=`echo $2|sed -e 's/(/((/' -e 's/)/) 0&/' -e 's/,/) 0& (/g'`
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $as_decl_name is declared" >&5
$as_echo_n "checking whether $as_decl_name is declared... " >&6; }
# If the cache variable named by $3 is already set, this expands to ": false"
# (succeeds) and the compile test is skipped; otherwise it runs "false".
if eval \${$3+:} false; then :
$as_echo_n "(cached) " >&6
else
# Build the test program from confdefs.h followed by the here-document.
# The #ifndef guard means a SYMBOL that is a preprocessor macro compiles
# without referencing it; otherwise the program fails to compile unless
# SYMBOL is declared.
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
$4
int
main ()
{
#ifndef $as_decl_name
#ifdef __cplusplus
(void) $as_decl_use;
#else
(void) $as_decl_name;
#endif
#endif

;
return 0;
}
_ACEOF
# ac_fn_c_try_compile is defined elsewhere in configure; its exit status
# decides the value stored in VAR.
if ac_fn_c_try_compile "$LINENO"; then :
eval "$3=yes"
else
eval "$3=no"
fi
# Remove the files left behind by the compile test.
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
fi
# Read the result back through the variable name held in $3 and report it.
eval ac_res=\$$3
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
$as_echo "$ac_res" >&6; }
# Restore the saved as_lineno_stack; as_lineno is unset only when the restored
# stack is empty (when it is non-empty the command becomes a ":" no-op).
eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno

} # ac_fn_c_check_decl
cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
Expand Down Expand Up @@ -16911,7 +16958,7 @@ $as_echo "yes" >&6; }
$as_echo "$as_me: WARNING: Slurm launching was explicitly requested, but slurm $srun_version, which is later than $bad_srun_major.$bad_srun_minor, was detected. This version of slurm breaks spindle daemon launch. You might still be able to get spindle to work by running jobs with srun's --overlap option. Or you could switch to having spindle launch daemons with rsh/ssh by passing the --with-rsh-launch option, and ensuring that rsh/ssh to nodes works on your cluster." >&2;}
fi

if [ "x$ENABLE_RSH_LAUNCH" != "x1" ] && [ "x$ENABLE_SLURM_LAUNCH" != "x1" ] ; then
if [ "x$ENABLE_RSH_LAUNCH" != "x1" ] && [ "x$ENABLE_SLURM_LAUNCH" != "x1" ] && [ "x$ENABLE_SLURM_PLUGIN" != "xtrue" ] ; then
if test "x$BROKEN_SRUN" == "x1"; then
as_fn_error $? "Slurm support was requested, but slurm $srun_version, which is later than $bad_srun_major.$bad_srun_minor, was detected. This version of slurm breaks spindle daemon launch. You can disable this error message and build spindle with slurm-based daemon launching anyways by explicitly passing the --with-slurm-launch option (you might still be able to get spindle to work by running jobs with srun's --overlap option). Or you could switch to having spindle launch daemons with rsh/ssh by passing the --with-rsh-launch option, and ensuring that rsh/ssh to nodes works on your cluster." "$LINENO" 5
else
Expand Down Expand Up @@ -18658,6 +18705,18 @@ fi


CPPFLAGS=$OCPPFLAGS
ac_fn_c_check_decl "$LINENO" "spank_prepend_task_argv" "ac_cv_have_decl_spank_prepend_task_argv" "#include<slurm/spank.h>
"
if test "x$ac_cv_have_decl_spank_prepend_task_argv" = xyes; then :
ac_have_decl=1
else
ac_have_decl=0
fi

cat >>confdefs.h <<_ACEOF
#define HAVE_DECL_SPANK_PREPEND_TASK_ARGV $ac_have_decl
_ACEOF


OPATH=$PATH
if test "x$SLURM_DIR" != "x"; then
Expand Down Expand Up @@ -18707,6 +18766,50 @@ fi
if test "x$SCONTROL_ABSPATH" == "xnotfound"; then
as_fn_error $? "Could not find scontrol" "$LINENO" 5
fi
# Extract the first word of "sinfo", so it can be a program name with args.
set dummy sinfo; ac_word=$2
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
$as_echo_n "checking for $ac_word... " >&6; }
if ${ac_cv_path_SINFO_ABSPATH+:} false; then :
$as_echo_n "(cached) " >&6
else
case $SINFO_ABSPATH in
[\\/]* | ?:[\\/]*)
ac_cv_path_SINFO_ABSPATH="$SINFO_ABSPATH" # Let the user override the test with a path.
;;
*)
as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
for as_dir in $PATH
do
IFS=$as_save_IFS
test -z "$as_dir" && as_dir=.
for ac_exec_ext in '' $ac_executable_extensions; do
if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
ac_cv_path_SINFO_ABSPATH="$as_dir/$ac_word$ac_exec_ext"
$as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
break 2
fi
done
done
IFS=$as_save_IFS

test -z "$ac_cv_path_SINFO_ABSPATH" && ac_cv_path_SINFO_ABSPATH="notfound"
;;
esac
fi
SINFO_ABSPATH=$ac_cv_path_SINFO_ABSPATH
if test -n "$SINFO_ABSPATH"; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $SINFO_ABSPATH" >&5
$as_echo "$SINFO_ABSPATH" >&6; }
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
fi


if test "x$SINFO_ABSPATH" == "xnotfound"; then
as_fn_error $? "Could not find sinfo" "$LINENO" 5
fi
PATH=$OPATH
fi

Expand All @@ -18720,6 +18823,16 @@ fi

SCONTROL_ABSPATH=$SCONTROL_ABSPATH

SINFO_ABSPATH=$SINFO_ABSPATH


if test "x$ENABLE_FLUX_PLUGIN" = "xtrue"; then
BLD_FLUXPLUGIN_TRUE=
BLD_FLUXPLUGIN_FALSE='#'
else
BLD_FLUXPLUGIN_TRUE='#'
BLD_FLUXPLUGIN_FALSE=
fi


cat >confcache <<\_ACEOF
Expand Down Expand Up @@ -18935,6 +19048,10 @@ if test -z "${BLD_SLURMPLUGIN_TRUE}" && test -z "${BLD_SLURMPLUGIN_FALSE}"; then
as_fn_error $? "conditional \"BLD_SLURMPLUGIN\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${BLD_FLUXPLUGIN_TRUE}" && test -z "${BLD_FLUXPLUGIN_FALSE}"; then
as_fn_error $? "conditional \"BLD_FLUXPLUGIN\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi

: "${CONFIG_STATUS=./config.status}"
ac_write_fail=0
Expand Down
7 changes: 7 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ if test "x$ENABLE_SLURM_PLUGIN" == "xtrue"; then
[],
[AC_MSG_ERROR([Could not find slurm/spank.h])])
CPPFLAGS=$OCPPFLAGS
AC_CHECK_DECLS([spank_prepend_task_argv], [], [], [[#include<slurm/spank.h>]])

OPATH=$PATH
if test "x$SLURM_DIR" != "x"; then
Expand All @@ -85,12 +86,18 @@ if test "x$ENABLE_SLURM_PLUGIN" == "xtrue"; then
if test "x$SCONTROL_ABSPATH" == "xnotfound"; then
AC_MSG_ERROR([Could not find scontrol])
fi
AC_PATH_PROG([SINFO_ABSPATH], [sinfo], [notfound])
if test "x$SINFO_ABSPATH" == "xnotfound"; then
AC_MSG_ERROR([Could not find sinfo])
fi
PATH=$OPATH
fi

AM_CONDITIONAL([BLD_SLURMPLUGIN], [test "x$ENABLE_SLURM_PLUGIN" == "xtrue"])
AC_SUBST(SCONTROL_ABSPATH, $SCONTROL_ABSPATH)
AC_SUBST(SINFO_ABSPATH, $SINFO_ABSPATH)
AC_SUBST(PKGSYSCONF_DIR)
AM_CONDITIONAL([BLD_FLUXPLUGIN], [test "x$ENABLE_FLUX_PLUGIN" = "xtrue"])

AC_OUTPUT

Expand Down
2 changes: 1 addition & 1 deletion configure.common.ac
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ if test "x$ENABLE_SLURM" == "xtrue"; then
AC_MSG_WARN([Slurm launching was explicitly requested, but slurm $srun_version, which is later than $bad_srun_major.$bad_srun_minor, was detected. This version of slurm breaks spindle daemon launch. You might still be able to get spindle to work by running jobs with srun's --overlap option. Or you could switch to having spindle launch daemons with rsh/ssh by passing the --with-rsh-launch option, and ensuring that rsh/ssh to nodes works on your cluster.])
fi

if [[ "x$ENABLE_RSH_LAUNCH" != "x1" ]] && [[ "x$ENABLE_SLURM_LAUNCH" != "x1" ]] ; then
if [[ "x$ENABLE_RSH_LAUNCH" != "x1" ]] && [[ "x$ENABLE_SLURM_LAUNCH" != "x1" ]] && [[ "x$ENABLE_SLURM_PLUGIN" != "xtrue" ]] ; then
if test "x$BROKEN_SRUN" == "x1"; then
AC_MSG_ERROR([Slurm support was requested, but slurm $srun_version, which is later than $bad_srun_major.$bad_srun_minor, was detected. This version of slurm breaks spindle daemon launch. You can disable this error message and build spindle with slurm-based daemon launching anyways by explicitly passing the --with-slurm-launch option (you might still be able to get spindle to work by running jobs with srun's --overlap option). Or you could switch to having spindle launch daemons with rsh/ssh by passing the --with-rsh-launch option, and ensuring that rsh/ssh to nodes works on your cluster.])
else
Expand Down
42 changes: 42 additions & 0 deletions containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Test image for the Spindle Slurm SPANK plugin.
#
# Starts from the spindle-slurm-base image, configures the Slurm install that
# is already present there, builds Spindle from the build context as an
# unprivileged user, installs the SPANK plugin configuration as root, and
# finally runs entrypoint.sh as the unprivileged user.
#
# The COPY paths are prefixed with BUILD_ROOT, and "COPY ." pulls in the whole
# Spindle checkout, so the build context is expected to be the repository
# root.
ARG BASE_VERSION=latest
FROM ghcr.io/llnl/spindle-slurm-base:${BASE_VERSION}

# Number of workers; exported as the "workers" environment variable so that
# commands run inside the container can read it.
ARG replicas=4
ENV workers=${replicas}

# Location of this directory relative to the build context.
ARG BUILD_ROOT=containers/spindle-slurm-ubuntu/testing-plugin

# Slurm daemons run as $SLURM_USER
ARG SLURM_USER=slurm

# Applications run as $USER
ARG USER=slurmuser
ARG UID=1001

# Set up the Slurm install already present in the base image
COPY ${BUILD_ROOT}/scripts/setup_slurm.sh /setup_slurm.sh
COPY ${BUILD_ROOT}/conf/slurm.conf /home/${SLURM_USER}/slurm.conf
COPY ${BUILD_ROOT}/conf/slurmdbd.conf /home/${SLURM_USER}/slurmdbd.conf
COPY ${BUILD_ROOT}/conf/cgroup.conf /home/${SLURM_USER}/cgroup.conf
RUN /setup_slurm.sh

USER ${USER}
WORKDIR /home/${USER}

# Copy the Spindle repo into the container and build it
RUN mkdir -p /home/${USER}/Spindle
COPY . /home/${USER}/Spindle
COPY ${BUILD_ROOT}/scripts/build_spindle.sh /home/${USER}/build_spindle.sh
RUN ./build_spindle.sh

# Installing the SPANK plugin configuration is done as root, after Spindle
# has been built.
USER root
COPY ${BUILD_ROOT}/scripts/setup_spank_plugin.sh /setup_spank_plugin.sh
COPY ${BUILD_ROOT}/conf/plugstack.conf /home/${SLURM_USER}/plugstack.conf
RUN /setup_spank_plugin.sh

# Drop back to the unprivileged user for runtime.
USER ${USER}

COPY ${BUILD_ROOT}/scripts/entrypoint.sh /home/${USER}/entrypoint.sh
# Put the installed Spindle binaries first on PATH. Uses the key=value form;
# the whitespace-separated "ENV key value" form is legacy syntax.
ENV PATH=/home/${USER}/Spindle-inst/bin:$PATH

# NOTE(review): shell-form ENTRYPOINT runs via "/bin/sh -c", so the script is
# not PID 1 and any CMD / compose "command:" arguments are ignored. Left as-is
# because switching to exec form would start passing those arguments to
# entrypoint.sh -- confirm against the compose file before changing.
ENTRYPOINT /bin/bash ./entrypoint.sh

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CgroupPlugin=cgroup/v1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
required /home/slurmuser/Spindle-inst/lib/libspindleslurm.so
42 changes: 42 additions & 0 deletions containers/spindle-slurm-ubuntu/testing-plugin/conf/slurm.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
ClusterName=linux
ControlMachine=slurm-head
ControlAddr=slurm-head
SlurmUser=slurm
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
StateSaveLocation=/var/lib/slurmd
SlurmdSpoolDir=/var/spool/slurmd
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
SlurmdPidFile=/var/run/slurmd/slurmd.pid
ProctrackType=proctrack/linuxproc
TaskPlugin=task/affinity
ReturnToService=2
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_Core_Memory
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurm/slurmd.log
JobCompType=jobcomp/filetxt
JobCompLoc=/var/log/slurm/jobcomp.log
JobAcctGatherType=jobacct_gather/linux
JobAcctGatherFrequency=30
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=slurm-db
AccountingStoragePort=6819
NodeName=slurm-node-1 NodeAddr=slurm-node-1 CPUs=3 RealMemory=1000 State=UNKNOWN
NodeName=slurm-node-2 NodeAddr=slurm-node-2 CPUs=3 RealMemory=1000 State=UNKNOWN
NodeName=slurm-node-3 NodeAddr=slurm-node-3 CPUs=3 RealMemory=1000 State=UNKNOWN
NodeName=slurm-node-4 NodeAddr=slurm-node-4 CPUs=3 RealMemory=1000 State=UNKNOWN
PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP

Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
AuthType=auth/munge
DbdAddr=slurm-db
DbdHost=slurm-db
SlurmUser=slurm
DebugLevel=4
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/var/run/slurmdbd/slurmdbd.pid
StorageType=accounting_storage/mysql
StorageHost=slurm-mariadb
StorageUser=slurm
Loading