From 9b73f48456ba9a702deddafa51d4f9f4d29729f2 Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 20:13:29 +0100 Subject: [PATCH 01/19] Bugfix: Prevent segfault when TMPDIR is not given --- src/slurm_plugin/slurm_plugin.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index 58ed086f..b4f7a888 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -618,11 +618,14 @@ static int handleExit(void *params, char **output_str) return -1; } - result = spindleExitBE(args.location); - if (result == -1) { - sdprintf(1, "ERROR: spindleExitBE returned and error on location %s\n", args.location); - return -1; + if (!args.location) { + sdprintf(2, "WARNING: spindleExitBE not called since location is NULL\n"); + } else { + result = spindleExitBE(args.location); + if (result == -1) { + sdprintf(1, "ERROR: spindleExitBE returned an error on location %s\n", args.location); + return -1; + } } - return 0; } From ba3cbb299e147ee2080825e13d18afe7aa26f069 Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 20:06:45 +0100 Subject: [PATCH 02/19] Bugfix: Fix segfault when trying to bind unforeseen symbol --- src/client/auditclient/redirect.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/client/auditclient/redirect.c b/src/client/auditclient/redirect.c index b237c841..51ce5d7c 100644 --- a/src/client/auditclient/redirect.c +++ b/src/client/auditclient/redirect.c @@ -36,6 +36,12 @@ ElfX_Addr client_call_binding(const char *symname, ElfX_Addr symvalue) if (!binding) return symvalue; + if (!binding->libc_func) { + debug_printf("%s: symname %s has no libc_func container\n", + __func__, symname); + return (ElfX_Addr) binding->spindle_func; + } + if (*binding->libc_func == NULL) *binding->libc_func = (void *) symvalue; @@ -44,4 +50,3 @@ ElfX_Addr client_call_binding(const char *symname, ElfX_Addr symvalue) 
else return symvalue; } - From 177b48fdd2d7f36cc7173193ac026e8367a1a475 Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 19:26:48 +0100 Subject: [PATCH 03/19] Bugfix: Actually use scontrol for host expansion if detected --- src/slurm_plugin/slurm_plugin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index b4f7a888..ff615a77 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -345,7 +345,7 @@ static char **get_hostlist(spank_t spank, unsigned int num_hosts) return NULL; } -#if defined(USE_SCONTROL) +#if defined(SCONTROL_BIN) hostlist = getHostsScontrol(num_hosts, short_hosts); #else hostlist = getHostsParse(num_hosts, short_hosts); From 8412d4f7f4ad74d5f34076be0b0a0046295624b4 Mon Sep 17 00:00:00 2001 From: Nicholas Chaimov Date: Thu, 6 Nov 2025 13:51:46 -0800 Subject: [PATCH 04/19] Bugfix: Fix argument propagation to Slurm SPANK plugin Co-authored-by: Norbert Eicker --- src/fe/startup/spindle_fe.cc | 6 +++--- src/slurm_plugin/slurm_plugin.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/fe/startup/spindle_fe.cc b/src/fe/startup/spindle_fe.cc index 31b29cf7..cb53023b 100644 --- a/src/fe/startup/spindle_fe.cc +++ b/src/fe/startup/spindle_fe.cc @@ -258,9 +258,9 @@ int fillInSpindleArgsCmdlineFE(spindle_args_t *params, unsigned int options, int for (i = 0; i < sargc && sargv && sargv[i] != NULL; i++) { mod_argv[i+1] = sargv[i]; } - i++; - mod_argv[i++] = const_cast<char *>("launcher"); - mod_argv[i] = NULL; + mod_argv[i+1] = const_cast<char *>("launcher"); + mod_argv[i+2] = NULL; + mod_argc = i+2; string errmsg; bool result = gatherAllConfigInfo(mod_argc, mod_argv, false, config, errmsg); diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index ff615a77..2da6055b 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -273,9 +273,9 @@ static int
process_spindle_args(spank_t spank, int site_argc, char *site_argv[], site_options_size = strlen(site_options); user_options_size = strlen(user_options); - combined_options_size = site_options_size + user_options_size + 2; + combined_options_size = site_options_size + user_options_size + 3; combined_options = (char *) malloc(combined_options_size); - snprintf(combined_options, combined_options_size, "%s%s%s", + snprintf(combined_options, combined_options_size, "%s%s%s ", site_options, (site_options_size && user_options_size) ? " " : "", user_options); @@ -301,7 +301,7 @@ static int process_spindle_args(spank_t spank, int site_argc, char *site_argv[], if (spindle_config) free(spindle_config); - if (combined_argv) { + if (!out_argv && combined_argv) { for (i = 0; i < combined_argc; i++) { if (combined_argv[i]) free(combined_argv[i]); From d696729348dd05141c229a1bc64d553871ef865f Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 20:11:27 +0100 Subject: [PATCH 05/19] Bugfix: Let debug report the correct assignment --- src/client/client/lookup_libc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/client/client/lookup_libc.c b/src/client/client/lookup_libc.c index 14f2408a..f25497be 100644 --- a/src/client/client/lookup_libc.c +++ b/src/client/client/lookup_libc.c @@ -212,7 +212,7 @@ int lookup_libc_symbols() } else { mallocfunc = (malloc_sig_t) (symtab[result].st_value + libc->l_addr); - debug_printf3("Bound errno_location to %p\n", app_errno_location); + debug_printf3("Bound mallocfunc to %p\n", mallocfunc); found++; } } From 3ec5ec6c9772c0c6d300cd4b91573bbf409f48a5 Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 19:59:12 +0100 Subject: [PATCH 06/19] Bugfix: Prevent memory leak --- src/slurm_plugin/slurm_plugin.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index 2da6055b..31e76258 100644 --- 
a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -432,7 +432,7 @@ static int launch_spindle(spank_t spank, spindle_args_t *params) int result; int is_fe_host = 0; int is_be_leader = 0; - unsigned int num_hosts; + unsigned int i, num_hosts; int num_hosts_result; int launch_result = -1; @@ -470,7 +470,8 @@ static int launch_spindle(spank_t spank, spindle_args_t *params) launch_result = 0; done: - if (hostlist) + if (hostlist) { + for (i = 0; i < num_hosts; i++) free(hostlist[i]); free(hostlist); return launch_result; From ad21381e9f1b8241daf352ce40a51a267071a0af Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 19:36:08 +0100 Subject: [PATCH 07/19] Actually debug print scontrol's command line --- src/slurm_plugin/plugin_utils.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/slurm_plugin/plugin_utils.c b/src/slurm_plugin/plugin_utils.c index 7112e090..49d2855e 100644 --- a/src/slurm_plugin/plugin_utils.c +++ b/src/slurm_plugin/plugin_utils.c @@ -46,7 +46,6 @@ char **getHostsScontrol(unsigned int num_hosts, const char *hoststr) scontrol_cmdline_len = strlen(scontrol_path) + strlen(scontrol_args) + strlen(hoststr) + strlen(scontrol_suffix) + 6; scontrol_cmdline = (char *) malloc(scontrol_cmdline_len); - sdprintf(2, "Running scontrol to get host list: %s\n", scontrol_cmdline); result = snprintf(scontrol_cmdline, scontrol_cmdline_len, "%s %s \"%s\" %s", scontrol_path, scontrol_args, hoststr, scontrol_suffix); if (result >= scontrol_cmdline_len) { @@ -54,6 +53,7 @@ char **getHostsScontrol(unsigned int num_hosts, const char *hoststr) scontrol_cmdline, result, scontrol_cmdline_len); goto done; } + sdprintf(2, "Running scontrol to get host list: %s\n", scontrol_cmdline); f = popen(scontrol_cmdline, "r"); if (!f) { From d4726a288c828e747f2ed29237464f48d30144cd Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 20:04:07 +0100 Subject: [PATCH 08/19] Cleanup the unique_file helping to identify 
the local BE process --- src/slurm_plugin/plugin_utils.c | 11 ++++++----- src/slurm_plugin/plugin_utils.h | 1 + src/slurm_plugin/slurm_plugin.c | 5 +++++ 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/slurm_plugin/plugin_utils.c b/src/slurm_plugin/plugin_utils.c index 49d2855e..ca44114b 100644 --- a/src/slurm_plugin/plugin_utils.c +++ b/src/slurm_plugin/plugin_utils.c @@ -172,12 +172,13 @@ int isFEHost(char **hostlist, unsigned int num_hosts) return feresult; } +char *unique_file = NULL; + #define UNIQUE_FILE_NAME "spindle_unique" int isBEProc(spindle_args_t *params) { char *dir = NULL, *expanded_dir = NULL, *realized_dir = NULL; - char *unique_file = NULL; char hostname[256], session_id_str[32]; size_t unique_file_len; int beproc_result = -1; @@ -218,9 +219,11 @@ int isBEProc(spindle_args_t *params) sdprintf(2, "Opened %s to result %d\n", unique_file, fd); if (fd != -1) beproc_result = 1; - else if (error == EEXIST) + else if (error == EEXIST) { beproc_result = 0; - else { + free(unique_file); + unique_file = NULL; + } else { sdprintf(1, "ERROR: Could not create spindle unique_file %s: %s\n", unique_file, strerror(error)); goto done; } @@ -230,8 +233,6 @@ int isBEProc(spindle_args_t *params) free(expanded_dir); if (realized_dir) free(realized_dir); - if (unique_file) - free(unique_file); if (fd != -1) close(fd); sdprintf(2, "returning %d\n", beproc_result); diff --git a/src/slurm_plugin/plugin_utils.h b/src/slurm_plugin/plugin_utils.h index 971d5707..d57f55da 100644 --- a/src/slurm_plugin/plugin_utils.h +++ b/src/slurm_plugin/plugin_utils.h @@ -42,6 +42,7 @@ char **getHostsScontrol(unsigned int num_hosts, const char *hoststr); char **getHostsParse(unsigned int num_hosts, const char *shortlist); int isFEHost(char **hostlist, unsigned int num_hosts); +extern char *unique_file; int isBEProc(spindle_args_t *params); char *encodeCmdArgs(int sargc, char **sargv); diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index 
31e76258..fa2aebf3 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -557,6 +557,11 @@ static int launchBE(spank_t spank, spindle_args_t *params) sdprintf(1, "ERROR: spindleRunBE failed\n"); else sdprintf(1, "spindleRunBE completed. Session finishing.\n"); + + if (unique_file) unlink(unique_file); + free(unique_file); + unique_file = NULL; + exit(result); return 0; From fc688976b91d299671b8f16309c6511f90c5f25d Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 20:09:08 +0100 Subject: [PATCH 09/19] Do not write 'fepid' file that is never read again --- src/slurm_plugin/slurm_plugin.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index fa2aebf3..f544afdc 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -500,7 +500,6 @@ static int launchFE(char **hostlist, spindle_args_t *params) } if (pidFE) { sdprintf(2, "Forked FE as pid %d\n", pidFE); - registerFEPid(pidFE, params); return 0; } From fa512800de3147aec1c63b7fb7b75634ea683098 Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 19:37:59 +0100 Subject: [PATCH 10/19] Let spindle helper escape to own process group - This saves them from premature killing by psmgmt --- src/logging/spindle_logc.c | 1 + src/slurm_plugin/plugin_utils.c | 1 + 2 files changed, 2 insertions(+) diff --git a/src/logging/spindle_logc.c b/src/logging/spindle_logc.c index 9e5edb1f..4e82300f 100644 --- a/src/logging/spindle_logc.c +++ b/src/logging/spindle_logc.c @@ -76,6 +76,7 @@ void spawnLogDaemon(char *tempdir) if (result == 0) { char *params[7]; int cur = 0; + setpgid(0, 0); /* escape to own process group */ params[cur++] = spindle_log_daemon_name; params[cur++] = tempdir; if (spindle_debug_prints) { diff --git a/src/slurm_plugin/plugin_utils.c b/src/slurm_plugin/plugin_utils.c index ca44114b..94e9a0cb 100644 --- a/src/slurm_plugin/plugin_utils.c +++ 
b/src/slurm_plugin/plugin_utils.c @@ -701,6 +701,7 @@ pid_t grandchild_fork() exit(result == sizeof(grandchild_pid) ? 0 : -1); } //In grandchild + setpgid(0, 0); /* escape to own process group */ fork_result = 0; goto done; From e82d89ce128ac4e054f39b1c287e5e9f03ec4f58 Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 19:46:42 +0100 Subject: [PATCH 11/19] Remove combined_argc, combined_argv which is never used --- src/slurm_plugin/slurm_plugin.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index f544afdc..98e43945 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -84,8 +84,6 @@ int slurm_spank_task_init(spank_t spank, int site_argc, char *site_argv[]) saved_env_t *env = NULL; static int initialized = 0; spindle_args_t params; - int combined_argc; - char **combined_argv; if (!enable_spindle) return 0; @@ -109,7 +107,7 @@ int slurm_spank_task_init(spank_t spank, int site_argc, char *site_argv[]) push_env(spank, &env); sdprintf(1, "Beginning spindle plugin\n"); - result = process_spindle_args(spank, site_argc, site_argv, &params, &combined_argc, &combined_argv); + result = process_spindle_args(spank, site_argc, site_argv, &params, NULL, NULL); if (result == -1) { sdprintf(1, "Error processesing spindle arguments. 
Aborting spindle\n"); goto done; From 4e9eafe83da76d71d4f81462f5de74f14ef4da27 Mon Sep 17 00:00:00 2001 From: Nicholas Chaimov Date: Thu, 6 Nov 2025 13:57:10 -0800 Subject: [PATCH 12/19] Add detection of sinfo binary for use in SPANK plugin Co-authored-by: Norbert Eicker --- Makefile.in | 1 + configure | 47 ++++++++++++++++++++++++++++++++++++ configure.ac | 6 +++++ doc/Makefile.in | 1 + src/flux/Makefile.in | 1 + src/slurm_plugin/Makefile.am | 2 +- src/slurm_plugin/Makefile.in | 3 ++- testsuite/Makefile.in | 1 + 8 files changed, 60 insertions(+), 2 deletions(-) diff --git a/Makefile.in b/Makefile.in index c4aac527..9e4b21b3 100644 --- a/Makefile.in +++ b/Makefile.in @@ -315,6 +315,7 @@ SCONTROL_ABSPATH = @SCONTROL_ABSPATH@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ +SINFO_ABSPATH = @SINFO_ABSPATH@ SRUN_PATH = @SRUN_PATH@ STATICFLAG = @STATICFLAG@ STRIP = @STRIP@ diff --git a/configure b/configure index aaa2f552..a9bcee84 100755 --- a/configure +++ b/configure @@ -638,6 +638,7 @@ LIBOBJS PKGSYSCONF_DIR BLD_SLURMPLUGIN_FALSE BLD_SLURMPLUGIN_TRUE +SINFO_ABSPATH SCONTROL_ABSPATH BE_host BE_CXXCPP @@ -18707,6 +18708,50 @@ fi if test "x$SCONTROL_ABSPATH" == "xnotfound"; then as_fn_error $? "Could not find scontrol" "$LINENO" 5 fi + # Extract the first word of "sinfo", so it can be a program name with args. +set dummy sinfo; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_path_SINFO_ABSPATH+:} false; then : + $as_echo_n "(cached) " >&6 +else + case $SINFO_ABSPATH in + [\\/]* | ?:[\\/]*) + ac_cv_path_SINFO_ABSPATH="$SINFO_ABSPATH" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_path_SINFO_ABSPATH="$as_dir/$ac_word$ac_exec_ext" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + + test -z "$ac_cv_path_SINFO_ABSPATH" && ac_cv_path_SINFO_ABSPATH="notfound" + ;; +esac +fi +SINFO_ABSPATH=$ac_cv_path_SINFO_ABSPATH +if test -n "$SINFO_ABSPATH"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $SINFO_ABSPATH" >&5 +$as_echo "$SINFO_ABSPATH" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + if test "x$SINFO_ABSPATH" == "xnotfound"; then + as_fn_error $? "Could not find sinfo" "$LINENO" 5 + fi PATH=$OPATH fi @@ -18720,6 +18765,8 @@ fi SCONTROL_ABSPATH=$SCONTROL_ABSPATH +SINFO_ABSPATH=$SINFO_ABSPATH + cat >confcache <<\_ACEOF diff --git a/configure.ac b/configure.ac index 637fe9de..fa553700 100644 --- a/configure.ac +++ b/configure.ac @@ -85,12 +85,18 @@ if test "x$ENABLE_SLURM_PLUGIN" == "xtrue"; then if test "x$SCONTROL_ABSPATH" == "xnotfound"; then AC_MSG_ERROR([Could not find scontrol]) fi + AC_PATH_PROG([SINFO_ABSPATH], [sinfo], [notfound]) + if test "x$SINFO_ABSPATH" == "xnotfound"; then + AC_MSG_ERROR([Could not find sinfo]) + fi PATH=$OPATH fi AM_CONDITIONAL([BLD_SLURMPLUGIN], [test "x$ENABLE_SLURM_PLUGIN" == "xtrue"]) AC_SUBST(SCONTROL_ABSPATH, $SCONTROL_ABSPATH) +AC_SUBST(SINFO_ABSPATH, $SINFO_ABSPATH) AC_SUBST(PKGSYSCONF_DIR) +AM_CONDITIONAL([BLD_FLUXPLUGIN], [test "x$ENABLE_FLUX_PLUGIN" = "xtrue"]) AC_OUTPUT diff --git a/doc/Makefile.in b/doc/Makefile.in index 3dc35cbb..78079922 100644 --- a/doc/Makefile.in +++ b/doc/Makefile.in @@ -253,6 +253,7 @@ SCONTROL_ABSPATH = @SCONTROL_ABSPATH@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ +SINFO_ABSPATH = @SINFO_ABSPATH@ SRUN_PATH = @SRUN_PATH@ STATICFLAG = @STATICFLAG@ STRIP = @STRIP@ diff --git a/src/flux/Makefile.in 
b/src/flux/Makefile.in index 18199543..e6733b24 100644 --- a/src/flux/Makefile.in +++ b/src/flux/Makefile.in @@ -361,6 +361,7 @@ SCONTROL_ABSPATH = @SCONTROL_ABSPATH@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ +SINFO_ABSPATH = @SINFO_ABSPATH@ SRUN_PATH = @SRUN_PATH@ STATICFLAG = @STATICFLAG@ STRIP = @STRIP@ diff --git a/src/slurm_plugin/Makefile.am b/src/slurm_plugin/Makefile.am index a9638b65..489dffcb 100644 --- a/src/slurm_plugin/Makefile.am +++ b/src/slurm_plugin/Makefile.am @@ -3,7 +3,7 @@ lib_LTLIBRARIES = libspindleslurm.la libver=`$(top_srcdir)/LIB_VERSION spindleslurm` libspindleslurm_la_SOURCES = encode_decode.c plugin_utils.c slurm_plugin.c $(top_srcdir)/src/utils/spindle_mkdir.c $(top_srcdir)/src/utils/parseloc.c -libspindleslurm_la_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/src/include -I$(top_srcdir)/src/logging -DUSE_PLUGIN_DEBUG -DDEBUG -DCUSTOM_GETENV -DCUSTOM_GETENV_FREE -DSCONTROL_BIN="$(SCONTROL_ABSPATH)" -DSPINDLE_DO_EXPORT +libspindleslurm_la_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/src/include -I$(top_srcdir)/src/logging -DUSE_PLUGIN_DEBUG -DDEBUG -DCUSTOM_GETENV -DCUSTOM_GETENV_FREE -DSCONTROL_BIN="$(SCONTROL_ABSPATH)" -DSINFO_BIN="$(SINFO_ABSPATH)" -DSPINDLE_DO_EXPORT libspindleslurm_la_LDFLAGS = $(AM_LDFLAGS) -ldl -version-info $(libver) libspindleslurm_la_LIBADD = $(top_builddir)/src/server/startup/libspindlebe.la $(top_builddir)/src/fe/startup/libspindlefe.la libspindleslurm_la_CFLAGS = $(CFLAGS) -fvisibility=hidden diff --git a/src/slurm_plugin/Makefile.in b/src/slurm_plugin/Makefile.in index c8488a96..a7fe188f 100644 --- a/src/slurm_plugin/Makefile.in +++ b/src/slurm_plugin/Makefile.in @@ -315,6 +315,7 @@ SCONTROL_ABSPATH = @SCONTROL_ABSPATH@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ +SINFO_ABSPATH = @SINFO_ABSPATH@ SRUN_PATH = @SRUN_PATH@ STATICFLAG = @STATICFLAG@ STRIP = @STRIP@ @@ -382,7 +383,7 @@ top_srcdir = @top_srcdir@ lib_LTLIBRARIES = libspindleslurm.la libver = `$(top_srcdir)/LIB_VERSION spindleslurm` 
libspindleslurm_la_SOURCES = encode_decode.c plugin_utils.c slurm_plugin.c $(top_srcdir)/src/utils/spindle_mkdir.c $(top_srcdir)/src/utils/parseloc.c -libspindleslurm_la_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/src/include -I$(top_srcdir)/src/logging -DUSE_PLUGIN_DEBUG -DDEBUG -DCUSTOM_GETENV -DCUSTOM_GETENV_FREE -DSCONTROL_BIN="$(SCONTROL_ABSPATH)" -DSPINDLE_DO_EXPORT +libspindleslurm_la_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/src/include -I$(top_srcdir)/src/logging -DUSE_PLUGIN_DEBUG -DDEBUG -DCUSTOM_GETENV -DCUSTOM_GETENV_FREE -DSCONTROL_BIN="$(SCONTROL_ABSPATH)" -DSINFO_BIN="$(SINFO_ABSPATH)" -DSPINDLE_DO_EXPORT libspindleslurm_la_LDFLAGS = $(AM_LDFLAGS) -ldl -version-info $(libver) libspindleslurm_la_LIBADD = $(top_builddir)/src/server/startup/libspindlebe.la $(top_builddir)/src/fe/startup/libspindlefe.la libspindleslurm_la_CFLAGS = $(CFLAGS) -fvisibility=hidden diff --git a/testsuite/Makefile.in b/testsuite/Makefile.in index 9c225322..511a6aca 100644 --- a/testsuite/Makefile.in +++ b/testsuite/Makefile.in @@ -273,6 +273,7 @@ SCONTROL_ABSPATH = @SCONTROL_ABSPATH@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ +SINFO_ABSPATH = @SINFO_ABSPATH@ SRUN_PATH = @SRUN_PATH@ STATICFLAG = @STATICFLAG@ STRIP = @STRIP@ From 22c38f894002a63d87507e21a2e4c7b53778d2a0 Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 19:58:06 +0100 Subject: [PATCH 13/19] Translate Slurm NodeName to NodeAddr for server communication setup --- src/slurm_plugin/plugin_utils.c | 85 +++++++++++++++++++++++++++++++++ src/slurm_plugin/plugin_utils.h | 1 + src/slurm_plugin/slurm_plugin.c | 14 +++++- 3 files changed, 99 insertions(+), 1 deletion(-) diff --git a/src/slurm_plugin/plugin_utils.c b/src/slurm_plugin/plugin_utils.c index 94e9a0cb..47d553cf 100644 --- a/src/slurm_plugin/plugin_utils.c +++ b/src/slurm_plugin/plugin_utils.c @@ -22,6 +22,11 @@ #else #define SLURM_SCONTROL_BIN "scontrol" #endif +#if defined SINFO_BIN +#define SLURM_SINFO_BIN STR(SINFO_BIN) +#else +#define 
SLURM_SINFO_BIN "sinfo" +#endif extern char *parse_location(char *loc, number_t number); @@ -119,6 +124,86 @@ char **getHostsScontrol(unsigned int num_hosts, const char *hoststr) return ret; } +char **getHostAddrSinfo(unsigned int num_hosts, char **hostlist) +{ + const char *sinfo_path = SLURM_SINFO_BIN; + const char *sinfo_args = "-O NodeAddr -h -n"; + const char *sinfo_suffix = "2> /dev/null"; + FILE *f = NULL; + char **hostaddrlist = NULL, *s, *sinfo_cmdline = NULL, **ret = NULL; + int i, j, hostnamelen; + int result; + size_t maxHostLen = 0, sinfo_cmdlineLen, len; + + hostaddrlist = calloc(num_hosts+1, sizeof(char*)); + + for (i = 0; i < num_hosts; i++) { + if (hostlist[i]) { + size_t thisLen = strlen(hostlist[i]); + if (thisLen > maxHostLen) maxHostLen = thisLen; + } + } + + sinfo_cmdlineLen = strlen(sinfo_path) + strlen(sinfo_args) + + maxHostLen + strlen(sinfo_suffix) + 6; + sinfo_cmdline = (char *) malloc(sinfo_cmdlineLen); + + for (i = 0; i < num_hosts; i++) { + if (!hostlist[i] || !strlen(hostlist[i])) goto done; + result = snprintf(sinfo_cmdline, sinfo_cmdlineLen, "%s %s \"%s\" %s", + sinfo_path, sinfo_args, hostlist[i], sinfo_suffix); + if (result >= sinfo_cmdlineLen) { + sdprintf(1, "ERROR: Formatting error creating sinfo cmdline '%s' (%d)\n", + sinfo_cmdline, result); + goto done; + } + sdprintf(2, "Running sinfo to get host address: %s\n", sinfo_cmdline); + + f = popen(sinfo_cmdline, "r"); + if (!f) { + sdprintf(1, "ERROR: Could not run sinfo: %s\n", sinfo_cmdline); + goto done; + } + + len = 0; + result = getline(&(hostaddrlist[i]), &len, f); + pclose(f); + if (result == -1) { + int error = errno; + sdprintf(1, "ERROR: Resolving '%s' failed: %s\n", hostlist[i], strerror(error)); + (void) error; + goto done; + } + + s = hostaddrlist[i]; + hostnamelen = strlen(s); + for (j = 0; j < hostnamelen; j++) { + if (!((s[j] >= '0' && s[j] <= '9') || + (s[j] >= 'a' && s[j] <= 'z') || + (s[j] >= 'A' && s[j] <= 'Z') || + (s[j] == '-' || s[j] == '_' || s[j] ==
'.'))) { + s[j] = '\0'; + break; + } + } + sdprintf(3, "sinfo returned hostaddr %s for %s\n", s, hostlist[i]); + } + if (i != num_hosts) { + sdprintf(1, "ERROR: expected %d hosts from sinfo. Got %d\n", num_hosts, i); + goto done; + } + + ret = hostaddrlist; + done: + if (sinfo_cmdline) + free(sinfo_cmdline); + if (!ret && hostaddrlist) { + for (i = 0; i < num_hosts; i++) free(hostaddrlist[i]); + free(hostaddrlist); /* release our own array; hostlist is owned (and later freed) by the caller */ + } + return ret; +} + int isFEHost(char **hostlist, unsigned int num_hosts) { char host[256]; diff --git a/src/slurm_plugin/plugin_utils.h b/src/slurm_plugin/plugin_utils.h index d57f55da..eaa0da12 100644 --- a/src/slurm_plugin/plugin_utils.h +++ b/src/slurm_plugin/plugin_utils.h @@ -39,6 +39,7 @@ int decodeSpindleConfig(const char *encodedstr, int *spindle_argc, char ***spindle_argv); char **getHostsScontrol(unsigned int num_hosts, const char *hoststr); +char **getHostAddrSinfo(unsigned int num_hosts, char **hostlist); char **getHostsParse(unsigned int num_hosts, const char *shortlist); int isFEHost(char **hostlist, unsigned int num_hosts); diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index 98e43945..700008a9 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -426,7 +426,7 @@ static int get_spindle_args(spank_t spank, spindle_args_t *params) static int launch_spindle(spank_t spank, spindle_args_t *params) { - char **hostlist = NULL; + char **hostlist = NULL, **hostaddrlist = NULL; int result; int is_fe_host = 0; int is_be_leader = 0; @@ -460,7 +460,14 @@ static int launch_spindle(spank_t spank, spindle_args_t *params) } if (is_fe_host && is_be_leader) { +#if defined(SINFO_BIN) + hostaddrlist = getHostAddrSinfo(num_hosts, hostlist); + if (!hostaddrlist) + goto done; + result = launchFE(hostaddrlist, params); +#else result = launchFE(hostlist, params); +#endif if (result == -1) goto done; } @@ -471,6 +478,11 @@ static int launch_spindle(spank_t spank, spindle_args_t *params) if (hostlist) { 
for (i = 0; i < num_hosts; i++) free(hostlist[i]); free(hostlist); + } + if (hostaddrlist) { + for (i = 0; i < num_hosts; i++) free(hostaddrlist[i]); + free(hostaddrlist); + } return launch_result; } From 6586e56d994d8e2bbe151520ab7d32019570739c Mon Sep 17 00:00:00 2001 From: Nicholas Chaimov Date: Thu, 6 Nov 2025 13:58:36 -0800 Subject: [PATCH 14/19] Check for SPANK's spank_prepend_task_argv() in configure Co-authored-by: Norbert Eicker --- config.h.in | 4 ++++ configure | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++ configure.ac | 1 + 3 files changed, 63 insertions(+) diff --git a/config.h.in b/config.h.in index a346ae96..8c7a94db 100644 --- a/config.h.in +++ b/config.h.in @@ -33,6 +33,10 @@ /* Slurm with via plugin is enabled */ #undef ENABLE_SLURM_PLUGIN +/* Define to 1 if you have the declaration of `spank_prepend_task_argv', and + to 0 if you don't. */ +#undef HAVE_DECL_SPANK_PREPEND_TASK_ARGV + /* Define to 1 if you have the <dlfcn.h> header file. */ #undef HAVE_DLFCN_H diff --git a/configure b/configure index a9bcee84..f04e5c2c 100755 --- a/configure +++ b/configure @@ -2213,6 +2213,52 @@ fi eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno } # ac_fn_c_check_header_mongrel + +# ac_fn_c_check_decl LINENO SYMBOL VAR INCLUDES +# --------------------------------------------- +# Tests whether SYMBOL is declared in INCLUDES, setting cache variable VAR +# accordingly. +ac_fn_c_check_decl () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + as_decl_name=`echo $2|sed 's/ *(.*//'` + as_decl_use=`echo $2|sed -e 's/(/((/' -e 's/)/) 0&/' -e 's/,/) 0& (/g'` + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $as_decl_name is declared" >&5 +$as_echo_n "checking whether $as_decl_name is declared... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +$4 +int +main () +{ +#ifndef $as_decl_name +#ifdef __cplusplus + (void) $as_decl_use; +#else + (void) $as_decl_name; +#endif +#endif + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + eval "$3=yes" +else + eval "$3=no" +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} # ac_fn_c_check_decl cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. @@ -18659,6 +18705,18 @@ fi CPPFLAGS=$OCPPFLAGS + ac_fn_c_check_decl "$LINENO" "spank_prepend_task_argv" "ac_cv_have_decl_spank_prepend_task_argv" "#include <slurm/spank.h> +" +if test "x$ac_cv_have_decl_spank_prepend_task_argv" = xyes; then : + ac_have_decl=1 +else + ac_have_decl=0 +fi + +cat >>confdefs.h <<_ACEOF +#define HAVE_DECL_SPANK_PREPEND_TASK_ARGV $ac_have_decl +_ACEOF + OPATH=$PATH if test "x$SLURM_DIR" != "x"; then diff --git a/configure.ac b/configure.ac index fa553700..b2c519d4 100644 --- a/configure.ac +++ b/configure.ac @@ -76,6 +76,7 @@ if test "x$ENABLE_SLURM_PLUGIN" == "xtrue"; then [], [AC_MSG_ERROR([Could not find slurm/spank.h])]) CPPFLAGS=$OCPPFLAGS + AC_CHECK_DECLS([spank_prepend_task_argv], [], [], [[#include <slurm/spank.h>]]) OPATH=$PATH if test "x$SLURM_DIR" != "x"; then From 326944ca59569f93c46764b7cd0c701d78e484df Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 20:35:56 +0100 Subject: [PATCH 15/19] Use spank_prepend_task_argv() to tweak task's argv - Requires Slurm 23.11 or later or a backport of this function --- src/slurm_plugin/slurm_plugin.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index 700008a9..643467eb 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -29,6 
+29,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include "spindle_launch.h" #include "plugin_utils.h" +#include "config.h" SPINDLE_EXPORT extern const char plugin_name[]; SPINDLE_EXPORT extern const char plugin_type[]; @@ -578,9 +579,13 @@ static int launchBE(spank_t spank, spindle_args_t *params) static int prepApp(spank_t spank, spindle_args_t *params) { +#if HAVE_DECL_SPANK_PREPEND_TASK_ARGV == 1 + int result; +#else int app_argc, result; char **app_argv; char *app_exe_name, *last_slash; +#endif spank_err_t err; int bootstrap_argc; char **bootstrap_argv; @@ -591,6 +596,16 @@ static int prepApp(spank_t spank, spindle_args_t *params) return -1; } +#if HAVE_DECL_SPANK_PREPEND_TASK_ARGV == 1 + sdprintf(2, "Prepping task process %d to run spindle\n", getpid()); + + const char **filter_argv = (const char **)bootstrap_argv; + err = spank_prepend_task_argv(spank, bootstrap_argc, filter_argv); + if (err != ESPANK_SUCCESS) { + sdprintf(1, "WARNING: Could not prepend spindle filter.\n"); + result = -1; + } +#else sdprintf(2, "Prepping app process %d to run spindle\n", getpid()); err = spank_get_item(spank, S_JOB_ARGV, &app_argc, &app_argv); @@ -605,6 +620,7 @@ static int prepApp(spank_t spank, spindle_args_t *params) } result = spindleHookSpindleArgsIntoExecBE(bootstrap_argc, bootstrap_argv, app_exe_name); +#endif if (result == -1) { sdprintf(1, "ERROR setting up app to run spindle. 
Spindle won't work\n"); return -1; From aad8df5ab408ddb8d5cc2ba523f1be10186f2eaa Mon Sep 17 00:00:00 2001 From: Nicholas Chaimov Date: Fri, 14 Nov 2025 10:27:03 -0800 Subject: [PATCH 16/19] Only free env_value if it was actually allocated --- src/utils/parseloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/utils/parseloc.c b/src/utils/parseloc.c index 0d8dcc81..00cdcf24 100644 --- a/src/utils/parseloc.c +++ b/src/utils/parseloc.c @@ -134,7 +134,9 @@ static char *parse_location_impl(char *loc, number_t number, int print_on_error) i += envvar_len + 1; j += env_value_len; #if defined(CUSTOM_GETENV) && defined(CUSTOM_GETENV_FREE) - free(env_value); + if(env_value != env_value_str) { + free(env_value); + } #endif } else { From 00b18681e776e3b474eb9989bae1d6c2e591efc6 Mon Sep 17 00:00:00 2001 From: Nicholas Chaimov Date: Tue, 30 Dec 2025 17:06:33 -0800 Subject: [PATCH 17/19] Don't require rsh launch when SPANK plugin is built; ensure shutdown is run exactly once on every proc --- configure | 14 ++++++++++- configure.common.ac | 2 +- src/client/configure | 2 +- src/fe/configure | 2 +- src/server/configure | 2 +- src/slurm_plugin/plugin_utils.c | 11 +++++---- src/slurm_plugin/plugin_utils.h | 2 +- src/slurm_plugin/slurm_plugin.c | 41 ++++++++++++++++++++------------- 8 files changed, 50 insertions(+), 26 deletions(-) diff --git a/configure b/configure index f04e5c2c..e9a4f95a 100755 --- a/configure +++ b/configure @@ -16958,7 +16958,7 @@ $as_echo "yes" >&6; } $as_echo "$as_me: WARNING: Slurm launching was explicitly requested, but slurm $srun_version, which is later than $bad_srun_major.$bad_srun_minor, was detected. This version of slurm breaks spindle daemon launch. You might still be able to get spindle to work by running jobs with srun's --overlap option. Or you could switch to having spindle launch daemons with rsh/ssh by passing the --with-rsh-launch option, and ensuring that rsh/ssh to nodes works on your cluster." 
>&2;} fi - if [ "x$ENABLE_RSH_LAUNCH" != "x1" ] && [ "x$ENABLE_SLURM_LAUNCH" != "x1" ] ; then + if [ "x$ENABLE_RSH_LAUNCH" != "x1" ] && [ "x$ENABLE_SLURM_LAUNCH" != "x1" ] && [ "x$ENABLE_SLURM_PLUGIN" != "xtrue" ] ; then if test "x$BROKEN_SRUN" == "x1"; then as_fn_error $? "Slurm support was requested, but slurm $srun_version, which is later than $bad_srun_major.$bad_srun_minor, was detected. This version of slurm breaks spindle daemon launch. You can disable this error message and build spindle with slurm-based daemon launching anyways by explicitly passing the --with-slurm-launch option (you might still be able to get spindle to work by running jobs with srun's --overlap option). Or you could switch to having spindle launch daemons with rsh/ssh by passing the --with-rsh-launch option, and ensuring that rsh/ssh to nodes works on your cluster." "$LINENO" 5 else @@ -18826,6 +18826,14 @@ SCONTROL_ABSPATH=$SCONTROL_ABSPATH SINFO_ABSPATH=$SINFO_ABSPATH + if test "x$ENABLE_FLUX_PLUGIN" = "xtrue"; then + BLD_FLUXPLUGIN_TRUE= + BLD_FLUXPLUGIN_FALSE='#' +else + BLD_FLUXPLUGIN_TRUE='#' + BLD_FLUXPLUGIN_FALSE= +fi + cat >confcache <<\_ACEOF # This file is a shell script that caches the results of configure @@ -19040,6 +19048,10 @@ if test -z "${BLD_SLURMPLUGIN_TRUE}" && test -z "${BLD_SLURMPLUGIN_FALSE}"; then as_fn_error $? "conditional \"BLD_SLURMPLUGIN\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${BLD_FLUXPLUGIN_TRUE}" && test -z "${BLD_FLUXPLUGIN_FALSE}"; then + as_fn_error $? "conditional \"BLD_FLUXPLUGIN\" was never defined. +Usually this means the macro was only invoked conditionally." 
"$LINENO" 5 +fi : "${CONFIG_STATUS=./config.status}" ac_write_fail=0 diff --git a/configure.common.ac b/configure.common.ac index 780aef00..e8d311f4 100644 --- a/configure.common.ac +++ b/configure.common.ac @@ -168,7 +168,7 @@ if test "x$ENABLE_SLURM" == "xtrue"; then AC_MSG_WARN([Slurm launching was explicitly requested, but slurm $srun_version, which is later than $bad_srun_major.$bad_srun_minor, was detected. This version of slurm breaks spindle daemon launch. You might still be able to get spindle to work by running jobs with srun's --overlap option. Or you could switch to having spindle launch daemons with rsh/ssh by passing the --with-rsh-launch option, and ensuring that rsh/ssh to nodes works on your cluster.]) fi - if [[ "x$ENABLE_RSH_LAUNCH" != "x1" ]] && [[ "x$ENABLE_SLURM_LAUNCH" != "x1" ]] ; then + if [[ "x$ENABLE_RSH_LAUNCH" != "x1" ]] && [[ "x$ENABLE_SLURM_LAUNCH" != "x1" ]] && [[ "x$ENABLE_SLURM_PLUGIN" != "xtrue" ]] ; then if test "x$BROKEN_SRUN" == "x1"; then AC_MSG_ERROR([Slurm support was requested, but slurm $srun_version, which is later than $bad_srun_major.$bad_srun_minor, was detected. This version of slurm breaks spindle daemon launch. You can disable this error message and build spindle with slurm-based daemon launching anyways by explicitly passing the --with-slurm-launch option (you might still be able to get spindle to work by running jobs with srun's --overlap option). Or you could switch to having spindle launch daemons with rsh/ssh by passing the --with-rsh-launch option, and ensuring that rsh/ssh to nodes works on your cluster.]) else diff --git a/src/client/configure b/src/client/configure index dbd69e1f..236528aa 100755 --- a/src/client/configure +++ b/src/client/configure @@ -12883,7 +12883,7 @@ $as_echo "yes" >&6; } $as_echo "$as_me: WARNING: Slurm launching was explicitly requested, but slurm $srun_version, which is later than $bad_srun_major.$bad_srun_minor, was detected. This version of slurm breaks spindle daemon launch. 
You might still be able to get spindle to work by running jobs with srun's --overlap option. Or you could switch to having spindle launch daemons with rsh/ssh by passing the --with-rsh-launch option, and ensuring that rsh/ssh to nodes works on your cluster." >&2;} fi - if [ "x$ENABLE_RSH_LAUNCH" != "x1" ] && [ "x$ENABLE_SLURM_LAUNCH" != "x1" ] ; then + if [ "x$ENABLE_RSH_LAUNCH" != "x1" ] && [ "x$ENABLE_SLURM_LAUNCH" != "x1" ] && [ "x$ENABLE_SLURM_PLUGIN" != "xtrue" ] ; then if test "x$BROKEN_SRUN" == "x1"; then as_fn_error $? "Slurm support was requested, but slurm $srun_version, which is later than $bad_srun_major.$bad_srun_minor, was detected. This version of slurm breaks spindle daemon launch. You can disable this error message and build spindle with slurm-based daemon launching anyways by explicitly passing the --with-slurm-launch option (you might still be able to get spindle to work by running jobs with srun's --overlap option). Or you could switch to having spindle launch daemons with rsh/ssh by passing the --with-rsh-launch option, and ensuring that rsh/ssh to nodes works on your cluster." "$LINENO" 5 else diff --git a/src/fe/configure b/src/fe/configure index 95e99784..75088c9a 100755 --- a/src/fe/configure +++ b/src/fe/configure @@ -16733,7 +16733,7 @@ $as_echo "yes" >&6; } $as_echo "$as_me: WARNING: Slurm launching was explicitly requested, but slurm $srun_version, which is later than $bad_srun_major.$bad_srun_minor, was detected. This version of slurm breaks spindle daemon launch. You might still be able to get spindle to work by running jobs with srun's --overlap option. Or you could switch to having spindle launch daemons with rsh/ssh by passing the --with-rsh-launch option, and ensuring that rsh/ssh to nodes works on your cluster." 
>&2;} fi - if [ "x$ENABLE_RSH_LAUNCH" != "x1" ] && [ "x$ENABLE_SLURM_LAUNCH" != "x1" ] ; then + if [ "x$ENABLE_RSH_LAUNCH" != "x1" ] && [ "x$ENABLE_SLURM_LAUNCH" != "x1" ] && [ "x$ENABLE_SLURM_PLUGIN" != "xtrue" ] ; then if test "x$BROKEN_SRUN" == "x1"; then as_fn_error $? "Slurm support was requested, but slurm $srun_version, which is later than $bad_srun_major.$bad_srun_minor, was detected. This version of slurm breaks spindle daemon launch. You can disable this error message and build spindle with slurm-based daemon launching anyways by explicitly passing the --with-slurm-launch option (you might still be able to get spindle to work by running jobs with srun's --overlap option). Or you could switch to having spindle launch daemons with rsh/ssh by passing the --with-rsh-launch option, and ensuring that rsh/ssh to nodes works on your cluster." "$LINENO" 5 else diff --git a/src/server/configure b/src/server/configure index e32c1cfe..a747c739 100755 --- a/src/server/configure +++ b/src/server/configure @@ -16730,7 +16730,7 @@ $as_echo "yes" >&6; } $as_echo "$as_me: WARNING: Slurm launching was explicitly requested, but slurm $srun_version, which is later than $bad_srun_major.$bad_srun_minor, was detected. This version of slurm breaks spindle daemon launch. You might still be able to get spindle to work by running jobs with srun's --overlap option. Or you could switch to having spindle launch daemons with rsh/ssh by passing the --with-rsh-launch option, and ensuring that rsh/ssh to nodes works on your cluster." >&2;} fi - if [ "x$ENABLE_RSH_LAUNCH" != "x1" ] && [ "x$ENABLE_SLURM_LAUNCH" != "x1" ] ; then + if [ "x$ENABLE_RSH_LAUNCH" != "x1" ] && [ "x$ENABLE_SLURM_LAUNCH" != "x1" ] && [ "x$ENABLE_SLURM_PLUGIN" != "xtrue" ] ; then if test "x$BROKEN_SRUN" == "x1"; then as_fn_error $? "Slurm support was requested, but slurm $srun_version, which is later than $bad_srun_major.$bad_srun_minor, was detected. This version of slurm breaks spindle daemon launch. 
You can disable this error message and build spindle with slurm-based daemon launching anyways by explicitly passing the --with-slurm-launch option (you might still be able to get spindle to work by running jobs with srun's --overlap option). Or you could switch to having spindle launch daemons with rsh/ssh by passing the --with-rsh-launch option, and ensuring that rsh/ssh to nodes works on your cluster." "$LINENO" 5 else diff --git a/src/slurm_plugin/plugin_utils.c b/src/slurm_plugin/plugin_utils.c index 47d553cf..3be45742 100644 --- a/src/slurm_plugin/plugin_utils.c +++ b/src/slurm_plugin/plugin_utils.c @@ -261,14 +261,14 @@ char *unique_file = NULL; #define UNIQUE_FILE_NAME "spindle_unique" -int isBEProc(spindle_args_t *params) +int isBEProc(spindle_args_t *params, unsigned int exit_phase) { - char *dir = NULL, *expanded_dir = NULL, *realized_dir = NULL; + char *dir = NULL, *expanded_dir = NULL, *realized_dir = NULL, *phase_name = NULL; char hostname[256], session_id_str[32]; size_t unique_file_len; int beproc_result = -1; int fd = -1, error; - + dir = params->location; if (!dir) { sdprintf(1, "ERROR: Location not filled in\n"); @@ -289,13 +289,16 @@ int isBEProc(spindle_args_t *params) snprintf(session_id_str, sizeof(session_id_str), "%lu", (unsigned long) params->number); + phase_name = exit_phase ? 
"exit" : "launch"; + unique_file_len = strlen(realized_dir) + 1 + strlen(UNIQUE_FILE_NAME) + 1 + + strlen(phase_name) + 1 + strlen(hostname) + 1 + strlen(session_id_str) + 1; unique_file = (char *) malloc(sizeof(char*) * unique_file_len); - snprintf(unique_file, unique_file_len, "%s/%s.%s.%s", realized_dir, UNIQUE_FILE_NAME, hostname, session_id_str); + snprintf(unique_file, unique_file_len, "%s/%s.%s.%s.%s", realized_dir, UNIQUE_FILE_NAME, phase_name, hostname, session_id_str); spindle_mkdir(realized_dir); diff --git a/src/slurm_plugin/plugin_utils.h b/src/slurm_plugin/plugin_utils.h index eaa0da12..acf2ac5d 100644 --- a/src/slurm_plugin/plugin_utils.h +++ b/src/slurm_plugin/plugin_utils.h @@ -44,7 +44,7 @@ char **getHostsParse(unsigned int num_hosts, const char *shortlist); int isFEHost(char **hostlist, unsigned int num_hosts); extern char *unique_file; -int isBEProc(spindle_args_t *params); +int isBEProc(spindle_args_t *params, unsigned int exit_phase); char *encodeCmdArgs(int sargc, char **sargv); void decodeCmdArgs(char *cmd, int *sargc, char ***sargv); diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index 643467eb..740b48ff 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -36,8 +36,10 @@ SPINDLE_EXPORT extern const char plugin_type[]; SPINDLE_EXPORT extern const unsigned int plugin_version; SPINDLE_EXPORT extern struct spank_option spank_options[]; SPINDLE_EXPORT int slurm_spank_task_init(spank_t spank, int ac, char *argv[]); +SPINDLE_EXPORT int slurm_spank_task_exit(spank_t spank, int ac, char *argv[]); SPINDLE_EXPORT int slurm_spank_exit(spank_t spank, int site_argc, char *site_argv[]); + SPANK_PLUGIN(spindle, 1) typedef struct { @@ -84,8 +86,8 @@ int slurm_spank_task_init(spank_t spank, int site_argc, char *site_argv[]) int result, func_result = -1; saved_env_t *env = NULL; static int initialized = 0; - spindle_args_t params; - + spindle_args_t params = {0}; + if (!enable_spindle) return 
0; @@ -136,7 +138,7 @@ int slurm_spank_task_init(spank_t spank, int site_argc, char *site_argv[]) return func_result; } -int slurm_spank_exit(spank_t spank, int site_argc, char *site_argv[]) +int slurm_spank_task_exit(spank_t spank, int site_argc, char *site_argv[]) { spank_context_t context; char *result_str; @@ -226,7 +228,7 @@ static int fillInArgs(spank_t spank, spindle_args_t *args, int argc, char **argv char *err_string; args->unique_id = unique_id; - args->number = (unsigned long) args->unique_id; + args->number = (number_t) args->unique_id; result = fillInSpindleArgsCmdlineFE(args, SPINDLE_FILLARGS_NOUNIQUEID | SPINDLE_FILLARGS_NONUMBER, argc, argv, &err_string); if (result == -1) { @@ -238,6 +240,8 @@ static int fillInArgs(spank_t spank, spindle_args_t *args, int argc, char **argv return -1; } args->opts |= OPT_BEEXIT; + args->use_launcher = slurm_plugin_launcher; + args->startup_type = startup_external; oldlocation = args->location; current_spank = spank; @@ -444,7 +448,7 @@ static int launch_spindle(spank_t spank, spindle_args_t *params) if (!hostlist) goto done; - is_be_leader = isBEProc(params); + is_be_leader = isBEProc(params, 0); if (is_be_leader == -1) goto done; @@ -516,7 +520,7 @@ static int launchFE(char **hostlist, spindle_args_t *params) superclose(); - sdprintf(1, "Initializing FE on pid %d with unqiue_id %lu\n", (int) getpid(), params->unique_id); + sdprintf(1, "Initializing FE on pid %d with unique_id %lu\n", (int) getpid(), params->unique_id); result = spindleInitFE((const char **) hostlist, params); if (result == -1) { sdprintf(1, "ERROR: Could not launch FE. 
Spindle will likely hang.\n"); @@ -597,7 +601,7 @@ static int prepApp(spank_t spank, spindle_args_t *params) } #if HAVE_DECL_SPANK_PREPEND_TASK_ARGV == 1 - sdprintf(2, "Prepping task process %d to run spindle\n", getpid()); + sdprintf(2, "Prepping task process %d to run spindle using spank_prepend_task_argv method\n", getpid()); const char **filter_argv = (const char **)bootstrap_argv; err = spank_prepend_task_argv(spank, bootstrap_argc, filter_argv); @@ -606,7 +610,7 @@ static int prepApp(spank_t spank, spindle_args_t *params) result = -1; } #else - sdprintf(2, "Prepping app process %d to run spindle\n", getpid()); + sdprintf(2, "Prepping app process %d to run spindle using spindleHookSpindleArgsIntoExecBE method\n", getpid()); err = spank_get_item(spank, S_JOB_ARGV, &app_argc, &app_argv); if (err != ESPANK_SUCCESS) { @@ -633,9 +637,9 @@ static int handleExit(void *params, char **output_str) { exit_params_t *exit_params; spank_t spank; - int site_argc, result; + int site_argc, result, is_be_leader; char **site_argv; - spindle_args_t args; + spindle_args_t args = {0}; exit_params = (exit_params_t *) params; spank = exit_params->spank; @@ -650,13 +654,18 @@ static int handleExit(void *params, char **output_str) } if (!args.location) { - sdprintf(2, "WARNING: spindleExitBE not called since location is NULL\n"); + sdprintf(2, "WARNING: spindleExitBE not called since location is NULL\n"); } else { - result = spindleExitBE(args.location); - if (result == -1) { - sdprintf(1, "ERROR: spindleExitBE returned an error on location %s\n", args.location); - return -1; - } + // The task_exit callback is run for _each proc_, so we use + // isBEProc to pick only one proc per node to call spindleExitBE. 
+ is_be_leader = isBEProc(&args, 1); + if (is_be_leader) { + result = spindleExitBE(args.location); + if (result == -1) { + sdprintf(1, "ERROR: spindleExitBE returned an error on location %s\n", args.location); + return -1; + } + } } return 0; } From aec4f50519fbd8084d88a9ff9de6447617261359 Mon Sep 17 00:00:00 2001 From: Nicholas Chaimov Date: Thu, 8 Jan 2026 15:52:17 -0800 Subject: [PATCH 18/19] Handle --level=off in SPANK plugin --- src/slurm_plugin/slurm_plugin.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index 740b48ff..c7bbcef2 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -116,6 +116,10 @@ int slurm_spank_task_init(spank_t spank, int site_argc, char *site_argv[]) goto done; } + if (params.opts & OPT_OFF) { + return 0; + } + result = launch_spindle(spank, &params); if (result == -1) { sdprintf(1, "Error launching spindle. Aborting spindle\n"); @@ -652,6 +656,10 @@ static int handleExit(void *params, char **output_str) sdprintf(1, "ERROR: Could not process spindle args in handleExit\n"); return -1; } + + if (args.opts & OPT_OFF) { + return 0; + } if (!args.location) { sdprintf(2, "WARNING: spindleExitBE not called since location is NULL\n"); From cada056160a425f278011a323e168c25ef8d3a89 Mon Sep 17 00:00:00 2001 From: Nicholas Chaimov Date: Mon, 12 Jan 2026 15:05:33 -0800 Subject: [PATCH 19/19] CI testing for SPANK plugin --- .github/workflows/ci.yml | 58 ++++++++ .../testing-plugin/Dockerfile | 42 ++++++ .../testing-plugin/conf/cgroup.conf | 1 + .../testing-plugin/conf/plugstack.conf | 1 + .../testing-plugin/conf/slurm.conf | 42 ++++++ .../conf/slurmdbd.conf.template | 10 ++ .../testing-plugin/docker-compose.yml | 126 ++++++++++++++++++ .../testing-plugin/generate_config.sh | 14 ++ .../testing-plugin/scripts/add_docker_user.sh | 10 ++ .../testing-plugin/scripts/build_spindle.sh | 9 ++ .../testing-plugin/scripts/entrypoint.sh | 21 +++
.../testing-plugin/scripts/setup_slurm.sh | 10 ++ .../scripts/setup_spank_plugin.sh | 5 + testsuite/run_driver_slurm-plugin | 20 +++ 14 files changed, 369 insertions(+) create mode 100644 containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile create mode 100644 containers/spindle-slurm-ubuntu/testing-plugin/conf/cgroup.conf create mode 100644 containers/spindle-slurm-ubuntu/testing-plugin/conf/plugstack.conf create mode 100644 containers/spindle-slurm-ubuntu/testing-plugin/conf/slurm.conf create mode 100644 containers/spindle-slurm-ubuntu/testing-plugin/conf/slurmdbd.conf.template create mode 100644 containers/spindle-slurm-ubuntu/testing-plugin/docker-compose.yml create mode 100755 containers/spindle-slurm-ubuntu/testing-plugin/generate_config.sh create mode 100755 containers/spindle-slurm-ubuntu/testing-plugin/scripts/add_docker_user.sh create mode 100755 containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh create mode 100755 containers/spindle-slurm-ubuntu/testing-plugin/scripts/entrypoint.sh create mode 100755 containers/spindle-slurm-ubuntu/testing-plugin/scripts/setup_slurm.sh create mode 100755 containers/spindle-slurm-ubuntu/testing-plugin/scripts/setup_spank_plugin.sh create mode 100644 testsuite/run_driver_slurm-plugin diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f465e09e..c290a735 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -163,3 +163,61 @@ jobs: cd containers/spindle-slurm-ubuntu/testing docker compose down + spindle-slurm-plugin-ubuntu: + name: Testsuite (Slurm Plugin, Ubuntu) + environment: Spindle CI + runs-on: ubuntu-latest + timeout-minutes: 20 + steps: + - name: Check out Spindle + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 + + - name: Setup Docker Compose + uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746 + with: + version: latest + + - name: Login to GitHub Container Registry + if: ${{ !env.ACT }} + uses: 
docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Generate MariaDB configuration + id: slurm-ubuntu-mariadb + run: | + cd containers/spindle-slurm-ubuntu/testing-plugin + ./generate_config.sh + + - name: Build spindle-slurm-plugin-ubuntu image + id: slurm-ubuntu-build + run: | + cd containers/spindle-slurm-ubuntu/testing-plugin + docker compose --progress=plain build + + - name: Bring spindle-slurm-plugin-ubuntu up + id: slurm-ubuntu-up + run: | + cd containers/spindle-slurm-ubuntu/testing-plugin + docker compose up -d --wait --wait-timeout 120 + + - name: Verify munge works in spindle-slurm-plugin-ubuntu + id: slurm-ubuntu-munge + run: | + docker exec slurm-plugin-head bash -c 'munge -n | unmunge' + + - name: Run spindle-slurm-plugin-ubuntu testsuite + id: slurm-ubuntu-testsuite + run: | + docker exec slurm-plugin-head bash -c 'cd Spindle-build/testsuite && salloc -n${workers} -N${workers} ./runTests ${workers}' + + - name: Bring spindle-slurm-plugin-ubuntu down + id: slurm-ubuntu-down + if: ${{ always() }} + continue-on-error: true + run: | + cd containers/spindle-slurm-ubuntu/testing-plugin + docker compose down + diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile b/containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile new file mode 100644 index 00000000..7b66a155 --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile @@ -0,0 +1,42 @@ +ARG BASE_VERSION=latest +FROM ghcr.io/llnl/spindle-slurm-base:${BASE_VERSION} +ARG replicas=4 +ENV workers=${replicas} + +ARG BUILD_ROOT=containers/spindle-slurm-ubuntu/testing-plugin + +# Slurm daemons run as $SLURM_USER +ARG SLURM_USER=slurm + +# Applications run as $USER +ARG USER=slurmuser +ARG UID=1001 + +# Set up the Slurm install already present in the base image +COPY ${BUILD_ROOT}/scripts/setup_slurm.sh /setup_slurm.sh +COPY ${BUILD_ROOT}/conf/slurm.conf 
/home/${SLURM_USER}/slurm.conf +COPY ${BUILD_ROOT}/conf/slurmdbd.conf /home/${SLURM_USER}/slurmdbd.conf +COPY ${BUILD_ROOT}/conf/cgroup.conf /home/${SLURM_USER}/cgroup.conf +RUN /setup_slurm.sh + +USER ${USER} +WORKDIR /home/${USER} + +# Copy the Spindle repo into the container and build it +RUN mkdir -p /home/${USER}/Spindle +COPY . /home/${USER}/Spindle +COPY ${BUILD_ROOT}/scripts/build_spindle.sh /home/${USER}/build_spindle.sh +RUN ./build_spindle.sh + +USER root +COPY ${BUILD_ROOT}/scripts/setup_spank_plugin.sh /setup_spank_plugin.sh +COPY ${BUILD_ROOT}/conf/plugstack.conf /home/${SLURM_USER}/plugstack.conf +RUN /setup_spank_plugin.sh + +USER ${USER} + +COPY ${BUILD_ROOT}/scripts/entrypoint.sh /home/${USER}/entrypoint.sh +ENV PATH /home/${USER}/Spindle-inst/bin:$PATH + +ENTRYPOINT /bin/bash ./entrypoint.sh + diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/conf/cgroup.conf b/containers/spindle-slurm-ubuntu/testing-plugin/conf/cgroup.conf new file mode 100644 index 00000000..e59e9aee --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-plugin/conf/cgroup.conf @@ -0,0 +1 @@ +CgroupPlugin=cgroup/v1 diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/conf/plugstack.conf b/containers/spindle-slurm-ubuntu/testing-plugin/conf/plugstack.conf new file mode 100644 index 00000000..a291c1c0 --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-plugin/conf/plugstack.conf @@ -0,0 +1 @@ +required /home/slurmuser/Spindle-inst/lib/libspindleslurm.so diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/conf/slurm.conf b/containers/spindle-slurm-ubuntu/testing-plugin/conf/slurm.conf new file mode 100644 index 00000000..abf060d5 --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-plugin/conf/slurm.conf @@ -0,0 +1,42 @@ +ClusterName=linux +ControlMachine=slurm-head +ControlAddr=slurm-head +SlurmUser=slurm +SlurmctldPort=6817 +SlurmdPort=6818 +AuthType=auth/munge +StateSaveLocation=/var/lib/slurmd +SlurmdSpoolDir=/var/spool/slurmd 
+SwitchType=switch/none +MpiDefault=none +SlurmctldPidFile=/var/run/slurmd/slurmctld.pid +SlurmdPidFile=/var/run/slurmd/slurmd.pid +ProctrackType=proctrack/linuxproc +TaskPlugin=task/affinity +ReturnToService=2 +SlurmctldTimeout=300 +SlurmdTimeout=300 +InactiveLimit=0 +MinJobAge=300 +KillWait=30 +Waittime=0 +SchedulerType=sched/backfill +SelectType=select/cons_tres +SelectTypeParameters=CR_Core_Memory +SlurmctldDebug=3 +SlurmctldLogFile=/var/log/slurm/slurmctld.log +SlurmdDebug=3 +SlurmdLogFile=/var/log/slurm/slurmd.log +JobCompType=jobcomp/filetxt +JobCompLoc=/var/log/slurm/jobcomp.log +JobAcctGatherType=jobacct_gather/linux +JobAcctGatherFrequency=30 +AccountingStorageType=accounting_storage/slurmdbd +AccountingStorageHost=slurm-db +AccountingStoragePort=6819 +NodeName=slurm-node-1 NodeAddr=slurm-node-1 CPUs=3 RealMemory=1000 State=UNKNOWN +NodeName=slurm-node-2 NodeAddr=slurm-node-2 CPUs=3 RealMemory=1000 State=UNKNOWN +NodeName=slurm-node-3 NodeAddr=slurm-node-3 CPUs=3 RealMemory=1000 State=UNKNOWN +NodeName=slurm-node-4 NodeAddr=slurm-node-4 CPUs=3 RealMemory=1000 State=UNKNOWN +PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP + diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/conf/slurmdbd.conf.template b/containers/spindle-slurm-ubuntu/testing-plugin/conf/slurmdbd.conf.template new file mode 100644 index 00000000..0e274118 --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-plugin/conf/slurmdbd.conf.template @@ -0,0 +1,10 @@ +AuthType=auth/munge +DbdAddr=slurm-db +DbdHost=slurm-db +SlurmUser=slurm +DebugLevel=4 +LogFile=/var/log/slurm/slurmdbd.log +PidFile=/var/run/slurmdbd/slurmdbd.pid +StorageType=accounting_storage/mysql +StorageHost=slurm-mariadb +StorageUser=slurm diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/docker-compose.yml b/containers/spindle-slurm-ubuntu/testing-plugin/docker-compose.yml new file mode 100644 index 00000000..2751a71a --- /dev/null +++ 
b/containers/spindle-slurm-ubuntu/testing-plugin/docker-compose.yml @@ -0,0 +1,126 @@ +# `replicas` must match the number of nodes defined in the services section +x-shared-workers: + &workers + replicas: 4 + +# Base image version to use +x-shared-build-args: &shared-build-args + BASE_VERSION: latest + <<: *workers + +# Docker prohibits copying files from outside of the build context. +# In order to be able to copy the whole repo into the container, +# we have to set the context to be the root of the repo. +# We then have to specify the path from there to the Dockerfile. +x-shared-build-context: &shared-build-context + context: ../../.. + dockerfile: containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile + args: *shared-build-args + +# Name of the head node +x-shared-environment: &shared-environment + SLURM_HEAD_NODE: slurm-head + <<: *workers + +# The entrypoint runs different services depending +# on the node's role. Valid options are: +# - worker: runs slurmd +# - db: runs slurmdbd +# - ctl: runs slurmctld +x-worker-environment: &worker-environment + SLURM_ROLE: worker + <<: *shared-environment + +networks: + slurm: + driver: bridge + +# Common parameters for all nodes. 
+x-shared-node-parameters: &shared-node-parameters + build: *shared-build-context + networks: + - slurm + cap_add: + - SYS_NICE # Required for libnuma + +x-healthcheck-parameters: &healthcheck-parameters + start_period: 3s + interval: 3s + timeout: 5s + retries: 5 + +x-worker-parameters: &worker-node-parameters + <<: *shared-node-parameters + environment: *worker-environment + depends_on: + slurm-head: + condition: service_healthy + healthcheck: + test: ["CMD", "stat", "/var/run/slurmd/slurmd.pid"] + <<: *healthcheck-parameters + +services: + slurm-mariadb: + image: mariadb:12 + networks: + - slurm + hostname: slurm-mariadb + container_name: slurm-plugin-mariadb + env_file: mariadb.env + environment: + MYSQL_RANDOM_ROOT_PASSWORD: "yes" + MYSQL_DATABASE: "slurm_acct_db" + MYSQL_USER: "slurm" + healthcheck: + test: ["CMD", "healthcheck.sh", "--connect", "--innodb_initialized"] + <<: *healthcheck-parameters + + slurm-db: + <<: *shared-node-parameters + hostname: slurm-db + container_name: slurm-plugin-db + environment: + SLURM_ROLE: db + <<: *shared-environment + depends_on: + slurm-mariadb: + condition: service_healthy + healthcheck: + test: ["CMD", "stat", "/var/run/slurmdbd/slurmdbd.pid"] + <<: *healthcheck-parameters + + slurm-head: + <<: *shared-node-parameters + hostname: slurm-head + container_name: slurm-plugin-head + tty: true + environment: + SLURM_ROLE: ctl + <<: *shared-environment + depends_on: + slurm-db: + condition: service_healthy + healthcheck: + test: ["CMD", "stat", "/var/run/slurmd/slurmctld.pid"] + <<: *healthcheck-parameters + + slurm-node-1: + <<: *worker-node-parameters + hostname: slurm-node-1 + container_name: slurm-plugin-node-1 + + slurm-node-2: + <<: *worker-node-parameters + hostname: slurm-node-2 + container_name: slurm-plugin-node-2 + + slurm-node-3: + <<: *worker-node-parameters + hostname: slurm-node-3 + container_name: slurm-plugin-node-3 + + slurm-node-4: + <<: *worker-node-parameters + hostname: slurm-node-4 + container_name: 
slurm-plugin-node-4 + diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/generate_config.sh b/containers/spindle-slurm-ubuntu/testing-plugin/generate_config.sh new file mode 100755 index 00000000..eadb1c8a --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-plugin/generate_config.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# Generate random password for the MariaDB slurm user +# and set it in config files + +APP_USER=${APP_USER:-slurmuser} +MARIADB_PASS=$(openssl rand --base64 16 | head -c -3) +echo "MARIADB_PASSWORD: \"${MARIADB_PASS}\"" > mariadb.env +cp conf/slurmdbd.conf.template conf/slurmdbd.conf +echo "StoragePass=${MARIADB_PASS}" >> conf/slurmdbd.conf + +# Enable Spindle SPANK plugin + +echo "required /home/${APP_USER}/Spindle-inst/lib/libspindleslurm.so" > conf/plugstack.conf diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/add_docker_user.sh b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/add_docker_user.sh new file mode 100755 index 00000000..ace8c619 --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/add_docker_user.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +set -euxo pipefail + +sudo groupadd -g ${UID} ${USER} +sudo useradd -g ${USER} -u ${UID} -d /home/${USER} -m ${USER} +# Allow user to run as other users so that munge can be started as the munge user +sudo sh -c "printf \"${USER} ALL=(ALL) NOPASSWD: ALL\\n\" >> /etc/sudoers" +sudo adduser ${USER} sudo +sudo usermod -s /bin/bash ${USER} + diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh new file mode 100755 index 00000000..2a252b32 --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +set -euxo pipefail + +mkdir -p /home/${USER}/Spindle-build +cd /home/${USER}/Spindle-build +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst 
--enable-sec-munge --with-rm=slurm-plugin --enable-slurm-plugin --with-localstorage=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +make -j$(nproc) +make install + diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/entrypoint.sh b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/entrypoint.sh new file mode 100755 index 00000000..54d40e48 --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/entrypoint.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +echo "SLURM_ROLE: ${SLURM_ROLE}" + +echo "Starting sshd..." +sudo service ssh start +echo "Starting munged..." +sudo -u munge /usr/sbin/munged + +if [ "${SLURM_ROLE}" = "db" ]; then + echo "Starting slurmdbd..." + sudo -u slurm /usr/sbin/slurmdbd -Dvvv +elif [ "${SLURM_ROLE}" = "ctl" ] ; then + echo "Starting slurmctld..." + sudo -u slurm /usr/sbin/slurmctld -i -Dvvv +elif [ "${SLURM_ROLE}" = "worker" ] ; then + echo "Starting slurmd..." + sudo /usr/sbin/slurmd -Dvvv +fi + +sleep inf diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/setup_slurm.sh b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/setup_slurm.sh new file mode 100755 index 00000000..186beea0 --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/setup_slurm.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +set -euxo pipefail + +mkdir -p /etc/slurm /etc/sysconfig/slurm /var/spool/slurmd /var/spool/slurmctld /var/run/slurmd /var/run/slurmdbd /var/lib/slurmd /var/log/slurm +touch /var/lib/slurmd/node_state /var/lib/slurmd/front_end_state /var/lib/slurmd/job_state /var/lib/slurmd/resv_state /var/lib/slurmd/trigger_state /var/lib/slurmd/assoc_mgr_state /var/lib/slurmd/assoc_usage /var/lib/slurmd/qos_usage /var/lib/slurmd/fed_mgr_state +cp /home/${SLURM_USER}/slurm.conf /etc/slurm/slurm.conf +cp /home/${SLURM_USER}/slurmdbd.conf /etc/slurm/slurmdbd.conf +cp /home/${SLURM_USER}/cgroup.conf /etc/slurm/cgroup.conf +chown -R slurm:slurm /etc/slurm /etc/sysconfig/slurm /var/spool/slurmd 
/var/spool/slurmctld /var/run/slurmd /var/run/slurmdbd /var/lib/slurmd /var/log/slurm +chmod 600 /etc/slurm/slurmdbd.conf diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/setup_spank_plugin.sh b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/setup_spank_plugin.sh new file mode 100755 index 00000000..7b6ea089 --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/setup_spank_plugin.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +set -euxo pipefail + +cp /home/${SLURM_USER}/plugstack.conf /etc/slurm/plugstack.conf +chown -R slurm:slurm /etc/slurm diff --git a/testsuite/run_driver_slurm-plugin b/testsuite/run_driver_slurm-plugin new file mode 100644 index 00000000..6d06cbbf --- /dev/null +++ b/testsuite/run_driver_slurm-plugin @@ -0,0 +1,20 @@ +#!/bin/bash + +if [ "x$SLURM_CPUS_ON_NODE" = x ] ; then +export SLURM_CPUS_ON_NODE=1 +fi + +if [ "x$SPINDLE_TEST_ARGS" = x ] ; then +((PROCS=$SLURM_NNODES*$SLURM_CPUS_ON_NODE)) +else +PROCS=$SPINDLE_TEST_ARGS +fi +export PROCS + +if [ "x$SPINDLE_BGQ_LD_PRELOAD" = "xtrue" ] ; then +PRELOAD_ARGS="--runjob-opts=--envs LD_PRELOAD=$LIBRARY_LIST" +elif [ "x$SPINDLE_LD_PRELOAD" != "x" ] ; then +PRELOAD_ARGS="--export=ALL,LD_PRELOAD=$SPINDLE_LD_PRELOAD" +fi + +exec srun $PRELOAD_ARGS --spindle="$SPINDLE_FLAGS $SPINDLE_OPTS" -n $PROCS "$@"