From c5efe1a9acb5adf27b45b1e75b72e1ce779cad83 Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 20:13:29 +0100 Subject: [PATCH 01/15] Bugfix: Prevent segfault when TMPDIR is not given --- src/slurm_plugin/slurm_plugin.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index a8886c51..2a2dfb33 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -618,11 +618,14 @@ static int handleExit(void *params, char **output_str) return -1; } - result = spindleExitBE(args.location); - if (result == -1) { - sdprintf(1, "ERROR: spindleExitBE returned and error on location %s\n", args.location); - return -1; + if (!args.location) { + sdprintf(2, "WARNING: spindleExitBE not called since location is NULL\n"); + } else { + result = spindleExitBE(args.location); + if (result == -1) { + sdprintf(1, "ERROR: spindleExitBE returned an error on location %s\n", args.location); + return -1; + } } - return 0; } From dc84e430db3255df589fe9cc38f5f444bf7af864 Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 20:06:45 +0100 Subject: [PATCH 02/15] Bugfix: Fix segfault when trying to bind unforeseen symbol --- src/client/auditclient/redirect.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/client/auditclient/redirect.c b/src/client/auditclient/redirect.c index fc97842f..8c58a30e 100644 --- a/src/client/auditclient/redirect.c +++ b/src/client/auditclient/redirect.c @@ -36,9 +36,14 @@ ElfX_Addr client_call_binding(const char *symname, ElfX_Addr symvalue) if (!binding) return symvalue; + if (!binding->libc_func) { + debug_printf("%s: symname %s has no libc_func container\n", + __func__, symname); + return (ElfX_Addr) binding->spindle_func; + } + if (*binding->libc_func == NULL) *binding->libc_func = (void *) symvalue; return (ElfX_Addr) binding->spindle_func; } - From a17dbf2726928a8406be94398c6739565375c214 
Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 19:26:48 +0100 Subject: [PATCH 03/15] Bugfix: Actually use scontrol for host expansion if detected --- src/slurm_plugin/slurm_plugin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index 2a2dfb33..1394800d 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -345,7 +345,7 @@ static char **get_hostlist(spank_t spank, unsigned int num_hosts) return NULL; } -#if defined(USE_SCONTROL) +#if defined(SCONTROL_BIN) hostlist = getHostsScontrol(num_hosts, short_hosts); #else hostlist = getHostsParse(num_hosts, short_hosts); From 7040869f6c4eecb89a02f8eaf670b3e2a4b5c7de Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 19:48:48 +0100 Subject: [PATCH 04/15] Bugfix: Fix argument propagation to Slurm SPANK plugin --- src/fe/startup/spindle_fe.cc | 5 +++-- src/slurm_plugin/slurm_plugin.c | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/fe/startup/spindle_fe.cc b/src/fe/startup/spindle_fe.cc index cad28cbd..93e2f0c3 100644 --- a/src/fe/startup/spindle_fe.cc +++ b/src/fe/startup/spindle_fe.cc @@ -254,13 +254,14 @@ int fillInSpindleArgsCmdlineFE(spindle_args_t *params, unsigned int options, int mod_argv[i+1] = sargv[i]; } mod_argv[i+1] = const_cast<char *>("launcher"); - mod_argv[i] = NULL; + mod_argv[i+2] = NULL; + mod_argc = i+2; opts = PARSECMD_FLAG_NOEXIT; opts |= PARSECMD_FLAG_CAPTUREIO; opts |= (options & SPINDLE_FILLARGS_NOUNIQUEID) ? PARSECMD_FLAG_NOUNIQUEID : 0; opts |= (options & SPINDLE_FILLARGS_NONUMBER) ? 
PARSECMD_FLAG_NONUMBER : 0; - result = parseCommandLine(i, mod_argv, params, opts, errstr); + result = parseCommandLine(mod_argc, mod_argv, params, opts, errstr); params->use_launcher = external_launcher; params->startup_type = startup_external; diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index 1394800d..75e4431e 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -273,9 +273,9 @@ static int process_spindle_args(spank_t spank, int site_argc, char *site_argv[], site_options_size = strlen(site_options); user_options_size = strlen(user_options); - combined_options_size = site_options_size + user_options_size + 2; + combined_options_size = site_options_size + user_options_size + 3; combined_options = (char *) malloc(combined_options_size); - snprintf(combined_options, combined_options_size, "%s%s%s", + snprintf(combined_options, combined_options_size, "%s%s%s ", site_options, (site_options_size && user_options_size) ? " " : "", user_options); @@ -301,7 +301,7 @@ static int process_spindle_args(spank_t spank, int site_argc, char *site_argv[], if (spindle_config) free(spindle_config); - if (combined_argv) { + if (!out_argv && combined_argv) { for (i = 0; i < combined_argc; i++) { if (combined_argv[i]) free(combined_argv[i]); From 33d362b71dbd9f46328f79dfe67fd1e491a4df52 Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 20:11:27 +0100 Subject: [PATCH 05/15] Bugfix: Let debug report the correct assignment --- src/client/client/lookup_libc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/client/client/lookup_libc.c b/src/client/client/lookup_libc.c index 71b9c0cf..884a6640 100644 --- a/src/client/client/lookup_libc.c +++ b/src/client/client/lookup_libc.c @@ -185,7 +185,7 @@ int lookup_libc_symbols() } else { mallocfunc = (malloc_sig_t) (symtab[result].st_value + libc->l_addr); - debug_printf3("Bound errno_location to %p\n", app_errno_location); + 
debug_printf3("Bound mallocfunc to %p\n", mallocfunc); found++; } From 29b4640a90fd634659ff6c9f82057c91b33274af Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 19:59:12 +0100 Subject: [PATCH 06/15] Bugfix: Prevent memory leak --- src/slurm_plugin/slurm_plugin.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index 75e4431e..1e541fb5 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -432,7 +432,7 @@ static int launch_spindle(spank_t spank, spindle_args_t *params) int result; int is_fe_host = 0; int is_be_leader = 0; - unsigned int num_hosts; + unsigned int i, num_hosts; int num_hosts_result; int launch_result = -1; @@ -470,7 +470,8 @@ static int launch_spindle(spank_t spank, spindle_args_t *params) launch_result = 0; done: - if (hostlist) + if (hostlist) { + for (i = 0; i < num_hosts; i++) free(hostlist[i]); free(hostlist); return launch_result; From a55fb1132d42e5c07ef78bc839e5dc6b9e6c0e81 Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 19:36:08 +0100 Subject: [PATCH 07/15] Actually debug print scontrol's command line --- src/slurm_plugin/plugin_utils.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/slurm_plugin/plugin_utils.c b/src/slurm_plugin/plugin_utils.c index 1042c369..24abfc9d 100644 --- a/src/slurm_plugin/plugin_utils.c +++ b/src/slurm_plugin/plugin_utils.c @@ -46,7 +46,6 @@ char **getHostsScontrol(unsigned int num_hosts, const char *hoststr) scontrol_cmdline_len = strlen(scontrol_path) + strlen(scontrol_args) + strlen(hoststr) + strlen(scontrol_suffix) + 6; scontrol_cmdline = (char *) malloc(scontrol_cmdline_len); - sdprintf(2, "Running scontrol to get host list: %s\n", scontrol_cmdline); result = snprintf(scontrol_cmdline, scontrol_cmdline_len, "%s %s \"%s\" %s", scontrol_path, scontrol_args, hoststr, scontrol_suffix); if (result >= scontrol_cmdline_len) { @@ -54,6 
+53,7 @@ char **getHostsScontrol(unsigned int num_hosts, const char *hoststr) scontrol_cmdline, result, scontrol_cmdline_len); goto done; } + sdprintf(2, "Running scontrol to get host list: %s\n", scontrol_cmdline); f = popen(scontrol_cmdline, "r"); if (!f) { From 0a4aea93ba2c7403a2951f60af59547d854c332f Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 20:04:07 +0100 Subject: [PATCH 08/15] Cleanup the unique_file helping to identify the local BE process --- src/slurm_plugin/plugin_utils.c | 11 ++++++----- src/slurm_plugin/plugin_utils.h | 1 + src/slurm_plugin/slurm_plugin.c | 5 +++++ 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/slurm_plugin/plugin_utils.c b/src/slurm_plugin/plugin_utils.c index 24abfc9d..d39cf49c 100644 --- a/src/slurm_plugin/plugin_utils.c +++ b/src/slurm_plugin/plugin_utils.c @@ -172,12 +172,13 @@ int isFEHost(char **hostlist, unsigned int num_hosts) return feresult; } +char *unique_file = NULL; + #define UNIQUE_FILE_NAME "spindle_unique" int isBEProc(spindle_args_t *params) { char *dir = NULL, *expanded_dir = NULL, *realized_dir = NULL; - char *unique_file = NULL; char hostname[256], session_id_str[32]; size_t unique_file_len; int beproc_result = -1; @@ -218,9 +219,11 @@ int isBEProc(spindle_args_t *params) sdprintf(2, "Opened %s to result %d\n", unique_file, fd); if (fd != -1) beproc_result = 1; - else if (error == EEXIST) + else if (error == EEXIST) { beproc_result = 0; - else { + free(unique_file); + unique_file = NULL; + } else { sdprintf(1, "ERROR: Could not create spindle unique_file %s: %s\n", unique_file, strerror(error)); goto done; } @@ -230,8 +233,6 @@ int isBEProc(spindle_args_t *params) free(expanded_dir); if (realized_dir) free(realized_dir); - if (unique_file) - free(unique_file); if (fd != -1) close(fd); sdprintf(2, "returning %d\n", beproc_result); diff --git a/src/slurm_plugin/plugin_utils.h b/src/slurm_plugin/plugin_utils.h index 971d5707..d57f55da 100644 --- 
a/src/slurm_plugin/plugin_utils.h +++ b/src/slurm_plugin/plugin_utils.h @@ -42,6 +42,7 @@ char **getHostsScontrol(unsigned int num_hosts, const char *hoststr); char **getHostsParse(unsigned int num_hosts, const char *shortlist); int isFEHost(char **hostlist, unsigned int num_hosts); +extern char *unique_file; int isBEProc(spindle_args_t *params); char *encodeCmdArgs(int sargc, char **sargv); diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index 1e541fb5..c1d363d5 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -557,6 +557,11 @@ static int launchBE(spank_t spank, spindle_args_t *params) sdprintf(1, "ERROR: spindleRunBE failed\n"); else sdprintf(1, "spindleRunBE completed. Session finishing.\n"); + + if (unique_file) unlink(unique_file); + free(unique_file); + unique_file = NULL; + exit(result); return 0; From a1e2910138ba3c1ebcec1017a29fa53ed067e256 Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 20:09:08 +0100 Subject: [PATCH 09/15] Do not write 'fepid' file that is never read again --- src/slurm_plugin/slurm_plugin.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index c1d363d5..93b209ef 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -500,7 +500,6 @@ static int launchFE(char **hostlist, spindle_args_t *params) } if (pidFE) { sdprintf(2, "Forked FE as pid %d\n", pidFE); - registerFEPid(pidFE, params); return 0; } From 0960e81fc5a510052561afa823da3f6fc8d007fc Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 19:37:59 +0100 Subject: [PATCH 10/15] Let spindle helper escape to own process group - This saves them from premature killing by psmgmt --- src/logging/spindle_logc.c | 1 + src/slurm_plugin/plugin_utils.c | 1 + 2 files changed, 2 insertions(+) diff --git a/src/logging/spindle_logc.c b/src/logging/spindle_logc.c index 4bedf6ab..2a936bdf 100644 --- 
a/src/logging/spindle_logc.c +++ b/src/logging/spindle_logc.c @@ -76,6 +76,7 @@ void spawnLogDaemon(char *tempdir) if (result == 0) { char *params[7]; int cur = 0; + setpgid(0, 0); /* escape to own process group */ params[cur++] = spindle_log_daemon_name; params[cur++] = tempdir; if (spindle_debug_prints) { diff --git a/src/slurm_plugin/plugin_utils.c b/src/slurm_plugin/plugin_utils.c index d39cf49c..9a4ec1e7 100644 --- a/src/slurm_plugin/plugin_utils.c +++ b/src/slurm_plugin/plugin_utils.c @@ -701,6 +701,7 @@ pid_t grandchild_fork() exit(result == sizeof(grandchild_pid) ? 0 : -1); } //In grandchild + setpgid(0, 0); /* escape to own process group */ fork_result = 0; goto done; From a45823a0c084430da1b641a5a6eda04cd9aad418 Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 19:46:42 +0100 Subject: [PATCH 11/15] Remove combined_argc, combined_argv which is never used --- src/slurm_plugin/slurm_plugin.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index 93b209ef..a747a91f 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -84,8 +84,6 @@ int slurm_spank_task_init(spank_t spank, int site_argc, char *site_argv[]) saved_env_t *env = NULL; static int initialized = 0; spindle_args_t params; - int combined_argc; - char **combined_argv; if (!enable_spindle) return 0; @@ -109,7 +107,7 @@ int slurm_spank_task_init(spank_t spank, int site_argc, char *site_argv[]) push_env(spank, &env); sdprintf(1, "Beginning spindle plugin\n"); - result = process_spindle_args(spank, site_argc, site_argv, &params, &combined_argc, &combined_argv); + result = process_spindle_args(spank, site_argc, site_argv, &params, NULL, NULL); if (result == -1) { sdprintf(1, "Error processesing spindle arguments. 
Aborting spindle\n"); goto done; From d77ed73cceb08d4d6839617cda3e6bd2f5bc9bd1 Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 19:53:16 +0100 Subject: [PATCH 12/15] Add detection of sinfo binary for use in SPANK plugin --- Makefile.in | 1 + configure | 47 ++++++++++++++++++++++++++++++++++++ configure.ac | 5 ++++ doc/Makefile.in | 1 + src/flux/Makefile.in | 1 + src/slurm_plugin/Makefile.am | 2 +- src/slurm_plugin/Makefile.in | 3 ++- testsuite/Makefile.in | 1 + 8 files changed, 59 insertions(+), 2 deletions(-) diff --git a/Makefile.in b/Makefile.in index 4f14a21b..ce04b249 100644 --- a/Makefile.in +++ b/Makefile.in @@ -312,6 +312,7 @@ SCONTROL_ABSPATH = @SCONTROL_ABSPATH@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ +SINFO_ABSPATH = @SINFO_ABSPATH@ SRUN_PATH = @SRUN_PATH@ STRIP = @STRIP@ TESTRM = @TESTRM@ diff --git a/configure b/configure index e697900d..0366dcd2 100755 --- a/configure +++ b/configure @@ -639,6 +639,7 @@ BLD_FLUXPLUGIN_FALSE BLD_FLUXPLUGIN_TRUE BLD_SLURMPLUGIN_FALSE BLD_SLURMPLUGIN_TRUE +SINFO_ABSPATH SCONTROL_ABSPATH BE_host BE_CXXCPP @@ -18423,6 +18424,50 @@ fi if test "x$SCONTROL_ABSPATH" == "xnotfound"; then as_fn_error $? "Could not find scontrol" "$LINENO" 5 fi + # Extract the first word of "sinfo", so it can be a program name with args. +set dummy sinfo; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_path_SINFO_ABSPATH+:} false; then : + $as_echo_n "(cached) " >&6 +else + case $SINFO_ABSPATH in + [\\/]* | ?:[\\/]*) + ac_cv_path_SINFO_ABSPATH="$SINFO_ABSPATH" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_path_SINFO_ABSPATH="$as_dir/$ac_word$ac_exec_ext" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + + test -z "$ac_cv_path_SINFO_ABSPATH" && ac_cv_path_SINFO_ABSPATH="notfound" + ;; +esac +fi +SINFO_ABSPATH=$ac_cv_path_SINFO_ABSPATH +if test -n "$SINFO_ABSPATH"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $SINFO_ABSPATH" >&5 +$as_echo "$SINFO_ABSPATH" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + if test "x$SINFO_ABSPATH" == "xnotfound"; then + as_fn_error $? "Could not find sinfo" "$LINENO" 5 + fi PATH=$OPATH fi @@ -18436,6 +18481,8 @@ fi SCONTROL_ABSPATH=$SCONTROL_ABSPATH +SINFO_ABSPATH=$SINFO_ABSPATH + if test "x$ENABLE_FLUX_PLUGIN" = "xtrue"; then BLD_FLUXPLUGIN_TRUE= diff --git a/configure.ac b/configure.ac index cc68cf30..3371b0eb 100644 --- a/configure.ac +++ b/configure.ac @@ -82,11 +82,16 @@ if test "x$ENABLE_SLURM_PLUGIN" == "xtrue"; then if test "x$SCONTROL_ABSPATH" == "xnotfound"; then AC_MSG_ERROR([Could not find scontrol]) fi + AC_PATH_PROG([SINFO_ABSPATH], [sinfo], [notfound]) + if test "x$SINFO_ABSPATH" == "xnotfound"; then + AC_MSG_ERROR([Could not find sinfo]) + fi PATH=$OPATH fi AM_CONDITIONAL([BLD_SLURMPLUGIN], [test "x$ENABLE_SLURM_PLUGIN" == "xtrue"]) AC_SUBST(SCONTROL_ABSPATH, $SCONTROL_ABSPATH) +AC_SUBST(SINFO_ABSPATH, $SINFO_ABSPATH) AM_CONDITIONAL([BLD_FLUXPLUGIN], [test "x$ENABLE_FLUX_PLUGIN" = "xtrue"]) diff --git a/doc/Makefile.in b/doc/Makefile.in index 2e73da04..f64d1170 100644 --- a/doc/Makefile.in +++ b/doc/Makefile.in @@ -250,6 +250,7 @@ SCONTROL_ABSPATH = @SCONTROL_ABSPATH@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ +SINFO_ABSPATH = @SINFO_ABSPATH@ SRUN_PATH = @SRUN_PATH@ STRIP = @STRIP@ TESTRM = @TESTRM@ diff --git a/src/flux/Makefile.in 
b/src/flux/Makefile.in index db3b0db0..21142761 100644 --- a/src/flux/Makefile.in +++ b/src/flux/Makefile.in @@ -305,6 +305,7 @@ SCONTROL_ABSPATH = @SCONTROL_ABSPATH@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ +SINFO_ABSPATH = @SINFO_ABSPATH@ SRUN_PATH = @SRUN_PATH@ STRIP = @STRIP@ TESTRM = @TESTRM@ diff --git a/src/slurm_plugin/Makefile.am b/src/slurm_plugin/Makefile.am index a9638b65..489dffcb 100644 --- a/src/slurm_plugin/Makefile.am +++ b/src/slurm_plugin/Makefile.am @@ -3,7 +3,7 @@ lib_LTLIBRARIES = libspindleslurm.la libver=`$(top_srcdir)/LIB_VERSION spindleslurm` libspindleslurm_la_SOURCES = encode_decode.c plugin_utils.c slurm_plugin.c $(top_srcdir)/src/utils/spindle_mkdir.c $(top_srcdir)/src/utils/parseloc.c -libspindleslurm_la_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/src/include -I$(top_srcdir)/src/logging -DUSE_PLUGIN_DEBUG -DDEBUG -DCUSTOM_GETENV -DCUSTOM_GETENV_FREE -DSCONTROL_BIN="$(SCONTROL_ABSPATH)" -DSPINDLE_DO_EXPORT +libspindleslurm_la_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/src/include -I$(top_srcdir)/src/logging -DUSE_PLUGIN_DEBUG -DDEBUG -DCUSTOM_GETENV -DCUSTOM_GETENV_FREE -DSCONTROL_BIN="$(SCONTROL_ABSPATH)" -DSINFO_BIN="$(SINFO_ABSPATH)" -DSPINDLE_DO_EXPORT libspindleslurm_la_LDFLAGS = $(AM_LDFLAGS) -ldl -version-info $(libver) libspindleslurm_la_LIBADD = $(top_builddir)/src/server/startup/libspindlebe.la $(top_builddir)/src/fe/startup/libspindlefe.la libspindleslurm_la_CFLAGS = $(CFLAGS) -fvisibility=hidden diff --git a/src/slurm_plugin/Makefile.in b/src/slurm_plugin/Makefile.in index 2f2606bf..e82ce952 100644 --- a/src/slurm_plugin/Makefile.in +++ b/src/slurm_plugin/Makefile.in @@ -312,6 +312,7 @@ SCONTROL_ABSPATH = @SCONTROL_ABSPATH@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ +SINFO_ABSPATH = @SINFO_ABSPATH@ SRUN_PATH = @SRUN_PATH@ STRIP = @STRIP@ TESTRM = @TESTRM@ @@ -378,7 +379,7 @@ top_srcdir = @top_srcdir@ lib_LTLIBRARIES = libspindleslurm.la libver = `$(top_srcdir)/LIB_VERSION spindleslurm` 
libspindleslurm_la_SOURCES = encode_decode.c plugin_utils.c slurm_plugin.c $(top_srcdir)/src/utils/spindle_mkdir.c $(top_srcdir)/src/utils/parseloc.c -libspindleslurm_la_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/src/include -I$(top_srcdir)/src/logging -DUSE_PLUGIN_DEBUG -DDEBUG -DCUSTOM_GETENV -DCUSTOM_GETENV_FREE -DSCONTROL_BIN="$(SCONTROL_ABSPATH)" -DSPINDLE_DO_EXPORT +libspindleslurm_la_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/src/include -I$(top_srcdir)/src/logging -DUSE_PLUGIN_DEBUG -DDEBUG -DCUSTOM_GETENV -DCUSTOM_GETENV_FREE -DSCONTROL_BIN="$(SCONTROL_ABSPATH)" -DSINFO_BIN="$(SINFO_ABSPATH)" -DSPINDLE_DO_EXPORT libspindleslurm_la_LDFLAGS = $(AM_LDFLAGS) -ldl -version-info $(libver) libspindleslurm_la_LIBADD = $(top_builddir)/src/server/startup/libspindlebe.la $(top_builddir)/src/fe/startup/libspindlefe.la libspindleslurm_la_CFLAGS = $(CFLAGS) -fvisibility=hidden diff --git a/testsuite/Makefile.in b/testsuite/Makefile.in index 7ee5d244..fa821edd 100644 --- a/testsuite/Makefile.in +++ b/testsuite/Makefile.in @@ -270,6 +270,7 @@ SCONTROL_ABSPATH = @SCONTROL_ABSPATH@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ +SINFO_ABSPATH = @SINFO_ABSPATH@ SRUN_PATH = @SRUN_PATH@ STRIP = @STRIP@ TESTRM = @TESTRM@ From 316198b28942b69aa6757c0bc676966e314bb555 Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 19:58:06 +0100 Subject: [PATCH 13/15] Translate Slurm NodeName to NodeAddr for server communication setup --- src/slurm_plugin/plugin_utils.c | 85 +++++++++++++++++++++++++++++++++ src/slurm_plugin/plugin_utils.h | 1 + src/slurm_plugin/slurm_plugin.c | 14 +++++- 3 files changed, 99 insertions(+), 1 deletion(-) diff --git a/src/slurm_plugin/plugin_utils.c b/src/slurm_plugin/plugin_utils.c index 9a4ec1e7..ddc97004 100644 --- a/src/slurm_plugin/plugin_utils.c +++ b/src/slurm_plugin/plugin_utils.c @@ -22,6 +22,11 @@ #else #define SLURM_SCONTROL_BIN "scontrol" #endif +#if defined SINFO_BIN +#define SLURM_SINFO_BIN STR(SINFO_BIN) +#else +#define 
SLURM_SINFO_BIN "sinfo" +#endif extern char *parse_location(char *loc); @@ -119,6 +124,86 @@ char **getHostsScontrol(unsigned int num_hosts, const char *hoststr) return ret; } +char **getHostAddrSinfo(unsigned int num_hosts, char **hostlist) +{ + const char *sinfo_path = SLURM_SINFO_BIN; + const char *sinfo_args = "-O NodeAddr -h -n"; + const char *sinfo_suffix = "2> /dev/null"; + FILE *f = NULL; + char **hostaddrlist = NULL, *s, *sinfo_cmdline = NULL, **ret = NULL; + int i, j, hostnamelen; + int result; + size_t maxHostLen = 0, sinfo_cmdlineLen, len; + + hostaddrlist = calloc(num_hosts+1, sizeof(char*)); + + for (i = 0; i < num_hosts; i++) { + if (hostlist[i]) { + size_t thisLen = strlen(hostlist[i]); + if (thisLen > maxHostLen) maxHostLen = thisLen; + } + } + + sinfo_cmdlineLen = strlen(sinfo_path) + strlen(sinfo_args) + + maxHostLen + strlen(sinfo_suffix) + 6; + sinfo_cmdline = (char *) malloc(sinfo_cmdlineLen); + + for (i = 0; i < num_hosts; i++) { + if (!hostlist[i] || !strlen(hostlist[i])) goto done; + result = snprintf(sinfo_cmdline, sinfo_cmdlineLen, "%s %s \"%s\" %s", + sinfo_path, sinfo_args, hostlist[i], sinfo_suffix); + if (result >= sinfo_cmdlineLen) { + sdprintf(1, "ERROR: Formatting error creating sinfo cmdline '%s' (%d)\n", + sinfo_cmdline, result); + goto done; + } + sdprintf(2, "Running sinfo to get host address: %s\n", sinfo_cmdline); + + f = popen(sinfo_cmdline, "r"); + if (!f) { + sdprintf(1, "ERROR: Could not run sinfo: %s\n", sinfo_cmdline); + goto done; + } + + len = 0; + result = getline(&(hostaddrlist[i]), &len, f); + pclose(f); + if (result == -1) { + int error = errno; + sdprintf(1, "ERROR: Resolving '%s' failed: %s\n", hostlist[i], strerror(error)); + (void) error; + goto done; + } + + s = hostaddrlist[i]; + hostnamelen = strlen(s); + for (j = 0; j < hostnamelen; j++) { + if (!((s[j] >= '0' && s[j] <= '9') || + (s[j] >= 'a' && s[j] <= 'z') || + (s[j] >= 'A' && s[j] <= 'Z') || + (s[j] == '-' || s[j] == '_' || s[j] == '.'))) { + s[j] = 
'\0'; + break; + } + } + sdprintf(3, "sinfo returned hostaddr %s for %s\n", s, hostlist[i]); + } + if (i != num_hosts) { + sdprintf(1, "ERROR: expected %d hosts from sinfo. Got %d\n", num_hosts, i); + goto done; + } + + ret = hostaddrlist; + done: + if (sinfo_cmdline) + free(sinfo_cmdline); + if (!ret && hostaddrlist) { + for (i = 0; i < num_hosts; i++) free(hostaddrlist[i]); + free(hostaddrlist); /* release the array allocated here; hostlist stays owned by the caller */ + } + return ret; +} + int isFEHost(char **hostlist, unsigned int num_hosts) { char host[256]; diff --git a/src/slurm_plugin/plugin_utils.h b/src/slurm_plugin/plugin_utils.h index d57f55da..eaa0da12 100644 --- a/src/slurm_plugin/plugin_utils.h +++ b/src/slurm_plugin/plugin_utils.h @@ -39,6 +39,7 @@ int decodeSpindleConfig(const char *encodedstr, int *spindle_argc, char ***spindle_argv); char **getHostsScontrol(unsigned int num_hosts, const char *hoststr); +char **getHostAddrSinfo(unsigned int num_hosts, char **hostlist); char **getHostsParse(unsigned int num_hosts, const char *shortlist); int isFEHost(char **hostlist, unsigned int num_hosts); diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index a747a91f..9f2f1a8f 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -426,7 +426,7 @@ static int get_spindle_args(spank_t spank, spindle_args_t *params) static int launch_spindle(spank_t spank, spindle_args_t *params) { - char **hostlist = NULL; + char **hostlist = NULL, **hostaddrlist = NULL; int result; int is_fe_host = 0; int is_be_leader = 0; @@ -460,7 +460,14 @@ static int launch_spindle(spank_t spank, spindle_args_t *params) } if (is_fe_host && is_be_leader) { +#if defined(SINFO_BIN) + hostaddrlist = getHostAddrSinfo(num_hosts, hostlist); + if (!hostaddrlist) + goto done; + result = launchFE(hostaddrlist, params); +#else result = launchFE(hostlist, params); +#endif if (result == -1) goto done; } @@ -471,6 +478,11 @@ static int 
launch_spindle(spank_t spank, spindle_args_t *params) if (hostlist) { for (i = 0; i < 
num_hosts; i++) free(hostlist[i]); free(hostlist); + } + if (hostaddrlist) { + for (i = 0; i < num_hosts; i++) free(hostaddrlist[i]); + free(hostaddrlist); + } return launch_result; } From 142235b8ac8497555a919d12b2c9992baa21dedd Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 20:18:05 +0100 Subject: [PATCH 14/15] Check for SPANK's spank_prepend_task_argv() in configure --- config.h.in | 4 ++++ configure | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++ configure.ac | 1 + 3 files changed, 63 insertions(+) diff --git a/config.h.in b/config.h.in index 6a90f059..9e6c30eb 100644 --- a/config.h.in +++ b/config.h.in @@ -18,6 +18,10 @@ /* Define if were using sockets for client/server communication */ #undef COMM_SOCKET +/* Define to 1 if you have the declaration of `spank_prepend_task_argv', and + to 0 if you don't. */ +#undef HAVE_DECL_SPANK_PREPEND_TASK_ARGV + /* Define to 1 if you have the <dlfcn.h> header file. */ #undef HAVE_DLFCN_H diff --git a/configure b/configure index 0366dcd2..4e49e6a2 100755 --- a/configure +++ b/configure @@ -2188,6 +2188,52 @@ fi eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno } # ac_fn_c_check_header_mongrel + +# ac_fn_c_check_decl LINENO SYMBOL VAR INCLUDES +# --------------------------------------------- +# Tests whether SYMBOL is declared in INCLUDES, setting cache variable VAR +# accordingly. +ac_fn_c_check_decl () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + as_decl_name=`echo $2|sed 's/ *(.*//'` + as_decl_use=`echo $2|sed -e 's/(/((/' -e 's/)/) 0&/' -e 's/,/) 0& (/g'` + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $as_decl_name is declared" >&5 +$as_echo_n "checking whether $as_decl_name is declared... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +$4 +int +main () +{ +#ifndef $as_decl_name +#ifdef __cplusplus + (void) $as_decl_use; +#else + (void) $as_decl_name; +#endif +#endif + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + eval "$3=yes" +else + eval "$3=no" +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} # ac_fn_c_check_decl cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. @@ -18375,6 +18421,18 @@ fi CPPFLAGS=$OCPPFLAGS + ac_fn_c_check_decl "$LINENO" "spank_prepend_task_argv" "ac_cv_have_decl_spank_prepend_task_argv" "#include <slurm/spank.h> +" +if test "x$ac_cv_have_decl_spank_prepend_task_argv" = xyes; then : + ac_have_decl=1 +else + ac_have_decl=0 +fi + +cat >>confdefs.h <<_ACEOF +#define HAVE_DECL_SPANK_PREPEND_TASK_ARGV $ac_have_decl +_ACEOF + OPATH=$PATH if test "x$SLURM_DIR" != "x"; then diff --git a/configure.ac b/configure.ac index 3371b0eb..bd7e9bc8 100644 --- a/configure.ac +++ b/configure.ac @@ -73,6 +73,7 @@ if test "x$ENABLE_SLURM_PLUGIN" == "xtrue"; then [], [AC_MSG_ERROR([Could not find slurm/spank.h])]) CPPFLAGS=$OCPPFLAGS + AC_CHECK_DECLS([spank_prepend_task_argv], [], [], [[#include <slurm/spank.h>]]) OPATH=$PATH if test "x$SLURM_DIR" != "x"; then From e8843e598ef9861ff297047b69b597a5424b662d Mon Sep 17 00:00:00 2001 From: Norbert Eicker Date: Mon, 19 Feb 2024 20:35:56 +0100 Subject: [PATCH 15/15] Use spank_prepend_task_argv() to tweak task's argv - Requires Slurm 23.11 or later or a backport of this function --- src/slurm_plugin/slurm_plugin.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index 9f2f1a8f..ec37f1cc 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ 
b/src/slurm_plugin/slurm_plugin.c @@ -29,6 
+29,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include "spindle_launch.h" #include "plugin_utils.h" +#include "config.h" SPINDLE_EXPORT extern const char plugin_name[]; SPINDLE_EXPORT extern const char plugin_type[]; @@ -578,9 +579,13 @@ static int launchBE(spank_t spank, spindle_args_t *params) static int prepApp(spank_t spank, spindle_args_t *params) { +#if HAVE_DECL_SPANK_PREPEND_TASK_ARGV == 1 + int result; +#else int app_argc, result; char **app_argv; char *app_exe_name, *last_slash; +#endif spank_err_t err; int bootstrap_argc; char **bootstrap_argv; @@ -591,6 +596,16 @@ static int prepApp(spank_t spank, spindle_args_t *params) return -1; } +#if HAVE_DECL_SPANK_PREPEND_TASK_ARGV == 1 + sdprintf(2, "Prepping task process %d to run spindle\n", getpid()); + + const char **filter_argv = (const char **)bootstrap_argv; + err = spank_prepend_task_argv(spank, bootstrap_argc, filter_argv); + if (err != ESPANK_SUCCESS) { + sdprintf(1, "WARNING: Could not prepend spindle filter.\n"); + result = -1; + } +#else sdprintf(2, "Prepping app process %d to run spindle\n", getpid()); err = spank_get_item(spank, S_JOB_ARGV, &app_argc, &app_argv); @@ -605,6 +620,7 @@ static int prepApp(spank_t spank, spindle_args_t *params) } result = spindleHookSpindleArgsIntoExecBE(bootstrap_argc, bootstrap_argv, app_exe_name); +#endif if (result == -1) { sdprintf(1, "ERROR setting up app to run spindle. Spindle won't work\n"); return -1;