From 7db13dbfc193841aec5389e90c26ba4e38ae5090 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:50:08 -0700 Subject: [PATCH 01/19] Cachepath: parse_loc utilities. --- src/client/beboot/parseloc.h | 7 ++ src/utils/parseloc.c | 149 ++++++++++++++++++++++++++++------- 2 files changed, 129 insertions(+), 27 deletions(-) diff --git a/src/client/beboot/parseloc.h b/src/client/beboot/parseloc.h index c5362e2e..1731906a 100644 --- a/src/client/beboot/parseloc.h +++ b/src/client/beboot/parseloc.h @@ -24,6 +24,13 @@ extern "C" { #include "spindle_launch.h" char *parse_location(char *loc, number_t number); +char *parse_location_noerr(char *loc, number_t number); +char *realize(char *path); +char **parse_colonsep_prefixes(char *colonsep_list, number_t number); +int is_local_prefix(const char *path, char **local_prefixes); +static int validateCandidatePath( char *candidatePath, char **realizedPath, char **parsedPath, char **symbolicPath, number_t number ); +void determineValidCachePaths( uint64_t *validBitIdx, char *origPathList, number_t number ); +void getValidCachePathByIndex( uint64_t validBitIdx, char **realizedCachePath, char **parsedCachePath, char **symbolicCachePath ); #if defined(__cplusplus) } diff --git a/src/utils/parseloc.c b/src/utils/parseloc.c index 6caea8f5..28c446dd 100644 --- a/src/utils/parseloc.c +++ b/src/utils/parseloc.c @@ -22,6 +22,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include #include #include +#include #if !defined(USE_PLUGIN_DEBUG) #include "spindle_debug.h" @@ -34,13 +35,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include "ccwarns.h" #include "spindle_launch.h" -#if defined(__cplusplus) -extern "C" { -#endif - char *parse_location(char *loc, number_t number); -#if defined(__cplusplus) -} -#endif +extern int spindle_mkdir(char *orig_path); #if defined(CUSTOM_GETENV) extern char *custom_getenv(); @@ -166,38 +161,64 @@ char *parse_location_noerr(char *loc, number_t number) **/ char *realize(char *path) { + int local_errno; char *result; - char *origpath, *cur_slash = NULL, *trailing; - struct stat buf; + char *origpath, *cur_slash = NULL, *prev_slash = NULL; + struct stat *buf = calloc( 1, sizeof( struct stat ) ); char newpath[MAX_PATH_LEN+1]; int lastpos; newpath[MAX_PATH_LEN] = '\0'; origpath = strdup(path); - for (;;) { - if (stat(origpath, &buf) != -1) - break; - if (cur_slash) - *cur_slash = '/'; + errno=0; + while( stat( origpath, buf ) == -1 ){ + local_errno = errno; + debug_printf("Failed to stat '%s' (%s).\n", origpath, strerror(local_errno)); + prev_slash = cur_slash; cur_slash = strrchr(origpath, '/'); - if (!cur_slash) - break; - *cur_slash = '\0'; + if( prev_slash ) + *prev_slash = '/'; + if( cur_slash ) + *cur_slash = '\0'; + else{ + debug_printf("Nothing in the original path can be stat'ed. (%s)\n", path); + return NULL; + } } - if (cur_slash) - trailing = cur_slash + 1; - else - trailing = ""; + errno = 0; result = realpath(origpath, newpath); if (!result) { + local_errno = errno; + err_printf( + "Error: realpath(3) failed to create canonical version of '%s' (%s). Returning '%s'.\n", + origpath, strerror(local_errno), path ); + errno = 0; + int rc = stat( origpath, buf ); + local_errno = errno; + err_printf( + " Statting that path results in rc=%d, errno=%d, error='%s'.\n", + rc, local_errno, strerror(local_errno)); free(origpath); - return path; + return NULL; } + free(buf); - strncat(newpath, "/", MAX_PATH_LEN); - strncat(newpath, trailing, MAX_PATH_LEN); - newpath[MAX_PATH_LEN] = '\0'; + if( cur_slash ){ + if( strlen( newpath ) + strlen( cur_slash+1 ) > MAX_PATH_LEN ){ + err_printf( + "Error: The realized path exceeds MAX_PATH_LEN (%d).\n" + " Original path: '%s'\n" + " Statable part: '%s'\n" + " Canonical version: '%s'\n" + " Returning original path.\n", + MAX_PATH_LEN, path, origpath, newpath); + free(origpath); + return path; + } + strncat(newpath, "/", 2); + strncat(newpath, cur_slash+1, MAX_PATH_LEN - strlen( newpath )); + } free(origpath); lastpos = strlen(newpath)-1; @@ -223,13 +244,13 @@ char **parse_colonsep_prefixes(char *colonsep_list, number_t number) prefixes[0] = NULL; return prefixes; } - int numprefixes = 1; + size_t numprefixes = 1; for (size_t i = 0; s[i] != '\0'; i++) { if (s[i] == ':') { numprefixes++; } } - int num_strs = numprefixes + 1; + size_t num_strs = numprefixes + 1; prefixes = (char **) malloc(sizeof(char*) * num_strs); size_t i = 0, cur = 0; @@ -278,3 +299,77 @@ int is_local_prefix(const char *path, char **local_prefixes) { return 0; } +/* validateCandidatePath determines if candidatePath passes parse_location(), realize(), and spindle_mkdir(), which is to say, can + * spindle create a directory from this path? + * + * If not NULL, then realizedPath, parsedPath, and/or symbolicPath will hold the respective intermediate/final results. + * + * Return 1 if the candidatePath is valid, otherwise 0. + */ +static int validateCandidatePath( char *candidatePath, char **realizedPath, char **parsedPath, char **symbolicPath, number_t number ){ + int rc; + char *parsedCandidatePath, *realizedCandidatePath; + parsedCandidatePath = parse_location( candidatePath, number ); + if( parsedCandidatePath ){ + realizedCandidatePath = realize( parsedCandidatePath ); + if( realizedCandidatePath ){ + rc = spindle_mkdir( parsedCandidatePath ); + if( 0 == rc ){ + if( symbolicPath) *symbolicPath = candidatePath; + if( parsedPath ) *parsedPath = parsedCandidatePath; + if( realizedPath) *realizedPath = realizedCandidatePath; + return 1; + }else{ + debug_printf2("Unable to create directory %s, moving on to the next candidate.\n", realizedCandidatePath ); + } + }else{ + debug_printf2( "Unable to realize candidate %s, moving on to the next candidate.\n", parsedCandidatePath ); + } + }else{ + debug_printf2("Unable to parse candidate %s, moving on to the next candidate.\n", candidatePath ); + } + return 0; +} + +/** + * determineValidCachePaths() works exclusively with the cachepaths parameter. Because not all paths may be valid on all + * compute nodes, and because we want to have all nodes reach a consensus on which cache path to use, we + * determine the validity of all paths in the origPathList, save the intermediate results, and return a bit + * index to the user. Via allReduce() all nodes reach a consensus on the set of valid paths, and retrieves + * that informatino via getValidPathByIndex(). + */ +static char *realizedCachePaths[64], *parsedCachePaths[64], *symbolicCachePaths[64]; + +void determineValidCachePaths( uint64_t *validBitIdx, char *origPathList, number_t number ){ + + char *saveptr, *candidatePath, *pathList = strdup( origPathList ); + uint64_t bitoffset = 0; + + *validBitIdx = 0; + debug_printf2("origPathList='%s', number='%lu'.\n", origPathList, number ); + + candidatePath = strtok_r( pathList, ":", &saveptr ); + while( NULL != candidatePath && bitoffset < 64 ){ + *validBitIdx |= validateCandidatePath( + candidatePath, + &realizedCachePaths[bitoffset], + &parsedCachePaths[bitoffset], + &symbolicCachePaths[bitoffset], number ) << bitoffset; + bitoffset++; + candidatePath = strtok_r( NULL, ":", &saveptr ); + } + free( pathList ); +} + +void getValidCachePathByIndex( uint64_t validBitIdx, char **realizedCachePath, char **parsedCachePath, char **symbolicCachePath ){ + uint64_t bitoffset = 0; + if (!validBitIdx){ + return; + } + while( (bitoffset < 64) && (((1 << bitoffset) & validBitIdx) == 0) ){ + bitoffset++; + } + if( realizedCachePath ) *realizedCachePath = realizedCachePaths[bitoffset]; + if( parsedCachePath ) *parsedCachePath = parsedCachePaths[bitoffset]; + if( symbolicCachePath ) *symbolicCachePath = symbolicCachePaths[bitoffset]; +} From df0647e782673b94b895a932324dcd8967d7adae Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:50:45 -0700 Subject: [PATCH 02/19] Cachepath: remove/rename [orig_]location. --- src/client/beboot/spindle_bootstrap.c | 2 +- src/client/client/client.c | 15 +++++++-------- src/client/client/intercept.h | 1 + src/client/client/intercept_exec.c | 5 +---- src/client/client/intercept_readlink.c | 16 +++++++++++----- src/client/client/should_intercept.c | 25 +++++++++++++++---------- src/client/client/should_intercept.h | 1 + 7 files changed, 37 insertions(+), 28 deletions(-) diff --git a/src/client/beboot/spindle_bootstrap.c b/src/client/beboot/spindle_bootstrap.c index 279ef9a7..29d4faf6 100644 --- a/src/client/beboot/spindle_bootstrap.c +++ b/src/client/beboot/spindle_bootstrap.c @@ -114,7 +114,6 @@ static void setup_environment() setenv("LD_AUDIT", client_lib, 1); setenv("LDCS_LOCATION", location, 1); - setenv("LDCS_ORIG_LOCATION", orig_location, 1); setenv("LDCS_NUMBER", number_s, 1); setenv("LDCS_RANKINFO", rankinfo_str, 1); if (connection_str) @@ -161,6 +160,7 @@ static int parse_cmdline(int argc, char *argv[]) } symbolic_location = argv[i++]; + i++; // Skip over candidate_cachepaths. number_s = argv[i++]; number = (number_t) strtoul(number_s, NULL, 0); opts_s = argv[i++]; diff --git a/src/client/client/client.c b/src/client/client/client.c index f9c6f578..199066ff 100644 --- a/src/client/client/client.c +++ b/src/client/client/client.c @@ -40,6 +40,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include "ccwarns.h" #include "exec_util.h" #include "intercept.h" +#include "should_intercept.h" errno_location_t app_errno_location; @@ -69,11 +70,8 @@ static const ElfW(Phdr) *libc_phdrs, *interp_phdrs; static int num_libc_phdrs, num_interp_phdrs; ElfW(Addr) libc_loadoffset, interp_loadoffset; -/* location has the realize'd path to the local file cache. orig_location is not realized and - * may contain symlinks - */ -char *location; -char *orig_location; +static char *location; +static char *chosen_realized_cachepath, *chosen_parsed_cachepath, *chosen_symbolic_cachepath; number_t number; static int have_stat_patches; @@ -198,7 +196,6 @@ static int init_server_connection() return 0; location = getenv("LDCS_LOCATION"); - orig_location = getenv("LDCS_ORIG_LOCATION"); number = (number_t) strtoul(getenv("LDCS_NUMBER"), NULL, 0); connection = getenv("LDCS_CONNECTION"); rankinfo_s = getenv("LDCS_RANKINFO"); @@ -218,7 +215,6 @@ static int init_server_connection() debug_printf("Disabling environment variables because we're not following forks\n"); unsetenv("LD_AUDIT"); unsetenv("LDCS_LOCATION"); - unsetenv("LDCS_ORIG_LOCATION"); unsetenv("LDCS_NUMBER"); unsetenv("LDCS_CONNECTION"); unsetenv("LDCS_RANKINFO"); @@ -262,6 +258,9 @@ static int init_server_connection() send_cpu(ldcsid, get_cur_cpu()); #endif } + send_cachepath_query( ldcsid, &chosen_realized_cachepath, &chosen_parsed_cachepath, &chosen_symbolic_cachepath ); + set_should_intercept_cachepath( chosen_realized_cachepath, chosen_parsed_cachepath, chosen_symbolic_cachepath ); + set_intercept_readlink_cachepath( chosen_realized_cachepath, chosen_parsed_cachepath, chosen_symbolic_cachepath ); snprintf(debugging_name, 32, "Client.%d", rankinfo[0]); LOGGING_INIT(debugging_name); @@ -467,7 +466,7 @@ char *client_library_load(const char *name) char *orig_file_name = (char *) name; if (is_in_spindle_cache(name)) { - debug_printf2("Library %s is in spindle cache (%s). Translating request\n", name, location); + debug_printf2("Library %s is in spindle cache (%s). Translating request\n", name, chosen_realized_cachepath); memset(fixed_name, 0, MAX_PATH_LEN+1); send_orig_path_request(ldcsid, orig_file_name, fixed_name); orig_file_name = fixed_name; diff --git a/src/client/client/intercept.h b/src/client/client/intercept.h index 4ace2328..aae968f7 100644 --- a/src/client/client/intercept.h +++ b/src/client/client/intercept.h @@ -89,6 +89,7 @@ int execvpe_wrapper(const char *path, char *const argv[], const char *envp[]); pid_t vfork_wrapper(); char *dlerror_wrapper(); +void set_intercept_readlink_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ); ssize_t readlink_wrapper(const char *path, char *buf, size_t bufsiz); ssize_t readlinkat_wrapper(int dirfd, const char *pathname, char *buf, size_t bufsiz); diff --git a/src/client/client/intercept_exec.c b/src/client/client/intercept_exec.c index 14b555ed..8480b75e 100644 --- a/src/client/client/intercept_exec.c +++ b/src/client/client/intercept_exec.c @@ -142,7 +142,6 @@ static char **removeEnvironmentStrs(char **envp) if (strIsPrefix("LD", envp[i])) { if (strIsPrefix("LD_AUDIT=", envp[i]) || strIsPrefix("LDCS_LOCATION=", envp[i]) || - strIsPrefix("LDCS_ORIG_LOCATION=", envp[i]) || strIsPrefix("LDCS_CONNECTION=", envp[i]) || strIsPrefix("LDCS_RANKINFO=", envp[i]) || strIsPrefix("LDCS_OPTIONS=", envp[i]) || @@ -177,7 +176,6 @@ static char **updateEnvironment(char **envp, int *num_modified, int propogate_sp unsetf("SPINDLE"); unsetf("LD_AUDIT"); unsetf("LDCS_LOCATION"); - unsetf("LDCS_ORIG_LOCATION"); unsetf("LDCS_CONNECTION"); unsetf("LDCS_RANKINFO"); unsetf("LDCS_OPTIONS"); @@ -198,13 +196,12 @@ static char **updateEnvironment(char **envp, int *num_modified, int propogate_sp if (envp) { debug_printf2("Propogating spindle environment by copying it to new envp list\n"); for (cur = (char **) envp; *cur; cur++, orig_size++); - new_size = orig_size + 10; + new_size = orig_size + 20; newenv = (char **) malloc(new_size * sizeof(char*)); propogateEnvironmentStr(envp, newenv, &pos, "SPINDLE"); propogateEnvironmentStr(envp, newenv, &pos, "LD_AUDIT"); propogateEnvironmentStr(envp, newenv, &pos, "LDCS_LOCATION"); - propogateEnvironmentStr(envp, newenv, &pos, "LDCS_ORIG_LOCATION"); propogateEnvironmentStr(envp, newenv, &pos, "LDCS_CONNECTION"); propogateEnvironmentStr(envp, newenv, &pos, "LDCS_RANKINFO"); propogateEnvironmentStr(envp, newenv, &pos, "LDCS_OPTIONS"); diff --git a/src/client/client/intercept_readlink.c b/src/client/client/intercept_readlink.c index b44a10f2..8d0dea8e 100644 --- a/src/client/client/intercept_readlink.c +++ b/src/client/client/intercept_readlink.c @@ -31,19 +31,25 @@ Place, Suite 330, Boston, MA 02111-1307 USA ssize_t (*orig_readlink)(const char *path, char *buf, size_t bufsiz); ssize_t (*orig_readlinkat)(int dirfd, const char *pathname, char *buf, size_t bufsiz); -extern char *location; +static char *cachepath; + +void set_intercept_readlink_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ){ + cachepath = chosen_realized_cachepath; + chosen_parsed_cachepath = chosen_parsed_cachepath; + chosen_symbolic_cachepath = chosen_symbolic_cachepath; +} static int fix_local_readlink(char *buf, size_t bufsiz) { char spindle_id[32]; - int location_len, result; + int cachepath_len, result; char tmp[MAX_PATH_LEN+1]; - location_len = strlen(location); + cachepath_len = strlen(cachepath); snprintf(spindle_id, sizeof(spindle_id), "spindle.%lx", number); - if (strstr(buf, spindle_id) && strncmp(location, buf, location_len) == 0) { + if (strstr(buf, spindle_id) && strncmp(cachepath, buf, cachepath_len) == 0) { debug_printf2("readlink received spindle cache path %s. Translating\n", buf); - result = send_orig_path_request(ldcsid, buf+location_len+1, tmp); + result = send_orig_path_request(ldcsid, buf+cachepath_len+1, tmp); if (result == -1) return -1; debug_printf2("readlink translated spindle local path %s to %s\n", buf, tmp); diff --git a/src/client/client/should_intercept.c b/src/client/client/should_intercept.c index be0d7bb6..b3a4376e 100644 --- a/src/client/client/should_intercept.c +++ b/src/client/client/should_intercept.c @@ -29,22 +29,27 @@ #include "spindle_debug.h" extern int relocate_spindleapi(); +static char *cachepath, *orig_cachepath; + +void set_should_intercept_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ){ + cachepath = chosen_realized_cachepath; + orig_cachepath = chosen_parsed_cachepath; + chosen_symbolic_cachepath = chosen_symbolic_cachepath; +} -extern char *location; -extern char *orig_location; int is_in_spindle_cache(const char *pathname) { - static int location_size = 0; - static int orig_location_size = 0; - if (!location_size) { - location_size = strlen(location); + static int cachepath_size = 0; + static int orig_cachepath_size = 0; + if (!cachepath_size) { + cachepath_size = strlen(cachepath); } - if (!orig_location_size) { - orig_location_size = strlen(orig_location); + if (!orig_cachepath_size) { + orig_cachepath_size = strlen(orig_cachepath); } - return ((strncmp(pathname, location, location_size) == 0) || - (strncmp(pathname, orig_location, orig_location_size) == 0)); + return ((strncmp(pathname, cachepath, cachepath_size) == 0) || + (strncmp(pathname, orig_cachepath, orig_cachepath_size) == 0)); } extern int is_local_prefix(const char *path, char **cached_local_prefixes); diff --git a/src/client/client/should_intercept.h b/src/client/client/should_intercept.h index f6a9b510..6a545913 100644 --- a/src/client/client/should_intercept.h +++ b/src/client/client/should_intercept.h @@ -27,6 +27,7 @@ #define EXCL_OPEN 2 #define ERR_CALL 3 +void set_should_intercept_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ); int open_filter(const char *fname, int flags); int fopen_filter(const char *fname, const char *flags); int exec_filter(const char *fname); From 0acada159fa285bb83b64c305a5b55b09c93f9b3 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:51:51 -0700 Subject: [PATCH 03/19] Cachepath: Configure-time support. --- config.h.in | 3 +++ configure | 16 ++++++++++++++++ configure.common.ac | 5 +++++ src/client/config.h.in | 3 +++ src/client/configure | 16 ++++++++++++++++ src/fe/config.h.in | 3 +++ src/fe/configure | 16 ++++++++++++++++ src/server/config.h.in | 3 +++ src/server/configure | 16 ++++++++++++++++ 9 files changed, 81 insertions(+) diff --git a/config.h.in b/config.h.in index 6658b1bb..a52582dc 100644 --- a/config.h.in +++ b/config.h.in @@ -6,6 +6,9 @@ /* Whether we are using a broken srun */ #undef BROKEN_SRUN +/* Colon-separated list of potential back-end cache directories */ +#undef CACHEPATHS + /* Define if were using biter for client/server communication */ #undef COMM_BITER diff --git a/configure b/configure index f9984eca..b71d919d 100755 --- a/configure +++ b/configure @@ -846,6 +846,7 @@ enable_libtool_lock enable_maintainer_mode with_default_port with_default_num_ports +with_cachepaths with_localstorage with_default_local_prefix with_testrm @@ -1589,6 +1590,8 @@ Optional Packages: --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server communication + --with-cachepaths=DIR Colon-separated list of potential back-end cache + directories --with-localstorage=DIR Directory on back-ends for storing relocated files --with-default-local-prefix=DIRS Colon-seperated list of directories that Spindle @@ -16615,6 +16618,14 @@ else fi +# Check whether --with-cachepaths was given. +if test "${with_cachepaths+set}" = set; then : + withval=$with_cachepaths; CACHEPATHS=${withval} +else + CACHEPATHS=$DEFAULT_LOC +fi + + # Check whether --with-localstorage was given. if test "${with_localstorage+set}" = set; then : withval=$with_localstorage; SPINDLE_LOC=${withval} @@ -16651,6 +16662,11 @@ cat >>confdefs.h <<_ACEOF _ACEOF +cat >>confdefs.h <<_ACEOF +#define CACHEPATHS "$CACHEPATHS" +_ACEOF + + cat >>confdefs.h <<_ACEOF #define SPINDLE_LOCAL_PREFIX "$SPINDLE_LOCAL_PREFIX" _ACEOF diff --git a/configure.common.ac b/configure.common.ac index ff261864..1fb1d0c8 100644 --- a/configure.common.ac +++ b/configure.common.ac @@ -17,6 +17,10 @@ AC_ARG_WITH(default-num-ports, [AS_HELP_STRING([--with-default-numports=NUM],[Number of TCP/IP ports to scan for Spindle server communication])], [NUM_COBO_PORTS=${withval}], [NUM_COBO_PORTS=$DEFAULT_NUM_COBO_PORTS]) +AC_ARG_WITH(cachepaths, + [AS_HELP_STRING([--with-cachepaths=DIR],[Colon-separated list of potential back-end cache directories])], + [CACHEPATHS=${withval}], + [CACHEPATHS=$DEFAULT_LOC]) AC_ARG_WITH(localstorage, [AS_HELP_STRING([--with-localstorage=DIR],[Directory on back-ends for storing relocated files])], [SPINDLE_LOC=${withval}], @@ -29,6 +33,7 @@ AC_DEFINE_UNQUOTED([SPINDLE_PORT],[$SPINDLE_PORT],[The default port for Spindle] AC_DEFINE_UNQUOTED([NUM_COBO_PORTS],[$NUM_COBO_PORTS],[Number of ports for COBO to search for an open port]) AC_DEFINE_UNQUOTED([SPINDLE_MAX_PORT],[$(($SPINDLE_PORT + $NUM_COBO_PORTS - 1))],[The maximum port value]) AC_DEFINE_UNQUOTED([SPINDLE_LOC],"[$SPINDLE_LOC]",[The default local directory for Spindle]) +AC_DEFINE_UNQUOTED([CACHEPATHS],"[$CACHEPATHS]",[Colon-separated list of potential back-end cache directories]) AC_DEFINE_UNQUOTED([SPINDLE_LOCAL_PREFIX],"[$SPINDLE_LOCAL_PREFIX]",[The default colon-separated list of directories that Spindle will not cache files out of]) TESTRM=unknown diff --git a/src/client/config.h.in b/src/client/config.h.in index c352290c..713e2535 100644 --- a/src/client/config.h.in +++ b/src/client/config.h.in @@ -6,6 +6,9 @@ /* Whether we are using a broken srun */ #undef BROKEN_SRUN +/* Colon-separated list of potential back-end cache directories */ +#undef CACHEPATHS + /* Define if were using biter for client/server communication */ #undef COMM_BITER diff --git a/src/client/configure b/src/client/configure index 12d8c016..892bf86f 100755 --- a/src/client/configure +++ b/src/client/configure @@ -810,6 +810,7 @@ enable_libtool_lock enable_maintainer_mode with_default_port with_default_num_ports +with_cachepaths with_localstorage with_default_local_prefix with_testrm @@ -1532,6 +1533,8 @@ Optional Packages: --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server communication + --with-cachepaths=DIR Colon-separated list of potential back-end cache + directories --with-localstorage=DIR Directory on back-ends for storing relocated files --with-default-local-prefix=DIRS Colon-seperated list of directories that Spindle @@ -12587,6 +12590,14 @@ else fi +# Check whether --with-cachepaths was given. +if test "${with_cachepaths+set}" = set; then : + withval=$with_cachepaths; CACHEPATHS=${withval} +else + CACHEPATHS=$DEFAULT_LOC +fi + + # Check whether --with-localstorage was given. if test "${with_localstorage+set}" = set; then : withval=$with_localstorage; SPINDLE_LOC=${withval} @@ -12623,6 +12634,11 @@ cat >>confdefs.h <<_ACEOF _ACEOF +cat >>confdefs.h <<_ACEOF +#define CACHEPATHS "$CACHEPATHS" +_ACEOF + + cat >>confdefs.h <<_ACEOF #define SPINDLE_LOCAL_PREFIX "$SPINDLE_LOCAL_PREFIX" _ACEOF diff --git a/src/fe/config.h.in b/src/fe/config.h.in index ab026071..2c46ba96 100644 --- a/src/fe/config.h.in +++ b/src/fe/config.h.in @@ -6,6 +6,9 @@ /* Whether we are using a broken srun */ #undef BROKEN_SRUN +/* Colon-separated list of potential back-end cache directories */ +#undef CACHEPATHS + /* Define if were using biter for client/server communication */ #undef COMM_BITER diff --git a/src/fe/configure b/src/fe/configure index afbec954..f82d36e0 100755 --- a/src/fe/configure +++ b/src/fe/configure @@ -831,6 +831,7 @@ enable_libtool_lock enable_maintainer_mode with_default_port with_default_num_ports +with_cachepaths with_localstorage with_default_local_prefix with_testrm @@ -1570,6 +1571,8 @@ Optional Packages: --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server communication + --with-cachepaths=DIR Colon-separated list of potential back-end cache + directories --with-localstorage=DIR Directory on back-ends for storing relocated files --with-default-local-prefix=DIRS Colon-seperated list of directories that Spindle @@ -16437,6 +16440,14 @@ else fi +# Check whether --with-cachepaths was given. +if test "${with_cachepaths+set}" = set; then : + withval=$with_cachepaths; CACHEPATHS=${withval} +else + CACHEPATHS=$DEFAULT_LOC +fi + + # Check whether --with-localstorage was given. if test "${with_localstorage+set}" = set; then : withval=$with_localstorage; SPINDLE_LOC=${withval} @@ -16473,6 +16484,11 @@ cat >>confdefs.h <<_ACEOF _ACEOF +cat >>confdefs.h <<_ACEOF +#define CACHEPATHS "$CACHEPATHS" +_ACEOF + + cat >>confdefs.h <<_ACEOF #define SPINDLE_LOCAL_PREFIX "$SPINDLE_LOCAL_PREFIX" _ACEOF diff --git a/src/server/config.h.in b/src/server/config.h.in index 034f34ba..98a081dc 100644 --- a/src/server/config.h.in +++ b/src/server/config.h.in @@ -6,6 +6,9 @@ /* Whether we are using a broken srun */ #undef BROKEN_SRUN +/* Colon-separated list of potential back-end cache directories */ +#undef CACHEPATHS + /* Define if were using biter for client/server communication */ #undef COMM_BITER diff --git a/src/server/configure b/src/server/configure index 7de2107d..3cd471a9 100755 --- a/src/server/configure +++ b/src/server/configure @@ -837,6 +837,7 @@ enable_libtool_lock enable_maintainer_mode with_default_port with_default_num_ports +with_cachepaths with_localstorage with_default_local_prefix with_testrm @@ -1567,6 +1568,8 @@ Optional Packages: --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server communication + --with-cachepaths=DIR Colon-separated list of potential back-end cache + directories --with-localstorage=DIR Directory on back-ends for storing relocated files --with-default-local-prefix=DIRS Colon-seperated list of directories that Spindle @@ -16434,6 +16437,14 @@ else fi +# Check whether --with-cachepaths was given. +if test "${with_cachepaths+set}" = set; then : + withval=$with_cachepaths; CACHEPATHS=${withval} +else + CACHEPATHS=$DEFAULT_LOC +fi + + # Check whether --with-localstorage was given. if test "${with_localstorage+set}" = set; then : withval=$with_localstorage; SPINDLE_LOC=${withval} @@ -16470,6 +16481,11 @@ cat >>confdefs.h <<_ACEOF _ACEOF +cat >>confdefs.h <<_ACEOF +#define CACHEPATHS "$CACHEPATHS" +_ACEOF + + cat >>confdefs.h <<_ACEOF #define SPINDLE_LOCAL_PREFIX "$SPINDLE_LOCAL_PREFIX" _ACEOF From a0ee62fc76dffb974326833c1934d33c316a0360 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:52:29 -0700 Subject: [PATCH 04/19] Cachepath: Internal messaging for path resolution --- src/client/client_comlib/client_api.c | 53 ++++++++++++ src/client/client_comlib/client_api.h | 1 + src/fe/startup/spindle_fe.cc | 18 +++++ src/include/ldcs_api.h | 3 + .../auditserver/ldcs_audit_server_handlers.c | 80 +++++++++++++++++++ .../auditserver/ldcs_audit_server_md_cobo.c | 6 ++ src/server/comlib/ldcs_api_util.c | 3 + 7 files changed, 164 insertions(+) diff --git a/src/client/client_comlib/client_api.c b/src/client/client_comlib/client_api.c index 4999a4de..509326c2 100644 --- a/src/client/client_comlib/client_api.c +++ b/src/client/client_comlib/client_api.c @@ -37,6 +37,59 @@ static struct lock_t comm_lock; #define COMM_LOCK do { if (lock(&comm_lock) == -1) return -1; } while (0) #define COMM_UNLOCK unlock(&comm_lock) + +int send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chosen_parsed_cachepath, char **chosen_symbolic_cachepath ){ + ldcs_message_t message; + char buffer[MAX_PATH_LEN+1]; + buffer[MAX_PATH_LEN] = '\0'; + + message.header.type = LDCS_MSG_CHOSEN_CACHEPATH_REQUEST; + message.header.len = MAX_PATH_LEN; + message.data = buffer; + + COMM_LOCK; + + debug_printf3("sending message of type: request_location_path.\n" ); + client_send_msg(fd, &message); + client_recv_msg_static(fd, &message, LDCS_READ_BLOCK); + + COMM_UNLOCK; + + if (message.header.type != LDCS_MSG_CHOSEN_CACHEPATH || message.header.len > MAX_PATH_LEN) { + err_printf("Got unexpected message of type %d\n", (int) message.header.type); + assert(0); + } + if( chosen_realized_cachepath ){ + *chosen_realized_cachepath = strdup( buffer ); + } + + COMM_LOCK; + client_recv_msg_static(fd, &message, LDCS_READ_BLOCK); + COMM_UNLOCK; + + if (message.header.type != LDCS_MSG_CHOSEN_CACHEPATH || message.header.len > MAX_PATH_LEN) { + err_printf("Got unexpected message of type %d\n", (int) message.header.type); + assert(0); + } + if( chosen_parsed_cachepath ){ + *chosen_parsed_cachepath = strdup( buffer ); + } + + COMM_LOCK; + client_recv_msg_static(fd, &message, LDCS_READ_BLOCK); + COMM_UNLOCK; + + if (message.header.type != LDCS_MSG_CHOSEN_CACHEPATH || message.header.len > MAX_PATH_LEN) { + err_printf("Got unexpected message of type %d\n", (int) message.header.type); + assert(0); + } + if( chosen_symbolic_cachepath ){ + *chosen_symbolic_cachepath = strdup( buffer ); + } + + return 0; +} + int send_file_query(int fd, char* path, int dso, char** newpath, int *errcode) { ldcs_message_t message; char buffer[MAX_PATH_LEN+1+sizeof(int)]; diff --git a/src/client/client_comlib/client_api.h b/src/client/client_comlib/client_api.h index 74f82346..982c4b1c 100644 --- a/src/client/client_comlib/client_api.h +++ b/src/client/client_comlib/client_api.h @@ -42,6 +42,7 @@ int send_orig_path_request(int fd, const char *path, char *newpath); int send_dirlists_request(int fd, char **local_result, char **exece_result, char **to_free); int send_procmaps_query(int fd, int pid, char *result); int send_pickone_query(int fd, char *key, int *result); +int send_cachepath_query( int fd, char **chosen_symbolic_cachepath, char **chosen_parsed_cachepath, char **chosen_realized_cachepath ); int get_python_prefix(int fd, char **prefix); diff --git a/src/fe/startup/spindle_fe.cc b/src/fe/startup/spindle_fe.cc index 31b29cf7..43e8c6ba 100644 --- a/src/fe/startup/spindle_fe.cc +++ b/src/fe/startup/spindle_fe.cc @@ -41,6 +41,7 @@ static const char *logging_file = NULL; #endif static const char spindle_bootstrap[] = LIBEXECDIR "/spindle_bootstrap"; static bool sendAndWaitForAlive(); +static void determineCachepathConsensus(); #define STARTUP_TIMEOUT 60 @@ -71,6 +72,7 @@ static int pack_data(spindle_args_t *args, void* &buffer, unsigned &buffer_size) buffer_size += sizeof(opt_t); buffer_size += sizeof(unique_id_t); buffer_size += args->location ? strlen(args->location) + 1 : 1; + buffer_size += args->candidate_cachepaths ? strlen(args->candidate_cachepaths) + 1 : 1; buffer_size += args->pythonprefix ? strlen(args->pythonprefix) + 1 : 1; buffer_size += args->preloadfile ? strlen(args->preloadfile) + 1 : 1; buffer_size += args->numa_files ? strlen(args->numa_files) + 1 : 1; @@ -91,6 +93,7 @@ static int pack_data(spindle_args_t *args, void* &buffer, unsigned &buffer_size) pack_param(args->startup_type, buf, pos); pack_param(args->shm_cache_size, buf, pos); pack_param(args->location, buf, pos); + pack_param(args->candidate_cachepaths, buf, pos); pack_param(args->pythonprefix, buf, pos); pack_param(args->preloadfile, buf, pos); pack_param(args->bundle_timeout_ms, buf, pos); @@ -230,6 +233,7 @@ int getApplicationArgsFE(spindle_args_t *params, int *spindle_argc, char ***spin (*spindle_argv)[n++] = strdup(uniqueid_s); } (*spindle_argv)[n++] = strdup(params->location); + (*spindle_argv)[n++] = strdup(params->candidate_cachepaths); (*spindle_argv)[n++] = strdup(number_s); (*spindle_argv)[n++] = strdup(opt_s); (*spindle_argv)[n++] = strdup(cachesize_s); @@ -395,9 +399,11 @@ int spindleInitFE(const char **hosts, spindle_args_t *params) /* Start FE server */ debug_printf("spindle_args_t { number = %lu; port = %u; num_ports = %u; opts = %lu; unique_id = %lu; " "use_launcher = %u; startup_type = %u; shm_cache_size = %u; location = %s; " + "cachepaths = %s; " "pythonprefix = %s; preloadfile = %s; bundle_timeout_ms = %u; bundle_cachesize_kb = %u }\n", (unsigned long) params->number, params->port, params->num_ports, params->opts, params->unique_id, params->use_launcher, params->startup_type, params->shm_cache_size, params->location, + params->candidate_cachepaths, params->pythonprefix, params->preloadfile, params->bundle_timeout_ms, params->bundle_cachesize_kb); printSpindleFlags(params->opts); @@ -427,6 +433,7 @@ int spindleInitFE(const char **hosts, spindle_args_t *params) /* Wait for servers to indicate startup */ sendAndWaitForAlive(); + determineCachepathConsensus(); return 0; } @@ -483,6 +490,17 @@ void markRSHPidReapedFE() clear_fe_rsh_pid(); } +static void determineCachepathConsensus( void ){ + ldcs_message_t consensus_req_msg; + consensus_req_msg.header.type = LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS; + consensus_req_msg.header.len = 0; + consensus_req_msg.data = NULL; + int result = ldcs_audit_server_fe_broadcast(&consensus_req_msg, NULL); + if (result == -1) { + debug_printf("Failure sending cachepath consensus message\n"); + } +} + static bool sendAndWaitForAlive() { int result; diff --git a/src/include/ldcs_api.h b/src/include/ldcs_api.h index e8ffa43d..e6ccbafb 100644 --- a/src/include/ldcs_api.h +++ b/src/include/ldcs_api.h @@ -85,6 +85,9 @@ typedef enum { LDCS_MSG_PICKONE_RESP, LDCS_MSG_ALIVE_REQ, LDCS_MSG_ALIVE_RESP, + LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS, + LDCS_MSG_CHOSEN_CACHEPATH_REQUEST, + LDCS_MSG_CHOSEN_CACHEPATH, LDCS_MSG_UNKNOWN } ldcs_message_ids_t; diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c b/src/server/auditserver/ldcs_audit_server_handlers.c index 8a42f40c..eb1f29ca 100644 --- a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -178,7 +178,10 @@ static int handle_setup_alias(ldcs_process_data_t *procdata, char *pathname, cha static int handle_client_dirlists_req(ldcs_process_data_t *procdata, int nc); static int handle_close_client_query(ldcs_process_data_t *procdata, int nc); static int handle_alive_msg(ldcs_process_data_t *procdata, ldcs_message_t *msg); +static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_message_t *msg); +static int handle_chosen_cachepath_request(ldcs_process_data_t *procdata, int nc); +extern void getValidCachePathByIndex( uint64_t validBitIdx, char **realizedCachePath, char **parsedCachePath, char **symbolicCachePath ); /** * Query from client to server. Returns info about client's rank in server data structures. **/ @@ -1883,6 +1886,8 @@ int handle_client_message(ldcs_process_data_t *procdata, int nc, ldcs_message_t return handle_client_pickone_msg(procdata, nc, msg); case LDCS_MSG_END: return handle_client_end(procdata, nc); + case LDCS_MSG_CHOSEN_CACHEPATH_REQUEST: + return handle_chosen_cachepath_request(procdata, nc); default: err_printf("Received unexpected message from client %d: %d\n", nc, (int) msg->header.type); assert(0); @@ -1980,6 +1985,8 @@ int handle_server_message(ldcs_process_data_t *procdata, node_peer_t peer, ldcs_ case LDCS_MSG_ALIVE_REQ: case LDCS_MSG_ALIVE_RESP: return handle_alive_msg(procdata, msg); + case LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS: + return handle_cachepath_consensus(procdata, msg); default: err_printf("Received unexpected message from node: %d\n", (int) msg->header.type); assert(0); @@ -2936,6 +2943,79 @@ static int handle_client_pickone_msg(ldcs_process_data_t *procdata, int nc, ldcs } } +/** + * Handle LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS to determine which of the locations, commpaths, and cachepaths are + * available across all of the servers. + */ + +static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_message_t *msg){ + + int num_children = ldcs_audit_server_md_get_num_children(procdata); + + if (num_children) { + spindle_broadcast(procdata, msg); + msgbundle_force_flush(procdata); + } + + ldcs_audit_server_md_consensus(procdata, msg); + + if( procdata->cachepath_bitidx == 0 ){ + err_printf("No valid cachepath path available. Falling back to \"location\" path (%s).\n", procdata->location); + procdata->cachepath = procdata->location; + }else{ + // ldcs_audit_server_filemngt_init() does it's own realize() pass. + getValidCachePathByIndex( procdata->cachepath_bitidx, + &procdata->cachepath, + &procdata->parsed_cachepath, + &procdata->symbolic_cachepath); + } + + debug_printf3("Initializing file cache location %s\n", procdata->location); + ldcs_audit_server_filemngt_init(procdata->cachepath); + + test_printf(" cachepath=%s\n", procdata->cachepath); + return 0; +} + +/** + * Handle LDCS_MSG_CHOSEN_CACHEPATH_REQUEST + */ +static int handle_chosen_cachepath_request(ldcs_process_data_t *procdata, int nc){ + ldcs_message_t msg; + int connid; + ldcs_client_t *client; + + assert(nc != -1); + client = procdata->client_table + nc; + connid = client->connid; + if (client->state != LDCS_CLIENT_STATUS_ACTIVE || connid < 0) + return 0; + + + msg.header.type = LDCS_MSG_CHOSEN_CACHEPATH; + + msg.header.len = strlen(procdata->cachepath) + 1; + msg.data = procdata->cachepath; + ldcs_send_msg(connid, &msg); + procdata->server_stat.clientmsg.cnt++; + procdata->server_stat.clientmsg.time += ldcs_get_time() - client->query_arrival_time; + + msg.header.len = strlen(procdata->parsed_cachepath) + 1; + msg.data = procdata->parsed_cachepath; + ldcs_send_msg(connid, &msg); + procdata->server_stat.clientmsg.cnt++; + procdata->server_stat.clientmsg.time += ldcs_get_time() - client->query_arrival_time; + + msg.header.len = strlen(procdata->symbolic_cachepath) + 1; + msg.data = procdata->symbolic_cachepath; + ldcs_send_msg(connid, &msg); + procdata->server_stat.clientmsg.cnt++; + procdata->server_stat.clientmsg.time += ldcs_get_time() - client->query_arrival_time; + + return 0; +} + + /** * Handle alive message, which is a broadcast/response ping through all servers */ diff --git a/src/server/auditserver/ldcs_audit_server_md_cobo.c b/src/server/auditserver/ldcs_audit_server_md_cobo.c index ab14c0e4..bffa3ae7 100644 --- a/src/server/auditserver/ldcs_audit_server_md_cobo.c +++ b/src/server/auditserver/ldcs_audit_server_md_cobo.c @@ -397,3 +397,9 @@ int ldcs_audit_server_md_get_num_children(ldcs_process_data_t *procdata) cobo_get_num_childs(&num_childs); return num_childs; } + +void ldcs_audit_server_md_consensus(ldcs_process_data_t *ldcs_process_data, ldcs_message_t *msg){ + if( msg->header.type == LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS ){ + cobo_allreduce( &ldcs_process_data->cachepath_bitidx, COBO_OP_BITWISE_AND ); + } +} diff --git a/src/server/comlib/ldcs_api_util.c b/src/server/comlib/ldcs_api_util.c index 2bc2455d..b6beb56d 100644 --- a/src/server/comlib/ldcs_api_util.c +++ b/src/server/comlib/ldcs_api_util.c @@ -91,6 +91,9 @@ char* _message_type_to_str (ldcs_message_ids_t type) { STR_CASE(LDCS_MSG_PICKONE_RESP); STR_CASE(LDCS_MSG_ALIVE_REQ); STR_CASE(LDCS_MSG_ALIVE_RESP); + STR_CASE(LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS); + STR_CASE(LDCS_MSG_CHOSEN_CACHEPATH_REQUEST); + STR_CASE(LDCS_MSG_CHOSEN_CACHEPATH); STR_CASE(LDCS_MSG_UNKNOWN); } return "unknown"; From b3299eb2ad8effb3127069d850e18fad91d183ea Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:53:25 -0700 Subject: [PATCH 05/19] Cachepath: Adds cobo_allreduce() --- src/cobo/cobo.c | 42 ++++++++++++++++++++++++++++++++++++++++++ src/cobo/ldcs_cobo.h | 16 ++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/src/cobo/cobo.c b/src/cobo/cobo.c index de9d9b11..b1db3305 100644 --- a/src/cobo/cobo.c +++ b/src/cobo/cobo.c @@ -1432,6 +1432,48 @@ int cobo_allgather_str(char* sendstr, char*** recvstr, char** recvbuf) return COBO_SUCCESS; } +int cobo_allreduce( int64_t *pval, cobo_op_t op ){ + + /* if i have any children, receive their data */ + int64_t child_val; + for(int i=cobo_num_child-1; i>=0; i--) { + /* read int64_t from child */ + if (cobo_read_fd(cobo_child_fd[i], &child_val, sizeof(int64_t)) < 0) { + err_printf("Reducing data from child (rank %d) failed\n", cobo_child[i]); + exit(1); + } + + /* compare child's val to our current val */ + switch( op ){ + case COBO_OP_MIN: if( child_val < *pval ) *pval = child_val; break; + case COBO_OP_MAX: if( child_val > *pval ) *pval = child_val; break; + case COBO_OP_BITWISE_AND: *pval &= child_val; break; + case COBO_OP_BITWISE_OR: *pval |= child_val; break; + case COBO_OP_LOGICAL_AND: *pval = *pval && child_val; break; + case COBO_OP_LOGICAL_OR: *pval = *pval || child_val; break; + case COBO_OP_SUM: *pval += child_val; break; + case COBO_OP_NOOP: break; + default: + err_printf("Illegal op (%d). Ignoring.\n", op); + break; + } + } + + /* forward data to parent if we're not rank 0, otherwise set the recvbuf */ + if (cobo_me != 0) { + /* not the root, so forward our reduction result to our parent */ + if (cobo_write_fd(cobo_parent_fd, pval, sizeof(*pval)) < 0) { + err_printf("Sending reduced data to parent failed\n"); + exit(1); + } + } + + /* broadcast result of reduction from rank 0 to all tasks */ + cobo_bcast_tree(pval, sizeof(int64_t)); + + return COBO_SUCCESS; +} + /* provide list of ports and number of ports as input, get number of tasks and my rank as output */ int cobo_open(uint64_t sessionid, int* portlist, int num_ports, int* rank, int* num_ranks) { diff --git a/src/cobo/ldcs_cobo.h b/src/cobo/ldcs_cobo.h index edacd4b1..30cc9673 100644 --- a/src/cobo/ldcs_cobo.h +++ b/src/cobo/ldcs_cobo.h @@ -55,6 +55,7 @@ extern "C" { #define cobo_allgather COMBINE(COBO_NAMESPACE, cobo_allgather) #define cobo_alltoall COMBINE(COBO_NAMESPACE, cobo_alltoall ) #define cobo_allgather_str COMBINE(COBO_NAMESPACE, cobo_allgather_str) +#define cobo_allreduce COMBINE(COBO_NAMESPACE, cobo_allreduce) #define cobo_server_open COMBINE(COBO_NAMESPACE, cobo_server_open) #define cobo_server_close COMBINE(COBO_NAMESPACE, cobo_server_close) #define cobo_server_get_root_socket COMBINE(COBO_NAMESPACE, cobo_server_get_root_socket) @@ -67,6 +68,19 @@ extern "C" { #define cobo_register_preconnect_cb COMBINE(COBO_NAMESPACE, cobo_register_preconnect_cb) #endif +// Used for cobo_allreduce(). +typedef enum{ + COBO_OP_MIN, + COBO_OP_MAX, + COBO_OP_BITWISE_AND, + COBO_OP_BITWISE_OR, + COBO_OP_LOGICAL_AND, + COBO_OP_LOGICAL_OR, + COBO_OP_SUM, + COBO_OP_NOOP, + NUM_COBO_OP +} cobo_op_t; + /* * ========================================================================== * ========================================================================== @@ -128,6 +142,8 @@ int cobo_alltoall (void* sendbuf, int sendcount, void* recvbuf); */ int cobo_allgather_str(char* sendstr, char*** recvstr, char** recvbuf); +int cobo_allreduce(int64_t *pval, cobo_op_t op); + /* * ========================================================================== * ========================================================================== From 9ac821f060e0c32f3628be2b428f092a9bfb9af7 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:54:01 -0700 Subject: [PATCH 06/19] Cachepath: Adds parameters to config_mgr --- src/fe/startup/config_mgr.cc | 23 +++++++++++++++++++++++ src/fe/startup/config_mgr.h | 4 +++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/fe/startup/config_mgr.cc b/src/fe/startup/config_mgr.cc index 2d89eea7..8e954c63 100644 --- a/src/fe/startup/config_mgr.cc +++ b/src/fe/startup/config_mgr.cc @@ -56,6 +56,12 @@ using namespace std; #define SPINDLE_LOC_STR "$TMPDIR" #endif +#if defined(CACHEPATHS) +#define SPINDLE_CACHEPATHS_STR CACHEPATHS +#else +#define SPINDLE_CACHEPATHS_STR "$TMPDIR" +#endif + #if defined(SPINDLE_LOCAL_PREFIX) #define SPINDLE_LOCAL_PREFIX_STR SPINDLE_LOCAL_PREFIX #else @@ -269,6 +275,8 @@ void initOptionsList() "Strip debug and symbol information from binaries before distributing them." }, { confLocation, "location", shortLocation, groupMisc, cvString, {}, SPINDLE_LOC_STR, "Back-end directory for storing relocated files. Should be a non-shared location such as a ramdisk." }, + { confCachePaths, "cachepaths", shortCachePaths, groupMisc, cvString, {}, SPINDLE_CACHEPATHS_STR, + "Colon-separated list of candidate paths for cached libraries."}, { confNoclean, "noclean", shortNoClean, groupMisc, cvBool, {}, "false", "Don't remove local file cache after execution." }, { confDisableLogging, "disable-logging", shortDisableLogging, groupMisc, cvBool, {}, DISABLE_LOGGING_STR, @@ -740,6 +748,21 @@ bool ConfigMap::toSpindleArgs(spindle_args_t &args, bool alloc_strs) const args.location = strdup(loc.c_str()); break; } + case confCachePaths:{ + // Paramemter values are colon-separated lists of paths. + // Append "/spindle.$NUMBER" to each path in the list. + string paths = strresult; + size_t idx = paths.find(":"); + string number_var_with_colon("/spindle.$NUMBER:"); + string number_var_without_colon("/spindle.$NUMBER"); + while( idx != string::npos ){ + paths.replace(idx, 1, number_var_with_colon); + idx = paths.find(":", idx + number_var_with_colon.size()); + }; + paths += number_var_without_colon; + args.candidate_cachepaths = strdup(paths.c_str()); + break; + } case confCachePrefix: case confPythonPrefix: if (args.pythonprefix) diff --git a/src/fe/startup/config_mgr.h b/src/fe/startup/config_mgr.h index 8e70daa6..27be1ae8 100644 --- a/src/fe/startup/config_mgr.h +++ b/src/fe/startup/config_mgr.h @@ -30,6 +30,7 @@ enum SpindleConfigID { confPort, confNumPorts, confLocation, + confCachePaths, confCachePrefix, confPythonPrefix, confLocalPrefix, @@ -125,7 +126,8 @@ enum CmdlineShortOptions { shortSpindleLevel = 296, shortLocalPrefix = 297, shortExecExcludes = 298, - shortPatchLdso + shortPatchLdso, + shortCachePaths, }; enum CmdlineGroups { From abcfeaaafc8e00cd9868f4bf20432efed350cd66 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:55:39 -0700 Subject: [PATCH 07/19] Cachepath: adds flux parameter support --- src/flux/flux-spindle.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/flux/flux-spindle.c b/src/flux/flux-spindle.c index 67d2424f..ee7dc11a 100644 --- a/src/flux/flux-spindle.c +++ b/src/flux/flux-spindle.c @@ -379,7 +379,7 @@ static int sp_getopts (flux_shell_t *shell, struct spindle_ctx *ctx) const char *relocaout = NULL, *reloclibs = NULL, *relocexec = NULL, *relocpython = NULL; const char *followfork = NULL, *preload = NULL, *level = NULL; const char *pyprefix = NULL, *location = NULL; - char *numafiles = NULL; + char *numafiles = NULL, *cachepaths = NULL; if (flux_shell_getopt_unpack (shell, "spindle", "o", &opts) < 0) return -1; @@ -401,7 +401,7 @@ static int sp_getopts (flux_shell_t *shell, struct spindle_ctx *ctx) * supplied by the user, but not unpacked (This handles typos, etc). */ if (json_unpack_ex (opts, &error, JSON_STRICT, - "{s?i s?i s?i s?i s?s s?s s?s s?s s?s s?s s?s s?i s?s s?s s?s}", + "{s?i s?i s?i s?i s?s s?s s?s s?s s?s s?s s?s s?i s?s s?s s?s s?s}", "noclean", &noclean, "nostrip", &nostrip, "push", &push, @@ -416,7 +416,8 @@ static int sp_getopts (flux_shell_t *shell, struct spindle_ctx *ctx) "numa", &numa, "numa-files", &numafiles, "preload", &preload, - "level", &level) < 0) + "level", &level, + "cachepaths", &cachepaths) < 0) logerrno_printf_and_return(1, "Error in spindle option: %s\n", error.text); if (noclean) @@ -459,6 +460,9 @@ static int sp_getopts (flux_shell_t *shell, struct spindle_ctx *ctx) free (ctx->params.pythonprefix); ctx->params.pythonprefix = tmp; } + if( cachepaths ){ + ctx->params.candidate_cachepaths = cachepaths; + } if (location) { ctx->params.location = (char *) location; } From 694d941bef596daef15c640735fa52bf1105715c Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:56:05 -0700 Subject: [PATCH 08/19] Cachepath: Adds logging support. --- src/logging/spindle_logd.cc | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/logging/spindle_logd.cc b/src/logging/spindle_logd.cc index 62618e62..0a78db49 100644 --- a/src/logging/spindle_logd.cc +++ b/src/logging/spindle_logd.cc @@ -201,7 +201,7 @@ class TestVerifier std::vector err_strings; std::set > target_libs; std::set > libs_loaded; - char *location; + char *cachepath; void logerror(std::string s) { @@ -248,7 +248,8 @@ class TestVerifier tmp_s = getenv("TEMPDIR"); if (!tmp_s) tmp_s = "/tmp"; - location = strdup(tmp_s); + // These are reasonable fallbacks that should be replaced via messages, below. + cachepath = strdup(tmp_s); } ~TestVerifier() @@ -267,7 +268,7 @@ class TestVerifier strstr(filename, "retzero") == NULL && strstr(filename, ".py") == NULL) return true; - bool is_from_temp = (strstr(filename, location) != NULL) && (strncmp(filename, "/__not_exist", 12) != 0); + bool is_from_temp = (strstr(filename, cachepath) != NULL) && (strncmp(filename, "/__not_exist", 12) != 0); bool is_local_test = strstr(filename, "liblocal") != NULL; if (is_from_temp && !is_local_test && ret_code == -1) { @@ -293,12 +294,12 @@ class TestVerifier char buffer[4096]; int ret; - if (strstr(s, " location=" ) == s ){ - free( location ); - const char *loc_start = strstr( s, "=") + 1; - size_t loc_len = strlen( loc_start ); - location = strdup( loc_start ); - location[ loc_len - 1 ] = '\0'; // Remove trailing '\n'. + if (strstr(s, " cachepath=" ) == s ){ + free( cachepath ); + const char *cachepath_start = strstr( s, "=") + 1; + size_t cachepath_len = strlen( cachepath_start ); + cachepath = strdup( cachepath_start ); + cachepath[ cachepath_len - 1 ] = '\0'; // Remove trailing '\n'. } if (strstr(s, "open(") == s) { const char *first_quote, *last_quote, *equals; From 9da4214249df7d2e27c104cf7d9f49b54325db10 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:56:34 -0700 Subject: [PATCH 09/19] Cachepath: Removes out-of-root cleanup checks. --- src/server/auditserver/cleanup_proc.cc | 4 ++-- testsuite/test_driver.c | 23 ----------------------- 2 files changed, 2 insertions(+), 25 deletions(-) diff --git a/src/server/auditserver/cleanup_proc.cc b/src/server/auditserver/cleanup_proc.cc index 8a789f39..2076fefd 100644 --- a/src/server/auditserver/cleanup_proc.cc +++ b/src/server/auditserver/cleanup_proc.cc @@ -71,7 +71,8 @@ static void rmDirSet(const set &dirs, const char *prefix_dir) continue; if (strncmp(prefix_dir, componentpath.c_str(), prefix_size) != 0) { - err_printf("Tried to clean a file %s that wasn't in our prefix %s\n", componentpath.c_str(), prefix_dir); + // We have multiple directory roots. Not a problem if the directory + // we're looking for isn't in this one. continue; } unlink(componentpath.c_str()); @@ -82,7 +83,6 @@ static void rmDirSet(const set &dirs, const char *prefix_dir) sort(ordered_dirs.begin(), ordered_dirs.end(), longest_str_first); for (vector::iterator i = ordered_dirs.begin(); i != ordered_dirs.end(); i++) { if (strncmp(prefix_dir, i->c_str(), prefix_size) != 0) { - err_printf("Tried to rmdir directory %s that wasn't in our prefix %s\n", i->c_str(), prefix_dir); continue; } rmdir(i->c_str()); diff --git a/testsuite/test_driver.c b/testsuite/test_driver.c index a4b3b46f..70bb6c89 100644 --- a/testsuite/test_driver.c +++ b/testsuite/test_driver.c @@ -1142,27 +1142,6 @@ static char* getCacheLocation(char *env_var) return strdup(last_slash); } -static int checkLinkForLeak(const char *path, const char *spindle_loc) -{ - char link_target[4096]; - int result, error; - memset(link_target, 0, sizeof(link_target)); - - result = readlink(path, link_target, sizeof(link_target)); - if (result == -1) { - error = errno; - err_printf("Failed to read link %s: %s\n", path, strerror(error)); - return -1; - } - - if (strstr(link_target, spindle_loc)) { - err_printf("Link at '%s' has path '%s', which leaks spindle path with '%s'\n", path, link_target, spindle_loc); - return -1; - } - - return 0; -} - static int checkPathForLeak(const char *what, const char *path, const char *spindle_loc) { if (strstr(path, spindle_loc)) { @@ -1259,9 +1238,7 @@ void check_for_path_leaks() continue; strncpy(path, "/proc/self/fd/", sizeof(path)); strncat(path, d->d_name, sizeof(path)-1); - checkLinkForLeak(path, spindle_loc); } - checkLinkForLeak("/proc/self/exe", spindle_loc); /** * Check link_maps for leaked spindle paths From 3b44b7836468f7ac13c1af93f226017bcbd7b46e Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:57:06 -0700 Subject: [PATCH 10/19] Cachepath: Set of small, miscellaneous patches. --- src/include/spindle_launch.h | 5 +++++ src/server/auditserver/ldcs_audit_server_md.h | 3 +++ src/server/auditserver/ldcs_audit_server_process.c | 5 +++-- src/server/auditserver/ldcs_audit_server_process.h | 5 +++++ src/server/startup/spindle_be.cc | 4 +++- 5 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/include/spindle_launch.h b/src/include/spindle_launch.h index 47a4d92e..5476734d 100644 --- a/src/include/spindle_launch.h +++ b/src/include/spindle_launch.h @@ -126,6 +126,11 @@ typedef struct { /* The local-disk location where Spindle will store its cache */ char *location; + /* Path[s] for cached libraries. */ + char *candidate_cachepaths; /* Colon-separated list of candidate paths (max 64) */ + char *chosen_cachepath; /* The consensus path (same across all nodes). */ + uint64_t cachepath_bitidx; /* Bit index used by allReduce() to arrive at consensus. */ + /* Colon-seperated list of directories where Python is installed */ char *pythonprefix; diff --git a/src/server/auditserver/ldcs_audit_server_md.h b/src/server/auditserver/ldcs_audit_server_md.h index eb5bf9f6..ba7943e2 100644 --- a/src/server/auditserver/ldcs_audit_server_md.h +++ b/src/server/auditserver/ldcs_audit_server_md.h @@ -107,6 +107,9 @@ int ldcs_audit_server_md_broadcast_noncontig(ldcs_process_data_t *ldcs_process_d int ldcs_audit_server_md_get_num_children(ldcs_process_data_t *procdata); int ldcs_audit_server_md_is_parent(node_peer_t peer); + +void ldcs_audit_server_md_consensus(ldcs_process_data_t *ldcs_process_data, ldcs_message_t *msg); + #if defined(__cplusplus) } diff --git a/src/server/auditserver/ldcs_audit_server_process.c b/src/server/auditserver/ldcs_audit_server_process.c index 41342571..04b59e7e 100644 --- a/src/server/auditserver/ldcs_audit_server_process.c +++ b/src/server/auditserver/ldcs_audit_server_process.c @@ -141,6 +141,9 @@ int ldcs_audit_server_process(spindle_args_t *args) debug_printf3("Initializing server data structures\n"); ldcs_process_data.location = args->location; + ldcs_process_data.cachepaths = args->candidate_cachepaths; + ldcs_process_data.cachepath = args->chosen_cachepath; + ldcs_process_data.cachepath_bitidx = args->cachepath_bitidx; ldcs_process_data.number = args->number; ldcs_process_data.pythonprefix = args->pythonprefix; ldcs_process_data.localprefix = args->local_prefixes; @@ -190,8 +193,6 @@ int ldcs_audit_server_process(spindle_args_t *args) } ldcs_process_data.server_stat.hostname=ldcs_process_data.hostname; - debug_printf3("Initializing file cache location %s\n", ldcs_process_data.location); - ldcs_audit_server_filemngt_init(ldcs_process_data.location); if (ldcs_process_data.opts & OPT_PROCCLEAN) init_cleanup_proc(ldcs_process_data.location); diff --git a/src/server/auditserver/ldcs_audit_server_process.h b/src/server/auditserver/ldcs_audit_server_process.h index 985ecfc1..99f87119 100644 --- a/src/server/auditserver/ldcs_audit_server_process.h +++ b/src/server/auditserver/ldcs_audit_server_process.h @@ -126,6 +126,11 @@ struct ldcs_process_data_struct ldcs_dist_model_t dist_model; ldcs_client_t* client_table; char *location; + char *cachepaths; + char *cachepath; + char *symbolic_cachepath; + char *parsed_cachepath; + int64_t cachepath_bitidx; char *hostname; char *pythonprefix; char *localprefix; diff --git a/src/server/startup/spindle_be.cc b/src/server/startup/spindle_be.cc index 7493c020..4f583756 100644 --- a/src/server/startup/spindle_be.cc +++ b/src/server/startup/spindle_be.cc @@ -59,6 +59,7 @@ static int unpack_data(spindle_args_t *args, void *buffer, int buffer_size) unpack_param(args->startup_type, buf, pos); unpack_param(args->shm_cache_size, buf, pos); unpack_param(args->location, buf, pos); + unpack_param(args->candidate_cachepaths, buf, pos); unpack_param(args->pythonprefix, buf, pos); unpack_param(args->preloadfile, buf, pos); unpack_param(args->bundle_timeout_ms, buf, pos); @@ -152,7 +153,8 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i debug_printf("Translated location from %s to %s\n", args.location, new_location); free(args.location); args.location = new_location; - test_printf(" location=%s\n", args.location); + + determineValidCachePaths( &args.cachepath_bitidx, args.candidate_cachepaths, args.number); result = ldcs_audit_server_process(&args); if (result == -1) { From 957c792281d859bf5976b0a48f2eb640ad61c5a2 Mon Sep 17 00:00:00 2001 From: Barry Date: Tue, 21 Oct 2025 12:21:37 -0700 Subject: [PATCH 11/19] Fixes per Matt's comments. --- src/client/client/intercept_exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/client/client/intercept_exec.c b/src/client/client/intercept_exec.c index 8480b75e..27928d35 100644 --- a/src/client/client/intercept_exec.c +++ b/src/client/client/intercept_exec.c @@ -196,7 +196,7 @@ static char **updateEnvironment(char **envp, int *num_modified, int propogate_sp if (envp) { debug_printf2("Propogating spindle environment by copying it to new envp list\n"); for (cur = (char **) envp; *cur; cur++, orig_size++); - new_size = orig_size + 20; + new_size = orig_size + 9; newenv = (char **) malloc(new_size * sizeof(char*)); propogateEnvironmentStr(envp, newenv, &pos, "SPINDLE"); From 755f34626544237179876465eabbd964644973b2 Mon Sep 17 00:00:00 2001 From: Barry Date: Wed, 22 Oct 2025 12:20:35 -0700 Subject: [PATCH 12/19] Single source of truth for client cachepath. Previously, chosen_realized_cachepath was copied into set_intercept_readlink_cachepath() chosen_realized_cachepath and chosen_parsed_cachepath were copied into set_should_intercept_cachepath() This PR removes both setter functions and makes the original pointers global. --- src/client/client/client.c | 7 ++----- src/client/client/intercept.h | 1 - src/client/client/intercept_readlink.c | 12 +++--------- src/client/client/should_intercept.c | 17 +++++------------ src/client/client/should_intercept.h | 1 - 5 files changed, 10 insertions(+), 28 deletions(-) diff --git a/src/client/client/client.c b/src/client/client/client.c index 199066ff..ad4742c5 100644 --- a/src/client/client/client.c +++ b/src/client/client/client.c @@ -71,7 +71,7 @@ static int num_libc_phdrs, num_interp_phdrs; ElfW(Addr) libc_loadoffset, interp_loadoffset; static char *location; -static char *chosen_realized_cachepath, *chosen_parsed_cachepath, *chosen_symbolic_cachepath; +char *chosen_realized_cachepath, *chosen_parsed_cachepath; number_t number; static int have_stat_patches; @@ -258,10 +258,7 @@ static int init_server_connection() send_cpu(ldcsid, get_cur_cpu()); #endif } - send_cachepath_query( ldcsid, &chosen_realized_cachepath, &chosen_parsed_cachepath, &chosen_symbolic_cachepath ); - set_should_intercept_cachepath( chosen_realized_cachepath, chosen_parsed_cachepath, chosen_symbolic_cachepath ); - set_intercept_readlink_cachepath( chosen_realized_cachepath, chosen_parsed_cachepath, chosen_symbolic_cachepath ); - + send_cachepath_query( ldcsid, &chosen_realized_cachepath, &chosen_parsed_cachepath, NULL); snprintf(debugging_name, 32, "Client.%d", rankinfo[0]); LOGGING_INIT(debugging_name); diff --git a/src/client/client/intercept.h b/src/client/client/intercept.h index aae968f7..4ace2328 100644 --- a/src/client/client/intercept.h +++ b/src/client/client/intercept.h @@ -89,7 +89,6 @@ int execvpe_wrapper(const char *path, char *const argv[], const char *envp[]); pid_t vfork_wrapper(); char *dlerror_wrapper(); -void set_intercept_readlink_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ); ssize_t readlink_wrapper(const char *path, char *buf, size_t bufsiz); ssize_t readlinkat_wrapper(int dirfd, const char *pathname, char *buf, size_t bufsiz); diff --git a/src/client/client/intercept_readlink.c b/src/client/client/intercept_readlink.c index 8d0dea8e..43675ae2 100644 --- a/src/client/client/intercept_readlink.c +++ b/src/client/client/intercept_readlink.c @@ -31,23 +31,17 @@ Place, Suite 330, Boston, MA 02111-1307 USA ssize_t (*orig_readlink)(const char *path, char *buf, size_t bufsiz); ssize_t (*orig_readlinkat)(int dirfd, const char *pathname, char *buf, size_t bufsiz); -static char *cachepath; - -void set_intercept_readlink_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ){ - cachepath = chosen_realized_cachepath; - chosen_parsed_cachepath = chosen_parsed_cachepath; - chosen_symbolic_cachepath = chosen_symbolic_cachepath; -} static int fix_local_readlink(char *buf, size_t bufsiz) { char spindle_id[32]; int cachepath_len, result; char tmp[MAX_PATH_LEN+1]; + extern char *chosen_realized_cachepath; - cachepath_len = strlen(cachepath); + cachepath_len = strlen(chosen_realized_cachepath); snprintf(spindle_id, sizeof(spindle_id), "spindle.%lx", number); - if (strstr(buf, spindle_id) && strncmp(cachepath, buf, cachepath_len) == 0) { + if (strstr(buf, spindle_id) && strncmp(chosen_realized_cachepath, buf, cachepath_len) == 0) { debug_printf2("readlink received spindle cache path %s. Translating\n", buf); result = send_orig_path_request(ldcsid, buf+cachepath_len+1, tmp); if (result == -1) diff --git a/src/client/client/should_intercept.c b/src/client/client/should_intercept.c index b3a4376e..597900cc 100644 --- a/src/client/client/should_intercept.c +++ b/src/client/client/should_intercept.c @@ -29,27 +29,20 @@ #include "spindle_debug.h" extern int relocate_spindleapi(); -static char *cachepath, *orig_cachepath; - -void set_should_intercept_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ){ - cachepath = chosen_realized_cachepath; - orig_cachepath = chosen_parsed_cachepath; - chosen_symbolic_cachepath = chosen_symbolic_cachepath; -} - int is_in_spindle_cache(const char *pathname) { static int cachepath_size = 0; static int orig_cachepath_size = 0; + extern char *chosen_realized_cachepath, *chosen_parsed_cachepath; if (!cachepath_size) { - cachepath_size = strlen(cachepath); + cachepath_size = strlen(chosen_realized_cachepath); } if (!orig_cachepath_size) { - orig_cachepath_size = strlen(orig_cachepath); + orig_cachepath_size = strlen(chosen_parsed_cachepath); } - return ((strncmp(pathname, cachepath, cachepath_size) == 0) || - (strncmp(pathname, orig_cachepath, orig_cachepath_size) == 0)); + return ((strncmp(pathname, chosen_realized_cachepath, cachepath_size) == 0) || + (strncmp(pathname, chosen_parsed_cachepath, orig_cachepath_size) == 0)); } extern int is_local_prefix(const char *path, char **cached_local_prefixes); diff --git a/src/client/client/should_intercept.h b/src/client/client/should_intercept.h index 6a545913..f6a9b510 100644 --- a/src/client/client/should_intercept.h +++ b/src/client/client/should_intercept.h @@ -27,7 +27,6 @@ #define EXCL_OPEN 2 #define ERR_CALL 3 -void set_should_intercept_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ); int open_filter(const char *fname, int flags); int fopen_filter(const char *fname, const char *flags); int exec_filter(const char *fname); From 9ad266b97565da16f91ccb98f929dc56dc7f46b9 Mon Sep 17 00:00:00 2001 From: Barry Date: Wed, 22 Oct 2025 14:39:23 -0700 Subject: [PATCH 13/19] Comments the cachepath variables. --- .../auditserver/ldcs_audit_server_process.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/server/auditserver/ldcs_audit_server_process.h b/src/server/auditserver/ldcs_audit_server_process.h index 99f87119..55122e07 100644 --- a/src/server/auditserver/ldcs_audit_server_process.h +++ b/src/server/auditserver/ldcs_audit_server_process.h @@ -125,12 +125,15 @@ struct ldcs_process_data_struct int exit_readys_recvd; ldcs_dist_model_t dist_model; ldcs_client_t* client_table; - char *location; - char *cachepaths; - char *cachepath; - char *symbolic_cachepath; - char *parsed_cachepath; - int64_t cachepath_bitidx; + char *location; /* Single user-specified path for fifo, daemons, etc. */ + /* (Everything except the cachepath.) */ + char *cachepaths; /* Up to 64 colon-separated list of candidate cachepaths. */ + char *cachepath; /* The earliest path in the list available to all servers. */ + /* (Environment variables replaced, symbolic links realized.) */ + char *symbolic_cachepath; /* The original representation of the cachepath. */ + char *parsed_cachepath; /* The cachepath with environment variables replaced. */ + /* (Symbolic links, if any, remain.) */ + int64_t cachepath_bitidx; /* Bit index of valid cachepaths on a given server. */ char *hostname; char *pythonprefix; char *localprefix; From 3f5fa5a862d7b5c4a7f3c742384ef9ecf1c77b0d Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 23 Oct 2025 13:07:21 -0700 Subject: [PATCH 14/19] Removes internal vars from spindle_launch.h Removes chosen_cachepath and cachepath_bitindex from spindle_launch.h Updates initialization of matching variables in ldcs_process_data. determineValidCachePaths() moved from spindle_be.cc to ldcs_audit_server_process.c to get ldcs_process_data visibility. Added #include "parseloc.h" to ldcs_audit_server_process.c to get declaration of determineValidCachePaths(). Relocated "parseloc.h" to src/util so ldcs_audit_server_process.c could find it. Trued up signedness of types caused my making "parseloc.h" more visible, e.g., cachepath_bitidx is now uint64_t everywhere. --- src/cobo/cobo.c | 2 +- src/cobo/ldcs_cobo.h | 2 +- src/include/spindle_launch.h | 2 -- src/server/auditserver/ldcs_audit_server_process.c | 9 +++++++-- src/server/auditserver/ldcs_audit_server_process.h | 2 +- src/server/startup/spindle_be.cc | 2 -- src/{client/beboot => utils}/parseloc.h | 2 +- 7 files changed, 11 insertions(+), 10 deletions(-) rename src/{client/beboot => utils}/parseloc.h (91%) diff --git a/src/cobo/cobo.c b/src/cobo/cobo.c index b1db3305..a427a444 100644 --- a/src/cobo/cobo.c +++ b/src/cobo/cobo.c @@ -1432,7 +1432,7 @@ int cobo_allgather_str(char* sendstr, char*** recvstr, char** recvbuf) return COBO_SUCCESS; } -int cobo_allreduce( int64_t *pval, cobo_op_t op ){ +int cobo_allreduce( uint64_t *pval, cobo_op_t op ){ /* if i have any children, receive their data */ int64_t child_val; diff --git a/src/cobo/ldcs_cobo.h b/src/cobo/ldcs_cobo.h index 30cc9673..fafbda6a 100644 --- a/src/cobo/ldcs_cobo.h +++ b/src/cobo/ldcs_cobo.h @@ -142,7 +142,7 @@ int cobo_alltoall (void* sendbuf, int sendcount, void* recvbuf); */ int cobo_allgather_str(char* sendstr, char*** recvstr, char** recvbuf); -int cobo_allreduce(int64_t *pval, cobo_op_t op); +int cobo_allreduce(uint64_t *pval, cobo_op_t op); /* * ========================================================================== diff --git a/src/include/spindle_launch.h b/src/include/spindle_launch.h index 5476734d..81c0728e 100644 --- a/src/include/spindle_launch.h +++ b/src/include/spindle_launch.h @@ -128,8 +128,6 @@ typedef struct { /* Path[s] for cached libraries. */ char *candidate_cachepaths; /* Colon-separated list of candidate paths (max 64) */ - char *chosen_cachepath; /* The consensus path (same across all nodes). */ - uint64_t cachepath_bitidx; /* Bit index used by allReduce() to arrive at consensus. */ /* Colon-seperated list of directories where Python is installed */ char *pythonprefix; diff --git a/src/server/auditserver/ldcs_audit_server_process.c b/src/server/auditserver/ldcs_audit_server_process.c index 04b59e7e..c77c0312 100644 --- a/src/server/auditserver/ldcs_audit_server_process.c +++ b/src/server/auditserver/ldcs_audit_server_process.c @@ -37,6 +37,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include "msgbundle.h" #include "exitnote.h" #include "cleanup_proc.h" +#include "parseloc.h" //#define GPERFTOOLS #if defined(GPERFTOOLS) @@ -142,8 +143,8 @@ int ldcs_audit_server_process(spindle_args_t *args) debug_printf3("Initializing server data structures\n"); ldcs_process_data.location = args->location; ldcs_process_data.cachepaths = args->candidate_cachepaths; - ldcs_process_data.cachepath = args->chosen_cachepath; - ldcs_process_data.cachepath_bitidx = args->cachepath_bitidx; + ldcs_process_data.cachepath = NULL; + ldcs_process_data.cachepath_bitidx = 0; ldcs_process_data.number = args->number; ldcs_process_data.pythonprefix = args->pythonprefix; ldcs_process_data.localprefix = args->local_prefixes; @@ -229,6 +230,10 @@ int ldcs_audit_server_process(spindle_args_t *args) if (fd != -1) { ldcs_listen_register_fd(fd, serverid, forceExitCB, (void *) &ldcs_process_data); } + determineValidCachePaths( + &ldcs_process_data.cachepath_bitidx, + ldcs_process_data.cachepaths, + ldcs_process_data.number ); return 0; } diff --git a/src/server/auditserver/ldcs_audit_server_process.h b/src/server/auditserver/ldcs_audit_server_process.h index 55122e07..812b4a2e 100644 --- a/src/server/auditserver/ldcs_audit_server_process.h +++ b/src/server/auditserver/ldcs_audit_server_process.h @@ -133,7 +133,7 @@ struct ldcs_process_data_struct char *symbolic_cachepath; /* The original representation of the cachepath. */ char *parsed_cachepath; /* The cachepath with environment variables replaced. */ /* (Symbolic links, if any, remain.) */ - int64_t cachepath_bitidx; /* Bit index of valid cachepaths on a given server. */ + uint64_t cachepath_bitidx; /* Bit index of valid cachepaths on a given server. */ char *hostname; char *pythonprefix; char *localprefix; diff --git a/src/server/startup/spindle_be.cc b/src/server/startup/spindle_be.cc index 4f583756..733d3244 100644 --- a/src/server/startup/spindle_be.cc +++ b/src/server/startup/spindle_be.cc @@ -154,8 +154,6 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i free(args.location); args.location = new_location; - determineValidCachePaths( &args.cachepath_bitidx, args.candidate_cachepaths, args.number); - result = ldcs_audit_server_process(&args); if (result == -1) { err_printf("Error in ldcs_audit_server_process\n"); diff --git a/src/client/beboot/parseloc.h b/src/utils/parseloc.h similarity index 91% rename from src/client/beboot/parseloc.h rename to src/utils/parseloc.h index 1731906a..a99409c3 100644 --- a/src/client/beboot/parseloc.h +++ b/src/utils/parseloc.h @@ -28,7 +28,7 @@ char *parse_location_noerr(char *loc, number_t number); char *realize(char *path); char **parse_colonsep_prefixes(char *colonsep_list, number_t number); int is_local_prefix(const char *path, char **local_prefixes); -static int validateCandidatePath( char *candidatePath, char **realizedPath, char **parsedPath, char **symbolicPath, number_t number ); +int validateCandidatePath( char *candidatePath, char **realizedPath, char **parsedPath, char **symbolicPath, number_t number ); void determineValidCachePaths( uint64_t *validBitIdx, char *origPathList, number_t number ); void getValidCachePathByIndex( uint64_t validBitIdx, char **realizedCachePath, char **parsedCachePath, char **symbolicCachePath ); From d390792561dbb0c2fa09436ec2750039bea2a856 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 23 Oct 2025 14:18:59 -0700 Subject: [PATCH 15/19] Client cachepath message now uses single response. The three-message-reply response is now a single message with two strings. The symbolic version of the cachepath is no longer communicated as it was not being used. --- src/client/client/client.c | 2 +- src/client/client_comlib/client_api.c | 31 ++++--------------- src/client/client_comlib/client_api.h | 2 +- .../auditserver/ldcs_audit_server_handlers.c | 19 +++--------- 4 files changed, 13 insertions(+), 41 deletions(-) diff --git a/src/client/client/client.c b/src/client/client/client.c index ad4742c5..d3df30ae 100644 --- a/src/client/client/client.c +++ b/src/client/client/client.c @@ -258,7 +258,7 @@ static int init_server_connection() send_cpu(ldcsid, get_cur_cpu()); #endif } - send_cachepath_query( ldcsid, &chosen_realized_cachepath, &chosen_parsed_cachepath, NULL); + send_cachepath_query( ldcsid, &chosen_realized_cachepath, &chosen_parsed_cachepath ); snprintf(debugging_name, 32, "Client.%d", rankinfo[0]); LOGGING_INIT(debugging_name); diff --git a/src/client/client_comlib/client_api.c b/src/client/client_comlib/client_api.c index 509326c2..511c1cc9 100644 --- a/src/client/client_comlib/client_api.c +++ b/src/client/client_comlib/client_api.c @@ -36,9 +36,9 @@ static struct lock_t comm_lock; #define COMM_LOCK do { if (lock(&comm_lock) == -1) return -1; } while (0) #define COMM_UNLOCK unlock(&comm_lock) - -int send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chosen_parsed_cachepath, char **chosen_symbolic_cachepath ){ + +int send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chosen_parsed_cachepath){ ldcs_message_t message; char buffer[MAX_PATH_LEN+1]; buffer[MAX_PATH_LEN] = '\0'; @@ -59,32 +59,13 @@ int send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chose err_printf("Got unexpected message of type %d\n", (int) message.header.type); assert(0); } + char *local_crc = strdup( buffer ); + char *local_cpc = strdup( &buffer[ strlen(local_crc) + 1 ] ); if( chosen_realized_cachepath ){ - *chosen_realized_cachepath = strdup( buffer ); - } - - COMM_LOCK; - client_recv_msg_static(fd, &message, LDCS_READ_BLOCK); - COMM_UNLOCK; - - if (message.header.type != LDCS_MSG_CHOSEN_CACHEPATH || message.header.len > MAX_PATH_LEN) { - err_printf("Got unexpected message of type %d\n", (int) message.header.type); - assert(0); + *chosen_realized_cachepath = local_crc; } if( chosen_parsed_cachepath ){ - *chosen_parsed_cachepath = strdup( buffer ); - } - - COMM_LOCK; - client_recv_msg_static(fd, &message, LDCS_READ_BLOCK); - COMM_UNLOCK; - - if (message.header.type != LDCS_MSG_CHOSEN_CACHEPATH || message.header.len > MAX_PATH_LEN) { - err_printf("Got unexpected message of type %d\n", (int) message.header.type); - assert(0); - } - if( chosen_symbolic_cachepath ){ - *chosen_symbolic_cachepath = strdup( buffer ); + *chosen_parsed_cachepath = local_cpc; } return 0; diff --git a/src/client/client_comlib/client_api.h b/src/client/client_comlib/client_api.h index 982c4b1c..3d7c41be 100644 --- a/src/client/client_comlib/client_api.h +++ b/src/client/client_comlib/client_api.h @@ -42,7 +42,7 @@ int send_orig_path_request(int fd, const char *path, char *newpath); int send_dirlists_request(int fd, char **local_result, char **exece_result, char **to_free); int send_procmaps_query(int fd, int pid, char *result); int send_pickone_query(int fd, char *key, int *result); -int send_cachepath_query( int fd, char **chosen_symbolic_cachepath, char **chosen_parsed_cachepath, char **chosen_realized_cachepath ); +int send_cachepath_query( int fd, char **chosen_symbolic_cachepath, char **chosen_parsed_cachepath); int get_python_prefix(int fd, char **prefix); diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c b/src/server/auditserver/ldcs_audit_server_handlers.c index eb1f29ca..93daadd5 100644 --- a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -2994,21 +2994,12 @@ static int handle_chosen_cachepath_request(ldcs_process_data_t *procdata, int nc msg.header.type = LDCS_MSG_CHOSEN_CACHEPATH; - msg.header.len = strlen(procdata->cachepath) + 1; - msg.data = procdata->cachepath; - ldcs_send_msg(connid, &msg); - procdata->server_stat.clientmsg.cnt++; - procdata->server_stat.clientmsg.time += ldcs_get_time() - client->query_arrival_time; - - msg.header.len = strlen(procdata->parsed_cachepath) + 1; - msg.data = procdata->parsed_cachepath; - ldcs_send_msg(connid, &msg); - procdata->server_stat.clientmsg.cnt++; - procdata->server_stat.clientmsg.time += ldcs_get_time() - client->query_arrival_time; - - msg.header.len = strlen(procdata->symbolic_cachepath) + 1; - msg.data = procdata->symbolic_cachepath; + msg.header.len = strlen(procdata->cachepath) + 1 + strlen(procdata->parsed_cachepath) + 1; + msg.data = calloc( 1, msg.header.len ); + strcpy( msg.data, procdata->cachepath ); + strcpy( &msg.data[ strlen(procdata->cachepath)+1 ], procdata->parsed_cachepath ); ldcs_send_msg(connid, &msg); + free( msg.data ); procdata->server_stat.clientmsg.cnt++; procdata->server_stat.clientmsg.time += ldcs_get_time() - client->query_arrival_time; From 0a423ae5ed2a46141b966e7845eea307f91ec995 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 23 Oct 2025 14:33:07 -0700 Subject: [PATCH 16/19] Removes assert(0) in network error paths. --- src/client/client_comlib/client_api.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/client/client_comlib/client_api.c b/src/client/client_comlib/client_api.c index 511c1cc9..8eae4f19 100644 --- a/src/client/client_comlib/client_api.c +++ b/src/client/client_comlib/client_api.c @@ -57,7 +57,7 @@ int send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chose if (message.header.type != LDCS_MSG_CHOSEN_CACHEPATH || message.header.len > MAX_PATH_LEN) { err_printf("Got unexpected message of type %d\n", (int) message.header.type); - assert(0); + return -1; } char *local_crc = strdup( buffer ); char *local_cpc = strdup( &buffer[ strlen(local_crc) + 1 ] ); @@ -102,7 +102,7 @@ int send_file_query(int fd, char* path, int dso, char** newpath, int *errcode) { if (message.header.type != LDCS_MSG_FILE_QUERY_ANSWER) { err_printf("Got unexpected message of type %d\n", (int) message.header.type); - assert(0); + return -1; } if (message.header.len > sizeof(int)) { @@ -195,7 +195,7 @@ int send_existance_test(int fd, char *path, int *exists) if (message.header.type != LDCS_MSG_EXISTS_ANSWER || message.header.len != sizeof(uint32_t)) { err_printf("Got unexpected message after existance test: %d\n", (int) message.header.type); - assert(0); + return -1; } memcpy(exists, buffer, sizeof(*exists)); @@ -232,7 +232,7 @@ int send_orig_path_request(int fd, const char *path, char *newpath) if (message.header.type != LDCS_MSG_ORIGPATH_ANSWER || message.header.len > MAX_PATH_LEN) { err_printf("Got unexpected message after existance test: %d\n", (int) message.header.type); - assert(0); + return -1; } strncpy(newpath, buffer, MAX_PATH_LEN+1); @@ -380,7 +380,7 @@ int send_ldso_info_request(int fd, const char *ldso_path, char *result_path) if (message.header.type != LDCS_MSG_LOADER_DATA_RESP) { err_printf("Got unexpected message after ldso req: %d\n", (int) message.header.type); - assert(0); + return -1; } return 0; } @@ -422,7 +422,7 @@ int send_rankinfo_query(int fd, int *mylrank, int *mylsize, int *mymdrank, int * if (message.header.type != LDCS_MSG_MYRANKINFO_QUERY_ANSWER || message.header.len != 4*sizeof(int)) { err_printf("Received incorrect response to rankinfo query %d\n", message.header.type); *mylrank = *mylsize = *mymdrank = *mymdsize = -1; - assert(0); + return -1; } p = (int *) message.data; @@ -457,7 +457,7 @@ int send_procmaps_query(int fd, int pid, char *result) if (message.header.type != LDCS_MSG_PROCMAPS_RESP) { err_printf("Received incorrect response to procmaps query %d\n", message.header.type); - assert(0); + return -1; } memcpy(result, buffer, MAX_PATH_LEN); @@ -488,7 +488,7 @@ int send_pickone_query(int fd, char *key, int *result) if (message.header.type != LDCS_MSG_PICKONE_RESP) { err_printf("Received incorrect response to procmaps query %d\n", message.header.type); - assert(0); + return -1; } *result = *((int *) message.data); From 7ab02fec478efa6832f074bb62585d6bba9286b8 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 23 Oct 2025 15:18:40 -0700 Subject: [PATCH 17/19] Renames ldcs_audit_server_md_consensus(). New name is ldcs_audit_server_md_allreduce_AND(). If we get to the point where we're using other allreduce operations we can solve the problem of duplicating the op list in md-land and cobo-land. For now, we're only using one op in md-land, so the op can go into the function name. --- src/server/auditserver/ldcs_audit_server_handlers.c | 2 +- src/server/auditserver/ldcs_audit_server_md.h | 2 +- src/server/auditserver/ldcs_audit_server_md_cobo.c | 6 ++---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c b/src/server/auditserver/ldcs_audit_server_handlers.c index 93daadd5..c5455110 100644 --- a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -2957,7 +2957,7 @@ static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_messag msgbundle_force_flush(procdata); } - ldcs_audit_server_md_consensus(procdata, msg); + ldcs_audit_server_md_allreduce_AND( &procdata->cachepath_bitidx ); if( procdata->cachepath_bitidx == 0 ){ err_printf("No valid cachepath path available. Falling back to \"location\" path (%s).\n", procdata->location); diff --git a/src/server/auditserver/ldcs_audit_server_md.h b/src/server/auditserver/ldcs_audit_server_md.h index ba7943e2..a4640370 100644 --- a/src/server/auditserver/ldcs_audit_server_md.h +++ b/src/server/auditserver/ldcs_audit_server_md.h @@ -108,7 +108,7 @@ int ldcs_audit_server_md_get_num_children(ldcs_process_data_t *procdata); int ldcs_audit_server_md_is_parent(node_peer_t peer); -void ldcs_audit_server_md_consensus(ldcs_process_data_t *ldcs_process_data, ldcs_message_t *msg); +void ldcs_audit_server_md_allreduce_AND( uint64_t *val ); #if defined(__cplusplus) diff --git a/src/server/auditserver/ldcs_audit_server_md_cobo.c b/src/server/auditserver/ldcs_audit_server_md_cobo.c index bffa3ae7..4e527fb6 100644 --- a/src/server/auditserver/ldcs_audit_server_md_cobo.c +++ b/src/server/auditserver/ldcs_audit_server_md_cobo.c @@ -398,8 +398,6 @@ int ldcs_audit_server_md_get_num_children(ldcs_process_data_t *procdata) return num_childs; } -void ldcs_audit_server_md_consensus(ldcs_process_data_t *ldcs_process_data, ldcs_message_t *msg){ - if( msg->header.type == LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS ){ - cobo_allreduce( &ldcs_process_data->cachepath_bitidx, COBO_OP_BITWISE_AND ); - } +void ldcs_audit_server_md_allreduce_AND( uint64_t *val ){ + cobo_allreduce( val, COBO_OP_BITWISE_AND ); } From c4fc8c947041804c8f83e52f99849998aba0fd77 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 23 Oct 2025 15:27:27 -0700 Subject: [PATCH 18/19] Adds explicit enum values to CmdlineShortOptions. --- src/fe/startup/config_mgr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fe/startup/config_mgr.h b/src/fe/startup/config_mgr.h index 27be1ae8..c2d3cd7e 100644 --- a/src/fe/startup/config_mgr.h +++ b/src/fe/startup/config_mgr.h @@ -126,8 +126,8 @@ enum CmdlineShortOptions { shortSpindleLevel = 296, shortLocalPrefix = 297, shortExecExcludes = 298, - shortPatchLdso, - shortCachePaths, + shortPatchLdso = 299, + shortCachePaths = 300, }; enum CmdlineGroups { From a221c9406aa9e4ecf372a1d2c9c425d8e73e9d1e Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 23 Oct 2025 15:38:19 -0700 Subject: [PATCH 19/19] Return instead of exit on network errors. --- src/cobo/cobo.c | 58 ++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/src/cobo/cobo.c b/src/cobo/cobo.c index a427a444..e8ee2a79 100644 --- a/src/cobo/cobo.c +++ b/src/cobo/cobo.c @@ -160,7 +160,7 @@ static char* cobo_getenv(char* envvar, int type) char* str = getenv(envvar); if (str == NULL && type == ENV_REQUIRED) { err_printf("Missing required environment variable: %s\n", envvar); - exit(1); + return NULL; } return str; } @@ -171,7 +171,7 @@ static void* cobo_malloc(size_t n, char* msg) void* p = malloc(n); if (!p) { err_printf("Call to malloc(%lu) failed: %s (%m errno %d)\n", n, msg, errno); - exit(1); + return NULL; } return p; } @@ -514,7 +514,7 @@ static int cobo_connect_hostname(char* hostname, int rank) break; case HSHAKE_INTERNAL_ERROR: err_printf("Internal error doing handshake: %s", spindle_handshake_last_error_str()); - exit(-1); + return -1; break; case HSHAKE_DROP_CONNECTION: debug_printf3("Handshake said to drop connection\n"); @@ -769,7 +769,7 @@ static int cobo_open_tree() if (sockfd < 0) { err_printf("Creating parent socket (socket() %m errno=%d)\n", errno); - exit(1); + return -1; } setCloseOnExec(sockfd); @@ -818,7 +818,7 @@ static int cobo_open_tree() if (!port_is_bound) { /* TODO: would like to send an abort back to server */ err_printf("Failed to open socket on any port\n"); - exit(1); + return -1; } /* accept a connection from parent and receive socket table */ @@ -838,7 +838,7 @@ static int cobo_open_tree() break; case HSHAKE_INTERNAL_ERROR: err_printf("Internal error doing handshake: %s", spindle_handshake_last_error_str()); - exit(-1); + return -1; break; case HSHAKE_DROP_CONNECTION: debug_printf3("Handshake said to drop connection\n"); @@ -908,26 +908,26 @@ static int cobo_open_tree() /* read our rank number */ if (cobo_read_fd(cobo_parent_fd, &cobo_me, sizeof(int)) < 0) { err_printf("Receiving my rank from parent failed\n"); - exit(1); + return -1; } /* discover how many ranks are in our world */ if (cobo_read_fd(cobo_parent_fd, &cobo_nprocs, sizeof(int)) < 0) { err_printf("Receiving number of tasks from parent failed\n"); - exit(1); + return -1; } /* read the size of the hostlist (in bytes) */ if (cobo_read_fd(cobo_parent_fd, &cobo_hostlist_size, sizeof(int)) < 0) { err_printf("Receiving size of hostname table from parent failed\n"); - exit(1); + return -1; } /* allocate space for the hostlist and read it in */ cobo_hostlist = (void*) cobo_malloc(cobo_hostlist_size, "Hostlist data buffer"); if (cobo_read_fd(cobo_parent_fd, cobo_hostlist, cobo_hostlist_size) < 0) { err_printf("Receiving hostname table from parent failed\n"); - exit(1); + return -1; } /* @@ -970,7 +970,7 @@ static int cobo_open_tree() if (cobo_child_fd[i] == -1) { err_printf("Failed to connect to child (rank %d) on %s failed\n", c, child_hostname); - exit(1); + return -1; } /* tell child what rank he is and forward the hostname table to him */ @@ -979,7 +979,7 @@ static int cobo_open_tree() if (forward != COBO_SUCCESS) { err_printf("Failed to forward hostname table to child (rank %d) on %s failed\n", c, child_hostname); - exit(1); + return -1; } /* free the child hostname string */ @@ -1034,7 +1034,7 @@ static int cobo_bcast_tree(void* buf, int size) if (cobo_me != 0) { if (cobo_read_fd(cobo_parent_fd, buf, size) < 0) { err_printf("Receiving broadcast data from parent failed\n"); - exit(1); + return -1; } } @@ -1043,7 +1043,7 @@ static int cobo_bcast_tree(void* buf, int size) if (cobo_write_fd(cobo_child_fd[i], buf, size) < 0) { err_printf("Broadcasting data to child (rank %d) failed\n", cobo_child[i]); - exit(1); + return -1; } } @@ -1059,7 +1059,7 @@ int cobo_bcast_down(void* buf, int size) if (cobo_write_fd(cobo_child_fd[i], buf, size) < 0) { err_printf("Broadcasting data to child (rank %d) failed\n", cobo_child[i]); - exit(1); + return -1; } } return rc; @@ -1081,7 +1081,7 @@ static int cobo_allreduce_max_int_tree(int* sendbuf, int* recvbuf) if (cobo_read_fd(cobo_child_fd[i], &child_val, sizeof(child_val)) < 0) { err_printf("Reducing data from child (rank %d) failed\n", cobo_child[i]); - exit(1); + return -1; } /* compare child's max to our current max */ @@ -1095,7 +1095,7 @@ static int cobo_allreduce_max_int_tree(int* sendbuf, int* recvbuf) /* not the root, so forward our reduction result to our parent */ if (cobo_write_fd(cobo_parent_fd, &max_val, sizeof(max_val)) < 0) { err_printf("Sending reduced data to parent failed\n"); - exit(1); + return -1; } } else { /* we're the root, got the result, set the recvbuf */ @@ -1130,7 +1130,7 @@ static int cobo_gather_tree(void* sendbuf, int sendcount, void* recvbuf) if (cobo_read_fd(cobo_child_fd[i], (char*)bigbuf + offset, sendcount * cobo_child_incl[i]) < 0) { err_printf("Gathering data from child (rank %d) failed\n", cobo_child[i]); - exit(1); + return -1; } offset += sendcount * cobo_child_incl[i]; } @@ -1139,7 +1139,7 @@ static int cobo_gather_tree(void* sendbuf, int sendcount, void* recvbuf) if (cobo_me != 0) { if (cobo_write_fd(cobo_parent_fd, bigbuf, bigcount) < 0) { err_printf("Sending gathered data to parent failed\n"); - exit(1); + return -1; } cobo_free(bigbuf); } @@ -1159,7 +1159,7 @@ static int cobo_scatter_tree(void* sendbuf, int sendcount, void* recvbuf) bigbuf = (void*) cobo_malloc(bigcount, "Temporary scatter buffer in cobo_scatter_tree"); if (cobo_read_fd(cobo_parent_fd, bigbuf, bigcount) < 0) { err_printf("Receiving scatter data from parent failed\n"); - exit(1); + return -1; } } @@ -1170,7 +1170,7 @@ static int cobo_scatter_tree(void* sendbuf, int sendcount, void* recvbuf) if (cobo_write_fd(cobo_child_fd[i], (char*)bigbuf + offset, sendcount * cobo_child_incl[i]) < 0) { err_printf("Scattering data to child (rank %d) failed\n", cobo_child[i]); - exit(1); + return -1; } offset += sendcount * cobo_child_incl[i]; } @@ -1258,7 +1258,7 @@ int cobo_bcast(void* buf, int sendcount, int root) rc = cobo_bcast_tree(buf, sendcount); } else { err_printf("Cannot execute bcast from non-zero root\n"); - exit(1); + return -1; } cobo_gettimeofday(&end); @@ -1284,7 +1284,7 @@ int cobo_gather(void* sendbuf, int sendcount, void* recvbuf, int root) rc = cobo_gather_tree(sendbuf, sendcount, recvbuf); } else { err_printf("Cannot execute gather to non-zero root\n"); - exit(1); + return -1; } cobo_gettimeofday(&end); @@ -1310,7 +1310,7 @@ int cobo_scatter(void* sendbuf, int sendcount, void* recvbuf, int root) rc = cobo_scatter_tree(sendbuf, sendcount, recvbuf); } else { err_printf("Cannot execute scatter from non-zero root\n"); - exit(1); + return -1; } cobo_gettimeofday(&end); @@ -1352,7 +1352,7 @@ int cobo_alltoall(void* sendbuf, int sendcount, void* recvbuf) int rc = COBO_SUCCESS; err_printf("Cannot execute alltoall\n"); - exit(1); + return -1; cobo_gettimeofday(&end); debug_printf3("Exiting cobo_alltoall(), took %f seconds for %d procs\n", cobo_getsecs(&end,&start), cobo_nprocs); @@ -1440,7 +1440,7 @@ int cobo_allreduce( uint64_t *pval, cobo_op_t op ){ /* read int64_t from child */ if (cobo_read_fd(cobo_child_fd[i], &child_val, sizeof(int64_t)) < 0) { err_printf("Reducing data from child (rank %d) failed\n", cobo_child[i]); - exit(1); + return -1; } /* compare child's val to our current val */ @@ -1464,7 +1464,7 @@ int cobo_allreduce( uint64_t *pval, cobo_op_t op ){ /* not the root, so forward our reduction result to our parent */ if (cobo_write_fd(cobo_parent_fd, pval, sizeof(*pval)) < 0) { err_printf("Sending reduced data to parent failed\n"); - exit(1); + return -1; } } @@ -1522,7 +1522,7 @@ int cobo_open(uint64_t sessionid, int* portlist, int num_ports, int* rank, int* cobo_ports = cobo_int_dup(portlist, num_ports); if (cobo_ports == NULL) { err_printf("Failed to copy port list\n"); - exit(1); + return -1; } /* open the tree */ @@ -1531,7 +1531,7 @@ int cobo_open(uint64_t sessionid, int* portlist, int num_ports, int* rank, int* /* need to check that tree opened successfully before returning, so do a barrier */ if (cobo_barrier() != COBO_SUCCESS) { err_printf("Failed to open tree\n"); - exit(1); + return -1; } if (cobo_me == 0) {