From 69f30b1f492afae31a9d58d349e0963fc427f918 Mon Sep 17 00:00:00 2001 From: crivella Date: Wed, 10 Sep 2025 18:19:51 +0200 Subject: [PATCH 1/5] Test generating command with oversubscription flags --- easybuild/tools/toolchain/mpi.py | 58 +++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/easybuild/tools/toolchain/mpi.py b/easybuild/tools/toolchain/mpi.py index 2f501d2fec..34fce34642 100644 --- a/easybuild/tools/toolchain/mpi.py +++ b/easybuild/tools/toolchain/mpi.py @@ -48,7 +48,7 @@ _log = fancylogger.getLogger('tools.toolchain.mpi', fname=False) -def get_mpi_cmd_template(mpi_family, params, mpi_version=None): +def get_mpi_cmd_template(mpi_family, params, mpi_version=None, oversubscribe=False): """ Return template for MPI command, for specified MPI family. @@ -123,6 +123,50 @@ def get_mpi_cmd_template(mpi_family, params, mpi_version=None): else: raise EasyBuildError("Don't know which template MPI command to use for MPI family '%s'", mpi_family) + if oversubscribe: + osub_cmd = '' + if mpi_family in [toolchain.OPENMPI,]: + if mpi_version is None: + raise EasyBuildError("OpenMPI version unknown, can't determine how to handle oversubscription!") + if LooseVersion(mpi_version) < '5': + varname = 'OMPI_MCA_rmaps_base_oversubscribe' + varvalue = os.getenv(varname) + if varvalue and varvalue != '1': + _log.warning("Overwriting existing %s=%s with %s=1", varname, varvalue, varname) + osub_cmd = f'{varname}=1' + else: + varname = 'PRTE_MCA_rmaps_default_mapping_policy' + varvalue = os.getenv(varname) + + # This logic should account for: + # - var not set -> set to 'core:oversubscribe' + # - unit set to value without `:` eg package -> 'package:oversubscribe' + # - unit set to value with `:` eg ppr:4:numa -> 'ppr:4:numa:oversubscribe' + # - all of the above but with oversubscribe already in flags + flags = '' + if varvalue.startswith('ppr'): + _log.warning("Can't handle ppr mapping with oversubscription yet, overwriting unit with 'core'") + unit = 'core' + flags = 'oversubscribe' + else: + unit, flags = (varvalue.rsplit(':', maxsplit=1) + [''])[:2] + unit = unit or 'core' + flags = list(filter(None, flags.split(','))) + if 'oversubscribe' not in flags: + flags.append('oversubscribe') + newvalue = f"{unit}:{','.join(flags)}" + + osub_cmd = f'{varname}={newvalue}' + elif mpi_family in [toolchain.INTELMPI,]: + _log.info("INTELMPI always oversubscribe by default, nothing to do...") + elif mpi_family in [toolchain.MVAPICH2, toolchain.MPICH, toolchain.MPICH2]: + _log.info("MPICH always oversubscribe by default, nothing to do...") + else: + raise EasyBuildError("Oversubscribe not supported for MPI family '%s'", mpi_family) + + mpi_cmd_template = f'%(oversubscribe)s {mpi_cmd_template}' + params.update({'oversubscribe': osub_cmd}) # just a placeholder + missing = [] for key in sorted(params.keys()): tmpl = '%(' + key + ')s' @@ -270,7 +314,7 @@ def mpi_cmd_prefix(self, nr_ranks=1): return result - def mpi_cmd_for(self, cmd, nr_ranks): + def mpi_cmd_for(self, cmd, nr_ranks, oversubscribe=False): """Construct an MPI command for the given command and number of ranks.""" # parameter values for mpirun command @@ -281,20 +325,18 @@ def mpi_cmd_for(self, cmd, nr_ranks): mpi_family = self.mpi_family() - mpi_version = None + # this fails when it's done too early (before modules for toolchain/dependencies are loaded), + # but it's safe to ignore this + mpi_version = self.get_software_version(self.MPI_MODULE_NAME, required=False)[0] if mpi_family == toolchain.INTELMPI: - # for Intel MPI, try to determine impi version - # this fails when it's done too early (before modules for toolchain/dependencies are loaded), - # but it's safe to ignore this - mpi_version = self.get_software_version(self.MPI_MODULE_NAME, required=False)[0] if not mpi_version: self.log.debug("Ignoring error when trying to determine %s version", self.MPI_MODULE_NAME) # impi version is required to determine correct MPI command template, # so we have to return early if we couldn't determine the impi version... return None - mpi_cmd_template, params = get_mpi_cmd_template(mpi_family, params, mpi_version=mpi_version) + mpi_cmd_template, params = get_mpi_cmd_template(mpi_family, params, mpi_version=mpi_version, oversubscribe=oversubscribe) self.log.info("Using MPI command template '%s' (params: %s)", mpi_cmd_template, params) try: From b8d6f88e79746e1de2a82dcced9f3cf55bd9d5ab Mon Sep 17 00:00:00 2001 From: crivella Date: Wed, 10 Sep 2025 18:24:55 +0200 Subject: [PATCH 2/5] lint --- easybuild/tools/toolchain/mpi.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/easybuild/tools/toolchain/mpi.py b/easybuild/tools/toolchain/mpi.py index 34fce34642..35ecc28188 100644 --- a/easybuild/tools/toolchain/mpi.py +++ b/easybuild/tools/toolchain/mpi.py @@ -125,7 +125,7 @@ def get_mpi_cmd_template(mpi_family, params, mpi_version=None, oversubscribe=Fal if oversubscribe: osub_cmd = '' - if mpi_family in [toolchain.OPENMPI,]: + if mpi_family in [toolchain.OPENMPI]: if mpi_version is None: raise EasyBuildError("OpenMPI version unknown, can't determine how to handle oversubscription!") if LooseVersion(mpi_version) < '5': @@ -157,7 +157,7 @@ def get_mpi_cmd_template(mpi_family, params, mpi_version=None, oversubscribe=Fal newvalue = f"{unit}:{','.join(flags)}" osub_cmd = f'{varname}={newvalue}' - elif mpi_family in [toolchain.INTELMPI,]: + elif mpi_family in [toolchain.INTELMPI]: _log.info("INTELMPI always oversubscribe by default, nothing to do...") elif mpi_family in [toolchain.MVAPICH2, toolchain.MPICH, toolchain.MPICH2]: _log.info("MPICH always oversubscribe by default, nothing to do...") @@ -336,7 +336,9 @@ def mpi_cmd_for(self, cmd, nr_ranks, oversubscribe=False): # so we have to return early if we couldn't determine the impi version... return None - mpi_cmd_template, params = get_mpi_cmd_template(mpi_family, params, mpi_version=mpi_version, oversubscribe=oversubscribe) + mpi_cmd_template, params = get_mpi_cmd_template( + mpi_family, params, mpi_version=mpi_version, oversubscribe=oversubscribe + ) self.log.info("Using MPI command template '%s' (params: %s)", mpi_cmd_template, params) try: From 07fda0bbf2ddc425c84f71b6f640d15dcffd2ced Mon Sep 17 00:00:00 2001 From: crivella Date: Mon, 8 Dec 2025 11:24:02 +0100 Subject: [PATCH 3/5] Fix flag separation with colons and add tests --- easybuild/tools/toolchain/mpi.py | 28 ++++++++----------------- test/framework/toolchain.py | 35 ++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/easybuild/tools/toolchain/mpi.py b/easybuild/tools/toolchain/mpi.py index 35ecc28188..0e44457133 100644 --- a/easybuild/tools/toolchain/mpi.py +++ b/easybuild/tools/toolchain/mpi.py @@ -136,27 +136,15 @@ def get_mpi_cmd_template(mpi_family, params, mpi_version=None, oversubscribe=Fal osub_cmd = f'{varname}=1' else: varname = 'PRTE_MCA_rmaps_default_mapping_policy' - varvalue = os.getenv(varname) + varvalue = os.getenv(varname, '') - # This logic should account for: - # - var not set -> set to 'core:oversubscribe' - # - unit set to value without `:` eg package -> 'package:oversubscribe' - # - unit set to value with `:` eg ppr:4:numa -> 'ppr:4:numa:oversubscribe' - # - all of the above but with oversubscribe already in flags - flags = '' - if varvalue.startswith('ppr'): - _log.warning("Can't handle ppr mapping with oversubscription yet, overwriting unit with 'core'") - unit = 'core' - flags = 'oversubscribe' - else: - unit, flags = (varvalue.rsplit(':', maxsplit=1) + [''])[:2] - unit = unit or 'core' - flags = list(filter(None, flags.split(','))) + # This logic should account for adding a `:oversubscribe` flag to the mapping policy if not + # already present. + # See https://docs.open-mpi.org/en/main/man-openmpi/man1/mpirun.1.html#the-map-by-option + flags = varvalue.lower().split(':') if 'oversubscribe' not in flags: - flags.append('oversubscribe') - newvalue = f"{unit}:{','.join(flags)}" - - osub_cmd = f'{varname}={newvalue}' + varvalue = f'{varvalue}:oversubscribe' + osub_cmd = f'{varname}={varvalue}' elif mpi_family in [toolchain.INTELMPI]: _log.info("INTELMPI always oversubscribe by default, nothing to do...") elif mpi_family in [toolchain.MVAPICH2, toolchain.MPICH, toolchain.MPICH2]: @@ -165,7 +153,7 @@ def get_mpi_cmd_template(mpi_family, params, mpi_version=None, oversubscribe=Fal raise EasyBuildError("Oversubscribe not supported for MPI family '%s'", mpi_family) mpi_cmd_template = f'%(oversubscribe)s {mpi_cmd_template}' - params.update({'oversubscribe': osub_cmd}) # just a placeholder + params.update({'oversubscribe': osub_cmd}) missing = [] for key in sorted(params.keys()): diff --git a/test/framework/toolchain.py b/test/framework/toolchain.py index e306e9df30..378d864fbe 100644 --- a/test/framework/toolchain.py +++ b/test/framework/toolchain.py @@ -1884,6 +1884,41 @@ def test_get_mpi_cmd_template(self): self.assertTrue(regex.match(nodesfile), "'%s' should match pattern '%s'" % (nodesfile, regex.pattern)) self.assertExists(nodesfile.split(' ')[1]) + # Test oversubscription + # With OpenMPI < 5 + mpi_cmd_tmpl, params = get_mpi_cmd_template(toolchain.OPENMPI, {}, mpi_version='4.9', oversubscribe=True) + self.assertTrue('%(oversubscribe)s' in mpi_cmd_tmpl) + self.assertEqual(params['oversubscribe'], 'OMPI_MCA_rmaps_base_oversubscribe=1') + + # With OpenMPI >= 5 + mpi_cmd_tmpl, params = get_mpi_cmd_template(toolchain.OPENMPI, {}, mpi_version='5', oversubscribe=True) + self.assertTrue('%(oversubscribe)s' in mpi_cmd_tmpl) + self.assertEqual(params['oversubscribe'], 'PRTE_MCA_rmaps_default_mapping_policy=:oversubscribe') + + # With OpenMPI >= 5 and pre-existing PRTE_MCA_rmaps_default_mapping_policy (override ppr) + os.environ['PRTE_MCA_rmaps_default_mapping_policy'] = 'ppr:4:package' + mpi_cmd_tmpl, params = get_mpi_cmd_template(toolchain.OPENMPI, {}, mpi_version='5', oversubscribe=True) + self.assertTrue('%(oversubscribe)s' in mpi_cmd_tmpl) + self.assertEqual(params['oversubscribe'], 'PRTE_MCA_rmaps_default_mapping_policy=ppr:4:package:oversubscribe') + + # With OpenMPI >= 5 and pre-existing PRTE_MCA_rmaps_default_mapping_policy (add to unit) + os.environ['PRTE_MCA_rmaps_default_mapping_policy'] = 'package' + mpi_cmd_tmpl, params = get_mpi_cmd_template(toolchain.OPENMPI, {}, mpi_version='5', oversubscribe=True) + self.assertTrue('%(oversubscribe)s' in mpi_cmd_tmpl) + self.assertEqual(params['oversubscribe'], 'PRTE_MCA_rmaps_default_mapping_policy=package:oversubscribe') + + # With OpenMPI >= 5 and pre-existing PRTE_MCA_rmaps_default_mapping_policy (add to unit) + os.environ['PRTE_MCA_rmaps_default_mapping_policy'] = 'core:oversubscribe' + mpi_cmd_tmpl, params = get_mpi_cmd_template(toolchain.OPENMPI, {}, mpi_version='5', oversubscribe=True) + self.assertTrue('%(oversubscribe)s' in mpi_cmd_tmpl) + self.assertEqual(params['oversubscribe'], 'PRTE_MCA_rmaps_default_mapping_policy=core:oversubscribe') + + # With IntelMPI and MPICH + for mpi_fam in [toolchain.INTELMPI, toolchain.MPICH, toolchain.MPICH2, toolchain.MVAPICH2]: + mpi_cmd_tmpl, params = get_mpi_cmd_template(mpi_fam, input_params, mpi_version='1.0', oversubscribe=True) + self.assertTrue('%(oversubscribe)s' in mpi_cmd_tmpl) + self.assertEqual(params['oversubscribe'], '') + def test_prepare_deps(self): """Test preparing for a toolchain when dependencies are involved.""" tc = self.get_toolchain('GCC', version='6.4.0-2.28') From cf71f6856207f6770bf82a3ee1180400de7d2a9d Mon Sep 17 00:00:00 2001 From: crivella Date: Mon, 8 Dec 2025 11:44:27 +0100 Subject: [PATCH 4/5] lint --- test/framework/toolchain.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/framework/toolchain.py b/test/framework/toolchain.py index 378d864fbe..e3b6b29f68 100644 --- a/test/framework/toolchain.py +++ b/test/framework/toolchain.py @@ -1915,9 +1915,9 @@ def test_get_mpi_cmd_template(self): # With IntelMPI and MPICH for mpi_fam in [toolchain.INTELMPI, toolchain.MPICH, toolchain.MPICH2, toolchain.MVAPICH2]: - mpi_cmd_tmpl, params = get_mpi_cmd_template(mpi_fam, input_params, mpi_version='1.0', oversubscribe=True) - self.assertTrue('%(oversubscribe)s' in mpi_cmd_tmpl) - self.assertEqual(params['oversubscribe'], '') + mpi_cmd_tmpl, params = get_mpi_cmd_template(mpi_fam, input_params, mpi_version='1.0', oversubscribe=True) + self.assertTrue('%(oversubscribe)s' in mpi_cmd_tmpl) + self.assertEqual(params['oversubscribe'], '') def test_prepare_deps(self): """Test preparing for a toolchain when dependencies are involved.""" From 0ca54ae34c9057aefd5f766ca7cf48b33c228511 Mon Sep 17 00:00:00 2001 From: crivella Date: Mon, 8 Dec 2025 15:40:22 +0100 Subject: [PATCH 5/5] Avoid CrayCCE not having `MPI_MODULE_NAME` --- easybuild/tools/toolchain/mpi.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/easybuild/tools/toolchain/mpi.py b/easybuild/tools/toolchain/mpi.py index 0e44457133..12bd43bbea 100644 --- a/easybuild/tools/toolchain/mpi.py +++ b/easybuild/tools/toolchain/mpi.py @@ -315,7 +315,10 @@ def mpi_cmd_for(self, cmd, nr_ranks, oversubscribe=False): # this fails when it's done too early (before modules for toolchain/dependencies are loaded), # but it's safe to ignore this - mpi_version = self.get_software_version(self.MPI_MODULE_NAME, required=False)[0] + if self.MPI_MODULE_NAME is None: + mpi_version = None + else: + mpi_version = self.get_software_version(self.MPI_MODULE_NAME, required=False)[0] if mpi_family == toolchain.INTELMPI: if not mpi_version: