From 0846a0e53b764d2d212ccf266fde9178ec10c6a2 Mon Sep 17 00:00:00 2001 From: "Dong H. Ahn" Date: Fri, 13 Jul 2018 16:13:19 -0700 Subject: [PATCH 01/10] config: Add support for IBM Spectrum resource manager (jsrun) --- etc/Makefile.am | 6 +++-- etc/rm_ibm_spectrum.conf | 54 ++++++++++++++++++++++++++++++++++++++++ etc/rm_info.conf | 3 +++ 3 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 etc/rm_ibm_spectrum.conf diff --git a/etc/Makefile.am b/etc/Makefile.am index a44835a..ec334b0 100644 --- a/etc/Makefile.am +++ b/etc/Makefile.am @@ -47,7 +47,8 @@ etc_SCRIPTS = \ rm_mchecker.conf \ rm_openrte.conf \ rm_slurm.conf \ - rm_mpiexec_hydra.conf + rm_mpiexec_hydra.conf \ + rm_ibm_spectrum.conf EXTRA_DIST = \ rm_info.conf \ @@ -61,5 +62,6 @@ EXTRA_DIST = \ rm_mchecker.conf \ rm_openrte.conf \ rm_slurm.conf \ - rm_mpiexec_hydra.conf + rm_mpiexec_hydra.conf \ + rm_ibm_spectrum.conf diff --git a/etc/rm_ibm_spectrum.conf b/etc/rm_ibm_spectrum.conf new file mode 100644 index 0000000..0829cea --- /dev/null +++ b/etc/rm_ibm_spectrum.conf @@ -0,0 +1,54 @@ +## $Header: $ +## +## rm_ibm_spectrum.conf +## +##-------------------------------------------------------------------------------- +## Copyright (c) 2008, Lawrence Livermore National Security, LLC. Produced at +## the Lawrence Livermore National Laboratory. Written by Dong H. Ahn . +## LLNL-CODE-409469. All rights reserved. +## +## This file is part of LaunchMON. For details, see +## https://computing.llnl.gov/?set=resources&page=os_projects +## +## Please also read LICENSE -- Our Notice and GNU Lesser General Public License. +## +## +## This program is free software; you can redistribute it and/or modify it under the +## terms of the GNU General Public License (as published by the Free Software +## Foundation) version 2.1 dated February 1999. 
+## +## This program is distributed in the hope that it will be useful, but WITHOUT ANY +## WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the terms and conditions of the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU Lesser General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., 59 Temple +## Place, Suite 330, Boston, MA 02111-1307 USA +##-------------------------------------------------------------------------------- +## +## Update Log: +## Apr 17 2018 DHA: Created file. +## +## +## RM: the name of Resource Manager +## RM_launcher: the name of the launcher command +## RM_launcher_id: the rule to get the launcher id +## (e.g., RM_launcher|sym|srun says the launcher is identified by testing +## RM_launcher's symbol by the name of srun) +## RM_jobid: the rule to get the target jobid +## (e.g., RM_jobid=RM_launcher|sym|totalview_jobid|string says +## jobid can be obtained from the launcher's symbol, totalview_jobid, +## interpreting that as the string type.) +## RM_launch_helper= method or command to launch daemons +## RM_launch_str= options and arguments used for RM_launch_mth. +## + +RM=spectrum +RM_MPIR=STD_COLOC_FIFO +RM_launcher=jsrun +RM_launcher_id=RM_launcher|sym|jsm_env_get_jsrun_port +RM_launch_helper=mpir +RM_signal_for_kill=SIGINT +RM_fail_detection=false +RM_launch_str=%o --lmonsharedsec=%s --lmonsecchk=%c diff --git a/etc/rm_info.conf b/etc/rm_info.conf index 2fb4e7d..670a6c3 100644 --- a/etc/rm_info.conf +++ b/etc/rm_info.conf @@ -73,3 +73,6 @@ rm_bgq_slurm.conf rm_mchecker.conf rm_gupc.conf rm_openrte.conf + +[linux-powerle] +rm_ibm_spectrum.conf From 5758b462700379c94512cfd47da2a948c8cd0d51 Mon Sep 17 00:00:00 2001 From: "Dong H. 
Ahn" Date: Fri, 13 Jul 2018 17:10:28 -0700 Subject: [PATCH 02/10] config: Add IBM Spectrum detection logic into --with-test-rm --- config/x_ac_testnnodes.m4 | 42 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/config/x_ac_testnnodes.m4 b/config/x_ac_testnnodes.m4 index 39be81d..a3d2cb3 100644 --- a/config/x_ac_testnnodes.m4 +++ b/config/x_ac_testnnodes.m4 @@ -79,9 +79,9 @@ AC_DEFUN([X_AC_NCORE_SMP], [ AC_DEFUN([X_AC_TEST_RM], [ - AC_MSG_CHECKING([resource manager to test @<:@slurm bgqrm alps orte mpiexec_hydra@:>@]) + AC_MSG_CHECKING([resource manager to test @<:@slurm bgqrm alps orte mpiexec_hydra ibm_spectrum@:>@]) AC_ARG_WITH([test-rm], - AS_HELP_STRING(--with-test-rm@<:@=RM@:>@,specify a resource manager type to test @<:@slurm bgqrm alps orte mpiexec_hydra@:>@ @<:@default=slurm on linux-x86 and linux-x86_64; alps on Cray; bgqrm on linux-power64@:>@), + AS_HELP_STRING(--with-test-rm@<:@=RM@:>@,specify a resource manager type to test @<:@slurm bgqrm alps orte mpiexec_hydra ibm_spectrum@:>@ @<:@default=slurm on linux-x86 and linux-x86_64; alps on Cray; bgqrm on linux-power64; ibm_spectrum on linux-power64le@:>@), [with_rm=$withval], [with_rm="check"]) @@ -182,6 +182,44 @@ AC_DEFUN([X_AC_TEST_RM], [ # AC_MSG_RESULT($with_rm:$rm_found) + elif test "x$with_rm" = "xibm_spectrum" ; then + # + # Configure for IBM Spectrum (jsrun) + # + if test "x$with_launcher" != "xcheck"; then + # + # launcher path given + # + if test ! -z "$with_launcher" -a -f "$with_launcher"; then + pth=`$srcdir/config/ap $with_launcher` + ac_job_launcher_path=$pth + rm_found="yes" + AC_SUBST(TARGET_JOB_LAUNCHER_PATH,$ac_job_launcher_path) + AC_SUBST(RM_TYPE, RC_ibm_spectrum) + fi + else + rm_default_dirs="/opt/ibm/spectrum_mpi/jsm_pmix/bin/stock /usr/bin /usr/local/bin" + for rm_dir in $rm_default_dirs; do + if test ! -z "$rm_dir" -a ! -d "$rm_dir" ; then + continue; + fi + + if test ! 
-z "$rm_dir/jsrun" -a -f "$rm_dir/jsrun"; then + pth=`$srcdir/config/ap $rm_dir/jsrun` + ac_job_launcher_path=$pth + rm_found="yes" + AC_SUBST(TARGET_JOB_LAUNCHER_PATH,$ac_job_launcher_path) + AC_SUBST(RM_TYPE, RC_ibm_spectrum) + break + fi + done + fi + + # + # This answers whether RM given and found + # + AC_MSG_RESULT($with_rm:$rm_found) + elif test "x$with_rm" = "xalps" ; then # # Configure for Cray ALPS RM From 572f45ddc9fbec90004be2d08b1d37e23482da39 Mon Sep 17 00:00:00 2001 From: "Dong H. Ahn" Date: Fri, 13 Jul 2018 17:12:21 -0700 Subject: [PATCH 03/10] config: Add POWER Little Endian platform support --- config/x_ac_platform.m4 | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/config/x_ac_platform.m4 b/config/x_ac_platform.m4 index a26bd4a..fbb7922 100644 --- a/config/x_ac_platform.m4 +++ b/config/x_ac_platform.m4 @@ -28,7 +28,8 @@ # -------------------------------------------------------------------------------- # # Update Log: -# May 02 2018 KMD: Added aarch64 support +# Jul 13 2018 DHA: Add powerle support. +# May 02 2018 KMD: Added aarch64 support. # Apr 01 2015 ADG: Added Cray CTI support. # Feb 20 2015 andrewg@cray.com: Fixes for Cray systems. # Jun 11 2008 DHA: File created. @@ -79,6 +80,12 @@ AC_DEFUN([X_AC_PLATFORM], [ AC_SUBST(LNCHR_BIT_FLAGS, -m32) AC_DEFINE(BIT64, 1, [64bit]) ;; + *powerpc64le*)AC_DEFINE(POWERLE_ARCHITECTURE,1,[Define 1 for POWERLEC_ARCHITECTURE]) + ac_have_known_isa="yes" + ac_target_isa="powerle" + AC_SUBST(LNCHR_BIT_FLAGS, -m64) + AC_DEFINE(BIT64, 1, [64bit]) + ;; *powerpc*)AC_DEFINE(PPC_ARCHITECTURE,1,[Define 1 for PPC_ARCHITECTURE]) ac_have_known_isa="yes" ac_target_isa="power" From c32febe2fa507d7ecb8922e95a0fc0355d82d23b Mon Sep 17 00:00:00 2001 From: "Dong H. 
Ahn" Date: Fri, 13 Jul 2018 18:23:14 -0700 Subject: [PATCH 04/10] launchmon: Add IBM Spectrum and PowerLE support Adjust for OpenPower ABI function call convention change: The ABI adds two different entry points, one for intramodule call and the other intermodule call (being done through TOC). Port for IBM Spectrum jsrun. Add new spectrum definition and also adjust back end daemon code as to how it should synchronize itself with Spectrum MPI target. Adjust how to handle new thread creation. When a new thread is created, we will get the SIGTRAP notification via waitpid on the parent thread that spawns a new thread. But on a recent Linux kernel, we have to check the high order bits of waitpid returned status as: right-shift status by 8 bits and then to see if it is equal to (SIGTRAP | (LINUX_TRACER_EVENT_CLONE << 8)). Add a workaround for jsrun's broken MPIR_attach_fifo. It is expecting ASCII 1 to be sent to the FIFO when it should expect a numeric 1. Need to drop it when IBM ultimately fixes the issue. (Bug filed to IBM). 
--- .../src/linux/lmon_api/lmon_be_sync_mpi.cxx | 6 + launchmon/src/linux/main.cxx | 2 +- .../src/linux/sdbg_linux_driver_impl.hxx | 4 +- launchmon/src/linux/sdbg_linux_launchmon.cxx | 60 +++++---- launchmon/src/linux/sdbg_linux_mach.cxx | 2 +- launchmon/src/linux/sdbg_linux_mach.hxx | 2 +- launchmon/src/linux/sdbg_linux_std.hxx | 2 +- launchmon/src/linux/sdbg_linux_symtab.hxx | 2 + .../src/linux/sdbg_linux_symtab_impl.hxx | 46 +++++++ launchmon/src/linux/sdbg_proc_service.cxx | 4 +- launchmon/src/lmon_api/lmon_api_std.h | 1 + launchmon/src/sdbg_base_mach.hxx | 6 +- launchmon/src/sdbg_base_symtab.hxx | 20 ++- launchmon/src/sdbg_base_symtab_impl.hxx | 53 +++++++- launchmon/src/sdbg_event_manager_impl.hxx | 115 +++++++----------- launchmon/src/sdbg_opt.cxx | 3 +- launchmon/src/sdbg_rm_map.cxx | 2 + 17 files changed, 216 insertions(+), 114 deletions(-) diff --git a/launchmon/src/linux/lmon_api/lmon_be_sync_mpi.cxx b/launchmon/src/linux/lmon_api/lmon_be_sync_mpi.cxx index e15bc6f..5c8a131 100644 --- a/launchmon/src/linux/lmon_api/lmon_be_sync_mpi.cxx +++ b/launchmon/src/linux/lmon_api/lmon_be_sync_mpi.cxx @@ -116,6 +116,7 @@ lmon_rc_e LMON_be_procctl_init(rm_catalogue_e rmtype, MPIR_PROCDESC_EXT *ptab, case RC_cray: case RC_gupc: case RC_mpiexec_hydra: + case RC_ibm_spectrum: // // Call generic Linux init // @@ -170,6 +171,7 @@ lmon_rc_e LMON_be_procctl_stop(rm_catalogue_e rmtype, MPIR_PROCDESC_EXT *ptab, case RC_cray: case RC_gupc: case RC_mpiexec_hydra: + case RC_ibm_spectrum: // // Call generic Linux stop // @@ -230,6 +232,7 @@ lmon_rc_e LMON_be_procctl_run(rm_catalogue_e rmtype, int signum, case RC_cray: case RC_gupc: case RC_mpiexec_hydra: + case RC_ibm_spectrum: // // Call generic Linux run // @@ -292,6 +295,7 @@ lmon_rc_e LMON_be_procctl_initdone(rm_catalogue_e rmtype, case RC_cray: case RC_gupc: case RC_mpiexec_hydra: + case RC_ibm_spectrum: // // Call generic Linux initdone // @@ -346,6 +350,7 @@ lmon_rc_e LMON_be_procctl_done(rm_catalogue_e rmtype, 
MPIR_PROCDESC_EXT *ptab, case RC_cray: case RC_gupc: case RC_mpiexec_hydra: + case RC_ibm_spectrum: // // You need to do nothing for these resource managers // @@ -402,6 +407,7 @@ lmon_rc_e LMON_be_procctl_perf(rm_catalogue_e rmtype, MPIR_PROCDESC_EXT *ptab, case RC_cray: case RC_gupc: case RC_mpiexec_hydra: + case RC_ibm_spectrum: // // You need to do nothing for these resource managers // diff --git a/launchmon/src/linux/main.cxx b/launchmon/src/linux/main.cxx index e0e6cba..67bb23d 100644 --- a/launchmon/src/linux/main.cxx +++ b/launchmon/src/linux/main.cxx @@ -54,7 +54,7 @@ int main(int argc, char* argv[]) { try { int rc = EXIT_FAILURE; #if X86_ARCHITECTURE || X86_64_ARCHITECTURE || PPC_ARCHITECTURE || \ - AARCH64_ARCHITECTURE + AARCH64_ARCHITECTURE || POWERLE_ARCHITECTURE // // driver instantiation for the linux platform. // diff --git a/launchmon/src/linux/sdbg_linux_driver_impl.hxx b/launchmon/src/linux/sdbg_linux_driver_impl.hxx index 82c9f08..e593733 100644 --- a/launchmon/src/linux/sdbg_linux_driver_impl.hxx +++ b/launchmon/src/linux/sdbg_linux_driver_impl.hxx @@ -91,7 +91,7 @@ linux_driver_t::create_process(pid_t pid, #if X86_ARCHITECTURE || X86_64_ARCHITECTURE return_proc = new linux_x86_process_t(pid, mi, md, mt, mc); -#elif PPC_ARCHITECTURE +#elif PPC_ARCHITECTURE || POWERLE_ARCHITECTURE return_proc = new linux_ppc_process_t(pid, mi, md, mt, mc); #elif IA64_ARCHITECTURE return_proc = new linux_ia64_process_t(pid, mi, md, mt, mc); @@ -122,7 +122,7 @@ linux_driver_t::create_process(pid_t pid, // #if X86_ARCHITECTURE || X86_64_ARCHITECTURE return_proc = new linux_x86_process_t(pid, mi); -#elif PPC_ARCHITECTURE +#elif PPC_ARCHITECTURE || POWERLE_ARCHITECTURE return_proc = new linux_ppc_process_t(pid, mi); #elif IA64_ARCHITECTURE return_proc = new linux_ia64_process_t(pid, mi); diff --git a/launchmon/src/linux/sdbg_linux_launchmon.cxx b/launchmon/src/linux/sdbg_linux_launchmon.cxx index ccb1ed8..225b3ed 100644 --- 
a/launchmon/src/linux/sdbg_linux_launchmon.cxx +++ b/launchmon/src/linux/sdbg_linux_launchmon.cxx @@ -1055,7 +1055,7 @@ bool linux_launchmon_t::handle_mpir_variables( p.set_launch_hidden_bp(NULL); } la_bp = new linux_breakpoint_t(); - la_bp->set_address_at(launch_bp_sym.get_relocated_address()); + la_bp->set_address_at(launch_bp_sym.get_relocated_lowest_address()); #if PPC_ARCHITECTURE // @@ -1063,8 +1063,8 @@ bool linux_launchmon_t::handle_mpir_variables( // PowerPC Linux has begun to change the linking convention // such that binaries no longer export direct function // symbols. (e.g., .MPIR_Breakpoint). But rather, undotted - // global data symbols (e.g., MPIR_Breakpoint) contains the - // address for the corresponding function. + // global data symbols (e.g., MPIR_Breakpoint) is the function + // descriptor // // Added indirect breakpoint support for that and use this // method on all PPC systems across the board including @@ -1487,16 +1487,15 @@ launchmon_event_e linux_launchmon_t::decipher_an_event( // // Parent gets SIGTRAP when a new thread is created // Used to be: return_ev = LM_STOP_NOT_INTERESTED; - int upper16; - upper16 = event.get_rawstatus() >> 16; - if (upper16 == LINUX_TRACER_EVENT_CLONE) { + int high = event.get_rawstatus() >> 8; + if (high == (SIGTRAP | (LINUX_TRACER_EVENT_CLONE << 8))) { return_ev = LM_STOP_AT_THREAD_CREATION; - } else { - // - // SIGTRAP due to fork for example - // - return_ev = LM_STOP_NOT_INTERESTED; - } + } else { + // + // SIGTRAP due to fork for example + // + return_ev = LM_STOP_NOT_INTERESTED; + } } else if (event.get_signum() == SIGSTOP) { return_ev = LM_RELAY_SIGNAL; @@ -1579,12 +1578,13 @@ launchmon_rc_e linux_launchmon_t::handle_trap_after_attach_event( #if MEASURE_TRACING_COST beginTS = gettimeofdayD(); +#endif + { - self_trace_t::trace(true, // print always + self_trace_t::trace(LEVELCHK(level2), MODULENAME, 0, "The RM process has just been trapped due to attach"); } -#endif bool use_cxt = true; image_base_t 
*dynloader_im = NULL; @@ -1650,7 +1650,7 @@ launchmon_rc_e linux_launchmon_t::handle_trap_after_attach_event( dynloader_im->get_a_symbol(p.get_loader_breakpoint_sym()); lo_bp = new linux_breakpoint_t(); - addr_dl_bp = dynload_sym.get_relocated_address(); + addr_dl_bp = dynload_sym.get_relocated_lowest_address(); lo_bp->set_address_at(addr_dl_bp); #if PPC_ARCHITECTURE lo_bp->set_use_indirection(); @@ -1683,7 +1683,14 @@ launchmon_rc_e linux_launchmon_t::handle_trap_after_attach_event( get_tracer()->tracer_continue(p, use_cxt); int fifofd = 0; if ((fifofd = open(fifopathbuf, O_WRONLY)) >= 0) { +#if POWERLE_ARCHITECTURE + char wakeup = '1'; + self_trace_t::trace( + true, MODULENAME, 0, + "Warning: Sending ASCII 1 to FIFO to work around a jsrun bug"); +#else char wakeup = (char)1; +#endif if (lmon_write_raw(fifofd, &wakeup, 1) != 1) { self_trace_t::trace( true, MODULENAME, 0, @@ -1782,9 +1789,9 @@ launchmon_rc_e linux_launchmon_t::handle_trap_after_exec_event( #endif { - self_trace_t::trace(true, // print always - MODULENAME, 0, - "The RM process has just been forked and exec'ed."); + self_trace_t::trace(LEVELCHK(level2), + MODULENAME, 0, "The RM process (%d) has " + "just been forked and exec'ed.", p.get_pid (true)); } bool use_cxt = true; @@ -1831,7 +1838,7 @@ launchmon_rc_e linux_launchmon_t::handle_trap_after_exec_event( // // Corner case; we deal with a heuristics // -#if PPC_ARCHITECTURE +#if PPC_ARCHITECTURE || POWERLE_ARCHITECTURE // // DHA Mar 05 2009 // There're systems that do not directly @@ -1859,7 +1866,7 @@ launchmon_rc_e linux_launchmon_t::handle_trap_after_exec_event( #else = p.get_gprset(use_cxt)->get_pc() & 0xffff0000; #endif -#else /* PPC_ARCHITECTURE */ +#else /* PPC_ARCHITECTURE || POWERLE_ARCHITECTURE */ // // This requires the actual page size to compute this loader load // address implictly. Just using the following bits for now. 
@@ -1916,13 +1923,13 @@ launchmon_rc_e linux_launchmon_t::handle_trap_after_exec_event( countHandler++; // accum and countHandler now contain the cost of this handler which // is invoked just once per job -#endif - { self_trace_t::trace( true, // print always MODULENAME, 0, "Just continued the RM process out of the first trap"); } +#endif + set_last_seen(gettimeofdayD()); return LAUNCHMON_OK; @@ -2653,12 +2660,19 @@ launchmon_rc_e linux_launchmon_t::handle_thrcreate_request( memset(&tinfo, '\0', sizeof(tinfo)); tinfo.ti_lid = (lwpid_t)newlwpid; + { + self_trace_t::trace(LEVELCHK(level2), MODULENAME,0, + "thread creation request event handler " + "invoked for thread (%d)", newlwpid); + } + + if (p.get_thrlist().find(tinfo.ti_lid) == p.get_thrlist().end()) { // this thread has not been seen #if X86_ARCHITECTURE || X86_64_ARCHITECTURE thread_base_t *thrinfo = new linux_x86_thread_t(); -#elif PPC_ARCHITECTURE +#elif PPC_ARCHITECTURE || POWERLE_ARCHITECTURE thread_base_t *thrinfo = new linux_ppc_thread_t(); #elif AARCH64_ARCHITECTURE diff --git a/launchmon/src/linux/sdbg_linux_mach.cxx b/launchmon/src/linux/sdbg_linux_mach.cxx index 5061acf..1c30a12 100644 --- a/launchmon/src/linux/sdbg_linux_mach.cxx +++ b/launchmon/src/linux/sdbg_linux_mach.cxx @@ -520,7 +520,7 @@ bool linux_x86_process_t::basic_init(const std::string& mi, return true; } -#elif PPC_ARCHITECTURE +#elif PPC_ARCHITECTURE || POWERLE_ARCHITECTURE //////////////////////////////////////////////////////////////////// // diff --git a/launchmon/src/linux/sdbg_linux_mach.hxx b/launchmon/src/linux/sdbg_linux_mach.hxx index 987efb5..afb2413 100644 --- a/launchmon/src/linux/sdbg_linux_mach.hxx +++ b/launchmon/src/linux/sdbg_linux_mach.hxx @@ -516,7 +516,7 @@ struct ps_prochandle { process_base_t* p; }; -#elif PPC_ARCHITECTURE +#elif PPC_ARCHITECTURE || POWERLE_ARCHITECTURE //! linux_ppc_gpr_set_t: /*! 
diff --git a/launchmon/src/linux/sdbg_linux_std.hxx b/launchmon/src/linux/sdbg_linux_std.hxx index 0fa7340..91e7af2 100644 --- a/launchmon/src/linux/sdbg_linux_std.hxx +++ b/launchmon/src/linux/sdbg_linux_std.hxx @@ -356,7 +356,7 @@ const T_VA T_UNINIT_HEX = 0xdeadbeef; #define SDBG_LINUX_DFLT_INSTANTIATION \ T_VA, T_WT, T_IT, T_GRS, T_FRS, my_thrinfo_t, elf_wrapper -#elif PPC_ARCHITECTURE +#elif PPC_ARCHITECTURE || POWERLE_ARCHITECTURE // // diff --git a/launchmon/src/linux/sdbg_linux_symtab.hxx b/launchmon/src/linux/sdbg_linux_symtab.hxx index b35fc9b..3a4172f 100644 --- a/launchmon/src/linux/sdbg_linux_symtab.hxx +++ b/launchmon/src/linux/sdbg_linux_symtab.hxx @@ -203,6 +203,8 @@ class linux_image_t : public image_base_t { level); } + VA get_local_entry_point (const unsigned char o); + // For self tracing // std::string MODULENAME; diff --git a/launchmon/src/linux/sdbg_linux_symtab_impl.hxx b/launchmon/src/linux/sdbg_linux_symtab_impl.hxx index 5eb7321..c7e864d 100644 --- a/launchmon/src/linux/sdbg_linux_symtab_impl.hxx +++ b/launchmon/src/linux/sdbg_linux_symtab_impl.hxx @@ -28,6 +28,7 @@ * * * Update Log: + * May 19 2018 DHA: Added OpenPower ABI's dual entry points. * Oct 27 2010 DHA: Added is_defined, is_globally_visible, * is_locally_visible virtual methods. * Dec 20 2009 DHA: Fixed a bug that arose when Mark's patch @@ -482,6 +483,10 @@ linux_image_t::read_linkage_symbols() throw( decode_type(first_sym->st_info, tmp); a_linksym->set_type(tmp); a_linksym->set_defined((first_sym->st_shndx != SHN_UNDEF) ? true : false); + a_linksym->set_info(first_sym->st_info); + a_linksym->set_other(first_sym->st_other); + a_linksym->set_local_entry_offset( + get_local_entry_point(first_sym->st_other)); string keystr(symname); @@ -864,6 +869,47 @@ void linux_image_t::decode_visibility( } } + +//! PRIVATE: linux_image_t::get_local_entry_point -- +/*! 
+ Calculate the entry point used by an intramodule function call +*/ +template +VA +linux_image_t::get_local_entry_point (const unsigned char o) +{ + VA rc = 0; +#if POWERLE_ARCHITECTURE + /* The "OpenPOWER ABI for Linux Supplement, Power Architecture 64-Bit ELF V2 + * ABI, Advance": + * "The OpenPOWER ABI uses the three most-significant bits + * in the symbol st_other field to specify the number of instructions between a + * function's global entry point and local entry point. The global entry point + * is used when it is necessary to set up the TOC pointer (r2) for the + * function. The local entry point is used when r2 is known to already be valid + * for the function. A value of zero in these bits asserts that the function + * does not use r2." + */ + const int code = (o >> 5) & 0x7; + switch (code) + { + case 2: /* 1 instruction */ + case 3: /* 2 instructions */ + case 4: /* 4 instructions */ + case 5: /* 8 instructions */ + case 6: /* 16 instructions */ + rc = (1 << (code - 2)) * 4; /* TODO: augment template param to includ IT */ + break; + case 0: /* local == global */ + case 1: /* local == global */ + case 7: /* Reserved */ + break; + } /* switch */ +#endif + return rc; +} + + template void linux_image_t::set_image_base_address(VA ba) { image_base_t::set_image_base_address(ba); diff --git a/launchmon/src/linux/sdbg_proc_service.cxx b/launchmon/src/linux/sdbg_proc_service.cxx index d9f9966..c115b4e 100644 --- a/launchmon/src/linux/sdbg_proc_service.cxx +++ b/launchmon/src/linux/sdbg_proc_service.cxx @@ -62,7 +62,7 @@ extern "C" { #include } -#if X86_ARCHITECTURE || PPC_ARCHITECTURE +#if X86_ARCHITECTURE || PPC_ARCHITECTURE || POWERLE_ARCHITECTURE #ifndef PTRACE_GET_THREAD_AREA #define PTRACE_GET_THREAD_AREA 25 @@ -243,7 +243,7 @@ extern "C" ps_err_e ps_get_thread_area(const struct ps_prochandle *ph, lwpid_t lpid, int x, psaddr_t *addr) { bool use_cxt = true; -#if X86_ARCHITECTURE || PPC_ARCHITECTURE +#if X86_ARCHITECTURE || PPC_ARCHITECTURE || 
POWERLE_ARCHITECTURE /* * How to fetch thread-specific area for x86/linux and powerPC/linux * diff --git a/launchmon/src/lmon_api/lmon_api_std.h b/launchmon/src/lmon_api/lmon_api_std.h index 9cdb96e..2473033 100644 --- a/launchmon/src/lmon_api/lmon_api_std.h +++ b/launchmon/src/lmon_api/lmon_api_std.h @@ -119,6 +119,7 @@ typedef enum _rm_catalogue_e RC_orte, RC_mpiexec_hydra, RC_gupc, + RC_ibm_spectrum, RC_none /* new RMs should be added here as LaunchMON is ported diff --git a/launchmon/src/sdbg_base_mach.hxx b/launchmon/src/sdbg_base_mach.hxx index 040b8fa..8f6f323 100644 --- a/launchmon/src/sdbg_base_mach.hxx +++ b/launchmon/src/sdbg_base_mach.hxx @@ -27,6 +27,7 @@ *-------------------------------------------------------------------------------- * * Update Log: + * Jul 13 2018 DHA: Remove event_entity support * May 02 2018 ADG: Added aarch64 support * Sep 02 2010 DHA: Added MPIR_attach_fifo support * May 08 2008 DHA: Added an alias (is_master_thread) @@ -157,8 +158,6 @@ enum debug_event_e { EV_INVALID }; -enum eventing_entity_e { EV_ENTITY_THREAD, EV_ENTITY_PROCESS, EV_ENTITY_NONE }; - class debug_event_t { public: debug_event_t() { @@ -167,13 +166,11 @@ class debug_event_t { } ~debug_event_t() {} void set_ev(const enum debug_event_e e) { ev = e; } - void set_en(const enum eventing_entity_e t) { en = t; } void set_signum(const int s) { u.signum = s; } void set_exitcode(const int ec) { u.exitcode = ec; } void set_rawstatus(const int st) { rawstatus = st; } void set_id(const int i) { id = i; } const debug_event_e get_ev() const { return ev; } - const eventing_entity_e get_en() const { return en; } const int get_signum() const { return u.signum; } const int get_exitcode() const { return u.exitcode; } const int get_rawstatus() const { return rawstatus; } @@ -181,7 +178,6 @@ class debug_event_t { private: debug_event_e ev; - eventing_entity_e en; union { int signum; int exitcode; diff --git a/launchmon/src/sdbg_base_symtab.hxx b/launchmon/src/sdbg_base_symtab.hxx 
index 96cfc32..262e6e2 100644 --- a/launchmon/src/sdbg_base_symtab.hxx +++ b/launchmon/src/sdbg_base_symtab.hxx @@ -27,6 +27,7 @@ *-------------------------------------------------------------------------------- * * Update Log: + * May 19 2018 DHA: Added OpenPower ABI's dual entry points. * Oct 27 2010 DHA: Added is_defined, is_globally_visible, * is_locally_visible virtual methods. * Feb 09 2008 DHA: Added LLNS Copyright @@ -112,7 +113,10 @@ class symbol_base_t { symbol_base_t(const std::string &n, const std::string &bln, const VA rd = SYMTAB_UNINIT_ADDR, - const VA rla = SYMTAB_UNINIT_ADDR); + const VA rla = SYMTAB_UNINIT_ADDR, + const VA lo=SYMTAB_UNINIT_ADDR, + const char i='\0', + const char o='\0'); symbol_base_t(const symbol_base_t &sobj); @@ -125,10 +129,17 @@ class symbol_base_t { void set_base_lib_name(const std::string &bln); void set_raw_address(const VA &ra); void set_relocated_address(const VA &ra); + void set_local_entry_offset (const VA &ra); + void set_other (const char o); + void set_info (const char i); const std::string &get_name() const; const std::string &get_base_lib_name() const; - const VA &get_raw_address() const; - const VA &get_relocated_address() const; + const VA get_raw_address() const; + const VA get_relocated_address() const; + const VA get_local_entry_offset() const; + const VA get_relocated_lowest_address() const; + const char get_other() const; + const char get_info() const; virtual bool is_defined() const { return false; } virtual bool is_globally_visible() const { return false; } @@ -146,6 +157,9 @@ class symbol_base_t { std::string base_lib_name; VA raw_address; VA relocated_address; + VA local_entry_offset; // Support arch like OpenPower with dual entry points + unsigned char info; + unsigned char other; }; //! 
ltstr diff --git a/launchmon/src/sdbg_base_symtab_impl.hxx b/launchmon/src/sdbg_base_symtab_impl.hxx index ae9afb6..b08cd3b 100644 --- a/launchmon/src/sdbg_base_symtab_impl.hxx +++ b/launchmon/src/sdbg_base_symtab_impl.hxx @@ -27,6 +27,7 @@ *-------------------------------------------------------------------------------- * * Update Log: + * May 19 2018 DHA: Added dual entry points for IBM OpenPower ABI * Feb 09 2008 DHA: Added LLNS Copyright * Jan 10 2006 DHA: Created file. */ @@ -79,11 +80,17 @@ template symbol_base_t::symbol_base_t(const std::string &n, const std::string &bln, const VA rd, - const VA rla) { + const VA rla, + const VA lo, + const char i, + const char o) { name = n; base_lib_name = bln; raw_address = rd; relocated_address = rla; + local_entry_offset = lo; + info = i; + other = o; } //! symbol_base_t<> destructor @@ -122,13 +129,33 @@ const std::string &symbol_base_t::get_base_lib_name() return base_lib_name; } +template +const char symbol_base_t::get_other() const { + return other; +} + +template +void symbol_base_t::set_other(const char o) { + other = o; +} + +template +const char symbol_base_t::get_info() const { + return info; +} + +template +void symbol_base_t::set_info(const char i) { + info = i; +} + template void symbol_base_t::set_raw_address(const VA &ra) { raw_address = ra; } template -const VA &symbol_base_t::get_raw_address() const { +const VA symbol_base_t::get_raw_address() const { return raw_address; } @@ -139,10 +166,30 @@ void symbol_base_t::set_relocated_address( } template -const VA &symbol_base_t::get_relocated_address() const { +const VA symbol_base_t::get_relocated_address() const { return relocated_address; } +template +void symbol_base_t::set_local_entry_offset ( + const VA &lo) { + local_entry_offset = lo; +} + +template +const VA symbol_base_t::get_local_entry_offset () const { + return local_entry_offset; +} + +template +const VA symbol_base_t::get_relocated_lowest_address () + const { + return (local_entry_offset == 
SYMTAB_UNINIT_ADDR) + ? relocated_address + : relocated_address + local_entry_offset; +} + + //////////////////////////////////////////////////////////////////// // // PUBLIC INTERFACES (class image_base_t<>) diff --git a/launchmon/src/sdbg_event_manager_impl.hxx b/launchmon/src/sdbg_event_manager_impl.hxx index 59cd219..589bc20 100644 --- a/launchmon/src/sdbg_event_manager_impl.hxx +++ b/launchmon/src/sdbg_event_manager_impl.hxx @@ -112,23 +112,13 @@ bool monitor_proc_thread_t::wait_for_all( pid_t rpid; int status; bool rs = false; - eventing_entity_e entity = EV_ENTITY_THREAD; - rpid = waitpid(-1, &status, WNOHANG | WUNTRACED); - if (rpid <= 0) { - rpid = waitpid(-1, &status, WNOHANG | __WCLONE); - } else { - entity = EV_ENTITY_PROCESS; - } - - if (rpid <= 0) { + if ((rpid = waitpid(-1, &status, WNOHANG | WUNTRACED | __WCLONE)) <= 0) { rc.set_ev(EV_NOCHILD); - rc.set_en(EV_ENTITY_NONE); rc.set_id(rpid); return rs; } - rc.set_en(entity); rc.set_id(rpid); if (WIFEXITED(status)) { @@ -202,69 +192,52 @@ bool event_manager_t::poll_processes( launchmon_rc_e rc = LAUNCHMON_OK; launchmon_event_e ev; - if (ev_monitor->wait_for_all(event)) { - if (event.get_en() == EV_ENTITY_PROCESS) { - // - // A process event is reported - // - if (event.get_id() == p.get_pid(false)) { - // - // The target RM_process reported - // - p.make_context(event.get_id()); - ev = lm.decipher_an_event(p, event); - rc = lm.invoke_handler(p, ev, event.get_signum()); - p.check_and_undo_context(event.get_id()); - } else if (event.get_id() == lm.get_toollauncherpid()) { - // RM_process that launched tool daemons reported - // error handling semantics for C.2 - // - if ((event.get_ev() == EV_EXITED) || - (event.get_ev() == EV_TERMINATED)) { - // - // this means that back-end daemons have exited - // Enforcing C.2 error handling semantics. - // - rc = lm.handle_daemon_exit_event(p); - } - // - // in this case rpid won't be part of the thread list - // so that the following loop body won't be executed. 
- // - } else { - // - // a unknown new process reported - // - if (event.get_ev() == EV_STOPPED) { - p.make_context(event.get_id()); - rc = lm.invoke_handler(p, LM_STOP_NEW_FORKED_PROCESS, event.get_id()); - p.check_and_undo_context(event.get_id()); - } - } - } else if (event.get_en() == EV_ENTITY_THREAD) { - // - // A thread of the target RM_process reported + if (!ev_monitor->wait_for_all(event)) + goto done; + + if (event.get_id() == p.get_pid(false)) { + // + // The target RM_process reported + // + p.make_context ( event.get_id()); + ev = lm.decipher_an_event ( p, event ); + rc = lm.invoke_handler ( p, ev, event.get_signum() ); + p.check_and_undo_context ( event.get_id() ); + } else if (event.get_id() == lm.get_toollauncherpid()) { + // RM_process that launched tool daemons reported + // error handling semantics for C.2 + // + if ((event.get_ev() == EV_EXITED) + || (event.get_ev() == EV_TERMINATED)) { + // this means that back-end daemons have exited + // Enforcing C.2 error handling semantics. 
// - map*, ltstr>& tl = - p.get_thrlist(); - - if (tl.find(event.get_id()) == tl.end()) { - // - // Possibly an unknown thread to pick up - // - if (event.get_ev() == EV_STOPPED) { - rc = lm.invoke_handler(p, LM_REQUEST_NEW_THREAD, event.get_id()); - } - } - - if (tl.find(event.get_id()) != tl.end()) { - p.make_context(event.get_id()); - ev = lm.decipher_an_event(p, event); - rc = lm.invoke_handler(p, ev, event.get_signum()); - p.check_and_undo_context(event.get_id()); - } + rc = lm.handle_daemon_exit_event(p); } + } else if ( p.get_thrlist().find (event.get_id()) != p.get_thrlist().end()) { + p.make_context(event.get_id()); + ev = lm.decipher_an_event(p, event); + rc = lm.invoke_handler (p, ev, event.get_signum()); + p.check_and_undo_context(event.get_id()); + } else { + // + // a new process reported -- don't follow + // + if (event.get_ev() == EV_STOPPED) { + p.make_context (event.get_id()); + rc = lm.invoke_handler(p, + LM_STOP_NEW_FORKED_PROCESS, + event.get_id() ); + p.check_and_undo_context (event.get_id()); + } + //p.make_context(event.get_id()); + //ev = lm.decipher_an_event(p, event); + //rc = lm.invoke_handler (p, ev, event.get_signum()); + //lm.handle_thrcreate_trap_event (p); + //p.check_and_undo_context(event.get_id()); } + +done: return ((rc == LAUNCHMON_OK) ? 
true : false); } diff --git a/launchmon/src/sdbg_opt.cxx b/launchmon/src/sdbg_opt.cxx index c7fd2d6..7938bd9 100644 --- a/launchmon/src/sdbg_opt.cxx +++ b/launchmon/src/sdbg_opt.cxx @@ -352,7 +352,8 @@ bool opts_args_t::process_args(int *argc, char ***argv) { // alternative way to set the engine's verbose level // char *l; - if ((l = getenv("LMON_ENGINE_VERBOSE_LEVEL")) != NULL) { + if ((l = getenv("LMON_ENGINE_VERBOSE_LEVEL")) != NULL + || (l = getenv("LMON_VERBOSITY")) != NULL) { int il = atoi(l); self_trace_verbosity verbo; diff --git a/launchmon/src/sdbg_rm_map.cxx b/launchmon/src/sdbg_rm_map.cxx index 3428c58..3daae0a 100644 --- a/launchmon/src/sdbg_rm_map.cxx +++ b/launchmon/src/sdbg_rm_map.cxx @@ -197,6 +197,8 @@ void resource_manager_t::fill_rm_type(const std::string &v) { rm = RC_mpiexec_hydra; } else if (v == std::string("gupc")) { rm = RC_gupc; + } else if (v == std::string("spectrum")) { + rm = RC_ibm_spectrum; } else { rm = RC_none; } From baefbba6b63978df111db7a9aebefd093dc6c4f5 Mon Sep 17 00:00:00 2001 From: "Dong H. 
Ahn" Date: Fri, 13 Jul 2018 23:09:38 -0700 Subject: [PATCH 05/10] test: Add IBM Spectrum support into test programs --- test/src/fe_launch_middleware.cxx | 8 ++++++++ test/src/fe_launch_smoketest.cxx | 12 ++++++++++++ test/src/fe_launch_usrpayload_test.cxx | 8 ++++++++ test/src/test.attach_1.in | 4 ++++ test/src/test.attach_1_mem_fetcher.in | 4 ++++ test/src/test.attach_1_pdebugmax.in | 4 ++++ test/src/test.attach_1_remote.in | 4 ++++ test/src/test.attach_2_uneven.in | 4 ++++ test/src/test.attach_4_detach.in | 4 ++++ test/src/test.attach_4_kill.in | 4 ++++ test/src/test.attach_4_shutdownbe.in | 4 ++++ test/src/test.fe_regStatusCB.in | 3 +++ test/src/test.jobsnap_1.in | 3 +++ test/src/test.launch_7_kill.in | 3 ++- test/src/test.launch_7_shutdownbe.in | 3 ++- 15 files changed, 70 insertions(+), 2 deletions(-) diff --git a/test/src/fe_launch_middleware.cxx b/test/src/fe_launch_middleware.cxx index d03cb25..08afdd3 100644 --- a/test/src/fe_launch_middleware.cxx +++ b/test/src/fe_launch_middleware.cxx @@ -35,6 +35,7 @@ * * * Update Log: + * Jul 18 2018 DHA: Add IBM JSM Spectrum support * Jun 01 2012 DHA: Copied from 0.8-middleware-support branch and merged * with 1.0-BGQ * Aug 03 2020 DHA: Created file. 
@@ -213,6 +214,13 @@ int main(int argc, char *argv[]) { launcher_argv[3] = strdup(argv[1]); launcher_argv[4] = NULL; fprintf(stdout, "[LMON_FE] launching the job/daemons via %s\n", mylauncher); + } else if (rmenv_str == std::string("RC_ibm_spectrum")) { + numprocs_opt = string("-p") + string(argv[2]); + launcher_argv = (char **) malloc (4*sizeof(char*)); + launcher_argv[0] = strdup(mylauncher); + launcher_argv[1] = strdup(numprocs_opt.c_str()); + launcher_argv[2] = strdup(argv[1]); + launcher_argv[3] = NULL; } fprintf(stderr, "[LMON_FE] launching the job/daemons via %s\n", mylauncher); diff --git a/test/src/fe_launch_smoketest.cxx b/test/src/fe_launch_smoketest.cxx index 097f785..c9e6a63 100644 --- a/test/src/fe_launch_smoketest.cxx +++ b/test/src/fe_launch_smoketest.cxx @@ -29,6 +29,7 @@ * ./fe_launch_smoketest.debug /bin/hostname 9 5 pdebug `pwd`/be_kicker.debug * * Update Log: + * Jul 16 2018 DHA: Add IBM JSM Spectrum support. * Oct 25 2011 DHA: Added BGQ support. * Oct 21 2011 DHA: Added dynamic RM support. 
* Nov 12 2009 DHA: Change BG mpirun options to cover /P running under @@ -240,6 +241,17 @@ int main(int argc, char *argv[]) { launcher_argv[3] = strdup(argv[1]); launcher_argv[4] = NULL; fprintf(stdout, "[LMON_FE] launching the job/daemons via %s\n", mylauncher); + } else if (rmenv_str == std::string("RC_ibm_spectrum")) { + numprocs_opt = string("-p") + string(argv[2]); + launcher_argv = (char **) malloc (4*sizeof(char*)); + launcher_argv[0] = strdup(mylauncher); + launcher_argv[1] = strdup(numprocs_opt.c_str()); + launcher_argv[2] = strdup(argv[1]); + launcher_argv[3] = NULL; + } else { + fprintf(stdout, "[LMON FE] Unknown Resource Manger: %s\n", + rmenv_str.c_str()); + return EXIT_FAILURE; } if ((rc = LMON_fe_init(LMON_VERSION)) != LMON_OK) { diff --git a/test/src/fe_launch_usrpayload_test.cxx b/test/src/fe_launch_usrpayload_test.cxx index 9cccec4..ae04f05 100644 --- a/test/src/fe_launch_usrpayload_test.cxx +++ b/test/src/fe_launch_usrpayload_test.cxx @@ -27,6 +27,7 @@ *-------------------------------------------------------------------------------- * * Update Log: + * Jul 16 2018 DHA: Add IBM JSM Spectrum support. 
* Mar 04 2008 DHA: Added generic BlueGene support * Jun 17 2008 DHA: Added BlueGene support * Jun 12 2008 DHA: Added GNU build system support @@ -273,6 +274,13 @@ int main(int argc, char *argv[]) { launcher_argv[3] = strdup(argv[1]); launcher_argv[4] = NULL; fprintf(stdout, "[LMON_FE] launching the job/daemons via %s\n", mylauncher); + } else if (rmenv_str == std::string("RC_ibm_spectrum")) { + numprocs_opt = string("-p") + string(argv[2]); + launcher_argv = (char **) malloc (4*sizeof(char*)); + launcher_argv[0] = strdup(mylauncher); + launcher_argv[1] = strdup(numprocs_opt.c_str()); + launcher_argv[2] = strdup(argv[1]); + launcher_argv[3] = NULL; } if ((rc = LMON_fe_init(LMON_VERSION)) != LMON_OK) { diff --git a/test/src/test.attach_1.in b/test/src/test.attach_1.in index 86dd188..d973760 100644 --- a/test/src/test.attach_1.in +++ b/test/src/test.attach_1.in @@ -30,6 +30,7 @@ # Attach the tool to a running program and unlock the hang. # # Update Log: +# Jul 16 2018 DHA: Add IBM JSM support. # May 04 2016 DHA: Add test in-tree and installed support # Oct 21 2011 DHA: Added dynamic RM detection support. # Dec 17 2009 DHA: Added minimum WAITAMOUNT @@ -84,6 +85,9 @@ elif test "x$RM_TYPE" = "xRC_orte" ; then elif test "x$RM_TYPE" = "xRC_mpiexec_hydra" ; then WAITAMOUNT=`expr $WAITAMOUNT` $MPI_JOB_LAUNCHER_PATH -n $NUMTASKS `pwd`/hang_on_SIGUSR1@EXE@ & +elif test "x$RM_TYPE" = "xRC_ibm_spectrum" ; then + WAITAMOUNT=`expr $WAITAMOUNT` + $MPI_JOB_LAUNCHER_PATH -p $NUMTASKS `pwd`/hang_on_SIGUSR1@EXE@ & else echo "This RM is not supported yet" fi diff --git a/test/src/test.attach_1_mem_fetcher.in b/test/src/test.attach_1_mem_fetcher.in index 1758d06..4e6c4e5 100644 --- a/test/src/test.attach_1_mem_fetcher.in +++ b/test/src/test.attach_1_mem_fetcher.in @@ -30,6 +30,7 @@ # Attach the tool to a running program and unlock the hang. # # Update Log: +# Jul 16 2018 DHA: Add IBM JSM support. 
# May 04 2016 DHA: Add test in-tree and installed support # Oct 21 2011 DHA: Added dynamic RM detection support. # Dec 17 2009 DHA: Added minimum WAITAMOUNT @@ -82,6 +83,9 @@ elif test "x$RM_TYPE" = "xRC_orte" ; then elif test "x$RM_TYPE" = "xRC_mpiexec_hydra" ; then WAITAMOUNT=`expr $WAITAMOUNT` $MPI_JOB_LAUNCHER_PATH -n $NUMTASKS `pwd`/hang_on_SIGUSR1@EXE@ & +elif test "x$RM_TYPE" = "xRC_ibm_spectrum" ; then + WAITAMOUNT=`expr $WAITAMOUNT` + $MPI_JOB_LAUNCHER_PATH -p $NUMTASKS `pwd`/hang_on_SIGUSR1@EXE@ & else echo "This RM is not supported yet" fi diff --git a/test/src/test.attach_1_pdebugmax.in b/test/src/test.attach_1_pdebugmax.in index f4f00d8..da19389 100644 --- a/test/src/test.attach_1_pdebugmax.in +++ b/test/src/test.attach_1_pdebugmax.in @@ -30,6 +30,7 @@ # Attach the tool to a running program and unlock the hang. # # Update Log: +# Jul 16 2018 DHA: Add IBM JSM support. # May 04 2016 DHA: Add test in-tree and installed support # Oct 21 2011 DHA: Added dynamic RM detection support. # Mar 06 2009 DHA: Changed bglrm to bgrm @@ -71,6 +72,9 @@ elif test "x$RM_TYPE" = "xRC_orte" ; then elif test "x$RM_TYPE" = "xRC_mpiexec_hydra" ; then WAITAMOUNT=`expr $WAITAMOUNT` $MPI_JOB_LAUNCHER_PATH -n $NUMTASKS `pwd`/hang_on_SIGUSR1@EXE@ & +elif test "x$RM_TYPE" = "xRC_ibm_spectrum" ; then + WAITAMOUNT=`expr $WAITAMOUNT` + $MPI_JOB_LAUNCHER_PATH -p $NUMTASKS `pwd`/hang_on_SIGUSR1@EXE@ & else echo "This RM is not supported yet" fi diff --git a/test/src/test.attach_1_remote.in b/test/src/test.attach_1_remote.in index a84ba5a..635da75 100644 --- a/test/src/test.attach_1_remote.in +++ b/test/src/test.attach_1_remote.in @@ -31,6 +31,7 @@ # of its associated job. # # Update Log: +# Jul 16 2018 DHA: Add IBM JSM support. # May 04 2016 DHA: Add test in-tree and installed support # Oct 21 2011 DHA: Added dynamic RM detection support. 
# Dec 17 2009 DHA: Added minimum waittime @@ -77,6 +78,9 @@ elif test "x$RM_TYPE" = "xRC_orte" ; then elif test "x$RM_TYPE" = "xRC_mpiexec_hydra" ; then WAITAMOUNT=`expr $WAITAMOUNT` $MPI_JOB_LAUNCHER_PATH -n $NUMTASKS `pwd`/hang_on_SIGUSR1@EXE@ & +elif test "x$RM_TYPE" = "xRC_ibm_spectrum" ; then + WAITAMOUNT=`expr $WAITAMOUNT` + $MPI_JOB_LAUNCHER_PATH -p $NUMTASKS `pwd`/hang_on_SIGUSR1@EXE@ & else echo "This RM is not supported yet" fi diff --git a/test/src/test.attach_2_uneven.in b/test/src/test.attach_2_uneven.in index a1b6656..c093fcd 100644 --- a/test/src/test.attach_2_uneven.in +++ b/test/src/test.attach_2_uneven.in @@ -31,6 +31,7 @@ # number of processors per node, and unlock the hang. # # Update Log: +# Jul 16 2018 DHA: Add IBM JSM support. # May 04 2016 DHA: Add test in-tree and installed support # Oct 21 2011 DHA: Added dynamic RM detection support. # Mar 06 2009 DHA: Changed bglrm to bgrm @@ -82,6 +83,9 @@ elif test "x$RM_TYPE" = "xRC_orte" ; then elif test "x$RM_TYPE" = "xRC_mpiexec_hydra" ; then WAITAMOUNT=`expr $WAITAMOUNT` $MPI_JOB_LAUNCHER_PATH -n $NUMTASKS `pwd`/hang_on_SIGUSR1@EXE@ & +elif test "x$RM_TYPE" = "xRC_ibm_spectrum" ; then + WAITAMOUNT=`expr $WAITAMOUNT` + $MPI_JOB_LAUNCHER_PATH -p $NUMTASKS `pwd`/hang_on_SIGUSR1@EXE@ & else echo "This RM is not supported yet" fi diff --git a/test/src/test.attach_4_detach.in b/test/src/test.attach_4_detach.in index 355d63b..1e1bbe9 100644 --- a/test/src/test.attach_4_detach.in +++ b/test/src/test.attach_4_detach.in @@ -31,6 +31,7 @@ # and kill the BE daemons. # # Update Log: +# Jul 16 2018 DHA: Add IBM JSM support. # May 04 2016 DHA: Add test in-tree and installed support # Aug 02 2012 DHA: Created file. 
# @@ -75,6 +76,9 @@ elif test "x$RM_TYPE" = "xRC_orte" ; then elif test "x$RM_TYPE" = "xRC_mpiexec_hydra" ; then WAITAMOUNT=`expr $WAITAMOUNT` $MPI_JOB_LAUNCHER_PATH -n $NUMTASKS `pwd`/hang_on_SIGUSR1@EXE@ & +elif test "x$RM_TYPE" = "xRC_ibm_spectrum" ; then + WAITAMOUNT=`expr $WAITAMOUNT` + $MPI_JOB_LAUNCHER_PATH -p $NUMTASKS `pwd`/hang_on_SIGUSR1@EXE@ & else echo "This RM is not supported yet" fi diff --git a/test/src/test.attach_4_kill.in b/test/src/test.attach_4_kill.in index 32f7eea..be891eb 100644 --- a/test/src/test.attach_4_kill.in +++ b/test/src/test.attach_4_kill.in @@ -31,6 +31,7 @@ # and kill the BE daemons. # # Update Log: +# Jul 16 2018 DHA: Add IBM JSM support. # May 04 2016 DHA: Add test in-tree and installed support # Jun 05 2012 DHA: Added subtest support # Oct 21 2011 DHA: Added dynamic RM detection support. @@ -80,6 +81,9 @@ elif test "x$RM_TYPE" = "xRC_orte" ; then elif test "x$RM_TYPE" = "xRC_mpiexec_hydra" ; then WAITAMOUNT=`expr $WAITAMOUNT` $MPI_JOB_LAUNCHER_PATH -n $NUMTASKS `pwd`/hang_on_SIGUSR1@EXE@ & +elif test "x$RM_TYPE" = "xRC_ibm_spectrum" ; then + WAITAMOUNT=`expr $WAITAMOUNT` + $MPI_JOB_LAUNCHER_PATH -p $NUMTASKS `pwd`/hang_on_SIGUSR1@EXE@ & else echo "This RM is not supported yet" fi diff --git a/test/src/test.attach_4_shutdownbe.in b/test/src/test.attach_4_shutdownbe.in index 2313b68..92e182d 100644 --- a/test/src/test.attach_4_shutdownbe.in +++ b/test/src/test.attach_4_shutdownbe.in @@ -31,6 +31,7 @@ # shut down BE daemons . # # Update Log: +# Jul 16 2018 DHA: Add IBM JSM support. # May 04 2016 DHA: Add test in-tree and installed support # Jun 05 2012 DHA: Added subtest support # Oct 21 2011 DHA: Added dynamic RM detection support. 
@@ -82,6 +83,9 @@ elif test "x$RM_TYPE" = "xRC_orte" ; then elif test "x$RM_TYPE" = "xRC_mpiexec_hydra" ; then WAITAMOUNT=`expr $WAITAMOUNT` $MPI_JOB_LAUNCHER_PATH -n $NUMTASKS `pwd`/hang_on_SIGUSR1@EXE@ & +elif test "x$RM_TYPE" = "xRC_ibm_spectrum" ; then + WAITAMOUNT=`expr $WAITAMOUNT` + $MPI_JOB_LAUNCHER_PATH -p $NUMTASKS `pwd`/hang_on_SIGUSR1@EXE@ & else echo "This RM is not supported yet" fi diff --git a/test/src/test.fe_regStatusCB.in b/test/src/test.fe_regStatusCB.in index cf6eb66..25163a6 100644 --- a/test/src/test.fe_regStatusCB.in +++ b/test/src/test.fe_regStatusCB.in @@ -77,6 +77,9 @@ elif test "x$RM_TYPE" = "xRC_orte" ; then elif test "x$RM_TYPE" = "xRC_mpiexec_hydra" ; then WAITAMOUNT=`expr $WAITAMOUNT` $MPI_JOB_LAUNCHER_PATH -n $NUMTASKS `pwd`/hang_on_SIGUSR1@EXE@ & +elif test "x$RM_TYPE" = "xRC_ibm_spectrum" ; then + WAITAMOUNT=`expr $WAITAMOUNT` + $MPI_JOB_LAUNCHER_PATH -p $NUMTASKS `pwd`/hang_on_SIGUSR1@EXE@ & else echo "This RM is not supported yet" fi diff --git a/test/src/test.jobsnap_1.in b/test/src/test.jobsnap_1.in index eba4808..d61b032 100644 --- a/test/src/test.jobsnap_1.in +++ b/test/src/test.jobsnap_1.in @@ -61,6 +61,9 @@ if test "x$RM_TYPE" = "xRC_slurm" ; then $MPI_JOB_LAUNCHER_PATH -n$NUMTASKS -N$NUMNODES -ppdebug `pwd`/hang_on_SIGUSR1@EXE@ & elif test "x$RM_TYPE" = "xRC_bgrm" ; then $MPI_JOB_LAUNCHER_PATH -verbose 1 -np $NUMTASKS -exe `pwd`/hang_on_SIGUSR1@EXE@ -cwd `pwd` & +elif test "x$RM_TYPE" = "xRC_ibm_spectrum" ; then + WAITAMOUNT=`expr $WAITAMOUNT` + $MPI_JOB_LAUNCHER_PATH -p $NUMTASKS `pwd`/hang_on_SIGUSR1@EXE@ & else echo "This RM is not supported yet" fi diff --git a/test/src/test.launch_7_kill.in b/test/src/test.launch_7_kill.in index 0f05cbb..f4f6a16 100644 --- a/test/src/test.launch_7_kill.in +++ b/test/src/test.launch_7_kill.in @@ -31,6 +31,7 @@ # one daemon per node, which unlocks the initial hang of all tasks. # # Update Log: +# Jul 16 2018 DHA: Add IBM JSM Spectrum support. 
# May 04 2016 DHA: Add test in-tree and installed support # Aug 1 2012 DHA: Created file. # @@ -48,7 +49,7 @@ export LMON_FE_KILL_TEST=1 NUMNODES=@NNODES@ NOHUP="" -if test "x$RM_TYPE" = "xRC_bglrm" -o "x$RM_TYPE" = "xRC_bgprm"; then +if test "x$RM_TYPE" = "xRC_bglrm" -o "x$RM_TYPE" = "xRC_bgprm" -o "x$RM_TYPE" = "xRC_ibm_spectrum"; then NOHUP=nohup rm -f nohup.out fi diff --git a/test/src/test.launch_7_shutdownbe.in b/test/src/test.launch_7_shutdownbe.in index ee26759..848ca15 100644 --- a/test/src/test.launch_7_shutdownbe.in +++ b/test/src/test.launch_7_shutdownbe.in @@ -31,6 +31,7 @@ # one daemon per node, which unlocks the initial hang of all tasks. # # Update Log: +# Jul 16 2018 DHA: Add IBM JSM Spectrum support # May 04 2016 DHA: Add test in-tree and installed support # Aug 01 2012 DHA: Created the file # @@ -48,7 +49,7 @@ export LMON_FE_SHUTDOWNBE_TEST=1 NUMNODES=@NNODES@ NOHUP="" -if test "x$RM_TYPE" = "xRC_bglrm" -o "x$RM_TYPE" = "xRC_bgprm"; then +if test "x$RM_TYPE" = "xRC_bglrm" -o "x$RM_TYPE" = "xRC_bgprm" -o "x$RM_TYPE" = "xRC_ibm_spectrum"; then NOHUP=nohup rm -f nohup.out fi From 8d0c9c68b724f0025194d90f9887a1fe0a90c276 Mon Sep 17 00:00:00 2001 From: "Dong H. Ahn" Date: Mon, 16 Jul 2018 20:51:27 -0700 Subject: [PATCH 06/10] test: cleanup Remove BG deadwoods from test programs Add misaligned header comments Refactor printing of launcher path --- test/src/fe_launch_middleware.cxx | 39 ++++++-------------------- test/src/fe_launch_smoketest.cxx | 18 ++---------- test/src/fe_launch_usrpayload_test.cxx | 17 ++--------- 3 files changed, 12 insertions(+), 62 deletions(-) diff --git a/test/src/fe_launch_middleware.cxx b/test/src/fe_launch_middleware.cxx index 08afdd3..115603a 100644 --- a/test/src/fe_launch_middleware.cxx +++ b/test/src/fe_launch_middleware.cxx @@ -1,39 +1,31 @@ /* - * $Header: $ *-------------------------------------------------------------------------------- - * Copyright (c) 2008-2010, Lawrence Livermore National Security, LLC. 
Produced - *at + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. Produced at * the Lawrence Livermore National Laboratory. Written by Dong H. Ahn - *. - * LLNL-CODE-409469. All rights reserved. + * . LLNL-CODE-409469. All rights reserved. * * This file is part of LaunchMON. For details, see * https://computing.llnl.gov/?set=resources&page=os_projects * * Please also read LICENSE.txt -- Our Notice and GNU Lesser General Public - *License. + * License. * * * This program is free software; you can redistribute it and/or modify it under - *the - * terms of the GNU General Public License (as published by the Free Software - * Foundation) version 2.1 dated February 1999. + * the terms of the GNU General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. * * This program is distributed in the hope that it will be useful, but WITHOUT - *ANY - * WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or + * ANY WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the terms and conditions of the GNU * General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public License - *along - * with this program; if not, write to the Free Software Foundation, Inc., 59 - *Temple + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple * Place, Suite 330, Boston, MA 02111-1307 USA *-------------------------------------------------------------------------------- * - * - * * Update Log: * Jul 18 2018 DHA: Add IBM JSM Spectrum support * Jun 01 2012 DHA: Copied from 0.8-middleware-support branch and merged @@ -142,17 +134,7 @@ int main(int argc, char *argv[]) { launcher_argv[4] = strdup(argv[2]); launcher_argv[5] = strdup("--exe"); launcher_argv[6] = strdup(argv[1]); - // manually fill the block - // launcher_argv[7] = strdup("--block"); - // launcher_argv[8] = strdup("R00-M0-N04"); - // manually fill the corner - // launcher_argv[9] = strdup("--corner"); - // launcher_argv[10] = strdup("R00-M0-N04-J07"); - // manually fill the shape - // launcher_argv[11] = strdup("--shape"); - // launcher_argv[12] = strdup("1x1x1x1x1"); launcher_argv[7] = NULL; - fprintf(stdout, "[LMON_FE] launching the job/daemons via %s\n", mylauncher); } else if ((rmenv_str == std::string("RC_bgq_slurm"))) { launcher_argv = (char **)malloc(7 * sizeof(char *)); launcher_argv[0] = strdup(mylauncher); @@ -162,8 +144,6 @@ int main(int argc, char *argv[]) { launcher_argv[4] = strdup(argv[2]); launcher_argv[5] = strdup(argv[1]); launcher_argv[6] = NULL; - fprintf(stdout, "[LMON_FE] launching the job/daemons via %s\n", - "mylauncher"); } else if ((rmenv_str == std::string("RC_bglrm")) || (rmenv_str == std::string("RC_bgprm"))) { launcher_argv = (char **)malloc(8 * sizeof(char *)); @@ -175,7 +155,6 @@ int main(int argc, char *argv[]) { launcher_argv[5] = strdup("-exe"); launcher_argv[6] = strdup(argv[1]); launcher_argv[7] = NULL; - fprintf(stdout, "[LMON_FE] launching the job/daemons via %s\n", mylauncher); } else if (rmenv_str == std::string("RC_slurm")) { numprocs_opt 
= string("-n") + string(argv[2]); numnodes_opt = string("-N") + string(argv[3]); @@ -205,7 +184,6 @@ int main(int argc, char *argv[]) { launcher_argv[5] = strdup(argv[2]); launcher_argv[6] = strdup(argv[1]); launcher_argv[7] = NULL; - fprintf(stdout, "[LMON_FE] launching the job/daemons via %s\n", mylauncher); } else if (rmenv_str == std::string("RC_mpiexec_hydra")) { launcher_argv = (char **)malloc(5 * sizeof(char *)); launcher_argv[0] = strdup(mylauncher); @@ -213,7 +191,6 @@ int main(int argc, char *argv[]) { launcher_argv[2] = strdup(argv[2]); launcher_argv[3] = strdup(argv[1]); launcher_argv[4] = NULL; - fprintf(stdout, "[LMON_FE] launching the job/daemons via %s\n", mylauncher); } else if (rmenv_str == std::string("RC_ibm_spectrum")) { numprocs_opt = string("-p") + string(argv[2]); launcher_argv = (char **) malloc (4*sizeof(char*)); diff --git a/test/src/fe_launch_smoketest.cxx b/test/src/fe_launch_smoketest.cxx index c9e6a63..ca3af06 100644 --- a/test/src/fe_launch_smoketest.cxx +++ b/test/src/fe_launch_smoketest.cxx @@ -168,18 +168,7 @@ int main(int argc, char *argv[]) { launcher_argv[4] = strdup(argv[2]); launcher_argv[5] = strdup("--exe"); launcher_argv[6] = strdup(argv[1]); - // manually fill the block - // launcher_argv[7] = strdup("--block"); - // launcher_argv[8] = strdup("R00-M0-N04"); - // manually fill the corner - // launcher_argv[9] = strdup("--corner"); - // launcher_argv[10] = strdup("R00-M0-N04-J07"); - // manually fill the shape - // launcher_argv[11] = strdup("--shape"); - // launcher_argv[12] = strdup("1x1x1x1x1"); - // launcher_argv[13] = NULL; launcher_argv[7] = NULL; - fprintf(stdout, "[LMON_FE] launching the job/daemons via %s\n", mylauncher); } else if ((rmenv_str == std::string("RC_bgq_slurm"))) { launcher_argv = (char **)malloc(7 * sizeof(char *)); launcher_argv[0] = strdup(mylauncher); @@ -189,8 +178,6 @@ int main(int argc, char *argv[]) { launcher_argv[4] = strdup(argv[2]); launcher_argv[5] = strdup(argv[1]); launcher_argv[6] = 
NULL; - fprintf(stdout, "[LMON_FE] launching the job/daemons via %s\n", - "mylauncher"); } else if ((rmenv_str == std::string("RC_bglrm")) || (rmenv_str == std::string("RC_bgprm"))) { launcher_argv = (char **)malloc(8 * sizeof(char *)); @@ -202,7 +189,6 @@ int main(int argc, char *argv[]) { launcher_argv[5] = strdup("-exe"); launcher_argv[6] = strdup(argv[1]); launcher_argv[7] = NULL; - fprintf(stdout, "[LMON_FE] launching the job/daemons via %s\n", mylauncher); } else if (rmenv_str == std::string("RC_slurm")) { numprocs_opt = string("-n") + string(argv[2]); numnodes_opt = string("-N") + string(argv[3]); @@ -232,7 +218,6 @@ int main(int argc, char *argv[]) { launcher_argv[5] = strdup(argv[2]); launcher_argv[6] = strdup(argv[1]); launcher_argv[7] = NULL; - fprintf(stdout, "[LMON_FE] launching the job/daemons via %s\n", mylauncher); } else if (rmenv_str == std::string("RC_mpiexec_hydra")) { launcher_argv = (char **)malloc(5 * sizeof(char *)); launcher_argv[0] = strdup(mylauncher); @@ -240,7 +225,6 @@ int main(int argc, char *argv[]) { launcher_argv[2] = strdup(argv[2]); launcher_argv[3] = strdup(argv[1]); launcher_argv[4] = NULL; - fprintf(stdout, "[LMON_FE] launching the job/daemons via %s\n", mylauncher); } else if (rmenv_str == std::string("RC_ibm_spectrum")) { numprocs_opt = string("-p") + string(argv[2]); launcher_argv = (char **) malloc (4*sizeof(char*)); @@ -254,6 +238,8 @@ int main(int argc, char *argv[]) { return EXIT_FAILURE; } + fprintf(stdout, "[LMON_FE] launching the job/daemons via %s\n", mylauncher); + if ((rc = LMON_fe_init(LMON_VERSION)) != LMON_OK) { fprintf(stdout, "[LMON FE] FAILED\n"); return EXIT_FAILURE; diff --git a/test/src/fe_launch_usrpayload_test.cxx b/test/src/fe_launch_usrpayload_test.cxx index ae04f05..24bdeab 100644 --- a/test/src/fe_launch_usrpayload_test.cxx +++ b/test/src/fe_launch_usrpayload_test.cxx @@ -202,17 +202,7 @@ int main(int argc, char *argv[]) { launcher_argv[4] = strdup(argv[2]); launcher_argv[5] = strdup("--exe"); 
launcher_argv[6] = strdup(argv[1]); - // manually fill the block - // launcher_argv[7] = strdup("--block"); - // launcher_argv[8] = strdup("R00-M0-N04"); - // manually fill the corner - // launcher_argv[9] = strdup("--corner"); - // launcher_argv[10] = strdup("R00-M0-N04-J07"); - // manually fill the shape - // launcher_argv[11] = strdup("--shape"); - // launcher_argv[12] = strdup("1x1x1x1x1"); launcher_argv[7] = NULL; - fprintf(stdout, "[LMON_FE] launching the job/daemons via %s\n", mylauncher); } else if ((rmenv_str == std::string("RC_bgq_slurm"))) { launcher_argv = (char **)malloc(7 * sizeof(char *)); launcher_argv[0] = strdup(mylauncher); @@ -222,8 +212,6 @@ int main(int argc, char *argv[]) { launcher_argv[4] = strdup(argv[2]); launcher_argv[5] = strdup(argv[1]); launcher_argv[6] = NULL; - fprintf(stdout, "[LMON_FE] launching the job/daemons via %s\n", - "mylauncher"); } else if ((rmenv_str == std::string("RC_bglrm")) || (rmenv_str == std::string("RC_bgprm"))) { launcher_argv = (char **)malloc(8 * sizeof(char *)); @@ -235,7 +223,6 @@ int main(int argc, char *argv[]) { launcher_argv[5] = strdup("-exe"); launcher_argv[6] = strdup(argv[1]); launcher_argv[7] = NULL; - fprintf(stdout, "[LMON_FE] launching the job/daemons via %s\n", mylauncher); } else if (rmenv_str == std::string("RC_slurm")) { numprocs_opt = string("-n") + string(argv[2]); numnodes_opt = string("-N") + string(argv[3]); @@ -265,7 +252,6 @@ int main(int argc, char *argv[]) { launcher_argv[5] = strdup(argv[2]); launcher_argv[6] = strdup(argv[1]); launcher_argv[7] = NULL; - fprintf(stdout, "[LMON_FE] launching the job/daemons via %s\n", mylauncher); } else if (rmenv_str == std::string("RC_mpiexec_hydra")) { launcher_argv = (char **)malloc(5 * sizeof(char *)); launcher_argv[0] = strdup(mylauncher); @@ -273,7 +259,6 @@ int main(int argc, char *argv[]) { launcher_argv[2] = strdup(argv[2]); launcher_argv[3] = strdup(argv[1]); launcher_argv[4] = NULL; - fprintf(stdout, "[LMON_FE] launching the job/daemons 
via %s\n", mylauncher); } else if (rmenv_str == std::string("RC_ibm_spectrum")) { numprocs_opt = string("-p") + string(argv[2]); launcher_argv = (char **) malloc (4*sizeof(char*)); @@ -283,6 +268,8 @@ int main(int argc, char *argv[]) { launcher_argv[3] = NULL; } + fprintf(stdout, "[LMON_FE] launching the job/daemons via %s\n", mylauncher); + if ((rc = LMON_fe_init(LMON_VERSION)) != LMON_OK) { fprintf(stdout, "[LMON FE] LMON_fe_init FAILED\n"); return EXIT_FAILURE; From 97c8ee637ef08754fb530e829399ce1e6c505f6f Mon Sep 17 00:00:00 2001 From: "Dong H. Ahn" Date: Mon, 17 Sep 2018 16:03:17 -0700 Subject: [PATCH 07/10] engine: Remove an MPIR_attach_fifo work-around jsrun had a bug where MPIR_attach_fifo is expecting a byte that contains an ASCII '1' (49) when it should expect a byte that contains integer 1. The specification is at https://www.mpi-forum.org/docs/mpir-specification-10-11-2010.pdf --- launchmon/src/linux/sdbg_linux_launchmon.cxx | 7 ------- 1 file changed, 7 deletions(-) diff --git a/launchmon/src/linux/sdbg_linux_launchmon.cxx b/launchmon/src/linux/sdbg_linux_launchmon.cxx index 225b3ed..4f90942 100644 --- a/launchmon/src/linux/sdbg_linux_launchmon.cxx +++ b/launchmon/src/linux/sdbg_linux_launchmon.cxx @@ -1683,14 +1683,7 @@ launchmon_rc_e linux_launchmon_t::handle_trap_after_attach_event( get_tracer()->tracer_continue(p, use_cxt); int fifofd = 0; if ((fifofd = open(fifopathbuf, O_WRONLY)) >= 0) { -#if POWERLE_ARCHITECTURE - char wakeup = '1'; - self_trace_t::trace( - true, MODULENAME, 0, - "Warning: Sending ASCII 1 to FIFO to work around a jsrun bug"); -#else char wakeup = (char)1; -#endif if (lmon_write_raw(fifofd, &wakeup, 1) != 1) { self_trace_t::trace( true, MODULENAME, 0, From 0e320134cfc9e7bf061d01becf523fcfb03e83a2 Mon Sep 17 00:00:00 2001 From: "Dong H. Ahn" Date: Fri, 5 Oct 2018 14:29:52 -0700 Subject: [PATCH 08/10] engine: Add a warning note for RM designers Warn about a common bug where the target RM process cannot detect the FIFO poke. 
When launchmon sends integer 1 to the FIFO while the RM process is still stopped, the write can go undetected even after the RM process resumes its execution, depending on how the FIFO is polled. --- launchmon/src/linux/sdbg_linux_launchmon.cxx | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/launchmon/src/linux/sdbg_linux_launchmon.cxx b/launchmon/src/linux/sdbg_linux_launchmon.cxx index 4f90942..87544b0 100644 --- a/launchmon/src/linux/sdbg_linux_launchmon.cxx +++ b/launchmon/src/linux/sdbg_linux_launchmon.cxx @@ -1676,11 +1676,16 @@ launchmon_rc_e linux_launchmon_t::handle_trap_after_attach_event( std::string fip = fifopathbuf; p.rmgr()->set_attach_fifo_path(fip); + get_tracer()->tracer_continue(p, use_cxt); // - // We have to continue the target process before starting FIFO - // otherwise open on the FIFO will block + // NOTE: Depending on how FIFO is polled within the target + // RM process, poking the FIFO may not take effect. RM designer + // must ensure that FIFO is being polled in a way such that + // when 1 is sent to the FIFO while it is being stopped, it + // will ultimately be picked up when the process resumes execution. // - get_tracer()->tracer_continue(p, use_cxt); + self_trace_t::trace(LEVELCHK(level2), MODULENAME, + 0, "MPIR_attach_fifo: %s", fifopathbuf); int fifofd = 0; if ((fifofd = open(fifopathbuf, O_WRONLY)) >= 0) { char wakeup = (char)1; From 057676013a2ce1fc9fdbc5f00e82a8de2ef4654a Mon Sep 17 00:00:00 2001 From: "Dong H. Ahn" Date: Fri, 5 Oct 2018 14:33:46 -0700 Subject: [PATCH 09/10] engine: Drop some traces that always print -- deadwood? 
--- launchmon/src/linux/sdbg_linux_launchmon.cxx | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/launchmon/src/linux/sdbg_linux_launchmon.cxx b/launchmon/src/linux/sdbg_linux_launchmon.cxx index 87544b0..daeb05a 100644 --- a/launchmon/src/linux/sdbg_linux_launchmon.cxx +++ b/launchmon/src/linux/sdbg_linux_launchmon.cxx @@ -1752,12 +1752,6 @@ launchmon_rc_e linux_launchmon_t::handle_trap_after_attach_event( // is invoked just once per job #endif - { - self_trace_t::trace( - true, // print always - MODULENAME, 0, "Just continued the RM process out of the first trap"); - } - set_last_seen(gettimeofdayD()); return LAUNCHMON_OK; @@ -1919,13 +1913,6 @@ launchmon_rc_e linux_launchmon_t::handle_trap_after_exec_event( endTS = gettimeofdayD(); accum += endTS - beginTS; countHandler++; - // accum and countHandler now contain the cost of this handler which - // is invoked just once per job - { - self_trace_t::trace( - true, // print always - MODULENAME, 0, "Just continued the RM process out of the first trap"); - } #endif From f75344d0f70df6f9c0581b82fee0dba855a8f4e0 Mon Sep 17 00:00:00 2001 From: "Dong H. Ahn" Date: Fri, 5 Oct 2018 14:34:02 -0700 Subject: [PATCH 10/10] engine: Add an update comment --- launchmon/src/linux/sdbg_linux_launchmon.cxx | 1 + 1 file changed, 1 insertion(+) diff --git a/launchmon/src/linux/sdbg_linux_launchmon.cxx b/launchmon/src/linux/sdbg_linux_launchmon.cxx index daeb05a..0ac0b39 100644 --- a/launchmon/src/linux/sdbg_linux_launchmon.cxx +++ b/launchmon/src/linux/sdbg_linux_launchmon.cxx @@ -27,6 +27,7 @@ *-------------------------------------------------------------------------------- * * Update Log: + * Oct 10 2018 DHA: Added PowerLE support for Sierra * May 02 2018 KMD: Added aarch64 support * Jul 22 2015 ADG: Fix for on demand proctable * Feb 20 2015 andrewg@cray.com: Added support for RMs that build the