From 474802143b3e3f2f43c558df5d0525d02879c025 Mon Sep 17 00:00:00 2001 From: byjtew Date: Wed, 24 May 2023 17:33:34 +0200 Subject: [PATCH 01/37] Add new unit-test for eWiseApply(matrices) --- tests/unit/CMakeLists.txt | 4 + tests/unit/eWiseApplyMatrix_variants.cpp | 284 +++++++++++++++++++++++ tests/unit/unittests.sh | 14 +- 3 files changed, 298 insertions(+), 4 deletions(-) create mode 100644 tests/unit/eWiseApplyMatrix_variants.cpp diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 16999fd42..815db9d2b 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -85,6 +85,10 @@ add_grb_executables( ewiseapply ewiseapply.cpp BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking ) +add_grb_executables( eWiseApplyMatrix_variants eWiseApplyMatrix_variants.cpp + BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking +) + add_grb_executables( eWiseMatrix eWiseMatrix.cpp BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking ) diff --git a/tests/unit/eWiseApplyMatrix_variants.cpp b/tests/unit/eWiseApplyMatrix_variants.cpp new file mode 100644 index 000000000..0e57b8f58 --- /dev/null +++ b/tests/unit/eWiseApplyMatrix_variants.cpp @@ -0,0 +1,284 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @author Benjamin Lozes + * @date 24th of May, 2023 + * + * @brief Test for eWiseApply(Matrix, Monoid) + * and eWiseApply(Matrix, Operator) variants + * + * This test is meant to ensure the behaviour of the eWiseApply(Matrix, Monoid) + * and eWiseApply(Matrix, Operator) variants is correct. Precisely, we expect + * the following behaviour: + * - eWiseApply(Matrix, Monoid) should apply the monoid to all elements of + * the two matrices, INCLUDING the couples (non_zero, zero), using the + * provided identity value for the zero elements. 
+ * - eWiseApply(Matrix, Operator) should apply the operator to all elements + * of the two matrices, EXCLUDING the couples (non_zero, zero) + * + */ + +#include +#include +#include +#include + +#include + +#define _DEBUG + +using nz_type = int; + +constexpr size_t M = 10; +constexpr size_t N = 10; +constexpr nz_type A_INITIAL_VALUE = 1; +constexpr nz_type B_INITIAL_VALUE = 3; + +namespace utils { + template< class Iterator > + void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) { +#ifndef _DEBUG + return; +#endif + std::cout << "Matrix \"" << name << "\" (" << rows << "x" << cols << "):" << std::endl << "[" << std::endl; + if( rows > 50 || cols > 50 ) { + os << " Matrix too large to print" << std::endl; + } else { + // os.precision( 3 ); + for( size_t y = 0; y < rows; y++ ) { + os << std::string( 3, ' ' ); + for( size_t x = 0; x < cols; x++ ) { + auto nnz_val = std::find_if( begin, end, [ y, x ]( const typename std::iterator_traits< Iterator >::value_type & a ) { + return a.first.first == y && a.first.second == x; + } ); + if( nnz_val != end ) + os << std::fixed << ( *nnz_val ).second; + else + os << '_'; + os << " "; + } + os << std::endl; + } + } + os << "]" << std::endl; + std::flush( os ); + } + + template< typename D > + void printSparseMatrix( const grb::Matrix< D > & mat, const std::string & name = "", std::ostream & os = std::cout ) { + grb::wait( mat ); + printSparseMatrixIterator( grb::nrows( mat ), grb::ncols( mat ), mat.cbegin(), mat.cend(), name, os ); + } + + template< typename D > + bool equals_matrix( const grb::Matrix< D > & A, const grb::Matrix< D > & B ) { + if( grb::nrows( A ) != grb::nrows( B ) || grb::ncols( A ) != grb::ncols( B ) ) + return false; + grb::wait( A ); + grb::wait( B ); + std::vector< std::pair< std::pair< size_t, size_t >, D > > A_vec( A.cbegin(), A.cend() ); + std::vector< std::pair< std::pair< size_t, size_t >, D > > B_vec( B.cbegin(), B.cend() ); + return std::is_permutation( A_vec.cbegin(), A_vec.cend(), B_vec.cbegin() ); + } +} // namespace utils + +template< class Monoid > +struct input_t { + const grb::Matrix< nz_type > & A; + const grb::Matrix< nz_type > & B; + const grb::Matrix< nz_type > & C_monoid; + const grb::Matrix< nz_type > & C_operator; + const Monoid & monoid; + + input_t( + const grb::Matrix< nz_type > & A = {0,0}, + const grb::Matrix< nz_type > & B = {0,0}, + const grb::Matrix< nz_type > & C_monoid = {0,0}, + const grb::Matrix< nz_type > & C_operator = {0,0}, + const Monoid & monoid = Monoid() ) : + A( A ), B( B ), C_monoid( C_monoid ), C_operator( C_operator ), monoid( monoid ) {} +}; + +struct output_t { + grb::RC rc; +}; + +template< class Monoid > +void grb_program( const input_t< Monoid > & input, output_t & output ) { + static_assert( grb::is_monoid< Monoid >::value, "Monoid required" ); + const auto & op = input.monoid.getOperator(); + grb::wait( input.A ); + grb::wait( input.B ); + + auto & rc = output.rc; + + utils::printSparseMatrix( input.A, "A" ); + utils::printSparseMatrix( input.B, "B" ); + + { // Operator variant + std::cout << "-- eWiseApply using Operator, supposed to be annihilating non-zeroes -> INTERSECTION\n"; + grb::Matrix< nz_type > C( grb::nrows( input.A ), grb::ncols( input.A ) ); + rc = grb::eWiseApply( C, input.A, input.B, op, grb::Phase::RESIZE ); + grb::wait( C ); + if( rc != grb::RC::SUCCESS ) { + std::cerr << "Error: Phase::RESIZE\n"; + return; + } + rc = grb::eWiseApply( C, input.A, input.B, op, 
grb::Phase::EXECUTE ); + grb::wait( C ); + if( rc != grb::RC::SUCCESS ) { + std::cerr << "Error: Phase::EXECUTE\n"; + return; + } + + if( ! utils::equals_matrix( C, input.C_operator ) ) { + std::cerr << "Error: Wrong result\n"; + utils::printSparseMatrix( C, "Obtained (operator)", std::cerr ); + utils::printSparseMatrix( input.C_operator, "Truth (operator)", std::cerr ); + rc = grb::RC::FAILED; + return; + } + + std::cout << "Result (operator) is correct\n"; + } + + { // Monoid variant + std::cout << "-- eWiseApply using Monoid, supposed to consider non-zeroes as the identity -> UNION\n"; + grb::Matrix< nz_type > C( grb::nrows( input.A ), grb::ncols( input.A ) ); + rc = grb::eWiseApply( C, input.A, input.B, input.monoid, grb::Phase::RESIZE ); + grb::wait( C ); + if( rc != grb::RC::SUCCESS ) { + std::cerr << "Error: Phase::RESIZE\n"; + return; + } + rc = grb::eWiseApply( C, input.A, input.B, input.monoid, grb::Phase::EXECUTE ); + grb::wait( C ); + if( rc != grb::RC::SUCCESS ) { + std::cerr << "Error: Phase::EXECUTE\n"; + return; + } + + if( ! utils::equals_matrix( C, input.C_monoid ) ) { + std::cerr << "Error: Wrong result\n"; + utils::printSparseMatrix( C, "Obtained (monoid)", std::cerr ); + utils::printSparseMatrix( input.C_monoid, "Truth (monoid)", std::cerr ); + rc = grb::RC::FAILED; + return; + } + + std::cout << "Result (monoid) is correct\n"; + } + + rc = grb::RC::SUCCESS; +} + +int main( int argc, char ** argv ) { + (void) argc; + (void) argv; + + if(argc > 1) std::cout << "Usage: " << argv[ 0 ] << std::endl; + + std::cout << "This is functional test " << argv[ 0 ] << std::endl; + grb::Launcher< grb::EXEC_MODE::AUTOMATIC > launcher; + grb::RC rc = grb::RC::SUCCESS; + + // Create input data + /** Matrix A: Row matrix filled with A_INITIAL_VALUE + * X X X X X + * _ _ _ _ _ + * _ _ _ _ _ (...) + * _ _ _ _ _ + * _ _ _ _ _ + * (...) + */ + grb::Matrix< nz_type > A( M, N, N ); + std::vector< size_t > A_rows( N, 0 ), A_cols( N, 0 ); + std::vector< nz_type > A_values( N, A_INITIAL_VALUE ); + std::iota( A_cols.begin(), A_cols.end(), 0 ); + rc = grb::buildMatrixUnique( A, A_rows.data(), A_cols.data(), A_values.data(), A_values.size(), grb::IOMode::SEQUENTIAL ); + assert( rc == grb::RC::SUCCESS ); + + /** Matrix B: Column matrix filled with B_INITIAL_VALUE + * Y _ _ _ _ + * Y _ _ _ _ + * Y _ _ _ _ (...) + * Y _ _ _ _ + * Y _ _ _ _ + * (...) + */ + grb::Matrix< nz_type > B( M, N, N ); + std::vector< size_t > B_rows( M, 0 ), B_cols( M, 0 ); + std::vector< nz_type > B_values( M, B_INITIAL_VALUE ); + std::iota( B_rows.begin(), B_rows.end(), 0 ); + rc = grb::buildMatrixUnique( B, B_rows.data(), B_cols.data(), B_values.data(), B_values.size(), grb::IOMode::SEQUENTIAL ); + assert( rc == grb::RC::SUCCESS ); + + { + /** Matrix C_monoid_truth: Union of A and B + * X+Y X X X X + * Y ___ ___ ___ ___ + * Y ___ ___ ___ ___ (...) + * Y ___ ___ ___ ___ + * Y ___ ___ ___ ___ + * (...) 
+ */ + grb::Matrix< nz_type > C_monoid_truth( M, N ); + size_t nvalues = grb::nrows( A ) + grb::ncols( B ) - 1; + std::vector< size_t > C_monoid_truth_rows( nvalues, 0 ), C_monoid_truth_cols( nvalues, 0 ); + std::vector< nz_type > C_monoid_truth_values( nvalues, 0 ); + C_monoid_truth_values[ 0 ] = A_INITIAL_VALUE + B_INITIAL_VALUE; + std::iota( C_monoid_truth_rows.begin() + grb::nrows( A ), C_monoid_truth_rows.end(), 1 ); + std::iota( C_monoid_truth_cols.begin() + 1, C_monoid_truth_cols.begin() + grb::nrows( A ), 1 ); + std::fill( C_monoid_truth_values.begin() + 1, C_monoid_truth_values.begin() + grb::nrows( A ), A_INITIAL_VALUE ); + std::fill( C_monoid_truth_values.begin() + grb::nrows( A ), C_monoid_truth_values.end(), B_INITIAL_VALUE ); + rc = grb::buildMatrixUnique( C_monoid_truth, C_monoid_truth_rows.data(), C_monoid_truth_cols.data(), C_monoid_truth_values.data(), C_monoid_truth_values.size(), grb::IOMode::SEQUENTIAL ); + assert( rc == grb::RC::SUCCESS ); + + /** Matrix C_op_truth: Intersection of A and B + * X+Y ___ ___ ___ ___ + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___ (...) + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___ + * (...) + */ + grb::Matrix< nz_type > C_op_truth( M, N ); + std::vector< size_t > C_op_truth_rows( 1, 0 ), C_op_truth_cols( 1, 0 ); + std::vector< nz_type > C_op_truth_values( 1, A_INITIAL_VALUE + B_INITIAL_VALUE ); + rc = grb::buildMatrixUnique( C_op_truth, C_op_truth_rows.data(), C_op_truth_cols.data(), C_op_truth_values.data(), C_op_truth_values.size(), grb::IOMode::SEQUENTIAL ); + assert( rc == grb::RC::SUCCESS ); + + { /** Test using addition operator, same type for lhs and rhs + */ + input_t< grb::Monoid< grb::operators::add< nz_type >, grb::identities::zero > > input { A, B, C_monoid_truth, C_op_truth, + grb::Monoid< grb::operators::add< nz_type >, grb::identities::zero >() }; + output_t output { grb::RC::SUCCESS }; + // Run the test + rc = launcher.exec( &grb_program, input, output, false ); + // Check the result + assert( rc == grb::RC::SUCCESS ); + if( output.rc != grb::RC::SUCCESS ) { + std::cout << "Test FAILED (" << grb::toString( output.rc ) << ")" << std::endl; + return 1; + } + } + } + + std::cout << "Test OK" << std::endl; + return 0; +} diff --git a/tests/unit/unittests.sh b/tests/unit/unittests.sh index 3817164c8..f34229c16 100755 --- a/tests/unit/unittests.sh +++ b/tests/unit/unittests.sh @@ -539,10 +539,16 @@ for MODE in ${MODES}; do grep 'Test OK' ${TEST_OUT_DIR}/eWiseApply_matrix_${MODE}_${BACKEND}_${P}_${T} || echo "Test FAILED" echo " " - echo ">>> [x] [ ] Testing grb::eWiseLambda (matrices)" - $runner ${TEST_BIN_DIR}/eWiseMatrix_${MODE}_${BACKEND} &> ${TEST_OUT_DIR}/eWiseMatrix_${MODE}_${BACKEND}_${P}_${T}.log - head -1 ${TEST_OUT_DIR}/eWiseMatrix_${MODE}_${BACKEND}_${P}_${T}.log - grep 'Test OK' ${TEST_OUT_DIR}/eWiseMatrix_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" + echo ">>> [x] [ ] Testing grb::id on vectors and matrices" + $runner ${TEST_BIN_DIR}/id_${MODE}_${BACKEND} &> ${TEST_OUT_DIR}/id_${MODE}_${BACKEND}_${P}_${T}.log + head -1 ${TEST_OUT_DIR}/id_${MODE}_${BACKEND}_${P}_${T}.log + grep 'Test OK' ${TEST_OUT_DIR}/id_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" + echo " " + + echo ">>> [x] [ ] Testing grb::eWiseApply (matrices, Monoid / Operator)" + $runner ${TEST_BIN_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND} &> ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND}_${P}_${T}.log + head -1 ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND}_${P}_${T}.log + grep 'Test OK' 
${TEST_OUT_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" echo " " echo ">>> [x] [ ] Testing grb::zip on two vectors of doubles and" From 0294312403fe3ac88571e431ad09b3d6974fb76c Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Thu, 29 Jun 2023 14:20:11 +0200 Subject: [PATCH 02/37] Implement Monoid variant of BLAS3::eWiseApply --- include/graphblas/reference/blas3.hpp | 333 ++++++++++++++++++++++---- 1 file changed, 285 insertions(+), 48 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index f3f918734..e77478564 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -26,6 +26,7 @@ #include #include +#include #include "io.hpp" #include "matrix.hpp" @@ -928,22 +929,20 @@ namespace grb { * \a allow_void is true; otherwise, will be ignored. * \endinternal */ - template< bool allow_void, Descriptor descr, - class MulMonoid, class Operator, + class Operator, typename OutputType, typename InputType1, typename InputType2, typename RIT1, typename CIT1, typename NIT1, typename RIT2, typename CIT2, typename NIT2, typename RIT3, typename CIT3, typename NIT3 > - RC eWiseApply_matrix_generic( + RC eWiseApply_matrix_generic_intersection( Matrix< OutputType, reference, RIT1, CIT1, NIT1 > &C, const Matrix< InputType1, reference, RIT2, CIT2, NIT2 > &A, const Matrix< InputType2, reference, RIT3, CIT3, NIT3 > &B, const Operator &oper, - const MulMonoid &mulMonoid, const Phase &phase, const typename std::enable_if< !grb::is_object< OutputType >::value && @@ -958,15 +957,14 @@ namespace grb { std::is_same< InputType1, void >::value || std::is_same< InputType2, void >::value ) ), - "grb::internal::eWiseApply_matrix_generic: the non-monoid version of " + "grb::internal::eWiseApply_matrix_generic_intersection: the non-monoid version of " "elementwise mxm can only be used if neither of the input matrices " "is a pattern matrix (of type void)" ); assert( phase != TRY ); #ifdef _DEBUG - std::cout << "In grb::internal::eWiseApply_matrix_generic\n"; + std::cout << "In grb::internal::eWiseApply_matrix_generic_intersection\n"; #endif - // get whether the matrices should be transposed prior to execution constexpr bool trans_left = descr & descriptors::transpose_left; constexpr bool trans_right = descr & descriptors::transpose_right; @@ -992,31 +990,6 @@ namespace grb { auto &C_raw = internal::getCRS( C ); auto &CCS_raw = internal::getCCS( C ); -#ifdef _DEBUG - std::cout << "\t\t A offset array = { "; - for( size_t i = 0; i <= m_A; ++i ) { - std::cout << A_raw.col_start[ i ] << " "; - } - std::cout << "}\n"; - for( size_t i = 0; i < m_A; ++i ) { - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - std::cout << "\t\t ( " << i << ", " << A_raw.row_index[ k ] << " ) = " - << A_raw.getPrintValue( k ) << "\n"; - } - } - std::cout << "\t\t B offset array = { "; - for( size_t j = 0; j <= m_B; ++j ) { - std::cout << B_raw.col_start[ j ] << " "; - } - std::cout << "}\n"; - for( size_t j = 0; j < m_B; ++j ) { - for( size_t k = B_raw.col_start[ j ]; k < B_raw.col_start[ j + 1 ]; ++k ) { - std::cout << "\t\t ( " << B_raw.row_index[ k ] << ", " << j << " ) = " - << B_raw.getPrintValue( k ) << "\n"; - } - } -#endif - // retrieve buffers char * arr1, * arr2, * arr3, * buf1, * buf2, * buf3; arr1 = arr2 = buf1 = buf2 = nullptr; @@ -1146,11 +1119,9 @@ namespace grb { for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = 
A_raw.row_index[ k ]; coors1.assign( k_col ); - valbuf[ k_col ] = A_raw.getValue( k, - mulMonoid.template getIdentity< typename Operator::D1 >() ); + valbuf[ k_col ] = A_raw.values[ k ]; #ifdef _DEBUG - std::cout << "A( " << i << ", " << k_col << " ) = " << A_raw.getValue( k, - mulMonoid.template getIdentity< typename Operator::D1 >() ) << ", "; + std::cout << "A( " << i << ", " << k_col << " ) = " << A_raw.values[ k ] << ", "; #endif } #ifdef _DEBUG @@ -1160,11 +1131,9 @@ namespace grb { const size_t l_col = B_raw.row_index[ l ]; if( coors1.assigned( l_col ) ) { coors2.assign( l_col ); - (void)grb::apply( valbuf[ l_col ], valbuf[ l_col ], B_raw.getValue( l, - mulMonoid.template getIdentity< typename Operator::D2 >() ), oper ); + (void)grb::apply( valbuf[ l_col ], valbuf[ l_col ], B_raw.values[ l ], oper ); #ifdef _DEBUG - std::cout << "B( " << i << ", " << l_col << " ) = " << B_raw.getValue( l, - mulMonoid.template getIdentity< typename Operator::D2 >() ) + std::cout << "B( " << i << ", " << l_col << " ) = " << B_raw.values[ l ] << " to yield C( " << i << ", " << l_col << " ), "; #endif } @@ -1190,6 +1159,278 @@ namespace grb { #endif } +#ifndef NDEBUG + for( size_t j = 0; j < n; ++j ) { + assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); + } +#endif + + // set final number of nonzeroes in output matrix + internal::setCurrentNonzeroes( C, nzc ); + } + + // done + return SUCCESS; + } + + /** + * \internal general elementwise matrix application that all eWiseApply + * variants refer to. + * @param[in] oper The operator corresponding to \a mulMonoid if + * \a allow_void is true; otherwise, an arbitrary operator + * under which to perform the eWiseApply. + * @param[in] mulMonoid The monoid under which to perform the eWiseApply if + * \a allow_void is true; otherwise, will be ignored. + * \endinternal + */ + template< + bool allow_void, + Descriptor descr, + class Monoid, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 + > + RC eWiseApply_matrix_generic_union( + Matrix< OutputType, reference, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, reference, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, reference, RIT3, CIT3, NIT3 > &B, + const Monoid &monoid, + const Phase &phase, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< Monoid >::value, + void >::type * const = nullptr + ) { + + assert( !(descr & descriptors::force_row_major ) ); + static_assert( allow_void || + ( !( + std::is_same< InputType1, void >::value || + std::is_same< InputType2, void >::value + ) ), + "grb::internal::eWiseApply_matrix_generic_union: the non-monoid version of " + "elementwise mxm can only be used if neither of the input matrices " + "is a pattern matrix (of type void)" ); + assert( phase != TRY ); +#ifdef _DEBUG + std::cout << "In grb::internal::eWiseApply_matrix_generic_union\n"; +#endif + // get whether the matrices should be transposed prior to execution + constexpr bool trans_left = descr & descriptors::transpose_left; + constexpr bool trans_right = descr & descriptors::transpose_right; + + // run-time checks + const size_t m = grb::nrows( C ); + const size_t n = grb::ncols( C ); + const size_t m_A = !trans_left ? grb::nrows( A ) : grb::ncols( A ); + const size_t n_A = !trans_left ? 
grb::ncols( A ) : grb::nrows( A ); + const size_t m_B = !trans_right ? grb::nrows( B ) : grb::ncols( B ); + const size_t n_B = !trans_right ? grb::ncols( B ) : grb::nrows( B ); + + // Identities + const auto identity_A = monoid.template getIdentity< OutputType >(); + const auto identity_B = monoid.template getIdentity< OutputType >(); + + if( m != m_A || m != m_B || n != n_A || n != n_B ) { + return MISMATCH; + } + + const auto oper = monoid.getOperator(); + const auto &A_raw = !trans_left ? + internal::getCRS( A ) : + internal::getCCS( A ); + const auto &B_raw = !trans_right ? + internal::getCRS( B ) : + internal::getCCS( B ); + auto &C_raw = internal::getCRS( C ); + auto &CCS_raw = internal::getCCS( C ); + + + // retrieve buffers + char * arr1, * arr2, * arr3, * buf1, * buf2, * buf3; + arr1 = arr2 = buf1 = buf2 = nullptr; + InputType1 * vbuf1 = nullptr; + InputType2 * vbuf2 = nullptr; + OutputType * valbuf = nullptr; + internal::getMatrixBuffers( arr1, buf1, vbuf1, 1, A ); + internal::getMatrixBuffers( arr2, buf2, vbuf2, 1, B ); + internal::getMatrixBuffers( arr3, buf3, valbuf, 1, C ); + // end buffer retrieval + + // initialisations + internal::Coordinates< reference > coors1, coors2; + coors1.set( arr1, false, buf1, n ); + coors2.set( arr2, false, buf2, n ); +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp parallel + { + size_t start, end; + config::OMP::localRange( start, end, 0, n + 1 ); +#else + const size_t start = 0; + const size_t end = n + 1; +#endif + for( size_t j = start; j < end; ++j ) { + CCS_raw.col_start[ j ] = 0; + } +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + } +#endif + // end initialisations + + // nonzero count + size_t nzc = 0; + + // symbolic phase + if( phase == RESIZE ) { + for( size_t i = 0; i < m; ++i ) { + coors1.clear(); + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; + coors1.assign( k_col ); + (void)++nzc; + } + for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { + const size_t l_col = B_raw.row_index[ l ]; + if( not coors1.assigned( l_col ) ) { + (void)++nzc; + } + } + } + + const RC ret = grb::resize( C, nzc ); +#ifdef _DEBUG + std::cout << "grb::resize( C, " << nzc << " ) = " << ret << "\n"; +#endif + return ret; + } + + // computational phase + if( phase == EXECUTE ) { + // retrieve additional buffer + config::NonzeroIndexType * const C_col_index = internal::template + getReferenceBuffer< typename config::NonzeroIndexType >( n + 1 ); + + // perform column-wise nonzero count + for( size_t i = 0; i < m; ++i ) { + coors1.clear(); + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; + coors1.assign( k_col ); + (void) ++nzc; + (void) ++CCS_raw.col_start[ k_col + 1 ]; + } + for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { + const size_t l_col = B_raw.row_index[ l ]; + if( not coors1.assigned( l_col ) ) { + (void) ++nzc; + (void) ++CCS_raw.col_start[ l_col + 1 ]; + } + } + } + + // check capacity + if( nzc > capacity( C ) ) { +#ifdef _DEBUG + std::cout << "\t detected insufficient capacity " + << "for requested operation\n"; +#endif + const RC clear_rc = clear( C ); + if( clear_rc != SUCCESS ) { + return PANIC; + } else { + return FAILED; + } + } + + // prefix sum for CCS_raw.col_start + assert( CCS_raw.col_start[ 0 ] == 0 ); + for( size_t j = 1; j < n; ++j ) { + CCS_raw.col_start[ j + 1 ] += CCS_raw.col_start[ j ]; + } + assert( CCS_raw.col_start[ n ] == nzc ); + + // set 
C_col_index to all zero +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp parallel for simd +#endif + for( size_t j = 0; j < n; j++ ) { + C_col_index[ j ] = 0; + } + + + // do computations + std::vector< bool > columns( n, false ); + size_t nzc = 0; + C_raw.col_start[ 0 ] = 0; + for( size_t i = 0; i < m; ++i ) { + std::fill( columns.begin(), columns.end(), false ); + +#ifdef _DEBUG + std::cout << " -- i: " << i << "\n"; +#endif + + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; + columns[ k_col ] = true; + valbuf[ k_col ] = A_raw.getValue( k, identity_A ); +#ifdef _DEBUG + std::cout << "Found A( " << i << ", " << k_col << " ) = " << A_raw.getValue( k, identity_A ) << "\n"; +#endif + } + + for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { + const size_t l_col = B_raw.row_index[ l ]; + if( columns[ l_col ] ) { // Intersection case + const auto valbuf_value_before = valbuf[ l_col ]; + (void)grb::apply( valbuf[ l_col ], valbuf_value_before, B_raw.getValue( l, identity_B ), oper ); +#ifdef _DEBUG + std::cout << "Found intersection: B(" << i << ";" << l_col << ")=" << B_raw.getValue( l, identity_B ) + << " && A(" << i << ";" << l_col << ")=" << valbuf_value_before + << " ==> C(" << i << ";" << l_col << ")=" << valbuf[ l_col ] << "\n"; +#endif + } else { // Union case +#ifdef _DEBUG + std::cout << "Found B( " << i << ", " << l_col << " ) = " << B_raw.getValue( l, identity_B ) << "\n"; +#endif + columns[ l_col ] = true; + valbuf[ l_col ] = B_raw.getValue( l, identity_B ); + } + } + + for( size_t j_unsigned = columns.size() ; j_unsigned > 0 ; j_unsigned-- ) { + const size_t j = j_unsigned - 1; + if( not columns[ j ] ) { + continue; + } + // update CRS + C_raw.row_index[ nzc ] = j; + C_raw.setValue( nzc, valbuf[ j ] ); + // update CCS + C_col_index[ j ]++; + const size_t CCS_index = CCS_raw.col_start[ j+1 ] - C_col_index[ j ]; + CCS_raw.row_index[ CCS_index ] = i; + CCS_raw.setValue( CCS_index, valbuf[ j ] ); + // update count + (void)++nzc; + } + C_raw.col_start[ i + 1 ] = nzc; + } + +#ifdef _DEBUG + std::cout << "CCS_raw.col_start = [ "; + for( size_t j = 0; j <= n; ++j ) + std::cout << CCS_raw.col_start[ j ] << " "; + std::cout << "]\n"; + std::cout << "C_col_index = [ "; + for( size_t j = 0; j < n; ++j ) + std::cout << C_col_index[ j ] << " "; + std::cout << "]\n"; +#endif #ifndef NDEBUG for( size_t j = 0; j < n; ++j ) { assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); @@ -1257,8 +1498,8 @@ namespace grb { std::cout << "In grb::eWiseApply_matrix_generic (reference, monoid)\n"; #endif - return internal::eWiseApply_matrix_generic< true, descr >( - C, A, B, mulmono.getOperator(), mulmono, phase + return internal::eWiseApply_matrix_generic_union< true, descr >( + C, A, B, mulmono, phase ); } @@ -1317,12 +1558,8 @@ namespace grb { "input matrices is a pattern matrix (of type void)" ); - typename grb::Monoid< - grb::operators::mul< double >, - grb::identities::one - > dummyMonoid; - return internal::eWiseApply_matrix_generic< false, descr >( - C, A, B, mulOp, dummyMonoid, phase + return internal::eWiseApply_matrix_generic_intersection< false, descr >( + C, A, B, mulOp, phase ); } From 34130f3302d6b1183bdf0be027dc28dc0330603b Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Thu, 29 Jun 2023 14:20:45 +0200 Subject: [PATCH 03/37] Nonblocking implementation fix --- include/graphblas/nonblocking/blas3.hpp | 98 ++++++++++--------------- 1 file changed, 39 insertions(+), 59 
deletions(-) diff --git a/include/graphblas/nonblocking/blas3.hpp b/include/graphblas/nonblocking/blas3.hpp index 5a222c7f2..5ecb1fffa 100644 --- a/include/graphblas/nonblocking/blas3.hpp +++ b/include/graphblas/nonblocking/blas3.hpp @@ -416,56 +416,6 @@ namespace grb { ); } - namespace internal { - - template< - bool allow_void, - Descriptor descr, - class MulMonoid, class Operator, - typename OutputType, typename InputType1, typename InputType2, - typename RIT1, typename CIT1, typename NIT1, - typename RIT2, typename CIT2, typename NIT2, - typename RIT3, typename CIT3, typename NIT3 - > - RC eWiseApply_matrix_generic( - Matrix< OutputType, nonblocking, RIT1, CIT1, NIT1 > &C, - const Matrix< InputType1, nonblocking, RIT2, CIT2, NIT2 > &A, - const Matrix< InputType2, nonblocking, RIT3, CIT3, NIT3 > &B, - const Operator &oper, - const MulMonoid &mulMonoid, - const Phase &phase, - const typename std::enable_if< - !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_operator< Operator >::value, - void >::type * const = nullptr - ) { - if( internal::NONBLOCKING::warn_if_not_native && - config::PIPELINE::warn_if_not_native - ) { - std::cerr << "Warning: eWiseApply (nonblocking) currently delegates to a " - << "blocking implementation.\n" - << " Further similar such warnings will be suppressed.\n"; - internal::NONBLOCKING::warn_if_not_native = false; - } - - // nonblocking execution is not supported - // first, execute any computation that is not completed - le.execution(); - - // second, delegate to the reference backend - return eWiseApply_matrix_generic< - allow_void, descr, - MulMonoid, Operator - >( - getRefMatrix( C ), getRefMatrix( A ), getRefMatrix( B ), - oper, mulMonoid, phase - ); - } - - } // namespace internal - template< Descriptor descr = descriptors::no_operation, class MulMonoid, @@ -507,11 +457,26 @@ namespace grb { ); #ifdef _DEBUG - std::cout << "In grb::eWiseApply_matrix_generic (nonblocking, monoid)\n"; + std::cout << "In grb::eWiseApply (nonblocking, monoid)\n"; #endif + if( internal::NONBLOCKING::warn_if_not_native && config::PIPELINE::warn_if_not_native ) { + std::cerr << "Warning: eWiseApply (nonblocking) currently delegates to a " + << "blocking implementation.\n" + << " Further similar such warnings will be suppressed.\n"; + internal::NONBLOCKING::warn_if_not_native = false; + } + + // nonblocking execution is not supported + // first, execute any computation that is not completed + internal::le.execution(); - return internal::eWiseApply_matrix_generic< true, descr >( - C, A, B, mulmono.getOperator(), mulmono, phase + // second, delegate to the reference backend + return eWiseApply< descr >( + internal::getRefMatrix( C ), + internal::getRefMatrix( A ), + internal::getRefMatrix( B ), + mulmono, + phase ); } @@ -561,13 +526,28 @@ namespace grb { "the operator version of eWiseApply cannot be used if either of the " "input matrices is a pattern matrix (of type void)" ); + if( internal::NONBLOCKING::warn_if_not_native && config::PIPELINE::warn_if_not_native ) { + std::cerr << "Warning: eWiseApply (nonblocking) currently delegates to a " + << "blocking implementation.\n" + << " Further similar such warnings will be suppressed.\n"; + internal::NONBLOCKING::warn_if_not_native = false; + } - typename grb::Monoid< - grb::operators::mul< double >, - grb::identities::one - > dummyMonoid; - return internal::eWiseApply_matrix_generic< false, descr >( - C, A, B, mulOp, dummyMonoid, phase +#ifdef _DEBUG 
+ std::cout << "In grb::eWiseApply (nonblocking, op)\n"; +#endif + + // nonblocking execution is not supported + // first, execute any computation that is not completed + internal::le.execution(); + + // second, delegate to the reference backend + return eWiseApply< descr >( + internal::getRefMatrix( C ), + internal::getRefMatrix( A ), + internal::getRefMatrix( B ), + mulOp, + phase ); } From 535e89c38d6ccbbd923690b8b78ced86752bfaa2 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Thu, 29 Jun 2023 18:23:29 +0200 Subject: [PATCH 04/37] Fix eWiseApplyMatrixReference unit-test to handle Monoid variant --- tests/unit/eWiseApplyMatrixReference.cpp | 384 ++++++++++++++--------- 1 file changed, 238 insertions(+), 146 deletions(-) diff --git a/tests/unit/eWiseApplyMatrixReference.cpp b/tests/unit/eWiseApplyMatrixReference.cpp index 63c2ad6df..18f98df0d 100644 --- a/tests/unit/eWiseApplyMatrixReference.cpp +++ b/tests/unit/eWiseApplyMatrixReference.cpp @@ -15,221 +15,314 @@ * limitations under the License. */ +#include #include #include +#include #include -// static data corresponding to small matrices - -static const size_t I_A[] = { 0, 0, 1, 1, 2, 2, 3, 3 }; -static const size_t J_A[] = { 0, 2, 1, 2, 2, 3, 0, 2 }; -static const double V_A[] = { 1, 3, 4, 2, 6, 7, 5, 8 }; +#define _DEBUG -static const size_t I_B[] = { 0, 0, 1, 2, 3, 3 }; -static const size_t J_B[] = { 0, 3, 1, 1, 2, 3 }; -static const double V_B[] = { 9, 10, 11, 12, 14, 13 }; - -static const size_t I_C[] = { 0, 1, 3 }; -static const size_t J_C[] = { 0, 1, 2 }; -static const double V_C[] = { 9, 44, 112 }; +template< class Iterator > +void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) { +#ifndef _DEBUG + return; +#endif + std::cout << "Matrix \"" << name << "\" (" << rows << "x" << cols << "):" << std::endl << "[" << std::endl; + if( rows > 50 || cols > 50 ) { + os << " Matrix too large to print" << std::endl; + } else { + os.precision( 3 ); + for( size_t y = 0; y < rows; y++ ) { + os << std::string( 3, ' ' ); + for( size_t x = 0; x < cols; x++ ) { + auto nnz_val = std::find_if( begin, end, [ y, x ]( const typename std::iterator_traits< Iterator >::value_type & a ) { + return a.first.first == y && a.first.second == x; + } ); + if( nnz_val != end ) + os << std::fixed << std::setw( 3 ) << ( *nnz_val ).second; + else + os << "___"; + os << " "; + } + os << std::endl; + } + } + os << "]" << std::endl; + std::flush( os ); +} -static const size_t rowlens[] = { 1, 1, 0, 1 }; -static const size_t collens[] = { 1, 1, 1, 0 }; +template< typename D > +void printSparseMatrix( const grb::Matrix< D > & mat, const std::string & name = "", std::ostream & os = std::cout ) { + grb::wait( mat ); + printSparseMatrixIterator( grb::nrows( mat ), grb::ncols( mat ), mat.cbegin(), mat.cend(), name, os ); +} -static const double expect1_CRS[] = { 9, 44, 112 }; -static const double expect1_CCS[] = { 9, 44, 112 }; +// static data corresponding to small matrices -static const double expect2_CRS[] = { 1, 4, 8 }; -static const double expect2_CCS[] = { 1, 4, 8 }; +/** + * A: + * 1 _ 3 _ + * _ 4 2 _ + * _ _ 6 7 + * 5 _ _ 8 + */ +static const std::vector< size_t > I_A { 0, 0, 1, 1, 2, 2, 3, 3 }; +static const std::vector< size_t > J_A { 0, 2, 1, 2, 2, 3, 0, 2 }; +static const std::vector< int > V_A { 1, 3, 4, 2, 6, 7, 5, 8 }; + +/** + * B: + * 9 __ __ __ + * __ 11 12 __ + * __ 14 __ __ + * __ __ __ 13 + */ +static const std::vector< size_t > I_B { 0, 0, 1, 2, 3, 3 
}; +static const std::vector< size_t > J_B { 0, 3, 1, 1, 2, 3 }; +static const std::vector< int > V_B { 9, 10, 11, 12, 14, 13 }; + +/** + * C_intersection: + * 9 ___ ___ ___ + * ___ 44 ___ ___ + * ___ ___ ___ ___ + * ___ ___ 112 ___ + */ +static const std::vector< size_t > I_C_intersection { 0, 1, 3 }; +static const std::vector< size_t > J_C_intersection { 0, 1, 2 }; +static const std::vector< int > V_C_intersection { 9, 44, 112 }; + +/** + * C_union_A_B: + * 9 ___ 3 10 + * ___ 44 2 ___ + * ___ 12 6 7 + * 5 ___ 112 13 + */ -static const double expect3_CRS[] = { 9, 11, 14 }; -static const double expect3_CCS[] = { 9, 11, 14 }; +static const std::vector< size_t > I_C_union { 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3 }; +static const std::vector< size_t > J_C_union { 0, 2, 3, 1, 2, 1, 2, 3, 0, 2, 3 }; +static const std::vector< int > V_C_union_A_B { 9, 3, 10, 44, 2, 12, 6, 7, 5, 112, 13 }; -static const double expect4_CRS[] = { 1, 1, 1 }; -static const double expect4_CCS[] = { 1, 1, 1 }; +/** + * C_union_A_B_pattern: + * 1 _ 3 1 + * _ 4 2 _ + * _ 1 6 7 + * 5 _ 8 1 + */ +static const std::vector< size_t > I_C_union_A_B_pattern { 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3 }; +static const std::vector< size_t > J_C_union_A_B_pattern { 0, 2, 3, 1, 2, 1, 2, 3, 0, 2, 3 }; +static const std::vector< int > V_C_union_A_B_pattern { 1, 3, 1, 4, 2, 1, 6, 7, 5, 8, 1 }; + +/** + * C_union_A_pattern_B: + * 9 __ 1 10 + * __ 11 1 __ + * __ 12 1 1 + * 1 __ 14 13 + */ +static const std::vector< size_t > I_C_union_A_pattern_B { 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3 }; +static const std::vector< size_t > J_C_union_A_pattern_B { 0, 2, 3, 1, 2, 1, 2, 3, 0, 2, 3 }; +static const std::vector< int > V_C_union_A_pattern_B { 9, 1, 10, 11, 1, 12, 1, 1, 1, 14, 13 }; + +/** + * C_union_A_pattern_B_pattern: + * 1 _ 1 1 + * _ 1 1 _ + * _ 1 1 1 + * 1 _ 1 1 + */ +static const std::vector< size_t > I_C_union_A_pattern_B_pattern { 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3 }; +static const std::vector< size_t > J_C_union_A_pattern_B_pattern { 0, 2, 3, 1, 2, 1, 2, 3, 0, 2, 3 }; +static const std::vector< int > V_C_union_A_pattern_B_pattern { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; // helper function to check internal data structures // of the reference backend template< typename T > -void checkCRSandCCS( const grb::Matrix< T > &C, - const size_t n, - const size_t * rlens, - const size_t * clens, - const size_t * I, - const size_t * J, - const double * expect_CRS, - const double * expect_CCS, - grb::RC &rc -) { - // check CRS output - const auto &crs1 = grb::internal::getCRS( C ); - for( size_t i = 0; i < n; ++i ) { - const size_t entries = crs1.col_start[ i + 1 ] - crs1.col_start[ i ]; - if( entries != rlens[ i ] ) { - std::cerr << "Error: unexpected number of entries " << entries << ", " - << " expected " << rlens[ i ] << " (CRS).\n"; - rc = grb::FAILED; - } - for( size_t k = crs1.col_start[ i ]; k < crs1.col_start[ i + 1 ]; ++k ) { - if( crs1.row_index[ k ] != J[ k ] ) { - std::cerr << "Error: unexpected entry at ( " << i << ", " - << crs1.row_index[ k ] << " ), " - << "expected one at ( " << i << ", " << J[ k ] << " ) " - << "instead (CRS).\n"; - rc = grb::FAILED; - } - if( crs1.values[ k ] != expect_CRS[ k ] ) { - std::cerr << "Error: unexpected value " << crs1.values[ k ] << "; " - << "expected " << expect_CRS[ k ] << " (CRS).\n"; - rc = grb::FAILED; +void checkCRSandCCS( const grb::Matrix< T > & obtained, const grb::Matrix< T > & expected, grb::RC & rc ) { + { // check CRS output + const auto & crsObtained = grb::internal::getCRS( obtained ); + const auto & 
crsExpected = grb::internal::getCRS( expected ); + for( size_t i = 0; i < grb::nrows( obtained ); ++i ) { + for( size_t k = crsObtained.col_start[ i ]; k < crsObtained.col_start[ i + 1 ]; ++k ) { + if( crsObtained.row_index[ k ] != crsExpected.row_index[ k ] ) { + std::cerr << "Error: unexpected entry at ( " << i << ", " << crsObtained.row_index[ k ] << " ), " + << "expected one at ( " << i << ", " << crsExpected.row_index[ k ] << " ) " + << "instead (CRS).\n"; + rc = grb::FAILED; + } + if( crsObtained.values[ k ] != crsExpected.values[ k ] ) { + std::cerr << "Error: unexpected value " << crsObtained.values[ k ] << "; " + << "expected " << crsExpected.values[ k ] << " (CRS).\n"; + rc = grb::FAILED; + } } } } - // check CCS output - const auto &ccs1 = grb::internal::getCCS( C ); - for( size_t j = 0; j < n; ++j ) { - const size_t entries = ccs1.col_start[ j + 1 ] - ccs1.col_start[ j ]; - if( entries != clens[ j ] ) { - std::cerr << "Error: unexpected number of entries " << entries << ", " - << "expected " << clens[ j ] << " (CCS).\n"; - rc = grb::FAILED; - } - for( size_t k = ccs1.col_start[ j ]; k < ccs1.col_start[ j + 1 ]; ++k ) { - if( ccs1.row_index[ k ] != I[ k ] ) { - std::cerr << "Error: unexpected entry at " - << "( " << ccs1.row_index[ k ] << ", " << j << " ), " - << "expected one at ( " << I[ k ] << ", " << j << " ) " - << "instead (CCS).\n"; - rc = grb::FAILED; - } - if( ccs1.values[ k ] != expect_CCS[ k ] ) { - std::cerr << "Error: unexpected value " << ccs1.values[ k ] << "; " - << "expected " << expect_CCS[ k ] << " (CCS).\n"; - rc = grb::FAILED; + { // check CCS output + const auto & ccsObtained = grb::internal::getCCS( obtained ); + const auto & ccsExpected = grb::internal::getCCS( expected ); + for( size_t j = 0; j < grb::ncols( obtained ); ++j ) { + for( size_t k = ccsExpected.col_start[ j ]; k < ccsExpected.col_start[ j + 1 ]; ++k ) { + if( ccsObtained.row_index[ k ] != ccsExpected.row_index[ k ] ) { + std::cerr << "Error: unexpected entry at " + << "( " << ccsObtained.row_index[ k ] << ", " << j << " ), " + << "expected one at ( " << ccsExpected.row_index[ k ] << ", " << j << " ) " + << "instead (CCS).\n"; + rc = grb::FAILED; + } + if( ccsObtained.values[ k ] != ccsExpected.values[ k ] ) { + std::cerr << "Error: unexpected value " << ccsObtained.values[ k ] << "; " + << "expected " << ccsExpected.values[ k ] << " (CCS).\n"; + rc = grb::FAILED; + } } } } } -void grbProgram( const void *, const size_t, grb::RC &rc ) { +void grbProgram( const void *, const size_t, grb::RC & rc ) { // initialize test - grb::Monoid< grb::operators::mul< double >, grb::identities::one > mulmono; + grb::Monoid< grb::operators::mul< int >, grb::identities::one > mulmono; const size_t n = 4; const size_t nelts_A = 8; const size_t nelts_B = 6; - grb::Matrix< double > A( n, n ); - grb::Matrix< double > B( n, n ); + grb::Matrix< int > A( n, n ); + grb::Matrix< int > B( n, n ); grb::Matrix< void > A_pattern( n, n ); grb::Matrix< void > B_pattern( n, n ); - grb::Matrix< double > C( n, n ); + grb::Matrix< int > C( n, n ); rc = grb::resize( A, nelts_A ); if( rc == grb::SUCCESS ) { - rc = grb::buildMatrixUnique( A, I_A, J_A, V_A, nelts_A, grb::SEQUENTIAL ); + rc = grb::buildMatrixUnique( A, I_A.data(), J_A.data(), V_A.data(), nelts_A, grb::SEQUENTIAL ); } if( rc == grb::SUCCESS ) { rc = grb::resize( B, nelts_B ); } if( rc == grb::SUCCESS ) { - rc = grb::buildMatrixUnique( B, I_B, J_B, V_B, nelts_B, grb::SEQUENTIAL ); + rc = grb::buildMatrixUnique( B, I_B.data(), J_B.data(), V_B.data(), nelts_B, 
grb::SEQUENTIAL ); } if( rc == grb::SUCCESS ) { rc = grb::resize( A_pattern, nelts_A ); } if( rc == grb::SUCCESS ) { - rc = grb::buildMatrixUnique( A_pattern, I_A, J_A, nelts_A, grb::SEQUENTIAL ); + rc = grb::buildMatrixUnique( A_pattern, I_A.data(), J_A.data(), nelts_A, grb::SEQUENTIAL ); } if( rc == grb::SUCCESS ) { rc = grb::resize( B_pattern, nelts_B ); } if( rc == grb::SUCCESS ) { - rc = grb::buildMatrixUnique( B_pattern, I_B, J_B, nelts_B, grb::SEQUENTIAL ); + rc = grb::buildMatrixUnique( B_pattern, I_B.data(), J_B.data(), nelts_B, grb::SEQUENTIAL ); } if( rc != grb::SUCCESS ) { std::cerr << "\tinitialisation FAILED\n"; return; } - // test 1: compute with the monoid mxm_elementwise - std::cout << "\t Verifying the monoid version of mxm_elementwise, " - << "A and B value matrices\n"; - rc = grb::eWiseApply( C, A, B, mulmono, grb::RESIZE ); - rc = rc ? rc : grb::eWiseApply( C, A, B, mulmono ); - if( rc != grb::SUCCESS ) { - std::cerr << "Call to grb::eWiseApply FAILED\n"; - return; - } - - checkCRSandCCS( C, n, rowlens, collens, I_C, J_C, expect1_CRS, expect1_CCS, rc ); - - if( rc != grb::SUCCESS ) { - return; - } + printSparseMatrix( A, "A" ); + printSparseMatrix( B, "B" ); + + { // test 1: compute with the monoid mxm_elementwise + std::cout << "\t Verifying the monoid version of mxm_elementwise, " + << "A and B value matrices\n"; + rc = grb::eWiseApply( C, A, B, mulmono, grb::RESIZE ); + rc = rc ? rc : grb::eWiseApply( C, A, B, mulmono ); + printSparseMatrix( C, "eWiseApply( C, A, B, mulmono )" ); + if( rc != grb::SUCCESS ) { + std::cerr << "Call to grb::eWiseApply FAILED\n"; + return; + } + grb::Matrix< int > union_A_B( n, n ); + grb::buildMatrixUnique( union_A_B, I_C_union.data(), J_C_union.data(), V_C_union_A_B.data(), I_C_union.size(), grb::SEQUENTIAL ); + checkCRSandCCS( C, union_A_B, rc ); - // test 2: compute with the monoid mxm_elementwise, A value matrix, B pattern matrix \n"; - std::cout << "\t Verifying the monoid version of mxm_elementwise, " - << "A value matrix, B pattern matrix\n"; - rc = grb::eWiseApply( C, A, B_pattern, mulmono, grb::RESIZE ); - rc = rc ? rc : grb::eWiseApply( C, A, B_pattern, mulmono ); - if( rc != grb::SUCCESS ) { - std::cerr << "Call to grb::eWiseApply FAILED\n"; - return; + if( rc != grb::SUCCESS ) { + return; + } } - checkCRSandCCS( C, n, rowlens, collens, I_C, J_C, expect2_CRS, expect2_CCS, rc ); - - if( rc != grb::SUCCESS ) { - return; - } + { // test 2: compute with the monoid mxm_elementwise, A value matrix, B pattern matrix \n"; + std::cout << "\t Verifying the monoid version of mxm_elementwise, " + << "A value matrix, B pattern matrix\n"; + rc = grb::eWiseApply( C, A, B_pattern, mulmono, grb::RESIZE ); + rc = rc ? rc : grb::eWiseApply( C, A, B_pattern, mulmono ); + printSparseMatrix( C, "eWiseApply( C, A, B_pattern, mulmono )" ); + if( rc != grb::SUCCESS ) { + std::cerr << "Call to grb::eWiseApply FAILED\n"; + return; + } + grb::Matrix< int > union_A_B_pattern( n, n ); + grb::buildMatrixUnique( union_A_B_pattern, I_C_union_A_B_pattern.data(), J_C_union_A_B_pattern.data(), V_C_union_A_B_pattern.data(), I_C_union_A_B_pattern.size(), grb::SEQUENTIAL ); + checkCRSandCCS( C, union_A_B_pattern, rc ); - // test 3: compute with the monoid mxm_elementwise, A pattern matrix, B value matrix \n"; - std::cout << "\t Verifying the monoid version of mxm_elementwise, " - << "A pattern matrix, B value matrix\n"; - rc = grb::eWiseApply( C, A_pattern, B, mulmono, grb::RESIZE ); - rc = rc ? 
rc : grb::eWiseApply( C, A_pattern, B, mulmono ); - if( rc != grb::SUCCESS ) { - std::cerr << "Call to grb::eWiseApply FAILED\n"; - return; + if( rc != grb::SUCCESS ) { + return; + } } - checkCRSandCCS( C, n, rowlens, collens, I_C, J_C, expect3_CRS, expect3_CCS, rc ); - - if( rc != grb::SUCCESS ) { - return; - } + { // test 3: compute with the monoid mxm_elementwise, A pattern matrix, B value matrix \n"; + std::cout << "\t Verifying the monoid version of mxm_elementwise, " + << "A pattern matrix, B value matrix\n"; + rc = grb::eWiseApply( C, A_pattern, B, mulmono, grb::RESIZE ); + rc = rc ? rc : grb::eWiseApply( C, A_pattern, B, mulmono ); + printSparseMatrix( C, "eWiseApply( C, A_pattern, B, mulmono )" ); + if( rc != grb::SUCCESS ) { + std::cerr << "Call to grb::eWiseApply FAILED\n"; + return; + } + grb::Matrix< int > union_A_pattern_B( n, n ); + grb::buildMatrixUnique( union_A_pattern_B, I_C_union_A_pattern_B.data(), J_C_union_A_pattern_B.data(), V_C_union_A_pattern_B.data(), I_C_union_A_pattern_B.size(), grb::SEQUENTIAL ); + checkCRSandCCS( C, union_A_pattern_B, rc ); - // test 4: compute with the monoid mxm_elementwise, A pattern matrix, B pattern matrix \n"; - std::cout << "\t Verifying the monoid version of mxm_elementwise, " - << "A pattern matrix, B pattern matrix\n"; - rc = grb::eWiseApply( C, A_pattern, B_pattern, mulmono, grb::RESIZE ); - rc = rc ? rc : grb::eWiseApply( C, A_pattern, B_pattern, mulmono ); - if( rc != grb::SUCCESS ) { - std::cerr << "Call to grb::eWiseApply FAILED\n"; - return; + if( rc != grb::SUCCESS ) { + return; + } } - checkCRSandCCS( C, n, rowlens, collens, I_C, J_C, expect4_CRS, expect4_CCS, rc ); + { // test 4: compute with the monoid mxm_elementwise, A pattern matrix, B pattern matrix \n"; + std::cout << "\t Verifying the monoid version of mxm_elementwise, " + << "A pattern matrix, B pattern matrix\n"; + rc = grb::eWiseApply( C, A_pattern, B_pattern, mulmono, grb::RESIZE ); + rc = rc ? rc : grb::eWiseApply( C, A_pattern, B_pattern, mulmono ); + printSparseMatrix( C, "eWiseApply( C, A_pattern, B_pattern, mulmono )" ); + if( rc != grb::SUCCESS ) { + std::cerr << "Call to grb::eWiseApply FAILED\n"; + return; + } + grb::Matrix< int > union_A_pattern_B_pattern( n, n ); + grb::buildMatrixUnique( union_A_pattern_B_pattern, I_C_union_A_pattern_B_pattern.data(), J_C_union_A_pattern_B_pattern.data(), V_C_union_A_pattern_B_pattern.data(), I_C_union_A_pattern_B_pattern.size(), grb::SEQUENTIAL ); + checkCRSandCCS( C, union_A_pattern_B_pattern, rc ); - if( rc != grb::SUCCESS ) { - return; + if( rc != grb::SUCCESS ) { + return; + } } - // test 5: compute with the operator mxm_elementwise (pattern matrices not allowed) \n"; - std::cout << "\t Verifying the operator version of mxm_elementwise " - << "(only value matrices)\n"; - rc = grb::eWiseApply( C, A, B, mulmono.getOperator(), grb::RESIZE ); - rc = rc ? rc : grb::eWiseApply( C, A, B, mulmono.getOperator() ); - if( rc != grb::SUCCESS ) { - std::cerr << "Call to grb::eWiseApply FAILED\n"; - return; + { // test 5: compute with the operator mxm_elementwise (pattern matrices not allowed) \n"; + std::cout << "\t Verifying the operator version of mxm_elementwise " + << "(only value matrices)\n"; + rc = grb::eWiseApply( C, A, B, mulmono.getOperator(), grb::RESIZE ); + rc = rc ? 
rc : grb::eWiseApply( C, A, B, mulmono.getOperator() ); + printSparseMatrix( C, "eWiseApply( C, A, B, mulmono.getOperator() )" ); + if( rc != grb::SUCCESS ) { + std::cerr << "Call to grb::eWiseApply FAILED\n"; + return; + } + grb::Matrix< int > intersection_A_B( n, n ); + grb::buildMatrixUnique( intersection_A_B, I_C_intersection.data(), J_C_intersection.data(), V_C_intersection.data(), I_C_intersection.size(), grb::SEQUENTIAL ); + checkCRSandCCS( C, intersection_A_B, rc ); + if( rc != grb::SUCCESS ) { + return; + } } - - checkCRSandCCS( C, n, rowlens, collens, I_C, J_C, expect1_CRS, expect1_CCS, rc ); } int main( int argc, char ** argv ) { @@ -252,4 +345,3 @@ int main( int argc, char ** argv ) { // done return 0; } - From a7bf8dfb8380719c3031cc7e212fedd376a46ebc Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Mon, 3 Jul 2023 14:50:59 +0200 Subject: [PATCH 05/37] Stackless implementation of the operator variant --- include/graphblas/reference/blas3.hpp | 348 ++++++++++------------- tests/unit/eWiseApplyMatrixReference.cpp | 68 ++++- 2 files changed, 214 insertions(+), 202 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index e77478564..228c2122b 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -929,34 +929,31 @@ namespace grb { * \a allow_void is true; otherwise, will be ignored. * \endinternal */ - template< - bool allow_void, + template< bool allow_void, Descriptor descr, class Operator, - typename OutputType, typename InputType1, typename InputType2, - typename RIT1, typename CIT1, typename NIT1, - typename RIT2, typename CIT2, typename NIT2, - typename RIT3, typename CIT3, typename NIT3 - > - RC eWiseApply_matrix_generic_intersection( - Matrix< OutputType, reference, RIT1, CIT1, NIT1 > &C, - const Matrix< InputType1, reference, RIT2, CIT2, NIT2 > &A, - const Matrix< InputType2, reference, RIT3, CIT3, NIT3 > &B, - const Operator &oper, - const Phase &phase, - const typename std::enable_if< - !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_operator< Operator >::value, - void >::type * const = nullptr - ) { - assert( !(descr & descriptors::force_row_major ) ); - static_assert( allow_void || - ( !( - std::is_same< InputType1, void >::value || - std::is_same< InputType2, void >::value - ) ), + typename OutputType, + typename InputType1, + typename InputType2, + typename RIT1, + typename CIT1, + typename NIT1, + typename RIT2, + typename CIT2, + typename NIT2, + typename RIT3, + typename CIT3, + typename NIT3 > + RC eWiseApply_matrix_generic_intersection( Matrix< OutputType, reference, RIT1, CIT1, NIT1 > & C, + const Matrix< InputType1, reference, RIT2, CIT2, NIT2 > & A, + const Matrix< InputType2, reference, RIT3, CIT3, NIT3 > & B, + const Operator & oper, + const Phase & phase, + const typename std::enable_if< ! grb::is_object< OutputType >::value && ! grb::is_object< InputType1 >::value && ! grb::is_object< InputType2 >::value && + grb::is_operator< Operator >::value, + void >::type * const = nullptr ) { + assert( ! ( descr & descriptors::force_row_major ) ); + static_assert( allow_void || ( ! 
( std::is_same< InputType1, void >::value || std::is_same< InputType2, void >::value ) ), "grb::internal::eWiseApply_matrix_generic_intersection: the non-monoid version of " "elementwise mxm can only be used if neither of the input matrices " "is a pattern matrix (of type void)" ); @@ -965,212 +962,167 @@ namespace grb { #ifdef _DEBUG std::cout << "In grb::internal::eWiseApply_matrix_generic_intersection\n"; #endif - // get whether the matrices should be transposed prior to execution - constexpr bool trans_left = descr & descriptors::transpose_left; - constexpr bool trans_right = descr & descriptors::transpose_right; + RC rc = SUCCESS; - // run-time checks - const size_t m = grb::nrows( C ); - const size_t n = grb::ncols( C ); - const size_t m_A = !trans_left ? grb::nrows( A ) : grb::ncols( A ); - const size_t n_A = !trans_left ? grb::ncols( A ) : grb::nrows( A ); - const size_t m_B = !trans_right ? grb::nrows( B ) : grb::ncols( B ); - const size_t n_B = !trans_right ? grb::ncols( B ) : grb::nrows( B ); + if( grb::nnz( B ) == 0 || grb::nnz( A ) == 0 ) { + return rc; + } - if( m != m_A || m != m_B || n != n_A || n != n_B ) { + const auto & A_raw = descr & grb::descriptors::transpose_left ? internal::getCCS( A ) : internal::getCRS( A ); + const auto & B_raw = descr & grb::descriptors::transpose_right ? internal::getCCS( B ) : internal::getCRS( B ); + const auto & C_crs_raw = internal::getCRS( C ); + const auto & C_ccs_raw = internal::getCCS( C ); + const size_t m_A = descr & grb::descriptors::transpose_left || descr & grb::descriptors::transpose_left ? ncols( A ) : nrows( A ); + const size_t n_A = descr & grb::descriptors::transpose_left || descr & grb::descriptors::transpose_left ? nrows( A ) : ncols( A ); + const size_t m_B = descr & grb::descriptors::transpose_right || descr & grb::descriptors::transpose_matrix ? ncols( B ) : nrows( B ); + const size_t n_B = descr & grb::descriptors::transpose_right || descr & grb::descriptors::transpose_matrix ? nrows( B ) : ncols( B ); + const size_t m_C = nrows( C ); + const size_t n_C = ncols( C ); + + // Check mask dimensions + if( m_A != m_B || n_A != n_B || m_A != m_C || n_A != n_C ) { +#ifdef _DEBUG + std::cout << "Dimensions of matrices do not match!\n"; +#endif return MISMATCH; } - const auto &A_raw = !trans_left ? - internal::getCRS( A ) : - internal::getCCS( A ); - const auto &B_raw = !trans_right ? 
- internal::getCRS( B ) : - internal::getCCS( B ); - auto &C_raw = internal::getCRS( C ); - auto &CCS_raw = internal::getCCS( C ); - - // retrieve buffers - char * arr1, * arr2, * arr3, * buf1, * buf2, * buf3; - arr1 = arr2 = buf1 = buf2 = nullptr; - InputType1 * vbuf1 = nullptr; - InputType2 * vbuf2 = nullptr; - OutputType * valbuf = nullptr; - internal::getMatrixBuffers( arr1, buf1, vbuf1, 1, A ); - internal::getMatrixBuffers( arr2, buf2, vbuf2, 1, B ); - internal::getMatrixBuffers( arr3, buf3, valbuf, 1, C ); - // end buffer retrieval - - // initialisations - internal::Coordinates< reference > coors1, coors2; - coors1.set( arr1, false, buf1, n ); - coors2.set( arr2, false, buf2, n ); -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel - { - size_t start, end; - config::OMP::localRange( start, end, 0, n + 1 ); -#else - const size_t start = 0; - const size_t end = n + 1; -#endif - for( size_t j = start; j < end; ++j ) { - CCS_raw.col_start[ j ] = 0; - } + if( phase == Phase::RESIZE ) { + size_t nzc = 0; #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - } +#pragma omp parallel for reduction( + : nzc ) default( none ) shared( B_raw, A_raw ) firstprivate( m_A ) #endif - // end initialisations + for( size_t i = 0; i < m_A; ++i ) { + auto B_k = B_raw.col_start[ i ]; + for( auto A_k = A_raw.col_start[ i ]; A_k < A_raw.col_start[ i + 1 ]; ++A_k ) { + const auto j = A_raw.row_index[ A_k ]; - // nonzero count - size_t nzc = 0; - - // symbolic phase - if( phase == RESIZE ) { - for( size_t i = 0; i < m; ++i ) { - coors1.clear(); - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; - coors1.assign( k_col ); - } - for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { - const size_t l_col = B_raw.row_index[ l ]; - if( coors1.assigned( l_col ) ) { - (void)++nzc; + while( B_k < B_raw.col_start[ i + 1 ] && B_raw.row_index[ B_k ] > j ) { + B_k++; + } + if( B_k >= B_raw.col_start[ i + 1 ] ) { + break; + } + if( B_raw.row_index[ B_k ] == j ) { + nzc += 1; } } } - - const RC ret = grb::resize( C, nzc ); - if( ret != SUCCESS ) { - return ret; - } +#ifdef _DEBUG + std::cout << "RESIZE phase: resize( C, " << nzc << " )\n"; +#endif + return resize( C, nzc ); } - // computational phase - if( phase == EXECUTE ) { - // retrieve additional buffer - config::NonzeroIndexType * const C_col_index = internal::template - getReferenceBuffer< typename config::NonzeroIndexType >( n + 1 ); + const size_t nzc = capacity( C ); - // perform column-wise nonzero count - for( size_t i = 0; i < m; ++i ) { - coors1.clear(); - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; - coors1.assign( k_col ); + C_crs_raw.col_start[ 0 ] = 0; + C_ccs_raw.col_start[ 0 ] = 0; + // Prefix sum computation into L.CRS.col_start +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 +#pragma omp parallel for default( none ) shared( B_raw, A_raw, C_crs_raw, std::cout ) firstprivate( m_A ) +#endif + for( size_t i = 0; i < m_A; i++ ) { + auto B_k = B_raw.col_start[ i ]; + size_t cumul = 0UL; + for( auto A_k = A_raw.col_start[ i ]; A_k < A_raw.col_start[ i + 1 ]; ++A_k ) { + const auto j = A_raw.row_index[ A_k ]; + + while( B_k < B_raw.col_start[ i + 1 ] && B_raw.row_index[ B_k ] > j ) { + B_k++; } - for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { - const size_t l_col = B_raw.row_index[ l ]; - if( coors1.assigned( l_col ) ) { - (void) ++nzc; - (void) ++CCS_raw.col_start[ l_col + 1 ]; - } + if( 
B_k >= B_raw.col_start[ i + 1 ]) { + break; + } + if( B_raw.row_index[ B_k ] == j ) { + cumul += 1; } } + C_crs_raw.col_start[ i + 1 ] = cumul; + } - // check capacity - if( nzc > capacity( C ) ) { + // Print the CRS prefix sum #ifdef _DEBUG - std::cout << "\t detected insufficient capacity " - << "for requested operation\n"; + std::cout << "CRS prefix sum: "; + for( size_t i = 0; i <= m_A; i++ ) { + std::cout << C_crs_raw.col_start[ i ] << " "; + } + std::cout << "\n"; #endif - const RC clear_rc = clear( C ); - if( clear_rc != SUCCESS ) { - return PANIC; - } else { - return FAILED; - } - } - // prefix sum for CCS_raw.col_start - assert( CCS_raw.col_start[ 0 ] == 0 ); - for( size_t j = 1; j < n; ++j ) { - CCS_raw.col_start[ j + 1 ] += CCS_raw.col_start[ j ]; - } - assert( CCS_raw.col_start[ n ] == nzc ); + // Apply the prefix sum + for( size_t i = 1; i <= m_A; i++ ) { + C_crs_raw.col_start[ i ] += C_crs_raw.col_start[ i - 1 ]; + C_ccs_raw.col_start[ i ] = C_crs_raw.col_start[ i ]; + } - // set C_col_index to all zero -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel - { - size_t start, end; - config::OMP::localRange( start, end, 0, n ); -#else - const size_t start = 0; - const size_t end = n; + // Check if the number of nonzeros is greater than the capacity + if( C_crs_raw.col_start[ m_A ] > nzc || C_ccs_raw.col_start[ m_A ] > nzc ) { +#ifdef _DEBUG + std::cout << "EXECUTE phase: detected insufficient capacity for requested operation.\n" + << "Requested " << C_crs_raw.col_start[ m_A ] << " nonzeros, but capacity is " << nzc << "\n"; #endif - for( size_t j = start; j < end; ++j ) { - C_col_index[ j ] = 0; - } + return RC::MISMATCH; + } + #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - } +#pragma omp parallel for simd #endif + for( size_t i = 0; i < m_A; i++ ) + C_crs_raw.row_index[ i ] = C_ccs_raw.row_index[ i ] = 0; - // do computations - size_t nzc = 0; - C_raw.col_start[ 0 ] = 0; - for( size_t i = 0; i < m; ++i ) { - coors1.clear(); - coors2.clear(); -#ifdef _DEBUG - std::cout << "\t The elements "; -#endif - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; - coors1.assign( k_col ); - valbuf[ k_col ] = A_raw.values[ k ]; -#ifdef _DEBUG - std::cout << "A( " << i << ", " << k_col << " ) = " << A_raw.values[ k ] << ", "; -#endif - } -#ifdef _DEBUG - std::cout << "are multiplied pairwise with "; + RC local_rc = rc; +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 +#pragma omp parallel default( none ) shared( C_ccs_raw, C_crs_raw, A_raw, B_raw, rc, std::cout ) firstprivate( local_rc, m_A, oper ) #endif - for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { - const size_t l_col = B_raw.row_index[ l ]; - if( coors1.assigned( l_col ) ) { - coors2.assign( l_col ); - (void)grb::apply( valbuf[ l_col ], valbuf[ l_col ], B_raw.values[ l ], oper ); -#ifdef _DEBUG - std::cout << "B( " << i << ", " << l_col << " ) = " << B_raw.values[ l ] - << " to yield C( " << i << ", " << l_col << " ), "; + { + size_t start_row = 0; + size_t end_row = m_A; +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + config::OMP::localRange( start_row, end_row, 0, m_A ); #endif + + for( auto i = start_row; i < end_row; ++i ) { + auto B_k = B_raw.col_start[ i ]; + auto C_k = C_crs_raw.col_start[ i ]; + for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const auto j = A_raw.row_index[ k ]; + while( B_k < B_raw.col_start[ i + 1 ] && B_raw.row_index[ B_k ] > j ) { + B_k++; } - } + if( B_k >= B_raw.col_start[ i + 1 ] ) { + break; + } + if( 
B_raw.row_index[ B_k ] != j ) { + continue; + } + + const auto a_val = A_raw.values[ k ]; + const auto b_val = B_raw.values[ B_k ]; + OutputType c_val; + local_rc = local_rc ? local_rc : grb::apply< descr >( c_val, a_val, b_val, oper ); + + C_crs_raw.row_index[ C_k ] = j; + C_crs_raw.values[ C_k ] = c_val; + C_ccs_raw.row_index[ C_k ] = i; + C_ccs_raw.values[ C_k ] = c_val; #ifdef _DEBUG - std::cout << "\n"; + std::cout << "A( " + std::to_string( i ) + ";" + std::to_string( j ) + " ) = " + std::to_string( a_val ) + "\n"; + std::cout << "B( " + std::to_string( i ) + ";" + std::to_string( j ) + " ) = " + std::to_string( b_val ) + "\n"; + std::cerr << "C( " + std::to_string( i ) + ";" + std::to_string( C_crs_raw.row_index[ C_k ] ) + " ) = " + std::to_string( c_val ) + "\n"; #endif - for( size_t k = 0; k < coors2.nonzeroes(); ++k ) { - const size_t j = coors2.index( k ); - // update CRS - C_raw.row_index[ nzc ] = j; - C_raw.setValue( nzc, valbuf[ j ] ); - // update CCS - const size_t CCS_index = C_col_index[ j ]++ + CCS_raw.col_start[ j ]; - CCS_raw.row_index[ CCS_index ] = i; - CCS_raw.setValue( CCS_index, valbuf[ j ] ); - // update count - (void)++nzc; + C_k += 1; } - C_raw.col_start[ i + 1 ] = nzc; -#ifdef _DEBUG - std::cout << "\n"; -#endif - } - -#ifndef NDEBUG - for( size_t j = 0; j < n; ++j ) { - assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); } +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 +#pragma omp critical #endif - - // set final number of nonzeroes in output matrix - internal::setCurrentNonzeroes( C, nzc ); + { rc = rc ? rc : local_rc; } } - // done - return SUCCESS; + internal::setCurrentNonzeroes( C, C_crs_raw.col_start[ m_A ] ); + + return rc; } /** @@ -1356,7 +1308,7 @@ namespace grb { // set C_col_index to all zero #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel for simd + #pragma omp parallel for simd #endif for( size_t j = 0; j < n; j++ ) { C_col_index[ j ] = 0; @@ -1421,7 +1373,7 @@ namespace grb { C_raw.col_start[ i + 1 ] = nzc; } -#ifdef _DEBUG +#ifdef _DEBUG std::cout << "CCS_raw.col_start = [ "; for( size_t j = 0; j <= n; ++j ) std::cout << CCS_raw.col_start[ j ] << " "; diff --git a/tests/unit/eWiseApplyMatrixReference.cpp b/tests/unit/eWiseApplyMatrixReference.cpp index 18f98df0d..27bbe93fb 100644 --- a/tests/unit/eWiseApplyMatrixReference.cpp +++ b/tests/unit/eWiseApplyMatrixReference.cpp @@ -22,8 +22,6 @@ #include -#define _DEBUG - template< class Iterator > void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) { #ifndef _DEBUG @@ -59,6 +57,45 @@ void printSparseMatrix( const grb::Matrix< D > & mat, const std::string & name = printSparseMatrixIterator( grb::nrows( mat ), grb::ncols( mat ), mat.cbegin(), mat.cend(), name, os ); } +template< class Storage, typename D > +void printCompressedStorage( const Storage& storage, const grb::Matrix< D > & mat, std::ostream & os = std::cout ) { + os << " row_index: [ "; + for( size_t i = 0; i < grb::nrows( mat ); ++i ) { + os << storage.row_index[ i ] << " "; + } + os << "]" << std::endl; + os << " col_start: [ "; + for( size_t i = 0; i <= grb::nrows( mat ); ++i ) { + os << storage.col_start[ i ] << " "; + } + os << "]" << std::endl; + os << " values: [ "; + for( size_t i = 0; i < grb::nnz( mat ); ++i ) { + os << storage.values[ i ] << " "; + } + os << "]" << std::endl << std::flush; +} + +template< typename D > +void printCRS( const grb::Matrix< D > & mat, const std::string & label = "", 
std::ostream & os = std::cout ) { +#ifndef _DEBUG + return; +#endif + grb::wait( mat ); + os << "CRS \"" << label << "\" (" << grb::nrows( mat ) << "x" << grb::ncols( mat ) << "):" << std::endl; + printCompressedStorage( grb::internal::getCRS( mat ), mat, os ); +} + +template< typename D > +void printCCS( const grb::Matrix< D > & mat, const std::string & label = "", std::ostream & os = std::cout ) { +#ifndef _DEBUG + return; +#endif + grb::wait( mat ); + os << "CCS \"" << label << "\" (" << grb::nrows( mat ) << "x" << grb::ncols( mat ) << "):" << std::endl; + printCompressedStorage( grb::internal::getCCS( mat ), mat, os ); +} + // static data corresponding to small matrices /** @@ -137,12 +174,22 @@ static const std::vector< int > V_C_union_A_pattern_B { 9, 1, 10, 11, 1, 12, 1, */ static const std::vector< size_t > I_C_union_A_pattern_B_pattern { 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3 }; static const std::vector< size_t > J_C_union_A_pattern_B_pattern { 0, 2, 3, 1, 2, 1, 2, 3, 0, 2, 3 }; -static const std::vector< int > V_C_union_A_pattern_B_pattern { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; +static const std::vector< int > V_C_union_A_pattern_B_pattern { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; // helper function to check internal data structures // of the reference backend template< typename T > void checkCRSandCCS( const grb::Matrix< T > & obtained, const grb::Matrix< T > & expected, grb::RC & rc ) { + printCRS( obtained, "obtained" ); + printCRS( expected, "expected" ); + + if( grb::nnz( obtained ) != grb::nnz( expected ) ) { + std::cerr << "Error: unexpected number of non-zero entries; " + << "expected " << grb::nnz( expected ) << ", " + << "obtained " << grb::nnz( obtained ) << ".\n"; + rc = grb::FAILED; + } + { // check CRS output const auto & crsObtained = grb::internal::getCRS( obtained ); const auto & crsExpected = grb::internal::getCRS( expected ); @@ -163,6 +210,9 @@ void checkCRSandCCS( const grb::Matrix< T > & obtained, const grb::Matrix< T > & } } + printCCS( obtained, "obtained" ); + printCCS( expected, "expected" ); + { // check CCS output const auto & ccsObtained = grb::internal::getCCS( obtained ); const auto & ccsExpected = grb::internal::getCCS( expected ); @@ -228,11 +278,16 @@ void grbProgram( const void *, const size_t, grb::RC & rc ) { } printSparseMatrix( A, "A" ); + printCRS( A, "A" ); + printCCS( A, "A" ); printSparseMatrix( B, "B" ); + printCRS( B, "B" ); + printCCS( B, "B" ); { // test 1: compute with the monoid mxm_elementwise std::cout << "\t Verifying the monoid version of mxm_elementwise, " << "A and B value matrices\n"; + grb::clear( C ); rc = grb::eWiseApply( C, A, B, mulmono, grb::RESIZE ); rc = rc ? rc : grb::eWiseApply( C, A, B, mulmono ); printSparseMatrix( C, "eWiseApply( C, A, B, mulmono )" ); @@ -252,6 +307,7 @@ void grbProgram( const void *, const size_t, grb::RC & rc ) { { // test 2: compute with the monoid mxm_elementwise, A value matrix, B pattern matrix \n"; std::cout << "\t Verifying the monoid version of mxm_elementwise, " << "A value matrix, B pattern matrix\n"; + grb::clear( C ); rc = grb::eWiseApply( C, A, B_pattern, mulmono, grb::RESIZE ); rc = rc ? 
rc : grb::eWiseApply( C, A, B_pattern, mulmono ); printSparseMatrix( C, "eWiseApply( C, A, B_pattern, mulmono )" ); @@ -271,6 +327,7 @@ void grbProgram( const void *, const size_t, grb::RC & rc ) { { // test 3: compute with the monoid mxm_elementwise, A pattern matrix, B value matrix \n"; std::cout << "\t Verifying the monoid version of mxm_elementwise, " << "A pattern matrix, B value matrix\n"; + grb::clear( C ); rc = grb::eWiseApply( C, A_pattern, B, mulmono, grb::RESIZE ); rc = rc ? rc : grb::eWiseApply( C, A_pattern, B, mulmono ); printSparseMatrix( C, "eWiseApply( C, A_pattern, B, mulmono )" ); @@ -290,6 +347,7 @@ void grbProgram( const void *, const size_t, grb::RC & rc ) { { // test 4: compute with the monoid mxm_elementwise, A pattern matrix, B pattern matrix \n"; std::cout << "\t Verifying the monoid version of mxm_elementwise, " << "A pattern matrix, B pattern matrix\n"; + grb::clear( C ); rc = grb::eWiseApply( C, A_pattern, B_pattern, mulmono, grb::RESIZE ); rc = rc ? rc : grb::eWiseApply( C, A_pattern, B_pattern, mulmono ); printSparseMatrix( C, "eWiseApply( C, A_pattern, B_pattern, mulmono )" ); @@ -298,7 +356,8 @@ void grbProgram( const void *, const size_t, grb::RC & rc ) { return; } grb::Matrix< int > union_A_pattern_B_pattern( n, n ); - grb::buildMatrixUnique( union_A_pattern_B_pattern, I_C_union_A_pattern_B_pattern.data(), J_C_union_A_pattern_B_pattern.data(), V_C_union_A_pattern_B_pattern.data(), I_C_union_A_pattern_B_pattern.size(), grb::SEQUENTIAL ); + grb::buildMatrixUnique( union_A_pattern_B_pattern, I_C_union_A_pattern_B_pattern.data(), J_C_union_A_pattern_B_pattern.data(), V_C_union_A_pattern_B_pattern.data(), + I_C_union_A_pattern_B_pattern.size(), grb::SEQUENTIAL ); checkCRSandCCS( C, union_A_pattern_B_pattern, rc ); if( rc != grb::SUCCESS ) { @@ -309,6 +368,7 @@ void grbProgram( const void *, const size_t, grb::RC & rc ) { { // test 5: compute with the operator mxm_elementwise (pattern matrices not allowed) \n"; std::cout << "\t Verifying the operator version of mxm_elementwise " << "(only value matrices)\n"; + grb::clear( C ); rc = grb::eWiseApply( C, A, B, mulmono.getOperator(), grb::RESIZE ); rc = rc ? 
rc : grb::eWiseApply( C, A, B, mulmono.getOperator() ); printSparseMatrix( C, "eWiseApply( C, A, B, mulmono.getOperator() )" ); From 5eb0b44383ff274ee3bc3e6ec968c44a65f0fe58 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Mon, 3 Jul 2023 14:51:14 +0200 Subject: [PATCH 06/37] Convert vector to c-like array for monoid variant --- include/graphblas/reference/blas3.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 228c2122b..1207daaf2 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -26,7 +26,6 @@ #include #include -#include #include "io.hpp" #include "matrix.hpp" @@ -1109,7 +1108,7 @@ namespace grb { #ifdef _DEBUG std::cout << "A( " + std::to_string( i ) + ";" + std::to_string( j ) + " ) = " + std::to_string( a_val ) + "\n"; std::cout << "B( " + std::to_string( i ) + ";" + std::to_string( j ) + " ) = " + std::to_string( b_val ) + "\n"; - std::cerr << "C( " + std::to_string( i ) + ";" + std::to_string( C_crs_raw.row_index[ C_k ] ) + " ) = " + std::to_string( c_val ) + "\n"; + std::cout << "C( " + std::to_string( i ) + ";" + std::to_string( C_crs_raw.row_index[ C_k ] ) + " ) = " + std::to_string( c_val ) + "\n"; #endif C_k += 1; } @@ -1316,12 +1315,10 @@ namespace grb { // do computations - std::vector< bool > columns( n, false ); + bool columns[ n ] = { false }; size_t nzc = 0; C_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { - std::fill( columns.begin(), columns.end(), false ); - #ifdef _DEBUG std::cout << " -- i: " << i << "\n"; #endif @@ -1354,7 +1351,7 @@ namespace grb { } } - for( size_t j_unsigned = columns.size() ; j_unsigned > 0 ; j_unsigned-- ) { + for( size_t j_unsigned = n ; j_unsigned > 0 ; j_unsigned-- ) { const size_t j = j_unsigned - 1; if( not columns[ j ] ) { continue; @@ -1371,6 +1368,9 @@ namespace grb { (void)++nzc; } C_raw.col_start[ i + 1 ] = nzc; + + for(size_t i=0; i Date: Wed, 12 Jul 2023 15:37:10 +0200 Subject: [PATCH 07/37] Style fixes --- include/graphblas/reference/blas3.hpp | 236 ++++++++++++------ tests/unit/eWiseApplyMatrixReference.cpp | 294 +++++++++-------------- tests/unit/eWiseApplyMatrix_variants.cpp | 287 +++++++++++----------- 3 files changed, 423 insertions(+), 394 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 1207daaf2..97c585118 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -928,53 +928,90 @@ namespace grb { * \a allow_void is true; otherwise, will be ignored. * \endinternal */ - template< bool allow_void, + template< + bool allow_void, Descriptor descr, class Operator, - typename OutputType, - typename InputType1, - typename InputType2, - typename RIT1, - typename CIT1, - typename NIT1, - typename RIT2, - typename CIT2, - typename NIT2, - typename RIT3, - typename CIT3, - typename NIT3 > - RC eWiseApply_matrix_generic_intersection( Matrix< OutputType, reference, RIT1, CIT1, NIT1 > & C, - const Matrix< InputType1, reference, RIT2, CIT2, NIT2 > & A, - const Matrix< InputType2, reference, RIT3, CIT3, NIT3 > & B, - const Operator & oper, - const Phase & phase, - const typename std::enable_if< ! grb::is_object< OutputType >::value && ! grb::is_object< InputType1 >::value && ! grb::is_object< InputType2 >::value && - grb::is_operator< Operator >::value, - void >::type * const = nullptr ) { - assert( ! 
( descr & descriptors::force_row_major ) ); - static_assert( allow_void || ( ! ( std::is_same< InputType1, void >::value || std::is_same< InputType2, void >::value ) ), - "grb::internal::eWiseApply_matrix_generic_intersection: the non-monoid version of " - "elementwise mxm can only be used if neither of the input matrices " - "is a pattern matrix (of type void)" ); + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 + > + RC eWiseApply_matrix_generic_intersection( + Matrix< OutputType, reference, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, reference, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, reference, RIT3, CIT3, NIT3 > &B, + const Operator &oper, + const Phase &phase, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< Operator >::value, + void + >::type * const = nullptr + ) { + assert( !( descr & descriptors::force_row_major ) ); + static_assert( allow_void || + ( !( + std::is_same< InputType1, void >::value + || std::is_same< InputType2, void >::value + ) + ), + "grb::internal::eWiseApply_matrix_generic_intersection: the non-monoid" + " version of elementwise mxm can only be used if neither of the input" + " matrices is a pattern matrix (of type void)" ); assert( phase != TRY ); + // get whether the matrices should be transposed prior to execution + constexpr bool trans_left = descr & descriptors::transpose_left; + constexpr bool trans_right = descr & descriptors::transpose_right; + #ifdef _DEBUG std::cout << "In grb::internal::eWiseApply_matrix_generic_intersection\n"; #endif - RC rc = SUCCESS; if( grb::nnz( B ) == 0 || grb::nnz( A ) == 0 ) { - return rc; +#ifdef _DEBUG + std::cout << "No nonzeros in input matrices, nothing to compute.\n"; +#endif + return SUCCESS; + } + + const auto &A_raw = trans_left + ? internal::getCCS( A ) + : internal::getCRS( A ); + const size_t m_A = trans_left + ? ncols( A ) + : nrows( A ); + const size_t n_A = trans_left + ? nrows( A ) + : ncols( A ); + if( m_A == 0 || n_A == 0 ) { +#ifdef _DEBUG + std::cout << "Matrix A is empty, nothing to compute.\n"; +#endif + return SUCCESS; + } + + const auto &B_raw = trans_right + ? internal::getCCS( B ) + : internal::getCRS( B ); + const size_t m_B = trans_right + ? ncols( B ) + : nrows( B ); + const size_t n_B = trans_right + ? nrows( B ) + : ncols( B ); + if( m_A == 0 || n_A == 0 ) { +#ifdef _DEBUG + std::cout << "Matrix B is empty, nothing to compute.\n"; +#endif + return SUCCESS; } - const auto & A_raw = descr & grb::descriptors::transpose_left ? internal::getCCS( A ) : internal::getCRS( A ); - const auto & B_raw = descr & grb::descriptors::transpose_right ? internal::getCCS( B ) : internal::getCRS( B ); - const auto & C_crs_raw = internal::getCRS( C ); - const auto & C_ccs_raw = internal::getCCS( C ); - const size_t m_A = descr & grb::descriptors::transpose_left || descr & grb::descriptors::transpose_left ? ncols( A ) : nrows( A ); - const size_t n_A = descr & grb::descriptors::transpose_left || descr & grb::descriptors::transpose_left ? nrows( A ) : ncols( A ); - const size_t m_B = descr & grb::descriptors::transpose_right || descr & grb::descriptors::transpose_matrix ? ncols( B ) : nrows( B ); - const size_t n_B = descr & grb::descriptors::transpose_right || descr & grb::descriptors::transpose_matrix ? 
nrows( B ) : ncols( B ); + auto &C_crs_raw = internal::getCRS( C ); + auto &C_ccs_raw = internal::getCCS( C ); const size_t m_C = nrows( C ); const size_t n_C = ncols( C ); @@ -986,17 +1023,27 @@ namespace grb { return MISMATCH; } + const auto A_identity = identities::zero< InputType1 >::value(); + const auto B_identity = identities::zero< InputType2 >::value(); + + RC rc = SUCCESS; if( phase == Phase::RESIZE ) { size_t nzc = 0; #ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel for reduction( + : nzc ) default( none ) shared( B_raw, A_raw ) firstprivate( m_A ) +#pragma omp parallel for reduction( + : nzc ) \ + default( none ) shared( B_raw, A_raw ) \ + firstprivate( m_A ) #endif for( size_t i = 0; i < m_A; ++i ) { auto B_k = B_raw.col_start[ i ]; - for( auto A_k = A_raw.col_start[ i ]; A_k < A_raw.col_start[ i + 1 ]; ++A_k ) { + const auto A_k_start = A_raw.col_start[ i ]; + const auto A_k_end = A_raw.col_start[ i + 1 ]; + for( auto A_k = A_k_start; A_k < A_k_end; ++A_k ) { const auto j = A_raw.row_index[ A_k ]; - while( B_k < B_raw.col_start[ i + 1 ] && B_raw.row_index[ B_k ] > j ) { + while( B_k < B_raw.col_start[ i + 1 ] + && B_raw.row_index[ B_k ] > j + ) { B_k++; } if( B_k >= B_raw.col_start[ i + 1 ] ) { @@ -1008,7 +1055,7 @@ namespace grb { } } #ifdef _DEBUG - std::cout << "RESIZE phase: resize( C, " << nzc << " )\n"; + std::cout << "resize( C, " << nzc << " )\n"; #endif return resize( C, nzc ); } @@ -1024,10 +1071,14 @@ namespace grb { for( size_t i = 0; i < m_A; i++ ) { auto B_k = B_raw.col_start[ i ]; size_t cumul = 0UL; - for( auto A_k = A_raw.col_start[ i ]; A_k < A_raw.col_start[ i + 1 ]; ++A_k ) { + const auto A_k_start = A_raw.col_start[ i ]; + const auto A_k_end = A_raw.col_start[ i + 1 ]; + for( auto A_k = A_k_start; A_k < A_k_end; ++A_k ) { const auto j = A_raw.row_index[ A_k ]; - while( B_k < B_raw.col_start[ i + 1 ] && B_raw.row_index[ B_k ] > j ) { + while( B_k < B_raw.col_start[ i + 1 ] + && B_raw.row_index[ B_k ] > j + ) { B_k++; } if( B_k >= B_raw.col_start[ i + 1 ]) { @@ -1040,8 +1091,8 @@ namespace grb { C_crs_raw.col_start[ i + 1 ] = cumul; } - // Print the CRS prefix sum #ifdef _DEBUG + // Print the CRS prefix sum std::cout << "CRS prefix sum: "; for( size_t i = 0; i <= m_A; i++ ) { std::cout << C_crs_raw.col_start[ i ] << " "; @@ -1058,21 +1109,25 @@ namespace grb { // Check if the number of nonzeros is greater than the capacity if( C_crs_raw.col_start[ m_A ] > nzc || C_ccs_raw.col_start[ m_A ] > nzc ) { #ifdef _DEBUG - std::cout << "EXECUTE phase: detected insufficient capacity for requested operation.\n" - << "Requested " << C_crs_raw.col_start[ m_A ] << " nonzeros, but capacity is " << nzc << "\n"; + std::cout << "Insufficient capacity detected for requested operation.\n" + << "Requested " << C_crs_raw.col_start[ m_A ] << " nonzeros" + << " but capacity is " << nzc << "\n"; #endif - return RC::MISMATCH; + return MISMATCH; } #ifdef _H_GRB_REFERENCE_OMP_BLAS3 #pragma omp parallel for simd #endif - for( size_t i = 0; i < m_A; i++ ) + for( size_t i = 0; i < m_A; i++ ) { C_crs_raw.row_index[ i ] = C_ccs_raw.row_index[ i ] = 0; + } RC local_rc = rc; #ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel default( none ) shared( C_ccs_raw, C_crs_raw, A_raw, B_raw, rc, std::cout ) firstprivate( local_rc, m_A, oper ) +#pragma omp parallel default( none ) \ + shared( C_ccs_raw, C_crs_raw, A_raw, B_raw, rc, std::cout ) \ + firstprivate( local_rc, m_A, oper, A_identity, B_identity ) #endif { size_t start_row = 0; @@ -1081,12 +1136,17 @@ namespace grb { 
config::OMP::localRange( start_row, end_row, 0, m_A ); #endif - for( auto i = start_row; i < end_row; ++i ) { + for( size_t i = start_row; i < end_row; ++i ) { auto B_k = B_raw.col_start[ i ]; auto C_k = C_crs_raw.col_start[ i ]; - for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const auto j = A_raw.row_index[ k ]; - while( B_k < B_raw.col_start[ i + 1 ] && B_raw.row_index[ B_k ] > j ) { + + const auto A_k_start = A_raw.col_start[ i ]; + const auto A_k_end = A_raw.col_start[ i + 1 ]; + for( auto A_k = A_k_start; A_k < A_k_end; ++A_k ) { + const auto j = A_raw.row_index[ A_k ]; + while( B_k < B_raw.col_start[ i + 1 ] + && B_raw.row_index[ B_k ] > j + ) { B_k++; } if( B_k >= B_raw.col_start[ i + 1 ] ) { @@ -1096,31 +1156,48 @@ namespace grb { continue; } - const auto a_val = A_raw.values[ k ]; - const auto b_val = B_raw.values[ B_k ]; + const InputType1 a_val = A_raw.getValue( A_k, A_identity ); + const InputType2 b_val = B_raw.getValue( B_k, B_identity ); OutputType c_val; - local_rc = local_rc ? local_rc : grb::apply< descr >( c_val, a_val, b_val, oper ); + local_rc = local_rc + ? local_rc + : grb::apply< descr >( c_val, a_val, b_val, oper ); C_crs_raw.row_index[ C_k ] = j; - C_crs_raw.values[ C_k ] = c_val; + C_crs_raw.setValue( C_k, c_val ); C_ccs_raw.row_index[ C_k ] = i; - C_ccs_raw.values[ C_k ] = c_val; + C_ccs_raw.setValue( C_k, c_val ); #ifdef _DEBUG - std::cout << "A( " + std::to_string( i ) + ";" + std::to_string( j ) + " ) = " + std::to_string( a_val ) + "\n"; - std::cout << "B( " + std::to_string( i ) + ";" + std::to_string( j ) + " ) = " + std::to_string( b_val ) + "\n"; - std::cout << "C( " + std::to_string( i ) + ";" + std::to_string( C_crs_raw.row_index[ C_k ] ) + " ) = " + std::to_string( c_val ) + "\n"; + std::cout << "A( " + std::to_string( i ) + ";" + + std::to_string( j ) + " ) = " + + std::to_string( a_val ) + "\n"; + std::cout << "B( " + std::to_string( i ) + ";" + + std::to_string( j ) + " ) = " + + std::to_string( b_val ) + "\n"; + std::cout << "C.crs( " + std::to_string( i ) + ";" + + std::to_string( j ) + " ) = " + + std::to_string( c_val ) + "\n"; #endif C_k += 1; } } + + if( local_rc != SUCCESS ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 #pragma omp critical #endif - { rc = rc ? rc : local_rc; } + { + rc = rc ? rc : local_rc; + } + } } +#ifdef _DEBUG + std::cout << "internal::setCurrentNonzeroes( C, " + << C_crs_raw.col_start[ m_A ] << " )\n"; +#endif internal::setCurrentNonzeroes( C, C_crs_raw.col_start[ m_A ] ); - + return rc; } @@ -1163,9 +1240,9 @@ namespace grb { std::is_same< InputType1, void >::value || std::is_same< InputType2, void >::value ) ), - "grb::internal::eWiseApply_matrix_generic_union: the non-monoid version of " - "elementwise mxm can only be used if neither of the input matrices " - "is a pattern matrix (of type void)" ); + "grb::internal::eWiseApply_matrix_generic_union: the non-monoid" + " version of elementwise mxm can only be used if neither of the" + " input matrices is a pattern matrix (of type void)" ); assert( phase != TRY ); #ifdef _DEBUG std::cout << "In grb::internal::eWiseApply_matrix_generic_union\n"; @@ -1175,12 +1252,12 @@ namespace grb { constexpr bool trans_right = descr & descriptors::transpose_right; // run-time checks - const size_t m = grb::nrows( C ); - const size_t n = grb::ncols( C ); - const size_t m_A = !trans_left ? grb::nrows( A ) : grb::ncols( A ); - const size_t n_A = !trans_left ? grb::ncols( A ) : grb::nrows( A ); - const size_t m_B = !trans_right ? 
grb::nrows( B ) : grb::ncols( B ); - const size_t n_B = !trans_right ? grb::ncols( B ) : grb::nrows( B ); + const size_t m = nrows( C ); + const size_t n = ncols( C ); + const size_t m_A = !trans_left ? nrows( A ) : ncols( A ); + const size_t n_A = !trans_left ? ncols( A ) : nrows( A ); + const size_t m_B = !trans_right ? nrows( B ) : ncols( B ); + const size_t n_B = !trans_right ? ncols( B ) : nrows( B ); // Identities const auto identity_A = monoid.template getIdentity< OutputType >(); @@ -1243,12 +1320,12 @@ namespace grb { for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; coors1.assign( k_col ); - (void)++nzc; + (void) ++nzc; } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { const size_t l_col = B_raw.row_index[ l ]; if( not coors1.assigned( l_col ) ) { - (void)++nzc; + (void) ++nzc; } } } @@ -1353,7 +1430,7 @@ namespace grb { for( size_t j_unsigned = n ; j_unsigned > 0 ; j_unsigned-- ) { const size_t j = j_unsigned - 1; - if( not columns[ j ] ) { + if( !columns[ j ] ) { continue; } // update CRS @@ -1369,8 +1446,9 @@ namespace grb { } C_raw.col_start[ i + 1 ] = nzc; - for(size_t i=0; i( @@ -1509,6 +1590,9 @@ namespace grb { "the operator version of eWiseApply cannot be used if either of the " "input matrices is a pattern matrix (of type void)" ); +#ifdef _DEBUG + std::cout << "In grb::eWiseApply_matrix_generic( reference, operator )\n"; +#endif return internal::eWiseApply_matrix_generic_intersection< false, descr >( C, A, B, mulOp, phase diff --git a/tests/unit/eWiseApplyMatrixReference.cpp b/tests/unit/eWiseApplyMatrixReference.cpp index 27bbe93fb..6d675aa97 100644 --- a/tests/unit/eWiseApplyMatrixReference.cpp +++ b/tests/unit/eWiseApplyMatrixReference.cpp @@ -22,79 +22,8 @@ #include -template< class Iterator > -void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) { -#ifndef _DEBUG - return; -#endif - std::cout << "Matrix \"" << name << "\" (" << rows << "x" << cols << "):" << std::endl << "[" << std::endl; - if( rows > 50 || cols > 50 ) { - os << " Matrix too large to print" << std::endl; - } else { - os.precision( 3 ); - for( size_t y = 0; y < rows; y++ ) { - os << std::string( 3, ' ' ); - for( size_t x = 0; x < cols; x++ ) { - auto nnz_val = std::find_if( begin, end, [ y, x ]( const typename std::iterator_traits< Iterator >::value_type & a ) { - return a.first.first == y && a.first.second == x; - } ); - if( nnz_val != end ) - os << std::fixed << std::setw( 3 ) << ( *nnz_val ).second; - else - os << "___"; - os << " "; - } - os << std::endl; - } - } - os << "]" << std::endl; - std::flush( os ); -} - -template< typename D > -void printSparseMatrix( const grb::Matrix< D > & mat, const std::string & name = "", std::ostream & os = std::cout ) { - grb::wait( mat ); - printSparseMatrixIterator( grb::nrows( mat ), grb::ncols( mat ), mat.cbegin(), mat.cend(), name, os ); -} - -template< class Storage, typename D > -void printCompressedStorage( const Storage& storage, const grb::Matrix< D > & mat, std::ostream & os = std::cout ) { - os << " row_index: [ "; - for( size_t i = 0; i < grb::nrows( mat ); ++i ) { - os << storage.row_index[ i ] << " "; - } - os << "]" << std::endl; - os << " col_start: [ "; - for( size_t i = 0; i <= grb::nrows( mat ); ++i ) { - os << storage.col_start[ i ] << " "; - } - os << "]" << std::endl; - os << " values: [ "; - for( size_t i = 0; i < grb::nnz( mat ); ++i ) { - os 
<< storage.values[ i ] << " "; - } - os << "]" << std::endl << std::flush; -} - -template< typename D > -void printCRS( const grb::Matrix< D > & mat, const std::string & label = "", std::ostream & os = std::cout ) { -#ifndef _DEBUG - return; -#endif - grb::wait( mat ); - os << "CRS \"" << label << "\" (" << grb::nrows( mat ) << "x" << grb::ncols( mat ) << "):" << std::endl; - printCompressedStorage( grb::internal::getCRS( mat ), mat, os ); -} +using namespace grb; -template< typename D > -void printCCS( const grb::Matrix< D > & mat, const std::string & label = "", std::ostream & os = std::cout ) { -#ifndef _DEBUG - return; -#endif - grb::wait( mat ); - os << "CCS \"" << label << "\" (" << grb::nrows( mat ) << "x" << grb::ncols( mat ) << "):" << std::endl; - printCompressedStorage( grb::internal::getCCS( mat ), mat, os ); -} // static data corresponding to small matrices @@ -179,127 +108,116 @@ static const std::vector< int > V_C_union_A_pattern_B_pattern { 1, 1, 1, 1, 1, 1 // helper function to check internal data structures // of the reference backend template< typename T > -void checkCRSandCCS( const grb::Matrix< T > & obtained, const grb::Matrix< T > & expected, grb::RC & rc ) { - printCRS( obtained, "obtained" ); - printCRS( expected, "expected" ); - - if( grb::nnz( obtained ) != grb::nnz( expected ) ) { +void checkCRSandCCS( + const Matrix< T > & obtained, + const Matrix< T > & expected, + RC & rc +) { + if( nnz( obtained ) != nnz( expected ) ) { std::cerr << "Error: unexpected number of non-zero entries; " - << "expected " << grb::nnz( expected ) << ", " - << "obtained " << grb::nnz( obtained ) << ".\n"; - rc = grb::FAILED; + << "expected " << nnz( expected ) << ", " + << "obtained " << nnz( obtained ) << ".\n"; + rc = FAILED; } { // check CRS output - const auto & crsObtained = grb::internal::getCRS( obtained ); - const auto & crsExpected = grb::internal::getCRS( expected ); - for( size_t i = 0; i < grb::nrows( obtained ); ++i ) { + const auto & crsObtained = internal::getCRS( obtained ); + const auto & crsExpected = internal::getCRS( expected ); + for( size_t i = 0; i < nrows( obtained ); ++i ) { for( size_t k = crsObtained.col_start[ i ]; k < crsObtained.col_start[ i + 1 ]; ++k ) { if( crsObtained.row_index[ k ] != crsExpected.row_index[ k ] ) { std::cerr << "Error: unexpected entry at ( " << i << ", " << crsObtained.row_index[ k ] << " ), " << "expected one at ( " << i << ", " << crsExpected.row_index[ k ] << " ) " << "instead (CRS).\n"; - rc = grb::FAILED; + rc = FAILED; } if( crsObtained.values[ k ] != crsExpected.values[ k ] ) { std::cerr << "Error: unexpected value " << crsObtained.values[ k ] << "; " << "expected " << crsExpected.values[ k ] << " (CRS).\n"; - rc = grb::FAILED; + rc = FAILED; } } } } - printCCS( obtained, "obtained" ); - printCCS( expected, "expected" ); - { // check CCS output - const auto & ccsObtained = grb::internal::getCCS( obtained ); - const auto & ccsExpected = grb::internal::getCCS( expected ); - for( size_t j = 0; j < grb::ncols( obtained ); ++j ) { + const auto & ccsObtained = internal::getCCS( obtained ); + const auto & ccsExpected = internal::getCCS( expected ); + for( size_t j = 0; j < ncols( obtained ); ++j ) { for( size_t k = ccsExpected.col_start[ j ]; k < ccsExpected.col_start[ j + 1 ]; ++k ) { if( ccsObtained.row_index[ k ] != ccsExpected.row_index[ k ] ) { std::cerr << "Error: unexpected entry at " << "( " << ccsObtained.row_index[ k ] << ", " << j << " ), " << "expected one at ( " << ccsExpected.row_index[ k ] << ", " << j << " ) " << 
"instead (CCS).\n"; - rc = grb::FAILED; + rc = FAILED; } if( ccsObtained.values[ k ] != ccsExpected.values[ k ] ) { std::cerr << "Error: unexpected value " << ccsObtained.values[ k ] << "; " << "expected " << ccsExpected.values[ k ] << " (CCS).\n"; - rc = grb::FAILED; + rc = FAILED; } } } } } -void grbProgram( const void *, const size_t, grb::RC & rc ) { +void grbProgram( const void *, const size_t, RC & rc ) { // initialize test - grb::Monoid< grb::operators::mul< int >, grb::identities::one > mulmono; + const grb::Monoid< grb::operators::mul< int >, + grb::identities::one > mulmono; const size_t n = 4; const size_t nelts_A = 8; const size_t nelts_B = 6; - grb::Matrix< int > A( n, n ); - grb::Matrix< int > B( n, n ); - grb::Matrix< void > A_pattern( n, n ); - grb::Matrix< void > B_pattern( n, n ); - grb::Matrix< int > C( n, n ); - - rc = grb::resize( A, nelts_A ); - if( rc == grb::SUCCESS ) { - rc = grb::buildMatrixUnique( A, I_A.data(), J_A.data(), V_A.data(), nelts_A, grb::SEQUENTIAL ); - } - if( rc == grb::SUCCESS ) { - rc = grb::resize( B, nelts_B ); - } - if( rc == grb::SUCCESS ) { - rc = grb::buildMatrixUnique( B, I_B.data(), J_B.data(), V_B.data(), nelts_B, grb::SEQUENTIAL ); - } - if( rc == grb::SUCCESS ) { - rc = grb::resize( A_pattern, nelts_A ); - } - if( rc == grb::SUCCESS ) { - rc = grb::buildMatrixUnique( A_pattern, I_A.data(), J_A.data(), nelts_A, grb::SEQUENTIAL ); - } - if( rc == grb::SUCCESS ) { - rc = grb::resize( B_pattern, nelts_B ); - } - if( rc == grb::SUCCESS ) { - rc = grb::buildMatrixUnique( B_pattern, I_B.data(), J_B.data(), nelts_B, grb::SEQUENTIAL ); - } - if( rc != grb::SUCCESS ) { - std::cerr << "\tinitialisation FAILED\n"; - return; - } - - printSparseMatrix( A, "A" ); - printCRS( A, "A" ); - printCCS( A, "A" ); - printSparseMatrix( B, "B" ); - printCRS( B, "B" ); - printCCS( B, "B" ); + Matrix< int > A( n, n ); + Matrix< int > B( n, n ); + Matrix< void > A_pattern( n, n ); + Matrix< void > B_pattern( n, n ); + Matrix< int > C( n, n ); + + assert( SUCCESS == resize( A, nelts_A ) ); + assert( SUCCESS == + buildMatrixUnique( A, I_A.data(), J_A.data(), V_A.data(), nelts_A, SEQUENTIAL ) + ); + assert( SUCCESS == resize( B, nelts_B ) ); + assert( SUCCESS == + buildMatrixUnique( B, I_B.data(), J_B.data(), V_B.data(), nelts_B, SEQUENTIAL ) + ); + assert( SUCCESS == resize( A_pattern, nelts_A ) ); + assert( SUCCESS == + buildMatrixUnique( A_pattern, I_A.data(), J_A.data(), nelts_A, SEQUENTIAL ) + ); + assert( SUCCESS == resize( B_pattern, nelts_B ) ); + assert( SUCCESS == + buildMatrixUnique( B_pattern, I_B.data(), J_B.data(), nelts_B, SEQUENTIAL ) + ); { // test 1: compute with the monoid mxm_elementwise std::cout << "\t Verifying the monoid version of mxm_elementwise, " << "A and B value matrices\n"; - grb::clear( C ); - rc = grb::eWiseApply( C, A, B, mulmono, grb::RESIZE ); + clear( C ); + rc = grb::eWiseApply( C, A, B, mulmono, RESIZE ); rc = rc ? 
rc : grb::eWiseApply( C, A, B, mulmono ); - printSparseMatrix( C, "eWiseApply( C, A, B, mulmono )" ); - if( rc != grb::SUCCESS ) { + if( rc != SUCCESS ) { std::cerr << "Call to grb::eWiseApply FAILED\n"; return; } - grb::Matrix< int > union_A_B( n, n ); - grb::buildMatrixUnique( union_A_B, I_C_union.data(), J_C_union.data(), V_C_union_A_B.data(), I_C_union.size(), grb::SEQUENTIAL ); + Matrix< int > union_A_B( n, n ); + assert( SUCCESS == + buildMatrixUnique( + union_A_B, + I_C_union.data(), + J_C_union.data(), + V_C_union_A_B.data(), + I_C_union.size(), + SEQUENTIAL ) + ); checkCRSandCCS( C, union_A_B, rc ); - if( rc != grb::SUCCESS ) { + if( rc != SUCCESS ) { return; } } @@ -307,19 +225,26 @@ void grbProgram( const void *, const size_t, grb::RC & rc ) { { // test 2: compute with the monoid mxm_elementwise, A value matrix, B pattern matrix \n"; std::cout << "\t Verifying the monoid version of mxm_elementwise, " << "A value matrix, B pattern matrix\n"; - grb::clear( C ); - rc = grb::eWiseApply( C, A, B_pattern, mulmono, grb::RESIZE ); + clear( C ); + rc = grb::eWiseApply( C, A, B_pattern, mulmono, RESIZE ); rc = rc ? rc : grb::eWiseApply( C, A, B_pattern, mulmono ); - printSparseMatrix( C, "eWiseApply( C, A, B_pattern, mulmono )" ); - if( rc != grb::SUCCESS ) { + if( rc != SUCCESS ) { std::cerr << "Call to grb::eWiseApply FAILED\n"; return; } - grb::Matrix< int > union_A_B_pattern( n, n ); - grb::buildMatrixUnique( union_A_B_pattern, I_C_union_A_B_pattern.data(), J_C_union_A_B_pattern.data(), V_C_union_A_B_pattern.data(), I_C_union_A_B_pattern.size(), grb::SEQUENTIAL ); + Matrix< int > union_A_B_pattern( n, n ); + assert( SUCCESS == + buildMatrixUnique( + union_A_B_pattern, + I_C_union_A_B_pattern.data(), + J_C_union_A_B_pattern.data(), + V_C_union_A_B_pattern.data(), + I_C_union_A_B_pattern.size(), + SEQUENTIAL ) + ); checkCRSandCCS( C, union_A_B_pattern, rc ); - if( rc != grb::SUCCESS ) { + if( rc != SUCCESS ) { return; } } @@ -327,19 +252,26 @@ void grbProgram( const void *, const size_t, grb::RC & rc ) { { // test 3: compute with the monoid mxm_elementwise, A pattern matrix, B value matrix \n"; std::cout << "\t Verifying the monoid version of mxm_elementwise, " << "A pattern matrix, B value matrix\n"; - grb::clear( C ); - rc = grb::eWiseApply( C, A_pattern, B, mulmono, grb::RESIZE ); + clear( C ); + rc = grb::eWiseApply( C, A_pattern, B, mulmono, RESIZE ); rc = rc ? 
rc : grb::eWiseApply( C, A_pattern, B, mulmono ); - printSparseMatrix( C, "eWiseApply( C, A_pattern, B, mulmono )" ); - if( rc != grb::SUCCESS ) { + if( rc != SUCCESS ) { std::cerr << "Call to grb::eWiseApply FAILED\n"; return; } - grb::Matrix< int > union_A_pattern_B( n, n ); - grb::buildMatrixUnique( union_A_pattern_B, I_C_union_A_pattern_B.data(), J_C_union_A_pattern_B.data(), V_C_union_A_pattern_B.data(), I_C_union_A_pattern_B.size(), grb::SEQUENTIAL ); + Matrix< int > union_A_pattern_B( n, n ); + assert( SUCCESS == + buildMatrixUnique( + union_A_pattern_B, + I_C_union_A_pattern_B.data(), + J_C_union_A_pattern_B.data(), + V_C_union_A_pattern_B.data(), + I_C_union_A_pattern_B.size(), + SEQUENTIAL ) + ); checkCRSandCCS( C, union_A_pattern_B, rc ); - if( rc != grb::SUCCESS ) { + if( rc != SUCCESS ) { return; } } @@ -347,20 +279,26 @@ void grbProgram( const void *, const size_t, grb::RC & rc ) { { // test 4: compute with the monoid mxm_elementwise, A pattern matrix, B pattern matrix \n"; std::cout << "\t Verifying the monoid version of mxm_elementwise, " << "A pattern matrix, B pattern matrix\n"; - grb::clear( C ); - rc = grb::eWiseApply( C, A_pattern, B_pattern, mulmono, grb::RESIZE ); + clear( C ); + rc = grb::eWiseApply( C, A_pattern, B_pattern, mulmono, RESIZE ); rc = rc ? rc : grb::eWiseApply( C, A_pattern, B_pattern, mulmono ); - printSparseMatrix( C, "eWiseApply( C, A_pattern, B_pattern, mulmono )" ); - if( rc != grb::SUCCESS ) { + if( rc != SUCCESS ) { std::cerr << "Call to grb::eWiseApply FAILED\n"; return; } - grb::Matrix< int > union_A_pattern_B_pattern( n, n ); - grb::buildMatrixUnique( union_A_pattern_B_pattern, I_C_union_A_pattern_B_pattern.data(), J_C_union_A_pattern_B_pattern.data(), V_C_union_A_pattern_B_pattern.data(), - I_C_union_A_pattern_B_pattern.size(), grb::SEQUENTIAL ); + Matrix< int > union_A_pattern_B_pattern( n, n ); + assert( SUCCESS == + buildMatrixUnique( + union_A_pattern_B_pattern, + I_C_union_A_pattern_B_pattern.data(), + J_C_union_A_pattern_B_pattern.data(), + V_C_union_A_pattern_B_pattern.data(), + I_C_union_A_pattern_B_pattern.size(), + SEQUENTIAL ) + ); checkCRSandCCS( C, union_A_pattern_B_pattern, rc ); - if( rc != grb::SUCCESS ) { + if( rc != SUCCESS ) { return; } } @@ -368,18 +306,26 @@ void grbProgram( const void *, const size_t, grb::RC & rc ) { { // test 5: compute with the operator mxm_elementwise (pattern matrices not allowed) \n"; std::cout << "\t Verifying the operator version of mxm_elementwise " << "(only value matrices)\n"; - grb::clear( C ); - rc = grb::eWiseApply( C, A, B, mulmono.getOperator(), grb::RESIZE ); + clear( C ); + rc = grb::eWiseApply( C, A, B, mulmono.getOperator(), RESIZE ); rc = rc ? 
rc : grb::eWiseApply( C, A, B, mulmono.getOperator() ); - printSparseMatrix( C, "eWiseApply( C, A, B, mulmono.getOperator() )" ); - if( rc != grb::SUCCESS ) { + if( rc != SUCCESS ) { std::cerr << "Call to grb::eWiseApply FAILED\n"; return; } - grb::Matrix< int > intersection_A_B( n, n ); - grb::buildMatrixUnique( intersection_A_B, I_C_intersection.data(), J_C_intersection.data(), V_C_intersection.data(), I_C_intersection.size(), grb::SEQUENTIAL ); + Matrix< int > intersection_A_B( n, n ); + assert( SUCCESS == + buildMatrixUnique( + intersection_A_B, + I_C_intersection.data(), + J_C_intersection.data(), + V_C_intersection.data(), + I_C_intersection.size(), + SEQUENTIAL ) + ); checkCRSandCCS( C, intersection_A_B, rc ); - if( rc != grb::SUCCESS ) { + + if( rc != SUCCESS ) { return; } } @@ -389,13 +335,13 @@ int main( int argc, char ** argv ) { (void)argc; std::cout << "Functional test executable: " << argv[ 0 ] << "\n"; - grb::RC rc; + RC rc; grb::Launcher< grb::AUTOMATIC > launcher; - if( launcher.exec( &grbProgram, NULL, 0, rc ) != grb::SUCCESS ) { + if( launcher.exec( &grbProgram, NULL, 0, rc ) != SUCCESS ) { std::cerr << "Test failed to launch\n"; - rc = grb::FAILED; + rc = FAILED; } - if( rc == grb::SUCCESS ) { + if( rc == SUCCESS ) { std::cout << "Test OK\n" << std::endl; } else { std::cerr << std::flush; diff --git a/tests/unit/eWiseApplyMatrix_variants.cpp b/tests/unit/eWiseApplyMatrix_variants.cpp index 0e57b8f58..ca112ce8e 100644 --- a/tests/unit/eWiseApplyMatrix_variants.cpp +++ b/tests/unit/eWiseApplyMatrix_variants.cpp @@ -18,10 +18,10 @@ /* * @author Benjamin Lozes * @date 24th of May, 2023 - * - * @brief Test for eWiseApply(Matrix, Monoid) + * + * @brief Test for eWiseApply(Matrix, Monoid) * and eWiseApply(Matrix, Operator) variants - * + * * This test is meant to ensure the behaviour of the eWiseApply(Matrix, Monoid) * and eWiseApply(Matrix, Operator) variants is correct. Precisely, we expect * the following behaviour: @@ -30,7 +30,7 @@ * provided identity value for the zero elements. 
* - eWiseApply(Matrix, Operator) should apply the operator to all elements * of the two matrices, EXCLUDING the couples (non_zero, zero) - * + * */ #include @@ -40,162 +40,137 @@ #include -#define _DEBUG +using namespace grb; using nz_type = int; -constexpr size_t M = 10; -constexpr size_t N = 10; constexpr nz_type A_INITIAL_VALUE = 1; constexpr nz_type B_INITIAL_VALUE = 3; -namespace utils { - template< class Iterator > - void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) { -#ifndef _DEBUG - return; -#endif - std::cout << "Matrix \"" << name << "\" (" << rows << "x" << cols << "):" << std::endl << "[" << std::endl; - if( rows > 50 || cols > 50 ) { - os << " Matrix too large to print" << std::endl; - } else { - // os.precision( 3 ); - for( size_t y = 0; y < rows; y++ ) { - os << std::string( 3, ' ' ); - for( size_t x = 0; x < cols; x++ ) { - auto nnz_val = std::find_if( begin, end, [ y, x ]( const typename std::iterator_traits< Iterator >::value_type & a ) { - return a.first.first == y && a.first.second == x; - } ); - if( nnz_val != end ) - os << std::fixed << ( *nnz_val ).second; - else - os << '_'; - os << " "; - } - os << std::endl; - } - } - os << "]" << std::endl; - std::flush( os ); - } - template< typename D > - void printSparseMatrix( const grb::Matrix< D > & mat, const std::string & name = "", std::ostream & os = std::cout ) { - grb::wait( mat ); - printSparseMatrixIterator( grb::nrows( mat ), grb::ncols( mat ), mat.cbegin(), mat.cend(), name, os ); +template< typename D > +bool equals_matrix( + const Matrix< D > & A, + const Matrix< D > & B +) { + if( nrows( A ) != nrows( B ) || ncols( A ) != ncols( B ) ){ + return false; } - template< typename D > - bool equals_matrix( const grb::Matrix< D > & A, const grb::Matrix< D > & B ) { - if( grb::nrows( A ) != grb::nrows( B ) || grb::ncols( A ) != grb::ncols( B ) ) - return false; - grb::wait( A ); - grb::wait( B ); - std::vector< std::pair< std::pair< size_t, size_t >, D > > A_vec( A.cbegin(), A.cend() ); - std::vector< std::pair< std::pair< size_t, size_t >, D > > B_vec( B.cbegin(), B.cend() ); - return std::is_permutation( A_vec.cbegin(), A_vec.cend(), B_vec.cbegin() ); - } -} // namespace utils + wait( A ); + wait( B ); + + std::vector< std::pair< std::pair< size_t, size_t >, D > > A_vec( A.cbegin(), A.cend() ); + std::vector< std::pair< std::pair< size_t, size_t >, D > > B_vec( B.cbegin(), B.cend() ); + return std::is_permutation( A_vec.cbegin(), A_vec.cend(), B_vec.cbegin() ); +} template< class Monoid > struct input_t { - const grb::Matrix< nz_type > & A; - const grb::Matrix< nz_type > & B; - const grb::Matrix< nz_type > & C_monoid; - const grb::Matrix< nz_type > & C_operator; + const Matrix< nz_type > & A; + const Matrix< nz_type > & B; + const Matrix< nz_type > & C_monoid; + const Matrix< nz_type > & C_operator; const Monoid & monoid; - input_t( - const grb::Matrix< nz_type > & A = {0,0}, - const grb::Matrix< nz_type > & B = {0,0}, - const grb::Matrix< nz_type > & C_monoid = {0,0}, - const grb::Matrix< nz_type > & C_operator = {0,0}, - const Monoid & monoid = Monoid() ) : - A( A ), B( B ), C_monoid( C_monoid ), C_operator( C_operator ), monoid( monoid ) {} -}; + input_t( + const Matrix< nz_type > & A = {0,0}, + const Matrix< nz_type > & B = {0,0}, + const Matrix< nz_type > & C_monoid = {0,0}, + const Matrix< nz_type > & C_operator = {0,0}, + const Monoid & monoid = Monoid() + ) : A( A ), + B( B ), + C_monoid( C_monoid ), + 
C_operator( C_operator ), + monoid( monoid ) {} +}; struct output_t { - grb::RC rc; + RC rc; }; template< class Monoid > void grb_program( const input_t< Monoid > & input, output_t & output ) { - static_assert( grb::is_monoid< Monoid >::value, "Monoid required" ); - const auto & op = input.monoid.getOperator(); - grb::wait( input.A ); - grb::wait( input.B ); + static_assert( is_monoid< Monoid >::value, "Monoid required" ); + const auto &op = input.monoid.getOperator(); + wait( input.A ); + wait( input.B ); - auto & rc = output.rc; - - utils::printSparseMatrix( input.A, "A" ); - utils::printSparseMatrix( input.B, "B" ); + RC &rc = output.rc; { // Operator variant - std::cout << "-- eWiseApply using Operator, supposed to be annihilating non-zeroes -> INTERSECTION\n"; - grb::Matrix< nz_type > C( grb::nrows( input.A ), grb::ncols( input.A ) ); - rc = grb::eWiseApply( C, input.A, input.B, op, grb::Phase::RESIZE ); - grb::wait( C ); - if( rc != grb::RC::SUCCESS ) { + std::cout << "-- eWiseApply using Operator, supposed to be" + << " annihilating non-zeroes -> INTERSECTION\n"; + Matrix< nz_type > C( nrows( input.A ), ncols( input.A ) ); + rc = eWiseApply( C, input.A, input.B, op, RESIZE ); + wait( C ); + if( rc != SUCCESS ) { std::cerr << "Error: Phase::RESIZE\n"; return; } - rc = grb::eWiseApply( C, input.A, input.B, op, grb::Phase::EXECUTE ); - grb::wait( C ); - if( rc != grb::RC::SUCCESS ) { + rc = eWiseApply( C, input.A, input.B, op, EXECUTE ); + wait( C ); + if( rc != SUCCESS ) { std::cerr << "Error: Phase::EXECUTE\n"; return; } - if( ! utils::equals_matrix( C, input.C_operator ) ) { + if( !equals_matrix( C, input.C_operator ) ) { std::cerr << "Error: Wrong result\n"; - utils::printSparseMatrix( C, "Obtained (operator)", std::cerr ); - utils::printSparseMatrix( input.C_operator, "Truth (operator)", std::cerr ); - rc = grb::RC::FAILED; + rc = FAILED; return; } std::cout << "Result (operator) is correct\n"; } - { // Monoid variant - std::cout << "-- eWiseApply using Monoid, supposed to consider non-zeroes as the identity -> UNION\n"; - grb::Matrix< nz_type > C( grb::nrows( input.A ), grb::ncols( input.A ) ); - rc = grb::eWiseApply( C, input.A, input.B, input.monoid, grb::Phase::RESIZE ); - grb::wait( C ); - if( rc != grb::RC::SUCCESS ) { - std::cerr << "Error: Phase::RESIZE\n"; - return; - } - rc = grb::eWiseApply( C, input.A, input.B, input.monoid, grb::Phase::EXECUTE ); - grb::wait( C ); - if( rc != grb::RC::SUCCESS ) { - std::cerr << "Error: Phase::EXECUTE\n"; - return; - } - - if( ! 
utils::equals_matrix( C, input.C_monoid ) ) { - std::cerr << "Error: Wrong result\n"; - utils::printSparseMatrix( C, "Obtained (monoid)", std::cerr ); - utils::printSparseMatrix( input.C_monoid, "Truth (monoid)", std::cerr ); - rc = grb::RC::FAILED; - return; - } - - std::cout << "Result (monoid) is correct\n"; - } - - rc = grb::RC::SUCCESS; + // { // Monoid variant + // std::cout << "-- eWiseApply using Monoid, supposed to consider" + // << " non-zeroes as the identity -> UNION\n"; + // Matrix< nz_type > C( nrows( input.A ), ncols( input.A ) ); + // rc = eWiseApply( C, input.A, input.B, input.monoid, RESIZE ); + // wait( C ); + // if( rc != SUCCESS ) { + // std::cerr << "Error: Phase::RESIZE\n"; + // return; + // } + // rc = eWiseApply( C, input.A, input.B, input.monoid, EXECUTE ); + // wait( C ); + // if( rc != SUCCESS ) { + // std::cerr << "Error: Phase::EXECUTE\n"; + // return; + // } + + // if( !equals_matrix( C, input.C_monoid ) ) { + // std::cerr << "Error: Wrong result\n"; + // rc = FAILED; + // return; + // } + + // std::cout << "Result (monoid) is correct\n"; + // } + + rc = SUCCESS; } int main( int argc, char ** argv ) { (void) argc; (void) argv; - if(argc > 1) std::cout << "Usage: " << argv[ 0 ] << std::endl; + size_t N = 10; - std::cout << "This is functional test " << argv[ 0 ] << std::endl; - grb::Launcher< grb::EXEC_MODE::AUTOMATIC > launcher; - grb::RC rc = grb::RC::SUCCESS; + if( argc > 2 ) { + std::cout << "Usage: " << argv[ 0 ] << std::endl; + return 1; + } + if( argc == 2 ) { + N = std::stoul( argv[ 1 ] ); + } + + std::cout << "This is functional test " << argv[ 0 ] << std::endl << std::flush; + + Launcher< AUTOMATIC > launcher; // Create input data /** Matrix A: Row matrix filled with A_INITIAL_VALUE @@ -206,12 +181,13 @@ int main( int argc, char ** argv ) { * _ _ _ _ _ * (...) */ - grb::Matrix< nz_type > A( M, N, N ); + Matrix< nz_type > A( N, N, N ); std::vector< size_t > A_rows( N, 0 ), A_cols( N, 0 ); std::vector< nz_type > A_values( N, A_INITIAL_VALUE ); std::iota( A_cols.begin(), A_cols.end(), 0 ); - rc = grb::buildMatrixUnique( A, A_rows.data(), A_cols.data(), A_values.data(), A_values.size(), grb::IOMode::SEQUENTIAL ); - assert( rc == grb::RC::SUCCESS ); + if( SUCCESS != + buildMatrixUnique( A, A_rows.data(), A_cols.data(), A_values.data(), A_values.size(), SEQUENTIAL ) + ) { return 2; } /** Matrix B: Column matrix filled with B_INITIAL_VALUE * Y _ _ _ _ @@ -221,12 +197,13 @@ int main( int argc, char ** argv ) { * Y _ _ _ _ * (...) */ - grb::Matrix< nz_type > B( M, N, N ); - std::vector< size_t > B_rows( M, 0 ), B_cols( M, 0 ); - std::vector< nz_type > B_values( M, B_INITIAL_VALUE ); + Matrix< nz_type > B( N, N, N ); + std::vector< size_t > B_rows( N, 0 ), B_cols( N, 0 ); + std::vector< nz_type > B_values( N, B_INITIAL_VALUE ); std::iota( B_rows.begin(), B_rows.end(), 0 ); - rc = grb::buildMatrixUnique( B, B_rows.data(), B_cols.data(), B_values.data(), B_values.size(), grb::IOMode::SEQUENTIAL ); - assert( rc == grb::RC::SUCCESS ); + if( SUCCESS != + buildMatrixUnique( B, B_rows.data(), B_cols.data(), B_values.data(), B_values.size(), SEQUENTIAL ) + ) { return 3; } { /** Matrix C_monoid_truth: Union of A and B @@ -237,17 +214,25 @@ int main( int argc, char ** argv ) { * Y ___ ___ ___ ___ * (...) 
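	 * Here X = A_INITIAL_VALUE and Y = B_INITIAL_VALUE; the single coordinate
	 * (0, 0) that is a nonzero of both A and B holds X + Y under the additive
	 * monoid, while every other union entry keeps its lone operand unchanged
	 * (the monoid identity is zero).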
*/ - grb::Matrix< nz_type > C_monoid_truth( M, N ); - size_t nvalues = grb::nrows( A ) + grb::ncols( B ) - 1; + Matrix< nz_type > C_monoid_truth( N, N ); + size_t nvalues = nrows( A ) + ncols( B ) - 1; std::vector< size_t > C_monoid_truth_rows( nvalues, 0 ), C_monoid_truth_cols( nvalues, 0 ); std::vector< nz_type > C_monoid_truth_values( nvalues, 0 ); C_monoid_truth_values[ 0 ] = A_INITIAL_VALUE + B_INITIAL_VALUE; - std::iota( C_monoid_truth_rows.begin() + grb::nrows( A ), C_monoid_truth_rows.end(), 1 ); - std::iota( C_monoid_truth_cols.begin() + 1, C_monoid_truth_cols.begin() + grb::nrows( A ), 1 ); - std::fill( C_monoid_truth_values.begin() + 1, C_monoid_truth_values.begin() + grb::nrows( A ), A_INITIAL_VALUE ); - std::fill( C_monoid_truth_values.begin() + grb::nrows( A ), C_monoid_truth_values.end(), B_INITIAL_VALUE ); - rc = grb::buildMatrixUnique( C_monoid_truth, C_monoid_truth_rows.data(), C_monoid_truth_cols.data(), C_monoid_truth_values.data(), C_monoid_truth_values.size(), grb::IOMode::SEQUENTIAL ); - assert( rc == grb::RC::SUCCESS ); + std::iota( C_monoid_truth_rows.begin() + nrows( A ), C_monoid_truth_rows.end(), 1 ); + std::iota( C_monoid_truth_cols.begin() + 1, C_monoid_truth_cols.begin() + nrows( A ), 1 ); + std::fill( C_monoid_truth_values.begin() + 1, C_monoid_truth_values.begin() + nrows( A ), A_INITIAL_VALUE ); + std::fill( C_monoid_truth_values.begin() + nrows( A ), C_monoid_truth_values.end(), B_INITIAL_VALUE ); + if( SUCCESS != + buildMatrixUnique( + C_monoid_truth, + C_monoid_truth_rows.data(), + C_monoid_truth_cols.data(), + C_monoid_truth_values.data(), + C_monoid_truth_values.size(), + SEQUENTIAL + ) + ) { return 4; } /** Matrix C_op_truth: Intersection of A and B * X+Y ___ ___ ___ ___ @@ -257,28 +242,42 @@ int main( int argc, char ** argv ) { * ___ ___ ___ ___ ___ * (...) 
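	 * Only coordinate (0, 0) is a nonzero of both A and B, so the operator
	 * (intersection) variant is expected to produce exactly one entry, X + Y.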
*/ - grb::Matrix< nz_type > C_op_truth( M, N ); + Matrix< nz_type > C_op_truth( N, N ); std::vector< size_t > C_op_truth_rows( 1, 0 ), C_op_truth_cols( 1, 0 ); std::vector< nz_type > C_op_truth_values( 1, A_INITIAL_VALUE + B_INITIAL_VALUE ); - rc = grb::buildMatrixUnique( C_op_truth, C_op_truth_rows.data(), C_op_truth_cols.data(), C_op_truth_values.data(), C_op_truth_values.size(), grb::IOMode::SEQUENTIAL ); - assert( rc == grb::RC::SUCCESS ); + if( SUCCESS != + buildMatrixUnique( + C_op_truth, + C_op_truth_rows.data(), + C_op_truth_cols.data(), + C_op_truth_values.data(), + C_op_truth_values.size(), + SEQUENTIAL + ) + ) { return 5; } { /** Test using addition operator, same type for lhs and rhs */ - input_t< grb::Monoid< grb::operators::add< nz_type >, grb::identities::zero > > input { A, B, C_monoid_truth, C_op_truth, - grb::Monoid< grb::operators::add< nz_type >, grb::identities::zero >() }; - output_t output { grb::RC::SUCCESS }; + input_t< + Monoid< operators::add< nz_type >, identities::zero > + > input { A, B, C_monoid_truth, C_op_truth }; + output_t output { SUCCESS }; // Run the test - rc = launcher.exec( &grb_program, input, output, false ); + RC rc = launcher.exec( &grb_program, input, output, false ); // Check the result - assert( rc == grb::RC::SUCCESS ); - if( output.rc != grb::RC::SUCCESS ) { - std::cout << "Test FAILED (" << grb::toString( output.rc ) << ")" << std::endl; - return 1; + if( rc != SUCCESS ) { + std::cerr << "Error: Launcher::exec\n"; + return 6; + } + if( output.rc != SUCCESS ) { + std::cerr << "Test FAILED (" << toString( output.rc ) << ")" << std::endl; + return 7; } } } - std::cout << "Test OK" << std::endl; + std::cerr << std::flush; + std::cout << "Test OK" << std::endl << std::flush; + return 0; } From 8ee55c5be13394512c4fb0bc6e8644b83e9d1a44 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Wed, 12 Jul 2023 18:03:27 +0200 Subject: [PATCH 08/37] Non-square matrix CCS assignment bugfix --- include/graphblas/reference/blas3.hpp | 63 +++++++++++++++++++-------- tests/unit/eWiseApply_matrix.cpp | 47 ++++++++++---------- 2 files changed, 69 insertions(+), 41 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 97c585118..05f445f5d 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1062,11 +1062,19 @@ namespace grb { const size_t nzc = capacity( C ); +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 +#pragma omp parallel for simd +#endif + for( size_t i = 0; i <= n_A; i++ ) { + C_ccs_raw.col_start[ i ] = 0; + } + C_crs_raw.col_start[ 0 ] = 0; - C_ccs_raw.col_start[ 0 ] = 0; // Prefix sum computation into L.CRS.col_start #ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel for default( none ) shared( B_raw, A_raw, C_crs_raw, std::cout ) firstprivate( m_A ) +#pragma omp parallel for default( none ) \ + shared( B_raw, A_raw, C_crs_raw, C_ccs_raw, std::cout ) \ + firstprivate( m_A ) #endif for( size_t i = 0; i < m_A; i++ ) { auto B_k = B_raw.col_start[ i ]; @@ -1086,6 +1094,7 @@ namespace grb { } if( B_raw.row_index[ B_k ] == j ) { cumul += 1; + C_ccs_raw.col_start[ j + 1 ] += 1; } } C_crs_raw.col_start[ i + 1 ] = cumul; @@ -1093,41 +1102,57 @@ namespace grb { #ifdef _DEBUG // Print the CRS prefix sum - std::cout << "CRS prefix sum: "; + std::cout << "before nCRS prefix sum: "; for( size_t i = 0; i <= m_A; i++ ) { std::cout << C_crs_raw.col_start[ i ] << " "; } std::cout << "\n"; + // Print the CCS prefix sum + std::cout << "before nCCS prefix sum: "; + for( size_t i = 
0; i <= n_A; i++ ) { + std::cout << C_ccs_raw.col_start[ i ] << " "; + } + std::cout << "\n"; #endif // Apply the prefix sum for( size_t i = 1; i <= m_A; i++ ) { C_crs_raw.col_start[ i ] += C_crs_raw.col_start[ i - 1 ]; - C_ccs_raw.col_start[ i ] = C_crs_raw.col_start[ i ]; } + for ( size_t i = 1; i <= n_A; i++ ) { + C_ccs_raw.col_start[ i ] += C_ccs_raw.col_start[ i - 1 ]; + } + +#ifdef _DEBUG + // Print the CRS prefix sum + std::cout << "after nCRS prefix sum: "; + for( size_t i = 0; i <= m_A; i++ ) { + std::cout << C_crs_raw.col_start[ i ] << " "; + } + std::cout << "\n"; + // Print the CCS prefix sum + std::cout << "after nCCS prefix sum: "; + for( size_t i = 0; i <= n_A; i++ ) { + std::cout << C_ccs_raw.col_start[ i ] << " "; + } + std::cout << "\n"; +#endif // Check if the number of nonzeros is greater than the capacity - if( C_crs_raw.col_start[ m_A ] > nzc || C_ccs_raw.col_start[ m_A ] > nzc ) { + if( C_crs_raw.col_start[ m_A ] > nzc || C_ccs_raw.col_start[ n_A ] > nzc ) { #ifdef _DEBUG std::cout << "Insufficient capacity detected for requested operation.\n" - << "Requested " << C_crs_raw.col_start[ m_A ] << " nonzeros" + << "Requested " << C_ccs_raw.col_start[ m_A ] << " nonzeros" << " but capacity is " << nzc << "\n"; #endif return MISMATCH; } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel for simd -#endif - for( size_t i = 0; i < m_A; i++ ) { - C_crs_raw.row_index[ i ] = C_ccs_raw.row_index[ i ] = 0; - } - RC local_rc = rc; #ifdef _H_GRB_REFERENCE_OMP_BLAS3 #pragma omp parallel default( none ) \ - shared( C_ccs_raw, C_crs_raw, A_raw, B_raw, rc, std::cout ) \ - firstprivate( local_rc, m_A, oper, A_identity, B_identity ) + shared( C_ccs_raw, C_crs_raw, A_raw, B_raw, rc, std::cout ) \ + firstprivate( local_rc, m_A, oper, A_identity, B_identity ) #endif { size_t start_row = 0; @@ -1144,6 +1169,7 @@ namespace grb { const auto A_k_end = A_raw.col_start[ i + 1 ]; for( auto A_k = A_k_start; A_k < A_k_end; ++A_k ) { const auto j = A_raw.row_index[ A_k ]; + while( B_k < B_raw.col_start[ i + 1 ] && B_raw.row_index[ B_k ] > j ) { @@ -1165,8 +1191,9 @@ namespace grb { C_crs_raw.row_index[ C_k ] = j; C_crs_raw.setValue( C_k, c_val ); - C_ccs_raw.row_index[ C_k ] = i; - C_ccs_raw.setValue( C_k, c_val ); + + C_ccs_raw.row_index[ C_ccs_raw.col_start[ j ] ] = i; + C_ccs_raw.setValue( C_ccs_raw.col_start[ j ], c_val ); #ifdef _DEBUG std::cout << "A( " + std::to_string( i ) + ";" + std::to_string( j ) + " ) = " @@ -1174,7 +1201,7 @@ namespace grb { std::cout << "B( " + std::to_string( i ) + ";" + std::to_string( j ) + " ) = " + std::to_string( b_val ) + "\n"; - std::cout << "C.crs( " + std::to_string( i ) + ";" + std::cout << "C( " + std::to_string( i ) + ";" + std::to_string( j ) + " ) = " + std::to_string( c_val ) + "\n"; #endif diff --git a/tests/unit/eWiseApply_matrix.cpp b/tests/unit/eWiseApply_matrix.cpp index 48db8af3c..98e8c33c7 100644 --- a/tests/unit/eWiseApply_matrix.cpp +++ b/tests/unit/eWiseApply_matrix.cpp @@ -23,23 +23,24 @@ using namespace grb; -void grb_program( const int &, grb::RC &rc ) { +void grb_program( const size_t &n, grb::RC &rc ) { // large non-square mixed-domain matrix check { - grb::Matrix< char > A( 10000000, 2000000 ); - grb::Matrix< float > B( 10000000, 2000000 ); - grb::Matrix< size_t > C( 10000000, 2000000 ); - size_t * I = new size_t[ 2000000 ]; - size_t * J = new size_t[ 2000000 ]; - char * V = new char[ 2000000 ]; - for( size_t k = 0; k < 2000000; ++k ) { - I[ k ] = J[ k ] = k; + grb::Matrix< char > A( n, 2*n ); + grb::Matrix< float > B( n, 2*n ); + 
grb::Matrix< size_t > C( n, 2*n ); + size_t * I = new size_t[ n ]; + size_t * J = new size_t[ n ]; + char * V = new char[ n ]; + for( size_t k = 0; k < n; ++k ) { + I[ k ] = k; + J[ k ] = k+n; V[ k ] = 2; } - rc = grb::buildMatrixUnique( A, I, J, V, 2000000, SEQUENTIAL ); - rc = rc ? rc : grb::buildMatrixUnique( B, I, J, V, 2000000, SEQUENTIAL ); - rc = rc ? rc : grb::buildMatrixUnique( C, I, J, V, 2000000, SEQUENTIAL ); + rc = grb::buildMatrixUnique( A, I, J, V, n, SEQUENTIAL ); + rc = rc ? rc : grb::buildMatrixUnique( B, I, J, V, n, SEQUENTIAL ); + rc = rc ? rc : grb::buildMatrixUnique( C, I, J, V, n, SEQUENTIAL ); rc = rc ? rc : grb::eWiseApply( C, A, B, grb::operators::add< float, size_t, char >(), RESIZE ); rc = rc ? rc : grb::eWiseApply( C, A, B, @@ -49,13 +50,14 @@ void grb_program( const int &, grb::RC &rc ) { << "mixed-domain matrix check\n"; return; } + for( const auto &triple : C ) { - const size_t &i = triple.first.first; - const size_t &j = triple.first.second; - const size_t &v = triple.second; - if( i != j ) { - std::cout << "Unexpected entry at position ( " << i << ", " << j << " ) " - << "-- only expected entries on the diagonal\n"; + const auto &i = triple.first.first; + const auto &j = triple.first.second; + const auto &v = triple.second; + if( j != i+n ) { + std::cout << "Unexpected entry at position ( " << i << ", " << i+n << " ) " + << "-- only expected entries on the n-th diagonal\n"; rc = FAILED; } if( v != 4 ) { @@ -74,15 +76,14 @@ void grb_program( const int &, grb::RC &rc ) { int main( int argc, char ** argv ) { // defaults - bool printUsage = false; - int input = 0; // unused + size_t input = 1000; // unused // error checking if( argc > 1 ) { - printUsage = true; + input = std::strtoul( argv[ 1 ], nullptr, 10 ); } - if( printUsage ) { - std::cerr << "Usage: " << argv[ 0 ] << "\n"; + if( argc > 2 ) { + std::cerr << "Usage: " << argv[ 0 ] << "[n]\n"; return 1; } From 3a70d9b78a49a39ea496cf09179bb99766a00951 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Thu, 13 Jul 2023 09:26:14 +0200 Subject: [PATCH 09/37] Adapt changes in spy unit-test --- tests/unit/spy.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/spy.cpp b/tests/unit/spy.cpp index 71b8e8f28..780216d7f 100644 --- a/tests/unit/spy.cpp +++ b/tests/unit/spy.cpp @@ -82,7 +82,7 @@ void grb_program( const void * const fn_p, const size_t fn_length, grb::RC & rc if( rc == grb::SUCCESS ) { grb::Matrix< double > chk( p, q ); rc = rc ? rc : grb::resize( chk, grb::nnz( spy ) ); - rc = rc ? rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeOperator() ); + rc = rc ? rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeMonoid() ); if( rc == grb::SUCCESS && grb::nnz( chk ) != grb::nnz( spy ) ) { std::cerr << "Unexpected number of nonzeroes for chk: " << grb::nnz(chk) << ", expected " << grb::nnz(spy) << "\n"; rc = grb::FAILED; @@ -114,7 +114,7 @@ void grb_program( const void * const fn_p, const size_t fn_length, grb::RC & rc if( rc == grb::SUCCESS ) { grb::Matrix< double > chk( p, q ); rc = rc ? rc : grb::resize( chk, nnz( spy ) ); - rc = rc ? rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeOperator() ); + rc = rc ? 
rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeMonoid() ); if( rc == grb::SUCCESS && grb::nnz( chk ) != grb::nnz( spy ) ) { std::cerr << "Unexpected number of nonzeroes for chk (pattern): " << grb::nnz(chk) << ", expected " << grb::nnz(spy) << "\n"; rc = grb::FAILED; @@ -146,7 +146,7 @@ void grb_program( const void * const fn_p, const size_t fn_length, grb::RC & rc if( rc == grb::SUCCESS ) { grb::Matrix< double > chk( p, q ); rc = rc ? rc : grb::resize( chk, nnz( spy ) ); - rc = rc ? rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeOperator() ); + rc = rc ? rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeMonoid() ); if( rc == grb::SUCCESS && grb::nnz( chk ) != grb::nnz( spy ) ) { std::cerr << "Unexpected number of nonzeroes for chk (boolean): " << grb::nnz(chk) << ", expected " << grb::nnz(spy) << "\n"; rc = grb::FAILED; From 89ebb160a0ddc1c920c48e8834edfdc12bb64253 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Fri, 28 Jul 2023 11:56:40 +0200 Subject: [PATCH 10/37] Enable test for both variants --- tests/unit/eWiseApplyMatrix_variants.cpp | 50 ++++++++++++------------ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/tests/unit/eWiseApplyMatrix_variants.cpp b/tests/unit/eWiseApplyMatrix_variants.cpp index ca112ce8e..6c1ff2ed0 100644 --- a/tests/unit/eWiseApplyMatrix_variants.cpp +++ b/tests/unit/eWiseApplyMatrix_variants.cpp @@ -125,31 +125,31 @@ void grb_program( const input_t< Monoid > & input, output_t & output ) { std::cout << "Result (operator) is correct\n"; } - // { // Monoid variant - // std::cout << "-- eWiseApply using Monoid, supposed to consider" - // << " non-zeroes as the identity -> UNION\n"; - // Matrix< nz_type > C( nrows( input.A ), ncols( input.A ) ); - // rc = eWiseApply( C, input.A, input.B, input.monoid, RESIZE ); - // wait( C ); - // if( rc != SUCCESS ) { - // std::cerr << "Error: Phase::RESIZE\n"; - // return; - // } - // rc = eWiseApply( C, input.A, input.B, input.monoid, EXECUTE ); - // wait( C ); - // if( rc != SUCCESS ) { - // std::cerr << "Error: Phase::EXECUTE\n"; - // return; - // } - - // if( !equals_matrix( C, input.C_monoid ) ) { - // std::cerr << "Error: Wrong result\n"; - // rc = FAILED; - // return; - // } - - // std::cout << "Result (monoid) is correct\n"; - // } + { // Monoid variant + std::cout << "-- eWiseApply using Monoid, supposed to consider" + << " non-zeroes as the identity -> UNION\n"; + Matrix< nz_type > C( nrows( input.A ), ncols( input.A ) ); + rc = eWiseApply( C, input.A, input.B, input.monoid, RESIZE ); + wait( C ); + if( rc != SUCCESS ) { + std::cerr << "Error: Phase::RESIZE\n"; + return; + } + rc = eWiseApply( C, input.A, input.B, input.monoid, EXECUTE ); + wait( C ); + if( rc != SUCCESS ) { + std::cerr << "Error: Phase::EXECUTE\n"; + return; + } + + if( !equals_matrix( C, input.C_monoid ) ) { + std::cerr << "Error: Wrong result\n"; + rc = FAILED; + return; + } + + std::cout << "Result (monoid) is correct\n"; + } rc = SUCCESS; } From 9f8a43ea634bf43b77b63d3fa9b0f9590ec7476e Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Mon, 31 Jul 2023 17:23:33 +0200 Subject: [PATCH 11/37] Revert to stack implementation of the intersection variant --- include/graphblas/reference/blas3.hpp | 456 ++++++++++------------- tests/unit/eWiseApplyMatrix_variants.cpp | 34 +- 2 files changed, 210 insertions(+), 280 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 05f445f5d..7baa39025 100644 --- a/include/graphblas/reference/blas3.hpp +++ 
b/include/graphblas/reference/blas3.hpp @@ -137,7 +137,7 @@ namespace grb { const auto &B_raw = !trans_right ? internal::getCRS( B ) : internal::getCCS( B ); - auto &C_raw = internal::getCRS( C ); + auto &CRS_raw = internal::getCRS( C ); auto &CCS_raw = internal::getCCS( C ); char * arr = nullptr; @@ -175,7 +175,7 @@ namespace grb { if( crs_only && phase == RESIZE ) { // we are using an auxialiary CRS that we cannot resize ourselves // instead, we update the offset array only - C_raw.col_start[ 0 ] = 0; + CRS_raw.col_start[ 0 ] = 0; } // if crs_only, then the below implements its resize phase // if not crs_only, then the below is both crucial for the resize phase, @@ -202,7 +202,7 @@ namespace grb { if( crs_only && phase == RESIZE ) { // we are using an auxialiary CRS that we cannot resize ourselves // instead, we update the offset array only - C_raw.col_start[ i + 1 ] = nzc; + CRS_raw.col_start[ i + 1 ] = nzc; } } } @@ -259,7 +259,7 @@ namespace grb { // use previously computed CCS offset array to update CCS during the // computational phase nzc = 0; - C_raw.col_start[ 0 ] = 0; + CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { coors.clear(); for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { @@ -302,8 +302,8 @@ namespace grb { assert( nzc < old_nzc ); const size_t j = coors.index( k ); // update CRS - C_raw.row_index[ nzc ] = j; - C_raw.setValue( nzc, valbuf[ j ] ); + CRS_raw.row_index[ nzc ] = j; + CRS_raw.setValue( nzc, valbuf[ j ] ); // update CCS if( !crs_only ) { const size_t CCS_index = C_col_index[ j ]++ + CCS_raw.col_start[ j ]; @@ -313,7 +313,7 @@ namespace grb { // update count (void) ++nzc; } - C_raw.col_start[ i + 1 ] = nzc; + CRS_raw.col_start[ i + 1 ] = nzc; } #ifndef NDEBUG @@ -918,18 +918,7 @@ namespace grb { namespace internal { - /** - * \internal general elementwise matrix application that all eWiseApply - * variants refer to. - * @param[in] oper The operator corresponding to \a mulMonoid if - * \a allow_void is true; otherwise, an arbitrary operator - * under which to perform the eWiseApply. - * @param[in] mulMonoid The monoid under which to perform the eWiseApply if - * \a allow_void is true; otherwise, will be ignored. 
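// ---------------------------------------------------------------------------
// Editorial aside -- illustrative sketch only, NOT part of this patch or of
// the ALP/GraphBLAS API. The doc comment being removed here describes the
// contract this series implements: with a plain operator, only coordinates
// present in BOTH inputs produce an output nonzero (intersection); with a
// monoid, a coordinate present in only one input is paired with the monoid
// identity (union). The standalone sketch below, using hypothetical names and
// std::map as a stand-in for a sparse matrix, shows the two semantics for the
// addition operator with identity zero, as used by the tests in this series.
#include <cstddef>
#include <map>
#include <utility>

using Coord = std::pair< std::size_t, std::size_t >;
using SparseMap = std::map< Coord, int >;

// operator semantics: a coordinate survives only if both inputs hold it
SparseMap eWiseIntersection( const SparseMap &A, const SparseMap &B ) {
	SparseMap C;
	for( const auto &a : A ) {
		const auto b = B.find( a.first );
		if( b != B.end() ) {
			C[ a.first ] = a.second + b->second;
		}
	}
	return C;
}

// monoid semantics: a missing entry behaves as the identity (0 for addition),
// so the output pattern is the union of the two input patterns
SparseMap eWiseUnion( const SparseMap &A, const SparseMap &B ) {
	SparseMap C = A;
	for( const auto &b : B ) {
		C[ b.first ] += b.second; // value-initialised to 0 if absent
	}
	return C;
}
// The identity is what makes the union variant well-defined: it supplies the
// "missing" operand, which a bare operator cannot do.
// ---------------------------------------------------------------------------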
- * \endinternal - */ template< - bool allow_void, Descriptor descr, class Operator, typename OutputType, typename InputType1, typename InputType2, @@ -951,295 +940,240 @@ namespace grb { void >::type * const = nullptr ) { - assert( !( descr & descriptors::force_row_major ) ); - static_assert( allow_void || - ( !( - std::is_same< InputType1, void >::value - || std::is_same< InputType2, void >::value - ) +#ifdef _DEBUG + std::cout << "In grb::internal::eWiseApply_matrix_generic_intersection\n"; +#endif + assert( !(descr & descriptors::force_row_major ) ); + assert( phase != TRY ); + static_assert( + !( + std::is_same< InputType1, void >::value || + std::is_same< InputType2, void >::value ), "grb::internal::eWiseApply_matrix_generic_intersection: the non-monoid" - " version of elementwise mxm can only be used if neither of the input" - " matrices is a pattern matrix (of type void)" ); - assert( phase != TRY ); - + " version of elementwise mxm can only be used if neither of the" + " input matrices is a pattern matrix (of type void)" ); // get whether the matrices should be transposed prior to execution constexpr bool trans_left = descr & descriptors::transpose_left; constexpr bool trans_right = descr & descriptors::transpose_right; -#ifdef _DEBUG - std::cout << "In grb::internal::eWiseApply_matrix_generic_intersection\n"; -#endif - - if( grb::nnz( B ) == 0 || grb::nnz( A ) == 0 ) { -#ifdef _DEBUG - std::cout << "No nonzeros in input matrices, nothing to compute.\n"; -#endif - return SUCCESS; - } + // run-time checks + const size_t m = nrows( C ); + const size_t n = ncols( C ); + const size_t m_A = !trans_left ? nrows( A ) : ncols( A ); + const size_t n_A = !trans_left ? ncols( A ) : nrows( A ); + const size_t m_B = !trans_right ? nrows( B ) : ncols( B ); + const size_t n_B = !trans_right ? ncols( B ) : nrows( B ); - const auto &A_raw = trans_left - ? internal::getCCS( A ) - : internal::getCRS( A ); - const size_t m_A = trans_left - ? ncols( A ) - : nrows( A ); - const size_t n_A = trans_left - ? nrows( A ) - : ncols( A ); - if( m_A == 0 || n_A == 0 ) { -#ifdef _DEBUG - std::cout << "Matrix A is empty, nothing to compute.\n"; -#endif - return SUCCESS; + if( m != m_A || m != m_B || n != n_A || n != n_B ) { + return MISMATCH; } - const auto &B_raw = trans_right - ? internal::getCCS( B ) - : internal::getCRS( B ); - const size_t m_B = trans_right - ? ncols( B ) - : nrows( B ); - const size_t n_B = trans_right - ? nrows( B ) - : ncols( B ); - if( m_A == 0 || n_A == 0 ) { -#ifdef _DEBUG - std::cout << "Matrix B is empty, nothing to compute.\n"; -#endif - return SUCCESS; - } + const auto &A_raw = !trans_left ? + internal::getCRS( A ) : + internal::getCCS( A ); + const auto &B_raw = !trans_right ? 
+ internal::getCRS( B ) : + internal::getCCS( B ); + auto &CRS_raw = internal::getCRS( C ); + auto &CCS_raw = internal::getCCS( C ); + const auto dummy_identity = identities::zero< OutputType >::value(); - auto &C_crs_raw = internal::getCRS( C ); - auto &C_ccs_raw = internal::getCCS( C ); - const size_t m_C = nrows( C ); - const size_t n_C = ncols( C ); + // retrieve buffers + char * arr1, * arr2, * arr3, * buf1, * buf2, * buf3; + arr1 = arr2 = buf1 = buf2 = nullptr; + InputType1 * vbuf1 = nullptr; + InputType2 * vbuf2 = nullptr; + OutputType * valbuf = nullptr; + internal::getMatrixBuffers( arr1, buf1, vbuf1, 1, A ); + internal::getMatrixBuffers( arr2, buf2, vbuf2, 1, B ); + internal::getMatrixBuffers( arr3, buf3, valbuf, 1, C ); + // end buffer retrieval - // Check mask dimensions - if( m_A != m_B || n_A != n_B || m_A != m_C || n_A != n_C ) { -#ifdef _DEBUG - std::cout << "Dimensions of matrices do not match!\n"; + // initialisations + internal::Coordinates< reference > coors1, coors2; + coors1.set( arr1, false, buf1, n ); + coors2.set( arr2, false, buf2, n ); +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 +#pragma omp parallel for simd default(none) shared(CCS_raw) #endif - return MISMATCH; + for( size_t j = 0; j <= n; ++j ) { + CCS_raw.col_start[ j ] = 0; } + // end initialisations - const auto A_identity = identities::zero< InputType1 >::value(); - const auto B_identity = identities::zero< InputType2 >::value(); + // nonzero count + size_t nzc = 0; - RC rc = SUCCESS; - if( phase == Phase::RESIZE ) { - size_t nzc = 0; -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel for reduction( + : nzc ) \ - default( none ) shared( B_raw, A_raw ) \ - firstprivate( m_A ) -#endif - for( size_t i = 0; i < m_A; ++i ) { - auto B_k = B_raw.col_start[ i ]; - const auto A_k_start = A_raw.col_start[ i ]; - const auto A_k_end = A_raw.col_start[ i + 1 ]; - for( auto A_k = A_k_start; A_k < A_k_end; ++A_k ) { - const auto j = A_raw.row_index[ A_k ]; - - while( B_k < B_raw.col_start[ i + 1 ] - && B_raw.row_index[ B_k ] > j - ) { - B_k++; - } - if( B_k >= B_raw.col_start[ i + 1 ] ) { - break; - } - if( B_raw.row_index[ B_k ] == j ) { - nzc += 1; + // symbolic phase + if( phase == RESIZE ) { + for( size_t i = 0; i < m; ++i ) { + coors1.clear(); + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; + coors1.assign( k_col ); + } + for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { + const size_t l_col = B_raw.row_index[ l ]; + if( coors1.assigned( l_col ) ) { + (void) ++nzc; } } } + + const RC ret = grb::resize( C, nzc ); #ifdef _DEBUG - std::cout << "resize( C, " << nzc << " )\n"; + std::cout << "grb::resize( C, " << nzc << " ) = " << ret << "\n"; #endif - return resize( C, nzc ); + return ret; } - const size_t nzc = capacity( C ); + // computational phase + if( phase == EXECUTE ) { + // retrieve additional buffer + config::NonzeroIndexType * const C_col_index = internal::template + getReferenceBuffer< typename config::NonzeroIndexType >( n + 1 ); #ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel for simd +#pragma omp parallel for simd default(none) shared(C_col_index) #endif - for( size_t i = 0; i <= n_A; i++ ) { - C_ccs_raw.col_start[ i ] = 0; - } + for( size_t j = 0; j < n; ++j ) { + C_col_index[ j ] = 0; + } - C_crs_raw.col_start[ 0 ] = 0; - // Prefix sum computation into L.CRS.col_start -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel for default( none ) \ - shared( B_raw, A_raw, C_crs_raw, C_ccs_raw, 
std::cout ) \ - firstprivate( m_A ) -#endif - for( size_t i = 0; i < m_A; i++ ) { - auto B_k = B_raw.col_start[ i ]; - size_t cumul = 0UL; - const auto A_k_start = A_raw.col_start[ i ]; - const auto A_k_end = A_raw.col_start[ i + 1 ]; - for( auto A_k = A_k_start; A_k < A_k_end; ++A_k ) { - const auto j = A_raw.row_index[ A_k ]; - - while( B_k < B_raw.col_start[ i + 1 ] - && B_raw.row_index[ B_k ] > j - ) { - B_k++; - } - if( B_k >= B_raw.col_start[ i + 1 ]) { - break; + // perform column-wise nonzero count + for( size_t i = 0; i < m; ++i ) { + coors1.clear(); + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; + coors1.assign( k_col ); } - if( B_raw.row_index[ B_k ] == j ) { - cumul += 1; - C_ccs_raw.col_start[ j + 1 ] += 1; + for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { + const size_t l_col = B_raw.row_index[ l ]; + if( coors1.assigned( l_col ) ) { + (void) ++nzc; + (void) ++CCS_raw.col_start[ l_col + 1 ]; + } } } - C_crs_raw.col_start[ i + 1 ] = cumul; - } + // check capacity + if( nzc > capacity( C ) ) { #ifdef _DEBUG - // Print the CRS prefix sum - std::cout << "before nCRS prefix sum: "; - for( size_t i = 0; i <= m_A; i++ ) { - std::cout << C_crs_raw.col_start[ i ] << " "; - } - std::cout << "\n"; - // Print the CCS prefix sum - std::cout << "before nCCS prefix sum: "; - for( size_t i = 0; i <= n_A; i++ ) { - std::cout << C_ccs_raw.col_start[ i ] << " "; - } - std::cout << "\n"; + std::cout << "\t detected insufficient capacity " + << "for requested operation\n"; #endif + const RC clear_rc = clear( C ); + if( clear_rc != SUCCESS ) { + return PANIC; + } else { + return FAILED; + } + } + + // prefix sum for CCS_raw.col_start + assert( CCS_raw.col_start[ 0 ] == 0 ); + for( size_t j = 1; j < n; ++j ) { + CCS_raw.col_start[ j + 1 ] += CCS_raw.col_start[ j ]; + } + assert( CCS_raw.col_start[ n ] == nzc ); - // Apply the prefix sum - for( size_t i = 1; i <= m_A; i++ ) { - C_crs_raw.col_start[ i ] += C_crs_raw.col_start[ i - 1 ]; - } - for ( size_t i = 1; i <= n_A; i++ ) { - C_ccs_raw.col_start[ i ] += C_ccs_raw.col_start[ i - 1 ]; - } + // do computations + bool columns[ n ] = { false }; + bool columns2[ n ] = { false }; + size_t nzc = 0; + CRS_raw.col_start[ 0 ] = 0; + for( size_t i = 0; i < m; ++i ) { #ifdef _DEBUG - // Print the CRS prefix sum - std::cout << "after nCRS prefix sum: "; - for( size_t i = 0; i <= m_A; i++ ) { - std::cout << C_crs_raw.col_start[ i ] << " "; - } - std::cout << "\n"; - // Print the CCS prefix sum - std::cout << "after nCCS prefix sum: "; - for( size_t i = 0; i <= n_A; i++ ) { - std::cout << C_ccs_raw.col_start[ i ] << " "; - } - std::cout << "\n"; + std::cout << " -- i: " << i << "\n"; #endif - // Check if the number of nonzeros is greater than the capacity - if( C_crs_raw.col_start[ m_A ] > nzc || C_ccs_raw.col_start[ n_A ] > nzc ) { + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; + columns[ k_col ] = true; + valbuf[ k_col ] = A_raw.getValue( k, dummy_identity ); #ifdef _DEBUG - std::cout << "Insufficient capacity detected for requested operation.\n" - << "Requested " << C_ccs_raw.col_start[ m_A ] << " nonzeros" - << " but capacity is " << nzc << "\n"; + std::cout << "Found A( " << i << ", " << k_col << " ) = " << A_raw.getValue( k, dummy_identity ) << "\n"; #endif - return MISMATCH; - } + } - RC local_rc = rc; -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel default( none ) \ - 
shared( C_ccs_raw, C_crs_raw, A_raw, B_raw, rc, std::cout ) \ - firstprivate( local_rc, m_A, oper, A_identity, B_identity ) + for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { + const size_t l_col = B_raw.row_index[ l ]; + if( !columns[ l_col ] ) { // Union case: ignored + continue; + } + const auto valbuf_value_before = valbuf[ l_col ]; + (void)grb::apply( valbuf[ l_col ], valbuf_value_before, B_raw.getValue( l, dummy_identity ), oper ); + columns2[ l_col ] = true; +#ifdef _DEBUG + std::cout << "Found intersection: B(" << i << ";" << l_col << ")=" << B_raw.getValue( l, dummy_identity ) + << " && A(" << i << ";" << l_col << ")=" << valbuf_value_before + << " ==> C(" << i << ";" << l_col << ")=" << valbuf[ l_col ] << "\n"; #endif - { - size_t start_row = 0; - size_t end_row = m_A; + } + #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - config::OMP::localRange( start_row, end_row, 0, m_A ); + #pragma omp parallel for simd #endif + for( size_t i = 0; i < n; i++ ) { + columns[ i ] = false; + } - for( size_t i = start_row; i < end_row; ++i ) { - auto B_k = B_raw.col_start[ i ]; - auto C_k = C_crs_raw.col_start[ i ]; - - const auto A_k_start = A_raw.col_start[ i ]; - const auto A_k_end = A_raw.col_start[ i + 1 ]; - for( auto A_k = A_k_start; A_k < A_k_end; ++A_k ) { - const auto j = A_raw.row_index[ A_k ]; - - while( B_k < B_raw.col_start[ i + 1 ] - && B_raw.row_index[ B_k ] > j - ) { - B_k++; - } - if( B_k >= B_raw.col_start[ i + 1 ] ) { - break; - } - if( B_raw.row_index[ B_k ] != j ) { + for( size_t j_unsigned = n ; j_unsigned > 0 ; j_unsigned-- ) { + const size_t j = j_unsigned - 1; + if( !columns2[ j ] ) { continue; } + // update CRS + CRS_raw.row_index[ nzc ] = j; + CRS_raw.setValue( nzc, valbuf[ j ] ); + // update CCS + C_col_index[ j ]++; + const size_t CCS_index = CCS_raw.col_start[ j+1 ] - C_col_index[ j ]; + CCS_raw.row_index[ CCS_index ] = i; + CCS_raw.setValue( CCS_index, valbuf[ j ] ); + // update count + (void)++nzc; + } + CRS_raw.col_start[ i + 1 ] = nzc; - const InputType1 a_val = A_raw.getValue( A_k, A_identity ); - const InputType2 b_val = B_raw.getValue( B_k, B_identity ); - OutputType c_val; - local_rc = local_rc - ? local_rc - : grb::apply< descr >( c_val, a_val, b_val, oper ); - - C_crs_raw.row_index[ C_k ] = j; - C_crs_raw.setValue( C_k, c_val ); - - C_ccs_raw.row_index[ C_ccs_raw.col_start[ j ] ] = i; - C_ccs_raw.setValue( C_ccs_raw.col_start[ j ], c_val ); -#ifdef _DEBUG - std::cout << "A( " + std::to_string( i ) + ";" - + std::to_string( j ) + " ) = " - + std::to_string( a_val ) + "\n"; - std::cout << "B( " + std::to_string( i ) + ";" - + std::to_string( j ) + " ) = " - + std::to_string( b_val ) + "\n"; - std::cout << "C( " + std::to_string( i ) + ";" - + std::to_string( j ) + " ) = " - + std::to_string( c_val ) + "\n"; +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp parallel for simd #endif - C_k += 1; + for( size_t i = 0; i < n; i++ ) { + columns2[ i ] = false; } } - if( local_rc != SUCCESS ) { -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp critical +#ifdef _DEBUG + std::cout << "CCS_raw.col_start = [ "; + for( size_t j = 0; j <= n; ++j ) + std::cout << CCS_raw.col_start[ j ] << " "; + std::cout << "]\n"; + std::cout << "C_col_index = [ "; + for( size_t j = 0; j < n; ++j ) + std::cout << C_col_index[ j ] << " "; + std::cout << "]\n"; +#endif +#ifndef NDEBUG + for( size_t j = 0; j < n; ++j ) { + assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); + } #endif - { - rc = rc ? 
rc : local_rc; - } - } - } + // set final number of nonzeroes in output matrix #ifdef _DEBUG - std::cout << "internal::setCurrentNonzeroes( C, " - << C_crs_raw.col_start[ m_A ] << " )\n"; + std::cout << "internal::setCurrentNonzeroes( C, " << nzc << " )\n"; #endif - internal::setCurrentNonzeroes( C, C_crs_raw.col_start[ m_A ] ); + internal::setCurrentNonzeroes( C, nzc ); + } - return rc; + // done + return SUCCESS; } - /** - * \internal general elementwise matrix application that all eWiseApply - * variants refer to. - * @param[in] oper The operator corresponding to \a mulMonoid if - * \a allow_void is true; otherwise, an arbitrary operator - * under which to perform the eWiseApply. - * @param[in] mulMonoid The monoid under which to perform the eWiseApply if - * \a allow_void is true; otherwise, will be ignored. - * \endinternal - */ template< - bool allow_void, Descriptor descr, class Monoid, typename OutputType, typename InputType1, typename InputType2, @@ -1261,19 +1195,11 @@ namespace grb { void >::type * const = nullptr ) { - assert( !(descr & descriptors::force_row_major ) ); - static_assert( allow_void || - ( !( - std::is_same< InputType1, void >::value || - std::is_same< InputType2, void >::value - ) ), - "grb::internal::eWiseApply_matrix_generic_union: the non-monoid" - " version of elementwise mxm can only be used if neither of the" - " input matrices is a pattern matrix (of type void)" ); - assert( phase != TRY ); #ifdef _DEBUG std::cout << "In grb::internal::eWiseApply_matrix_generic_union\n"; #endif + assert( phase != TRY ); + assert( !(descr & descriptors::force_row_major ) ); // get whether the matrices should be transposed prior to execution constexpr bool trans_left = descr & descriptors::transpose_left; constexpr bool trans_right = descr & descriptors::transpose_right; @@ -1301,7 +1227,7 @@ namespace grb { const auto &B_raw = !trans_right ? 
internal::getCRS( B ) : internal::getCCS( B ); - auto &C_raw = internal::getCRS( C ); + auto &CRS_raw = internal::getCRS( C ); auto &CCS_raw = internal::getCCS( C ); @@ -1421,7 +1347,7 @@ namespace grb { // do computations bool columns[ n ] = { false }; size_t nzc = 0; - C_raw.col_start[ 0 ] = 0; + CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { #ifdef _DEBUG std::cout << " -- i: " << i << "\n"; @@ -1461,8 +1387,8 @@ namespace grb { continue; } // update CRS - C_raw.row_index[ nzc ] = j; - C_raw.setValue( nzc, valbuf[ j ] ); + CRS_raw.row_index[ nzc ] = j; + CRS_raw.setValue( nzc, valbuf[ j ] ); // update CCS C_col_index[ j ]++; const size_t CCS_index = CCS_raw.col_start[ j+1 ] - C_col_index[ j ]; @@ -1471,7 +1397,7 @@ namespace grb { // update count (void)++nzc; } - C_raw.col_start[ i + 1 ] = nzc; + CRS_raw.col_start[ i + 1 ] = nzc; for( size_t i = 0; i < n; i++ ) { columns[ i ] = false; @@ -1558,7 +1484,7 @@ namespace grb { std::cout << "In grb::eWiseApply_matrix_generic( reference, monoid )\n"; #endif - return internal::eWiseApply_matrix_generic_union< true, descr >( + return internal::eWiseApply_matrix_generic_union< descr >( C, A, B, mulmono, phase ); } @@ -1621,7 +1547,7 @@ namespace grb { std::cout << "In grb::eWiseApply_matrix_generic( reference, operator )\n"; #endif - return internal::eWiseApply_matrix_generic_intersection< false, descr >( + return internal::eWiseApply_matrix_generic_intersection< descr >( C, A, B, mulOp, phase ); } diff --git a/tests/unit/eWiseApplyMatrix_variants.cpp b/tests/unit/eWiseApplyMatrix_variants.cpp index 6c1ff2ed0..711b8ca15 100644 --- a/tests/unit/eWiseApplyMatrix_variants.cpp +++ b/tests/unit/eWiseApplyMatrix_variants.cpp @@ -50,8 +50,8 @@ constexpr nz_type B_INITIAL_VALUE = 3; template< typename D > bool equals_matrix( - const Matrix< D > & A, - const Matrix< D > & B + const Matrix< D > &A, + const Matrix< D > &B ) { if( nrows( A ) != nrows( B ) || ncols( A ) != ncols( B ) ){ return false; @@ -60,25 +60,29 @@ bool equals_matrix( wait( A ); wait( B ); - std::vector< std::pair< std::pair< size_t, size_t >, D > > A_vec( A.cbegin(), A.cend() ); - std::vector< std::pair< std::pair< size_t, size_t >, D > > B_vec( B.cbegin(), B.cend() ); + std::vector< + std::pair< std::pair< size_t, size_t >, D > + > A_vec( A.cbegin(), A.cend() ); + std::vector< + std::pair< std::pair< size_t, size_t >, D > + > B_vec( B.cbegin(), B.cend() ); return std::is_permutation( A_vec.cbegin(), A_vec.cend(), B_vec.cbegin() ); } template< class Monoid > struct input_t { - const Matrix< nz_type > & A; - const Matrix< nz_type > & B; - const Matrix< nz_type > & C_monoid; - const Matrix< nz_type > & C_operator; - const Monoid & monoid; + const Matrix< nz_type > &A; + const Matrix< nz_type > &B; + const Matrix< nz_type > &C_monoid; + const Matrix< nz_type > &C_operator; + const Monoid &monoid; input_t( - const Matrix< nz_type > & A = {0,0}, - const Matrix< nz_type > & B = {0,0}, - const Matrix< nz_type > & C_monoid = {0,0}, - const Matrix< nz_type > & C_operator = {0,0}, - const Monoid & monoid = Monoid() + const Matrix< nz_type > &A = {0,0}, + const Matrix< nz_type > &B = {0,0}, + const Matrix< nz_type > &C_monoid = {0,0}, + const Matrix< nz_type > &C_operator = {0,0}, + const Monoid &monoid = Monoid() ) : A( A ), B( B ), C_monoid( C_monoid ), @@ -91,7 +95,7 @@ struct output_t { }; template< class Monoid > -void grb_program( const input_t< Monoid > & input, output_t & output ) { +void grb_program( const input_t< Monoid > &input, output_t &output ) { static_assert( 
is_monoid< Monoid >::value, "Monoid required" ); const auto &op = input.monoid.getOperator(); wait( input.A ); From ef2b8da231a6e0be02a0a7b0b4738a6abd10df06 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Mon, 31 Jul 2023 17:30:11 +0200 Subject: [PATCH 12/37] omp bugfix --- include/graphblas/reference/blas3.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 7baa39025..dd85856af 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -995,7 +995,7 @@ namespace grb { coors1.set( arr1, false, buf1, n ); coors2.set( arr2, false, buf2, n ); #ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel for simd default(none) shared(CCS_raw) +#pragma omp parallel for simd default(none) shared(CCS_raw) firstprivate(n) #endif for( size_t j = 0; j <= n; ++j ) { CCS_raw.col_start[ j ] = 0; @@ -1035,7 +1035,7 @@ namespace grb { getReferenceBuffer< typename config::NonzeroIndexType >( n + 1 ); #ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel for simd default(none) shared(C_col_index) +#pragma omp parallel for simd default(none) shared(C_col_index) firstprivate(n) #endif for( size_t j = 0; j < n; ++j ) { C_col_index[ j ] = 0; From 045242e8b7eaaba2d405974422beca15b621aeb4 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Thu, 3 Aug 2023 22:04:57 +0200 Subject: [PATCH 13/37] Replace local buffer with coordinates array --- include/graphblas/reference/blas3.hpp | 38 +++++++++------------------ 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index dd85856af..a7b99c350 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1080,8 +1080,6 @@ namespace grb { // do computations - bool columns[ n ] = { false }; - bool columns2[ n ] = { false }; size_t nzc = 0; CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { @@ -1091,7 +1089,7 @@ namespace grb { for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; - columns[ k_col ] = true; + coors1.assign( k_col ); valbuf[ k_col ] = A_raw.getValue( k, dummy_identity ); #ifdef _DEBUG std::cout << "Found A( " << i << ", " << k_col << " ) = " << A_raw.getValue( k, dummy_identity ) << "\n"; @@ -1100,12 +1098,12 @@ namespace grb { for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { const size_t l_col = B_raw.row_index[ l ]; - if( !columns[ l_col ] ) { // Union case: ignored + if( !coors1.assigned( l_col ) ) { // Union case: ignored continue; } const auto valbuf_value_before = valbuf[ l_col ]; (void)grb::apply( valbuf[ l_col ], valbuf_value_before, B_raw.getValue( l, dummy_identity ), oper ); - columns2[ l_col ] = true; + coors2.assign( l_col ); #ifdef _DEBUG std::cout << "Found intersection: B(" << i << ";" << l_col << ")=" << B_raw.getValue( l, dummy_identity ) << " && A(" << i << ";" << l_col << ")=" << valbuf_value_before @@ -1113,16 +1111,11 @@ namespace grb { #endif } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel for simd -#endif - for( size_t i = 0; i < n; i++ ) { - columns[ i ] = false; - } + coors1.clear(); for( size_t j_unsigned = n ; j_unsigned > 0 ; j_unsigned-- ) { const size_t j = j_unsigned - 1; - if( !columns2[ j ] ) { + if( !coors2.assigned( j ) ) { continue; } // update CRS @@ -1138,12 +1131,7 @@ namespace grb { } CRS_raw.col_start[ i + 1 ] = nzc; -#ifdef 
_H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel for simd -#endif - for( size_t i = 0; i < n; i++ ) { - columns2[ i ] = false; - } + coors2.clear(); } #ifdef _DEBUG @@ -1345,7 +1333,7 @@ namespace grb { // do computations - bool columns[ n ] = { false }; + size_t nzc = 0; CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { @@ -1355,7 +1343,7 @@ namespace grb { for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; - columns[ k_col ] = true; + coors1.assign( k_col ); valbuf[ k_col ] = A_raw.getValue( k, identity_A ); #ifdef _DEBUG std::cout << "Found A( " << i << ", " << k_col << " ) = " << A_raw.getValue( k, identity_A ) << "\n"; @@ -1364,7 +1352,7 @@ namespace grb { for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { const size_t l_col = B_raw.row_index[ l ]; - if( columns[ l_col ] ) { // Intersection case + if( coors1.assigned( l_col ) ) { // Intersection case const auto valbuf_value_before = valbuf[ l_col ]; (void)grb::apply( valbuf[ l_col ], valbuf_value_before, B_raw.getValue( l, identity_B ), oper ); #ifdef _DEBUG @@ -1376,14 +1364,14 @@ namespace grb { #ifdef _DEBUG std::cout << "Found B( " << i << ", " << l_col << " ) = " << B_raw.getValue( l, identity_B ) << "\n"; #endif - columns[ l_col ] = true; + coors1.assign( l_col ); valbuf[ l_col ] = B_raw.getValue( l, identity_B ); } } for( size_t j_unsigned = n ; j_unsigned > 0 ; j_unsigned-- ) { const size_t j = j_unsigned - 1; - if( !columns[ j ] ) { + if( !coors1.assigned( j ) ) { continue; } // update CRS @@ -1399,9 +1387,7 @@ namespace grb { } CRS_raw.col_start[ i + 1 ] = nzc; - for( size_t i = 0; i < n; i++ ) { - columns[ i ] = false; - } + coors1.clear(); } #ifdef _DEBUG From 154c592a44aa02652a6fa9a353ace55f5de09f26 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Fri, 4 Aug 2023 10:44:12 +0200 Subject: [PATCH 14/37] Bugfix for union variant --- include/graphblas/reference/blas3.hpp | 29 ++++++++------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index a7b99c350..e718ac35b 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1340,33 +1340,22 @@ namespace grb { #ifdef _DEBUG std::cout << " -- i: " << i << "\n"; #endif - + coors1.clear(); for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; coors1.assign( k_col ); valbuf[ k_col ] = A_raw.getValue( k, identity_A ); -#ifdef _DEBUG - std::cout << "Found A( " << i << ", " << k_col << " ) = " << A_raw.getValue( k, identity_A ) << "\n"; -#endif } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { const size_t l_col = B_raw.row_index[ l ]; - if( coors1.assigned( l_col ) ) { // Intersection case - const auto valbuf_value_before = valbuf[ l_col ]; - (void)grb::apply( valbuf[ l_col ], valbuf_value_before, B_raw.getValue( l, identity_B ), oper ); -#ifdef _DEBUG - std::cout << "Found intersection: B(" << i << ";" << l_col << ")=" << B_raw.getValue( l, identity_B ) - << " && A(" << i << ";" << l_col << ")=" << valbuf_value_before - << " ==> C(" << i << ";" << l_col << ")=" << valbuf[ l_col ] << "\n"; -#endif - } else { // Union case -#ifdef _DEBUG - std::cout << "Found B( " << i << ", " << l_col << " ) = " << B_raw.getValue( l, identity_B ) << "\n"; -#endif + const auto B_val = B_raw.getValue( l, identity_B ); + if( !coors1.assigned( l_col ) ) { // Union 
case coors1.assign( l_col ); - valbuf[ l_col ] = B_raw.getValue( l, identity_B ); + valbuf[ l_col ] = identity_A; } + const auto valbuf_value_before = valbuf[ l_col ]; + (void)grb::apply( valbuf[ l_col ], valbuf_value_before, B_val, oper ); } for( size_t j_unsigned = n ; j_unsigned > 0 ; j_unsigned-- ) { @@ -1386,8 +1375,6 @@ namespace grb { (void)++nzc; } CRS_raw.col_start[ i + 1 ] = nzc; - - coors1.clear(); } #ifdef _DEBUG @@ -1467,7 +1454,7 @@ namespace grb { ); #ifdef _DEBUG - std::cout << "In grb::eWiseApply_matrix_generic( reference, monoid )\n"; + std::cout << "In grb::eWiseApply( reference, monoid )\n"; #endif return internal::eWiseApply_matrix_generic_union< descr >( @@ -1530,7 +1517,7 @@ namespace grb { "input matrices is a pattern matrix (of type void)" ); #ifdef _DEBUG - std::cout << "In grb::eWiseApply_matrix_generic( reference, operator )\n"; + std::cout << "In grb::eWiseApply( reference, operator )\n"; #endif return internal::eWiseApply_matrix_generic_intersection< descr >( From 13a0ae32bbc57e6450fdc08e3317382bbf8e55e7 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Wed, 9 Aug 2023 17:56:00 +0200 Subject: [PATCH 15/37] Add support for descriptors::force_row_major --- include/graphblas/reference/blas3.hpp | 179 +++++++++++++++----------- 1 file changed, 104 insertions(+), 75 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index e718ac35b..a060ea408 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -943,7 +943,6 @@ namespace grb { #ifdef _DEBUG std::cout << "In grb::internal::eWiseApply_matrix_generic_intersection\n"; #endif - assert( !(descr & descriptors::force_row_major ) ); assert( phase != TRY ); static_assert( !( @@ -953,6 +952,7 @@ namespace grb { "grb::internal::eWiseApply_matrix_generic_intersection: the non-monoid" " version of elementwise mxm can only be used if neither of the" " input matrices is a pattern matrix (of type void)" ); + constexpr bool crs_only = descr & descriptors::force_row_major; // get whether the matrices should be transposed prior to execution constexpr bool trans_left = descr & descriptors::transpose_left; constexpr bool trans_right = descr & descriptors::transpose_right; @@ -965,7 +965,19 @@ namespace grb { const size_t m_B = !trans_right ? nrows( B ) : ncols( B ); const size_t n_B = !trans_right ? 
ncols( B ) : nrows( B ); + if( crs_only && (trans_left || trans_right) ) { +#ifdef _DEBUG + std::cerr << "grb::descriptors::force_row_major and " + << "grb::descriptors::transpose_left/right are mutually " + << "exclusive\n"; +#endif + return ILLEGAL; + } + if( m != m_A || m != m_B || n != n_A || n != n_B ) { +#ifdef _DEBUG + std::cerr << "grb::eWiseApply: dimensions of input matrices do not match\n"; +#endif return MISMATCH; } @@ -994,11 +1006,14 @@ namespace grb { internal::Coordinates< reference > coors1, coors2; coors1.set( arr1, false, buf1, n ); coors2.set( arr2, false, buf2, n ); + if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel for simd default(none) shared(CCS_raw) firstprivate(n) + #pragma omp parallel for simd default(none) \ + shared(CCS_raw) firstprivate(n) #endif - for( size_t j = 0; j <= n; ++j ) { - CCS_raw.col_start[ j ] = 0; + for( size_t j = 0; j <= n; ++j ) { + CCS_raw.col_start[ j ] = 0; + } } // end initialisations @@ -1034,11 +1049,14 @@ namespace grb { config::NonzeroIndexType * const C_col_index = internal::template getReferenceBuffer< typename config::NonzeroIndexType >( n + 1 ); + if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel for simd default(none) shared(C_col_index) firstprivate(n) + #pragma omp parallel for simd default(none) \ + shared(C_col_index) firstprivate(n) #endif - for( size_t j = 0; j < n; ++j ) { - C_col_index[ j ] = 0; + for( size_t j = 0; j < n; ++j ) { + C_col_index[ j ] = 0; + } } // perform column-wise nonzero count @@ -1052,7 +1070,9 @@ namespace grb { const size_t l_col = B_raw.row_index[ l ]; if( coors1.assigned( l_col ) ) { (void) ++nzc; - (void) ++CCS_raw.col_start[ l_col + 1 ]; + if( !crs_only ) { + (void) ++CCS_raw.col_start[ l_col + 1 ]; + } } } } @@ -1072,28 +1092,22 @@ namespace grb { } // prefix sum for CCS_raw.col_start - assert( CCS_raw.col_start[ 0 ] == 0 ); - for( size_t j = 1; j < n; ++j ) { - CCS_raw.col_start[ j + 1 ] += CCS_raw.col_start[ j ]; + if( !crs_only ) { + assert( CCS_raw.col_start[ 0 ] == 0 ); + for( size_t j = 1; j < n; ++j ) { + CCS_raw.col_start[ j + 1 ] += CCS_raw.col_start[ j ]; + } + assert( CCS_raw.col_start[ n ] == nzc ); } - assert( CCS_raw.col_start[ n ] == nzc ); - // do computations size_t nzc = 0; CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { -#ifdef _DEBUG - std::cout << " -- i: " << i << "\n"; -#endif - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; coors1.assign( k_col ); valbuf[ k_col ] = A_raw.getValue( k, dummy_identity ); -#ifdef _DEBUG - std::cout << "Found A( " << i << ", " << k_col << " ) = " << A_raw.getValue( k, dummy_identity ) << "\n"; -#endif } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { @@ -1104,11 +1118,6 @@ namespace grb { const auto valbuf_value_before = valbuf[ l_col ]; (void)grb::apply( valbuf[ l_col ], valbuf_value_before, B_raw.getValue( l, dummy_identity ), oper ); coors2.assign( l_col ); -#ifdef _DEBUG - std::cout << "Found intersection: B(" << i << ";" << l_col << ")=" << B_raw.getValue( l, dummy_identity ) - << " && A(" << i << ";" << l_col << ")=" << valbuf_value_before - << " ==> C(" << i << ";" << l_col << ")=" << valbuf[ l_col ] << "\n"; -#endif } coors1.clear(); @@ -1122,10 +1131,12 @@ namespace grb { CRS_raw.row_index[ nzc ] = j; CRS_raw.setValue( nzc, valbuf[ j ] ); // update CCS - C_col_index[ j ]++; - const size_t CCS_index = CCS_raw.col_start[ j+1 ] - C_col_index[ j ]; - CCS_raw.row_index[ 
CCS_index ] = i; - CCS_raw.setValue( CCS_index, valbuf[ j ] ); + if( !crs_only ) { + C_col_index[ j ]++; + const size_t CCS_index = CCS_raw.col_start[ j+1 ] - C_col_index[ j ]; + CCS_raw.row_index[ CCS_index ] = i; + CCS_raw.setValue( CCS_index, valbuf[ j ] ); + } // update count (void)++nzc; } @@ -1134,21 +1145,23 @@ namespace grb { coors2.clear(); } + if( !crs_only ) { #ifdef _DEBUG - std::cout << "CCS_raw.col_start = [ "; - for( size_t j = 0; j <= n; ++j ) - std::cout << CCS_raw.col_start[ j ] << " "; - std::cout << "]\n"; - std::cout << "C_col_index = [ "; - for( size_t j = 0; j < n; ++j ) - std::cout << C_col_index[ j ] << " "; - std::cout << "]\n"; + std::cout << "CCS_raw.col_start = [ "; + for( size_t j = 0; j <= n; ++j ) + std::cout << CCS_raw.col_start[ j ] << " "; + std::cout << "]\n"; + std::cout << "C_col_index = [ "; + for( size_t j = 0; j < n; ++j ) + std::cout << C_col_index[ j ] << " "; + std::cout << "]\n"; #endif #ifndef NDEBUG - for( size_t j = 0; j < n; ++j ) { - assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); - } + for( size_t j = 0; j < n; ++j ) { + assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); + } #endif + } // set final number of nonzeroes in output matrix #ifdef _DEBUG @@ -1187,11 +1200,20 @@ namespace grb { std::cout << "In grb::internal::eWiseApply_matrix_generic_union\n"; #endif assert( phase != TRY ); - assert( !(descr & descriptors::force_row_major ) ); + constexpr bool crs_only = descr & descriptors::force_row_major; // get whether the matrices should be transposed prior to execution constexpr bool trans_left = descr & descriptors::transpose_left; constexpr bool trans_right = descr & descriptors::transpose_right; + if( crs_only && (trans_left || trans_right) ) { +#ifdef _DEBUG + std::cerr << "grb::descriptors::force_row_major and " + << "grb::descriptors::transpose_left/right are mutually " + << "exclusive\n"; +#endif + return ILLEGAL; + } + // run-time checks const size_t m = nrows( C ); const size_t n = ncols( C ); @@ -1205,6 +1227,9 @@ namespace grb { const auto identity_B = monoid.template getIdentity< OutputType >(); if( m != m_A || m != m_B || n != n_A || n != n_B ) { +#ifdef _DEBUG + std::cerr << "grb::eWiseApply: dimensions of input matrices do not match\n"; +#endif return MISMATCH; } @@ -1234,21 +1259,14 @@ namespace grb { internal::Coordinates< reference > coors1, coors2; coors1.set( arr1, false, buf1, n ); coors2.set( arr2, false, buf2, n ); + if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel - { - size_t start, end; - config::OMP::localRange( start, end, 0, n + 1 ); -#else - const size_t start = 0; - const size_t end = n + 1; + #pragma omp parallel for simd default(none) shared(CCS_raw) firstprivate(n) #endif - for( size_t j = start; j < end; ++j ) { + for( size_t j = 0; j < n + 1; ++j ) { CCS_raw.col_start[ j ] = 0; } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 } -#endif // end initialisations // nonzero count @@ -1291,13 +1309,17 @@ namespace grb { const size_t k_col = A_raw.row_index[ k ]; coors1.assign( k_col ); (void) ++nzc; - (void) ++CCS_raw.col_start[ k_col + 1 ]; + if( !crs_only ) { + (void) ++CCS_raw.col_start[ k_col + 1 ]; + } } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { const size_t l_col = B_raw.row_index[ l ]; if( not coors1.assigned( l_col ) ) { (void) ++nzc; - (void) ++CCS_raw.col_start[ l_col + 1 ]; + if( !crs_only ) { + (void) ++CCS_raw.col_start[ l_col + 1 ]; + } } } } @@ -1317,18 +1339,22 @@ namespace grb { } 
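// ---------------------------------------------------------------------------
// Editorial aside -- illustrative sketch only, NOT part of this patch. The
// EXECUTE-phase hunks around this point build the output CCS with the usual
// three steps: count the nonzeroes of column j into col_start[ j + 1 ], turn
// the counts into offsets with a prefix sum, then scatter row indices and
// values using a per-column cursor (C_col_index above, which fills each
// column from its upper offset downwards). A minimal standalone version of
// that pattern follows; all names are hypothetical, and for simplicity it
// fills each column front-to-back instead.
#include <cstddef>
#include <vector>

struct Entry { std::size_t i, j; double v; }; // one nonzero: row, column, value

void buildCCS(
	const std::vector< Entry > &entries, const std::size_t n,
	std::vector< std::size_t > &col_start, // column offsets, size n + 1
	std::vector< std::size_t > &row_index, // row of each nonzero, size nnz
	std::vector< double > &values          // value of each nonzero, size nnz
) {
	// 1) count the nonzeroes of column j into col_start[ j + 1 ]
	col_start.assign( n + 1, 0 );
	for( const Entry &e : entries ) {
		(void) ++col_start[ e.j + 1 ];
	}
	// 2) prefix sum turns per-column counts into starting offsets
	for( std::size_t j = 1; j <= n; ++j ) {
		col_start[ j ] += col_start[ j - 1 ];
	}
	// 3) scatter, advancing a per-column cursor as slots are consumed
	row_index.resize( entries.size() );
	values.resize( entries.size() );
	std::vector< std::size_t > cursor( n, 0 );
	for( const Entry &e : entries ) {
		const std::size_t pos = col_start[ e.j ] + cursor[ e.j ]++;
		row_index[ pos ] = e.i;
		values[ pos ] = e.v;
	}
}
// The two passes keep the construction at O(nnz + n) work, which is why the
// same count / prefix-sum / scatter shape recurs throughout this series.
// ---------------------------------------------------------------------------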
// prefix sum for CCS_raw.col_start - assert( CCS_raw.col_start[ 0 ] == 0 ); - for( size_t j = 1; j < n; ++j ) { - CCS_raw.col_start[ j + 1 ] += CCS_raw.col_start[ j ]; + if( !crs_only ) { + assert( CCS_raw.col_start[ 0 ] == 0 ); + for( size_t j = 1; j < n; ++j ) { + CCS_raw.col_start[ j + 1 ] += CCS_raw.col_start[ j ]; + } + assert( CCS_raw.col_start[ n ] == nzc ); } - assert( CCS_raw.col_start[ n ] == nzc ); // set C_col_index to all zero + if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel for simd + #pragma omp parallel for simd #endif - for( size_t j = 0; j < n; j++ ) { - C_col_index[ j ] = 0; + for( size_t j = 0; j < n; j++ ) { + C_col_index[ j ] = 0; + } } @@ -1367,31 +1393,34 @@ namespace grb { CRS_raw.row_index[ nzc ] = j; CRS_raw.setValue( nzc, valbuf[ j ] ); // update CCS - C_col_index[ j ]++; - const size_t CCS_index = CCS_raw.col_start[ j+1 ] - C_col_index[ j ]; - CCS_raw.row_index[ CCS_index ] = i; - CCS_raw.setValue( CCS_index, valbuf[ j ] ); + if( !crs_only ) { + const size_t CCS_index = CCS_raw.col_start[ j+1 ] - C_col_index[ j ]++; + CCS_raw.row_index[ CCS_index ] = i; + CCS_raw.setValue( CCS_index, valbuf[ j ] ); + } // update count (void)++nzc; } CRS_raw.col_start[ i + 1 ] = nzc; } + if( !crs_only ) { #ifdef _DEBUG - std::cout << "CCS_raw.col_start = [ "; - for( size_t j = 0; j <= n; ++j ) - std::cout << CCS_raw.col_start[ j ] << " "; - std::cout << "]\n"; - std::cout << "C_col_index = [ "; - for( size_t j = 0; j < n; ++j ) - std::cout << C_col_index[ j ] << " "; - std::cout << "]\n"; + std::cout << "CCS_raw.col_start = [ "; + for( size_t j = 0; j <= n; ++j ) + std::cout << CCS_raw.col_start[ j ] << " "; + std::cout << "]\n"; + std::cout << "C_col_index = [ "; + for( size_t j = 0; j < n; ++j ) + std::cout << C_col_index[ j ] << " "; + std::cout << "]\n"; #endif #ifndef NDEBUG - for( size_t j = 0; j < n; ++j ) { - assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); - } + for( size_t j = 0; j < n; ++j ) { + assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); + } #endif + } // set final number of nonzeroes in output matrix #ifdef _DEBUG From 7eba1fed1c5ff9af73c5c101fd041157e18b4e4a Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Tue, 31 Oct 2023 16:23:47 +0100 Subject: [PATCH 16/37] Bugfix + improvement --- include/graphblas/reference/blas3.hpp | 77 +++++++++++++++++---------- 1 file changed, 48 insertions(+), 29 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index a060ea408..a49b16c48 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -992,20 +992,17 @@ namespace grb { const auto dummy_identity = identities::zero< OutputType >::value(); // retrieve buffers - char * arr1, * arr2, * arr3, * buf1, * buf2, * buf3; - arr1 = arr2 = buf1 = buf2 = nullptr; + char * arr1, * arr3, * buf1, * buf3; + arr1 = buf1 = nullptr; InputType1 * vbuf1 = nullptr; - InputType2 * vbuf2 = nullptr; OutputType * valbuf = nullptr; internal::getMatrixBuffers( arr1, buf1, vbuf1, 1, A ); - internal::getMatrixBuffers( arr2, buf2, vbuf2, 1, B ); internal::getMatrixBuffers( arr3, buf3, valbuf, 1, C ); // end buffer retrieval // initialisations - internal::Coordinates< reference > coors1, coors2; + internal::Coordinates< reference > coors1; coors1.set( arr1, false, buf1, n ); - coors2.set( arr2, false, buf2, n ); if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 #pragma omp parallel for simd default(none) \ @@ 
-1022,6 +1019,7 @@ namespace grb { // symbolic phase if( phase == RESIZE ) { + nzc = 0; for( size_t i = 0; i < m; ++i ) { coors1.clear(); for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { @@ -1045,6 +1043,7 @@ namespace grb { // computational phase if( phase == EXECUTE ) { + nzc = 0; // retrieve additional buffer config::NonzeroIndexType * const C_col_index = internal::template getReferenceBuffer< typename config::NonzeroIndexType >( n + 1 ); @@ -1101,9 +1100,11 @@ namespace grb { } // do computations - size_t nzc = 0; + nzc = 0; CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { + coors1.clear(); + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; coors1.assign( k_col ); @@ -1111,57 +1112,75 @@ namespace grb { } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { - const size_t l_col = B_raw.row_index[ l ]; - if( !coors1.assigned( l_col ) ) { // Union case: ignored + const size_t j = B_raw.row_index[ l ]; + if( !coors1.assigned( j ) ) { // Union case: ignored continue; } - const auto valbuf_value_before = valbuf[ l_col ]; - (void)grb::apply( valbuf[ l_col ], valbuf_value_before, B_raw.getValue( l, dummy_identity ), oper ); - coors2.assign( l_col ); - } - coors1.clear(); + const auto valbuf_value_before = valbuf[ j ]; + OutputType result_value; + (void)grb::apply( result_value, valbuf_value_before, B_raw.getValue( l, dummy_identity ), oper ); - for( size_t j_unsigned = n ; j_unsigned > 0 ; j_unsigned-- ) { - const size_t j = j_unsigned - 1; - if( !coors2.assigned( j ) ) { - continue; - } // update CRS CRS_raw.row_index[ nzc ] = j; - CRS_raw.setValue( nzc, valbuf[ j ] ); + CRS_raw.setValue( nzc, result_value ); + // update CCS if( !crs_only ) { C_col_index[ j ]++; const size_t CCS_index = CCS_raw.col_start[ j+1 ] - C_col_index[ j ]; CCS_raw.row_index[ CCS_index ] = i; - CCS_raw.setValue( CCS_index, valbuf[ j ] ); + CCS_raw.setValue( CCS_index, result_value ); } + // update count (void)++nzc; } + CRS_raw.col_start[ i + 1 ] = nzc; - coors2.clear(); } - if( !crs_only ) { + #ifdef _DEBUG + std::cout << "CRS_raw.col_start = [ "; + for( size_t j = 0; j <= m; ++j ) + std::cout << CRS_raw.col_start[ j ] << " "; + std::cout << "]\n"; + std::cout << "CRS_raw.row_index = [ "; + for( size_t j = 0; j < nzc; ++j ) + std::cout << CRS_raw.row_index[ j ] << " "; + std::cout << "]\n"; + std::cout << "CRS_raw.values = [ "; + for( size_t j = 0; j < nzc; ++j ) + std::cout << CRS_raw.values[ j ] << " "; + std::cout << "]\n"; + if( !crs_only ) { + std::cout << "C_col_index = [ "; + for( size_t j = 0; j < n; ++j ) + std::cout << C_col_index[ j ] << " "; + std::cout << "]\n"; std::cout << "CCS_raw.col_start = [ "; for( size_t j = 0; j <= n; ++j ) std::cout << CCS_raw.col_start[ j ] << " "; std::cout << "]\n"; - std::cout << "C_col_index = [ "; - for( size_t j = 0; j < n; ++j ) - std::cout << C_col_index[ j ] << " "; + std::cout << "CCS_raw.row_index = [ "; + for( size_t j = 0; j < nzc; ++j ) + std::cout << CCS_raw.row_index[ j ] << " "; + std::cout << "]\n"; + std::cout << "CCS_raw.values = [ "; + for( size_t j = 0; j < nzc; ++j ) + std::cout << CCS_raw.values[ j ] << " "; std::cout << "]\n"; + } #endif + #ifndef NDEBUG - for( size_t j = 0; j < n; ++j ) { + if( !crs_only ) { + for( size_t j = 0; j < n; ++j ) assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); - } -#endif } +#endif // set final number of nonzeroes in output matrix #ifdef _DEBUG From 
b84c55dbc59cc29e153a462a6f04256723b24f49 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Wed, 1 Nov 2023 11:56:33 +0100 Subject: [PATCH 17/37] New unit-test cases --- tests/unit/CMakeLists.txt | 1 + tests/unit/eWiseApplyMatrix_variants.cpp | 218 +++++++++++++++++------ 2 files changed, 161 insertions(+), 58 deletions(-) diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 815db9d2b..5ac228625 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -87,6 +87,7 @@ add_grb_executables( ewiseapply ewiseapply.cpp add_grb_executables( eWiseApplyMatrix_variants eWiseApplyMatrix_variants.cpp BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking + ADDITIONAL_LINK_LIBRARIES test_utils_headers ) add_grb_executables( eWiseMatrix eWiseMatrix.cpp diff --git a/tests/unit/eWiseApplyMatrix_variants.cpp b/tests/unit/eWiseApplyMatrix_variants.cpp index 711b8ca15..cab4b4ec7 100644 --- a/tests/unit/eWiseApplyMatrix_variants.cpp +++ b/tests/unit/eWiseApplyMatrix_variants.cpp @@ -39,9 +39,11 @@ #include #include +#include using namespace grb; + using nz_type = int; constexpr nz_type A_INITIAL_VALUE = 1; @@ -53,23 +55,26 @@ bool equals_matrix( const Matrix< D > &A, const Matrix< D > &B ) { - if( nrows( A ) != nrows( B ) || ncols( A ) != ncols( B ) ){ + if( nrows( A ) != nrows( B ) || + ncols( A ) != ncols( B ) || + nnz( A ) != nnz( B ) + ) { return false; } wait( A ); wait( B ); - std::vector< - std::pair< std::pair< size_t, size_t >, D > + std::vector< + std::pair< std::pair< size_t, size_t >, D > > A_vec( A.cbegin(), A.cend() ); - std::vector< - std::pair< std::pair< size_t, size_t >, D > + std::vector< + std::pair< std::pair< size_t, size_t >, D > > B_vec( B.cbegin(), B.cend() ); return std::is_permutation( A_vec.cbegin(), A_vec.cend(), B_vec.cbegin() ); } -template< class Monoid > +template< class Monoid, Descriptor descr = descriptors::no_operation > struct input_t { const Matrix< nz_type > &A; const Matrix< nz_type > &B; @@ -82,11 +87,11 @@ struct input_t { const Matrix< nz_type > &B = {0,0}, const Matrix< nz_type > &C_monoid = {0,0}, const Matrix< nz_type > &C_operator = {0,0}, - const Monoid &monoid = Monoid() - ) : A( A ), - B( B ), + const Monoid &monoid = Monoid() + ) : A( A ), + B( B ), C_monoid( C_monoid ), - C_operator( C_operator ), + C_operator( C_operator ), monoid( monoid ) {} }; @@ -94,32 +99,35 @@ struct output_t { RC rc; }; -template< class Monoid > -void grb_program( const input_t< Monoid > &input, output_t &output ) { +template< class Monoid, Descriptor descr > +void grb_program( const input_t< Monoid, descr > &input, output_t &output ) { static_assert( is_monoid< Monoid >::value, "Monoid required" ); const auto &op = input.monoid.getOperator(); - wait( input.A ); - wait( input.B ); RC &rc = output.rc; { // Operator variant - std::cout << "-- eWiseApply using Operator, supposed to be" + std::cout << " -- eWiseApply using Operator, supposed to be" << " annihilating non-zeroes -> INTERSECTION\n"; Matrix< nz_type > C( nrows( input.A ), ncols( input.A ) ); - rc = eWiseApply( C, input.A, input.B, op, RESIZE ); - wait( C ); + + rc = eWiseApply( C, input.A, input.B, op, RESIZE ); if( rc != SUCCESS ) { std::cerr << "Error: Phase::RESIZE\n"; return; } - rc = eWiseApply( C, input.A, input.B, op, EXECUTE ); - wait( C ); + if( capacity( C ) < nnz( input.C_operator ) ) { + std::cerr << "Error: Capacity should be at least " << nnz( input.C_operator ) << "\n"; + rc = FAILED; + return; + } + + rc = eWiseApply( C, input.A, input.B, op, EXECUTE ); if( 
rc != SUCCESS ) { std::cerr << "Error: Phase::EXECUTE\n"; return; } - + print_matrix( C, 10, "C (intersection)" ); if( !equals_matrix( C, input.C_operator ) ) { std::cerr << "Error: Wrong result\n"; rc = FAILED; @@ -130,22 +138,27 @@ void grb_program( const input_t< Monoid > &input, output_t &output ) { } { // Monoid variant - std::cout << "-- eWiseApply using Monoid, supposed to consider" + std::cout << " -- eWiseApply using Monoid, supposed to consider" << " non-zeroes as the identity -> UNION\n"; Matrix< nz_type > C( nrows( input.A ), ncols( input.A ) ); - rc = eWiseApply( C, input.A, input.B, input.monoid, RESIZE ); - wait( C ); + + rc = eWiseApply( C, input.A, input.B, input.monoid, RESIZE ); if( rc != SUCCESS ) { std::cerr << "Error: Phase::RESIZE\n"; return; } - rc = eWiseApply( C, input.A, input.B, input.monoid, EXECUTE ); - wait( C ); + if( capacity( C ) < nnz( input.C_operator ) ) { + std::cerr << "Error: Capacity should be at least " << nnz( input.C_monoid ) << "\n"; + rc = FAILED; + return; + } + + rc = eWiseApply( C, input.A, input.B, input.monoid, EXECUTE ); if( rc != SUCCESS ) { std::cerr << "Error: Phase::EXECUTE\n"; return; } - + print_matrix( C, 10, "C (union)" ); if( !equals_matrix( C, input.C_monoid ) ) { std::cerr << "Error: Wrong result\n"; rc = FAILED; @@ -165,7 +178,7 @@ int main( int argc, char ** argv ) { size_t N = 10; if( argc > 2 ) { - std::cout << "Usage: " << argv[ 0 ] << std::endl; + std::cout << "Usage: " << argv[ 0 ] << " [n=" << N << "]" << std::endl; return 1; } if( argc == 2 ) { @@ -186,12 +199,14 @@ int main( int argc, char ** argv ) { * (...) */ Matrix< nz_type > A( N, N, N ); - std::vector< size_t > A_rows( N, 0 ), A_cols( N, 0 ); - std::vector< nz_type > A_values( N, A_INITIAL_VALUE ); - std::iota( A_cols.begin(), A_cols.end(), 0 ); - if( SUCCESS != - buildMatrixUnique( A, A_rows.data(), A_cols.data(), A_values.data(), A_values.size(), SEQUENTIAL ) - ) { return 2; } + { + std::vector< size_t > A_rows( N, 0 ), A_cols( N, 0 ); + std::vector< nz_type > A_values( N, A_INITIAL_VALUE ); + std::iota( A_cols.begin(), A_cols.end(), 0 ); + if( SUCCESS != + buildMatrixUnique( A, A_rows.data(), A_cols.data(), A_values.data(), A_values.size(), SEQUENTIAL ) + ) { return 2; } + } /** Matrix B: Column matrix filled with B_INITIAL_VALUE * Y _ _ _ _ @@ -202,14 +217,17 @@ int main( int argc, char ** argv ) { * (...) 
*/ Matrix< nz_type > B( N, N, N ); - std::vector< size_t > B_rows( N, 0 ), B_cols( N, 0 ); - std::vector< nz_type > B_values( N, B_INITIAL_VALUE ); - std::iota( B_rows.begin(), B_rows.end(), 0 ); - if( SUCCESS != - buildMatrixUnique( B, B_rows.data(), B_cols.data(), B_values.data(), B_values.size(), SEQUENTIAL ) - ) { return 3; } - { + std::vector< size_t > B_rows( N, 0 ), B_cols( N, 0 ); + std::vector< nz_type > B_values( N, B_INITIAL_VALUE ); + std::iota( B_rows.begin(), B_rows.end(), 0 ); + if( SUCCESS != + buildMatrixUnique( B, B_rows.data(), B_cols.data(), B_values.data(), B_values.size(), SEQUENTIAL ) + ) { return 3; } + } + + { // C = A .+ B + std::cout << "-- Test C = A .+ B\n"; /** Matrix C_monoid_truth: Union of A and B * X+Y X X X X * Y ___ ___ ___ ___ @@ -260,28 +278,112 @@ int main( int argc, char ** argv ) { ) ) { return 5; } - { /** Test using addition operator, same type for lhs and rhs - */ - input_t< - Monoid< operators::add< nz_type >, identities::zero > - > input { A, B, C_monoid_truth, C_op_truth }; - output_t output { SUCCESS }; - // Run the test - RC rc = launcher.exec( &grb_program, input, output, false ); - // Check the result - if( rc != SUCCESS ) { - std::cerr << "Error: Launcher::exec\n"; - return 6; - } - if( output.rc != SUCCESS ) { - std::cerr << "Test FAILED (" << toString( output.rc ) << ")" << std::endl; - return 7; - } + input_t< + Monoid< operators::add< nz_type >, identities::zero > + > input { A, B, C_monoid_truth, C_op_truth }; + output_t output { SUCCESS }; + // Run the test + RC rc = launcher.exec( &grb_program, input, output, false ); + // Check the result + if( rc != SUCCESS ) { + std::cerr << "Error: Launcher::exec\n"; + return 6; + } + if( output.rc != SUCCESS ) { + std::cerr << "Test FAILED (" << toString( output.rc ) << ")" << std::endl; + return 7; + } + } + + { // C = A .+ A + std::cout << "-- Test C = A .+ A\n"; + /** Matrix C_truth: Union/intersection of A and A + * X+X X+X X+X X+X X+X + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___(...) + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___ + * (...) + */ + Matrix< nz_type > C_truth( N, N ); + size_t nvalues = ncols( A ); + std::vector< size_t > C_truth_rows( nvalues, 0 ), C_truth_cols( nvalues, 0 ); + std::vector< nz_type > C_truth_values( nvalues, A_INITIAL_VALUE+A_INITIAL_VALUE ); + std::iota( C_truth_cols.begin(), C_truth_cols.end(), 0 ); + if( SUCCESS != + buildMatrixUnique( + C_truth, + C_truth_rows.data(), + C_truth_cols.data(), + C_truth_values.data(), + C_truth_values.size(), + SEQUENTIAL + ) + ) { return 8; } + + input_t< + Monoid< operators::add< nz_type >, identities::zero > + > input { A, A, C_truth, C_truth }; + output_t output { SUCCESS }; + // Run the test + RC rc = launcher.exec( &grb_program, input, output, false ); + // Check the result + if( rc != SUCCESS ) { + std::cerr << "Error: Launcher::exec\n"; + return 9; + } + if( output.rc != SUCCESS ) { + std::cerr << "Test FAILED (" << toString( output.rc ) << ")" << std::endl; + return 10; + } + } + + { // C = A .+ Bt + std::cout << "-- Test C = A .+ Bt\n"; + /** Matrix C_truth: Union/intersection of A and Bt + * X+Y X+Y X+Y X+Y X+Y + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___(...) + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___ + * (...) 
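+ *
+ * Since the descriptors::transpose_right descriptor reads B as its transpose,
+ * the nonzero pattern of Bt coincides with that of A (row 0 only); union and
+ * intersection therefore agree, which is why the same C_truth is passed below
+ * as the reference for both the monoid and the operator variant.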
+ */ + Matrix< nz_type > C_truth( N, N ); + size_t nvalues = ncols( A ); + std::vector< size_t > C_truth_rows( nvalues, 0 ), C_truth_cols( nvalues, 0 ); + std::vector< nz_type > C_truth_values( nvalues, A_INITIAL_VALUE+B_INITIAL_VALUE ); + std::iota( C_truth_cols.begin(), C_truth_cols.end(), 0 ); + if( SUCCESS != + buildMatrixUnique( + C_truth, + C_truth_rows.data(), + C_truth_cols.data(), + C_truth_values.data(), + C_truth_values.size(), + SEQUENTIAL + ) + ) { return 8; } + + input_t< + Monoid< operators::add< nz_type >, identities::zero >, + descriptors::transpose_right + > input { A, B, C_truth, C_truth }; + output_t output { SUCCESS }; + // Run the test + RC rc = launcher.exec( &grb_program, input, output, false ); + // Check the result + if( rc != SUCCESS ) { + std::cerr << "Error: Launcher::exec\n"; + return 9; + } + if( output.rc != SUCCESS ) { + std::cerr << "Test FAILED (" << toString( output.rc ) << ")" << std::endl; + return 10; } } std::cerr << std::flush; std::cout << "Test OK" << std::endl << std::flush; - + return 0; } From 4156581548e483d01e7320dda48b50aec60ae6f1 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Wed, 1 Nov 2023 14:53:29 +0100 Subject: [PATCH 18/37] Bugfix for union pattern --- include/graphblas/reference/blas3.hpp | 104 ++++++++++++++++++++------ tests/unit/spy.cpp | 3 + 2 files changed, 85 insertions(+), 22 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index a49b16c48..f4c9fd09c 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1293,6 +1293,7 @@ namespace grb { // symbolic phase if( phase == RESIZE ) { + nzc = 0; for( size_t i = 0; i < m; ++i ) { coors1.clear(); for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { @@ -1322,6 +1323,7 @@ namespace grb { getReferenceBuffer< typename config::NonzeroIndexType >( n + 1 ); // perform column-wise nonzero count + nzc = 0; for( size_t i = 0; i < m; ++i ) { coors1.clear(); for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { @@ -1379,13 +1381,12 @@ namespace grb { // do computations - size_t nzc = 0; + nzc = 0; CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { -#ifdef _DEBUG - std::cout << " -- i: " << i << "\n"; -#endif coors1.clear(); + coors2.clear(); + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; coors1.assign( k_col ); @@ -1393,47 +1394,106 @@ namespace grb { } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { - const size_t l_col = B_raw.row_index[ l ]; + const size_t j = B_raw.row_index[ l ]; const auto B_val = B_raw.getValue( l, identity_B ); - if( !coors1.assigned( l_col ) ) { // Union case - coors1.assign( l_col ); - valbuf[ l_col ] = identity_A; + if( !coors1.assigned( j ) ) { // Union case + valbuf[ j ] = identity_A; + } else { + coors2.assign( j ); + } + + const auto valbuf_value_before = valbuf[ j ]; + OutputType result_value; + (void)grb::apply( result_value, valbuf_value_before, B_val, oper ); + + // update CRS + CRS_raw.row_index[ nzc ] = j; + CRS_raw.setValue( nzc, result_value ); + + // update CCS + if( !crs_only ) { + const size_t CCS_index = CCS_raw.col_start[ j+1 ] - ++C_col_index[ j ]; +#ifdef NDEBUG + assert( CCS_index < capacity( C ) ); + assert( CCS_index < CCS_raw.col_start[ j+1 ] ); + assert( CCS_index >= CCS_raw.col_start[ j ] ); +#endif + CCS_raw.row_index[ CCS_index ] = i; + CCS_raw.setValue( CCS_index, result_value ); } - 
const auto valbuf_value_before = valbuf[ l_col ]; - (void)grb::apply( valbuf[ l_col ], valbuf_value_before, B_val, oper ); + // update count + (void)++nzc; } - for( size_t j_unsigned = n ; j_unsigned > 0 ; j_unsigned-- ) { - const size_t j = j_unsigned - 1; - if( !coors1.assigned( j ) ) { + for( size_t l = A_raw.col_start[ i ]; l < A_raw.col_start[ i + 1 ]; ++l ) { + const size_t j = A_raw.row_index[ l ]; + if( coors2.assigned( j ) ) { // Intersection case: already done before continue; } +#ifdef NDEBUG + assert( !coors1.assigned( j ) ); // Union case: already done before +#endif + + const auto A_val = A_raw.getValue( l, identity_A ); + OutputType result_value; + (void)grb::apply( result_value, A_val, identity_B, oper ); + // update CRS CRS_raw.row_index[ nzc ] = j; - CRS_raw.setValue( nzc, valbuf[ j ] ); + CRS_raw.setValue( nzc, result_value ); + // update CCS if( !crs_only ) { - const size_t CCS_index = CCS_raw.col_start[ j+1 ] - C_col_index[ j ]++; + const size_t CCS_index = CCS_raw.col_start[ j+1 ] - ++C_col_index[ j ]; +#ifdef NDEBUG + assert( CCS_index < capacity( C ) ); + assert( CCS_index < CCS_raw.col_start[ j+1 ] ); + assert( CCS_index >= CCS_raw.col_start[ j ] ); +#endif CCS_raw.row_index[ CCS_index ] = i; - CCS_raw.setValue( CCS_index, valbuf[ j ] ); + CCS_raw.setValue( CCS_index, result_value ); } // update count (void)++nzc; } + CRS_raw.col_start[ i + 1 ] = nzc; } if( !crs_only ) { #ifdef _DEBUG - std::cout << "CCS_raw.col_start = [ "; - for( size_t j = 0; j <= n; ++j ) - std::cout << CCS_raw.col_start[ j ] << " "; + std::cout << "CRS_raw.col_start = [ "; + for( size_t j = 0; j <= m; ++j ) + std::cout << CRS_raw.col_start[ j ] << " "; std::cout << "]\n"; - std::cout << "C_col_index = [ "; - for( size_t j = 0; j < n; ++j ) - std::cout << C_col_index[ j ] << " "; + std::cout << "CRS_raw.row_index = [ "; + for( size_t j = 0; j < nzc; ++j ) + std::cout << CRS_raw.row_index[ j ] << " "; std::cout << "]\n"; + std::cout << "CRS_raw.values = [ "; + for( size_t j = 0; j < nzc; ++j ) + std::cout << CRS_raw.values[ j ] << " "; + std::cout << "]\n"; + if( !crs_only ) { + std::cout << "C_col_index = [ "; + for( size_t j = 0; j < n; ++j ) + std::cout << C_col_index[ j ] << " "; + std::cout << "]\n"; + std::cout << "CCS_raw.col_start = [ "; + for( size_t j = 0; j <= n; ++j ) + std::cout << CCS_raw.col_start[ j ] << " "; + std::cout << "]\n"; + std::cout << "CCS_raw.row_index = [ "; + for( size_t j = 0; j < nzc; ++j ) + std::cout << CCS_raw.row_index[ j ] << " "; + std::cout << "]\n"; + std::cout << "CCS_raw.values = [ "; + for( size_t j = 0; j < nzc; ++j ) + std::cout << CCS_raw.values[ j ] << " "; + std::cout << "]\n"; + } #endif + #ifndef NDEBUG for( size_t j = 0; j < n; ++j ) { assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); diff --git a/tests/unit/spy.cpp b/tests/unit/spy.cpp index 780216d7f..ce6c1759d 100644 --- a/tests/unit/spy.cpp +++ b/tests/unit/spy.cpp @@ -82,6 +82,7 @@ void grb_program( const void * const fn_p, const size_t fn_length, grb::RC & rc if( rc == grb::SUCCESS ) { grb::Matrix< double > chk( p, q ); rc = rc ? rc : grb::resize( chk, grb::nnz( spy ) ); + rc = rc ? rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeMonoid(), grb::Phase::RESIZE ); rc = rc ? 
rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeMonoid() ); if( rc == grb::SUCCESS && grb::nnz( chk ) != grb::nnz( spy ) ) { std::cerr << "Unexpected number of nonzeroes for chk: " << grb::nnz(chk) << ", expected " << grb::nnz(spy) << "\n"; @@ -114,6 +115,7 @@ void grb_program( const void * const fn_p, const size_t fn_length, grb::RC & rc if( rc == grb::SUCCESS ) { grb::Matrix< double > chk( p, q ); rc = rc ? rc : grb::resize( chk, nnz( spy ) ); + rc = rc ? rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeMonoid(), grb::Phase::RESIZE ); rc = rc ? rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeMonoid() ); if( rc == grb::SUCCESS && grb::nnz( chk ) != grb::nnz( spy ) ) { std::cerr << "Unexpected number of nonzeroes for chk (pattern): " << grb::nnz(chk) << ", expected " << grb::nnz(spy) << "\n"; @@ -146,6 +148,7 @@ void grb_program( const void * const fn_p, const size_t fn_length, grb::RC & rc if( rc == grb::SUCCESS ) { grb::Matrix< double > chk( p, q ); rc = rc ? rc : grb::resize( chk, nnz( spy ) ); + rc = rc ? rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeMonoid(), grb::Phase::RESIZE ); rc = rc ? rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeMonoid() ); if( rc == grb::SUCCESS && grb::nnz( chk ) != grb::nnz( spy ) ) { std::cerr << "Unexpected number of nonzeroes for chk (boolean): " << grb::nnz(chk) << ", expected " << grb::nnz(spy) << "\n"; From 30f8a302b53b0d40d65b14dd6b33d1173ddb3bc7 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Wed, 1 Nov 2023 16:39:29 +0100 Subject: [PATCH 19/37] Parallel iteration of coordinates --- include/graphblas/reference/blas3.hpp | 139 ++++++++++++++++++-------- 1 file changed, 95 insertions(+), 44 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index f4c9fd09c..f8049f370 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1105,10 +1105,38 @@ namespace grb { for( size_t i = 0; i < m; ++i ) { coors1.clear(); - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; - coors1.assign( k_col ); - valbuf[ k_col ] = A_raw.getValue( k, dummy_identity ); +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp parallel default(none) \ + shared(coors1, valbuf) \ + firstprivate(i, A_raw, dummy_identity) +#endif + { +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + auto local_update = coors1.EMPTY_UPDATE(); + const size_t maxAsyncAssigns = coors1.maxAsyncAssigns(); + size_t assigns = 0; + #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait +#endif + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; + +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + if( !coors1.asyncAssign( k_col, local_update ) ) { + valbuf[ k_col ] = A_raw.getValue( k, dummy_identity ); + if( ++assigns == maxAsyncAssigns ) { + coors1.joinUpdate( local_update ); + assigns = 0; + } + } +#else + if( !coors1.assign( k_col ) ) { + valbuf[ k_col ] = A_raw.getValue( k, dummy_identity ); + } +#endif + } +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + while( !coors1.joinUpdate( local_update ) ) {} +#endif } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { @@ -1387,56 +1415,79 @@ namespace grb { coors1.clear(); coors2.clear(); - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; - coors1.assign( k_col ); - valbuf[ k_col ] = 
A_raw.getValue( k, identity_A ); - } +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp parallel default(none) \ + shared(coors1, vbuf1, coors2, vbuf2) \ + firstprivate(i, A_raw, identity_A, B_raw, identity_B ) +#endif + { + auto local_update1 = coors1.EMPTY_UPDATE(); + { +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + const size_t maxAsyncAssigns1 = coors1.maxAsyncAssigns(); + size_t assigns1 = 0; + #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait +#endif + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; - for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { - const size_t j = B_raw.row_index[ l ]; - const auto B_val = B_raw.getValue( l, identity_B ); - if( !coors1.assigned( j ) ) { // Union case - valbuf[ j ] = identity_A; - } else { - coors2.assign( j ); +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + if( !coors1.asyncAssign( k_col, local_update1 ) ) { + vbuf1[ k_col ] = A_raw.getValue( k, identity_A ); + if( ++assigns1 == maxAsyncAssigns1 ) { + coors1.joinUpdate( local_update1 ); + assigns1 = 0; + } + } +#else + if( !coors1.assign( k_col ) ) { + vbuf1[ k_col ] = A_raw.getValue( k, identity_A ); + } +#endif + } } +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + while( !coors1.joinUpdate( local_update1 )) {} +#endif - const auto valbuf_value_before = valbuf[ j ]; - OutputType result_value; - (void)grb::apply( result_value, valbuf_value_before, B_val, oper ); - - // update CRS - CRS_raw.row_index[ nzc ] = j; - CRS_raw.setValue( nzc, result_value ); + auto local_update2 = coors2.EMPTY_UPDATE(); + { +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + const size_t maxAsyncAssigns2 = coors2.maxAsyncAssigns(); + size_t assigns2 = 0; + #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait +#endif + for( size_t k = B_raw.col_start[ i ]; k < B_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = B_raw.row_index[ k ]; - // update CCS - if( !crs_only ) { - const size_t CCS_index = CCS_raw.col_start[ j+1 ] - ++C_col_index[ j ]; -#ifdef NDEBUG - assert( CCS_index < capacity( C ) ); - assert( CCS_index < CCS_raw.col_start[ j+1 ] ); - assert( CCS_index >= CCS_raw.col_start[ j ] ); +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + if( !coors2.asyncAssign( k_col, local_update2 ) ) { + vbuf2[ k_col ] = B_raw.getValue( k, identity_B ); + if( ++assigns2 == maxAsyncAssigns2 ) { + coors2.joinUpdate( local_update2 ); + assigns2 = 0; + } + } +#else + if( !coors2.assign( k_col ) ) { + vbuf2[ k_col ] = B_raw.getValue( k, identity_B ); + } #endif - CCS_raw.row_index[ CCS_index ] = i; - CCS_raw.setValue( CCS_index, result_value ); + } } - // update count - (void)++nzc; +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + while( !coors2.joinUpdate( local_update2 )) {} +#endif } - for( size_t l = A_raw.col_start[ i ]; l < A_raw.col_start[ i + 1 ]; ++l ) { - const size_t j = A_raw.row_index[ l ]; - if( coors2.assigned( j ) ) { // Intersection case: already done before - continue; - } -#ifdef NDEBUG - assert( !coors1.assigned( j ) ); // Union case: already done before -#endif + for( size_t k = 0; k < std::max( coors1.nonzeroes(), coors2.nonzeroes() ); ++k ) { + const auto& assigned_coors = coors1.assigned(k) ? coors1 : coors2; + const auto j = assigned_coors.index( k ); + const auto A_val = coors1.assigned(k) ? vbuf1[ j ] : identity_A; + const auto B_val = coors2.assigned(k) ? 
vbuf2[ j ] : identity_B; - const auto A_val = A_raw.getValue( l, identity_A ); OutputType result_value; - (void)grb::apply( result_value, A_val, identity_B, oper ); + (void)grb::apply( result_value, A_val, B_val, oper ); // update CRS CRS_raw.row_index[ nzc ] = j; From 4b2b27b4cf9888256f8ff907e6cc80ebc816ca39 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Wed, 1 Nov 2023 18:04:03 +0100 Subject: [PATCH 20/37] Void values bugfix for test --- include/graphblas/reference/blas3.hpp | 40 +++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index f8049f370..5e2bf9d0a 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -57,6 +57,34 @@ "********************************************************************" \ "******************************\n" ); +#ifndef _H_GRB_REFERENCE_BLAS3_ACCESSORS +#define _H_GRB_REFERENCE_BLAS3_ACCESSORS + +namespace grb::internal +{ + template< typename D, typename T > + static inline void assignValue( + D *array, size_t i, const T& value, + typename std::enable_if< !std::is_void< D >::value >::type * const = nullptr + ) { array[i] = value; } + + template< typename T > + static inline void assignValue( void *, size_t, const T& ) { /* do nothing */ } + + template< typename D, typename T > + static inline T getValue( + const D *array, size_t i, const T&, + typename std::enable_if< !std::is_void< D >::value >::type * const = nullptr + ) { return array[i]; } + + template< typename T > + static inline T getValue( const void *, size_t, const T& identity ) { return identity; } + +} // namespace grb::internal + +#endif // _H_GRB_REFERENCE_BLAS3_ACCESSORS + + namespace grb { namespace internal { @@ -1433,7 +1461,7 @@ namespace grb { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 if( !coors1.asyncAssign( k_col, local_update1 ) ) { - vbuf1[ k_col ] = A_raw.getValue( k, identity_A ); + assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); if( ++assigns1 == maxAsyncAssigns1 ) { coors1.joinUpdate( local_update1 ); assigns1 = 0; @@ -1441,7 +1469,7 @@ namespace grb { } #else if( !coors1.assign( k_col ) ) { - vbuf1[ k_col ] = A_raw.getValue( k, identity_A ); + assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); } #endif } @@ -1462,7 +1490,7 @@ namespace grb { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 if( !coors2.asyncAssign( k_col, local_update2 ) ) { - vbuf2[ k_col ] = B_raw.getValue( k, identity_B ); + assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); if( ++assigns2 == maxAsyncAssigns2 ) { coors2.joinUpdate( local_update2 ); assigns2 = 0; @@ -1470,7 +1498,7 @@ namespace grb { } #else if( !coors2.assign( k_col ) ) { - vbuf2[ k_col ] = B_raw.getValue( k, identity_B ); + assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); } #endif } @@ -1483,8 +1511,8 @@ namespace grb { for( size_t k = 0; k < std::max( coors1.nonzeroes(), coors2.nonzeroes() ); ++k ) { const auto& assigned_coors = coors1.assigned(k) ? coors1 : coors2; const auto j = assigned_coors.index( k ); - const auto A_val = coors1.assigned(k) ? vbuf1[ j ] : identity_A; - const auto B_val = coors2.assigned(k) ? vbuf2[ j ] : identity_B; + const auto A_val = coors1.assigned(k) ? getValue(vbuf1, j, identity_A) : identity_A; + const auto B_val = coors2.assigned(k) ? 
getValue(vbuf2, j, identity_B) : identity_B; OutputType result_value; (void)grb::apply( result_value, A_val, B_val, oper ); From 02330a8321bcd8d2939013ad73434e3bfbce3041 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Thu, 2 Nov 2023 13:13:32 +0100 Subject: [PATCH 21/37] Logic bugfix --- include/graphblas/reference/blas3.hpp | 121 ++++++++++++++++---------- 1 file changed, 76 insertions(+), 45 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 5e2bf9d0a..1f382f017 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1354,12 +1354,13 @@ namespace grb { coors1.clear(); for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; - coors1.assign( k_col ); - (void) ++nzc; + if( !coors1.assign( k_col ) ) { + (void) ++nzc; + } } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { const size_t l_col = B_raw.row_index[ l ]; - if( not coors1.assigned( l_col ) ) { + if( !coors1.assigned( l_col ) ) { (void) ++nzc; } } @@ -1384,15 +1385,16 @@ namespace grb { coors1.clear(); for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; - coors1.assign( k_col ); - (void) ++nzc; + if( !coors1.assign( k_col ) ) { + (void) ++nzc; + } if( !crs_only ) { (void) ++CCS_raw.col_start[ k_col + 1 ]; } } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { const size_t l_col = B_raw.row_index[ l ]; - if( not coors1.assigned( l_col ) ) { + if( !coors1.assigned( l_col ) ) { (void) ++nzc; if( !crs_only ) { (void) ++CCS_raw.col_start[ l_col + 1 ]; @@ -1449,70 +1451,99 @@ namespace grb { firstprivate(i, A_raw, identity_A, B_raw, identity_B ) #endif { - auto local_update1 = coors1.EMPTY_UPDATE(); - { + #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - const size_t maxAsyncAssigns1 = coors1.maxAsyncAssigns(); - size_t assigns1 = 0; - #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait + auto local_update1 = coors1.EMPTY_UPDATE(); + const size_t maxAsyncAssigns1 = coors1.maxAsyncAssigns(); + size_t assigns1 = 0; + #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait #endif - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - if( !coors1.asyncAssign( k_col, local_update1 ) ) { - assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); - if( ++assigns1 == maxAsyncAssigns1 ) { - coors1.joinUpdate( local_update1 ); - assigns1 = 0; - } + if( !coors1.asyncAssign( k_col, local_update1 ) ) { + assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); + if( ++assigns1 == maxAsyncAssigns1 ) { + coors1.joinUpdate( local_update1 ); + assigns1 = 0; } + } #else - if( !coors1.assign( k_col ) ) { - assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); - } -#endif + if( !coors1.assign( k_col ) ) { + assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); } +#endif } + #ifdef _H_GRB_REFERENCE_OMP_BLAS3 while( !coors1.joinUpdate( local_update1 )) {} #endif - auto local_update2 = coors2.EMPTY_UPDATE(); - { + #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - const size_t maxAsyncAssigns2 = coors2.maxAsyncAssigns(); - size_t assigns2 = 0; - #pragma omp for simd schedule( dynamic, 
config::CACHE_LINE_SIZE::value() ) nowait + auto local_update2 = coors2.EMPTY_UPDATE(); + const size_t maxAsyncAssigns2 = coors2.maxAsyncAssigns(); + size_t assigns2 = 0; + #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait #endif - for( size_t k = B_raw.col_start[ i ]; k < B_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = B_raw.row_index[ k ]; + for( size_t k = B_raw.col_start[ i ]; k < B_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = B_raw.row_index[ k ]; #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - if( !coors2.asyncAssign( k_col, local_update2 ) ) { - assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); - if( ++assigns2 == maxAsyncAssigns2 ) { - coors2.joinUpdate( local_update2 ); - assigns2 = 0; - } + if( !coors2.asyncAssign( k_col, local_update2 ) ) { + assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); + if( ++assigns2 == maxAsyncAssigns2 ) { + coors2.joinUpdate( local_update2 ); + assigns2 = 0; } + } #else - if( !coors2.assign( k_col ) ) { - assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); - } -#endif + if( !coors2.assign( k_col ) ) { + assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); } +#endif } #ifdef _H_GRB_REFERENCE_OMP_BLAS3 while( !coors2.joinUpdate( local_update2 )) {} #endif } - for( size_t k = 0; k < std::max( coors1.nonzeroes(), coors2.nonzeroes() ); ++k ) { - const auto& assigned_coors = coors1.assigned(k) ? coors1 : coors2; - const auto j = assigned_coors.index( k ); - const auto A_val = coors1.assigned(k) ? getValue(vbuf1, j, identity_A) : identity_A; - const auto B_val = coors2.assigned(k) ? getValue(vbuf2, j, identity_B) : identity_B; + for( size_t k = 0; k < coors1.nonzeroes(); ++k ) { + const auto j = coors1.index( k ); + const auto A_val = getValue(vbuf1, j, identity_A); + const auto B_val = coors2.assigned(j) ? getValue(vbuf2, j, identity_B) : identity_B; + std::cout << " * (" << i << ", " << j << ") = " << A_val << " " << B_val << "\n"; + + OutputType result_value; + (void)grb::apply( result_value, A_val, B_val, oper ); + + // update CRS + CRS_raw.row_index[ nzc ] = j; + CRS_raw.setValue( nzc, result_value ); + + // update CCS + if( !crs_only ) { + const size_t CCS_index = CCS_raw.col_start[ j+1 ] - ++C_col_index[ j ]; +#ifdef NDEBUG + assert( CCS_index < capacity( C ) ); + assert( CCS_index < CCS_raw.col_start[ j+1 ] ); + assert( CCS_index >= CCS_raw.col_start[ j ] ); +#endif + CCS_raw.row_index[ CCS_index ] = i; + CCS_raw.setValue( CCS_index, result_value ); + } + // update count + (void)++nzc; + } + for( size_t k = 0; k < coors2.nonzeroes(); ++k ) { + const auto j = coors2.index( k ); + if( coors1.assigned(j) ) { // Intersection case: already handled + continue; + } + const auto A_val = coors1.assigned(j) ? 
getValue(vbuf1, j, identity_A) : identity_A; + const auto B_val = getValue(vbuf2, j, identity_B); + std::cout << " # (" << i << ", " << j << ") = " << A_val << " " << B_val << "\n"; OutputType result_value; (void)grb::apply( result_value, A_val, B_val, oper ); From 585f485ce8de3e69f573baad1b52e581187f4c4e Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Thu, 2 Nov 2023 13:35:35 +0100 Subject: [PATCH 22/37] Merge remote-tracking branch 'origin/develop' into 636-unexpected-behaviour-of-ewiseapply-out-matrix-in-matrix-in-matrix-variants --- include/graphblas/banshee/pinnedvector.hpp | 137 +- include/graphblas/base/benchmark.hpp | 339 +++-- include/graphblas/base/exec.hpp | 301 ++-- include/graphblas/benchmark.hpp | 5 +- include/graphblas/bsp/collectives.hpp | 602 ++++---- include/graphblas/bsp/collectives_blas1.hpp | 509 ++++--- .../graphblas/bsp/collectives_blas1_raw.hpp | 3 +- .../graphblas/bsp/exec_broadcast_routines.hpp | 81 + include/graphblas/bsp1d/benchmark.hpp | 728 +++------ include/graphblas/bsp1d/exec.hpp | 1356 +++++++++++------ include/graphblas/bsp1d/matrix.hpp | 4 +- include/graphblas/exec.hpp | 5 +- include/graphblas/hyperdags/benchmark.hpp | 61 +- include/graphblas/hyperdags/exec.hpp | 54 +- include/graphblas/nonblocking/benchmark.hpp | 50 +- include/graphblas/nonblocking/exec.hpp | 63 +- include/graphblas/nonblocking/io.hpp | 7 +- include/graphblas/nonblocking/spmd.hpp | 1 + include/graphblas/reference/benchmark.hpp | 62 +- include/graphblas/reference/exec.hpp | 60 +- include/graphblas/reference/io.hpp | 4 +- include/graphblas/reference/pinnedvector.hpp | 4 +- include/graphblas/utils/TimerResults.hpp | 17 +- src/graphblas/CMakeLists.txt | 3 +- src/graphblas/bsp/exec_broadcast_routines.cpp | 76 + src/graphblas/bsp1d/CMakeLists.txt | 1 - src/graphblas/bsp1d/exec.cpp | 29 - src/graphblas/nonblocking/io.cpp | 3 +- src/graphblas/reference/io.cpp | 2 +- tests/smoke/label_test.cpp | 7 +- tests/smoke/simple_pagerank_from_mpi.cpp | 5 +- tests/unit/CMakeLists.txt | 26 + tests/unit/auto_launcher.cpp | 3 +- tests/unit/buildVector.cpp | 1 + tests/unit/eWiseApplyMatrixReference.cpp | 51 +- tests/unit/id_distributed.cpp | 307 ++++ tests/unit/launcherAndBenchmarker.cpp | 680 +++++++++ tests/unit/mxv.cpp | 2 +- tests/unit/pinnedVector.cpp | 31 +- tests/unit/sparse_mxv.cpp | 14 +- tests/unit/unittests.sh | 47 + tests/unit/vmxa.cpp | 2 +- tests/unit/vxm.cpp | 2 +- tests/unit/wait.cpp | 3 +- tests/utils/output_verification.hpp | 16 +- tests/utils/print_vec_mat.hpp | 504 +++++- 46 files changed, 4083 insertions(+), 2185 deletions(-) create mode 100644 include/graphblas/bsp/exec_broadcast_routines.hpp create mode 100644 src/graphblas/bsp/exec_broadcast_routines.cpp delete mode 100644 src/graphblas/bsp1d/exec.cpp create mode 100644 tests/unit/id_distributed.cpp create mode 100644 tests/unit/launcherAndBenchmarker.cpp diff --git a/include/graphblas/banshee/pinnedvector.hpp b/include/graphblas/banshee/pinnedvector.hpp index 236884fde..f22566565 100644 --- a/include/graphblas/banshee/pinnedvector.hpp +++ b/include/graphblas/banshee/pinnedvector.hpp @@ -24,7 +24,7 @@ * @author A. N. Yzelman */ -#if ! defined _H_GRB_BANSHEE_PINNEDVECTOR +#if !defined _H_GRB_BANSHEE_PINNEDVECTOR #define _H_GRB_BANSHEE_PINNEDVECTOR #include @@ -33,77 +33,86 @@ #include "coordinates.hpp" #include "vector.hpp" + namespace grb { /** No implementation notes. 
*/ template< typename IOType > class PinnedVector< IOType, banshee > { - private: - /** - * Tell the system to delete \a _buffered_values only when we had its last - * banshee. - */ - utils::AutoDeleter< IOType > _raw_deleter; - - /** - * Tell the system to delete \a _buffered_mask only when we had its last - * banshee. - */ - utils::AutoDeleter< char > _assigned_deleter; - - /** A buffer of the local vector. */ - IOType * _buffered_values; - - /** A buffer of the sparsity pattern of \a _buffered_values. */ - internal::Coordinates< banshee > _buffered_mask; - - public: - /** No implementation notes. */ - PinnedVector() : _buffered_values( NULL ) {} - - /** No implementation notes. */ - template< typename Coords > - PinnedVector( const Vector< IOType, banshee, Coords > & x, IOMode mode ) : - _raw_deleter( x._raw_deleter ), _assigned_deleter( x._assigned_deleter ), _buffered_values( x._raw ), _buffered_mask( x._coordinates ) { - (void)mode; // sequential and parallel IO mode are equivalent for this implementation. - } - - /** No implementation notes. */ - IOType & operator[]( const size_t i ) noexcept { - return _buffered_values[ i ]; - } - - /** No implementation notes. */ - const IOType & operator[]( const size_t i ) const noexcept { - return _buffered_values[ i ]; - } - - /** No implementation notes. */ - bool mask( const size_t i ) const noexcept { - return _buffered_mask.assigned( i ); - } - - /** No implementation notes. */ - size_t length() const noexcept { - return _buffered_mask.size(); - } - - /** No implementation notes. */ - size_t index( const size_t index ) const noexcept { - return index; - } - - /** - * Frees the underlying raw memory area iff the underlying vector was - * destroyed. Otherwise set the underlying vector to unpinned state. - */ - void free() { - _raw_deleter.clear(); - _assigned_deleter.clear(); - } + private: + + /** + * Tell the system to delete \a _buffered_values only when we had its last + * banshee. + */ + utils::AutoDeleter< IOType > _raw_deleter; + + /** + * Tell the system to delete \a _buffered_mask only when we had its last + * banshee. + */ + utils::AutoDeleter< char > _assigned_deleter; + + /** A buffer of the local vector. */ + IOType * _buffered_values; + + /** A buffer of the sparsity pattern of \a _buffered_values. */ + internal::Coordinates< banshee > _buffered_mask; + + + public: + + /** No implementation notes. */ + PinnedVector() : _buffered_values( NULL ) {} + + /** No implementation notes. */ + template< typename Coords > + PinnedVector( const Vector< IOType, banshee, Coords > & x, IOMode mode ) : + _raw_deleter( x._raw_deleter ), _assigned_deleter( x._assigned_deleter ), + _buffered_values( x._raw ), _buffered_mask( x._coordinates + ) { + (void) mode; // sequential and parallel IO mode are equivalent for this + // implementation. + } + + /** No implementation notes. */ + IOType & operator[]( const size_t i ) noexcept { + return _buffered_values[ i ]; + } + + /** No implementation notes. */ + const IOType & operator[]( const size_t i ) const noexcept { + return _buffered_values[ i ]; + } + + /** No implementation notes. */ + bool mask( const size_t i ) const noexcept { + return _buffered_mask.assigned( i ); + } + + /** No implementation notes. */ + size_t length() const noexcept { + return _buffered_mask.size(); + } + + /** No implementation notes. */ + size_t index( const size_t index ) const noexcept { + return index; + } + + /** + * Frees the underlying raw memory area iff the underlying vector was + * destroyed. 
Otherwise set the underlying vector to unpinned state. + */ + void free() { + _raw_deleter.clear(); + _assigned_deleter.clear(); + } + }; } // namespace grb #endif // end ``_H_GRB_BANSHEE_PINNEDVECTOR + diff --git a/include/graphblas/base/benchmark.hpp b/include/graphblas/base/benchmark.hpp index 56a2fade6..f4775f587 100644 --- a/include/graphblas/base/benchmark.hpp +++ b/include/graphblas/base/benchmark.hpp @@ -28,10 +28,19 @@ #ifndef _H_GRB_BENCH_BASE #define _H_GRB_BENCH_BASE -#include -#include +#include // for sqrt #include -#include +#include // warning: normally should not be used in ALP backends(!) + +#ifndef _GRB_NO_STDIO + #include + #include + #include +#endif + +#ifndef _GRB_NO_EXCEPTIONS + #include +#endif #include #include @@ -43,21 +52,11 @@ #include "config.hpp" #include "exec.hpp" -#ifndef _GRB_NO_STDIO - #include -#endif - -#ifndef _GRB_NO_EXCEPTIONS - #include -#endif - -#include - /** * \defgroup benchmarking Benchmarking * - * ALP has a specialised class for benchmarking ALP programs, grb::Benchmarker, + * ALP has a specialised class for benchmarking ALP programs, #grb::Benchmarker, * which is a variant on the #grb::Launcher. It codes a particular benchmarking * strategy of any given ALP program as described below. * @@ -123,7 +122,7 @@ namespace grb { grb::utils::TimerResults &total_times, grb::utils::TimerResults &min_times, grb::utils::TimerResults &max_times, - grb::utils::TimerResults * sdev_times + std::vector< grb::utils::TimerResults > &sdev_times ) { inner_times.normalize( total ); total_times.accum( inner_times ); @@ -140,7 +139,7 @@ namespace grb { grb::utils::TimerResults &total_times, grb::utils::TimerResults &min_times, grb::utils::TimerResults &max_times, - grb::utils::TimerResults * sdev_times, + std::vector< grb::utils::TimerResults > &sdev_times, const size_t pid ) { total_times.normalize( total ); @@ -192,82 +191,93 @@ namespace grb { /** * Benchmarks a given ALP program. * - * This variant applies to input data as a byte blob and output data as a - * user-defined POD struct. + * This variant applies to typed ALP programs. * - * @tparam U Output type of the given user program. - * @tparam backend Which backend the program is using. + * @see #grb::Launcher for more details on type requirements. + * + * @tparam RunnerType The type of the runner, i.e., functor object storing + * the information for running the supplied ALP function. * - * @param[in] alp_program The use rogram to be benchmarked - * @param[in] data_in Input data as a raw data blob - * @param[in] in_size The size, in bytes, of the input data - * @param[out] out_data Output data - * @param[in] inner The number of inner repetitions of the benchmark - * @param[in] outer The number of outer repetitions of the benchmark - * @param[in] pid Unique ID of the calling user process + * @param[in] runner Functor object running the function. + * @param[in] times Data structure with timing information. + * @param[in] inner Number of inner iterations. + * @param[out] outer Number of outer iterations. + * @param[in] pid process Identifier of current user process. 
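+ *
+ * For illustration, \a runner may be any nullary callable; the typed and
+ * untyped wrappers below, for instance, pass a lambda along the lines of
+ * \code
+ * auto runner = [ &data_in, &data_out ] { alp_program( data_in, data_out ); };
+ * \endcode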
* * @see benchmarking * * @ingroup benchmarking */ template< - typename U, - enum Backend implementation = config::default_backend + enum Backend implementation, + typename RunnerType > static RC benchmark( - void ( *alp_program )( const void *, const size_t, U & ), - const void * data_in, - const size_t in_size, - U &data_out, - const size_t inner, - const size_t outer, + RunnerType &runner, + grb::utils::TimerResults ×, + const size_t inner, const size_t outer, const size_t pid ) { const double inf = std::numeric_limits< double >::infinity(); grb::utils::TimerResults total_times, min_times, max_times; - grb::utils::TimerResults * sdev_times = - new grb::utils::TimerResults[ outer ]; + std::vector< grb::utils::TimerResults > sdev_times( outer ); total_times.set( 0 ); min_times.set( inf ); max_times.set( 0 ); + grb::RC ret = grb::SUCCESS; // outer loop - for( size_t out = 0; out < outer; ++out ) { + for( size_t out = 0; out < outer && ret == grb::SUCCESS; ++out ) { grb::utils::TimerResults inner_times; inner_times.set( 0 ); // inner loop - for( size_t in = 0; in < inner; in++ ) { - data_out.times.set( 0 ); - ( *alp_program )( data_in, in_size, data_out ); - grb::collectives< implementation >::reduce( - data_out.times.io, 0, grb::operators::max< double >() ); - grb::collectives< implementation >::reduce( - data_out.times.preamble, 0, grb::operators::max< double >() ); - grb::collectives< implementation >::reduce( - data_out.times.useful, 0, grb::operators::max< double >() ); - grb::collectives< implementation >::reduce( - data_out.times.postamble, 0, grb::operators::max< double >() ); - inner_times.accum( data_out.times ); + for( size_t in = 0; in < inner && ret == grb::SUCCESS; ++in ) { + times.set( 0 ); + + runner(); + + ret = ret ? ret : grb::collectives< implementation >::reduce( + times.io, 0, grb::operators::max< double >() ); + ret = ret ? ret : grb::collectives< implementation >::reduce( + times.preamble, 0, grb::operators::max< double >() ); + ret = ret ? ret : grb::collectives< implementation >::reduce( + times.useful, 0, grb::operators::max< double >() ); + ret = ret ? 
ret : grb::collectives< implementation >::reduce( + times.postamble, 0, grb::operators::max< double >() ); + + if( ret == grb::SUCCESS ) { + inner_times.accum( times ); + } } - // calculate performance stats - benchmark_calc_inner( out, inner, inner_times, total_times, min_times, - max_times, sdev_times ); + if( ret == grb::SUCCESS ) { + // calculate performance stats + benchmark_calc_inner( out, inner, inner_times, total_times, min_times, + max_times, sdev_times ); + } #ifndef _GRB_NO_STDIO // give experiment output line if( pid == 0 ) { - std::cout << "Outer iteration #" << out << " timings (io, preamble, " - << "useful, postamble, time since epoch): "; - std::cout << inner_times.io << ", " << inner_times.preamble << ", " - << inner_times.useful << ", " << inner_times.postamble << ", "; - printTimeSinceEpoch( false ); + if( ret == grb::SUCCESS ) { + std::ios_base::fmtflags prev_cout_state( std::cout.flags() ); + std::cout << "Outer iteration #" << out << " timings " + << "(io, preamble, useful, postamble, time since epoch): " + << std::fixed + << inner_times.io << ", " << inner_times.preamble << ", " + << inner_times.useful << ", " << inner_times.postamble << ", "; + printTimeSinceEpoch( false ); + std::cout.flags( prev_cout_state ); + } else { + std::cerr << "Error during cross-process collection of timing results: " + << "\t" << grb::toString( ret ) << std::endl; + } } #endif // pause for next outer loop - if( sleep( 1 ) != 0 ) { + if( sleep( 1 ) != 0 && ret == grb::SUCCESS ) { #ifndef _GRB_NO_STDIO std::cerr << "Sleep interrupted, assume benchmark is unreliable; " << "exiting.\n"; @@ -276,30 +286,72 @@ namespace grb { } } - // calculate performance stats - benchmark_calc_outer( outer, total_times, min_times, max_times, sdev_times, - pid ); - delete [] sdev_times; + if( ret == grb::SUCCESS ) { + // calculate performance stats + benchmark_calc_outer( outer, total_times, min_times, max_times, + sdev_times, pid ); + } + + return ret; + } - return SUCCESS; + /** + * Benchmarks a given ALP program. + * + * This variant applies to untyped ALP programs. + * + * @see #grb::Launcher for more details on type requirements. + * + * @tparam U Output type of the given user program. + * @tparam backend Which backend the program is using. + * + * @param[in] alp_program The user program to be benchmarked. + * @param[in] data_in Input data as a raw data blob. + * @param[in] in_size The size, in bytes, of the input data. + * @param[out] out_data Output data as a plain-old-data struct \a U. + * @param[in] inner Number of inner repetitions of the benchmark. + * @param[in] outer Number of outer repetitions of the benchmark. + * @param[in] pid Unique ID of the calling user process. + * + * @see benchmarking + * + * @ingroup benchmarking + */ + template< + typename U, + enum Backend implementation + > + static RC benchmark( + AlpUntypedFunc< U > alp_program, + const void * data_in, const size_t in_size, + U &data_out, + const size_t inner, const size_t outer, + const size_t pid + ) { + auto runner = [ alp_program, data_in, in_size, &data_out ] { + alp_program( data_in, in_size, data_out ); + }; + return benchmark< implementation >( runner, data_out.times, inner, outer, + pid ); } /** * Benchmarks a given ALP program. * - * This variant applies to input data as a user-defined POD struct and - * output data as a user-defined POD struct. + * This variant applies to typed ALP programs. + * + * @see #grb::Launcher for more details on type requirements. * * @tparam T Input type of the given user program. 
* @tparam U Output type of the given user program. * - * @param[in] alp_program The use rogram to be benchmarked - * @param[in] data_in Input data as a raw data blob - * @param[in] in_size The size, in bytes, of the input data - * @param[out] out_data Output data - * @param[in] inner The number of inner repetitions of the benchmark - * @param[in] outer The number of outer repetitions of the benchmark - * @param[in] pid Unique ID of the calling user process + * @param[in] alp_program The user program to be benchmarked. + * @param[in] data_in Input data as a raw data blob. + * @param[in] in_size The size, in bytes, of the input data. + * @param[out] out_data Output data. + * @param[in] inner Number of inner repetitions of the benchmark. + * @param[in] outer Number of outer repetitions of the benchmark. + * @param[in] pid Unique ID of the calling user process. * * @see benchmarking * @@ -307,77 +359,19 @@ namespace grb { */ template< typename T, typename U, - enum Backend implementation = config::default_backend + enum Backend implementation > static RC benchmark( - void ( *alp_program )( const T &, U & ), - const T &data_in, - U &data_out, - const size_t inner, - const size_t outer, + AlpTypedFunc< T, U > alp_program, + const T &data_in, U &data_out, + const size_t inner, const size_t outer, const size_t pid ) { - const double inf = std::numeric_limits< double >::infinity(); - grb::utils::TimerResults total_times, min_times, max_times; - grb::utils::TimerResults * sdev_times = - new grb::utils::TimerResults[ outer ]; - total_times.set( 0 ); - min_times.set( inf ); - max_times.set( 0 ); - - // outer loop - for( size_t out = 0; out < outer; ++out ) { - grb::utils::TimerResults inner_times; - inner_times.set( 0 ); - - // inner loop - for( size_t in = 0; in < inner; ++in ) { - data_out.times.set( 0 ); - - ( *alp_program )( data_in, data_out ); - grb::collectives< implementation >::reduce( data_out.times.io, 0, - grb::operators::max< double >() ); - grb::collectives< implementation >::reduce( data_out.times.preamble, 0, - grb::operators::max< double >() ); - grb::collectives< implementation >::reduce( data_out.times.useful, 0, - grb::operators::max< double >() ); - grb::collectives< implementation >::reduce( data_out.times.postamble, 0, - grb::operators::max< double >() ); - inner_times.accum( data_out.times ); - } - - // calculate performance stats - benchmark_calc_inner( out, inner, inner_times, total_times, min_times, - max_times, sdev_times ); - -#ifndef _GRB_NO_STDIO - // give experiment output line - if( pid == 0 ) { - std::cout << "Outer iteration #" << out << " timings " - << "(io, preamble, useful, postamble, time since epoch): " << std::fixed - << inner_times.io << ", " << inner_times.preamble << ", " - << inner_times.useful << ", " << inner_times.postamble << ", "; - printTimeSinceEpoch( false ); - std::cout << std::scientific; - } -#endif - - // pause for next outer loop - if( sleep( 1 ) != 0 ) { -#ifndef _GRB_NO_STDIO - std::cerr << "Sleep interrupted, assume benchmark is unreliable; " - << "exiting.\n"; -#endif - abort(); - } - } - - // calculate performance stats - benchmark_calc_outer( outer, total_times, min_times, max_times, sdev_times, + auto runner = [ alp_program, &data_in, &data_out ] { + alp_program( data_in, data_out ); + }; + return benchmark< implementation >( runner, data_out.times, inner, outer, pid ); - delete[] sdev_times; - - return SUCCESS; } @@ -436,11 +430,14 @@ namespace grb { */ Benchmarker( const size_t process_id = 0, - size_t nprocs = 1, - std::string 
hostname = "localhost", - std::string port = "0" + const size_t nprocs = 1, + const std::string hostname = "localhost", + const std::string port = "0" ) { - (void)process_id; (void)nprocs; (void)hostname; (void)port; + (void) process_id; + (void) nprocs; + (void) hostname; + (void) port; #ifndef _GRB_NO_EXCEPTIONS throw std::logic_error( "Benchmarker class called with unsupported mode or " "implementation" ); @@ -450,17 +447,16 @@ namespace grb { /** * Benchmarks a given ALP program. * - * This variant applies to input data as a user-defined POD struct and - * output data as a user-defined POD struct. + * This variant applies to typed ALP programs. * * @tparam T Input type of the given user program. * @tparam U Output type of the given user program. * - * @param[in] alp_program The ALP program to be benchmarked - * @param[in] data_in Input data as a raw data blob - * @param[out] data_out Output data - * @param[in] inner The number of inner repetitions of the benchmark - * @param[in] outer The number of outer repetitions of the benchmark + * @param[in] alp_program The ALP program to be benchmarked. + * @param[in] data_in Input data. + * @param[out] data_out Output data. + * @param[in] inner Number of inner repetitions of the benchmark. + * @param[in] outer Number of outer repetitions of the benchmark. * @param[in] broadcast An optional argument that dictates whether the * \a data_in argument should be broadcast across all * user processes participating in the benchmark, @@ -469,6 +465,8 @@ namespace grb { * The default value of \a broadcast is false. * * @returns #grb::SUCCESS The benchmarking has completed successfully. + * @returns #grb::ILLEGAL If \a broadcast was false but \a T is not + * default-constructible. * @returns #grb::FAILED An error during benchmarking has occurred. The * benchmark attempt could be retried, and an error * for the failure is reported to the standard error @@ -477,6 +475,8 @@ namespace grb { * starting the benchmark, while benchmarking, or * while aggregating the final results. * + * @see #grb::Launcher for more details. + * * @see benchmarking * * \internal This is the base implementation that should be specialised by @@ -485,10 +485,8 @@ namespace grb { template< typename T, typename U > RC exec( void ( *alp_program )( const T &, U & ), - const T &data_in, - U &data_out, - const size_t inner, - const size_t outer, + const T &data_in, U &data_out, + const size_t inner, const size_t outer, const bool broadcast = false ) const { (void) alp_program; @@ -502,23 +500,26 @@ namespace grb { // furthermore, it should be impossible to call this function without // triggering an exception during construction of this stub class, so we // just return PANIC here +#ifndef _GRB_NO_STDIO + std::cerr << "Error: base Benchmarker::exec called. An implementation-" + << "specific variant should have been called instead.\n"; +#endif return PANIC; } /** * Benchmarks a given ALP program. * - * This variant applies to input data as a byte blob and output data as a - * user-defined POD struct. + * This variant applies to untyped ALP programs. * * @tparam U Output type of the given user program. 
* - * @param[in] alp_program The use rogram to be benchmarked - * @param[in] data_in Input data as a raw data blob - * @param[in] in_size The size, in bytes, of the input data - * @param[out] data_out Output data - * @param[in] inner The number of inner repetitions of the benchmark - * @param[in] outer The number of outer repetitions of the benchmark + * @param[in] alp_program The user program to be benchmarked. + * @param[in] data_in Input data as a raw data blob. + * @param[in] in_size The size, in bytes, of the input data. + * @param[out] data_out Output data. + * @param[in] inner Number of inner repetitions of the benchmark. + * @param[in] outer Number of outer repetitions of the benchmark. * @param[in] broadcast An optional argument that dictates whether the * \a data_in argument should be broadcast across all * user processes participating in the benchmark, @@ -537,6 +538,8 @@ namespace grb { * starting the benchmark, while benchmarking, or * while aggregating the final results. * + * @see #grb::Launcher for more details. + * * @see benchmarking * * \internal This is the base implementation that should be specialised by @@ -562,6 +565,10 @@ namespace grb { // furthermore, it should be impossible to call this function without // triggering an exception during construction of this stub class, so we // just return PANIC here +#ifndef _GRB_NO_STDIO + std::cerr << "Error: base Benchmarker::exec called. An implementation-" + << "specific variant should have been called instead.\n"; +#endif return PANIC; } @@ -570,13 +577,7 @@ namespace grb { * * Calling this function is equivalent to calling #grb::Launcher::finalize. * - * After a call to this function, no further ALP programs may be benchmarked - * nor launched-- i.e., both the #grb::Launcher and #grb::Benchmarker - * functionalities many no longer be used. - * - * A well-behaving program calls this function, or #grb::Launcher::finalize, - * exactly once and just before exiting (or just before the guaranteed last - * invocation of an ALP program). + * @see #grb::Launcher for further details. * * @return #grb::SUCCESS The resources have successfully and permanently been * released. diff --git a/include/graphblas/base/exec.hpp b/include/graphblas/base/exec.hpp index fefb10132..18d7b9d99 100644 --- a/include/graphblas/base/exec.hpp +++ b/include/graphblas/base/exec.hpp @@ -40,38 +40,75 @@ namespace grb { + /** + * Type definition for an ALP function with input type information. + */ + template< typename InputType, typename OutputType > + using AlpTypedFunc = void ( * )( const InputType &, OutputType & ); + + /** + * Type definition for an ALP function without input type information. + */ + template< typename OutputType > + using AlpUntypedFunc = void ( * )( const void *, size_t, OutputType & ); + /** * The various ways in which the #grb::Launcher can be used to execute an * ALP program. * - * \warning An implementation may require different linker commands - * when using different modes. + * \warning An implementation or backend may require different linker commands + * when using different modes, and may require different arguments be + * passed on program launch. Please see the compiler and runner + * wrappers grbcxx, alpcxx, grbrun, and/or + * alprun for more details; or refer to the implementation + * documentation. * - * \warning Depending on the mode given to #grb::Launcher, the parameters - * required for the exec function may differ. 
+ * \warning Depending on the mode given to #grb::Launcher, different parameters + * to the exec function may be required. * - * \note However, the ALP program is unaware of which mode is the launcher - * employs and will not have to change. + * An ALP program remains unaware of which mode the launcher employs. Normally, + * it requires no change depending on how it is launched. An exception is when + * data is passed through and from the caller program: + * -# if the launch mode is #AUTOMATIC, best practice is to minimise the input + * data footprint that requires broadcasting to all user processes + * executing the algorithm; in the base case, no input data requires + * broadcasting. Output is retained only from the first user process, i.e., + * the user process for which #grb::spmd<>::pid() returns zero. + * -# for any other launch mode, multiple user processes may exist before any + * ALP or ALP/GraphBLAS context exists. Each pre-existing process in such + * external context is then mapped to an ALP user process in a one-to-one + * manner. Data, including pointer data, may be passed freely between these + * two mapped processes; this may, in principle and contrary to the + * automatic mode, consider large data. Output is retained at each user + * process and thus is freely available to the mapped external process. In + * best practice, different user processes return different parts of the + * overall output, thereby achieving parallel I/O. */ enum EXEC_MODE { /** - * Automatic mode. The #grb::Launcher can spawn user processes - * which will execute a given program. + * Automatic mode. + * + * The #grb::Launcher may spawn additional user processes which will jointly + * execute a given ALP program. */ AUTOMATIC = 0, /** - * Manual mode. The user controls \a nprocs user processes - * which together should execute a given program, by, for - * example, using the #grb::Launcher. + * Manual mode. + * + * The user controls \a nprocs external processes which jointly should form an + * ALP context and execute one or more given ALP programs. */ MANUAL, /** - * When running from an MPI program. The user controls - * \a nprocs MPI programs, which, together, should execute - * a given ALP program. + * From MPI mode. + * + * The user controls \a nprocs external MPI processes which jointly should + * form an ALP context and execute one or more given ALP programs. The only + * difference with the manual mode is that this mode guarantees that the + * pre-existing external processes are MPI processes. */ FROM_MPI @@ -81,17 +118,24 @@ namespace grb { * A group of user processes that together execute ALP programs. * * Allows an application to run any ALP program. Input data may be passed - * through a user-defined type. Output data will be retrieved via the same - * type. + * through a user-defined type. Output data will be retrieved via another user- + * defined type. * - * For backends that support multiple user processes, the caller may - * explicitly set the process ID and total number of user processes. + * For backends that support multiple user processes, the caller may explicitly + * set the process ID and total number of user processes. If the launcher is + * requested to spawn new user processes, i.e., if it is constructed using the + * #AUTOMATIC mode, then the backend spawns an implementation-defined number of + * additional user processes beyond that corresponding to the process + * constructing the launcher instance, that then jointly execute ALP programs + * in parallel. 
* - * The intended use is to `just call' the exec function, which should be - * accepted by any backend. + * The intended use is to `just call' the exec function, which must be accepted + * by any backend in any implementation, to execute any ALP program. * * @tparam mode Which #EXEC_MODE the Launcher should adhere to. - * @tparam backend Which backend is to be used. + * @tparam backend Which backend to use. This is a hidden template argument that + * defaults to the backend selected at compile time through + * grbcxx or alpcxx. */ template< enum EXEC_MODE mode, enum Backend backend > class Launcher { @@ -99,12 +143,21 @@ namespace grb { public : /** - * Constructs a new #grb::Launcher. This constructor is a collective call; - * all \a nprocs processes that form a single launcher group must make a - * simultaneous call to this constructor. + * Constructs a new #grb::Launcher. + * + * In #AUTOMATIC mode, a single root user processes issues a call to this + * constructor. In all other modes, a call to this constructor is + * \em collective: all \a nprocs processes that are to form a single launcher + * group, must make a simultaneous call to this constructor and must do so + * with consistent arguments. + * + * \note One may note that in all modes, a call to this constructor must be + * collective; it is just that in automatic mode there is but one + * process involved with the collective call (i.e., \a nprocs is one). * * There is an implementation-defined time-out for the creation of a launcher - * group. + * group. The default arguments to the below are consistent with the + * automatic launcher mode. * * @param[in] process_id The user process ID of the calling process. * The value must be larger or equal to 0. This @@ -124,18 +177,32 @@ namespace grb { * if and only if \a nprocs is larger than one. * Optional: the default value is `0'. * - * @throws invalid_argument If \a nprocs is zero. - * @throws invalid_argument If \a process_id is greater than or equal to - * \a nprocs. + * While these arguments are generic and would work with most network + * fabrics, some modes such as indeed #FROM_MPI may require other arguments + * for constructing a launcher. In terms of specification, only #AUTOMATIC + * and #MANUAL are required to implement this specific constructor + * signature, including the specified defaults for each argument. All + * aforementioned default values must be legal for the #AUTOMATIC and + * #MANUAL modes. + * + * Any other mode in #grb::EXEC_MODE, with possibly different constructor + * signatures from those listed here, are both optional and implementation- + * specific. * * \note An implementation or backend may define further constraints on the * input arguments, such as, obviously, on \a hostname and \a port, but * also on \a nprocs and, as a result, on \a process_id. - * \note The most obvious is that backends supporting only one user process - * must not accept \a nprocs larger than 1. + * \note The most obvious such restriction has backends supporting only one + * user process not accepting \a nprocs larger than 1. + * + * @throws invalid_argument If \a nprocs is zero. + * @throws invalid_argument If \a process_id is greater than or equal to + * \a nprocs. * - * All aforementioned default values shall always be legal. + * @throws std::invalid_argument If \a nprocs is zero. + * @throws std::invalid_argument If \a process_id is larger than or equal to + * \a nprocs. 
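+ *
+ * For illustration only (the hostname and port values are arbitrary), four
+ * pre-existing processes may form one launcher group in #MANUAL mode via
+ * \code
+ * grb::Launcher< grb::MANUAL > launcher( s, 4, "localhost", "7777" );
+ * \endcode
+ * with \a s the unique ID (0, 1, 2, or 3) of the calling process, whereas a
+ * single process in #AUTOMATIC mode may simply construct
+ * \code
+ * grb::Launcher< grb::AUTOMATIC > launcher;
+ * \endcode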
 */
 Launcher(
 const size_t process_id = 0,
@@ -145,7 +212,8 @@
 ) {
 // spec does not specify any constrants on hostname and port
 // so accept (and ignore) anything
- (void) hostname; (void) port;
+ (void) hostname;
+ (void) port;

 #ifndef _GRB_NO_EXCEPTIONS
 // sanity checks on process_id and nprocs
@@ -164,38 +232,77 @@
 * Executes a given ALP program using the user processes encapsulated by this
 * launcher group.
 *
- * Calling this function, depending on whether the automatic or manual/MPI
- * mode was selected, will either \em spawn the maximum number of available
- * user processes and \em then execute the given program, \em or it will
- * employ the given processes that are managed by the user application and
- * used to construct this launcher instance to execute the given
- * \a alp_program.
+ * Calling this function, depending on whether the automatic, manual, or from
+ * MPI mode was selected, will either:
+ * -# use processes spawned by the ALP implementation, as well as the process
+ *    which had constructed this launcher instance, to jointly execute the
+ *    given \a alp_program, \em or
+ * -# employ the given processes that are managed by the user application
+ *    and used to construct this launcher instance to execute the given
+ *    \a alp_program.
 *
 * This is a collective function call-- all processes in the launcher group
 * must make a simultaneous call to this function and must do so using
 * consistent arguments.
 *
- * @tparam T The type of the data to pass to the ALP program as input.
- * @tparam U The type of the output data to pass back to the caller.
+ * @tparam T The type of the data to pass to the ALP program as input. This
+ *           must be a POD type that contains no pointers.
+ *
+ * \note In fact, \a T may be standard layout and contain no pointers, or it
+ *       may be trivially copyable and contain no pointers.
+ *
+ * For calls with \a broadcast false, \a T must furthermore be
+ * default-constructible (and have meaningful default values that allow for
+ * successful multi-process execution).
+ *
+ * For programs or entry points that are solely to be called from manual or
+ * from MPI modes with \a broadcast false, there are no constraints
+ * on the type \a T since instances of \a T are only ever passed within the
+ * pre-existing user process, and never communicated across user processes.
+ *
+ * @tparam U The type of the output data to pass back to the caller. This may
+ *           be of any type.
+ *
+ * When \a mode is #AUTOMATIC, the type \a U must be default-constructible.
 *
 * @param[in] alp_program The user program to be executed.
 * @param[in] data_in Input data of user-defined type \a T.
+ * @param[out] data_out Output data of user-defined type \a U.
+ * @param[in] broadcast Whether the input should be broadcast from user
+ *                      process 0 to all other user processes. Optional;
+ *                      the default value is false.
+ *
+ * When in automatic mode and \a broadcast is false, the input data
+ * \a data_in will only be available at user process with ID 0-- any other
+ * user processes will receive a default-constructed \a data_in instead.
+ * When in automatic mode and \a broadcast is true, the input data
+ * \a data_in will be available at all user processes instead.
 *
- * When in automatic mode and \a broadcast is false, the data will
- * only be available at user process with ID 0. When in automatic mode and
- * \a broadcast is true, the data will be available at all user
- * processes. When in manual mode, the data will be available to this user
- * process only, with "this process" corresponding to the process that calls
- * this function.
+ * When in #MANUAL or #FROM_MPI mode, each user process should collectively
+ * call this function. If \a broadcast is false, the input data
+ * will be passed from the external calling process to the corresponding ALP
+ * user processes in a one-to-one manner. Should \a broadcast be
+ * true, then the initial input data passed this way is overwritten
+ * for user processes \f$ s > 0 \f$ with the \a data_in passed at user
+ * process zero.
 *
- * @param[out] data_out Output data of user-defined type \a U. The output
- *                      data should be available at user process with ID
- *                      zero.
- * @param[in] broadcast Whether the input should be broadcast from user
- *                      process 0 to all other user processes. Optional;
- *                      the default value is \a false.
+ * Only in #MANUAL or #FROM_MPI modes will the output of user processes
+ * with ID \f$ s > 0 \f$ be returned to all the processes that collectively
+ * call this function.
+ *
+ * In #AUTOMATIC mode, the output at \f$ s > 0 \f$ is lost. Only the output
+ * of the first user process \f$ s = 0 \f$ will be passed back to the root
+ * process that called this function.
+ *
+ * \note The default for \a broadcast is false as it is the variant
+ *       that implies the least cost when launching a program.
+ *
+ * \note The #FROM_MPI mode is specific to this implementation and need not
+ *       be provided as part of the specification.
 *
 * @return #grb::SUCCESS If the execution proceeded as intended.
+ * @return #grb::ILLEGAL If \a broadcast was false and \a mode was
+ *                       #AUTOMATIC, but \a T is not default-constructible.
 * @return #grb::PANIC If an unrecoverable error was encountered while
 *                     attempting to execute, attempting to terminate, or
 *                     while executing, the given program.
@@ -204,11 +311,12 @@
 * achieve its intended result-- for example, an iterative solver
 * may fail to converge. A good programming pattern has that \a U
 * either a) is an error code for the algorithm used (e.g.,
- * #grb::RC), or b) that \a U contains such an error code.
+ * int or #grb::RC), or b) that \a U is a struct that
+ * contains such an error code.
 */
 template< typename T, typename U >
 RC exec(
- void ( *alp_program )( const T &, U & ),
+ AlpTypedFunc< T, U > alp_program,
 const T &data_in,
 U &data_out,
 const bool broadcast = false
@@ -227,30 +335,43 @@
 * launcher group.
 *
 * This variant of exec has that \a data_in is of a variable byte size,
- * instead of a fixed POD type. If \a broadcast is true and the
- * launcher is instantiated using the #grb::AUTOMATIC mode, all bytes are
- * broadcast to all user processes.
+ * instead of a fixed (POD pointer-less) type. We refer to the given function
+ * as an untyped ALP function (since the input is a raw pointer), whereas the
+ * other variant executes \em typed ALP functions instead.
 *
- * @param[in] alp_program The user program to be executed.
+ * If \a broadcast is true, all bytes are broadcast from the user
+ * process with ID zero to all other user processes.
+ *
+ * \note When in #MANUAL or #FROM_MPI mode, this implies any arguments passed
+ *       in a process-to-process manner will be lost.
+ *
+ * If \a broadcast is false and the launcher is in #AUTOMATIC mode,
+ * then the user processes with ID \f$ s > 0 \f$ will receive \a data_in
+ * equal to nullptr and \a in_size equal to zero.
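+ *
+ * As a purely illustrative sketch, an untyped ALP function and its launch
+ * could read as follows (the program and payload are hypothetical):
+ * \code
+ * void my_untyped_program( const void *in, const size_t in_size, int &out ) {
+ *     // interpret the raw input bytes as needed; this sketch only records the size
+ *     (void) in;
+ *     out = static_cast< int >( in_size );
+ * }
+ *
+ * grb::Launcher< grb::AUTOMATIC > launcher;
+ * int result = 0;
+ * const char payload[] = "raw input";
+ * const grb::RC rc = launcher.exec( &my_untyped_program, payload,
+ *     sizeof( payload ), result, true );
+ * // on success, result equals sizeof( payload ) at the calling process
+ * \endcode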
+ * + * See the \em typed ALP exec variant for more detailed comments, which also + * transfer to this untyped variant. + * + * @param[in] alp_program The (untyped) user program to be executed. * @param[in] data_in Pointer to raw input byte data. * @param[in] in_size The number of bytes the input data consists of. - * @param[out] data_out Output data of user-defined type \a U. The output - * data should be available at user process with ID - * zero. - * @param[in] broadcast Whether the input should be broadcast from user - * process 0 to all other user processes. Optional; - * the default value is \a false. + * @param[out] data_out Output data of user-defined type \a U. The output + * data should be available at user process with ID + * zero. + * @param[in] broadcast Whether the input should be broadcast from user + * process 0 to all other user processes. Optional; + * the default value is \a false. * * @return #grb::SUCCESS If the execution proceeded as intended. + * @return #grb::ILLEGAL If \a in_size is larger than zero but \a data_in is + * equal to nullptr. * @return #grb::PANIC If an unrecoverable error was encountered while * attempting to execute, attempting to terminate, or * while executing, the given program. - * - * For more details, see the other version of this function. */ template< typename U > RC exec( - void ( *alp_program )( const void *, const size_t, U & ), + AlpUntypedFunc< U > alp_program, const void * data_in, const size_t in_size, U &data_out, @@ -268,25 +389,24 @@ namespace grb { * Releases all ALP resources. * * After a call to this function, no further ALP programs may launched using - * the #grb::Launcher and #grb::Benchmarker. Also the use of #grb::init and - * #grb::finalize will no longer be accepted. + * \em any #grb::Launcher or #grb::Benchmarker instance. Implementations and + * backends shall under no circumstance require a call to this function; any + * use of this function shall remain purely optional. * - * \warning #grb::init and #grb::finalize are deprecated. + * \warning After a call to this function, also any subsequent call to the + * deprecated #grb::init and #grb::finalize will no longer be + * accepted. * * \internal - * \todo Remove the above comments once #grb::init and #grb::finalize are + * \todo Remove the above warning once #grb::init and #grb::finalize are * moved to an internal namespace. * \endinternal * * After a call to this function, the only way to once again run ALP programs - * is to use the #grb::Launcher from a new process. + * is to use the #grb::Launcher from a different process. * * \warning Therefore, use this function with care and preferably only just - * before exiting the process. - - * A well-behaving program calls this function, or - * #grb::Benchmarker::finalize, exactly once before its process terminates, - * or just after the guaranteed last invocation of an ALP program. + * before exiting the process-- or not at all. * * @return #grb::SUCCESS The resources have successfully and permanently been * released. @@ -296,25 +416,14 @@ namespace grb { * undefined and should no longer be used. * * \note In the terminology of the Message Passing Interface (MPI), this - * function is the ALP equivalent of the MPI_Finalize(). - * - * \note In #grb::AUTOMATIC mode when using a parallel backend that uses MPI - * to auto-parallelise the ALP computations, MPI is never explicitly - * exposed to the user application. This use case necessitates the - * specification of this function. 
- * - * \note Thus, and in particular, an ALP program launched in #grb::AUTOMATIC - * mode while using the #grb::BSP1D or the #grb::hybrid backends with - * ALP compiled using LPF that in turn is configured to use an - * MPI-based engine, should make sure to call this function before - * program exit. - * - * \note An application that launches ALP programs in #grb::FROM_MPI mode - * must still call this function, even though a proper such application - * makes its own call to MPI_Finalize(). This does \em not - * induce improper behaviour since calling this function using a - * launcher instance in #grb::FROM_MPI mode translates, from an MPI - * perspective, to a no-op. + * function is similar to MPI_Finalize(). + * + * \warning Different from MPI, however, a call to this function at program + * exit is not mandatory. + * + * \warning An application that launches ALP programs in #grb::FROM_MPI mode + * that calls this function, must (afterwards) still make a call to + * MPI_Finalize(). * * \internal This is the base implementation that should be specialised by * each backend separately. diff --git a/include/graphblas/benchmark.hpp b/include/graphblas/benchmark.hpp index ccace7979..81bd67773 100644 --- a/include/graphblas/benchmark.hpp +++ b/include/graphblas/benchmark.hpp @@ -45,7 +45,10 @@ #ifdef _GRB_BACKEND namespace grb { - template< enum EXEC_MODE mode, enum Backend implementation = config::default_backend > + template< + enum EXEC_MODE mode, + enum Backend implementation = config::default_backend + > class Benchmarker; } #endif diff --git a/include/graphblas/bsp/collectives.hpp b/include/graphblas/bsp/collectives.hpp index 098f7f738..e7291734d 100644 --- a/include/graphblas/bsp/collectives.hpp +++ b/include/graphblas/bsp/collectives.hpp @@ -66,331 +66,388 @@ "************************************************************************" \ "**********************\n" ); + namespace grb { /** - * Collective communications using the GraphBLAS operators for - * reduce-style operations. This is the BSP1D implementation. + * Collective communications using ALP operators for reduce-style operations. + * + * This is the BSP1D implementation. * * TODO internal issue #198 */ template<> class collectives< BSP1D > { - private: - /** Disallow instantiation of this class. */ - collectives() {} - - public: - /** - * Schedules an allreduce operation of a single object of type IOType per - * process. The allreduce shall be complete by the end of the call. This is a - * collective graphBLAS operation. - * - * \parblock - * \par Performance semantics: - * -# Problem size N: \f$ P * \mathit{sizeof}(\mathit{IOType}) \f$ - * -# local work: \f$ N*Operator \f$ ; - * -# transferred bytes: \f$ N \f$ ; - * -# BSP cost: \f$ Ng + N*Operator + l \f$; - * \endparblock - * - * This function may place an alloc of \f$ P\mathit{sizeof}(IOType) \f$ bytes - * if the internal buffer was not sufficiently large. - */ - template< - Descriptor descr = descriptors::no_operation, - typename Operator, typename IOType - > - static RC allreduce( IOType &inout, const Operator &op = Operator() ) { - // this is the serial algorithm only - // TODO internal issue #19 + private: + + /** Disallow instantiation of this class. */ + collectives() {} + + + public: + + /** + * Schedules an allreduce operation of a single object of type IOType per + * process. The allreduce shall be complete by the end of the call. This is a + * collective graphBLAS operation. 
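+			 *
+			 * A purely illustrative sketch, assuming a double value and the standard
+			 * plus operator:
+			 * \code
+			 * double local = 1.5; // each user process contributes its own value
+			 * grb::operators::add< double > plus;
+			 * const grb::RC rc = grb::collectives<>::allreduce( local, plus );
+			 * // on success, local now holds the sum over all user processes
+			 * \endcode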
+ * + * \parblock + * \par Performance semantics: + * -# Problem size N: \f$ P * \mathit{sizeof}(\mathit{IOType}) \f$ + * -# local work: \f$ N*Operator \f$ ; + * -# transferred bytes: \f$ N \f$ ; + * -# BSP cost: \f$ Ng + N*Operator + l \f$; + * \endparblock + * + * This function may place an alloc of \f$ P\mathit{sizeof}(IOType) \f$ bytes + * if the internal buffer was not sufficiently large. + */ + template< + Descriptor descr = descriptors::no_operation, + typename Operator, typename IOType + > + static RC allreduce( IOType &inout, const Operator &op = Operator() ) { + // this is the serial algorithm only + // TODO internal issue #19 #ifdef _DEBUG - std::cout << "Entered grb::collectives< BSP1D >::allreduce with inout = " - << inout << " and op = " << &op << std::endl; + std::cout << "Entered grb::collectives< BSP1D >::allreduce with inout = " + << inout << " and op = " << &op << std::endl; #endif - // static sanity check - NO_CAST_ASSERT_BLAS0( ( !( descr & descriptors::no_casting ) || - std::is_same< IOType, typename Operator::D1 >::value || - std::is_same< IOType, typename Operator::D2 >::value || - std::is_same< IOType, typename Operator::D3 >::value - ), - "grb::collectives::allreduce", - "Incompatible given value type and operator domains while " - "no_casting descriptor was set" - ); - - // we need access to LPF context - internal::BSP1D_Data &data = internal::grb_BSP1D.load(); - - // catch trivial case early - if( data.P == 1 ) { - return SUCCESS; - } + // static sanity check + NO_CAST_ASSERT_BLAS0( ( !( descr & descriptors::no_casting ) || + std::is_same< IOType, typename Operator::D1 >::value || + std::is_same< IOType, typename Operator::D2 >::value || + std::is_same< IOType, typename Operator::D3 >::value + ), + "grb::collectives::allreduce", + "Incompatible given value type and operator domains while " + "no_casting descriptor was set" + ); + + // we need access to LPF context + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); + + // catch trivial case early + if( data.P == 1 ) { + return SUCCESS; + } - // we need to register inout - lpf_memslot_t inout_slot = LPF_INVALID_MEMSLOT; - if( data.ensureMemslotAvailable() != grb::SUCCESS ) { + // we need to register inout + lpf_memslot_t inout_slot = LPF_INVALID_MEMSLOT; + if( data.ensureMemslotAvailable() != grb::SUCCESS ) { #ifndef NDEBUG - const bool could_not_ensure_enough_memory_slots_available = false; - assert( could_not_ensure_enough_memory_slots_available ); + const bool could_not_ensure_enough_memory_slots_available = false; + assert( could_not_ensure_enough_memory_slots_available ); #endif - return PANIC; - } - if( lpf_register_local( data.context, - &inout, - sizeof( IOType ), - &inout_slot - ) != LPF_SUCCESS - ) { + return PANIC; + } + if( lpf_register_local( data.context, + &inout, + sizeof( IOType ), + &inout_slot + ) != LPF_SUCCESS + ) { #ifndef NDEBUG - const bool lpf_register_returned_error = false; - assert( lpf_register_returned_error ); + const bool lpf_register_returned_error = false; + assert( lpf_register_returned_error ); #endif - return PANIC; - } else { - data.signalMemslotTaken(); - } + return PANIC; + } else { + data.signalMemslotTaken(); + } - // allgather inout values - // note: buffer size check is done by the below function - if( internal::allgather( - inout_slot, 0, - data.slot, data.s * sizeof( IOType ), - sizeof( IOType ), - data.P * sizeof( IOType ), - true - ) != grb::SUCCESS ) { + // allgather inout values + // note: buffer size check is done by the below function + if( 
internal::allgather( + inout_slot, 0, + data.slot, data.s * sizeof( IOType ), + sizeof( IOType ), + data.P * sizeof( IOType ), + true + ) != grb::SUCCESS ) { #ifndef NDEBUG - const bool allgather_returned_error = false; - assert( allgather_returned_error ); + const bool allgather_returned_error = false; + assert( allgather_returned_error ); #endif - return PANIC; + return PANIC; } - // deregister - if( lpf_deregister( data.context, inout_slot ) != LPF_SUCCESS ) { + // deregister + if( lpf_deregister( data.context, inout_slot ) != LPF_SUCCESS ) { #ifndef NDEBUG - const bool lpf_deregister_returned_error = false; - assert( lpf_deregister_returned_error ); + const bool lpf_deregister_returned_error = false; + assert( lpf_deregister_returned_error ); #endif - return PANIC; - } else { - data.signalMemslotReleased(); - } - - // fold everything - IOType * __restrict__ const buffer = data.getBuffer< IOType >(); - for( size_t i = 0; i < data.P; ++i ) { - if( i == data.s ) { - continue; + return PANIC; + } else { + data.signalMemslotReleased(); } + + // fold everything + IOType * __restrict__ const buffer = data.getBuffer< IOType >(); + for( size_t i = 0; i < data.P; ++i ) { + if( i == data.s ) { + continue; + } #ifdef _DEBUG - std::cout << data.s - << ": in Collectives< BSP1D >::allreduce. Buffer " - "index " - << i << ", folding " << buffer[ i ] << " into " << inout << ", yields "; + std::cout << data.s << ": in Collectives< BSP1D >::allreduce. Buffer " + << "index " << i << ", folding " << buffer[ i ] << " into " << inout + << ", yields "; #endif - // if casting is required to apply op, foldl will take care of this - if( foldl< descr >( inout, buffer[ i ], op ) != SUCCESS ) { - assert( false ); - } + // if casting is required to apply op, foldl will take care of this + if( foldl< descr >( inout, buffer[ i ], op ) != SUCCESS ) { + assert( false ); + } #ifdef _DEBUG - std::cout << inout << std::endl; + std::cout << inout << std::endl; #endif - } + } - // done - return SUCCESS; - } - - /** - * Schedules a reduce operation of a single object of type IOType per process. - * The reduce shall be complete by the end of the call. This is a collective - * graphBLAS operation. The BSP costs are as for the PlatformBSP #reduce. - * - * \parblock - * \par Performance semantics: - * -# Problem size N: \f$ P * \mathit{sizeof}(\mathit{IOType}) \f$ - * -# local work: \f$ N*Operator \f$ ; - * -# transferred bytes: \f$ N \f$ ; - * -# BSP cost: \f$ Ng + N*Operator + l \f$; - * \endparblock - * - */ - template< Descriptor descr = descriptors::no_operation, typename Operator, typename IOType > - static RC reduce( IOType & inout, const lpf_pid_t root = 0, const Operator op = Operator() ) { - // this is the serial algorithm only - // TODO internal issue #19 - - // static sanity check - NO_CAST_ASSERT_BLAS0( ( ! 
( descr & descriptors::no_casting ) || std::is_same< IOType, typename Operator::D1 >::value || std::is_same< IOType, typename Operator::D2 >::value || - std::is_same< IOType, typename Operator::D3 >::value ), - "grb::collectives::reduce", - "Incompatible given value type and operator domains while " - "no_casting descriptor was set" ); - - // we need access to LPF context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); - - // catch trivial case early - if( data.P == 1 ) { + // done return SUCCESS; } - // make sure we can support comms pattern: IOType -> P * IOType - lpf_coll_t coll; - if( commsPreamble( data, &coll, data.P, data.P * sizeof( IOType ), 0, 1 ) != SUCCESS ) { - return PANIC; - } + /** + * Schedules a reduce operation of a single object of type IOType per process. + * The reduce shall be complete by the end of the call. This is a collective + * graphBLAS operation. The BSP costs are as for the PlatformBSP #reduce. + * + * \parblock + * \par Performance semantics: + * -# Problem size N: \f$ P * \mathit{sizeof}(\mathit{IOType}) \f$ + * -# local work: \f$ N*Operator \f$ ; + * -# transferred bytes: \f$ N \f$ ; + * -# BSP cost: \f$ Ng + N*Operator + l \f$; + * \endparblock + */ + template< + Descriptor descr = descriptors::no_operation, + typename Operator, typename IOType + > + static RC reduce( + IOType &inout, const lpf_pid_t root = 0, + const Operator op = Operator() + ) { + // this is the serial algorithm only + // TODO internal issue #19 + + // static sanity check + NO_CAST_ASSERT_BLAS0( ( !(descr & descriptors::no_casting) || + std::is_same< IOType, typename Operator::D1 >::value || + std::is_same< IOType, typename Operator::D2 >::value || + std::is_same< IOType, typename Operator::D3 >::value + ), "grb::collectives::reduce", + "Incompatible given value type and operator domains while " + "no_casting descriptor was set" + ); + + // we need access to LPF context + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); + + // catch trivial case early + if( data.P == 1 ) { + return SUCCESS; + } - // create a local register slot - lpf_memslot_t inout_slot = LPF_INVALID_MEMSLOT; - if( lpf_register_global( data.context, &inout, sizeof( IOType ), &inout_slot ) != LPF_SUCCESS ) { - return PANIC; - } + // make sure we can support comms pattern: IOType -> P * IOType + lpf_coll_t coll; + if( commsPreamble( + data, &coll, data.P, data.P * sizeof( IOType ), 0, 1 + ) != SUCCESS + ) { + return PANIC; + } - if( lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { - return PANIC; - } + // create a local register slot + lpf_memslot_t inout_slot = LPF_INVALID_MEMSLOT; + if( lpf_register_global( + data.context, &inout, sizeof( IOType ), &inout_slot + ) != LPF_SUCCESS + ) { + return PANIC; + } - // gather together values - if( lpf_gather( coll, inout_slot, data.slot, sizeof( IOType ), root ) != LPF_SUCCESS ) { - return PANIC; - } + if( lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { + return PANIC; + } - // finish the communication - if( lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { - return PANIC; - } + // gather together values + if( lpf_gather( + coll, inout_slot, data.slot, sizeof( IOType ), root + ) != LPF_SUCCESS + ) { + return PANIC; + } - // do deregister - if( lpf_deregister( data.context, inout_slot ) != LPF_SUCCESS ) { - return PANIC; - } + // finish the communication + if( lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { + return PANIC; + } - // fold everything: root only - if( data.s == root ) { - IOType * __restrict__ 
const buffer = data.getBuffer< IOType >(); - for( size_t i = 0; i < data.P; ++i ) { - if( i == root ) { - continue; - } - // if casting is required to apply op, foldl will take care of this - // note: the no_casting check could be deferred to foldl but this would result in unclear error messages - if( foldl< descr >( inout, buffer[ i ], op ) != SUCCESS ) { - return PANIC; + // do deregister + if( lpf_deregister( data.context, inout_slot ) != LPF_SUCCESS ) { + return PANIC; + } + + // fold everything: root only + if( data.s == root ) { + IOType * __restrict__ const buffer = data.getBuffer< IOType >(); + for( size_t i = 0; i < data.P; ++i ) { + if( i == root ) { + continue; + } + // if casting is required to apply op, foldl will take care of this + // note: the no_casting check could be deferred to foldl but this would + // result in unclear error messages + if( foldl< descr >( inout, buffer[ i ], op ) != SUCCESS ) { + return PANIC; + } } } - } - if( commsPostamble( data, &coll, data.P, data.P * sizeof( IOType ), 0, 1 ) != SUCCESS ) { - return PANIC; - } + if( commsPostamble( + data, &coll, data.P, data.P * sizeof( IOType ), 0, 1 + ) != SUCCESS + ) { + return PANIC; + } - // done - return SUCCESS; - } - - /** - * Schedules a broadcast operation of a single object of type IOType per process. - * The broadcast shall be complete by the end of the call. This is a collective - * graphBLAS operation. The BSP costs are as for the PlatformBSP #broadcast. - * - * @tparam IOType The type of the to-be broadcast value. - * - * @param[in,out] inout On input: the value at the root process to be broadcast. - * On output at process \a root: the same value. - * On output at non-root processes: the value at root. - * - * \parblock - * \par Performance semantics: common - * Whether system calls will happen depends on the LPF engine compiled with, - * as does whether buffer space is proportional to the payload size is - * required. In principle, when using a fabric like Inifiband and when using - * the LPF ibverbs engine, the intended IB zero-copy behaviour is attained. - * - * All below variants in any backend shall not result in dynamic memory - * allocations. 
- * \endparblock - * - * \parblock - * \par Performance semantics: serial - * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$ - * -# local work: \f$ 0 \f$ ; - * -# transferred bytes: \f$ NP \f$ ; - * -# BSP cost: \f$ NPg + l \f$; - * \endparblock - * - * \parblock - * \par Performance semantics: two phase - * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$ - * -# local work: \f$ 0 \f$ ; - * -# transferred bytes: \f$ 2N \f$ ; - * -# BSP cost: \f$ 2(Ng + l) \f$; - * \endparblock - * - * \parblock - * \par Performance semantics: two level tree - * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$ - * -# local work: \f$ 0 \f$ ; - * -# transferred bytes: \f$ 2\sqrt{P}N \f$ ; - * -# BSP cost: \f$ 2(\sqrt{P}Ng + l) \f$; - * \endparblock - */ - template< typename IOType > - static RC broadcast( IOType & inout, const lpf_pid_t root = 0 ) { - // we need access to LPF context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); - - // make sure we can support comms pattern: IOType -> IOType - lpf_coll_t coll; - if( commsPreamble( data, &coll, data.P, 0, 0, 1 ) != SUCCESS ) { - return PANIC; + // done + return SUCCESS; } - // register inout - lpf_memslot_t slot = LPF_INVALID_MEMSLOT; - if( data.ensureMemslotAvailable() != SUCCESS ) { - return PANIC; - } - if( lpf_register_global( data.context, &inout, sizeof( IOType ), &slot ) != LPF_SUCCESS ) { - return PANIC; - } + /** + * Schedules a broadcast operation of a single object of type IOType per process. + * The broadcast shall be complete by the end of the call. This is a collective + * graphBLAS operation. The BSP costs are as for the PlatformBSP #broadcast. + * + * @tparam IOType The type of the to-be broadcast value. + * + * @param[in,out] inout On input: the value at the root process to be broadcast. + * On output at process \a root: the same value. + * On output at non-root processes: the value at root. + * + * \parblock + * \par Performance semantics: common + * Whether system calls will happen depends on the LPF engine compiled with, + * as does whether buffer space is proportional to the payload size is + * required. In principle, when using a fabric like Inifiband and when using + * the LPF ibverbs engine, the intended IB zero-copy behaviour is attained. + * + * All below variants in any backend shall not result in dynamic memory + * allocations. 
+ * \endparblock + * + * \parblock + * \par Performance semantics: serial + * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$ + * -# local work: \f$ 0 \f$ ; + * -# transferred bytes: \f$ NP \f$ ; + * -# BSP cost: \f$ NPg + l \f$; + * \endparblock + * + * \parblock + * \par Performance semantics: two phase + * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$ + * -# local work: \f$ 0 \f$ ; + * -# transferred bytes: \f$ 2N \f$ ; + * -# BSP cost: \f$ 2(Ng + l) \f$; + * \endparblock + * + * \parblock + * \par Performance semantics: two level tree + * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$ + * -# local work: \f$ 0 \f$ ; + * -# transferred bytes: \f$ 2\sqrt{P}N \f$ ; + * -# BSP cost: \f$ 2(\sqrt{P}Ng + l) \f$; + * \endparblock + */ + template< typename IOType > + static RC broadcast( IOType &inout, const lpf_pid_t root = 0 ) { + // we need access to LPF context + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); + + // make sure we can support comms pattern: IOType -> IOType + lpf_coll_t coll; + if( commsPreamble( data, &coll, data.P, 0, 0, 1 ) != SUCCESS ) { + return PANIC; + } - if( lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { - return PANIC; - } + // register inout + lpf_memslot_t slot = LPF_INVALID_MEMSLOT; + if( data.ensureMemslotAvailable() != SUCCESS ) { + return PANIC; + } + if( lpf_register_global( + data.context, &inout, sizeof( IOType ), &slot + ) != LPF_SUCCESS + ) { + return PANIC; + } - // broadcast value - if( lpf_broadcast( coll, slot, slot, sizeof( IOType ), root ) != LPF_SUCCESS ) { - return PANIC; - } + if( lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { + return PANIC; + } - // finish communication - if( lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { - return PANIC; - } + // broadcast value + if( lpf_broadcast( + coll, slot, slot, sizeof( IOType ), root + ) != LPF_SUCCESS + ) { + return PANIC; + } - // coda - if( lpf_deregister( data.context, slot ) != LPF_SUCCESS ) { - return PANIC; - } + // finish communication + if( lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { + return PANIC; + } - if( commsPostamble( data, &coll, data.P, 0, 0, 1 ) != SUCCESS ) { - return PANIC; + // coda + if( lpf_deregister( data.context, slot ) != LPF_SUCCESS ) { + return PANIC; + } + + if( commsPostamble( data, &coll, data.P, 0, 0, 1 ) != SUCCESS ) { + return PANIC; + } + + // done + return SUCCESS; } - // done - return SUCCESS; - } + /** + * Schedules a broadcast of a raw array of a given type. + * + * @tparam IOType The array element type. + * + * @param[in,out] inout A pointer to the array to broadcast (for the root + * user process), or a pointer where to store the array + * to be broadcast (for all other user processes). + * @param[in] size The size, in number of array elements, of the array + * to be broadcast. Must match across all user processes + * in the collective call. + * @param[in] root Which user process ID is the root. + * + * \parblock + * \par Performance semantics + * + * Please refer to the LPF collectives higher-level library for the + * performance semantics of this call. (This function does not implements + * its own custom logic for this primitive.) + * \endparblock + * + * @returns grb::SUCCESS On successful broadcast of the requested array. + * @returns grb::PANIC If the communication layer has failed. 
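+			 *
+			 * A purely illustrative sketch, using placeholder array contents:
+			 * \code
+			 * double values[ 4 ] = { 1, 2, 3, 4 }; // meaningful at the root only
+			 * const grb::RC rc = grb::collectives<>::broadcast( values, 4 );
+			 * // on success, every user process holds the root's four values
+			 * \endcode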
+ */ + template< Descriptor descr = descriptors::no_operation, typename IOType > + static RC broadcast( + IOType * inout, const size_t size, const size_t root = 0 + ) { + return internal::broadcast< descr >( inout, size, root ); + } - /** TODO documentation */ - template< Descriptor descr = descriptors::no_operation, typename IOType > - static RC broadcast( IOType * inout, const size_t size, const size_t root = 0 ) { - return internal::broadcast< descr >( inout, size, root ); - } }; } // namespace grb @@ -398,3 +455,4 @@ namespace grb { #undef NO_CAST_ASSERT_BLAS0 #endif // end ``_H_GRB_BSP_COLL'' + diff --git a/include/graphblas/bsp/collectives_blas1.hpp b/include/graphblas/bsp/collectives_blas1.hpp index eace13468..7b1633cef 100644 --- a/include/graphblas/bsp/collectives_blas1.hpp +++ b/include/graphblas/bsp/collectives_blas1.hpp @@ -37,13 +37,18 @@ #include "internal-collectives.hpp" -/** The difference between pid and root, modulus P - circumvents weird modulus behaviour under -ve numbers */ -#define DIFF( pid, root, P ) ( ( pid < root ) ? pid + P - root : pid - root ) % P +/** + * The difference between pid and root, modulus P - circumvents weird modulus + * behaviour under -ve numbers + */ +#define DIFF( pid, root, P ) ( (pid < root) ? pid + P - root : pid - root ) % P + namespace grb { /** - * Collective communications using the GraphBLAS operators for reduce-style operations. + * Collective communications using the GraphBLAS operators for reduce-style + * operations. */ namespace internal { @@ -57,8 +62,9 @@ namespace grb { * Default is grb::descriptors::no_operation. * @tparam IOType The type of the to-be gathered value. * - * @param[in] in: The value at the calling process to be gathered. - * @param[out] out: The vector of gathered values, available at the root process. + * @param[in] in The value at the calling process to be gathered. + * @param[out] out The vector of gathered values, available at the root + * process. * * @returns grb::SUCCESS When the operation succeeds as planned. * @returns grb::PANIC When the communication layer unexpectedly fails. 
When @@ -74,22 +80,25 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename IOType #ifndef BLAS1_RAW , typename Coords #endif - > - RC gather( const IOType & in, + > + RC gather( + const IOType &in, #ifdef BLAS1_RAW IOType * out, #else - Vector< IOType, reference, Coords > & out, + Vector< IOType, reference, Coords > &out, #endif - const lpf_pid_t root ) { + const lpf_pid_t root + ) { // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); // run-time sanity check #ifndef BLAS1_RAW @@ -121,9 +130,15 @@ namespace grb { lpf_memslot_t slot = LPF_INVALID_MEMSLOT; RC ret = SUCCESS; #ifndef BLAS1_RAW - if( lpf_register_global( data.context, internal::getRaw( out ), data.P * sizeof( IOType ), &slot ) != LPF_SUCCESS ) { + if( lpf_register_global( + data.context, internal::getRaw( out ), data.P * sizeof( IOType ), &slot + ) != LPF_SUCCESS + ) { #else - if( lpf_register_global( data.context, out, data.P * sizeof( IOType ), &slot ) != LPF_SUCCESS ) { + if( lpf_register_global( + data.context, out, data.P * sizeof( IOType ), &slot + ) != LPF_SUCCESS + ) { #endif // failure at this point will have to be cleaned up as best as possible ret = PANIC; @@ -134,26 +149,42 @@ namespace grb { } // gather values - if( ret == SUCCESS && lpf_gather( coll, slot, slot, sizeof( IOType ), root ) != LPF_SUCCESS ) { + if( ret == SUCCESS && + lpf_gather( coll, slot, slot, sizeof( IOType ), root ) + != LPF_SUCCESS + ) { // failure at this point will have to be cleaned up as best as possible ret = PANIC; } // perform communication - if( ret == SUCCESS && lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { + if( ret == SUCCESS && + lpf_sync( data.context, LPF_SYNC_DEFAULT ) + != LPF_SUCCESS + ) { // failure at this point will have to be cleaned up as best as possible ret = PANIC; } #ifndef BLAS1_RAW // make sure sparsity info is correct - for( size_t i = 0; data.s == root && ret == SUCCESS && internal::getCoordinates( out ).size() != internal::getCoordinates( out ).nonzeroes() && i < data.P; ++i ) { - (void)internal::getCoordinates( out ).assign( i ); + for( + size_t i = 0; + data.s == root && + ret == SUCCESS && + internal::getCoordinates( out ).size() != internal::getCoordinates( out ).nonzeroes() + && i < data.P; + ++i + ) { + (void) internal::getCoordinates( out ).assign( i ); } #endif // deregister slot - if( slot != LPF_INVALID_MEMSLOT && lpf_deregister( data.context, slot ) != LPF_SUCCESS ) { + if( slot != LPF_INVALID_MEMSLOT && + lpf_deregister( data.context, slot ) + != LPF_SUCCESS + ) { // error during cleanup of memslot ret = PANIC; } @@ -168,8 +199,8 @@ namespace grb { } /** - * Schedules a gather operation of a vector of \a N/P elements of type IOType per process - * to a vector of \f$ N \f$ elements. + * Schedules a gather operation of a vector of \a N/P elements of type IOType + * per process to a vector of \f$ N \f$ elements. * The gather shall be complete by the end of the call. This is a collective * graphBLAS operation. The BSP costs are as for the LPF #gather. 
* @@ -194,25 +225,27 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename IOType #ifndef BLAS1_RAW , typename Coords #endif - > + > RC gather( #ifdef BLAS1_RAW const IOType * in, const size_t size, IOType * out, #else - const Vector< IOType, reference, Coords > & in, - Vector< IOType, reference, Coords > & out, + const Vector< IOType, reference, Coords > &in, + Vector< IOType, reference, Coords > &out, #endif - const lpf_pid_t root ) { + const lpf_pid_t root + ) { // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); // make sure we can support comms pattern: Vector IOType -> P * Vector IOType #ifndef BLAS1_RAW @@ -229,9 +262,15 @@ namespace grb { lpf_memslot_t slot = LPF_INVALID_MEMSLOT; RC ret = SUCCESS; #ifndef BLAS1_RAW - if( lpf_register_global( data.context, internal::getRaw( out ), size * data.P * sizeof( IOType ), &slot ) != LPF_SUCCESS ) { + if( lpf_register_global( data.context, internal::getRaw( out ), + size * data.P * sizeof( IOType ), &slot ) + != LPF_SUCCESS + ) { #else - if( lpf_register_global( data.context, out, size * data.P * sizeof( IOType ), &slot ) != LPF_SUCCESS ) { + if( lpf_register_global( data.context, out, size * data.P * sizeof( IOType ), + &slot ) + != LPF_SUCCESS + ) { #endif // failure at this point will have to be cleaned up as best as possible ret = PANIC; @@ -240,38 +279,68 @@ namespace grb { // copy input to buffer const size_t pos = ( data.s == root ) ? data.s : 0; #ifdef BLAS1_RAW - for( size_t i = 0; ret == SUCCESS && ( out + pos * size ) != in && i < size; i++ ) { + for( + size_t i = 0; + ret == SUCCESS && ( out + pos * size ) != in && i < size; + i++ + ) { out[ pos * size + i ] = in[ i ]; } #else - for( size_t i = 0; ret == SUCCESS && ( internal::getRaw( out ) + pos * size ) != internal::getRaw( in ) && i < size; i++ ) { + for( + size_t i = 0; + ret == SUCCESS && + (internal::getRaw( out ) + pos * size) != internal::getRaw( in ) && + i < size; + i++ + ) { internal::getRaw( out )[ pos * size + i ] = internal::getRaw( in )[ i ]; } #endif // activate registrations - if( ret == SUCCESS && lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { + if( ret == SUCCESS && + lpf_sync( data.context, LPF_SYNC_DEFAULT ) + != LPF_SUCCESS + ) { ret = PANIC; } // gather values - if( ret == SUCCESS && lpf_gather( coll, slot, slot, size * sizeof( IOType ), root ) != LPF_SUCCESS ) { + if( ret == SUCCESS && + lpf_gather( coll, slot, slot, size * sizeof( IOType ), root ) + != LPF_SUCCESS + ) { ret = PANIC; } // complete requested communication - if( ret == SUCCESS && lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { + if( ret == SUCCESS && + lpf_sync( data.context, LPF_SYNC_DEFAULT ) + != LPF_SUCCESS + ) { ret = PANIC; } #ifndef BLAS1_RAW // set sparsity of output - for( size_t i = 0; data.s == root && ret == SUCCESS && internal::getCoordinates( out ).size() != internal::getCoordinates( out ).nonzeroes() && i < data.P * size; ++i ) { - (void)internal::getCoordinates( out ).assign( i ); + for( + size_t i = 0; + data.s == root && + ret == SUCCESS && + internal::getCoordinates( out ).size() != + internal::getCoordinates( out ).nonzeroes() && + i < data.P * size; + ++i + ) { + (void) internal::getCoordinates( out ).assign( i ); } #endif // destroy memory slot - if( slot != LPF_INVALID_MEMSLOT && lpf_deregister( data.context, slot ) != LPF_SUCCESS ) 
{ + if( slot != LPF_INVALID_MEMSLOT && + lpf_deregister( data.context, slot ) != + LPF_SUCCESS + ) { ret = PANIC; } @@ -294,8 +363,10 @@ namespace grb { * Default is grb::descriptors::no_operation. * @tparam IOType The type of the to-be scattered value. * - * @param[in] in: The vector of \a P elements at the root process to be scattered. - * @param[out] out: The scattered value of the root process \f$ vector[i] \f$ at process \a i. + * @param[in] in The vector of \a P elements at the root process to be + * scattered. + * @param[out] out The scattered value of the root process \f$ vector[i] \f$ + * at process \a i. * * @returns grb::SUCCESS When the operation succeeds as planned. * @returns grb::PANIC When the communication layer unexpectedly fails. When @@ -311,23 +382,25 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename IOType #ifndef BLAS1_RAW , typename Coords #endif - > + > RC scatter( #ifdef BLAS1_RAW const IOType * in, #else - const Vector< IOType, reference, Coords > & in, + const Vector< IOType, reference, Coords > &in, #endif - IOType & out, - const lpf_pid_t root ) { + IOType &out, + const lpf_pid_t root + ) { // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); // make sure we can support comms pattern: P * IOType -> IOType #ifndef BLAS1_RAW @@ -343,13 +416,27 @@ namespace grb { lpf_memslot_t src, dest; src = dest = LPF_INVALID_MEMSLOT; RC ret = SUCCESS; - if( lpf_register_global( data.context, &out, sizeof( IOType ), &dest ) != LPF_SUCCESS ) { + if( lpf_register_global( data.context, &out, sizeof( IOType ), &dest ) + != LPF_SUCCESS + ) { ret = PANIC; } #ifndef BLAS1_RAW - if( ret == SUCCESS && lpf_register_global( data.context, const_cast< IOType * >( internal::getRaw( in ) ), data.P * sizeof( IOType ), &src ) != LPF_SUCCESS ) { + if( ret == SUCCESS && lpf_register_global( + data.context, + const_cast< IOType * >( internal::getRaw( in ) ), + data.P * sizeof( IOType ), + &src + ) != LPF_SUCCESS + ) { #else - if( ret == SUCCESS && lpf_register_global( data.context, const_cast< IOType * >( in ), data.P * sizeof( IOType ), &src ) != LPF_SUCCESS ) { + if( ret == SUCCESS && lpf_register_global( + data.context, + const_cast< IOType * >( in ), + data.P * sizeof( IOType ), + &src + ) != LPF_SUCCESS + ) { #endif // failure at this point will have to be cleaned up as best as possible ret = PANIC; @@ -357,7 +444,9 @@ namespace grb { // root copies output #ifndef BLAS1_RAW - if( ret == SUCCESS && data.s == root && &out != internal::getRaw( in ) + data.s ) { + if( ret == SUCCESS && data.s == root && + &out != internal::getRaw( in ) + data.s + ) { #else if( ret == SUCCESS && data.s == root && &out != in + data.s ) { #endif @@ -365,25 +454,35 @@ namespace grb { } // activate global regs - if( ret == SUCCESS && lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { + if( ret == SUCCESS && + lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS + ) { ret = PANIC; } // scatter values - if( ret == SUCCESS && lpf_scatter( coll, src, dest, sizeof( IOType ), root ) != LPF_SUCCESS ) { + if( ret == SUCCESS && + lpf_scatter( coll, src, dest, sizeof( IOType ), root ) != LPF_SUCCESS + ) { ret = PANIC; } // wait for completion of requested collective - if( ret == SUCCESS && lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { + if( ret == SUCCESS && + lpf_sync( data.context, 
LPF_SYNC_DEFAULT ) != LPF_SUCCESS + ) { ret = PANIC; } // destroy memory slots - if( src != LPF_INVALID_MEMSLOT && lpf_deregister( data.context, src ) != LPF_SUCCESS ) { + if( src != LPF_INVALID_MEMSLOT && + lpf_deregister( data.context, src ) != LPF_SUCCESS + ) { ret = PANIC; } - if( dest != LPF_INVALID_MEMSLOT && lpf_deregister( data.context, dest ) != LPF_SUCCESS ) { + if( dest != LPF_INVALID_MEMSLOT && + lpf_deregister( data.context, dest ) != LPF_SUCCESS + ) { ret = PANIC; } @@ -398,17 +497,20 @@ namespace grb { /** * Schedules a scatter operation of a vector of \a N elements of type IOType - * to a vector of \f$ N/P elements \f$ per process. It is assumed that \a N is a multiple of \a P. - * The gather shall be complete by the end of the call. This is a collective - * graphBLAS operation. The BSP costs are as for the LPF #gather. + * to a vector of \f$ N/P elements \f$ per process. It is assumed that \a N is + * a multiple of \a P. The gather shall be complete by the end of the call. + * This is a collective graphBLAS operation. The BSP costs are as for the LPF + * #gather. * * @tparam descr The GraphBLAS descriptor. * Default is grb::descriptors::no_operation. * @tparam IOType The type of the to-be scattered value. * - * @param[in] in: The vector of N elements at the root process to be scattered. - * @param[out] out: The scattered vector of the root process, such that process \a i - * has \f$ N/P \f$ elements located at offset \f$ (N/P)*i \f$. + * @param[in] in The vector of N elements at the root process to be + * scattered. + * @param[out] out The scattered vector of the root process, such that process + * \a i has \f$ N/P \f$ elements located at offset + * \f$ (N/P)*i \f$. * * @returns grb::SUCCESS When the operation succeeds as planned. * @returns grb::PANIC When the communication layer unexpectedly fails. When @@ -424,25 +526,27 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename IOType #ifndef BLAS1_RAW , typename Coords #endif - > + > RC scatter( #ifdef BLAS1_RAW const IOType * in, const size_t size, IOType * out, #else - const Vector< IOType, reference, Coords > & in, - Vector< IOType, reference, Coords > & out, + const Vector< IOType, reference, Coords > &in, + Vector< IOType, reference, Coords > &out, #endif - const lpf_pid_t root ) { + const lpf_pid_t root + ) { // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); const size_t procs = data.P; #ifndef BLAS1_RAW @@ -538,10 +642,12 @@ namespace grb { } /** - * Schedules an allgather operation of a single object of type IOType per process - * to a vector of P elements. - * The allgather shall be complete by the end of the call. This is a collective - * graphBLAS operation. The BSP costs are as for the LPF #allgather. + * Schedules an allgather operation of a single object of type IOType per + * process to a vector of P elements. + * + * The allgather shall be complete by the end of the call. This is a + * collective graphBLAS operation. The BSP costs are as for the LPF + * #allgather. * * @tparam descr The GraphBLAS descriptor. * Default is grb::descriptors::no_operation. 
@@ -564,22 +670,24 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename IOType #ifndef BLAS1_RAW , typename Coords #endif - > - RC allgather( IOType & in, + > + RC allgather( + IOType &in, #ifdef BLAS1_RAW IOType * out #else - Vector< IOType, reference, Coords > & out + Vector< IOType, reference, Coords > &out #endif ) { // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); // make sure we can support comms pattern: IOType -> P * IOType #ifndef BLAS1_RAW @@ -656,10 +764,12 @@ namespace grb { } /** - * Schedules an allgather operation of a vector of \a N/P elements of type IOType per process - * to a vector of \f$ N \f$ elements. - * The allgather shall be complete by the end of the call. This is a collective - * graphBLAS operation. The BSP costs are as for the LPF #allgather. + * Schedules an allgather operation of a vector of \a N/P elements of type + * IOType per process to a vector of \f$ N \f$ elements. + * + * The allgather shall be complete by the end of the call. This is a + * collective graphBLAS operation. The BSP costs are as for the LPF + * #allgather. * * @tparam descr The GraphBLAS descriptor. * Default is grb::descriptors::no_operation. @@ -682,25 +792,26 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename IOType #ifndef BLAS1_RAW , typename Coords #endif - > + > RC allgather( #ifdef BLAS1_RAW const IOType * in, const size_t size, IOType * out #else - const Vector< IOType, reference, Coords > & in, - Vector< IOType, reference, Coords > & out + const Vector< IOType, reference, Coords > &in, + Vector< IOType, reference, Coords > &out #endif ) { // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); // make sure we can support comms pattern: IOType -> P * IOType #ifndef BLAS1_RAW @@ -777,8 +888,9 @@ namespace grb { } /** - * Schedules an alltoall operation of a vector of P elements of type IOType per process - * to a vector of \a P elements. + * Schedules an alltoall operation of a vector of P elements of type IOType + * per process to a vector of \a P elements. + * * The alltoall shall be complete by the end of the call. This is a collective * graphBLAS operation. The BSP costs are as for the LPF #alltoall. * @@ -786,9 +898,10 @@ namespace grb { * Default is grb::descriptors::no_operation. * @tparam IOType The type of the vector elements. * - * @param[in] in: The vector of \a P elements at each process. - * @param[out] out: The resulting vector of \a P elements, such that process \f$ i \f$ will - * receive (in order) the element at \f$ vector[i] \f$ from each process. + * @param[in] in The vector of \a P elements at each process. + * @param[out] out The resulting vector of \a P elements, such that process + * \f$ i \f$ will receive (in order) the element at + * \f$ vector[i] \f$ from each process. * * @returns grb::SUCCESS When the operation succeeds as planned. * @returns grb::PANIC When the communication layer unexpectedly fails. 
When @@ -804,24 +917,25 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename IOType #ifndef BLAS1_RAW , typename Coords #endif - > + > RC alltoall( #ifdef BLAS1_RAW IOType * in, IOType * out #else - const Vector< IOType, reference, Coords > & in, - Vector< IOType, reference, Coords > & out + const Vector< IOType, reference, Coords > &in, + Vector< IOType, reference, Coords > &out #endif ) { // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); #ifndef BLAS1_RAW TEST_VEC_SIZE( in, data.P ) TEST_VEC_SIZE( out, data.P ) @@ -899,19 +1013,22 @@ namespace grb { } /** - * Schedules an allcombine operation of a vector of \a N/P elements of type IOType per process - * to a vector of \a N/P elements. - * The allcombine shall be complete by the end of the call. This is a collective - * graphBLAS operation. The BSP costs are as for the LPF #allcombine. + * Schedules an allcombine operation of a vector of \a N/P elements of type + * IOType per process to a vector of \a N/P elements. + * + * The allcombine shall be complete by the end of the call. This is a + * collective graphBLAS operation. The BSP costs are as for the LPF + * #allcombine. * * @tparam descr The GraphBLAS descriptor. * Default is grb::descriptors::no_operation. * @tparam Operator Which operator to use for combining. * @tparam IOType The type of the vector elements. * - * @param[in,out] inout: The vector of \a N/P elements at each process. At the end of - * the call, each process shall hold the combined vectors. - * @param[in] op: The associative operator to combine by. + * @param[in,out] inout The vector of \a N/P elements at each process. At + * the end of the call, each process shall hold the + * combined vectors. + * @param[in] op The associative operator to combine by. * * @returns grb::SUCCESS When the operation succeeds as planned. * @returns grb::PANIC When the communication layer unexpectedly fails. When @@ -935,7 +1052,8 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename Operator, typename IOType #ifndef BLAS1_RAW @@ -949,16 +1067,19 @@ namespace grb { #else Vector< IOType, reference, Coords > &inout, #endif - const Operator op ) { + const Operator op + ) { // static sanity check - NO_CAST_ASSERT_BLAS1( ( ! ( descr & descriptors::no_casting ) || std::is_same< IOType, typename Operator::D1 >::value || std::is_same< IOType, typename Operator::D2 >::value || - std::is_same< IOType, typename Operator::D3 >::value ), - "grb::collectives::allcombine", + NO_CAST_ASSERT_BLAS1( ( !(descr & descriptors::no_casting) || + std::is_same< IOType, typename Operator::D1 >::value || + std::is_same< IOType, typename Operator::D2 >::value || + std::is_same< IOType, typename Operator::D3 >::value + ), "grb::collectives::allcombine", "Incompatible given value type and operator domains while " "no_casting descriptor was set" ); // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); #ifndef BLAS1_RAW const size_t size = internal::getCoordinates( inout ).size(); #endif @@ -1113,8 +1234,9 @@ namespace grb { } /** - * Schedules a combine operation of a vector of N/P elements of type IOType per process - * to a vector of N elements. 
+ * Schedules a combine operation of a vector of N/P elements of type IOType + * per process to a vector of N elements. + * * The combine shall be complete by the end of the call. This is a collective * graphBLAS operation. The BSP costs are as for the LPF #combine. * @@ -1123,10 +1245,11 @@ namespace grb { * @tparam Operator Which operator to use for combining. * @tparam IOType The type of the vector elements. * - * @param[in,out] inout: The vector of \a N/P elements at each process. At the end of - * the call, the root process shall hold the combined vectors. - * @param[in] op: The associative operator to combine by. - * @param[in] root: The root process. + * @param[in,out] inout The vector of \a N/P elements at each process. At + * the end of the call, the root process shall hold the + * combined vectors. + * @param[in] op The associative operator to combine by. + * @param[in] root The root process. * * @returns grb::SUCCESS When the operation succeeds as planned. * @returns grb::PANIC When the communication layer unexpectedly fails. When @@ -1158,32 +1281,37 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename Operator, typename IOType #ifndef BLAS1_RAW , typename Coords #endif - > + > RC combine( #ifdef BLAS1_RAW IOType * inout, const size_t size, #else - Vector< IOType, reference, Coords > & inout, + Vector< IOType, reference, Coords > &inout, #endif const Operator op, - const lpf_pid_t root ) { + const lpf_pid_t root + ) { // static sanity check - NO_CAST_ASSERT_BLAS1( ( ! ( descr & descriptors::no_casting ) || std::is_same< IOType, typename Operator::D1 >::value || std::is_same< IOType, typename Operator::D2 >::value || - std::is_same< IOType, typename Operator::D3 >::value ), - "grb::collectives::combine", + NO_CAST_ASSERT_BLAS1( ( !(descr & descriptors::no_casting) || + std::is_same< IOType, typename Operator::D1 >::value || + std::is_same< IOType, typename Operator::D2 >::value || + std::is_same< IOType, typename Operator::D3 >::value + ), "grb::collectives::combine", "Incompatible given value type and operator domains while " - "no_casting descriptor was set" ); + "no_casting descriptor was set" + ); // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); // make sure we can support comms pattern: IOType -> P * IOType lpf_coll_t coll; @@ -1418,13 +1546,14 @@ namespace grb { } /** - * Schedules a reduce operation of a vector of N/P elements of type IOType per process - * to a single element. + * Schedules a reduce operation of a vector of N/P elements of type IOType per + * process to a single element. + * * The reduce shall be complete by the end of the call. This is a collective * graphBLAS operation. The BSP costs are as for the LPF #reduce. * - * Since this is a collective call, there are \a N/P values \a in at each process - * Let these vectors be denoted by \f$ x_s \f$, with + * Since this is a collective call, there are \a N/P values \a in at each + * process. Let these vectors be denoted by \f$ x_s \f$, with * \f$ s \in \{ 0, 1, \ldots, P-1 \}, \f$ such that \f$ x_s \f$ equals the * argument \a in on input at the user process with ID \a s. 
Let * \f$ \pi:\ \{ 0, 1, \ldots, P-1 \} \to \{ 0, 1, \ldots, P-1 \} \f$ be a @@ -1463,7 +1592,8 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename Operator, typename InputType, typename IOType @@ -1471,26 +1601,30 @@ namespace grb { , typename Coords #endif - > + > RC reduce( #ifdef BLAS1_RAW const InputType * in, const size_t size, #else - const Vector< InputType, reference, Coords > & in, + const Vector< InputType, reference, Coords > &in, #endif - IOType & out, + IOType &out, const Operator op, - const lpf_pid_t root ) { + const lpf_pid_t root + ) { // static sanity check - NO_CAST_ASSERT_BLAS1( ( ! ( descr & descriptors::no_casting ) || std::is_same< InputType, typename Operator::D1 >::value || std::is_same< IOType, typename Operator::D2 >::value || - std::is_same< IOType, typename Operator::D3 >::value ), - "grb::collectives::reduce", + NO_CAST_ASSERT_BLAS1( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Operator::D1 >::value || + std::is_same< IOType, typename Operator::D2 >::value || + std::is_same< IOType, typename Operator::D3 >::value + ), "grb::collectives::reduce", "Incompatible given value type and operator domains while " - "no_casting descriptor was set" ); + "no_casting descriptor was set" + ); // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); // make sure we can support comms pattern: IOType -> P * IOType lpf_coll_t coll; @@ -1579,7 +1713,8 @@ namespace grb { } // reduce to the left - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename Operator, typename InputType, typename IOType @@ -1587,16 +1722,18 @@ namespace grb { , typename Coords #endif - > - RC reducel( IOType & out, + > + RC reducel( + IOType &out, #ifdef BLAS1_RAW const InputType * in, const size_t size, #else - const Vector< InputType, reference, Coords > & in, + const Vector< InputType, reference, Coords > &in, #endif const Operator op, - const lpf_pid_t root ) { + const lpf_pid_t root + ) { #ifdef BLAS1_RAW return reduce( in, size, out, op, root ); #else @@ -1605,7 +1742,8 @@ namespace grb { } // reduce to the right - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename Operator, typename InputType, typename IOType @@ -1613,7 +1751,7 @@ namespace grb { , typename Coords #endif - > + > RC reducer( #ifdef BLAS1_RAW const InputType * in, @@ -1621,9 +1759,10 @@ namespace grb { #else const Vector< InputType, reference, Coords > & in, #endif - IOType & out, + IOType &out, const Operator op, - const lpf_pid_t root ) { + const lpf_pid_t root + ) { #ifdef BLAS1_RAW return reduce( in, size, out, op, root ); #else @@ -1632,8 +1771,9 @@ namespace grb { } /** - * Schedules an allreduce operation of a vector of N/P elements of type IOType per process - * to a single element. + * Schedules an allreduce operation of a vector of N/P elements of type IOType + * per process to a single element. + * * The allreduce shall be complete by the end of the call. This is a collective * graphBLAS operation. The BSP costs are as for the LPF #allreduce. 
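+	 *
+	 * For illustration only, an allreduce under addition might be issued as in
+	 * the following sketch; the vector \a x, its local length \a n, and the
+	 * use of grb::operators::add are assumptions rather than requirements of
+	 * this interface:
+	 *
+	 * \code
+	 * grb::Vector< double > x( n );   // assumed: the process-local N/P elements
+	 * double alpha = 0.0;             // identity of the reduction
+	 * grb::operators::add< double > plus;
+	 * if( allreduce( x, alpha, plus ) != grb::SUCCESS ) {
+	 *     // handle communication failure
+	 * }
+	 * // every process now holds the same reduced value in alpha
+	 * \endcode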
* @@ -1676,8 +1816,8 @@ namespace grb { * \endparblock * */ - - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename Operator, typename InputType, typename IOType @@ -1685,25 +1825,29 @@ namespace grb { , typename Coords #endif - > + > RC allreduce( #ifdef BLAS1_RAW const InputType * in, const size_t size, #else - const Vector< InputType, reference, Coords > & in, + const Vector< InputType, reference, Coords > &in, #endif - IOType & out, - const Operator op ) { + IOType &out, + const Operator op + ) { // static sanity check - NO_CAST_ASSERT_BLAS1( ( ! ( descr & descriptors::no_casting ) || std::is_same< InputType, typename Operator::D1 >::value || std::is_same< IOType, typename Operator::D2 >::value || - std::is_same< IOType, typename Operator::D3 >::value ), - "grb::collectives::allreduce", + NO_CAST_ASSERT_BLAS1( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Operator::D1 >::value || + std::is_same< IOType, typename Operator::D2 >::value || + std::is_same< IOType, typename Operator::D3 >::value + ), "grb::collectives::allreduce", "Incompatible given value type and operator domains while " - "no_casting descriptor was set" ); + "no_casting descriptor was set" + ); // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); // make sure we can support comms pattern: P * IOType lpf_coll_t coll; @@ -1716,7 +1860,8 @@ namespace grb { // reduce our values locally // if casting is required to apply op, foldl will take care of this - // note: the no_casting check could be deferred to foldl but this would result in unclear error messages + // note: the no_casting check could be deferred to foldl but this would + // result in unclear error messages for( size_t i = 0; i < size; i++ ) { #ifdef BLAS1_RAW if( foldl< descr >( out, in[ i ], op ) != SUCCESS ) { @@ -1755,7 +1900,8 @@ namespace grb { continue; } // if casting is required to apply op, foldl will take care of this - // note: the no_casting check could be deferred to foldl but this would result in unclear error messages + // note: the no_casting check could be deferred to foldl but this would + // result in unclear error messages if( foldl< descr >( out, buffer[ i ], op ) != SUCCESS ) { return PANIC; } @@ -1771,7 +1917,8 @@ namespace grb { } // allreduce to the left - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename Operator, typename InputType, typename IOType @@ -1779,15 +1926,17 @@ namespace grb { , typename Coords #endif - > - RC allreducel( IOType & out, + > + RC allreducel( + IOType &out, #ifdef BLAS1_RAW const InputType * in, const size_t size, #else - const Vector< InputType, reference, Coords > & in, + const Vector< InputType, reference, Coords > &in, #endif - const Operator op ) { + const Operator op + ) { #ifdef BLAS1_RAW return allreduce( in, size, out, op ); #else @@ -1796,7 +1945,8 @@ namespace grb { } // allreduce to the right - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename Operator, typename InputType, typename IOType @@ -1804,7 +1954,7 @@ namespace grb { , typename Coords #endif - > + > RC allreducer( #ifdef BLAS1_RAW const InputType * in, @@ -1812,8 +1962,9 @@ namespace grb { #else const Vector< InputType, reference, Coords > & in, #endif - IOType & out, - const Operator op ) { + 
IOType &out, + const Operator op + ) { #ifdef BLAS1_RAW return allreduce( in, size, out, op ); #else @@ -1824,14 +1975,17 @@ namespace grb { /** * Schedules a broadcast operation of a vector of N elements of type IOType * to a vector of N elements per process. - * The broadcast shall be complete by the end of the call. This is a collective - * graphBLAS operation. The BSP costs are as for the LPF #broadcast. + * + * The broadcast shall be complete by the end of the call. This is a + * collective graphBLAS operation. The BSP costs are as for the LPF + * #broadcast. * * @tparam descr The GraphBLAS descriptor. * Default is grb::descriptors::no_operation. * @tparam IOType The type of the to-be broadcast vector element values. * - * @param[in,out] inout On input: the vector at the root process to be broadcast. + * @param[in,out] inout On input: the vector at the root process to be + * broadcast. * On output at process \a root: the same value. * On output at non-root processes: the vector at root. * @@ -1858,23 +2012,25 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename IOType #ifndef BLAS1_RAW , typename Coords #endif - > + > RC broadcast( #ifdef BLAS1_RAW IOType * inout, const size_t size, #else - Vector< IOType, reference, Coords > & inout, + Vector< IOType, reference, Coords > &inout, #endif - const lpf_pid_t root ) { + const lpf_pid_t root + ) { // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); #ifndef BLAS1_RAW const size_t size = internal::getCoordinates( inout ).size(); @@ -1927,3 +2083,4 @@ namespace grb { } // namespace internal } // namespace grb + diff --git a/include/graphblas/bsp/collectives_blas1_raw.hpp b/include/graphblas/bsp/collectives_blas1_raw.hpp index 71e37fb09..3c61e1b7e 100644 --- a/include/graphblas/bsp/collectives_blas1_raw.hpp +++ b/include/graphblas/bsp/collectives_blas1_raw.hpp @@ -53,9 +53,10 @@ "**********************\n" ); #define BLAS1_RAW -#include "collectives_blas1.hpp" + #include "collectives_blas1.hpp" #undef BLAS1_RAW #undef NO_CAST_ASSERT_BLAS1 #endif // end ``_H_GRB_BSP_COLL_BLAS1_RAW'' + diff --git a/include/graphblas/bsp/exec_broadcast_routines.hpp b/include/graphblas/bsp/exec_broadcast_routines.hpp new file mode 100644 index 000000000..c577fa984 --- /dev/null +++ b/include/graphblas/bsp/exec_broadcast_routines.hpp @@ -0,0 +1,81 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file exec_broadcast_routines.hpp + * + * Routines used in the Launcher for broadcasting data. 
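+ *
+ * The two routines declared below are meant to be used in tandem: first
+ * initialise a collectives context, then register the payload and broadcast
+ * it from user process zero. A minimal sketch follows (illustrative only;
+ * \a ctx, \a s, \a P, and \a payload are assumed to be provided by the
+ * caller, and error handling is reduced to asserts):
+ *
+ * \code
+ * lpf_coll_t coll;
+ * lpf_err_t rc = lpf_init_collectives_for_broadcast( ctx, s, P, 2, coll );
+ * assert( rc == LPF_SUCCESS );
+ * rc = lpf_register_and_broadcast( ctx, coll, &payload, sizeof( payload ) );
+ * assert( rc == LPF_SUCCESS );
+ * \endcode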
+ * + * @author Alberto Scolari + * @date August, 2023 + */ + +#ifndef _H_BSP1D_EXEC_BROADCAST_ROUTINES +#define _H_BSP1D_EXEC_BROADCAST_ROUTINES + +#include + +#include +#include + + +namespace grb { + + namespace internal { + + /** Global internal singleton to track whether MPI was initialized. */ + extern bool grb_mpi_initialized; + + /** + * Initialize collective communication for broadcast. + * + * @param[in,out] ctx Fresh(!) LPF context to work with. + * @param[in] s This user process ID. + * @param[in] P Total number of user processes. + * @param[in] regs Total number of memory slot registrations to be made + * as part of preparing for the broadcast. + * @param[out] coll New collectives context. + * + * \internal We follow here the LPF convention where output arguments are + * ordered last. + */ + lpf_err_t lpf_init_collectives_for_broadcast( + lpf_t &ctx, + const lpf_pid_t s, const lpf_pid_t P, + const size_t regs, + lpf_coll_t &coll + ); + + /** + * Register a memory area as a global one and perform a broadcast. + * + * @param[in,out] ctx The LPF context in which \a coll was initialised. + * @param[in] coll The initialised collectives context. + * @param[in] data Pointer to data to broadcast. + * @param[in[ size The size of the data (in bytes) to broadcast. + */ + lpf_err_t lpf_register_and_broadcast( + lpf_t &ctx, lpf_coll_t &coll, + void * const data, const size_t size + ); + + } // end internal + +} // end grb + +#endif // _H_BSP1D_EXEC_BROADCAST_ROUTINES + diff --git a/include/graphblas/bsp1d/benchmark.hpp b/include/graphblas/bsp1d/benchmark.hpp index 99bf865c5..31717d187 100644 --- a/include/graphblas/bsp1d/benchmark.hpp +++ b/include/graphblas/bsp1d/benchmark.hpp @@ -16,19 +16,24 @@ */ /* - * @author A. N. Yzelman - * @date 17th of April, 2017 + * @author A. N. Yzelman; Alberto Scolari + * @date 17th of April, 2017; 28 of August 2023 */ #ifndef _H_GRB_BSP1D_BENCH #define _H_GRB_BSP1D_BENCH +#include +#include + #include -#include -#include #include +#include + +#include + #include "exec.hpp" @@ -36,530 +41,271 @@ namespace grb { namespace internal { - struct packedBenchmarkerInput { - const void * blob; - size_t blob_size; + /** + * Data structure with input and benchmarking information. + * + * @tparam InputType The input type. + * @tparam OutputType The output type. + * @tparam _mode The #grb::EXEC_MODE of the benchmarker. + * + * In automatic mode, this struct must be broadcast from process 0 to the + * other processes, as it contains the valid number of inner and outer + * iterations. In other modes, all processes must choose the same number + * of inner/outer iterations, otherwise deadlocks may occur. + * + * @tparam _requested_broadcast Whether or not the user has requested input be + * broadcast. + * + * @tparam untyped_call Whether the user has made a benchmark request + * using an untyped ALP program. + */ + template< + typename InputType, typename OutputType, + EXEC_MODE _mode, + bool _requested_broadcast, + bool untyped_call + > + struct BenchmarkDispatcher : + ExecDispatcher< + InputType, OutputType, + _mode, _requested_broadcast, + untyped_call + >, + protected BenchmarkerBase + { + /** Whether the dispatcher requires broadcasting. */ + static constexpr bool needs_initial_broadcast = _mode == AUTOMATIC; + + /** Inner number of experiments. */ size_t inner; - size_t outer; - bool bcast_blob; - }; - - } // namespace internal -} // namespace grb - -/** Global internal function used to call lpf_hook with. 
*/ -template< typename T, typename U > -void _grb_bench_spmd( lpf_t ctx, lpf_pid_t s, lpf_pid_t P, lpf_args_t args ) { - assert( P > 0 ); - assert( s < P ); - - // construct default input type - T data_in_local; - // get input struct - assert( args.input_size == - sizeof( struct grb::internal::packedBenchmarkerInput ) ); - const struct grb::internal::packedBenchmarkerInput input = - *static_cast< const struct grb::internal::packedBenchmarkerInput * >( - args.input - ); - - // get input data from PID 0 - if( input.bcast_blob && P > 1 ) { - // init BSP & collectives - lpf_coll_t coll; - lpf_err_t brc = lpf_resize_message_queue( ctx, 2*(P-1) ); - assert( brc == LPF_SUCCESS ); - brc = lpf_resize_memory_register( ctx, 2 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_collectives_init( ctx, s, P, 0, 0, 0, &coll ); - assert( brc == LPF_SUCCESS ); - - // we need input fields from root - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - lpf_memslot_t global; - if( s == 0 ) { - assert( input.blob_size == sizeof( T ) ); - brc = lpf_register_global( ctx, - const_cast< void * >( input.blob ), input.blob_size, &global ); - } else { - brc = lpf_register_global( ctx, &data_in_local, sizeof( T ), &global ); - } - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_broadcast( coll, global, global, sizeof( T ), 0 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_deregister( ctx, global ); - assert( brc == LPF_SUCCESS ); -#ifdef NDEBUG - (void)brc; -#endif - } else { - // if we do not broadcast then everyone should have their own local input - assert( input.blob_size == sizeof( T ) ); - } - - // get input data - const T &data_in = input.bcast_blob ? - // then get unified view of input data after broadcast - ( s == 0 ? - *static_cast< const T * >( input.blob ) : - data_in_local - ) : - // otherwise just copy from args_in if there is one (to catch automatic mode) - *static_cast< const T * >( input.blob ); - - // we need an output field - U data_out_local = U(); - U &data_out = args.output_size == sizeof( U ) ? - *static_cast< U * >( args.output ) : // if we were passed output area, use it - data_out_local; // otherwise use local empty output area - - // init graphblas - if( grb::init( s, P, ctx ) != grb::SUCCESS ) { - std::cerr << "Could not initialise ALP/GraphBLAS" << std::endl; - assert( false ); - return; // note that there is no way to return error codes - } - - // retrieve and run the function to be executed - assert( args.f_size == 2 ); - // retrieve benchmarking functions - typedef void ( *grb_func_t )( const T &, U & ); - typedef void ( *bench_func_t )( - void ( *grb_program )( const T &, U & ), - const T &, U &, - size_t, size_t, lpf_pid_t - ); - bench_func_t bench_program = - reinterpret_cast< bench_func_t >( args.f_symbols[ 0 ] ); - grb_func_t grb_program = reinterpret_cast< grb_func_t >( args.f_symbols[ 1 ] ); - // execute benchmark - ( *bench_program )( - grb_program, data_in, data_out, input.inner, input.outer, s - ); - - // close GraphBLAS context and done! - if( grb::finalize() != grb::SUCCESS ) { - std::cerr << "Could not finalise ALP/GraphBLAS" << std::endl; - assert( false ); - return; - } -} - -/** Global internal function used to call lpf_hook with. 
*/ -template< typename U > -void _grb_bench_varin_spmd( lpf_t ctx, - lpf_pid_t s, lpf_pid_t P, - lpf_args_t args -) { - assert( P > 0 ); - assert( s < P ); - - // input data to grbProgram - void * data_in = nullptr; - // get input struct - assert( args.input_size == - sizeof( struct grb::internal::packedBenchmarkerInput ) ); - const struct grb::internal::packedBenchmarkerInput input = - *static_cast< const struct grb::internal::packedBenchmarkerInput * >( - args.input - ); - - // size of the data_in block - size_t size; - - // we need input fields from root. First synchronise on input size - if( input.bcast_blob && P > 1 ) { - - // init collectives - lpf_coll_t coll; - lpf_err_t brc = lpf_resize_message_queue( ctx, P - 1 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_resize_memory_register( ctx, 2 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_collectives_init( ctx, s, P, 1, 0, sizeof( size_t ), &coll ); - assert( brc == LPF_SUCCESS ); - - // broadcast the size of data - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - lpf_memslot_t global; - if( s == 0 ) { - size = input.blob_size; - } - brc = lpf_register_global( ctx, &size, sizeof( size_t ), &global ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_broadcast( coll, global, global, sizeof( size_t ), 0 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_deregister( ctx, global ); - assert( brc == LPF_SUCCESS ); - - // now that the input size is known, retrieve the input data - if( s > 0 ) { - data_in = new char[ size ]; - } else { - data_in = const_cast< void * >( input.blob ); - } - brc = lpf_register_global( ctx, data_in, size, &global ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_broadcast( coll, global, global, size, 0 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_deregister( ctx, global ); - assert( brc == LPF_SUCCESS ); - -#ifdef NDEBUG - (void)brc; -#endif - } else { - data_in = const_cast< void * >( input.blob ); - size = input.blob_size; - } - - // we need an output field - U data_out_local = U(); - U &data_out = args.output_size == sizeof( U ) ? - *static_cast< U * >( args.output ) : - data_out_local; - // note: the above switch handily catches automatic mode - - // init graphblas - if( grb::init( s, P, ctx ) != grb::SUCCESS ) { - std::cerr << "Could not initialise ALP/GraphBLAS" << std::endl; - assert( false ); - return; // note that there is no way to return error codes - } - - // retrieve and run the function to be executed - assert( args.f_size == 2 ); - // assume we are performing benchmarks - typedef void ( *grb_func_t )( void *, size_t, U & ); - typedef void ( *bench_func_t )( void ( *grb_program )( void *, size_t, U & ), - void *, size_t, U &, size_t, size_t, lpf_pid_t ); - bench_func_t bench_program = - reinterpret_cast< bench_func_t >( args.f_symbols[ 0 ] ); - grb_func_t grb_program = reinterpret_cast< grb_func_t >( args.f_symbols[ 1 ] ); - // run benchmark - ( *bench_program )( grb_program, (void *)data_in, size, - data_out, input.inner, input.outer, s ); - - // close GraphBLAS context and done! - if( grb::finalize() != grb::SUCCESS ) { - std::cerr << "Could not finalise ALP/GraphBLAS" << std::endl; - assert( false ); - return; - } -} - -/** Global internal function used to call lpf_exec with. 
*/ -template< typename T, typename U, bool varin > -void _grb_bench_exec( lpf_t ctx, lpf_pid_t s, lpf_pid_t P, lpf_args_t args ) { - assert( P > 0 ); - assert( s < P ); - - grb::internal::packedBenchmarkerInput input; - constexpr size_t size = sizeof( struct grb::internal::packedBenchmarkerInput ); - - // only call broadcast if P > 1, or otherwise UB - if( P > 1 ) { - // init and use collectives to broadcast input - lpf_coll_t coll; - const size_t nmsgs = P + 1 > 2 * P - 3 ? - P + 1 : - 2 * P - 3; // see LPF collectives doc - lpf_err_t brc = lpf_resize_message_queue( ctx, nmsgs ); - assert( brc == LPF_SUCCESS ); - brc = lpf_resize_memory_register( ctx, 3 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_collectives_init( ctx, s, P, 1, 0, size, &coll ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - lpf_memslot_t destination, source; - brc = lpf_register_global( ctx, &input, size, &destination ); - assert( brc == LPF_SUCCESS ); - if( s == 0 ) { - assert( args.input_size == size ); - brc = lpf_register_global( ctx, - const_cast< void * >( args.input ), size, &source ); - } else { - brc = lpf_register_global( ctx, - const_cast< void * >( args.input ), 0, &source ); - } - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_broadcast( coll, source, destination, size, 0 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_deregister( ctx, source ); - assert( brc == LPF_SUCCESS ); - brc = lpf_deregister( ctx, destination ); - assert( brc == LPF_SUCCESS ); -#ifdef NDEBUG - (void)brc; -#endif - } - - // non-root processes update args - if( s > 0 ) { - input.blob = nullptr; - input.blob_size = 0; - args.input = &input; - args.input_size = size; - assert( input.bcast_blob ); - } - - // now we are at exactly the equal state as a hook-induced function - if( varin ) { - _grb_bench_varin_spmd< U >( ctx, s, P, args ); - } else { - _grb_bench_spmd< T, U >( ctx, s, P, args ); - } -} - -namespace grb { - - template<> - class Benchmarker< FROM_MPI, BSP1D > : - protected Launcher< FROM_MPI, BSP1D >, protected internal::BenchmarkerBase - { - - public: + /** Outer number of experiments. */ + size_t outer; - Benchmarker( const MPI_Comm comm = MPI_COMM_WORLD ) : - Launcher< FROM_MPI, BSP1D >( comm ) + /** + * Builds dispatcher from basic information. + * + * @param[in] _in Pointer to the input data. + * @param[in] _in_size Byte size of the input data. + * @param[in] _inner The nummer of inner iterations. + * @param[in] _outer The number of outer iterations. + */ + BenchmarkDispatcher( + const InputType *_in, const size_t _in_size, + size_t _inner, size_t _outer + ) : + ExecDispatcher< InputType, OutputType, _mode, _requested_broadcast, + untyped_call >( _in, _in_size ), + inner( _inner ), outer( _outer ) {} - template< typename U > - RC exec( - void ( *grb_program )( const void *, const size_t, U & ), - const void * data_in, const size_t in_size, - U &data_out, - const size_t inner, const size_t outer, - const bool broadcast = false - ) const { - // check arguments - if( in_size > 0 && data_in == nullptr ) { - return ILLEGAL; + /** + * Reconstruct object from LPF args, where it is embedded in its input field. + * + * @param[in] s The process ID. + * @param[in] args The LPF I/O arguments. 
+ */ + BenchmarkDispatcher( const lpf_pid_t s, const lpf_args_t args ) : + ExecDispatcher< + InputType, OutputType, + _mode, _requested_broadcast, + untyped_call + >( nullptr, 0 ) + { + if( s > 0 && _mode == AUTOMATIC ) { + inner = 0; + outer = 0; + return; } + typedef BenchmarkDispatcher< + InputType, OutputType, + _mode, _requested_broadcast, + untyped_call + > self_t; + const self_t *orig = reinterpret_cast< const self_t * >( args.input ); + this->in = orig->in; + this->in_size = orig->in_size; + inner = orig->inner; + outer = orig->outer; + } - // prepare packed input - struct internal::packedBenchmarkerInput input; - input.blob = data_in; - input.blob_size = in_size; - input.inner = inner; - input.outer = outer; - input.bcast_blob = broadcast; - - // prepare args - lpf_func_t fargs[ 2 ]; - lpf_args_t args; - fargs[ 0 ] = reinterpret_cast< lpf_func_t >( benchmark< U > ); - fargs[ 1 ] = reinterpret_cast< lpf_func_t >( grb_program ); - args = { - &input, sizeof( struct internal::packedBenchmarkerInput ), - &data_out, sizeof( U ), - fargs, 2 + /** + * Benchmark the ALP function \a fun with the given input/output parameters. + * + * @param[in] fun The ALP function to run. + * @param[in] s The process ID. + * @param[in] P The total nuber of processes. + * @param[in] in Pointer to the input data. + * @param[in] in_size Byte size of the input data. + * @param[out] out Pointer where to output. + */ + grb::RC operator()( + const lpf_func_t fun, + const lpf_pid_t s, const lpf_pid_t P, + const InputType * const in, const size_t in_size, + OutputType * const out + ) const { + auto runner = [ fun, in_size, in, out, s, P ] () { + ExecDispatcher< + InputType, OutputType, + _mode, _requested_broadcast, + untyped_call + >::lpf_grb_call( fun, s, P, in, in_size, out ); }; + return benchmark< BSP1D >( runner, out->times, inner, outer, s ); + } - // do hook - const lpf_err_t spmdrc = lpf_hook( init, - &(_grb_bench_varin_spmd< U >), args ); + }; - // check error code - if( spmdrc != LPF_SUCCESS ) { - return PANIC; - } + } // namespace internal - // done - return SUCCESS; - } + /** + * Collection of processes that can launch an ALP function and benchmark it. + */ + template< enum EXEC_MODE mode > + class Benchmarker< mode, BSP1D > : protected Launcher< mode, BSP1D > { - template< typename T, typename U > - RC exec( - void ( *grb_program )( const T &, U & ), // user program - const T &data_in, U &data_out, // input & output data + private: + + /** Pack input/output data and run the given ALP function. 
*/ + template< typename T, typename U, bool untyped_call > + RC pack_and_run( + const lpf_func_t alp_program, + const T * const data_in, const size_t in_size, + U * const data_out, const size_t inner, const size_t outer, - const bool broadcast = false - ) { - // prepare packed input - struct internal::packedBenchmarkerInput input; - input.blob = data_in; - input.blob_size = sizeof( T ); - input.inner = inner; - input.outer = outer; - input.bcast_blob = broadcast; - - // prepare args - lpf_func_t fargs[ 2 ]; - lpf_args_t args; - fargs[ 0 ] = reinterpret_cast< lpf_func_t >( benchmark< T, U > ); - fargs[ 1 ] = reinterpret_cast< lpf_func_t >( grb_program ); - args = { &data_in, sizeof( T ), &data_out, sizeof( U ), fargs, 2 }; - - // do hook - const lpf_err_t spmdrc = lpf_hook( init, &(_grb_bench_spmd< T, U >), args ); - - // check error code - if( spmdrc != LPF_SUCCESS ) { - return PANIC; + const bool broadcast + ) const { + if( broadcast ) { + typedef internal::BenchmarkDispatcher< + T, U, mode, true, + untyped_call + > Disp; + Disp disp_info( data_in, in_size, inner, outer ); + return this->template run_lpf< T, U, Disp >( + alp_program, + reinterpret_cast< void * >( &disp_info ), + sizeof( Disp ), data_out + ); + } else { + typedef internal::BenchmarkDispatcher< + T, U, mode, false, + untyped_call + > Disp; + Disp disp_info = { data_in, in_size, inner, outer }; + return this->template run_lpf< T, U, Disp >( + alp_program, + reinterpret_cast< void * >( &disp_info ), + sizeof( Disp ), data_out + ); } - - // done - return SUCCESS; } - /** This implementation needs to release MPI resources in manual mode. */ - static enum RC finalize() { - // done - return Launcher< FROM_MPI, BSP1D >::finalize(); - } - }; - - template< enum EXEC_MODE mode > - class Benchmarker< mode, BSP1D > : - protected Launcher< mode, BSP1D >, protected internal::BenchmarkerBase - { - public: + /** import constructor(s) from base class, implicitly based on mode */ + using Launcher< mode, BSP1D >::Launcher; + /** - * \internal - * @param[in] process_id User process ID - * @param[in] nproces Total number of user processes - * @param[in] hostname One of the process' hostname - * @param[in] port A free port at \a hostname - * @param[in] is_mpi_inited Whether MPI is already initialised - * \endinternal + * Run an untyped ALP function in parallel. + * + * @tparam U The output type. + * + * @param[in] alp_program ALP function to execute in parallel. + * @param[in] data_in Pointer to input data. + * @param[in] in_size Size (in bytes) of the input data. + * @param[out] data_out Output data. + * @param[in] inner Number of inner iterations. + * @param[in] outer Number of outer iterations. + * @param[in] broadcast Whether to broadcast inputs from user process zero + * to all other user processes. + * + * @returns grb::SUCCESS On a successfully completed benchmark call. + * @returns grb::ILLEGAL If \a data_in is nullptr but \a in_size is + * larger than zero. + * @returns grb::PANIC On an unrecoverable critical failure (see base + * specification). 
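+			 *
+			 * A minimal sketch follows (illustrative only; \a myUntypedProgram,
+			 * \a MyOut, \a blob, and \a blob_size are placeholders, and \a MyOut
+			 * is assumed to carry the timing fields the benchmark dispatcher
+			 * above records into):
+			 *
+			 * \code
+			 * void myUntypedProgram( const void *, const size_t, MyOut & );
+			 *
+			 * grb::Benchmarker< grb::AUTOMATIC, grb::BSP1D > bench;
+			 * MyOut out;
+			 * const grb::RC rc = bench.exec(
+			 *     &myUntypedProgram, blob, blob_size, out, 10, 5, true );
+			 * \endcode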
*/ - Benchmarker( - const size_t process_id = 0, - const size_t nprocs = 1, - const std::string hostname = "localhost", - const std::string port = "0", - const bool is_mpi_inited = false - ) : Launcher< mode, BSP1D >( - process_id, nprocs, hostname, port, is_mpi_inited - ) {} - template< typename U > - enum RC exec( - void ( *grb_program )( const void *, const size_t, U & ), - const void * data_in, const size_t in_size, + RC exec( + const AlpUntypedFunc< U > alp_program, + const void * const data_in, const size_t in_size, U &data_out, const size_t inner, const size_t outer, const bool broadcast = false ) const { + static_assert( + mode != AUTOMATIC || + std::is_default_constructible< U >::value, + "The output type U should be default-constructible when using automatic " + "mode launchers." + ); // check input arguments if( in_size > 0 && data_in == nullptr ) { - return ILLEGAL; - } - - // prepare packed input - struct internal::packedBenchmarkerInput input; - input.blob = data_in; - input.blob_size = in_size; - input.inner = inner; - input.outer = outer; - input.bcast_blob = broadcast; - - // prepare args - lpf_func_t fargs[ 2 ]; - lpf_args_t args; - fargs[ 0 ] = reinterpret_cast< lpf_func_t >( benchmark< U > ); - fargs[ 1 ] = reinterpret_cast< lpf_func_t >( grb_program ); - args = { &input, sizeof( struct internal::packedBenchmarkerInput ), - &data_out, sizeof( U ), - fargs, 2 - }; - - // launch - lpf_err_t spmdrc = LPF_SUCCESS; - if( mode == MANUAL ) { - // do hook - spmdrc = lpf_hook( init, &(_grb_bench_varin_spmd< U >), args ); - } else { - assert( mode == AUTOMATIC ); - // do exec - spmdrc = lpf_exec( LPF_ROOT, LPF_MAX_P, - &(_grb_bench_exec< void, U, true >), args ); + return grb::ILLEGAL; } - - // check error code - if( spmdrc != LPF_SUCCESS ) { - return PANIC; - } - - // done - return SUCCESS; + return pack_and_run< void, U, true >( + reinterpret_cast< lpf_func_t >( alp_program ), + data_in, in_size, &data_out, + inner, outer, + broadcast + ); } - /** No implementation notes. */ + /** + * Run a typed ALP function in parallel. + * + * @tparam T Input type. + * @tparam U Output type. + * + * @param[in] alp_program The ALP function to execute in parallel. + * @param[in] data_in Pointer to the input data. + * @param[out] data_out The output data. + * @param[in] inner Number of inner iterations. + * @param[in] outer Number of outer iterations. + * @param[in] broadcast Whether to broadcast inputs from user process zero + * to all other user processes. + * + * @returns grb::SUCCESS On a successfully completed benchmark call. + * @returns grb::ILLEGAL If \a broadcast was false and the benchmarker is in + * #AUTOMATIC mode, while \a T is not default- + * constructible. + * @returns grb::PANIC On unrecoverable errors (see the base specification + * for details). 
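+			 *
+			 * A minimal sketch follows (illustrative only; \a myAlpProgram,
+			 * \a MyIn, and \a MyOut are placeholders, with \a MyOut assumed to
+			 * carry the timing fields the benchmark dispatcher above records
+			 * into):
+			 *
+			 * \code
+			 * void myAlpProgram( const MyIn &, MyOut & );
+			 *
+			 * grb::Benchmarker< grb::AUTOMATIC, grb::BSP1D > bench;
+			 * MyIn in; MyOut out;
+			 * const grb::RC rc = bench.exec( &myAlpProgram, in, out, 10, 5, true );
+			 * \endcode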
+ */ template< typename T, typename U > - enum RC exec( - void ( *grb_program )( const T &, U & ), // user GraphBLAS program - const T &data_in, U &data_out, // input & output data + RC exec( + const AlpTypedFunc< T, U > alp_program, + const T &data_in, U &data_out, const size_t inner, const size_t outer, const bool broadcast = false - ) { - // prepare packed input - struct internal::packedBenchmarkerInput input; - input.blob = &data_in; - input.blob_size = sizeof( T ); - input.inner = inner; - input.outer = outer; - input.bcast_blob = broadcast; - - // prepare args - lpf_func_t fargs[ 2 ]; - lpf_args_t args; - fargs[ 0 ] = reinterpret_cast< lpf_func_t >( benchmark< T, U > ); - fargs[ 1 ] = reinterpret_cast< lpf_func_t >( grb_program ); - args = { &input, sizeof( struct internal::packedBenchmarkerInput ), - &data_out, sizeof( U ), - fargs, 2 - }; - - // launch - lpf_err_t spmdrc = LPF_SUCCESS; - if( mode == MANUAL ) { - // do hook - spmdrc = lpf_hook( this->init, &(_grb_bench_spmd< T, U >), args ); - } else { - assert( mode == AUTOMATIC ); - // do exec - spmdrc = lpf_exec( LPF_ROOT, LPF_MAX_P, - &(_grb_bench_exec< T, U, false >), args ); - } - - // check error code - if( spmdrc != LPF_SUCCESS ) { - return PANIC; + ) const { + static_assert( + mode != AUTOMATIC || + std::is_default_constructible< U >::value, + "The output type U should be default-constructible when using automatic " + "mode launchers." + ); + if( + mode == AUTOMATIC && broadcast == false && + !std::is_default_constructible< T >::value + ) { + std::cerr << "Error: input type of an ALP function must be " + "default-constructible when using automatic mode benchmarkers without " + "broadcasting.\n"; + return grb::ILLEGAL; } - - // done - return SUCCESS; + return pack_and_run< T, U, false >( + reinterpret_cast< lpf_func_t >( alp_program ), + &data_in, sizeof( T ), &data_out, + inner, outer, + broadcast + ); } - /** This implementation needs to release MPI resources in manual mode. */ - static enum RC finalize() { - return Launcher< mode, BSP1D >::finalize(); - } + /** Reuse BSP1D launcher implementation of finalize. */ + using Launcher< mode, BSP1D >::finalize; }; diff --git a/include/graphblas/bsp1d/exec.hpp b/include/graphblas/bsp1d/exec.hpp index e8e627aa9..0d415d636 100644 --- a/include/graphblas/bsp1d/exec.hpp +++ b/include/graphblas/bsp1d/exec.hpp @@ -16,272 +16,800 @@ */ /* - * @author A. N. Yzelman - * @date 17th of April, 2017 + * @author A. N. Yzelman; Alberto Scolari + * @date 17th of April, 2017; 28 of August 2023 */ #ifndef _H_GRB_BSP1D_EXEC #define _H_GRB_BSP1D_EXEC +#include +#include +#include +#include +#include +#include +#include + +#ifndef _GRB_NO_STDIO + #include //for std::cerr +#endif + #include #include #include -#include -#include //for memcpy +#include //for EXEC_MODE::FROM_MPI support + +#include #include + #include + #include -#include #include "init.hpp" -#ifndef _GRB_NO_STDIO - #include //for std::cerr -#endif +#include "../bsp/exec_broadcast_routines.hpp" -/** Global internal singleton to track whether MPI was initialized. */ -extern bool _grb_mpi_initialized; +namespace grb { -/** Global internal function used to call lpf_hook or lpf_exec with. 
*/ -template< typename T, typename U, bool broadcast = true > -void _grb_exec_spmd( lpf_t ctx, lpf_pid_t s, lpf_pid_t P, lpf_args_t args ) { - assert( P > 0 ); - assert( s < P ); + namespace internal { -#ifdef _DEBUG - if( s == 0 ) { - std::cout << "Info: launcher spawned or hooked " << P << " ALP/GraphBLAS " - << "user processes.\n"; - } -#endif + /** + * Base data structure storing necessary data to run an ALP function through + * LPF. + * + * @tparam InputType The type of function input. + * @tparam mode The grb::EXEC_MODE of the launcher. + * @tparam _requested_broadcast Whether inputs shall be broadcast. + */ + template< + typename InputType, + EXEC_MODE _mode, + bool _requested_broadcast + > + struct DispatchInfo { + + /** Make available the launcher mode. */ + static constexpr EXEC_MODE mode = _mode; + + /** Make available whether input broadcast was requested. */ + static constexpr bool requested_broadcast = _requested_broadcast; + + /** Note: benchmarker classes may require initial broadcasts */ + static constexpr bool needs_initial_broadcast = false; + + /** Pointer to input argument. */ + const InputType * in; + + /** Byte size of input argument. */ + size_t in_size; + + /** + * Construct from base information. + * + * @param[in] _in Pointer to the input argument. + * @param[in] _in_size Byte size of the input argument. + */ + DispatchInfo( const InputType * const _in, const size_t _in_size ) : + in( _in ), in_size( _in_size ) + {} + + /** + * Construct from LPF arguments, following a call to lpf_hook() or + * lpf_exec(). + * + * @param[in] s The user process ID. + * @param[in] args The LPF I/O arguments. + */ + DispatchInfo( const lpf_pid_t s, const lpf_args_t args ) { + if( s > 0 && mode == AUTOMATIC ) { + in = nullptr; + in_size = 0; + } else { + in = static_cast< const InputType *>( args.input ); + in_size = args.input_size; + } + } + + /** @returns in */ + const InputType * get_input() const { return in; } + + /** @returns in_size */ + size_t get_input_size() const { return in_size; } + + }; + + /** + * Adaptor to run a typed ALP function: it stores relevant parameters for data + * broadcast. + * + * Inherited from DispatchInfo. + * + * Adapts the function call to the underlying type. + */ + template< + typename InputType, typename OutputType, + EXEC_MODE _mode, + bool _requested_broadcast, bool _variable_input + > + class ExecDispatcher : + public DispatchInfo< InputType, _mode, _requested_broadcast > + { + + protected: + + /** + * Static adapter for typed ALP functions. + * + * Casts and calls the opaque \a fun function. + * + * This function is factored out so as to allow its call from the BSP + * #grb::Benchmarker. + * + * @param[in] fun Pointer to the typed ALP function. + * @param[in] s The user process ID. + * @param[in] P The total number of user processes. + * @param[in] in Pointer to the input argument. + * @param[in] in_size Byte size of the input argument. + * @param[out] out Pointer to where to store the output. 
+ */ + static inline void lpf_grb_call( + const lpf_func_t fun, + const lpf_pid_t s, const lpf_pid_t P, + const InputType * const in, + const size_t in_size, + OutputType *out + ) { + (void) in_size; + (void) s; + (void) P; + reinterpret_cast< AlpTypedFunc< InputType, OutputType > >( fun ) + ( *in, *out ); + } + + + public: + + /** Use base constructor */ + using DispatchInfo< InputType, _mode, _requested_broadcast >::DispatchInfo; + + /** Typed dispatching has static size inputs */ + constexpr static bool is_input_size_variable = false; + + /** + * Functor operator to call a typed ALP function. + * + * @param[in] fun Pointer to the typed ALP function. + * @param[in] s The user process ID. + * @param[in] P The total number of user processes. + * @param[in] in Pointer to the input argument. + * @param[in] in_size Byte size of the input argument. + * @param[out] out Pointer to where to store the output. + */ + inline grb::RC operator()( + const lpf_func_t fun, + const lpf_pid_t s, const lpf_pid_t P, + const InputType *in, const size_t in_size, + OutputType * out + ) const { + lpf_grb_call( fun, s, P, in, in_size, out ); + return grb::SUCCESS; + } + + }; + + /** + * Adaptor to run an untyped ALP function. + * + * It stores relevant parameters for data broadcast (inherited from + * DispatchInfo) and adapts the function call to the underlying type. + */ + template< + typename OutputType, + EXEC_MODE _mode, + bool _requested_broadcast + > + class ExecDispatcher< void, OutputType, _mode, _requested_broadcast, true > : + public DispatchInfo< void, _mode, _requested_broadcast > + { + + protected: + + /** + * Calls an untyped ALP function. + * + * Factored out as a separate function to allow its use from the BSP + * #grb::Benchmarker. + * + * @param[in] fun Pointer to the untyped ALP function. + * @param[in] s The user process ID. + * @param[in] P The total number of user processes. + * @param[in] in Pointer to the input argument. + * @param[in] in_size Byte size of the input argument. + * @param[out] out Pointer to where to store the output. + */ + static inline void lpf_grb_call( + const lpf_func_t fun, + const lpf_pid_t s, const lpf_pid_t P, + const void * const in, const size_t in_size, + OutputType * const out + ) { + (void) s; + (void) P; + reinterpret_cast< AlpUntypedFunc< OutputType > >( fun ) + ( in, in_size, *out ); + } + + + public: + + /** Use base class constructor. */ + using DispatchInfo< void, _mode, _requested_broadcast >::DispatchInfo; + + /** Untyped inputs have variably-sized inputs. */ + constexpr static bool is_input_size_variable = true; + + /** + * Functor operator to call an untyped ALP function. + * + * @param[in] fun Pointer to the untyped ALP function. + * @param[in] s The user process ID. + * @param[in] P The total number of user processes. + * @param[in] in Pointer to the input argument. + * @param[in] in_size Byte size of the input argument. + * @param[out] out Pointer to where to store the output. 
+ */ + inline grb::RC operator()( + const lpf_func_t fun, + lpf_pid_t s, lpf_pid_t P, + const void * const in, const size_t in_size, + OutputType * const out + ) const { + lpf_grb_call( fun, s, P, in, in_size, out ); + return grb::SUCCESS; + } - T data_in_local; // construct default input type - - // get input data from PID 0 - if( broadcast && P > 1 ) { - - // init collectives - lpf_coll_t coll; - lpf_err_t brc = lpf_collectives_init( ctx, s, P, 0, 0, 0, &coll ); - assert( brc == LPF_SUCCESS ); - - // we need input fields from root, prepare for broadcast - brc = lpf_resize_message_queue( ctx, 2*(P-1) ); // two-phase broadcast may - // get up to P-1 messages and - // send up to P-1 messages - // per process - assert( brc == LPF_SUCCESS ); - brc = lpf_resize_memory_register( ctx, 2 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - lpf_memslot_t global; - if( s == 0 ) { - assert( args.input_size == sizeof( T ) ); - brc = lpf_register_global( ctx, - const_cast< void * >( args.input ), - args.input_size, &global + }; + + /** + * Allocator for data structures: if \a typed_allocation is \a true, then + * allocate \a T on the heap via its default contructor \a T(), otherwise as a + * byte array (without construction). + * + * @tparam T The type of the object that should be allocated. + * + * @tparam typed_allocation Whether or not we may rely on the default + * constructor of \a T. + * + * This allocator is only used for typed ALP functions. + */ + template< typename T, bool typed_allocation > + struct ExecAllocator { + + static_assert( std::is_default_constructible< T >::value, + "T must be default constructible" ); + + typedef std::function< void( T * ) > Deleter; + typedef std::unique_ptr< T, Deleter > PointerHolder; + + static PointerHolder make_pointer( size_t ) { + return PointerHolder( + new T(), // allocate with default construction + [] ( T * const ptr ) { delete ptr; } + ); + } + + }; + + /** + * Template specialisation for untyped allocation: data is allocated as a byte + * array and not initialised. + * + * This allocator is used for launching untyped ALP programs \em and may be + * used for launching typed ALP programs where inputs are not-default + * constructible but copiable. The latter only applies in broadcasting mode. + */ + template< typename T > + struct ExecAllocator< T, false > { + + typedef std::function< void( T * ) > Deleter; + typedef std::unique_ptr< T, Deleter > PointerHolder; + + static PointerHolder make_pointer( const size_t size ) { + return PointerHolder( reinterpret_cast< T * >( new char[ size ] ), + [] ( T * const ptr ) { delete [] reinterpret_cast< char * >( ptr ); } ); + } + + }; + + /** + * Dispatcher to be called via LPF for distributed execution of an ALP + * function. + * + * It handles type information of the called function via the + * \a DispatcherType structure. + * + * This call may perform memory allocations and initialisations depending + * on several conditions; in general, it performs these operations only + * if strictly needed. + * + * Depending on the \a mode type parameter, it attempts to create an input + * data structure if this is not available. This is especially important + * in AUTOMATIC mode, where processes with \a s > 0 have no data + * pre-allocated. 
+ * + * In AUTOMATIC mode, indeed, this function does its best to supply the user + * function with input data: + * - if broadcast was requested, data must be copied from the node with + * s == 0 to the other nodes; memory on s > 0 is allocated via \a T's + * default constructor if possible, or as a byte array; in the end, + * data on s > 0 is anyway overwritten by data from s == 0; + * - if broadcast was not requested, this function allocates sensible input + * by calling \a T's default constructor, if possible. If this is not + * possible, the call to this function shall have no other effect than + * (immediately) returning #grb::ILLEGAL. + * + * For modes other than AUTOMATIC, typed ALP functions are assumed to + * always have a pre-allocated input, allocated by the function that + * \em hooked into LPF; no memory is allocated in this case. If broadcast + * is requested, the input for s > 0 is simply overwritten with that from + * s == 0. For untyped functions, memory is allocated only if broadcasting + * is requested (because the size is known a priori only at user process 0), + * otherwise no allocation occurs and each ALP function takes the original + * input from the launching function. + * + * \note Thus, implicitly, if in #grb::MANUAL or in #grb::FROM_MPI modes with + * \a broadcast true, any input pointers at user processes + * \f$ s > 0 \f$ will be ignored. + * + * @tparam T ALP function input type. + * @tparam U ALP function outut type. + * @tparam DispatcherType Information on the ALP function to run. + * + * @param[in,out] ctx LPF context to run in. + * @param[in] s User process identifier (in the range [0, P)). + * @param[in] P Number of parallel processes. + * @param[in,out] args Input and output information for LPF calls. + */ + template< + typename T, typename U, + typename DispatcherType + > + void alp_exec_dispatch( + lpf_t ctx, + const lpf_pid_t s, const lpf_pid_t P, + lpf_args_t args + ) { + static_assert( + std::is_same< T, void >::value || + std::is_trivially_copyable< T >::value || + std::is_standard_layout< T >::value, + "The input type \a T must be void or memcpy-able (trivially copyable or" + "standard layout)." ); - } else { - assert( args.input_size == 0 ); - brc = lpf_register_global( ctx, &data_in_local, sizeof( T ), &global ); - } - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_broadcast( coll, global, global, sizeof( T ), 0 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_deregister( ctx, global ); - assert( brc == LPF_SUCCESS ); - -#ifdef NDEBUG - (void)brc; -#endif - } - - // sanity check - if( !broadcast ) { - // if we do not broadcast then everyone should have their own local input - assert( args.input_size == sizeof( T ) ); - } - - // get input data - const T &data_in = broadcast ? - // then get unified view of input data after broadcast - ( s == 0 ? *static_cast< const T * >( args.input ) : data_in_local ) : - // otherwise just copy from args_in if there is one (to catch automatic mode) - *static_cast< const T * >( args.input ); - - // we need an output field - U data_out_local = U(); - U &data_out = args.output_size == sizeof( U ) ? 
- *static_cast< U * >( args.output ) : // if we were passed output area, use it - data_out_local; // otherwise use local empy output area - - // initialise ALP/GraphBLAS - grb::RC grb_rc = grb::init( s, P, ctx ); - if( grb_rc != grb::SUCCESS ) { - std::cerr << "Error: could not initialise ALP/GraphBLAS" << std::endl; - assert( false ); - return; - } - - // retrieve and run the function to be executed - if( args.f_size == 1 ) { - typedef void ( *grb_func_t )( const T &, U & ); - grb_func_t grb_program = - reinterpret_cast< grb_func_t >( args.f_symbols[ 0 ] ); - ( *grb_program )( data_in, data_out ); - } else { - // assume we are performning benchmarks - typedef void ( *grb_func_t )( const T &, U & ); - typedef void ( *bench_func_t )( void ( *grb_program )( const T &, U & ), - const T &, U &, lpf_pid_t ); - bench_func_t bench_program = reinterpret_cast< bench_func_t >( args.f_symbols[ 0 ] ); - grb_func_t grb_program = reinterpret_cast< grb_func_t >( args.f_symbols[ 1 ] ); - ( *bench_program )( grb_program, data_in, data_out, s ); - } - - // finalise ALP/GraphBLAS - grb_rc = grb::finalize(); - if( grb_rc != grb::SUCCESS ) { - std::cerr << "Error: could not finalise ALP/GraphBLAS" << std::endl; - assert( false ); - } -} - -/** Global internal function used to call lpf_hook or lpf_exec with. */ -template< typename U, bool broadcast = true > -void _grb_exec_varin_spmd( lpf_t ctx, lpf_pid_t s, lpf_pid_t P, lpf_args_t args ) { - assert( P > 0 ); - assert( s < P ); + constexpr bool is_typed_alp_prog = !(DispatcherType::is_input_size_variable); + constexpr bool is_input_def_constructible = + std::is_default_constructible< T >::value; + constexpr grb::EXEC_MODE mode = DispatcherType::mode; + constexpr bool broadcast_input = DispatcherType::requested_broadcast; + constexpr bool dispatcher_needs_broadcast = + DispatcherType::needs_initial_broadcast; + + assert( P > 0 ); + assert( s < P ); #ifdef _DEBUG - // info to stdout - if( s == 0 ) { - std::cout << "Info: launcher spawned " << P << " processes.\n"; - } + if( s == 0 ) { + std::cout << "Info: launcher spawned or hooked " << P << " ALP user " + << "processes.\n"; + } #endif - // input data to grbProgram - void * data_in = NULL; - - // size of the data_in block - size_t size; - - // we need input fields from root. 
First synchronise on input size - if( broadcast && P > 1 ) { - - // init collectives - lpf_coll_t coll; - lpf_err_t brc = lpf_resize_message_queue( ctx, P - 1 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_resize_memory_register( ctx, 2 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_collectives_init( ctx, s, P, 1, 0, sizeof( size_t ), &coll ); - assert( brc == LPF_SUCCESS ); - - // broadcast the size of data - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - lpf_memslot_t global; - if( s == 0 ) { - size = args.input_size; - } - brc = lpf_register_global( ctx, &size, sizeof( size_t ), &global ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_broadcast( coll, global, global, sizeof( size_t ), 0 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_deregister( ctx, global ); - assert( brc == LPF_SUCCESS ); - - // now that the input size is known, retrieve the input data - if( s > 0 ) { - data_in = new char[ size ]; - } else { - data_in = const_cast< void * >( args.input ); + if( + !is_input_def_constructible && + is_typed_alp_prog && + mode == AUTOMATIC && + !broadcast_input && + P > 1 + ) { + std::cerr << "Error: cannot locally construct input type (typeid name \"" + << typeid(T).name() << "\"for an ALP program that is launched " + << "in automatic mode, with broadcasting, and using more than one user" + << "one user process.\n" + << "Additionally, this error should have been caught prior to the " + << "attempted launch of the ALP program-- please submit a bug report." + << std::endl; + assert( false ); + return; + } + + lpf_coll_t coll; + lpf_err_t brc = LPF_SUCCESS; + + // initialise collectives if they are needed + if( P > 1 && (broadcast_input || dispatcher_needs_broadcast) ) { + brc = lpf_init_collectives_for_broadcast( ctx, s, P, 2, coll ); + if( brc != LPF_SUCCESS ) { + std::cerr << __FILE__ << ", " << __LINE__ << ": LPF collective failed" + << std::endl; + } + assert( brc == LPF_SUCCESS ); + } + + // call information for the ALP function, reconstructed from the arguments + DispatcherType dispatcher( s, args ); + + // ensure dispatcher is valid + if( P > 1 && dispatcher_needs_broadcast ) { + // fetch the dispatcher + brc = lpf_register_and_broadcast( + ctx, coll, + static_cast< void * >( &dispatcher ), + sizeof( DispatcherType ) + ); + if( brc != LPF_SUCCESS ) { + std::cerr << __FILE__ << ", " << __LINE__ << ": LPF collective failed" + << std::endl; + } + assert( brc == LPF_SUCCESS ); + } + + // dispatcher is now valid on all processes: assign initial value for size + size_t in_size = dispatcher.get_input_size(); + + // set in_size on user processes with IDs larger than 0 + if( P > 1 ) { + // check if input args should come from PID 0 + if( broadcast_input ) { + // user requested broadcast and the input size is user-given: fetch size + lpf_err_t brc = lpf_register_and_broadcast( + ctx, coll, + reinterpret_cast< void * >( &in_size ), sizeof( size_t ) + ); + if( brc != LPF_SUCCESS ) { + std::cerr << __FILE__ << ", " << __LINE__ << ": LPF collective failed" + << std::endl; + } + assert( brc == LPF_SUCCESS ); + assert( in_size != 0 ); + } else if( mode == AUTOMATIC && !broadcast_input && s > 0 ) { + // AUTOMATIC mode, untyped, no broadcast: pass zero as size + in_size = 0; + } + } + + // now set the input argument (in) itself + constexpr bool typed_alloc = is_typed_alp_prog && is_input_def_constructible; + typedef 
ExecAllocator< T, typed_alloc > InputAllocator; + typename InputAllocator::PointerHolder data_in_holder; + + // set default value + const T * data_in = dispatcher.get_input(); + + // set in on user processes with IDs larger than 0 + if( s > 0 ) { + if( mode == AUTOMATIC && !is_typed_alp_prog && !broadcast_input ) { + // AUTOMATIC mode, untyped, no broadcast: pass nullptr + data_in = nullptr; + } else if( mode == AUTOMATIC || (broadcast_input && !is_typed_alp_prog) ) { + // if no memory exists (mode == AUTOMATIC) or the size was not known and + // the user requested broadcast, then allocate input data + data_in_holder = InputAllocator::make_pointer( in_size ); + data_in = data_in_holder.get(); + } + } + + // set contents of in + if( broadcast_input && P > 1 ) { + // retrieve data + lpf_err_t brc = lpf_register_and_broadcast( + ctx, coll, + const_cast< void * >( reinterpret_cast< const void * >( data_in ) ), + in_size + ); + if( brc != LPF_SUCCESS ) { + std::cerr << __FILE__ << ", " << __LINE__ << ": LPF collective failed" + << std::endl; + } + assert( brc == LPF_SUCCESS ); + } + + // now set the output argument + typedef ExecAllocator< U, std::is_default_constructible< U >::value > + OutputAllocator; + typename OutputAllocator::PointerHolder data_out_holder; + + // set default value + U * data_out = reinterpret_cast< U * >( args.output ); + + // set out on user processes with ID larger than 0 + if( mode == AUTOMATIC && s > 0 ) { + // allocate output if memory does not exist + data_out_holder = OutputAllocator::make_pointer( sizeof( U ) ); + data_out = reinterpret_cast< U * >( data_out_holder.get() ); + } + + // at this point, the dispatcher, input, and output are all good to go + + // now, initialise ALP + grb::RC grb_rc = grb::init< BSP1D >( s, P, ctx ); + if( grb_rc != grb::SUCCESS ) { + std::cerr << "Error: could not initialise ALP/GraphBLAS" << std::endl; + assert( false ); + return; + } + + // retrieve and run the function to be executed + assert( args.f_size == 1 ); + grb_rc = dispatcher( args.f_symbols[ 0 ], s, P, data_in, in_size, data_out ); + if( grb_rc != grb::SUCCESS ) { + std::cerr << "Error: dispatcher failed" << std::endl; + assert( false ); + return; + } + + // finalise ALP/GraphBLAS + grb_rc = grb::finalize< BSP1D >(); + if( grb_rc != grb::SUCCESS ) { + std::cerr << "Error: could not finalise ALP/GraphBLAS" << std::endl; + assert( false ); + } } - brc = lpf_register_global( ctx, data_in, size, &global ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_broadcast( coll, global, global, size, 0 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_deregister( ctx, global ); - assert( brc == LPF_SUCCESS ); - -#ifdef NDEBUG - (void)brc; -#endif - } else { - data_in = const_cast< void * >( args.input ); - size = args.input_size; - } - - // we need an output field - U data_out_local = U(); - U &data_out = args.output_size == sizeof( U ) ? 
- *static_cast< U * >( args.output ) : - data_out_local; - // note: the above switch handily catches automatic mode - - // initialise ALP/GraphBLAS - grb::RC grb_rc = grb::init( s, P, ctx ); - if( grb_rc != grb::SUCCESS ) { - std::cerr << "Error: could not initialise ALP/GraphBLAS" << std::endl; - assert( false ); - return; - } - - // retrieve and run the function to be executed - if( args.f_size == 1 ) { - typedef void ( *grb_func_t )( void *, size_t, U & ); - grb_func_t grb_program = - reinterpret_cast< grb_func_t >( args.f_symbols[ 0 ] ); - ( *grb_program )( (void *)data_in, size, data_out ); - } else { - // assume we are performning benchmarks - typedef void ( *grb_func_t )( void *, size_t, U & ); - typedef void ( *bench_func_t )( void ( *grb_program )( void *, size_t, U & ), - void *, size_t, - U &, lpf_pid_t - ); - bench_func_t bench_program = reinterpret_cast< bench_func_t >( args.f_symbols[ 0 ] ); - grb_func_t grb_program = reinterpret_cast< grb_func_t >( args.f_symbols[ 1 ] ); - ( *bench_program )( grb_program, (void *)data_in, size, data_out, s ); - } - - // finalise ALP/GraphBLAS - grb_rc = grb::finalize(); - if( grb_rc != grb::SUCCESS ) { - std::cerr << "Error: could not finalise ALP/GraphBLAS" << std::endl; - assert( false ); - } -} -namespace grb { + /** + * Base class for Launcher's, with common logic and information; mainly + * wrapping user #exec() parameters into internal data structures and calling + * LPF. + * + * @tparam mode grb::EXEC_MODE LPF execution mode + */ + template< enum EXEC_MODE mode > + class BaseLpfLauncher { + + protected: + + /** The LPF init struct. Will be initialised during construction. */ + lpf_init_t init; + + /** Base constructor. */ + BaseLpfLauncher() : init( LPF_INIT_NONE ) {} + + /** Disable copy constructor. */ + BaseLpfLauncher( const BaseLpfLauncher< mode > & ) = delete; + + /** Disable copy constructor. */ + BaseLpfLauncher & operator=( const BaseLpfLauncher< mode > & ) = delete; + + /** + * Run the given \a alp_program with the given pointers to input and output + * arguments. + * + * @tparam T Input type. + * @tparam U Output type. + * @tparam DispatcherType Type of the data structure that holds input and + * call information. + * + * @param[in] alp_program The ALP program to execute. + * @param[in] data_in Pointer to the input argument. + * @param[in] in_size Byte size of the input arugment. + * @param[out] data_out Pointer to where to write output. + * + * @return RC status code of the LPF call. + * + * \warning Issues with default-constructibility of the input type \a T + * (in the case of AUTOMATIC mode and no-broadcasting), while + * caught in the SPMD program itself as a safety measure, should + * be caught before a call to this function in order to comply with + * the specification. + * + * \note This function is factored out for use with the BSP + * #grb::Benchmarker. + */ + template< + typename T, typename U, + typename DispatcherType + > + RC run_lpf( + const lpf_func_t alp_program, + const void * const data_in, + const size_t in_size, + U * const data_out + ) const { + // construct LPF I/O args + lpf_args_t args = { + data_in, in_size, + data_out, sizeof( U ), + &alp_program, 1 + }; + + // get LPF function pointer + lpf_spmd_t fun = reinterpret_cast< lpf_spmd_t >( + internal::alp_exec_dispatch< T, U, DispatcherType > ); + + // execute + const lpf_err_t spmdrc = init == LPF_INIT_NONE + ? 
lpf_exec( LPF_ROOT, LPF_MAX_P, fun, args )
+					: lpf_hook( init, fun, args );
+
+				// check error code
+				if( spmdrc != LPF_SUCCESS ) {
+					return PANIC;
+				}

-	/**
-	 * No implementation notes.
-	 */
-	template<>
-	class Launcher< FROM_MPI, BSP1D > {
+				// done
+				return SUCCESS;
+			}

-		protected:
+		private:
+
+			/**
+			 * Pack data received from user into an internal::ExecDispatcher data
+			 * structure and run the ALP program.
+			 *
+			 * @tparam T            Input type.
+			 * @tparam U            Output type.
+			 * @tparam untyped_call Whether the ALP function call is untyped.
+			 *
+			 * \note If \a untyped_call is true, then \a T must be
+			 *       void.
+			 *
+			 * @param[in] alp_program The ALP program to execute.
+			 * @param[in] data_in     Pointer to input data.
+			 * @param[in] in_size     Size of the input data.
+			 *
+			 * \warning \a in_size must equal sizeof( T ) if \a untyped_call
+			 *          equals false.
+			 *
+			 * @param[out] data_out  Pointer to where to write output data.
+			 * @param[in]  broadcast Whether to broadcast input from node 0 to all
+			 *                       others.
+			 *
+			 * \warning Issues with default-constructibility of the input type \a T
+			 *          (in the case of AUTOMATIC mode and no-broadcasting), while
+			 *          caught in the SPMD program itself as a safety measure, should
+			 *          be caught before a call to this function in order to comply with
+			 *          the specification.
+			 *
+			 * @returns #grb::SUCCESS When the ALP program was launched successfully.
+			 * @returns #grb::PANIC   On error in the communication layer while
+			 *                        launching the program, during program execution,
+			 *                        or while terminating the program.
+			 */
+			template< typename T, typename U, bool untyped_call >
+			RC pack_data_and_run(
+				const lpf_func_t alp_program,
+				const T * const data_in,
+				const size_t in_size,
+				U * const data_out,
+				const bool broadcast
+			) const {
+				static_assert( std::is_void< T >::value || !untyped_call,
+					"If T is not void, this must refer to a typed ALP program call" );
+				if( !untyped_call ) {
+					assert( grb::utils::SizeOf< T >::value == in_size );
+				}
+				if( broadcast ) {
+					typedef internal::ExecDispatcher< T, U, mode, true, untyped_call > Disp;
+					return run_lpf< T, U, Disp >( alp_program, data_in, in_size, data_out );
+				} else {
+					typedef internal::ExecDispatcher< T, U, mode, false, untyped_call > Disp;
+					return run_lpf< T, U, Disp >( alp_program, data_in, in_size, data_out );
+				}
+			}

-		/** The LPF init struct. Will be initialised during construction. */
-		lpf_init_t init;
+		public:
+
+			/**
+			 * Run a typed ALP function distributed via LPF.
+			 *
+			 * In case of AUTOMATIC mode, input data is allocated by default (if the type
+			 * allows) or as a sequence of bytes. This assumes the default allocator does
+			 * not have \b any side effect (like memory allocation). In case of broadcast
+			 * request, data is trivially serialized: hence, non-trivial objects (e.g.,
+			 * objects storing pointers to memory buffers) are no longer valid in
+			 * processes other than the master.
+			 *
+			 * @tparam T Input type.
+			 * @tparam U Output type.
+			 *
+			 * @param[in]  alp_program ALP function to run in parallel.
+			 * @param[in]  data_in     Input data.
+			 * @param[out] data_out    Output data.
+			 * @param[in]  broadcast   Whether to broadcast input from node 0 to the
+			 *                         others.
+			 *
+			 * @returns #grb::SUCCESS When the ALP program was launched successfully.
+			 * @returns #grb::ILLEGAL When the ALP program was launched in AUTOMATIC
+			 *                        mode, without broadcasting, while \a T was not
+			 *                        default-constructible.
+			 * @returns #grb::PANIC   On error in the communication layer while
+			 *                        launching the program, during program execution,
+			 *                        or while terminating the program.
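+			 *
+			 * \par Example
+			 *
+			 * The following sketch is illustrative only: the program
+			 * \a myALPProgram, its argument types, and the chosen input value are
+			 * hypothetical, and the call is shown through a public #grb::Launcher
+			 * that derives from this base class.
+			 *
+			 * \code
+			 * void myALPProgram( const int &in, grb::RC &out ) {
+			 *     // ... any ALP/GraphBLAS code reading in and writing to out ...
+			 *     out = grb::SUCCESS;
+			 * }
+			 *
+			 * grb::Launcher< grb::AUTOMATIC > launcher;
+			 * int in = 7;
+			 * grb::RC out = grb::PANIC;
+			 * // request that the input is broadcast from user process 0 to all others
+			 * const grb::RC rc = launcher.exec( &myALPProgram, in, out, true );
+			 * \endcode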
+			 */
+			template< typename T, typename U >
+			RC exec(
+				const AlpTypedFunc< T, U > alp_program,
+				const T &data_in,
+				U &data_out,
+				const bool broadcast = false
+			) {
+				static_assert(
+					mode != AUTOMATIC ||
+						std::is_default_constructible< U >::value,
+					"The output type U should be default-constructible when using automatic "
+					"mode launchers."
+				);
+				if(
+					mode == AUTOMATIC && broadcast == false &&
+					!std::is_default_constructible< T >::value
+				) {
+					return grb::ILLEGAL;
+				} else {
+					return pack_data_and_run< T, U, false >(
+						reinterpret_cast< lpf_func_t >( alp_program ),
+						&data_in, sizeof( T ),
+						&data_out, broadcast
+					);
+				}
+			}
+
+			/**
+			 * Run an untyped ALP function in parallel via LPF.
+			 *
+			 * Input data has variable size, known only at runtime. Therefore, the input
+			 * data cannot be constructed by default, but is instead serialized and
+			 * replicated as a mere sequence of bytes.
+			 *
+			 * @tparam U Output type.
+			 *
+			 * @param[in]  alp_program ALP function to run in parallel.
+			 * @param[in]  data_in     Pointer to input data.
+			 * @param[in]  in_size     Size of input data.
+			 * @param[out] data_out    Output data.
+			 * @param[in]  broadcast   Whether to broadcast input from node 0 to the
+			 *                         others.
+			 *
+			 * @returns #grb::SUCCESS When the ALP program was launched successfully.
+			 * @returns #grb::PANIC   On error in the communication layer while
+			 *                        launching the program, during program execution,
+			 *                        or while terminating the program.
+			 */
+			template< typename U >
+			RC exec(
+				const AlpUntypedFunc< U > alp_program,
+				const void * const data_in, const size_t in_size,
+				U &data_out,
+				const bool broadcast = false
+			) {
+				static_assert(
+					mode != AUTOMATIC ||
+						std::is_default_constructible< U >::value,
+					"The output type U should be default-constructible when using automatic "
+					"mode launchers."
+				);
+				return pack_data_and_run< void, U, true >(
+					reinterpret_cast< lpf_func_t >( alp_program ),
+					data_in, in_size, &data_out, broadcast
+				);
+			}
+
+	};
+
+	} // end namespace internal
+
+	/**
+	 * Specialisation of Launcher to be used when MPI has already been
+	 * initialised but not LPF.
+	 */
+	template<>
+	class Launcher< FROM_MPI, BSP1D > :
+		public internal::BaseLpfLauncher< FROM_MPI >
+	{

		public:

@@ -293,13 +821,6 @@ namespace grb {
			 * @throws runtime_error When a standard MPI call fails.
			 */
			Launcher( const MPI_Comm comm = MPI_COMM_WORLD ) {
-				// run-time sanity check when using MPI:
-				// we (as in LPF) should NOT be managing MPI
-				if( LPF_MPI_AUTO_INITIALIZE ) {
-					throw std::runtime_error( "Program was not linked with the symbol "
-						"LPF_MPI_AUTO_INITIALIZE set to 0 while an instance of "
-						"Launcher or Launcher is being requested." );
-				}

				// init from communicator
				const lpf_err_t initrc = lpf_mpi_initialize_with_mpicomm( comm, &init );
@@ -307,22 +828,15 @@ namespace grb {
				// check for success
				if( initrc != LPF_SUCCESS ) {
					throw std::runtime_error(
-						"LPF could not connect launcher group over TCP/IP."
+						"LPF could not be initialized via the given MPI communicator."
					);
				}

				// done!
			}

-			/** Disable copy constructor. */
-			Launcher( const Launcher & ) = delete;
-
-			/** Disable copy constructor. */
-			Launcher & operator=( const Launcher & ) = delete;
-
			/**
-			 * Implementation note: this Launcher will clear a field of
-			 * type \a lpf_init_t.
+			 * Implementation note: this Launcher will clear #init.
			 */
			~Launcher() {
				assert( init != LPF_INIT_NONE );
@@ -335,66 +849,6 @@ namespace grb {
				init = LPF_INIT_NONE;
			}

-			/** No implementation notes.
*/ - template< typename U > - RC exec( - void ( *grb_program )( const void *, const size_t, U & ), - const void * data_in, const size_t in_size, - U &data_out, - const bool broadcast = false - ) const { - // check input arguments - if( in_size > 0 && data_in == nullptr ) { - return ILLEGAL; - } - - // prepare args - lpf_func_t fargs[ 2 ]; - lpf_args_t args; - fargs[ 0 ] = reinterpret_cast< lpf_func_t >( grb_program ); - args = { data_in, in_size, &data_out, sizeof( U ), fargs, 1 }; - - // do hook - const lpf_err_t spmdrc = broadcast ? - lpf_hook( init, &(_grb_exec_varin_spmd< U, true >), args ) : - lpf_hook( init, &(_grb_exec_varin_spmd< U, false >), args ); - - // check error code - if( spmdrc != LPF_SUCCESS ) { - return PANIC; - } - - // done - return SUCCESS; - } - - /** No implementation notes. */ - template< typename T, typename U > - RC exec( - void ( *grb_program )( const T &, U & ), // user GraphBLAS program - const T &data_in, U &data_out, // input & output data - const bool broadcast = false - ) { - // prepare args - lpf_func_t fargs[ 2 ]; - lpf_args_t args; - fargs[ 0 ] = reinterpret_cast< lpf_func_t >( grb_program ); - args = { &data_in, sizeof( T ), &data_out, sizeof( U ), fargs, 1 }; - - // do hook - const lpf_err_t spmdrc = broadcast ? - lpf_hook( init, &(_grb_exec_spmd< T, U, true >), args ) : - lpf_hook( init, &(_grb_exec_spmd< T, U, false >), args ); - - // check error code - if( spmdrc != LPF_SUCCESS ) { - return PANIC; - } - - // done - return SUCCESS; - } - /** * Since the user is using ALP/GraphBLAS directly from MPI, the user codes * should call MPI_Finalize. This function thus is a no-op in this particular @@ -407,64 +861,83 @@ namespace grb { }; /** - * No implementation notes. + * Specialisation of Launcher for the automatic mode. + * + * Assumes LPF takes care of any initialisation requirements. */ - template< enum EXEC_MODE mode > - class Launcher< mode, BSP1D > { - - - private: - - // we should never be called for FROM_MPI mode-- the above - // specialisation should be used instead - static_assert( mode != FROM_MPI, - "EXEC_MODE::FROM_MPI for BSP1D is implemented in specialised class" ); - - /** The user process ID in this launcher group. */ - const size_t _s; + template<> + class Launcher< AUTOMATIC, BSP1D > : + public internal::BaseLpfLauncher< AUTOMATIC > + { - /** The total number of user processes in this launcher group. */ - const size_t _P; + public: - /** The connection broker in this launcher group. */ - const std::string _hostname; + Launcher() = default; - /** The port at #_hostname used for brokering connections. */ - const std::string _port; + ~Launcher() { + assert( init == LPF_INIT_NONE ); + } + static RC finalize() { + return grb::SUCCESS; + } - protected: + }; - /** The LPF init struct. Will be initialised during construction. */ - lpf_init_t init; + /** + * Specialisation of Launcher for the manual mode. + * + * The callee here manually connects existing processes into a joint LPF + * context, that is then used to execute (parallel) ALP programs. + * + * Assumes the pre-existing processes may be connected via TCP/IP. + */ + template< enum EXEC_MODE mode > + class Launcher< mode, BSP1D > : public internal::BaseLpfLauncher< mode > { + static_assert( mode == MANUAL, "Expected manual launcher mode" ); public: /** - * When \a mode is #AUTOMATIC, this implementation adheres to - * the base specification. When \a mode is #MANUAL, this - * implementation specifies additionally the following: + * Constructs a manual mode launcher. 
* - * The time-out of this constructor is thirty seconds. + * This implementation specifies the following constraints on the specified + * input arguments. * - * @param[in] hostname May not be empty. Must resolve to an IP. - * @param[in] port May not be empty. Must be either a port - * number of a registered service name. + * @param[in] process_id User process ID. + * @param[in] nprocs Total number of user processes. + * @param[in] hostname Host name (or IP) of one of the user processes + * involved in the collective construction of this + * launcher. May not be empty. + * @param[in] port A free port for connecting to \a hostname during the + * collective construction of this launcher. May not be + * empty. Must be either a port number of a registered + * service name. * - * In addition to the standard-defined exceptions, the following - * may additionally be thrown: - * @throws invalid_argument When hostname or port are empty. - * @throws runtime_error When the requested launcher group - * could not be created. + * The time-out of this constructor is two minutes. + * + * If giving a \a hostname as a string, it must resolve to an IP; if + * resolution fails, this constructor call will fail. + * + * If giving a \a port as a string, it must resolve to a port number; if + * resolution fails, this constructor call will fail. + * + * In addition to the standard-defined exceptions, the following errors may + * additionally be thrown: + * + * @throws invalid_argument When hostname or port are empty but \a nprocs is + * larger than one. + * @throws runtime_error When the requested launcher group could not be + * created. */ Launcher( - const size_t process_id = 0, // user process ID - const size_t nprocs = 1, // total number of user processes - const std::string hostname = "localhost", // one of the process' hostnames - const std::string port = "0", // a free port at hostname + const size_t process_id = 0, + const size_t nprocs = 1, + const std::string &hostname = "localhost", + const std::string &port = "0", const bool is_mpi_inited = false - ) : _s( process_id ), _P( nprocs ), _hostname( hostname ), _port( port ) { + ) { // sanity check if( nprocs == 0 ) { throw std::invalid_argument( "Total number of user processes must be " @@ -474,161 +947,49 @@ namespace grb { throw std::invalid_argument( "Process ID must be strictly smaller than " "total number of user processes." ); } - - // when using MPI in hook mode - if( mode == MANUAL ) { - // run-time sanity check when using MPI: - // we (as in LPF) should NOT be managing MPI - if( LPF_MPI_AUTO_INITIALIZE ) { - throw std::runtime_error( "Program was not linked with the symbol " - "LPF_MPI_AUTO_INITIALIZE set to 0 while an instance of " - "Launcher or Launcher is being requested." ); - } - // initialise MPI if not already done - if( !is_mpi_inited && !_grb_mpi_initialized ) { - if( MPI_Init( NULL, NULL ) != MPI_SUCCESS ) { - throw std::runtime_error( "Call to MPI_Init failed." ); - } else { - _grb_mpi_initialized = true; - } - } + if( nprocs > 1 && (hostname.empty() || port.empty()) ) { + throw std::invalid_argument( "Host or port names may not be empty if the " + "launcher group contains more than one process." ); } - // handle each mode's specifics - if( mode == MANUAL ) { - // additional sanity check - if( hostname.compare( "" ) == 0 || port.compare( "" ) == 0 ) { - throw std::invalid_argument( - "Hostname and/or port name cannot be empty." - ); + // initialise MPI if not already done + // TODO FIXME the MPI_Init should not be here. 
See GitHub issue #240. + if( !is_mpi_inited && !internal::grb_mpi_initialized ) { + if( MPI_Init( NULL, NULL ) != MPI_SUCCESS ) { + throw std::runtime_error( "Call to MPI_Init failed." ); + } else { + internal::grb_mpi_initialized = true; } + } - // try and create a lpf_init_t - const lpf_err_t initrc = lpf_mpi_initialize_over_tcp( - hostname.c_str(), port.c_str(), // server info - 120000, // time out - process_id, nprocs, // process info - &init - ); + // try and create a lpf_init_t + const lpf_err_t initrc = lpf_mpi_initialize_over_tcp( + hostname.c_str(), port.c_str(), // server info + 120000, // time out + process_id, nprocs, // process info + &(this->init) + ); - // check for success - if( initrc != LPF_SUCCESS ) { + // check for success + if( initrc != LPF_SUCCESS ) { #ifndef _GRB_NO_STDIO - throw std::runtime_error( - "LPF could not connect launcher group over TCP/IP." - ); + throw std::runtime_error( + "LPF could not connect launcher group over TCP/IP." + ); #endif - } - } else { - // sanity check: we should be in automatic mode - assert( mode == AUTOMATIC ); - // otherwise, we don't need init - init = LPF_INIT_NONE; } - } - /** Disable copy constructor. */ - Launcher( const Launcher & ) = delete; - - /** Disable copy constructor. */ - Launcher & operator=( const Launcher & ) = delete; - - /** - * Implementation note: this Launcher may need to clear a field of - * type \a lpf_init_t when used in MANUAL mode. - */ ~Launcher() { - if( mode == MANUAL ) { - assert( init != LPF_INIT_NONE ); - // try and destroy the lpf_init_t - const lpf_err_t finrc = lpf_mpi_finalize( init ); - if( finrc != LPF_SUCCESS ) { + assert( this->init != LPF_INIT_NONE ); + // try and destroy the lpf_init_t + const lpf_err_t finrc = lpf_mpi_finalize( this->init ); + if( finrc != LPF_SUCCESS ) { #ifndef _GRB_NO_STDIO - std::cerr << "Warning: could not destroy launcher::init from ~launcher.\n"; + std::cerr << "Warning: could not destroy launcher::init from ~launcher.\n"; #endif - } - init = LPF_INIT_NONE; - } else { - assert( init == LPF_INIT_NONE ); } - } - - /** No implementation notes. */ - template< typename U > - RC exec( - void ( *grb_program )( const void *, const size_t, U & ), - const void * data_in, const size_t in_size, - U &data_out, - const bool broadcast = false - ) const { - // check input arguments - if( in_size > 0 && data_in == nullptr ) { - return ILLEGAL; - } - - // prepare args - lpf_func_t fargs[ 2 ]; - lpf_args_t args; - fargs[ 0 ] = reinterpret_cast< lpf_func_t >( grb_program ); - args = { data_in, in_size, &data_out, sizeof( U ), fargs, 1 }; - - // launch - lpf_err_t spmdrc = LPF_SUCCESS; - if( mode == MANUAL ) { - // do hook - spmdrc = broadcast ? - lpf_hook( init, &(_grb_exec_varin_spmd< U, true >), args ) : - lpf_hook( init, &(_grb_exec_varin_spmd< U, false >), args ); - } else { - assert( mode == AUTOMATIC ); - // do exec - spmdrc = lpf_exec( LPF_ROOT, LPF_MAX_P, - &(_grb_exec_varin_spmd< U >), args ); - } - - // check error code - if( spmdrc != LPF_SUCCESS ) { - return PANIC; - } - - // done - return SUCCESS; - } - - /** No implementation notes. 
*/ - template< typename T, typename U > - RC exec( - void ( *grb_program )( const T &, U & ), // user GraphBLAS program - const T &data_in, U &data_out, // input & output data - const bool broadcast = false - ) { - // prepare args - lpf_func_t fargs[ 2 ]; - lpf_args_t args; - fargs[ 0 ] = reinterpret_cast< lpf_func_t >( grb_program ); - args = { &data_in, sizeof( T ), &data_out, sizeof( U ), fargs, 1 }; - - // launch - lpf_err_t spmdrc = LPF_SUCCESS; - if( mode == MANUAL ) { - // do hook - spmdrc = broadcast ? - lpf_hook( init, &(_grb_exec_spmd< T, U, true >), args ) : - lpf_hook( init, &(_grb_exec_spmd< T, U, false >), args ); - } else { - assert( mode == AUTOMATIC ); - // do exec - spmdrc = lpf_exec( LPF_ROOT, LPF_MAX_P, &(_grb_exec_spmd< T, U >), args ); - } - - // check error code - if( spmdrc != LPF_SUCCESS ) { - return PANIC; - } - - // done - return SUCCESS; + this->init = LPF_INIT_NONE; } /** @@ -636,15 +997,14 @@ namespace grb { */ static RC finalize() { // finalise MPI when in manual mode - if( mode == MANUAL && _grb_mpi_initialized ) { - _grb_mpi_initialized = false; - if( MPI_Finalize() != MPI_SUCCESS ) { + // TODO FIXME the MPI_Finalize should not be here. See GitHub issue #240. + if( internal::grb_mpi_initialized && MPI_Finalize() != MPI_SUCCESS ) { #ifndef _GRB_NO_STDIO - std::cerr << "Warning: MPI_Finalize returned non-SUCCESS exit code.\n"; + std::cerr << "Warning: MPI_Finalize returned non-SUCCESS exit code.\n"; #endif - return grb::PANIC; - } + return grb::PANIC; } + internal::grb_mpi_initialized = false; return grb::SUCCESS; } diff --git a/include/graphblas/bsp1d/matrix.hpp b/include/graphblas/bsp1d/matrix.hpp index 13ce193aa..ea0ed76b3 100644 --- a/include/graphblas/bsp1d/matrix.hpp +++ b/include/graphblas/bsp1d/matrix.hpp @@ -197,7 +197,7 @@ namespace grb { // check default fields that should have been set by public constructor assert( _m == 0 ); assert( _n == 0 ); - assert( _id = std::numeric_limits< uintptr_t >::max() ); + assert( _id == std::numeric_limits< uintptr_t >::max() ); assert( _ptr == nullptr ); assert( _cap == 0 ); // these default values correspond to an empty matrix and which the @@ -265,7 +265,7 @@ namespace grb { size_t global_cap = 0; try { // complete local initialisation - _local.initialize( &_id, local_m, local_n, local_nz ); + _local.initialize( &id, local_m, local_n, local_nz ); // sync global capacity global_cap = capacity( _local ); diff --git a/include/graphblas/exec.hpp b/include/graphblas/exec.hpp index 2bcf796aa..f7ecb8cc2 100644 --- a/include/graphblas/exec.hpp +++ b/include/graphblas/exec.hpp @@ -45,7 +45,10 @@ #ifdef _GRB_BACKEND namespace grb { - template< enum EXEC_MODE mode, enum Backend implementation = config::default_backend > + template< + enum EXEC_MODE mode, + enum Backend implementation = config::default_backend + > class Launcher; } #endif diff --git a/include/graphblas/hyperdags/benchmark.hpp b/include/graphblas/hyperdags/benchmark.hpp index 23502f33c..5492a8b6e 100644 --- a/include/graphblas/hyperdags/benchmark.hpp +++ b/include/graphblas/hyperdags/benchmark.hpp @@ -27,10 +27,8 @@ #ifndef _H_GRB_HYPERDAGS_BENCH #define _H_GRB_HYPERDAGS_BENCH -#include #include - -#include "exec.hpp" +#include namespace grb { @@ -38,60 +36,27 @@ namespace grb { /** \internal Simply wraps around the underlying Benchmarker implementation. 
*/ template< enum EXEC_MODE mode > class Benchmarker< mode, hyperdags > : - protected Launcher< mode, hyperdags >, protected internal::BenchmarkerBase + public Benchmarker< mode, _GRB_WITH_HYPERDAGS_USING > { private: typedef Benchmarker< mode, _GRB_WITH_HYPERDAGS_USING > MyBenchmarkerType; - MyBenchmarkerType benchmarker; - public: - /** \internal Simple delegation. */ - Benchmarker( - const size_t process_id = 0, - const size_t nprocs = 1, - const std::string hostname = "localhost", - const std::string port = "0" - ) : - benchmarker( process_id, nprocs, hostname, port ) - {} - - /** \internal Simple delegation. */ - template< typename U > - RC exec( void ( *grb_program )( const void *, const size_t, U & ), - const void * const data_in, const size_t in_size, - U &data_out, - const size_t inner, const size_t outer, - const bool broadcast = false - ) const { - return benchmarker.exec( - grb_program, - data_in, in_size, - data_out, - inner, outer, - broadcast - ); - } - - /** \internal Simple delegation. */ - template< typename T, typename U > - RC exec( - void ( *grb_program )( const T &, U & ), - const T &data_in, U &data_out, - const size_t inner, const size_t outer, - const bool broadcast = false - ) { - return benchmarker.exec( - grb_program, - data_in, data_out, - inner, outer, - broadcast - ); - } + /** + * \internal Delegates to #grb::Benchmarker constructor. By default, this + * reverts to the reference backend. + */ + using MyBenchmarkerType::Benchmarker; + + /** + * \internal Delegates to #grb::Benchmarker finalize. By default, this + * reverts to the reference backend. + */ + using MyBenchmarkerType::finalize; }; diff --git a/include/graphblas/hyperdags/exec.hpp b/include/graphblas/hyperdags/exec.hpp index 376e78b5b..14001e4be 100644 --- a/include/graphblas/hyperdags/exec.hpp +++ b/include/graphblas/hyperdags/exec.hpp @@ -37,64 +37,28 @@ namespace grb { * No implementation notes. */ template< EXEC_MODE mode > - class Launcher< mode, hyperdags > { + class Launcher< mode, hyperdags > : + public Launcher< mode, _GRB_WITH_HYPERDAGS_USING > + { private: - /** - * Rely on underlying backend. - */ typedef Launcher< mode, _GRB_WITH_HYPERDAGS_USING > MyLauncherType; - /** - * Instantiate the sub-backend. - */ - MyLauncherType launcher; - public: /** - * Default constructor. - * - * Simply calls that of the underlying constructor. - */ - Launcher( - const size_t process_id = 0, const size_t nprocs = 1, - const std::string hostname = "localhost", - const std::string port = "0" - ) : launcher( process_id, nprocs, hostname, port ) {} - - /** - * Variable input-size execution. - * - * Simply calls underlying launcher. + * \internal Delegates to #grb::Launcher (reference) constructor. By + * default, this reverts to the reference backend. */ - template< typename U > - RC exec( - void ( *grb_program )( const void *, const size_t, U & ), - const void * data_in, - const size_t in_size, - U &data_out, - const bool broadcast = false - ) { - return launcher.exec( grb_program, data_in, in_size, data_out, broadcast ); - } + using MyLauncherType::Launcher; /** - * Fixed-size execution. - * - * Simply calls underlying launcher. + * \internal Delegates to #grb::Launcher finalize. By default, this reverts + * to the reference backend. 
*/ - template< typename T, typename U > - RC exec( - void ( *grb_program )( const T &, U & ), - const T &data_in, - U &data_out, - const bool broadcast = false - ) { - return launcher.exec( grb_program, data_in, data_out, broadcast ); - } + using MyLauncherType::finalize; }; diff --git a/include/graphblas/nonblocking/benchmark.hpp b/include/graphblas/nonblocking/benchmark.hpp index 8b62cb016..627002f02 100644 --- a/include/graphblas/nonblocking/benchmark.hpp +++ b/include/graphblas/nonblocking/benchmark.hpp @@ -27,10 +27,8 @@ #ifndef _H_GRB_NONBLOCKING_BENCH #define _H_GRB_NONBLOCKING_BENCH -#include #include - -#include "exec.hpp" +#include namespace grb { @@ -41,51 +39,15 @@ namespace grb { * \internal The public API simply wraps the reference Benchmarker. */ template< enum EXEC_MODE mode > - class Benchmarker< mode, nonblocking > { - - private: - - /** \internal Reuse reference benchmarker. */ - Benchmarker< mode, reference > ref; - + class Benchmarker< mode, nonblocking >: public Benchmarker< mode, reference > { public: - /** \internal Mirror reference constructor. */ - Benchmarker( - size_t process_id = 0, - size_t nprocs = 1, - std::string hostname = "localhost", - std::string port = "0" - ) : - ref(process_id, nprocs, hostname, port) - {} - - /** \internal Mirror reference exec. */ - template< typename U > - RC exec( - void ( *grb_program )( const void *, const size_t, U & ), - const void * data_in, const size_t in_size, - U &data_out, - const size_t inner, const size_t outer, - const bool broadcast = false - ) const { - return ref.exec( - grb_program, data_in, in_size, data_out, inner, outer, broadcast - ); - } + /** \internal Delegates to #grb::Benchmarker (reference) constructor. */ + using Benchmarker< mode, reference >::Benchmarker; - /** \internal Mirror reference exec. */ - template< typename T, typename U > - RC exec( - void ( *grb_program )( const T &, U & ), - const T &data_in, U &data_out, - const size_t inner, - const size_t outer, - const bool broadcast = false - ) { - return ref.exec( grb_program, data_in, data_out, inner, outer, broadcast ); - } + /** \internal Delegates to #grb::Benchmarker (reference) finalize. */ + using Benchmarker< mode, reference >::finalize; }; diff --git a/include/graphblas/nonblocking/exec.hpp b/include/graphblas/nonblocking/exec.hpp index 09f679526..80f46f79d 100644 --- a/include/graphblas/nonblocking/exec.hpp +++ b/include/graphblas/nonblocking/exec.hpp @@ -28,74 +28,23 @@ #define _H_GRB_NONBLOCKING_EXEC #include -#include - -#include "init.hpp" +#include namespace grb { /** The Launcher class is based on that of the reference backend */ template< EXEC_MODE mode > - class Launcher< mode, nonblocking > { - - private: - - Launcher< mode, reference > ref; + class Launcher< mode, nonblocking >: public Launcher< mode, reference > { public: - /** - * This implementation only accepts a single user process. It ignores - * \a hostname and \a port. - */ - Launcher( - const size_t process_id = 0, - const size_t nprocs = 1, - const std::string hostname = "localhost", - const std::string port = "0" - ) { - // ignore hostname and port - (void) hostname; - (void) port; - // sanity checks - if( nprocs != 1 ) { - throw std::invalid_argument( "Total number of user processes must be " - "exactly one when using the nonblocking implementation." - ); - } - if( process_id != 0 ) { - throw std::invalid_argument( "Process ID must always be zero in the " - "nonblocking implementation." - ); - } - } - - /** No implementation notes. 
*/ - ~Launcher() {} - - /** exec is based on that of the reference backend */ - template< typename U > - RC exec( - void ( *grb_program )( const void *, const size_t, U & ), - const void * data_in, const size_t in_size, - U &data_out, const bool broadcast = false - ) const { - return ref.exec( grb_program, data_in, in_size, data_out, broadcast ); - } + /** \internal Delegates to #grb::Launcher (reference) constructor. */ + using Launcher< mode, reference >::Launcher; - /** exec is based on that of the reference backend */ - template< typename T, typename U > - RC exec( - void ( *grb_program )( const T &, U & ), - const T &data_in, U &data_out, - const bool broadcast = false - ) { - return ref.exec( grb_program, data_in, data_out, broadcast ); - } + /** \internal Delegates to #grb::Launcher (reference) finalize. */ + using Launcher< mode, reference >::finalize; - /** finalize is based on that of the reference backend */ - grb::RC finalize() { return ref.finalize(); } }; } // namespace grb diff --git a/include/graphblas/nonblocking/io.hpp b/include/graphblas/nonblocking/io.hpp index ff40be8dd..9b6a58782 100644 --- a/include/graphblas/nonblocking/io.hpp +++ b/include/graphblas/nonblocking/io.hpp @@ -28,8 +28,9 @@ #define _H_GRB_NONBLOCKING_IO #include -#include -#include +#include +#include "vector.hpp" +#include "matrix.hpp" #include "lazy_evaluation.hpp" #include "boolean_dispatcher_io.hpp" @@ -1334,7 +1335,7 @@ namespace grb { } template< typename InputType, typename RIT, typename CIT, typename NIT > - RC wait( const Matrix< InputType, nonblocking > &A ) { + RC wait( const Matrix< InputType, nonblocking, RIT, CIT, NIT > &A ) { (void) A; //TODO: currently, matrices are read only and no action is required // once the level-3 primitives are implemented diff --git a/include/graphblas/nonblocking/spmd.hpp b/include/graphblas/nonblocking/spmd.hpp index 126d50f33..0f169593e 100644 --- a/include/graphblas/nonblocking/spmd.hpp +++ b/include/graphblas/nonblocking/spmd.hpp @@ -30,6 +30,7 @@ #include //size_t #include +#include namespace grb { diff --git a/include/graphblas/reference/benchmark.hpp b/include/graphblas/reference/benchmark.hpp index 226500ecf..a6dd4ad1b 100644 --- a/include/graphblas/reference/benchmark.hpp +++ b/include/graphblas/reference/benchmark.hpp @@ -20,7 +20,7 @@ * @date 17th of April, 2017 */ -#if ! defined _H_GRB_REFERENCE_BENCH || defined _H_GRB_REFERENCE_OMP_BENCH +#if !defined _H_GRB_REFERENCE_BENCH || defined _H_GRB_REFERENCE_OMP_BENCH #define _H_GRB_REFERENCE_BENCH #include @@ -28,6 +28,7 @@ #include "exec.hpp" + namespace grb { /** @@ -38,23 +39,18 @@ namespace grb { */ template< enum EXEC_MODE mode > class Benchmarker< mode, reference > : - protected Launcher< mode, reference >, protected internal::BenchmarkerBase + public Launcher< mode, reference >, protected internal::BenchmarkerBase { public: /** \internal Delegates to #grb::Launcher (reference) constructor. */ - Benchmarker( - const size_t process_id = 0, // user process ID - const size_t nprocs = 1, // total number of user processes - std::string hostname = "localhost", // one of the user process hostnames - std::string port = "0" // a free port at hostname - ) : Launcher< mode, reference >( process_id, nprocs, hostname, port ) {} + using Launcher< mode, reference >::Launcher; - /** \internal No implementation notes. */ + /** \internal Use base benchmarker. 
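+		 *
+		 * An illustrative call, in which the program \a myALPProgram as well as
+		 * the buffer \a data and its byte size \a size are hypothetical, could
+		 * read:
+		 *
+		 * \code
+		 * grb::Benchmarker< grb::AUTOMATIC > bench;
+		 * grb::RC out = grb::PANIC;
+		 * const grb::RC rc = bench.exec(
+		 *     &myALPProgram, data, size, out,
+		 *     10, 5, // ten inner and five outer repetitions
+		 *     false  // no broadcast needed for a single user process
+		 * );
+		 * \endcode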
*/ template< typename U > RC exec( - void ( *grb_program )( const void *, const size_t, U & ), + AlpUntypedFunc< U > alp_program, const void * data_in, const size_t in_size, U &data_out, const size_t inner, const size_t outer, @@ -65,48 +61,32 @@ namespace grb { if( in_size > 0 && data_in == nullptr ) { return ILLEGAL; } - // initialise GraphBLAS - RC ret = grb::init(); - - // call graphBLAS algo - if( ret == SUCCESS ) { - benchmark< U >( grb_program, data_in, in_size, data_out, inner, outer, 0 ); - } - // finalise the GraphBLAS - const RC frc = grb::finalize(); - if( ret == SUCCESS ) { - ret = frc; - } - // and done - return ret; + auto fun = [ data_in, in_size, &data_out, alp_program, inner, outer ] { + benchmark< U, reference >( alp_program, data_in, in_size, data_out, inner, + outer, 0 ); + }; + return Launcher< mode, reference >::init_and_run( fun, broadcast ); } /** \internal No implementation notes. */ template< typename T, typename U > RC exec( - void ( *grb_program )( const T &, U & ), // user GraphBLAS program - const T &data_in, U &data_out, // input & output data + AlpTypedFunc< T, U > alp_program, + const T &data_in, U &data_out, const size_t inner, const size_t outer, const bool broadcast = false ) { - (void) broadcast; // value doesn't matter for a single user process - // initialise GraphBLAS - RC ret = grb::init(); - // call graphBLAS algo - if( ret == SUCCESS ) { - // call graphBLAS algo - benchmark< T, U >( grb_program, data_in, data_out, inner, outer, 0 ); - } - // finalise the GraphBLAS - const RC frc = grb::finalize(); - if( ret == SUCCESS ) { - ret = frc; - } - // and done - return ret; + auto fun = [ &data_in, &data_out, alp_program, inner, outer ] { + benchmark< T, U, reference >( alp_program, data_in, data_out, inner, + outer, 0 ); + }; + return Launcher< mode, reference >::init_and_run( fun, broadcast ); } + /** \internal Use reference Launcher finalize */ + using Launcher< mode, reference >::finalize; + }; } // namespace grb diff --git a/include/graphblas/reference/exec.hpp b/include/graphblas/reference/exec.hpp index d5d705f2c..e08463826 100644 --- a/include/graphblas/reference/exec.hpp +++ b/include/graphblas/reference/exec.hpp @@ -37,6 +37,27 @@ namespace grb { template< EXEC_MODE mode > class Launcher< mode, reference > { + protected: + + template< typename Runner > + RC init_and_run( + Runner &runner, + const bool broadcast + ) const { + // value doesn't matter for a single user process + (void) broadcast; + // intialise + RC ret = grb::init(); + // call algo + if( ret == SUCCESS ) { + runner(); + ret = grb::finalize(); + } + // and done + return ret; + } + + public: /** @@ -76,49 +97,36 @@ namespace grb { /** No implementation notes. */ template< typename U > RC exec( - void ( *grb_program )( const void *, const size_t, U & ), - const void * data_in, const size_t in_size, + AlpUntypedFunc< U > alp_program, + const void * const data_in, const size_t in_size, U &data_out, const bool broadcast = false ) const { - // value doesn't matter for a single user process - (void) broadcast; // check input arguments if( in_size > 0 && data_in == nullptr ) { return ILLEGAL; } - // intialise GraphBLAS - RC ret = grb::init(); - // call graphBLAS algo - if( ret == SUCCESS ) { - (*grb_program)( data_in, in_size, data_out ); - ret = grb::finalize(); - } - // and done - return ret; + auto fun = [ data_in, in_size, &data_out, alp_program ] { + (*alp_program)( data_in, in_size, data_out ); + }; + return init_and_run( fun, broadcast ); } /** No implementation notes. 
*/
			template< typename T, typename U >
			RC exec(
-				void ( *grb_program )( const T &, U & ), // user ALP/GraphBLAS program
-				const T &data_in, U &data_out, // input & output data
+				AlpTypedFunc< T, U > alp_program,
+				const T &data_in, U &data_out,
				const bool broadcast = false
			) {
-				(void) broadcast; // value doesn't matter for a single user process
-				// intialise ALP/GraphBLAS
-				RC ret = grb::init();
-				// call graphBLAS algo
-				if( ret == SUCCESS ) {
-					(*grb_program)( data_in, data_out );
-					ret = grb::finalize();
-				}
-				// and done
-				return ret;
+				auto fun = [ &data_in, &data_out, alp_program ] {
+					(*alp_program)( data_in, data_out );
+				};
+				return init_and_run( fun, broadcast );
			}

			/** No implementation notes. */
-			grb::RC finalize() { return grb::SUCCESS; }
+			static grb::RC finalize() { return grb::SUCCESS; }

	};

diff --git a/include/graphblas/reference/io.hpp b/include/graphblas/reference/io.hpp
index 10229d1c6..891b13488 100644
--- a/include/graphblas/reference/io.hpp
+++ b/include/graphblas/reference/io.hpp
@@ -25,8 +25,8 @@

 #include

-#include
-#include
+#include "vector.hpp"
+#include "matrix.hpp"

 #define NO_CAST_ASSERT( x, y, z ) \
 	static_assert( x, \
diff --git a/include/graphblas/reference/pinnedvector.hpp b/include/graphblas/reference/pinnedvector.hpp
index 7a3332ab7..d51b3f59d 100644
--- a/include/graphblas/reference/pinnedvector.hpp
+++ b/include/graphblas/reference/pinnedvector.hpp
@@ -79,8 +79,8 @@ namespace grb {
 		_raw_deleter( x._raw_deleter ), _stack_deleter( x._buffer_deleter ),
 		_buffered_values( x._raw ), _buffered_coordinates( x._coordinates )
 	{
-		(void)mode; // sequential and parallel IO mode are equivalent for this
-		            // implementation.
+		(void) mode; // sequential and parallel IO mode are equivalent for this
+		             // implementation.
 	}

 	// default destructor is OK
diff --git a/include/graphblas/utils/TimerResults.hpp b/include/graphblas/utils/TimerResults.hpp
index 36e50238b..276579716 100644
--- a/include/graphblas/utils/TimerResults.hpp
+++ b/include/graphblas/utils/TimerResults.hpp
@@ -23,12 +23,16 @@
 #ifndef _H_GRB_TIMERRESULTS
 #define _H_GRB_TIMERRESULTS

+
 namespace grb {
+
 	namespace utils {

 		/**
-		 * A structure holding benchmarking results, with initial io, a preamble time for setup,
-		 * a useful time for actual processing, and a postamble time for cleaning up
+		 * A structure holding benchmark timing results.
+		 *
+		 * It keeps track of initial io, a preamble time for setup, a useful time for
+		 * actual processing, and a postamble time for cleaning up.
		 */
		struct TimerResults {
			double io;
@@ -41,7 +45,7 @@ namespace grb {
				useful = val;
				postamble = val;
			}
-			void accum( TimerResults & times ) {
+			void accum( TimerResults &times ) {
				io += times.io;
				preamble += times.preamble;
				useful += times.useful;
@@ -54,13 +58,13 @@ namespace grb {
				useful /= loops;
				postamble /= loops;
			}
-			void min( const TimerResults & times ) noexcept {
+			void min( const TimerResults &times ) noexcept {
				io = ( times.io < io ) ? times.io : io;
				preamble = ( times.preamble < preamble ) ? times.preamble : preamble;
				useful = ( times.useful < useful ) ? times.useful : useful;
				postamble = ( times.postamble < postamble ) ? times.postamble : postamble;
			}
-			void max( const TimerResults & times ) noexcept {
+			void max( const TimerResults &times ) noexcept {
				io = ( times.io > io ) ? times.io : io;
				preamble = ( times.preamble > preamble ) ? times.preamble : preamble;
				useful = ( times.useful > useful ) ?
times.useful : useful; @@ -69,5 +73,8 @@ namespace grb { }; } // namespace utils + } // namespace grb + #endif // ``_H_GRB_TIMERRESULTS'' + diff --git a/src/graphblas/CMakeLists.txt b/src/graphblas/CMakeLists.txt index a562c2550..5668f5869 100644 --- a/src/graphblas/CMakeLists.txt +++ b/src/graphblas/CMakeLists.txt @@ -73,9 +73,10 @@ set( backend_reference_srcs ${CMAKE_CURRENT_SOURCE_DIR}/rc.cpp ) -# the only source file common to all BSP-based backends +# source files common to all BSP-based backends set( backend_bsp_srcs ${CMAKE_CURRENT_SOURCE_DIR}/bsp/collectives.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/bsp/exec_broadcast_routines.cpp ) # include only selected backends diff --git a/src/graphblas/bsp/exec_broadcast_routines.cpp b/src/graphblas/bsp/exec_broadcast_routines.cpp new file mode 100644 index 000000000..62cf84885 --- /dev/null +++ b/src/graphblas/bsp/exec_broadcast_routines.cpp @@ -0,0 +1,76 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @author A. N. Yzelman + * @date 17th of April, 2017 + */ + +#include "graphblas/bsp/exec_broadcast_routines.hpp" + +#include +#include + +#include + +#include +#include + + +bool grb::internal::grb_mpi_initialized = false; + +lpf_err_t grb::internal::lpf_init_collectives_for_broadcast( + lpf_t &ctx, + const lpf_pid_t s, const lpf_pid_t P, const size_t max_regs, + lpf_coll_t &coll +) { + assert( max_regs >= 2 ); + lpf_err_t brc = lpf_resize_memory_register( ctx, max_regs ); + assert( brc == LPF_SUCCESS ); + // lpf_collectives_init needs at least one slot, and if this call is followed + // by lpf_register_and_broadcast (as is intended), then at least one more slot + // is needed. + brc = lpf_collectives_init( ctx, s, P, 0, 0, 0, &coll ); + assert( brc == LPF_SUCCESS ); + // required messages follows LPF collectives user manual + const size_t nmsgs = P > 1 ? std::max( P + 1, 2 * P - 3 ) : P + 1; + brc = lpf_resize_message_queue( ctx, nmsgs ); + assert( brc == LPF_SUCCESS ); + brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); + assert( brc == LPF_SUCCESS ); + return brc; +} + +lpf_err_t grb::internal::lpf_register_and_broadcast( + lpf_t &ctx, lpf_coll_t &coll, + void * data, size_t size +) { + lpf_memslot_t global; + lpf_err_t brc = lpf_register_global( ctx, data, size, &global ); + assert( brc == LPF_SUCCESS ); + // TODO FIXME: double sync for registrations on launcher::exec necessary? 
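+	// The remaining sequence is: (1) synchronise so that the registration of
+	// the global slot becomes visible on all processes, (2) broadcast `size'
+	// bytes from user process 0 into every process' `data' buffer, (3)
+	// synchronise again to complete the broadcast, and (4) release the
+	// temporary memory slot.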
+ brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); + assert( brc == LPF_SUCCESS ); + brc = lpf_broadcast( coll, global, global, size, 0 ); + assert( brc == LPF_SUCCESS ); + brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); + assert( brc == LPF_SUCCESS ); + brc = lpf_deregister( ctx, global ); + assert( brc == LPF_SUCCESS ); + return brc; +} + diff --git a/src/graphblas/bsp1d/CMakeLists.txt b/src/graphblas/bsp1d/CMakeLists.txt index 0e62e623c..65b09b24d 100644 --- a/src/graphblas/bsp1d/CMakeLists.txt +++ b/src/graphblas/bsp1d/CMakeLists.txt @@ -75,7 +75,6 @@ endmacro( make_bsp1d_target ) set( backend_bsp1d_srcs "${backend_reference_srcs}" "${backend_bsp_srcs}" - ${CMAKE_CURRENT_SOURCE_DIR}/exec.cpp ${CMAKE_CURRENT_SOURCE_DIR}/init.cpp ${CMAKE_CURRENT_SOURCE_DIR}/config.cpp ${CMAKE_CURRENT_SOURCE_DIR}/io.cpp diff --git a/src/graphblas/bsp1d/exec.cpp b/src/graphblas/bsp1d/exec.cpp deleted file mode 100644 index f156e9dad..000000000 --- a/src/graphblas/bsp1d/exec.cpp +++ /dev/null @@ -1,29 +0,0 @@ - -/* - * Copyright 2021 Huawei Technologies Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * @author A. N. Yzelman - * @date 17th of April, 2017 - */ - -#include - -#ifdef _GRB_MANUAL -const int LPF_MPIRMA_AUTO_INITIALIZE = 0; -#endif - -bool _grb_mpi_initialized = false; diff --git a/src/graphblas/nonblocking/io.cpp b/src/graphblas/nonblocking/io.cpp index 4c7e86885..c2e28980a 100644 --- a/src/graphblas/nonblocking/io.cpp +++ b/src/graphblas/nonblocking/io.cpp @@ -24,8 +24,7 @@ * @date 16th of May, 2022 */ -#include - +#include #include diff --git a/src/graphblas/reference/io.cpp b/src/graphblas/reference/io.cpp index 6d4be2d93..b0e5d073f 100644 --- a/src/graphblas/reference/io.cpp +++ b/src/graphblas/reference/io.cpp @@ -20,7 +20,7 @@ * @date 29th of March, 2022 */ -#include +#include namespace grb { diff --git a/tests/smoke/label_test.cpp b/tests/smoke/label_test.cpp index 246b31e53..3b9025e22 100644 --- a/tests/smoke/label_test.cpp +++ b/tests/smoke/label_test.cpp @@ -179,7 +179,7 @@ int main( int argc, char ** argv ) { grb::Launcher< AUTOMATIC > launcher; - enum grb::RC rc = launcher.exec( &grbProgram, in, out ); + enum grb::RC rc = launcher.exec( &grbProgram, in, out, true ); if( rc != SUCCESS ) { std::cerr << "launcher.exec returns with non-SUCCESS error code " << toString(rc) << std::endl; @@ -190,10 +190,11 @@ int main( int argc, char ** argv ) { // done if( out.error_code != SUCCESS ) { - std::cout << "Test FAILED\n\n"; + std::cerr << std::flush; + std::cout << "Test FAILED\n" << std::endl; return 1; } - std::cout << "Test OK\n\n"; + std::cout << "Test OK\n" << std::endl; return 0; } diff --git a/tests/smoke/simple_pagerank_from_mpi.cpp b/tests/smoke/simple_pagerank_from_mpi.cpp index 9788c36d1..e9560ddd2 100644 --- a/tests/smoke/simple_pagerank_from_mpi.cpp +++ b/tests/smoke/simple_pagerank_from_mpi.cpp @@ -56,7 +56,7 @@ struct output_vector { grb::utils::TimerResults times; }; -void grbProgram( const input_matrix & A, struct output_vector & out ) { +void grbProgram( const 
input_matrix &A, struct output_vector &out ) { // assume successful run out.error_code = 0; @@ -147,7 +147,7 @@ int main( int argc, char ** argv ) { } // create more convenient view of in_size - const struct input_matrix & A = *reinterpret_cast< struct input_matrix * >( data_in ); + const struct input_matrix &A = *reinterpret_cast< struct input_matrix * >( data_in ); // output vector struct output_vector pr; @@ -157,6 +157,7 @@ int main( int argc, char ** argv ) { grb::Launcher< FROM_MPI > launcher( MPI_COMM_WORLD ); + // note: this exec passes pointers within a single process const enum grb::RC rc = launcher.exec( &grbProgram, A, pr ); if( rc != SUCCESS ) { std::cerr << "grb::Launcher< FROM_MPI >::exec returns with " diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 5ac228625..1fb38f7fb 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -73,6 +73,10 @@ add_grb_executables( id id.cpp BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking ) +add_grb_executables( id_distributed id_distributed.cpp + BACKENDS bsp1d hybrid +) + add_grb_executables( dot dot.cpp BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking ) @@ -236,8 +240,11 @@ add_grb_executables( eWiseApply_matrix eWiseApply_matrix.cpp BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking ) +# in the below, test_utils_headers is retained in case CMake is configured to +# include _DEBUG flags add_grb_executables( eWiseApplyMatrixReference eWiseApplyMatrixReference.cpp BACKENDS reference reference_omp hyperdags nonblocking + ADDITIONAL_LINK_LIBRARIES test_utils_headers ) add_grb_executables( outer outer.cpp @@ -288,6 +295,25 @@ add_grb_executables( matrix_type static_asserts/matrix.cpp BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking ) +add_grb_executables( launch_benchmark_auto launcherAndBenchmarker.cpp + BACKENDS bsp1d hybrid + COMPILE_DEFINITIONS DISTRIBUTED_EXECUTION +) + +add_grb_executables( launch_benchmark_auto launcherAndBenchmarker.cpp + BACKENDS reference reference_omp hyperdags nonblocking +) + +add_grb_executables( launch_benchmark_frommpi_manual launcherAndBenchmarker.cpp + BACKENDS bsp1d hybrid + COMPILE_DEFINITIONS DISTRIBUTED_EXECUTION NO_LPF_AUTO_INIT +) + +add_grb_executables( launch_benchmark_frommpi_manual launcherAndBenchmarker.cpp + BACKENDS reference reference_omp hyperdags nonblocking + COMPILE_DEFINITIONS NO_LPF_AUTO_INIT +) + # targets to list and build the test for this category get_property( unit_tests_list GLOBAL PROPERTY tests_category_unit ) add_custom_target( "list_tests_category_unit" diff --git a/tests/unit/auto_launcher.cpp b/tests/unit/auto_launcher.cpp index b2686a957..d5e574acb 100644 --- a/tests/unit/auto_launcher.cpp +++ b/tests/unit/auto_launcher.cpp @@ -54,13 +54,14 @@ int main( int argc, char ** argv ) { grb::Launcher< grb::AUTOMATIC > launcher; // run - if( launcher.exec( &grbProgram, P, exit_status ) != grb::SUCCESS ) { + if( launcher.exec( &grbProgram, P, exit_status, true ) != grb::SUCCESS ) { std::cout << "Test FAILED (launcher did not return SUCCESS).\n" << std::endl; return 200; } // master process reports test success if( exit_status ) { + std::cerr << std::flush; std::cout << "Test FAILED (exit code " << exit_status << ").\n" << std::endl; } else { std::cout << "Test OK\n" << std::endl; diff --git a/tests/unit/buildVector.cpp b/tests/unit/buildVector.cpp index aea50b842..d056af3d5 100644 --- a/tests/unit/buildVector.cpp +++ b/tests/unit/buildVector.cpp @@ -297,6 +297,7 @@ int 
main( int argc, char ** argv ) { if( error == 0 ) { std::cout << "Test OK" << std::endl; } else { + std::cerr << std::flush; std::cout << "Test FAILED" << std::endl; } diff --git a/tests/unit/eWiseApplyMatrixReference.cpp b/tests/unit/eWiseApplyMatrixReference.cpp index 6d675aa97..1a1982b5d 100644 --- a/tests/unit/eWiseApplyMatrixReference.cpp +++ b/tests/unit/eWiseApplyMatrixReference.cpp @@ -125,15 +125,24 @@ void checkCRSandCCS( const auto & crsExpected = internal::getCRS( expected ); for( size_t i = 0; i < nrows( obtained ); ++i ) { for( size_t k = crsObtained.col_start[ i ]; k < crsObtained.col_start[ i + 1 ]; ++k ) { - if( crsObtained.row_index[ k ] != crsExpected.row_index[ k ] ) { - std::cerr << "Error: unexpected entry at ( " << i << ", " << crsObtained.row_index[ k ] << " ), " - << "expected one at ( " << i << ", " << crsExpected.row_index[ k ] << " ) " - << "instead (CRS).\n"; + const auto nValuesInRow = crsObtained.col_start[ i + 1 ] - crsObtained.col_start[ i ]; + const auto expectedValuesInRow = crsExpected.col_start[ i + 1 ] - crsExpected.col_start[ i ]; + if( nValuesInRow != expectedValuesInRow ) { + std::cerr << "Error: unexpected number of non-zero entries in row " << i << "; " + << "expected " << expectedValuesInRow << ", " + << "obtained " << nValuesInRow << " (CRS).\n"; rc = FAILED; } - if( crsObtained.values[ k ] != crsExpected.values[ k ] ) { - std::cerr << "Error: unexpected value " << crsObtained.values[ k ] << "; " - << "expected " << crsExpected.values[ k ] << " (CRS).\n"; + const auto searchedJ = crsObtained.row_index[ k ]; + const auto searchedV = crsObtained.values[ k ]; + bool found = false; + for( size_t l = crsExpected.col_start[ i ]; l < crsExpected.col_start[ i + 1 ]; ++l ) { + found |= ( crsExpected.row_index[ l ] == searchedJ ) && ( crsExpected.values[ l ] == searchedV ); + } + if( !found ) { + std::cerr << "Error: Can not found entry " + << "( " << i << ", " << searchedJ << " ) = " + << searchedV << " (CRS).\n"; rc = FAILED; } } @@ -143,18 +152,26 @@ void checkCRSandCCS( { // check CCS output const auto & ccsObtained = internal::getCCS( obtained ); const auto & ccsExpected = internal::getCCS( expected ); - for( size_t j = 0; j < ncols( obtained ); ++j ) { - for( size_t k = ccsExpected.col_start[ j ]; k < ccsExpected.col_start[ j + 1 ]; ++k ) { - if( ccsObtained.row_index[ k ] != ccsExpected.row_index[ k ] ) { - std::cerr << "Error: unexpected entry at " - << "( " << ccsObtained.row_index[ k ] << ", " << j << " ), " - << "expected one at ( " << ccsExpected.row_index[ k ] << ", " << j << " ) " - << "instead (CCS).\n"; + for( size_t i = 0; i < ncols( obtained ); ++i ) { + for( size_t k = ccsExpected.col_start[ i ]; k < ccsExpected.col_start[ i + 1 ]; ++k ) { + const auto nValuesInRow = ccsObtained.col_start[ i + 1 ] - ccsObtained.col_start[ i ]; + const auto expectedValuesInRow = ccsExpected.col_start[ i + 1 ] - ccsExpected.col_start[ i ]; + if( nValuesInRow != expectedValuesInRow ) { + std::cerr << "Error: unexpected number of non-zero entries in row " << i << "; " + << "expected " << expectedValuesInRow << ", " + << "obtained " << nValuesInRow << " (CCS).\n"; rc = FAILED; } - if( ccsObtained.values[ k ] != ccsExpected.values[ k ] ) { - std::cerr << "Error: unexpected value " << ccsObtained.values[ k ] << "; " - << "expected " << ccsExpected.values[ k ] << " (CCS).\n"; + const auto searchedJ = ccsObtained.row_index[ k ]; + const auto searchedV = ccsObtained.values[ k ]; + bool found = false; + for( size_t l = ccsExpected.col_start[ i ]; l < 
ccsExpected.col_start[ i + 1 ]; ++l ) { + found |= ( ccsExpected.row_index[ l ] == searchedJ ) && ( ccsExpected.values[ l ] == searchedV ); + } + if( !found ) { + std::cerr << "Error: Can not found entry " + << "( " << i << ", " << searchedJ << " ) = " + << searchedV << " (CCS).\n"; rc = FAILED; } } diff --git a/tests/unit/id_distributed.cpp b/tests/unit/id_distributed.cpp new file mode 100644 index 000000000..9a7a6a8d9 --- /dev/null +++ b/tests/unit/id_distributed.cpp @@ -0,0 +1,307 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + + +struct input { + bool check; + std::array< size_t, 3 > values; +}; + +struct output { + grb::RC rc; + std::array< size_t, 3 > IDs; +}; + +// test grb::getID on vectors +void grb_program1( const struct input &in, struct output &out ) { + grb::RC &rc = out.rc; + assert( rc == grb::SUCCESS ); + if( grb::spmd<>::pid() == 0 ) { + if( in.check ) { + std::cerr << "\t in vector check, phase 4/4\n"; + } else { + std::cerr << "\t in initial vector test, phase 1/4\n"; + } + } + + grb::Vector< std::pair< int, float > > one( 1000000 ); + grb::Vector< size_t > two( 5000000 ); + const size_t oneLocalID = grb::getID( grb::internal::getLocal( one ) ); + out.IDs[ 0 ] = oneLocalID; + const size_t twoLocalID = grb::getID( grb::internal::getLocal( two ) ); + out.IDs[ 1 ] = twoLocalID; + if( oneLocalID == twoLocalID ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on different containers result in the " + << "same ID (I)\n"; + rc = grb::FAILED; + return; + } + if( oneLocalID != grb::getID( grb::internal::getLocal( one ) ) ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on the same container produce different " + << "IDs (I)\n"; + rc = grb::FAILED; + return; + } + if( twoLocalID != grb::getID( grb::internal::getLocal( two ) ) ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on the same container produce different " + << "IDs (II)\n"; + rc = grb::FAILED; + return; + } + + if( in.check ) { + if( oneLocalID != in.values[ 0 ] ) { + std::cerr << "\t container ID is not consistent with previous run (IV)\n"; + rc = grb::FAILED; + return; + } + if( twoLocalID != in.values[ 1 ] ) { + std::cerr << "\t container ID is not consistent with previous run (V)\n"; + rc = grb::FAILED; + return; + } + } + + grb::Vector< size_t > three( two ); + const size_t threeLocalID = grb::getID( grb::internal::getLocal( three ) ); + out.IDs[ 2 ] = threeLocalID; + if( threeLocalID != grb::getID( grb::internal::getLocal( three ) ) ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on the same container produce different " + << "IDs (III): " << threeLocalID << " vs. 
" << grb::getID( grb::internal::getLocal( three ) ) << "\n"; + rc = grb::FAILED; + return; + } + if( oneLocalID == threeLocalID ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on different containers result in the " + << "same ID (II)\n"; + rc = grb::FAILED; + return; + } + if( twoLocalID == threeLocalID ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on different containers result in the " + << "same ID (III)\n"; + rc = grb::FAILED; + return; + } + + if( in.check ) { + if( threeLocalID != in.values[ 2 ] ) { + std::cerr << "\t container ID is not consistent with previous run (VI): " + << threeLocalID << " vs. " << in.values[ 2 ] << "\n"; + rc = grb::FAILED; + return; + } + } + + std::swap( two, three ); + if( twoLocalID != grb::getID( grb::internal::getLocal( three ) ) ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on the same container after an std::swap " + << "produce different IDs (I)\n"; + rc = grb::FAILED; + return; + } + if( threeLocalID != grb::getID( grb::internal::getLocal( two ) ) ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on the same container after an std::swap " + << "produce different IDs (II)\n"; + rc = grb::FAILED; + return; + } +} + +// test grb::getID on matrices +void grb_program2( const struct input &in, struct output &out ) { + grb::RC &rc = out.rc; + assert( rc == grb::SUCCESS ); + if( grb::spmd<>::pid() == 0 ) { + if( in.check ) { + std::cerr << "\t in matrix check, phase 4/4\n"; + } else { + std::cerr << "\t in initial matrix test, phase 1/4\n"; + } + } + + grb::Matrix< std::pair< int, float > > one( 1000000, 100000 ); + grb::Matrix< size_t > two( 5000000, 100000 ); + const size_t oneLocalID = grb::getID( grb::internal::getLocal( one ) ); + out.IDs[ 0 ] = oneLocalID; + const size_t twoLocalID = grb::getID( grb::internal::getLocal( two ) ); + out.IDs[ 1 ] = twoLocalID; + if( oneLocalID == twoLocalID ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on different containers result in the " + << "same ID (I)\n"; + rc = grb::FAILED; + return; + } + if( oneLocalID != grb::getID( grb::internal::getLocal( one ) ) ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on the same container produce different " + << "IDs (I)\n"; + rc = grb::FAILED; + return; + } + if( twoLocalID != grb::getID( grb::internal::getLocal( two ) ) ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on the same container produce different " + << "IDs (II)\n"; + rc = grb::FAILED; + return; + } + + if( in.check ) { + if( oneLocalID != in.values[ 0 ] ) { + std::cerr << "\t container ID is not consistent with previous run (IV)\n"; + rc = grb::FAILED; + return; + } + if( twoLocalID != in.values[ 1 ] ) { + std::cerr << "\t container ID is not consistent with previous run (V)\n"; + rc = grb::FAILED; + return; + } + } + + grb::Matrix< size_t > three( two ); + const size_t threeLocalID = grb::getID( grb::internal::getLocal( three ) ); + out.IDs[ 2 ] = threeLocalID; + if( threeLocalID != grb::getID( grb::internal::getLocal( three ) ) ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on the same container produce different " + << "IDs (III): " << threeLocalID << " vs. 
" << grb::getID( grb::internal::getLocal( three ) ) << "\n"; + rc = grb::FAILED; + return; + } + if( oneLocalID == threeLocalID ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on different containers result in the " + << "same ID (II)\n"; + rc = grb::FAILED; + return; + } + if( twoLocalID == threeLocalID ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on different containers result in the " + << "same ID (III)\n"; + rc = grb::FAILED; + return; + } + + if( in.check ) { + if( threeLocalID != in.values[ 2 ] ) { + std::cerr << "\t container ID is not consistent with previous run (VI): " + << threeLocalID << " vs. " << in.values[ 2 ] << "\n"; + rc = grb::FAILED; + return; + } + } + + std::swap( two, three ); + if( twoLocalID != grb::getID( grb::internal::getLocal( three ) ) ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on the same container after an std::swap " + << "produce different IDs (I)\n"; + rc = grb::FAILED; + return; + } + if( threeLocalID != grb::getID( grb::internal::getLocal( two ) ) ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on the same container after an std::swap " + << "produce different IDs (II)\n"; + rc = grb::FAILED; + return; + } +} + +// NOTE: +// the spec does not promise anything when called on empty containers such as +// grb::Vector< T > empty_vector( 0 ) or grb::Matrix< T > empty_matrix( 0 ), +// therefore we cannot unit test the behaviour of grb::getID on such +// containers. + +int main( int argc, char ** argv ) { + // defaults + bool printUsage = false; + + // error checking + if( argc != 1 ) { + printUsage = true; + } + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 1; + } + + std::cout << "This is functional test " << argv[ 0 ] << "\n"; + grb::Launcher< grb::AUTOMATIC > launcher; + struct input in_vector{ false, {0,0,0} }; + struct input in_matrix{ false, {0,0,0} }; + struct output out; + out.rc = grb::SUCCESS; + in_vector.check = in_matrix.check = false; + + if( launcher.exec( &grb_program1, in_vector, out, true ) != grb::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Test 1 FAILED (launcher error)" << std::endl; + return 255; + } + if( out.rc != grb::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Test 1 FAILED (" << grb::toString( out.rc ) << ")" << std::endl; + return 255; + } + std::copy( out.IDs.begin(), out.IDs.end(), in_vector.values.begin() ); + + assert( out.rc == grb::SUCCESS ); + if( launcher.exec( &grb_program2, in_matrix, out, true ) != grb::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Test 2 FAILED (launcher error)" << std::endl; + return 255; + } + if( out.rc != grb::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Test 2 FAILED (" << grb::toString( out.rc ) << ")" << std::endl; + return 255; + } + std::copy( out.IDs.begin(), out.IDs.end(), in_matrix.values.begin() ); + + in_matrix.check = true; + assert( out.rc == grb::SUCCESS ); + if( launcher.exec( &grb_program2, in_matrix, out, true ) != grb::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Test 3 FAILED (launcher error)" << std::endl; + return 255; + } + if( out.rc != grb::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Test 3 FAILED (" << grb::toString( out.rc ) << ")" << std::endl; + return 255; + } + + in_vector.check = true; + assert( out.rc == grb::SUCCESS ); + if( launcher.exec( &grb_program1, in_vector, out, true ) != grb::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Test 4 FAILED (launcher error)" << std::endl; + return 255; + } + if( out.rc != grb::SUCCESS ) { + 
std::cerr << std::flush; + std::cout << "Test 4 FAILED (" << grb::toString( out.rc ) << ")" << std::endl; + return 255; + } + + std::cout << "Test OK" << std::endl; + return 0; +} + diff --git a/tests/unit/launcherAndBenchmarker.cpp b/tests/unit/launcherAndBenchmarker.cpp new file mode 100644 index 000000000..17a4990a1 --- /dev/null +++ b/tests/unit/launcherAndBenchmarker.cpp @@ -0,0 +1,680 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Tests the grb::Launcher abstraction. + * + * @author Alberto Scolari + * @date August 2023 + */ + + +#include +#include +#include +#include +#include +#include + +#include +#include +#ifdef DISTRIBUTED_EXECUTION + #include +#endif + +#include +#include + + +#ifdef NO_LPF_AUTO_INIT + const int LPF_MPI_AUTO_INITIALIZE = 0; +#endif + +constexpr size_t STR_LEN = 1024; + +static const char prelude[ STR_LEN + 1 ] = "O Earth O Earth return!\n" + "Arise from out the dewy grass;"; + +static const char truth[ STR_LEN + 1 ] = "Night is worn,\n" + "and the morn\n" + "rises from the slumberous mass."; + +static const char default_str[ STR_LEN + 1 ] = "Hear the voice of the Bard!\n" + "Who Present, Past, and Future, sees;"; + +struct input { + char str[ STR_LEN + 1 ]; + + input() { + (void) strncpy( str, default_str, STR_LEN + 1 ); + } +}; + +// same as input, just not default-constructible for a testing scenarion +struct nd_input : input { + + nd_input() = delete; // make this non default-constructible + + nd_input( const char * _str ) { + (void) strncpy( this->str, _str, STR_LEN + 1 ); + } +}; + +bool operator==( const struct input &obj, const char * ext ) { + return strnlen( obj.str, STR_LEN + 1 ) == strnlen( ext, STR_LEN + 1 ) && + strncmp( obj.str, ext, STR_LEN + 1 ) == 0; +} + +bool operator==( const char * ext, const struct input &obj ) { + return obj == ext; +} + +struct output { + int exit_code; + size_t P; + grb::utils::TimerResults times; +}; + +template< grb::EXEC_MODE mode, bool broadcasted, typename InputT > +void grbProgram( const InputT &in, struct output &out ) { + static_assert( std::is_base_of< input, InputT >::value ); + out.times.preamble = 2.0; + out.times.useful = 2.0; + out.times.io = out.times.postamble = 2.0; + out.times.postamble = 2.0; + + const size_t P = grb::spmd<>::nprocs(); + const size_t s = grb::spmd<>::pid(); + out.P = P; + + const char * expected = nullptr; + + if( broadcasted ) { + // independently from mode is or process id, every process must have the same + // string + expected = truth; + } else { + // in non-broadcasting mode, what a process has depends on its rank and the + // launcher mode. + switch (mode) { + case grb::AUTOMATIC: + // here, only the master process can have the "new" string + // while the other processes have the "default" string + expected = s == 0 ? 
truth : default_str; + break; + case grb::FROM_MPI: + case grb::MANUAL: + // the master must have the new string, while other processes the prelude + expected = s == 0 ? truth : prelude; + break; + default: + out.exit_code = 1; + printf( "- ERROR: unknown mode %d\n", mode ); + return; + break; + } + } + out.exit_code = in == expected ? 0 : 1; + + std::cout << "--- PID " << s << " of " << P << ": "; + if( out.exit_code == 0 ) { + std::cout << "MATCH\n"; + } else { + std::cout << "ERROR! Input string\n\"" << in.str + << "\"\n!= Expected string\n\"" << expected << "\"\n"; + } +} + +template< grb::EXEC_MODE mode, bool broadcasted, typename InputT > +void vgrbProgram( + const void * const __in, const size_t size, + struct output &out +) { + if( size != STR_LEN + 1 ) { + const size_t P = grb::spmd<>::nprocs(); + const size_t s = grb::spmd<>::pid(); + out.P = P; + std::cout << "--- PID " << s << " of " << P << ": " + << "ERROR! Input size " << size << " !- expected " << (STR_LEN+1) << "\n"; + return; + } + const struct input &in = *reinterpret_cast< const struct input *>( __in ); + grbProgram< mode, broadcasted, InputT >( in, out ); +} + +void autoVgrbProgram( + const void * const __in, const size_t size, + struct output &out +) { + const size_t P = grb::spmd<>::nprocs(); + const size_t s = grb::spmd<>::pid(); + out.P = P; + if( s == 0 ) { + const input &in = *static_cast< const input * >( __in ); + out.exit_code = size == sizeof( input ) && + in == truth ? 0 : 1; + std::cout << "--- PID " << s << " of " << P << ": "; + if( out.exit_code == 0 ) { + std::cout << "MATCH\n"; + } else { + std::cout << "ERROR! Input size is " << size << ", " + << "string\n\"" << in.str << "\"\n!= " + << "expected\n\"" << truth << "\"\n"; + } + } else { + out.exit_code = __in == nullptr && size == 0 ? 0 : 1; + std::cout << "--- PID " << s << " of " << P << ": "; + if( out.exit_code == 0 ) { + std::cout << "MATCH, got expected values (nullptr and 0)\n"; + } else { + std::cout << "ERROR! 
Got " << __in << " != nullptr and " << size + << " != 0\n"; + } + } +} + +template< grb::EXEC_MODE mode, bool broadcasted, typename InputT > +struct caller { + static constexpr grb::AlpTypedFunc< InputT, output > fun = + grbProgram< mode, broadcasted, InputT >; +}; + +template< grb::EXEC_MODE mode, bool broadcasted, typename InputT > +struct vcaller { + static constexpr grb::AlpUntypedFunc< output > fun = + vgrbProgram< mode, broadcasted, input >; +}; + +template< typename InputT > +struct vcaller< grb::AUTOMATIC, false, InputT > { + static constexpr grb::AlpUntypedFunc< output > fun = autoVgrbProgram; +}; + +template< typename InputT > +class Runner { + + public: + + virtual grb::RC launch_typed( + grb::AlpTypedFunc< InputT, output >, + const InputT &, output &, + bool + ) = 0; + + virtual grb::RC launch_untyped( + grb::AlpUntypedFunc< output >, + const void *, size_t, + output &, + bool + ) = 0; + + virtual grb::RC finalize() = 0; + + virtual ~Runner() = default; + +}; + +template< grb::EXEC_MODE mode, typename InputT > +class bsp_launcher : + public grb::Launcher< mode >, public Runner< InputT > +{ + + public: + + using grb::Launcher< mode >::Launcher; + + grb::RC launch_typed( + grb::AlpTypedFunc< InputT, output > grbProgram, + const InputT &in, output &out, bool bc + ) override { + return this->exec( grbProgram, in, out, bc ); + } + + grb::RC launch_untyped( + grb::AlpUntypedFunc< output > grbProgram, + const void * in, size_t in_size, + output &out, bool bc + ) override { + return this->exec( grbProgram, in, in_size, out, bc ); + } + + virtual grb::RC finalize() override { + return grb::Launcher< mode >::finalize(); + } + +}; + +template< grb::EXEC_MODE mode, typename InputT > +class bsp_benchmarker : + public grb::Benchmarker< mode >, public Runner< InputT > +{ + + private: + + size_t inner = 2; + size_t outer = 2; + + + public: + + using grb::Benchmarker< mode >::Benchmarker; + + grb::RC launch_typed( + grb::AlpTypedFunc< InputT, output > grbProgram, + const InputT &in, output &out, + bool bc + ) override { + return this->exec( grbProgram, in, out, inner, outer, bc ); + } + + grb::RC launch_untyped( + const grb::AlpUntypedFunc< output > grbProgram, + const void * const in, const size_t in_size, + output &out, const bool bc + ) override { + return this->exec( grbProgram, in, in_size, out, inner, outer, bc ); + } + + virtual grb::RC finalize() override { + return grb::Benchmarker< mode >::finalize(); + } + +}; + + +enum RunnerType { Launch, Benchmark }; + +template< typename InputT > +std::unique_ptr< Runner< InputT > > make_runner( + grb::EXEC_MODE mode, RunnerType type, + size_t s, size_t P, + const std::string &host, const std::string &port, + const bool mpi_inited +) { + Runner< InputT > *ret = nullptr; +#ifndef DISTRIBUTED_EXECUTION + (void) mpi_inited; +#endif + + switch (type) { + + case Launch: + + switch (mode) { + case grb::AUTOMATIC: + ret = new bsp_launcher< grb::AUTOMATIC, InputT >; + break; +#ifdef DISTRIBUTED_EXECUTION + case grb::FROM_MPI: + ret = new bsp_launcher< grb::FROM_MPI, InputT >( MPI_COMM_WORLD ); + break; + + case grb::MANUAL: + ret = new bsp_launcher< grb::MANUAL, InputT >( s, P, host, port, + mpi_inited ); + break; +#else + case grb::MANUAL: + ret = new bsp_launcher< grb::MANUAL, InputT >( s, P, host, port ); + break; +#endif + default: + break; + } + break; + + case Benchmark: + switch (mode) { + case grb::AUTOMATIC: + ret = new bsp_benchmarker< grb::AUTOMATIC, InputT >; + break; +#ifdef DISTRIBUTED_EXECUTION + case grb::FROM_MPI: + ret = new 
bsp_benchmarker< grb::FROM_MPI, InputT >( MPI_COMM_WORLD ); + break; + + case grb::MANUAL: + ret = new bsp_benchmarker< grb::MANUAL, InputT >( s, P, host, port, + mpi_inited ); + break; +#else + case grb::MANUAL: + ret = new bsp_benchmarker< grb::MANUAL, InputT >( s, P, host, port ); + break; + + case grb::FROM_MPI: +#endif + + default: + break; + } + break; + + default: + // error is caught later + break; + + } + + if( ret == nullptr ) { + throw std::runtime_error( "Error while creating runner" ); + } + return std::unique_ptr< Runner< InputT > >( ret ); +} + +#define ERROR_ON( cond, str ) if( cond ) { \ + std::cerr << __FILE__ ", " << __LINE__ << ": " << str << std::endl; \ + std::cout << "Test FAILED\n" << std::endl; \ + throw std::runtime_error( "check failed" ); \ + } + + +template< + template< grb::EXEC_MODE, bool, typename InputT > class FunT, + grb::EXEC_MODE mode, typename RetT, typename InputT +> +RetT getFun( bool broadcast ) { + return broadcast + ? FunT< mode, true, InputT >::fun + : FunT< mode, false, InputT >::fun; +} + +template< + template< grb::EXEC_MODE, bool, typename InputT > class CallerT, + typename RetT, typename InputT +> +RetT getALPFun( grb::EXEC_MODE mode, bool broadcast ) { + switch (mode) { + case grb::AUTOMATIC: + return getFun< CallerT, grb::AUTOMATIC, RetT, InputT >( broadcast ); + break; + case grb::FROM_MPI: + return getFun< CallerT, grb::FROM_MPI, RetT, InputT >( broadcast ); + break; + case grb::MANUAL: + return getFun< CallerT, grb::MANUAL, RetT, InputT >( broadcast ); + break; + default: + std::cerr << __FILE__ ", " << __LINE__ << ": " << "unknown mode " << mode + << std::endl; + throw std::runtime_error( "unknown mode" ); + break; + } +} + +template< typename InputT > +std::unique_ptr< Runner< InputT > > create_runner( + grb::EXEC_MODE mode, RunnerType rt, + size_t s, size_t P, + const std::string &host, const std::string &port, + bool mpi_inited +) { + try { + return make_runner< InputT >( + mode, rt, s, P, + host, + port, + mpi_inited + ); + } catch( std::runtime_error &e ) { + std::cerr << "got a runtime exception: " << e.what() << std::endl; + std::cout << "Test FAILED\n" << std::endl; + throw e; + } catch( std::exception &e ) { + std::cerr << "got an exception: " << e.what() << std::endl; + std::cout << "Test FAILED\n" << std::endl; + throw e; + } catch( ... 
) { + std::cerr << "got an unknown exception" << std::endl; + std::cout << "Test FAILED\n" << std::endl; + throw std::runtime_error( "unknown exception" ); + } + return std::unique_ptr< Runner< InputT > >(); +} + +int main( int argc, char ** argv ) { + + std::cout << "Functional test executable: " << argv[ 0 ] << "\n"; + +#ifdef DISTRIBUTED_EXECUTION + int lpf_mpi_inited = 0; + int success = MPI_Initialized( &lpf_mpi_inited ); + ERROR_ON( success != MPI_SUCCESS, "cannot determine initalization info" ); +#endif + const char * host = nullptr; + const char * port = nullptr; +#ifdef DISTRIBUTED_EXECUTION + typedef lpf_pid_t test_pid_t; +#else + typedef size_t test_pid_t; +#endif + // default values for shared-memory execution + test_pid_t P = 1; + test_pid_t s = 0; + grb::EXEC_MODE mode = grb::AUTOMATIC; + +#ifdef DISTRIBUTED_EXECUTION + if( lpf_mpi_inited != 0 ) { + mode = grb::AUTOMATIC; + ERROR_ON( argc != 1, "no argument needed" ); + } else { + if( argc == 1 ) { + mode = grb::FROM_MPI; + } else if( argc == 5 ) { + mode = grb::MANUAL; + } else { + ERROR_ON( true, "either no arguments or four arguments expected.\n" + "For the four-argument variant, the following are expected:\n" + " - hostname\n" + " - portname\n" + " - total number of processes\n" + " - unique ID of this process\n" + ); + } + } +#else + if( argc == 1 ) { + mode = grb::AUTOMATIC; + } else if( argc == 5 ) { + mode = grb::MANUAL; + } else { + ERROR_ON( true, "either no arguments or four arguments expected.\n" + "For the four-argument variant, the following are expected:\n" + " - hostname\n" + " - portname\n" + " - total number of processes\n" + " - unique ID of this process\n" + ); + } +#endif + const char *mode_str = nullptr; + + switch( mode ) { + case grb::AUTOMATIC: + mode_str = "AUTOMATIC"; + break; +#ifdef DISTRIBUTED_EXECUTION + case grb::FROM_MPI: + mode_str = "FROM_MPI"; + break; +#endif + case grb::MANUAL: + mode_str = "MANUAL"; + break; + default: + ERROR_ON( true, "unrecognised or invalid option: " << mode ); + break; + } + + std::cout << "\n===> chosen initialisation method: " << mode_str << " <===" + << std::endl; + + if( mode == grb::MANUAL ) { + // read command-line args + host = argv[ 1 ]; + port = argv[ 2 ]; + try { + P = static_cast< test_pid_t >( std::stoi( argv[ 3 ] ) ); + s = static_cast< test_pid_t >( std::stoi( argv[ 4 ] ) ); + } catch( std::exception &e ) { + std::cerr << "Caught exception: " << e.what() << std::endl; + std::cout << "Test FAILED\n" << std::endl; + return EXIT_FAILURE; + } + + // input sanity checks + ERROR_ON( host == nullptr || strlen( host ) == 0, + "Invalid hostname: " << argv[ 1 ] ); + ERROR_ON( port == nullptr || strlen( port ) == 0, + "value for port name or number: " << argv[ 2 ] ); + ERROR_ON( !grb::utils::is_in_normalized_range( s, P ), + "Invalid value for PID: " << argv[ 4 ] ); + } +#ifdef DISTRIBUTED_EXECUTION + if( mode == grb::FROM_MPI || mode == grb::MANUAL ) { + success = MPI_Init( NULL, NULL ); + ERROR_ON( success != MPI_SUCCESS, "Call to MPI_Init failed" ); + } + if( mode == grb::FROM_MPI ) { + int rank; + success = MPI_Comm_rank( MPI_COMM_WORLD, &rank ); + ERROR_ON( success != MPI_SUCCESS, "Call to MPI_Comm_rank failed" ); + s = static_cast< test_pid_t >( rank ); + } +#endif + + const char * input_str = ( mode == grb::AUTOMATIC ) ? truth : + ( s == 0 ) ? truth : prelude; + + struct input in; + struct output out; + for( const bool broadcast : { true, false } ) { + for( const RunnerType rt : { Launch, Benchmark } ) { + const char * const runner_name = rt == Launch ? 
"Launch" : "Benchmark"; + const char * const bc_str = broadcast ? "true" : "false"; + std::cout << "\n ==> runner type: " << runner_name << ", " + << "broadcast: " << bc_str << std::endl; + std::unique_ptr< Runner< input > > runner = create_runner< input >( + mode, rt, s, P, + std::string( (host != nullptr ? host : "" ) ), + std::string( (port != nullptr ? port : "" ) ), + true + ); + std::cout << " => untyped call\n" << std::endl; + (void) strncpy( in.str, input_str, STR_LEN + 1 ); + grb::AlpUntypedFunc< output > vfun = + getALPFun< vcaller, grb::AlpUntypedFunc< output >, input >( + mode, broadcast + ); + out.exit_code = 256; // the ALP function MUST set to 0 + grb::RC ret = runner->launch_untyped( + vfun, + reinterpret_cast< void * >( &in ), sizeof( input ), + out, broadcast + ); + ERROR_ON( ret != grb::SUCCESS, + "untyped test FAILED with code: " << grb::toString( ret ) ); + ERROR_ON( out.exit_code != 0, + "untyped test FAILED with exit code " << out.exit_code ); + + std::cout << "\n => typed call\n" << std::endl; + grb::AlpTypedFunc< input, output > fun = + getALPFun< caller, grb::AlpTypedFunc< input, output >, input >( + mode, broadcast + ); + out.exit_code = 256; + ret = runner->launch_typed( fun, in, out, broadcast ); + ERROR_ON( ret != grb::SUCCESS, + "typed test FAILED with code: " << grb::toString( ret ) ); + ERROR_ON( out.exit_code != 0, + "typed test FAILED with exit code " << out.exit_code ); + + ret = runner->finalize(); + + ERROR_ON( ret != grb::SUCCESS, + "finalisation FAILED with code: " << grb::toString( ret ) ); + std::cout << " => OK" << std::endl; + + if( mode == grb::AUTOMATIC ) { + // AUTOMTIC mode must implement a specific behaviour for + // non-default-constructible input types like nd_input, here tested + + std::unique_ptr< Runner< nd_input > > nd_runner = create_runner< nd_input >( + mode, rt, s, P, + std::string( (host != nullptr ? host : "" ) ), + std::string( (port != nullptr ? port : "" ) ), + true + ); + + std::cout << "\n => untyped call, non-default-constructible input\n" + << std::endl; + out.exit_code = 256; + nd_input ndin( input_str ); + ret = nd_runner->launch_untyped( + vfun, + reinterpret_cast< void * >( &ndin ), sizeof( nd_input ), + out, broadcast + ); + // untyped calls must succeed even with a non-default-constructible input + ERROR_ON( ret != grb::SUCCESS, + "untyped test FAILED with code: " << grb::toString( ret ) ); + ERROR_ON( out.exit_code != 0, + "untyped test FAILED with exit code " << out.exit_code ); + + std::cout << "\n => typed call, non-default-constructible input\n" + << std::endl; + out.exit_code = 256; + grb::AlpTypedFunc< nd_input, output > ndfun = + getALPFun< caller, grb::AlpTypedFunc< nd_input, output >, nd_input >( + mode, broadcast + ); + ret = nd_runner->launch_typed( ndfun, ndin, out, broadcast ); + // get P from process, as it may not be known outside of the + // launcher (e.g., for AUTOMATIC mode) + const bool should_fail = ( !broadcast ) && out.P > 1; + int expected_retval = should_fail ? 256 : 0; + // typed call should fail if ALL of the following conditions are met: + // - AUTOMATIC mode + // - non-default-constructible input + // - no broadcast requested + // - more than one process to run. 
+ // The idea is that process 0 receives the "original" input via + // the launcher, but other processes cannot create a meaningful + // one, because the input is non-default-constructible and + // because broadcast has not been requested (note: broadcast + // occurs ONLY on user's request): in such a case, the call + // cannot proceed and is aborted + ERROR_ON( should_fail && ret == grb::SUCCESS, + "run is successful, but should have failed" ); + ERROR_ON( out.exit_code != expected_retval, + "typed test FAILED with exit code " << out.exit_code ); + } + } + } +#ifdef DISTRIBUTED_EXECUTION + if( mode == grb::FROM_MPI || mode == grb::MANUAL ) { + success = MPI_Finalize(); + ERROR_ON( success != MPI_SUCCESS, "Call to MPI_Finalize failed" ); + } +#endif + + std::cout << "\nTest OK\n" << std::endl; + return EXIT_SUCCESS; +} + diff --git a/tests/unit/mxv.cpp b/tests/unit/mxv.cpp index 226fa5999..c35270376 100644 --- a/tests/unit/mxv.cpp +++ b/tests/unit/mxv.cpp @@ -112,7 +112,7 @@ int main( int argc, char ** argv ) { grb::Launcher< AUTOMATIC > automatic_launcher; - if( automatic_launcher.exec( &grbProgram, in, out ) != SUCCESS ) { + if( automatic_launcher.exec( &grbProgram, in, out, true ) != SUCCESS ) { std::cout << "Test FAILED (launcher did not return SUCCESS).\n" << std::endl; return EXIT_FAILURE; } diff --git a/tests/unit/pinnedVector.cpp b/tests/unit/pinnedVector.cpp index c9a4ce7b4..3a28c895a 100644 --- a/tests/unit/pinnedVector.cpp +++ b/tests/unit/pinnedVector.cpp @@ -342,7 +342,7 @@ int runTests( struct input< T > &in ) { for( const auto &test : AllTests ) { // run test in.test = test; - rc = rc ? rc : launcher.exec( &grbProgram, in, out ); + rc = rc ? rc : launcher.exec( &grbProgram, in, out, true ); if( out.error_code != SUCCESS ) { return offset + 10; } @@ -531,6 +531,26 @@ int runTests( struct input< T > &in ) { return 0; } +// default-constructible and trivially copiable pair of values for launcher +struct Couple { + size_t a; float b; + bool operator==( const struct Couple &c ) const { + return c.a == a && c.b == b; + } + + bool operator!=( const struct Couple &c ) const { + return !((*this) == c ); + } +}; + +#ifdef _DEBUG +// adaptor to output stream +std::ostream & operator<<( std::ostream &out, const struct Couple &c ) { + out << "( " << c.a << ", " << c.b << " )"; + return out; +} +#endif + int main( int argc, char ** argv ) { // sanity check if( argc != 1 ) { @@ -562,12 +582,19 @@ int main( int argc, char ** argv ) { // run tests using a non-fundamental type if( error == 0 ) { - std::cout << "\t running tests with std::pair vector entries...\n"; + std::cout << "\t running tests with DC and SL vector entries...\n"; struct input< std::pair< size_t, float > > in_pair; in_pair.element = std::make_pair< size_t, float >( 17, -2.7 ); in_pair.mode = mode; error = runTests( in_pair ); } + if( error == 0 ) { + std::cout << "\t running tests with DC and TC vector entries...\n"; + struct input< struct Couple > in_pair; + in_pair.element = { 17, -2.7 }; + in_pair.mode = mode; + error = runTests( in_pair ); + } if( error ) { break; } } diff --git a/tests/unit/sparse_mxv.cpp b/tests/unit/sparse_mxv.cpp index ee82484bb..f56c42b7d 100644 --- a/tests/unit/sparse_mxv.cpp +++ b/tests/unit/sparse_mxv.cpp @@ -190,14 +190,15 @@ void grbProgram( const int &, int &error ) { } int main( int argc, char ** argv ) { - (void)argc; - (void)printf( "Functional test executable: %s\n", argv[ 0 ] ); + (void) argc; + std::cout << "Functional test executable: " << argv[ 0 ] << "\n"; // sanity check against 
metabugs int error = 0; for( size_t i = 0; i < 15; ++i ) { if( ! grb::utils::equals( data1[ i ] * data2[ i ], chk[ i ] ) ) { - (void)fprintf( stderr, "Sanity check error at position %zd: %d + %d does not equal %d.\n", i, data1[ i ], data2[ i ], chk[ i ] ); + std::cerr << "Sanity check error at position " << i << ": " << data1[ i ] + << " + " << data2[ i ] << " does not equal " << chk[ i ] << ".\n"; error = 1; } } @@ -205,15 +206,16 @@ int main( int argc, char ** argv ) { if( !error ) { grb::Launcher< AUTOMATIC > launcher; if( launcher.exec( &grbProgram, error, error ) != grb::SUCCESS ) { - (void)fprintf( stderr, "Fatal error: could not launch test.\n" ); + std::cerr << "Fatal error: could not launch test.\n"; error = 2; } } if( !error ) { - (void)printf( "Test OK\n\n" ); + std::cout << "Test OK\n" << std::endl; } else { - (void)printf( "Test FAILED\n\n" ); + std::cerr << std::flush; + std::cout << "Test FAILED\n" << std::endl; } // done diff --git a/tests/unit/unittests.sh b/tests/unit/unittests.sh index f34229c16..5963dc16c 100755 --- a/tests/unit/unittests.sh +++ b/tests/unit/unittests.sh @@ -179,6 +179,14 @@ for MODE in ${MODES}; do grep 'Test OK' ${TEST_OUT_DIR}/id_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" echo " " + if [ "$BACKEND" = "bsp1d" ] || [ "$BACKEND" = "hybrid" ]; then + echo ">>> [x] [ ] Testing grb::id on distributed vectors and matrices" + $runner ${TEST_BIN_DIR}/id_distributed_${MODE}_${BACKEND} &> ${TEST_OUT_DIR}/id_distributed_${MODE}_${BACKEND}_${P}_${T}.log + head -1 ${TEST_OUT_DIR}/id_distributed_${MODE}_${BACKEND}_${P}_${T}.log + grep 'Test OK' ${TEST_OUT_DIR}/id_distributed_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" + echo " " + fi + echo ">>> [x] [ ] Testing grb::capacity, grb::resize, and default" echo " and explicit capacities set during container" echo " construction" @@ -643,6 +651,14 @@ for MODE in ${MODES}; do fi echo " " + echo ">>> [x] [ ] Testing Launcher and Benchmarker, AUTOMATIC mode." + test_name=launch_benchmark_auto_${MODE}_${BACKEND} + test_log=${TEST_OUT_DIR}/${test_name}_${P}_${T}.log + $runner ${TEST_BIN_DIR}/${test_name} &> ${test_log} + head -1 ${test_log} + grep -i 'Test OK' ${test_log} || echo "Test FAILED" + echo " " + #if [ "$BACKEND" = "reference_omp" ]; then # echo "Additional standardised unit tests not yet supported for the ${BACKEND} backend" # echo @@ -651,9 +667,40 @@ for MODE in ${MODES}; do #none here: all unit tests are operational for reference_omp + if [ "$BACKEND" = "bsp1d" ] || [ "$BACKEND" = "hybrid" ]; then + echo ">>> [x] [ ] Testing Launcher and Benchmarker, FROM_MPI mode for distributed backends." + test_name=launch_benchmark_frommpi_manual_${MODE}_${BACKEND} + test_log=${TEST_OUT_DIR}/launch_benchmark_frommpi_${MODE}_${BACKEND}_${P}_${T}.log + $runner ${TEST_BIN_DIR}/${test_name} &> ${test_log} + head -1 ${test_log} + grep -i 'Test OK' ${test_log} || echo "Test FAILED" + echo " " + else + echo ">>> [x] [ ] Testing Launcher and Benchmarker, MANUAL mode for shared-memory backends." + test_log=${TEST_OUT_DIR}/launch_benchmark_manual_${MODE}_${BACKEND}_${P}_${T}.log + $runner ${TEST_BIN_DIR}/launch_benchmark_frommpi_manual_${MODE}_${BACKEND} localhost 77770 1 0 &> ${test_log} + head -1 ${test_log} + grep -i 'Test OK' ${test_log} || echo "Test FAILED" + echo " " + fi + done + done + if [ "$BACKEND" = "bsp1d" ] || [ "$BACKEND" = "hybrid" ]; then + echo ">>> [x] [ ] Testing Launcher and Benchmarker, MANUAL mode for distributed backends." 
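+		# Each of the four manually-launched processes below receives the
+		# arguments <hostname> <port> <total number of processes> <process ID>,
+		# i.e., the four-argument usage documented by the test binary.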
+ test_name=launch_benchmark_frommpi_manual_${MODE}_${BACKEND} + test_log=${TEST_OUT_DIR}/launch_benchmark_manual_${MODE}_${BACKEND}.log + bash -c "${MANUALRUN} ${TEST_BIN_DIR}/${test_name} localhost 77770 4 0 &> ${test_log}.0 & \ + ${MANUALRUN} ${TEST_BIN_DIR}/${test_name} localhost 77770 4 3 &> ${test_log}.3 & \ + ${MANUALRUN} ${TEST_BIN_DIR}/${test_name} localhost 77770 4 1 &> ${test_log}.1 & \ + ${MANUALRUN} ${TEST_BIN_DIR}/${test_name} localhost 77770 4 2 &> ${test_log}.2 & \ + wait" + (grep -q 'Test OK' ${test_log}.1 && grep -q 'Test OK' ${test_log}.2 && grep -q 'Test OK' ${test_log}.3 \ + && grep -q 'Test OK' ${test_log}.0 && printf "Test OK.\n\n") || (printf "Test FAILED.\n\n") + fi + if [ "$BACKEND" = "bsp1d" ]; then echo "Additional unit tests for the BSP1D backend:" echo " " diff --git a/tests/unit/vmxa.cpp b/tests/unit/vmxa.cpp index 030117a3f..cd8048aa4 100644 --- a/tests/unit/vmxa.cpp +++ b/tests/unit/vmxa.cpp @@ -118,7 +118,7 @@ int main( int argc, char ** argv ) { if( error == 0 ) { grb::RC rc = grb::SUCCESS; grb::Launcher< grb::AUTOMATIC > launcher; - rc = launcher.exec( alpProgram, rc, error ); + rc = launcher.exec( alpProgram, rc, error, true ); if( rc != grb::SUCCESS ) { std::cerr << "Could not launch the ALP program.\n"; error = 10; diff --git a/tests/unit/vxm.cpp b/tests/unit/vxm.cpp index 1cccf2ebf..4430aa416 100644 --- a/tests/unit/vxm.cpp +++ b/tests/unit/vxm.cpp @@ -113,7 +113,7 @@ int main( int argc, char ** argv ) { grb::Launcher< AUTOMATIC > automatic_launcher; - if( automatic_launcher.exec( &grbProgram, in, out ) != SUCCESS ) { + if( automatic_launcher.exec( &grbProgram, in, out, true ) != SUCCESS ) { std::cout << "Test FAILED (launcher did not return SUCCESS).\n" << std::endl; return EXIT_FAILURE; } diff --git a/tests/unit/wait.cpp b/tests/unit/wait.cpp index b07920ffc..1456b6473 100644 --- a/tests/unit/wait.cpp +++ b/tests/unit/wait.cpp @@ -155,11 +155,12 @@ int main( int argc, char ** argv ) { std::cout << "This is functional test " << argv[ 0 ] << "\n"; grb::Launcher< grb::AUTOMATIC > launcher; grb::RC out; - if( launcher.exec( &grbProgram, input, out, false ) != grb::SUCCESS ) { + if( launcher.exec( &grbProgram, input, out, true ) != grb::SUCCESS ) { std::cerr << "Launching test FAILED\n"; return 255; } if( out != grb::SUCCESS ) { + std::cerr << std::flush; std::cout << "Test FAILED (" << grb::toString( out ) << ")" << std::endl; return out; } else { diff --git a/tests/utils/output_verification.hpp b/tests/utils/output_verification.hpp index 531695129..c9c3ae403 100644 --- a/tests/utils/output_verification.hpp +++ b/tests/utils/output_verification.hpp @@ -27,13 +27,13 @@ #include -#include #include -#include +#include #include -#include +#include #include #include +#include #include @@ -238,7 +238,7 @@ int vector_verification( // the condition evaluated by the function isless will be false and then // the whole condition of the if-statement will be evaluated to true // making the verification to fail as expected - if( !isless( curInfNorm, c2 * magnitudeInf + eps ) ) { + if( !std::isless( curInfNorm, c2 * magnitudeInf + eps ) ) { std::cerr << "Output vector failed inf-norm verification at index " << i << ":\n" << "\tmeasured absolute error at this index: " << curInfNorm << "\n" @@ -260,7 +260,7 @@ int vector_verification( // isgreaterequal is used to ensure that the condition norm_inf >= 0 // will be evaluated to false when norm_inf is equal to NaN or -NaN - if( !isgreaterequal( norm_inf, 0 ) ) { + if( !std::isgreaterequal( norm_inf, 0 ) ) { std::cerr << 
"Output vector failed inf-norm verification:\n" << "\tinf-norm is neither positive nor zero -- " << "it reads " << norm_inf << " instead\n"; @@ -275,7 +275,7 @@ int vector_verification( // isgreaterequal is used to ensure that the condition norm2 >= 0 // will be evaluated to false when norm2 is equal to NaN or -NaN - if( isgreaterequal( norm2, 0 ) ) { + if( std::isgreaterequal( norm2, 0 ) ) { norm2 = sqrt( norm2 ); } else { std::cerr << "Output vector failed 2-norm verification:\n" @@ -293,7 +293,7 @@ int vector_verification( delete [] raw_output_vector; // perform check and return - if( !isless( norm2, c1 * magnitude2 + n * eps ) ) { + if( !std::isless( norm2, c1 * magnitude2 + n * eps ) ) { std::cerr << "Output vector failed 2-norm verification:\n" << "\t2-norm is " << norm2 << ".\n" << "\t2-norm is larger than the specified relative tolerance of " @@ -306,7 +306,7 @@ int vector_verification( << "\t2-norm is " << norm2 << " which is smaller or equal to the effective " << "relative tolerance of " << (c1 * magnitude2 + n * eps) << "\n"; } - if( !isless( norm_inf, c2 * magnitudeInf + eps ) ) { + if( !std::isless( norm_inf, c2 * magnitudeInf + eps ) ) { std::cerr << "Output vector failed inf-norm verification:\n" << "\tinf-norm is " << norm_inf << " at index " << norm_inf_at << "\n" << "\tinf-norm is larger than the specified relative tolerance of " diff --git a/tests/utils/print_vec_mat.hpp b/tests/utils/print_vec_mat.hpp index 4db9d0afb..761664ed4 100644 --- a/tests/utils/print_vec_mat.hpp +++ b/tests/utils/print_vec_mat.hpp @@ -20,32 +20,58 @@ /** * @file print_vec_mat.hpp - * @author Alberto Scolari (alberto.scolari@huawei.com) - * @brief Routines to print a grb::Vector, a grb::Matrix and a grb::PinnedVector; they are in templated form - * to be generic w.r.t. stored data type and backend implementation. - * @version 0.1 - * @date 2021-04-30 + * + * Utilities to print grb containers and objects. + * + * @authors + * - Alberto Scolari (alberto.scolari@huawei.com) + * - Benjamin Lozes (benjamin.lozes@huawei.com) + * + * Routines to print: + * - grb::Vector, grb::Matrix & grb::PinnedVector: These primitives are in + * templated form to be generic w.r.t. stored data type and + * backend implementation. + * - reference/CompressedStorage (CRS & CCS): These primitives are in + * templated form to be generic w.r.t. stored data type, but only for + * reference and reference_omp backends. + * + * @version 0.2 + * @date 25th of August 2023 */ + #include #include +#include +#include #include +using namespace grb; + /** - * @brief Prints the first \p _limit items (including zeroes) of vector \p x with optional heading \p head. + * Prints the first \p limit items (including zeroes) of vector \p x + * with optional heading \p head. * - * @tparam T vector data type - * @tparam B GraphBLAS backend storing the vector - * @param x vector to print - * @param _limit max number of elements to print; 0 for the entire vector + * Contents will be printed to the standard output stream. + * + * @tparam T Vector data type. + * @tparam B Vector backend. + * + * @param[in] x The vector to print + * @param[in] limit Max. number of elements to print; 0 for the entire vector * @param head optional heading to print \b before the vector + * + * \warning Assumes iterators over \a x are ordered. 
*/ template< typename T, enum grb::Backend B > -void print_vector( const grb::Vector< T, B > & x, size_t _limit = 10UL, const char * head = nullptr ) { - // const T * const raw{grb::internal::getRaw(x)}; - size_t x_size { grb::size( x ) }; - size_t limit { _limit == 0 ? x_size : std::min( x_size, _limit ) }; +void print_vector( + const grb::Vector< T, B > &x, + size_t limit = 10UL, + const char * const head = nullptr +) { + size_t x_size = grb::size( x ); + limit = limit == 0 ? x_size : std::min( x_size, limit ); if( head != nullptr ) { std::cout << "<<< " << head << " >>>" << std::endl; @@ -55,19 +81,21 @@ void print_vector( const grb::Vector< T, B > & x, size_t _limit = 10UL, const ch std::cout << "(size 0 vector)"; } - typename grb::Vector< T, B >::const_iterator it { x.cbegin() }; - typename grb::Vector< T, B >::const_iterator end { x.cend() }; + typename grb::Vector< T, B >::const_iterator it = x.cbegin(); + typename grb::Vector< T, B >::const_iterator end = x.cend(); - size_t previous_nnz { it == end ? limit : it->first }; + size_t previous_nnz = it == end ? limit : it->first; if( previous_nnz == 0 ) { std::cout << it->second; - ++it; + (void) ++it; } else if( x_size > 0 ) { std::cout << 0; } - size_t next_nnz { it == end ? limit : it->first }, position { 1 }; + size_t next_nnz, position; + next_nnz = it == end ? limit : it->first; + position = 1; while( position < limit ) { - size_t zero_streak { std::min( next_nnz, limit ) }; + size_t zero_streak = std::min( next_nnz, limit ); // print sequence of zeroes for( ; position < zero_streak; ++position ) { std::cout << ", "; @@ -76,8 +104,8 @@ void print_vector( const grb::Vector< T, B > & x, size_t _limit = 10UL, const ch if( position < limit ) { std::cout << ", "; std::cout << it->second; - ++position; - ++it; + (void) ++position; + (void) ++it; next_nnz = it->first; } } @@ -85,18 +113,23 @@ void print_vector( const grb::Vector< T, B > & x, size_t _limit = 10UL, const ch } /** - * @brief Prints the first \p limit items of pinned vector \p x with optional + * Prints the first \p limit items of pinned vector \p x with optional * heading \p head. * + * Contents will be printed to the standard output stream. + * * @tparam T vector data type * @tparam B GraphBLAS backend storing the vector * - * @param[in] v pinned vector to print - * @param[in] _limit max number of elements to print; 0 for the entire vector + * @param[in] v Pinned vector to print + * @param[in] limit Max number of elements to print; 0 for the entire vector * @param[in] head optional heading to print \b before the vector + * + * \warning Nonzero values will be printed in an undefined order. 
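+ *
+ * \par Example
+ * An illustrative sketch (it assumes the usual PinnedVector constructor that
+ * takes a vector and an I/O mode):
+ * \code
+ * grb::Vector< double > x( 1000 );
+ * // ... fill x ...
+ * grb::PinnedVector< double > pinned( x, grb::SEQUENTIAL );
+ * print_vector( pinned, 10, "first nonzeroes of x" );
+ * \endcode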
*/ template< typename T, enum grb::Backend B > -void print_vector( const grb::PinnedVector< T, B > &v, +void print_vector( + const grb::PinnedVector< T, B > &v, const size_t limit = 10UL, const char * const head = nullptr ) { @@ -107,83 +140,137 @@ void print_vector( const grb::PinnedVector< T, B > &v, std::cout << "<<< " << head << " >>>" << std::endl; } std::cout << "First " << limit << " nonzeroes of x are: ( "; - size_t k { 0 }; + size_t k = 0; if( k < v.nonzeroes() && limit > 0 ) { std::cout << v.getNonzeroValue( k++ ); } - for( size_t nnzs { 1 }; nnzs < limit && k < v.nonzeroes(); k++ ) { + for( size_t nnzs = 1; nnzs < limit && k < v.nonzeroes(); k++ ) { std::cout << ", " << v.getNonzeroValue( k ); - ++nnzs; + (void) ++nnzs; } std::cout << " )" << std::endl; } /** - * @brief Easy matrix container to store a matrix in a \b dense format, thus also zeroes are stored - * and the memory occupation is proportional to the full size of the matrix; hence, use with case! + * Easy matrix container to store a matrix in a \b dense format. + * + * \warning Thus, also zeroes are stored and the memory occupation is + * proportional to the full size of the matrix. Hence, use this + * function with care! + * + * @tparam T the type of the matrix values. * - * @tparam T the type of the matrix values */ template< typename T > struct dense_mat { - const size_t rows, cols; ///< matrix dimensions - T * const dense; ///< pointer to data, stored in a linear format (row-wise) + + /** The number of rows in the matrix. */ + const size_t rows; + + /** The number of columns in the matrix. */ + const size_t cols; + + /** Pointer to the raw data, row-major storage. */ + T * const dense; /** - * @brief Construct a new dense_mat object of given rows and columns, allocating the necessary - * physical memory for dense storage. + * Construct a new dense_mat object of given rows and columns. + * + * This function allocates the necessary physical memory for dense + * storage. + * + * @param[in] rows The number of matrix rows. + * @param[in] cols The number of matrix columns. + * @param[in] initial_value Optional; by default equal to zero. + * + * \warning This function assumes that zero maps to the literal 0. + * + * @throws Out of memory errors in case #::dense cannot be allocated. */ - dense_mat( size_t _nrows, size_t _ncols ) : - rows( _nrows ), cols( _ncols ), dense( new T[ rows * cols ] ) // we assume new throws if not enough memory + dense_mat( + const size_t _nrows, const size_t _ncols, + const T initial_value = T( 0 ) + ) : + rows( _nrows ), cols( _ncols ), + dense( new T[ rows * cols ] ) { assert( rows != 0 ); assert( cols != 0 ); - memset( dense, T( 0 ), rows * cols * ( sizeof( T ) ) ); + std::fill( dense, dense + rows * cols, initial_value ); } + /** + * Releases the resources corresponding to this instance. + */ ~dense_mat() { delete[] dense; } /** - * @brief Operator to access an entire row, which simply returns the pointer to the first row element; - * this way, one can conveniently write \code mat[i][j]] \endcode to access each element. + * Operator to access an entire row. + * + * @param[in] row The row to access. + * + * Simply returns the pointer to the first row element; this way, one can + * conveniently write \code mat[i][j]] \endcode to access each element. 
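+	 *
+	 * \par Example
+	 * An illustrative sketch:
+	 * \code
+	 * dense_mat< double > tmp( 2, 3, -1.0 ); // 2 x 3, all entries set to -1
+	 * tmp[ 1 ][ 2 ] = 3.14;                  // writes row 1, column 2
+	 * \endcode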
*/ - inline T * operator[]( size_t row ) { + inline T * operator[]( const size_t row ) { return dense + row * cols; } /** - * @brief Operator to access an entire row, which simply returns the const pointer to the first row element; - * this way, one can conveniently write \code mat[i][j]] \endcode to access each element. + * Operator to access an entire row. + * + * @param[in] row The row to access. + * + * Simply returns the const pointer to the first row element; this way, one can + * conveniently write \code mat[i][j]] \endcode to access each element. */ - inline const T * operator[]( size_t row ) const { + inline const T * operator[]( const size_t row ) const { return dense + row * cols; } }; /** - * @brief Prints up to \p _limit rows and columns of matrix \p mat with optional heading \p head. + * Prints up to \p limit rows and columns of matrix \p mat with optional + * heading \p head. * * @tparam T matrix data type - * @tparam B GraphBLAS backend storing the matrix - * @param mat matrix to print - * @param _limit max number of rows and columns to print (0 for all) - * @param head optional heading to print \b before the matrix + * @tparam B ALP/GraphBLAS backend storing the matrix + * + * @param[in] mat Matrix to print + * @param[in] limit Max. number of rows and columns to print (0 for all) + * @param[in] head Optional heading to print \b before the matrix + * + * \warning This first casts \a mat to a dense matrix. + * + * \warning This function does not guard against iterators over \a mat + * (erroneously) returning an element at the same coordinate more + * than once. */ -template< typename T, enum grb::Backend B > -void print_matrix( const grb::Matrix< T, B > & mat, size_t _limit = 0, const char * head = nullptr ) { +template< + typename T, + enum grb::Backend B, + typename std::enable_if< !std::is_void< T >::value >::type * = nullptr +> +void print_matrix( + const grb::Matrix< T, B > &mat, + const size_t limit = 0, + const char * const head = nullptr +) { const size_t rows = grb::nrows( mat ); const size_t cols = grb::ncols( mat ); - size_t row_limit = _limit == 0 ? rows : std::min( _limit, rows ); - size_t col_limit = _limit == 0 ? cols : std::min( _limit, cols ); + size_t row_limit = limit == 0 ? rows : std::min( limit, rows ); + size_t col_limit = limit == 0 ? 
cols : std::min( limit, cols ); // create and dump only relevant portion - dense_mat< T > dump( row_limit, col_limit ); - for( const std::pair< std::pair< size_t, size_t >, T > & t : mat ) { - size_t row { t.first.first }; - size_t col { t.first.second }; + dense_mat< std::pair< bool, T> > dump( + row_limit, col_limit, std::make_pair( false, static_cast< T >( 0 ) ) + ); + for( const std::pair< std::pair< size_t, size_t >, T > &t : mat ) { + size_t row = t.first.first; + size_t col = t.first.second; if( row < row_limit && col < col_limit ) { - dump[ row ][ col ] = t.second; + dump[ row ][ col ] = std::make_pair( true, t.second ); } } @@ -194,18 +281,307 @@ void print_matrix( const grb::Matrix< T, B > & mat, size_t _limit = 0, const cha std::cout << "Size: " << rows << " x " << cols << std::endl; for( size_t i = 0; i < row_limit; ++i ) { for( size_t j = 0; j < col_limit; ++j ) { - double val = dump[ i ][ j ]; - std::cout << val; - if( val == 0.0 ) { - std::cout << " "; + bool assigned = dump[ i ][ j ].first; + auto val = dump[ i ][ j ].second; + if( assigned ) { + std::cout << val; } else { - std::cout << " "; + std::cout << "_"; } + std::cout << " "; } std::cout << std::endl; } std::cout << "==============" << std::endl << std::endl; } +/** + * Prints up to \p limit rows and columns of matrix \p mat with optional header + * \p head. + * + * Specialisation for void matrices. + * + * @tparam T matrix data type + * @tparam B GraphBLAS backend storing the matrix + * + * @param[in] mat Matrix to print + * @param[in] limit Max. number of rows and columns to print (0 for all) + * @param[in] head Optional heading to print \b before the matrix + * + * \warning This first casts \a mat to a dense matrix. + * + * \warning This function does not guard against iterators over \a mat + * (erroneously) returning an element at the same coordinate more + * than once. + */ +template< + typename T, + enum grb::Backend B, + typename std::enable_if< std::is_void< T >::value >::type * = nullptr +> +void print_matrix( + const grb::Matrix< T, B > &mat, + size_t limit = 0, + const char * head = nullptr +) { + const size_t rows = grb::nrows( mat ); + const size_t cols = grb::ncols( mat ); + size_t row_limit = limit == 0 ? rows : std::min( limit, rows ); + size_t col_limit = limit == 0 ? cols : std::min( limit, cols ); + // create and dump only relevant portion + dense_mat< bool > assigned( row_limit, col_limit, false ); + for( const auto &t : mat ) { + auto row = t.first; + auto col = t.second; + assigned[ row ][ col ] = ( row < row_limit && col < col_limit ); + } + + if( head != nullptr ) { + std::cout << "<<< " << head << " >>>" << std::endl; + } + std::cout << "=== PATTERN-MATRIX ===" << std::endl; + std::cout << "Size: " << rows << " x " << cols << std::endl; + for( size_t i = 0; i < row_limit; ++i ) { + for( size_t j = 0; j < col_limit; ++j ) { + if( assigned[ i ][ j ] ) { + std::cout << "X"; + } else { + std::cout << "_"; + } + std::cout << " "; + } + std::cout << std::endl; + } + std::cout << "==============" << std::endl << std::endl; +} + +namespace { + + /** + * \internal + * Helper function for printing a void reference CompressedStorage object. 
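+	 * (Only the \a col_start and \a row_index arrays are printed; a void --
+	 * i.e., pattern -- matrix stores no values.)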
+ * \endinternal + */ + template< typename D, class Storage > + void printCompressedStorage( + const Storage &storage, + const size_t n, + const size_t nnz, + std::ostream &os = std::cout, + const typename std::enable_if< + std::is_void< D >::value, void + >::type * const = nullptr + ) { + os << " col_start (" << n + 1 << "): [ "; + for( size_t i = 0; i <= n; ++i ) { + os << storage.col_start[ i ] << " "; + } + os << "]" << std::endl; + os << " row_index (" << nnz << "): \n[\n"; + for( size_t i = 0; i < n; ++i ) { + os << " " << std::setfill( '0' ) << std::setw( 2 ) << i << ": "; + for( auto t = storage.col_start[ i ]; t < storage.col_start[ i + 1 ]; t++ ) + os << std::setfill( '0' ) << std::setw( 2 ) << storage.row_index[ t ] << " "; + os << std::endl; + } + os << "]" << std::endl; + } + + /** + * \internal + * Helper function for printing a general reference CompressedStorage object. + * \endinternal + */ + template< typename D, class Storage > + void printCompressedStorage( + const Storage &storage, + const size_t n, + const size_t nnz, + std::ostream &os, + const typename std::enable_if< + !std::is_void< D >::value, void + >::type * const = nullptr + ) { + printCompressedStorage< void >( storage, n, nnz, os ); + os << " values (" << nnz << "): [ "; + for( size_t i = 0; i < nnz; ++i ) { + os << storage.values[ i ] << " "; + } + os << "]" << std::endl << std::flush; + } + +} // namespace + +/** + * Print the CRS structure of a grb::Matrix. + * + * @tparam Enabled boolean flag to enable/disable the function. + * + * @param[in] mat Matrix CRS to print. + * @param[in] label Label to print before the matrix. + * @param[in] limit Max number of rows and columns to print (-1 for all). + * @param[in,out] os Output stream (optional; default is std::cout). + * + * \warning This function does \em not convert to CRS; if the implementing + * backend is not back by a CRS-like format, calling this function will + * not compile. + */ +template< + bool Enabled = true, + typename D, typename RIT, typename CIT, typename NIT, + Backend implementation +> +void printCRS( + const Matrix< D, implementation, RIT, CIT, NIT > &, + const std::string & = "", + const size_t limit = 128, + std::ostream & = std::cout, + const typename std::enable_if< + implementation != reference && + implementation != reference_omp, + void >::type * const = nullptr +) { + static_assert( + implementation != reference && + implementation != reference_omp, + "printCRS() is only available for reference and reference_omp backends" + ); +} + +/** + * Print the CRS structure of a grb::Matrix. + * + * This is the specialisation for the reference and reference_omp backends. + * + * @tparam Enabled boolean flag to enable/disable the function + * + * @param[in] mat Matrix CRS to print. + * @param[in] label Label to print before the matrix. + * @param[in] limit Max number of rows and columns to print (-1 for all). + * @param[in,out] os Output stream (optional; default is std::cout). + * + * \note The value -1 for \a limit refers to SIZE_MAX. 
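+ *
+ * \par Example
+ * An illustrative sketch (it assumes \a A has already been populated, for
+ * instance via grb::buildMatrixUnique):
+ * \code
+ * grb::Matrix< double > A( 4, 4 );
+ * // ... build A ...
+ * printCRS( A, "A" ); // dumps col_start, row_index, and values to std::cout
+ * \endcode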
+ */ +template< + bool Enabled = true, + typename D, typename RIT, typename CIT, typename NIT, + Backend implementation +> +void printCRS( + const Matrix< D, implementation, RIT, CIT, NIT > &mat, + const std::string &label = "", + const size_t limit = 128, + std::ostream &os = std::cout, + const typename std::enable_if< + implementation == reference || + implementation == reference_omp, + void >::type * const = nullptr +) { + constexpr const size_t smax = std::numeric_limits< size_t >::max(); + if( !Enabled ) { return; } + if( limit < smax && (nrows( mat ) > limit || ncols( mat ) > limit) ) { return; } + + const grb::RC rc = grb::wait( mat ); + if( rc != grb::SUCCESS ) { + throw std::runtime_error( grb::toString( rc ) ); + } + os << "CRS \"" << label + << "\" (" << nrows( mat ) << "x" << ncols( mat ) << "):\n"; + printCompressedStorage< D >( + internal::getCRS( mat ), + grb::nrows( mat ), + grb::nnz( mat ), + os + ); +} + +/** + * Print the CCS structure of a grb::Matrix. + * + * @tparam Enabled boolean flag to enable/disable the function. + * + * @param[in] mat Matrix CCS to print. + * @param[in] label Label to print before the matrix. + * @param[in] limit Max number of rows and columns to print (-1 for all). + * @param[in,out] os Output stream (optional, default is std::cout. + * + * \note The value -1 for \a limit refers to SIZE_MAX. + * + * \warning This function does \em not convert to CCS; if the implementing + * backend is not back by a CCS-like format, calling this function will + * not compile. + */ +template< + bool Enabled = true, + typename D, typename RIT, typename CIT, typename NIT, + Backend implementation +> +void printCCS( + const Matrix< D, implementation, RIT, CIT, NIT > &mat, + const std::string &label = "", + const size_t limit = 128, + std::ostream &os = std::cout, + const typename std::enable_if< + implementation != reference && + implementation != reference_omp, + void >::type * const = nullptr +) { + static_assert( + implementation != reference && + implementation != reference_omp, + "printCCS() is only available for reference and reference_omp backends" + ); +} + +/** + * Print the CCS structure of a grb::Matrix. + * + * This is the specialisation for the reference and reference_omp backends. + * + * @tparam Enabled boolean flag to enable/disable the function. + * + * @param[in] mat Matrix CCS to print. + * @param[in] label Label to print before the matrix. + * @param[in] limit Max number of rows and columns to print (-1 for all). + * @param[in,out] os Output stream (optional, default is std::cout. + * + * \note The value -1 for \a limit refers to SIZE_MAX. 
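+ *
+ * \note In the printed CCS dump, \a col_start is indexed by column, while
+ *       \a row_index lists the row of each nonzero within that column.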
+ */ +template< + bool Enabled = true, + typename D, typename RIT, typename CIT, typename NIT, + Backend implementation +> +void printCCS( + const Matrix< D, implementation, RIT, CIT, NIT > &mat, + const std::string &label = "", + const size_t limit = 128, + std::ostream &os = std::cout, + const typename std::enable_if< + implementation == reference || + implementation == reference_omp, + void >::type * const = nullptr +) { + constexpr const size_t smax = std::numeric_limits< size_t >::max(); + if( !Enabled ) { return; } + + const long rows = static_cast< long >( nrows( mat ) ); + const long cols = static_cast< long >( ncols( mat ) ); + if( limit < smax && (rows > limit || cols > limit) ) { return; } + + const grb::RC rc = grb::wait( mat ); + if( rc != grb::SUCCESS ) { + throw std::runtime_error( grb::toString( rc ) ); + } + os << "CCS \"" << label + << "\" (" << nrows( mat ) << "x" << ncols( mat ) << "):\n" ; + printCompressedStorage< D >( + internal::getCCS( mat ), + grb::ncols( mat ), + grb::nnz( mat ), + os + ); +} + #endif // _H_TEST_UTILS_PRINT_VEC_MAT From 95e87742c6d816b72fa04b8e9d87be744989355d Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Thu, 2 Nov 2023 13:36:13 +0100 Subject: [PATCH 23/37] Clean blas3.hpp --- include/graphblas/reference/blas3.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 1f382f017..8c17995e7 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1513,7 +1513,6 @@ namespace grb { const auto j = coors1.index( k ); const auto A_val = getValue(vbuf1, j, identity_A); const auto B_val = coors2.assigned(j) ? getValue(vbuf2, j, identity_B) : identity_B; - std::cout << " * (" << i << ", " << j << ") = " << A_val << " " << B_val << "\n"; OutputType result_value; (void)grb::apply( result_value, A_val, B_val, oper ); @@ -1543,7 +1542,6 @@ namespace grb { } const auto A_val = coors1.assigned(j) ? getValue(vbuf1, j, identity_A) : identity_A; const auto B_val = getValue(vbuf2, j, identity_B); - std::cout << " # (" << i << ", " << j << ") = " << A_val << " " << B_val << "\n"; OutputType result_value; (void)grb::apply( result_value, A_val, B_val, oper ); From f58de690ff843524e8600d07388e43d98b77ffca Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Tue, 28 Nov 2023 13:49:49 +0100 Subject: [PATCH 24/37] Refactor test to support bsp1d --- tests/unit/eWiseApplyMatrix_variants.cpp | 188 ++++++++++++----------- 1 file changed, 99 insertions(+), 89 deletions(-) diff --git a/tests/unit/eWiseApplyMatrix_variants.cpp b/tests/unit/eWiseApplyMatrix_variants.cpp index cab4b4ec7..71875d755 100644 --- a/tests/unit/eWiseApplyMatrix_variants.cpp +++ b/tests/unit/eWiseApplyMatrix_variants.cpp @@ -171,25 +171,7 @@ void grb_program( const input_t< Monoid, descr > &input, output_t &output ) { rc = SUCCESS; } -int main( int argc, char ** argv ) { - (void) argc; - (void) argv; - - size_t N = 10; - - if( argc > 2 ) { - std::cout << "Usage: " << argv[ 0 ] << " [n=" << N << "]" << std::endl; - return 1; - } - if( argc == 2 ) { - N = std::stoul( argv[ 1 ] ); - } - - std::cout << "This is functional test " << argv[ 0 ] << std::endl << std::flush; - - Launcher< AUTOMATIC > launcher; - - // Create input data +void test_program( const size_t& N, size_t& ) { /** Matrix A: Row matrix filled with A_INITIAL_VALUE * X X X X X * _ _ _ _ _ @@ -198,14 +180,19 @@ int main( int argc, char ** argv ) { * _ _ _ _ _ * (...) 
*/ - Matrix< nz_type > A( N, N, N ); + Matrix< nz_type > A( N, N ); + grb::resize( A, N ); { std::vector< size_t > A_rows( N, 0 ), A_cols( N, 0 ); std::vector< nz_type > A_values( N, A_INITIAL_VALUE ); std::iota( A_cols.begin(), A_cols.end(), 0 ); - if( SUCCESS != + if( + SUCCESS != buildMatrixUnique( A, A_rows.data(), A_cols.data(), A_values.data(), A_values.size(), SEQUENTIAL ) - ) { return 2; } + ) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + "Test FAILED: buildMatrixUnique" ); + } } /** Matrix B: Column matrix filled with B_INITIAL_VALUE @@ -221,9 +208,13 @@ int main( int argc, char ** argv ) { std::vector< size_t > B_rows( N, 0 ), B_cols( N, 0 ); std::vector< nz_type > B_values( N, B_INITIAL_VALUE ); std::iota( B_rows.begin(), B_rows.end(), 0 ); - if( SUCCESS != - buildMatrixUnique( B, B_rows.data(), B_cols.data(), B_values.data(), B_values.size(), SEQUENTIAL ) - ) { return 3; } + if( + SUCCESS != + buildMatrixUnique( B, B_rows.data(), B_cols.data(), B_values.data(), B_values.size(), SEQUENTIAL + ) ) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + "Test FAILED: buildMatrixUnique" ); + } } { // C = A .+ B @@ -246,15 +237,18 @@ int main( int argc, char ** argv ) { std::fill( C_monoid_truth_values.begin() + 1, C_monoid_truth_values.begin() + nrows( A ), A_INITIAL_VALUE ); std::fill( C_monoid_truth_values.begin() + nrows( A ), C_monoid_truth_values.end(), B_INITIAL_VALUE ); if( SUCCESS != - buildMatrixUnique( - C_monoid_truth, - C_monoid_truth_rows.data(), - C_monoid_truth_cols.data(), - C_monoid_truth_values.data(), - C_monoid_truth_values.size(), - SEQUENTIAL - ) - ) { return 4; } + buildMatrixUnique( + C_monoid_truth, + C_monoid_truth_rows.data(), + C_monoid_truth_cols.data(), + C_monoid_truth_values.data(), + C_monoid_truth_values.size(), + SEQUENTIAL + ) + ) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + "Test FAILED: buildMatrixUnique" ); + } /** Matrix C_op_truth: Intersection of A and B * X+Y ___ ___ ___ ___ @@ -268,30 +262,28 @@ int main( int argc, char ** argv ) { std::vector< size_t > C_op_truth_rows( 1, 0 ), C_op_truth_cols( 1, 0 ); std::vector< nz_type > C_op_truth_values( 1, A_INITIAL_VALUE + B_INITIAL_VALUE ); if( SUCCESS != - buildMatrixUnique( - C_op_truth, - C_op_truth_rows.data(), - C_op_truth_cols.data(), - C_op_truth_values.data(), - C_op_truth_values.size(), - SEQUENTIAL - ) - ) { return 5; } + buildMatrixUnique( + C_op_truth, + C_op_truth_rows.data(), + C_op_truth_cols.data(), + C_op_truth_values.data(), + C_op_truth_values.size(), + SEQUENTIAL + ) + ) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + "Test FAILED: buildMatrixUnique" ); + } input_t< - Monoid< operators::add< nz_type >, identities::zero > + Monoid< operators::add< nz_type >, identities::zero > > input { A, B, C_monoid_truth, C_op_truth }; output_t output { SUCCESS }; // Run the test - RC rc = launcher.exec( &grb_program, input, output, false ); - // Check the result - if( rc != SUCCESS ) { - std::cerr << "Error: Launcher::exec\n"; - return 6; - } + grb_program(input, output ); if( output.rc != SUCCESS ) { - std::cerr << "Test FAILED (" << toString( output.rc ) << ")" << std::endl; - return 7; + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + "Test FAILED (" + toString( output.rc ) + ")" ); } } @@ -311,30 +303,27 @@ int main( int argc, char ** argv ) { std::vector< nz_type > C_truth_values( nvalues, A_INITIAL_VALUE+A_INITIAL_VALUE ); std::iota( C_truth_cols.begin(), C_truth_cols.end(), 0 ); if( 
SUCCESS != - buildMatrixUnique( - C_truth, - C_truth_rows.data(), - C_truth_cols.data(), - C_truth_values.data(), - C_truth_values.size(), - SEQUENTIAL - ) - ) { return 8; } + buildMatrixUnique( + C_truth, + C_truth_rows.data(), + C_truth_cols.data(), + C_truth_values.data(), + C_truth_values.size(), + SEQUENTIAL + )) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + "Test FAILED: buildMatrixUnique" ); + } input_t< - Monoid< operators::add< nz_type >, identities::zero > + Monoid< operators::add< nz_type >, identities::zero > > input { A, A, C_truth, C_truth }; output_t output { SUCCESS }; // Run the test - RC rc = launcher.exec( &grb_program, input, output, false ); - // Check the result - if( rc != SUCCESS ) { - std::cerr << "Error: Launcher::exec\n"; - return 9; - } + grb_program(input, output ); if( output.rc != SUCCESS ) { - std::cerr << "Test FAILED (" << toString( output.rc ) << ")" << std::endl; - return 10; + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + "Test FAILED (" + toString( output.rc ) + ")" ); } } @@ -354,33 +343,54 @@ int main( int argc, char ** argv ) { std::vector< nz_type > C_truth_values( nvalues, A_INITIAL_VALUE+B_INITIAL_VALUE ); std::iota( C_truth_cols.begin(), C_truth_cols.end(), 0 ); if( SUCCESS != - buildMatrixUnique( - C_truth, - C_truth_rows.data(), - C_truth_cols.data(), - C_truth_values.data(), - C_truth_values.size(), - SEQUENTIAL - ) - ) { return 8; } + buildMatrixUnique( + C_truth, + C_truth_rows.data(), + C_truth_cols.data(), + C_truth_values.data(), + C_truth_values.size(), + SEQUENTIAL + )) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + "Test FAILED: buildMatrixUnique" ); + } input_t< - Monoid< operators::add< nz_type >, identities::zero >, - descriptors::transpose_right + Monoid< operators::add< nz_type >, identities::zero >, + descriptors::transpose_right > input { A, B, C_truth, C_truth }; output_t output { SUCCESS }; // Run the test - RC rc = launcher.exec( &grb_program, input, output, false ); - // Check the result - if( rc != SUCCESS ) { - std::cerr << "Error: Launcher::exec\n"; - return 9; - } + grb_program(input, output ); if( output.rc != SUCCESS ) { - std::cerr << "Test FAILED (" << toString( output.rc ) << ")" << std::endl; - return 10; + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + "Test FAILED (" + toString( output.rc ) + ")" ); } } +} + +int main( int argc, char ** argv ) { + (void) argc; + (void) argv; + + std::cerr << __func__ << " is not implemented yet" << std::endl; + + size_t N = 10; + + if( argc > 2 ) { + std::cout << "Usage: " << argv[ 0 ] << " [n=" << N << "]" << std::endl; + return 1; + } + if( argc == 2 ) { + N = std::stoul( argv[ 1 ] ); + } + + std::cout << "This is functional test " << argv[ 0 ] << std::endl << std::flush; + + Launcher< AUTOMATIC > launcher; + + // Create input data + RC rc = launcher.exec( &test_program, N, N, false ); std::cerr << std::flush; std::cout << "Test OK" << std::endl << std::flush; From d0cecde6ae98fc02b1c4487531738926664ab81b Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Tue, 28 Nov 2023 13:58:26 +0100 Subject: [PATCH 25/37] Simplify bsp1d API --- include/graphblas/bsp1d/blas3.hpp | 112 ++++++++++++------------------ 1 file changed, 45 insertions(+), 67 deletions(-) diff --git a/include/graphblas/bsp1d/blas3.hpp b/include/graphblas/bsp1d/blas3.hpp index 386beb164..1cf46e98e 100644 --- a/include/graphblas/bsp1d/blas3.hpp +++ b/include/graphblas/bsp1d/blas3.hpp @@ -119,89 +119,67 @@ namespace grb { /** \internal 
Simply delegates to process-local backend */ template< - Descriptor descr = descriptors::no_operation, - class MulMonoid, - typename OutputType, typename InputType1, typename InputType2, - typename RIT1, typename CIT1, typename NIT1, - typename RIT2, typename CIT2, typename NIT2, - typename RIT3, typename CIT3, typename NIT3 + Descriptor descr = descriptors::no_operation, + class MulMonoid, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 > RC eWiseApply( - Matrix< OutputType, BSP1D, RIT1, CIT1, NIT1 > &C, - const Matrix< InputType1, BSP1D, RIT2, CIT2, NIT2 > &A, - const Matrix< InputType2, BSP1D, RIT3, CIT3, NIT3 > &B, - const MulMonoid &mul, - const Phase phase = EXECUTE, - const typename std::enable_if< - !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_monoid< MulMonoid >::value, - void >::type * const = nullptr + Matrix< OutputType, BSP1D, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, BSP1D, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, BSP1D, RIT3, CIT3, NIT3 > &B, + const MulMonoid &mul, + const Phase phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< MulMonoid >::value, + void >::type * const = nullptr ) { assert( phase != TRY ); - RC local_rc = SUCCESS; - if( phase == RESIZE ) { - RC ret = eWiseApply< descr >( - internal::getLocal( C ), - internal::getLocal( A ), internal::getLocal( B ), - mul, - RESIZE - ); - if( collectives<>::allreduce( ret, operators::any_or< RC >() ) != SUCCESS ) { - return PANIC; - } else { - return ret; - } - } else { - assert( phase == EXECUTE ); - local_rc = eWiseApply< descr >( + RC ret = eWiseApply< descr >( internal::getLocal( C ), - internal::getLocal( A ), internal::getLocal( B ), + internal::getLocal( A ), + internal::getLocal( B ), mul, - EXECUTE - ); - } - return internal::checkGlobalErrorStateOrClear( C, local_rc ); + phase + ); + return internal::checkGlobalErrorStateOrClear( C, ret ); } /** \internal Simply delegates to process-local backend */ template< - Descriptor descr = descriptors::no_operation, - class Operator, - typename OutputType, typename InputType1, typename InputType2, - typename RIT1, typename CIT1, typename NIT1, - typename RIT2, typename CIT2, typename NIT2, - typename RIT3, typename CIT3, typename NIT3 + Descriptor descr = descriptors::no_operation, + class Operator, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 > RC eWiseApply( - Matrix< OutputType, BSP1D, RIT1, CIT1, NIT1 > &C, - const Matrix< InputType1, BSP1D, RIT2, CIT2, NIT2 > &A, - const Matrix< InputType2, BSP1D, RIT3, CIT3, NIT3 > &B, - const Operator &op, - const Phase phase = EXECUTE, - const typename std::enable_if< - !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_operator< Operator >::value, - void >::type * const = nullptr + Matrix< OutputType, BSP1D, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, BSP1D, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, BSP1D, RIT3, CIT3, NIT3 > &B, + const Operator &op, + const Phase phase = 
EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< Operator >::value, + void >::type * const = nullptr ) { assert( phase != TRY ); RC ret = eWiseApply< descr >( - internal::getLocal( C ), - internal::getLocal( A ), internal::getLocal( B ), - op, - phase + internal::getLocal( C ), + internal::getLocal( A ), + internal::getLocal( B ), + op, + phase ); - if( phase == RESIZE ) { - if( collectives<>::allreduce( ret, operators::any_or< RC >() ) != SUCCESS ) { - return PANIC; - } else { - return SUCCESS; - } - } - assert( phase == EXECUTE ); return internal::checkGlobalErrorStateOrClear( C, ret ); } From c524470b3e2c035885eeeca438908db471357c88 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Mon, 4 Dec 2023 14:03:23 +0100 Subject: [PATCH 26/37] Syntax arrangements --- include/graphblas/bsp1d/blas3.hpp | 88 ++++++++++++------------ include/graphblas/reference/blas3.hpp | 66 +++++++++--------- tests/unit/eWiseApplyMatrix_variants.cpp | 47 ++++++------- 3 files changed, 101 insertions(+), 100 deletions(-) diff --git a/include/graphblas/bsp1d/blas3.hpp b/include/graphblas/bsp1d/blas3.hpp index 1cf46e98e..0ddbdf41b 100644 --- a/include/graphblas/bsp1d/blas3.hpp +++ b/include/graphblas/bsp1d/blas3.hpp @@ -119,66 +119,66 @@ namespace grb { /** \internal Simply delegates to process-local backend */ template< - Descriptor descr = descriptors::no_operation, - class MulMonoid, - typename OutputType, typename InputType1, typename InputType2, - typename RIT1, typename CIT1, typename NIT1, - typename RIT2, typename CIT2, typename NIT2, - typename RIT3, typename CIT3, typename NIT3 + Descriptor descr = descriptors::no_operation, + class MulMonoid, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 > RC eWiseApply( - Matrix< OutputType, BSP1D, RIT1, CIT1, NIT1 > &C, - const Matrix< InputType1, BSP1D, RIT2, CIT2, NIT2 > &A, - const Matrix< InputType2, BSP1D, RIT3, CIT3, NIT3 > &B, - const MulMonoid &mul, - const Phase phase = EXECUTE, - const typename std::enable_if< - !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_monoid< MulMonoid >::value, - void >::type * const = nullptr + Matrix< OutputType, BSP1D, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, BSP1D, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, BSP1D, RIT3, CIT3, NIT3 > &B, + const MulMonoid &mul, + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< MulMonoid >::value, void + >::type * const = nullptr ) { assert( phase != TRY ); RC ret = eWiseApply< descr >( - internal::getLocal( C ), - internal::getLocal( A ), - internal::getLocal( B ), - mul, - phase + internal::getLocal( C ), + internal::getLocal( A ), + internal::getLocal( B ), + mul, + phase ); return internal::checkGlobalErrorStateOrClear( C, ret ); } /** \internal Simply delegates to process-local backend */ template< - Descriptor descr = descriptors::no_operation, - class Operator, - typename OutputType, typename InputType1, typename InputType2, - typename RIT1, typename CIT1, typename NIT1, - typename RIT2, typename CIT2, typename NIT2, - typename RIT3, 
typename CIT3, typename NIT3 + Descriptor descr = descriptors::no_operation, + class Operator, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 > RC eWiseApply( - Matrix< OutputType, BSP1D, RIT1, CIT1, NIT1 > &C, - const Matrix< InputType1, BSP1D, RIT2, CIT2, NIT2 > &A, - const Matrix< InputType2, BSP1D, RIT3, CIT3, NIT3 > &B, - const Operator &op, - const Phase phase = EXECUTE, - const typename std::enable_if< - !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_operator< Operator >::value, - void >::type * const = nullptr + Matrix< OutputType, BSP1D, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, BSP1D, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, BSP1D, RIT3, CIT3, NIT3 > &B, + const Operator &op, + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< Operator >::value, void + >::type * const = nullptr ) { assert( phase != TRY ); RC ret = eWiseApply< descr >( - internal::getLocal( C ), - internal::getLocal( A ), - internal::getLocal( B ), - op, - phase + internal::getLocal( C ), + internal::getLocal( A ), + internal::getLocal( B ), + op, + phase ); return internal::checkGlobalErrorStateOrClear( C, ret ); } diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 8c17995e7..219bd93d4 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -961,12 +961,12 @@ namespace grb { const Operator &oper, const Phase &phase, const typename std::enable_if< - !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_operator< Operator >::value, - void - >::type * const = nullptr + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< Operator >::value, + void + >::type * const = nullptr ) { #ifdef _DEBUG std::cout << "In grb::internal::eWiseApply_matrix_generic_intersection\n"; @@ -1264,11 +1264,12 @@ namespace grb { const Monoid &monoid, const Phase &phase, const typename std::enable_if< - !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_monoid< Monoid >::value, - void >::type * const = nullptr + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< Monoid >::value, + void + >::type * const = nullptr ) { #ifdef _DEBUG @@ -1623,11 +1624,11 @@ namespace grb { } // namespace internal /** - * Computes \f$ C = A . B \f$ for a given monoid. + * Computes \f$ C = A . B \f$ for a given monoid (union pattern). * * \internal Allows pattern matrix inputs. 
* - * \internal Dispatches to internal::eWiseApply_matrix_generic + * \internal Dispatches to internal::eWiseApply_matrix_generic_union */ template< Descriptor descr = descriptors::no_operation, @@ -1642,12 +1643,14 @@ namespace grb { const Matrix< InputType1, reference, RIT2, CIT2, NIT2 > &A, const Matrix< InputType2, reference, RIT3, CIT3, NIT3 > &B, const MulMonoid &mulmono, - const Phase phase = EXECUTE, - const typename std::enable_if< !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_monoid< MulMonoid >::value, - void >::type * const = nullptr + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< MulMonoid >::value, + void + >::type * const = nullptr ) { // static checks NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || @@ -1679,13 +1682,12 @@ namespace grb { } /** - * Computes \f$ C = A . B \f$ for a given binary operator. + * Computes \f$ C = A . B \f$ for a given operator (intersection pattern). * - * \internal Pattern matrices not allowed + * \internal Allows pattern matrix inputs. * - * \internal Dispatches to internal::eWiseApply_matrix_generic + * \internal Dispatches to internal::eWiseApply_matrix_generic_intersection */ - template< Descriptor descr = grb::descriptors::no_operation, class Operator, @@ -1698,13 +1700,15 @@ namespace grb { Matrix< OutputType, reference, RIT1, CIT1, NIT1 > &C, const Matrix< InputType1, reference, RIT2, CIT2, NIT2 > &A, const Matrix< InputType2, reference, RIT3, CIT3, NIT3 > &B, - const Operator &mulOp, - const Phase phase = EXECUTE, - const typename std::enable_if< !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_operator< Operator >::value, - void >::type * const = nullptr + const Operator &op, + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< Operator >::value, + void + >::type * const = nullptr ) { // static checks NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || @@ -1737,7 +1741,7 @@ namespace grb { #endif return internal::eWiseApply_matrix_generic_intersection< descr >( - C, A, B, mulOp, phase + C, A, B, op, phase ); } diff --git a/tests/unit/eWiseApplyMatrix_variants.cpp b/tests/unit/eWiseApplyMatrix_variants.cpp index 71875d755..5e92b7d3c 100644 --- a/tests/unit/eWiseApplyMatrix_variants.cpp +++ b/tests/unit/eWiseApplyMatrix_variants.cpp @@ -43,12 +43,13 @@ using namespace grb; - using nz_type = int; constexpr nz_type A_INITIAL_VALUE = 1; constexpr nz_type B_INITIAL_VALUE = 3; +// #define _DEBUG + template< typename D > bool equals_matrix( @@ -127,7 +128,6 @@ void grb_program( const input_t< Monoid, descr > &input, output_t &output ) { std::cerr << "Error: Phase::EXECUTE\n"; return; } - print_matrix( C, 10, "C (intersection)" ); if( !equals_matrix( C, input.C_operator ) ) { std::cerr << "Error: Wrong result\n"; rc = FAILED; @@ -158,7 +158,6 @@ void grb_program( const input_t< Monoid, descr > &input, output_t &output ) { std::cerr << "Error: Phase::EXECUTE\n"; return; } - print_matrix( C, 10, "C (union)" ); if( !equals_matrix( C, input.C_monoid ) ) { std::cerr << "Error: Wrong result\n"; rc = FAILED; @@ -180,8 +179,7 @@ void test_program( const 
size_t& N, size_t& ) { * _ _ _ _ _ * (...) */ - Matrix< nz_type > A( N, N ); - grb::resize( A, N ); + Matrix< nz_type > A( N, N, N ); { std::vector< size_t > A_rows( N, 0 ), A_cols( N, 0 ); std::vector< nz_type > A_values( N, A_INITIAL_VALUE ); @@ -208,10 +206,9 @@ void test_program( const size_t& N, size_t& ) { std::vector< size_t > B_rows( N, 0 ), B_cols( N, 0 ); std::vector< nz_type > B_values( N, B_INITIAL_VALUE ); std::iota( B_rows.begin(), B_rows.end(), 0 ); - if( - SUCCESS != - buildMatrixUnique( B, B_rows.data(), B_cols.data(), B_values.data(), B_values.size(), SEQUENTIAL - ) ) { + if( SUCCESS != + buildMatrixUnique( B, B_rows.data(), B_cols.data(), B_values.data(), B_values.size(), SEQUENTIAL) + ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) + "Test FAILED: buildMatrixUnique" ); } @@ -243,9 +240,8 @@ void test_program( const size_t& N, size_t& ) { C_monoid_truth_cols.data(), C_monoid_truth_values.data(), C_monoid_truth_values.size(), - SEQUENTIAL - ) - ) { + SEQUENTIAL) + ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) + "Test FAILED: buildMatrixUnique" ); } @@ -268,9 +264,8 @@ void test_program( const size_t& N, size_t& ) { C_op_truth_cols.data(), C_op_truth_values.data(), C_op_truth_values.size(), - SEQUENTIAL - ) - ) { + SEQUENTIAL) + ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) + "Test FAILED: buildMatrixUnique" ); } @@ -310,7 +305,8 @@ void test_program( const size_t& N, size_t& ) { C_truth_values.data(), C_truth_values.size(), SEQUENTIAL - )) { + ) + ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) + "Test FAILED: buildMatrixUnique" ); } @@ -350,7 +346,8 @@ void test_program( const size_t& N, size_t& ) { C_truth_values.data(), C_truth_values.size(), SEQUENTIAL - )) { + ) + ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) + "Test FAILED: buildMatrixUnique" ); } @@ -373,9 +370,7 @@ int main( int argc, char ** argv ) { (void) argc; (void) argv; - std::cerr << __func__ << " is not implemented yet" << std::endl; - - size_t N = 10; + size_t N = 1000; if( argc > 2 ) { std::cout << "Usage: " << argv[ 0 ] << " [n=" << N << "]" << std::endl; @@ -387,13 +382,15 @@ int main( int argc, char ** argv ) { std::cout << "This is functional test " << argv[ 0 ] << std::endl << std::flush; + // Launch the test Launcher< AUTOMATIC > launcher; - - // Create input data - RC rc = launcher.exec( &test_program, N, N, false ); + RC rc = launcher.exec( &test_program, N, N, true ); + if( rc != SUCCESS ) { + std::cout << "Test FAILED (" << grb::toString( rc ) << ")" << std::endl; + return static_cast( rc ); + } std::cerr << std::flush; - std::cout << "Test OK" << std::endl << std::flush; - + std::cout << std::flush << "Test OK" << std::endl; return 0; } From 5559540e11a8930f157ee59bcc1dd4b6f4e3845e Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Mon, 4 Dec 2023 14:43:13 +0100 Subject: [PATCH 27/37] Enabling and testing pattern inputs matrices --- include/graphblas/reference/blas3.hpp | 19 +-- tests/unit/eWiseApplyMatrix_variants.cpp | 182 +++++++++++++++++++---- 2 files changed, 162 insertions(+), 39 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 219bd93d4..e47daf074 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -972,14 +972,7 @@ namespace grb { std::cout << "In grb::internal::eWiseApply_matrix_generic_intersection\n"; #endif assert( phase != TRY ); - static_assert( - !( - std::is_same< InputType1, void 
>::value || - std::is_same< InputType2, void >::value - ), - "grb::internal::eWiseApply_matrix_generic_intersection: the non-monoid" - " version of elementwise mxm can only be used if neither of the" - " input matrices is a pattern matrix (of type void)" ); + constexpr bool crs_only = descr & descriptors::force_row_major; // get whether the matrices should be transposed prior to execution constexpr bool trans_left = descr & descriptors::transpose_left; @@ -1729,12 +1722,10 @@ namespace grb { "called with an output matrix C that does not match the output domain " "of the given multiplication operator" ); - static_assert( ( !( - std::is_same< InputType1, void >::value || - std::is_same< InputType2, void >::value ) - ), "grb::eWiseApply (reference, matrix <- matrix x matrix, operator): " - "the operator version of eWiseApply cannot be used if either of the " - "input matrices is a pattern matrix (of type void)" + static_assert( + !std::is_void< OutputType >::value, + "grb::eWiseApply: the elementwise mxm cannot be used if the" + " output matrix is a pattern matrix (of type void)" ); #ifdef _DEBUG std::cout << "In grb::eWiseApply( reference, operator )\n"; diff --git a/tests/unit/eWiseApplyMatrix_variants.cpp b/tests/unit/eWiseApplyMatrix_variants.cpp index 5e92b7d3c..8b8cc7437 100644 --- a/tests/unit/eWiseApplyMatrix_variants.cpp +++ b/tests/unit/eWiseApplyMatrix_variants.cpp @@ -75,19 +75,26 @@ bool equals_matrix( return std::is_permutation( A_vec.cbegin(), A_vec.cend(), B_vec.cbegin() ); } -template< class Monoid, Descriptor descr = descriptors::no_operation > + +template< + class Monoid, + typename ValueTypeA, + typename ValueTypeB, + typename ValueTypeC, + Descriptor descr = descriptors::no_operation +> struct input_t { - const Matrix< nz_type > &A; - const Matrix< nz_type > &B; - const Matrix< nz_type > &C_monoid; - const Matrix< nz_type > &C_operator; + const Matrix< ValueTypeA > &A; + const Matrix< ValueTypeB > &B; + const Matrix< ValueTypeC > &C_monoid; + const Matrix< ValueTypeC > &C_operator; const Monoid &monoid; input_t( - const Matrix< nz_type > &A = {0,0}, - const Matrix< nz_type > &B = {0,0}, - const Matrix< nz_type > &C_monoid = {0,0}, - const Matrix< nz_type > &C_operator = {0,0}, + const Matrix< ValueTypeA > &A = {0,0}, + const Matrix< ValueTypeB > &B = {0,0}, + const Matrix< ValueTypeC > &C_monoid = {0,0}, + const Matrix< ValueTypeC > &C_operator = {0,0}, const Monoid &monoid = Monoid() ) : A( A ), B( B ), @@ -96,12 +103,22 @@ struct input_t { monoid( monoid ) {} }; + struct output_t { RC rc; }; -template< class Monoid, Descriptor descr > -void grb_program( const input_t< Monoid, descr > &input, output_t &output ) { +template< + class Monoid, + typename ValueTypeA, + typename ValueTypeB, + typename ValueTypeC, + Descriptor descr +> +void grb_program( + const input_t< Monoid, ValueTypeA, ValueTypeB, ValueTypeC, descr > &input, + output_t &output +) { static_assert( is_monoid< Monoid >::value, "Monoid required" ); const auto &op = input.monoid.getOperator(); @@ -110,7 +127,7 @@ void grb_program( const input_t< Monoid, descr > &input, output_t &output ) { { // Operator variant std::cout << " -- eWiseApply using Operator, supposed to be" << " annihilating non-zeroes -> INTERSECTION\n"; - Matrix< nz_type > C( nrows( input.A ), ncols( input.A ) ); + Matrix< ValueTypeC > C( nrows( input.A ), ncols( input.A ) ); rc = eWiseApply( C, input.A, input.B, op, RESIZE ); if( rc != SUCCESS ) { @@ -140,7 +157,7 @@ void grb_program( const input_t< Monoid, descr > &input, output_t &output ) { 
{ // Monoid variant std::cout << " -- eWiseApply using Monoid, supposed to consider" << " non-zeroes as the identity -> UNION\n"; - Matrix< nz_type > C( nrows( input.A ), ncols( input.A ) ); + Matrix< ValueTypeC > C( nrows( input.A ), ncols( input.A ) ); rc = eWiseApply( C, input.A, input.B, input.monoid, RESIZE ); if( rc != SUCCESS ) { @@ -171,7 +188,7 @@ void grb_program( const input_t< Monoid, descr > &input, output_t &output ) { } void test_program( const size_t& N, size_t& ) { - /** Matrix A: Row matrix filled with A_INITIAL_VALUE + /** Matrix A: Matrix filled with A_INITIAL_VALUE * X X X X X * _ _ _ _ _ * _ _ _ _ _ (...) @@ -180,6 +197,7 @@ void test_program( const size_t& N, size_t& ) { * (...) */ Matrix< nz_type > A( N, N, N ); + Matrix< void > A_void( N, N, N ); { std::vector< size_t > A_rows( N, 0 ), A_cols( N, 0 ); std::vector< nz_type > A_values( N, A_INITIAL_VALUE ); @@ -189,10 +207,18 @@ void test_program( const size_t& N, size_t& ) { buildMatrixUnique( A, A_rows.data(), A_cols.data(), A_values.data(), A_values.size(), SEQUENTIAL ) ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) - + "Test FAILED: buildMatrixUnique" ); + + ": Test FAILED: buildMatrixUnique" ); + } + if( + SUCCESS != + buildMatrixUnique( A_void, A_rows.data(), A_cols.data(), A_rows.size(), SEQUENTIAL ) + ) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + ": Test FAILED: buildMatrixUnique" ); } } + /** Matrix B: Column matrix filled with B_INITIAL_VALUE * Y _ _ _ _ * Y _ _ _ _ @@ -202,6 +228,7 @@ void test_program( const size_t& N, size_t& ) { * (...) */ Matrix< nz_type > B( N, N, N ); + Matrix< void > B_void( N, N, N ); { std::vector< size_t > B_rows( N, 0 ), B_cols( N, 0 ); std::vector< nz_type > B_values( N, B_INITIAL_VALUE ); @@ -210,7 +237,14 @@ void test_program( const size_t& N, size_t& ) { buildMatrixUnique( B, B_rows.data(), B_cols.data(), B_values.data(), B_values.size(), SEQUENTIAL) ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) - + "Test FAILED: buildMatrixUnique" ); + + ": Test FAILED: buildMatrixUnique" ); + } + if( + SUCCESS != + buildMatrixUnique( B_void, B_rows.data(), B_cols.data(), B_rows.size(), SEQUENTIAL ) + ) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + ": Test FAILED: buildMatrixUnique" ); } } @@ -243,7 +277,7 @@ void test_program( const size_t& N, size_t& ) { SEQUENTIAL) ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) - + "Test FAILED: buildMatrixUnique" ); + + ": Test FAILED: buildMatrixUnique" ); } /** Matrix C_op_truth: Intersection of A and B @@ -267,18 +301,19 @@ void test_program( const size_t& N, size_t& ) { SEQUENTIAL) ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) - + "Test FAILED: buildMatrixUnique" ); + + ": Test FAILED: buildMatrixUnique" ); } input_t< - Monoid< operators::add< nz_type >, identities::zero > + Monoid< operators::add< nz_type >, identities::zero >, + nz_type, nz_type, nz_type > input { A, B, C_monoid_truth, C_op_truth }; output_t output { SUCCESS }; // Run the test grb_program(input, output ); if( output.rc != SUCCESS ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) - + "Test FAILED (" + toString( output.rc ) + ")" ); + + ": Test FAILED (" + toString( output.rc ) + ")" ); } } @@ -308,21 +343,115 @@ void test_program( const size_t& N, size_t& ) { ) ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) - + "Test FAILED: buildMatrixUnique" ); + + ": Test FAILED: buildMatrixUnique" ); } input_t< - Monoid< operators::add< 
nz_type >, identities::zero > + Monoid< operators::add< nz_type >, identities::zero >, + nz_type, nz_type, nz_type > input { A, A, C_truth, C_truth }; output_t output { SUCCESS }; // Run the test grb_program(input, output ); if( output.rc != SUCCESS ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) - + "Test FAILED (" + toString( output.rc ) + ")" ); + + ": Test FAILED (" + toString( output.rc ) + ")" ); } } + { // C = A .+ A(void) + std::cout << "-- Test C = A .+ A(void)\n"; + /** Matrix C_truth: Union/intersection of A and A + * X+0 X+0 X+0 X+0 X+0 + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___(...) + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___ + * (...) + */ + const Matrix< nz_type >& C_truth = A; + + input_t< + Monoid< operators::add< nz_type >, identities::zero >, + nz_type, void, nz_type + > input { A, A_void, C_truth, C_truth }; + output_t output { SUCCESS }; + // Run the test + grb_program(input, output ); + if( output.rc != SUCCESS ) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + ": Test FAILED (" + toString( output.rc ) + ")" ); + } + } + + { // C = A(void) .+ A + std::cout << "-- Test C = A(void) .+ A\n"; + /** Matrix C_truth: Union/intersection of A and A + * 0+X 0+X 0+X 0+X 0+X + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___(...) + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___ + * (...) + */ + const Matrix< nz_type >& C_truth = A; + + input_t< + Monoid< operators::add< nz_type >, identities::zero >, + void, nz_type, nz_type + > input { A_void, A, C_truth, C_truth }; + output_t output { SUCCESS }; + // Run the test + grb_program(input, output ); + if( output.rc != SUCCESS ) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + ": Test FAILED (" + toString( output.rc ) + ")" ); + } + } + + { // C = A(void) .+ A + std::cout << "-- Test C = A(void) .+ A(void)\n"; + /** Matrix C_truth: Union/intersection of A and A + * 0+0 0+0 0+0 0+0 0+0 + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___(...) + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___ + * (...) 
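+		 * (both inputs are pattern matrices and carry no values, so each
+		 *  expected result value below is 0 + 0 = 0)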
+ */ + Matrix< nz_type > C_truth( N, N ); + size_t nvalues = ncols( A ); + std::vector< size_t > C_truth_rows( nvalues, 0 ), C_truth_cols( nvalues, 0 ); + std::vector< nz_type > C_truth_values( nvalues, 0 ); + std::iota( C_truth_cols.begin(), C_truth_cols.end(), 0 ); + if( SUCCESS != + buildMatrixUnique( + C_truth, + C_truth_rows.data(), + C_truth_cols.data(), + C_truth_values.data(), + C_truth_values.size(), + SEQUENTIAL + ) + ) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + ": Test FAILED: buildMatrixUnique" ); + } + + input_t< + Monoid< operators::add< nz_type >, identities::zero >, + void, void, nz_type + > input { A_void, A_void, C_truth, C_truth }; + output_t output { SUCCESS }; + // Run the test + grb_program(input, output ); + if( output.rc != SUCCESS ) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + ": Test FAILED (" + toString( output.rc ) + ")" ); + } + } + + { // C = A .+ Bt std::cout << "-- Test C = A .+ Bt\n"; /** Matrix C_truth: Union/intersection of A and Bt @@ -349,11 +478,12 @@ void test_program( const size_t& N, size_t& ) { ) ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) - + "Test FAILED: buildMatrixUnique" ); + + ": Test FAILED: buildMatrixUnique" ); } input_t< Monoid< operators::add< nz_type >, identities::zero >, + nz_type, nz_type, nz_type, descriptors::transpose_right > input { A, B, C_truth, C_truth }; output_t output { SUCCESS }; @@ -361,9 +491,11 @@ void test_program( const size_t& N, size_t& ) { grb_program(input, output ); if( output.rc != SUCCESS ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) - + "Test FAILED (" + toString( output.rc ) + ")" ); + + ": Test FAILED (" + toString( output.rc ) + ")" ); } } + + } int main( int argc, char ** argv ) { From fb4e14115526228ffa593e5d38a59a6be5500a3c Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Mon, 4 Dec 2023 14:53:22 +0100 Subject: [PATCH 28/37] Redundancy removal in static assertions --- include/graphblas/nonblocking/blas3.hpp | 26 ------------------------- include/graphblas/reference/blas3.hpp | 6 ++++-- 2 files changed, 4 insertions(+), 28 deletions(-) diff --git a/include/graphblas/nonblocking/blas3.hpp b/include/graphblas/nonblocking/blas3.hpp index 5ecb1fffa..8b6cd732b 100644 --- a/include/graphblas/nonblocking/blas3.hpp +++ b/include/graphblas/nonblocking/blas3.hpp @@ -500,32 +500,6 @@ namespace grb { grb::is_operator< Operator >::value, void >::type * const = nullptr ) { - // static checks - NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || - std::is_same< typename Operator::D1, InputType1 >::value ), - "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator)", - "called with a prefactor input matrix A that does not match the first " - "domain of the given multiplication operator" - ); - NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || - std::is_same< typename Operator::D2, InputType2 >::value ), - "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator)", - "called with a postfactor input matrix B that does not match the first " - "domain of the given multiplication operator" - ); - NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || - std::is_same< typename Operator::D3, OutputType >::value ), - "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator)", - "called with an output matrix C that does not match the output domain " - "of the given multiplication operator" - ); - static_assert( ( !( - std::is_same< InputType1, void >::value || - std::is_same< InputType2, void 
>::value ) - ), "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator): " - "the operator version of eWiseApply cannot be used if either of the " - "input matrices is a pattern matrix (of type void)" - ); if( internal::NONBLOCKING::warn_if_not_native && config::PIPELINE::warn_if_not_native ) { std::cerr << "Warning: eWiseApply (nonblocking) currently delegates to a " << "blocking implementation.\n" diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index e47daf074..aafe136ef 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1703,15 +1703,17 @@ namespace grb { void >::type * const = nullptr ) { + typedef typename std::conditional::value, typename Operator::D1, InputType1>::type ActualInputType1; + typedef typename std::conditional::value, typename Operator::D2, InputType1>::type ActualInputType2; // static checks NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || - std::is_same< typename Operator::D1, InputType1 >::value ), + std::is_same< typename Operator::D1, ActualInputType1 >::value ), "grb::eWiseApply (reference, matrix <- matrix x matrix, operator)", "called with a prefactor input matrix A that does not match the first " "domain of the given multiplication operator" ); NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || - std::is_same< typename Operator::D2, InputType2 >::value ), + std::is_same< typename Operator::D2, ActualInputType2 >::value ), "grb::eWiseApply (reference, matrix <- matrix x matrix, operator)", "called with a postfactor input matrix B that does not match the first " "domain of the given multiplication operator" From 5eea2358a846dfa632dad98ff9b725ac4c06a9b0 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Fri, 8 Dec 2023 16:53:55 +0100 Subject: [PATCH 29/37] temporary bugfix for parallel coordinates assignments --- include/graphblas/reference/blas3.hpp | 85 ++------------------------- 1 file changed, 5 insertions(+), 80 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index e3efe56f6..d5d1eca1a 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1126,38 +1126,11 @@ namespace grb { for( size_t i = 0; i < m; ++i ) { coors1.clear(); -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel default(none) \ - shared(coors1, valbuf) \ - firstprivate(i, A_raw, dummy_identity) -#endif - { -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - auto local_update = coors1.EMPTY_UPDATE(); - const size_t maxAsyncAssigns = coors1.maxAsyncAssigns(); - size_t assigns = 0; - #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait -#endif - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; - -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - if( !coors1.asyncAssign( k_col, local_update ) ) { - valbuf[ k_col ] = A_raw.getValue( k, dummy_identity ); - if( ++assigns == maxAsyncAssigns ) { - coors1.joinUpdate( local_update ); - assigns = 0; - } - } -#else - if( !coors1.assign( k_col ) ) { - valbuf[ k_col ] = A_raw.getValue( k, dummy_identity ); - } -#endif + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; + if( !coors1.assign( k_col ) ) { + valbuf[ k_col ] = A_raw.getValue( k, dummy_identity ); } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - while( !coors1.joinUpdate( local_update ) ) {} -#endif } for( size_t l = B_raw.col_start[ i ]; l < 
B_raw.col_start[ i + 1 ]; ++l ) { @@ -1430,78 +1403,30 @@ namespace grb { } } - // do computations - nzc = 0; + std::cerr << "HERE\n"; CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { coors1.clear(); coors2.clear(); -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel default(none) \ - shared(coors1, vbuf1, coors2, vbuf2) \ - firstprivate(i, A_raw, identity_A, B_raw, identity_B ) -#endif - { -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - auto local_update1 = coors1.EMPTY_UPDATE(); - const size_t maxAsyncAssigns1 = coors1.maxAsyncAssigns(); - size_t assigns1 = 0; - #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait -#endif for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - if( !coors1.asyncAssign( k_col, local_update1 ) ) { - assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); - if( ++assigns1 == maxAsyncAssigns1 ) { - coors1.joinUpdate( local_update1 ); - assigns1 = 0; - } - } -#else if( !coors1.assign( k_col ) ) { assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); } -#endif } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - while( !coors1.joinUpdate( local_update1 )) {} -#endif - - -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - auto local_update2 = coors2.EMPTY_UPDATE(); - const size_t maxAsyncAssigns2 = coors2.maxAsyncAssigns(); - size_t assigns2 = 0; - #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait -#endif for( size_t k = B_raw.col_start[ i ]; k < B_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = B_raw.row_index[ k ]; -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - if( !coors2.asyncAssign( k_col, local_update2 ) ) { - assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); - if( ++assigns2 == maxAsyncAssigns2 ) { - coors2.joinUpdate( local_update2 ); - assigns2 = 0; - } - } -#else if( !coors2.assign( k_col ) ) { assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); } -#endif } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - while( !coors2.joinUpdate( local_update2 )) {} -#endif - } for( size_t k = 0; k < coors1.nonzeroes(); ++k ) { const auto j = coors1.index( k ); From 7872b79fdec8fba67003ada251f4acd1523efdea Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Wed, 17 Jan 2024 11:43:09 +0100 Subject: [PATCH 30/37] Re-enable parallel Coordinates iteration --- include/graphblas/reference/blas3.hpp | 129 ++++++++++++++++++++------ 1 file changed, 99 insertions(+), 30 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index d5d1eca1a..6d8406e5e 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -63,22 +63,22 @@ namespace grb::internal { template< typename D, typename T > - static inline void assignValue( + static void assignValue( D *array, size_t i, const T& value, typename std::enable_if< !std::is_void< D >::value >::type * const = nullptr ) { array[i] = value; } template< typename T > - static inline void assignValue( void *, size_t, const T& ) { /* do nothing */ } + static void assignValue( void *, size_t, const T& ) { /* do nothing */ } template< typename D, typename T > - static inline T getValue( + static T getValue( const D *array, size_t i, const T&, typename std::enable_if< !std::is_void< D >::value >::type * const = nullptr ) { return array[i]; } template< typename T > - static inline T getValue( const void *, size_t, const T& identity ) { return identity; } + static T getValue( const void *, size_t, const T& 
identity ) { return identity; } } // namespace grb::internal @@ -961,11 +961,10 @@ namespace grb { const Operator &oper, const Phase &phase, const typename std::enable_if< - !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_operator< Operator >::value, - void + !is_object< OutputType >::value && + !is_object< InputType1 >::value && + !is_object< InputType2 >::value && + is_operator< Operator >::value >::type * const = nullptr ) { #ifdef _DEBUG @@ -1013,8 +1012,7 @@ namespace grb { const auto dummy_identity = identities::zero< OutputType >::value(); // retrieve buffers - char * arr1, * arr3, * buf1, * buf3; - arr1 = buf1 = nullptr; + char * arr1 = nullptr, * arr3 = nullptr, * buf1 = nullptr, * buf3 = nullptr; InputType1 * vbuf1 = nullptr; OutputType * valbuf = nullptr; internal::getMatrixBuffers( arr1, buf1, vbuf1, 1, A ); @@ -1022,7 +1020,7 @@ namespace grb { // end buffer retrieval // initialisations - internal::Coordinates< reference > coors1; + Coordinates coors1; coors1.set( arr1, false, buf1, n ); if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 @@ -1066,8 +1064,7 @@ namespace grb { if( phase == EXECUTE ) { nzc = 0; // retrieve additional buffer - config::NonzeroIndexType * const C_col_index = internal::template - getReferenceBuffer< typename config::NonzeroIndexType >( n + 1 ); + auto* const C_col_index = getReferenceBuffer< config::NonzeroIndexType >( n + 1 ); if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 @@ -1126,11 +1123,38 @@ namespace grb { for( size_t i = 0; i < m; ++i ) { coors1.clear(); - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; - if( !coors1.assign( k_col ) ) { - valbuf[ k_col ] = A_raw.getValue( k, dummy_identity ); +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp parallel default(none) \ + shared(coors1, valbuf) \ + firstprivate(i, A_raw, dummy_identity) +#endif + { +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + auto local_update = coors1.EMPTY_UPDATE(); + const size_t maxAsyncAssigns = coors1.maxAsyncAssigns(); + size_t assigns = 0; + #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait +#endif + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; + +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + if( !coors1.asyncAssign( k_col, local_update ) ) { + assignValue( valbuf, k_col , A_raw.getValue( k, dummy_identity ) ); + if( ++assigns == maxAsyncAssigns ) { + coors1.joinUpdate( local_update ); + assigns = 0; + } + } +#else + if( !coors1.assign( k_col ) ) { + assignValue( valbuf, k_col, A_raw.getValue( k, dummy_identity ) ); + } +#endif } +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + while( !coors1.joinUpdate( local_update ) ) {} +#endif } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { @@ -1149,7 +1173,7 @@ namespace grb { // update CCS if( !crs_only ) { - C_col_index[ j ]++; + ++C_col_index[ j ]; const size_t CCS_index = CCS_raw.col_start[ j+1 ] - C_col_index[ j ]; CCS_raw.row_index[ CCS_index ] = i; CCS_raw.setValue( CCS_index, result_value ); @@ -1230,11 +1254,10 @@ namespace grb { const Monoid &monoid, const Phase &phase, const typename std::enable_if< - !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_monoid< Monoid >::value, - void + !is_object< OutputType >::value && + !is_object< InputType1 >::value && + !is_object< 
InputType2 >::value && + is_monoid< Monoid >::value >::type * const = nullptr ) { @@ -1298,7 +1321,7 @@ namespace grb { // end buffer retrieval // initialisations - internal::Coordinates< reference > coors1, coors2; + Coordinates< reference > coors1, coors2; coors1.set( arr1, false, buf1, n ); coors2.set( arr2, false, buf2, n ); if( !crs_only ) { @@ -1343,8 +1366,7 @@ namespace grb { // computational phase if( phase == EXECUTE ) { // retrieve additional buffer - config::NonzeroIndexType * const C_col_index = internal::template - getReferenceBuffer< typename config::NonzeroIndexType >( n + 1 ); + auto* const C_col_index = getReferenceBuffer< config::NonzeroIndexType >( n + 1 ); // perform column-wise nonzero count nzc = 0; @@ -1379,9 +1401,8 @@ namespace grb { const RC clear_rc = clear( C ); if( clear_rc != SUCCESS ) { return PANIC; - } else { - return FAILED; } + return FAILED; } // prefix sum for CCS_raw.col_start @@ -1403,30 +1424,78 @@ namespace grb { } } + // do computations + nzc = 0; - std::cerr << "HERE\n"; CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { coors1.clear(); coors2.clear(); +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp parallel default(none) \ + shared(coors1, vbuf1, coors2, vbuf2) \ + firstprivate(i, A_raw, identity_A, B_raw, identity_B ) +#endif + { +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + auto local_update1 = coors1.EMPTY_UPDATE(); + const size_t maxAsyncAssigns1 = coors1.maxAsyncAssigns(); + size_t assigns1 = 0; + #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait +#endif for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + if( !coors1.asyncAssign( k_col, local_update1 ) ) { + assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); + if( ++assigns1 == maxAsyncAssigns1 ) { + coors1.joinUpdate( local_update1 ); + assigns1 = 0; + } + } +#else if( !coors1.assign( k_col ) ) { assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); } +#endif } +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + while( !coors1.joinUpdate( local_update1 )) {} +#endif + + +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + auto local_update2 = coors2.EMPTY_UPDATE(); + const size_t maxAsyncAssigns2 = coors2.maxAsyncAssigns(); + size_t assigns2 = 0; + #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait +#endif for( size_t k = B_raw.col_start[ i ]; k < B_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = B_raw.row_index[ k ]; +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + if( !coors2.asyncAssign( k_col, local_update2 ) ) { + assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); + if( ++assigns2 == maxAsyncAssigns2 ) { + coors2.joinUpdate( local_update2 ); + assigns2 = 0; + } + } +#else if( !coors2.assign( k_col ) ) { assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); } +#endif } +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + while( !coors2.joinUpdate( local_update2 )) {} +#endif + } for( size_t k = 0; k < coors1.nonzeroes(); ++k ) { const auto j = coors1.index( k ); From ef342a8af6f7d557e8f834b93535a09f2fa2ecd0 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Wed, 24 Jan 2024 14:34:05 +0100 Subject: [PATCH 31/37] Fix potential bug when (A == B) is true. 
Thanks to @aleksamilisavljevic for noticing --- include/graphblas/reference/blas3.hpp | 46 +++++++++++++++++++-------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 6d8406e5e..a113ec91a 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1001,6 +1001,13 @@ namespace grb { return MISMATCH; } + if( getID(A) == getID(C) || getID(B) == getID(C) ) { +#ifdef _DEBUG + std::cerr << "grb::eWiseApply: The output matrix can not simultaneously be " + << "one of the input matrices\n"; +#endif + } + const auto &A_raw = !trans_left ? internal::getCRS( A ) : internal::getCCS( A ); @@ -1019,9 +1026,11 @@ namespace grb { internal::getMatrixBuffers( arr3, buf3, valbuf, 1, C ); // end buffer retrieval - // initialisations + // initialisations of the coordinates Coordinates coors1; coors1.set( arr1, false, buf1, n ); + // end initialisations of the coordinates + if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 #pragma omp parallel for simd default(none) \ @@ -1279,6 +1288,13 @@ namespace grb { return ILLEGAL; } + if( getID(A) == getID(C) || getID(B) == getID(C) ) { +#ifdef _DEBUG + std::cerr << "grb::eWiseApply: The output matrix can not simultaneously be " + << "one of the input matrices\n"; +#endif + } + // run-time checks const size_t m = nrows( C ); const size_t n = ncols( C ); @@ -1310,20 +1326,22 @@ namespace grb { // retrieve buffers - char * arr1, * arr2, * arr3, * buf1, * buf2, * buf3; - arr1 = arr2 = buf1 = buf2 = nullptr; + char *arr1 = nullptr, *arr3 = nullptr; + char *buf1 = nullptr, *buf3 = nullptr; InputType1 * vbuf1 = nullptr; - InputType2 * vbuf2 = nullptr; - OutputType * valbuf = nullptr; + OutputType * vbuf3 = nullptr; internal::getMatrixBuffers( arr1, buf1, vbuf1, 1, A ); - internal::getMatrixBuffers( arr2, buf2, vbuf2, 1, B ); - internal::getMatrixBuffers( arr3, buf3, valbuf, 1, C ); + internal::getMatrixBuffers( arr3, buf3, vbuf3, 1, C ); // end buffer retrieval - // initialisations + // initialisations of the coordinates + // Note: By using the buffer of the output matrix C, we can + // allow A and B to be the same matrix (with the same buffer) Coordinates< reference > coors1, coors2; coors1.set( arr1, false, buf1, n ); - coors2.set( arr2, false, buf2, n ); + coors2.set( arr3, false, buf3, n ); + // end initialisations of the coordinates + if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 #pragma omp parallel for simd default(none) shared(CCS_raw) firstprivate(n) @@ -1435,7 +1453,7 @@ namespace grb { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 #pragma omp parallel default(none) \ - shared(coors1, vbuf1, coors2, vbuf2) \ + shared(coors1, vbuf1, coors2, vbuf3) \ firstprivate(i, A_raw, identity_A, B_raw, identity_B ) #endif { @@ -1480,7 +1498,7 @@ namespace grb { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 if( !coors2.asyncAssign( k_col, local_update2 ) ) { - assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); + assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); if( ++assigns2 == maxAsyncAssigns2 ) { coors2.joinUpdate( local_update2 ); assigns2 = 0; @@ -1488,7 +1506,7 @@ namespace grb { } #else if( !coors2.assign( k_col ) ) { - assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); + assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); } #endif } @@ -1500,7 +1518,7 @@ namespace grb { for( size_t k = 0; k < coors1.nonzeroes(); ++k ) { const auto j = coors1.index( k ); const auto A_val = getValue(vbuf1, j, identity_A); - 
const auto B_val = coors2.assigned(j) ? getValue(vbuf2, j, identity_B) : identity_B; + const auto B_val = coors2.assigned(j) ? getValue(vbuf3, j, identity_B) : identity_B; OutputType result_value; (void)grb::apply( result_value, A_val, B_val, oper ); @@ -1529,7 +1547,7 @@ namespace grb { continue; } const auto A_val = coors1.assigned(j) ? getValue(vbuf1, j, identity_A) : identity_A; - const auto B_val = getValue(vbuf2, j, identity_B); + const auto B_val = getValue(vbuf3, j, identity_B); OutputType result_value; (void)grb::apply( result_value, A_val, B_val, oper ); From 2d63fdfb3547d86ad415c6d1bebc28a50ef96b78 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Wed, 24 Jan 2024 14:54:34 +0100 Subject: [PATCH 32/37] Increase test size in CI --- tests/unit/unittests.sh | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/unit/unittests.sh b/tests/unit/unittests.sh index 5963dc16c..228b40c0b 100755 --- a/tests/unit/unittests.sh +++ b/tests/unit/unittests.sh @@ -554,10 +554,18 @@ for MODE in ${MODES}; do echo " " echo ">>> [x] [ ] Testing grb::eWiseApply (matrices, Monoid / Operator)" - $runner ${TEST_BIN_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND} &> ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND}_${P}_${T}.log - head -1 ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND}_${P}_${T}.log - grep 'Test OK' ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" - echo " " + echo " using small matrices (100x100)" + $runner ${TEST_BIN_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND} 100 &> ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_small_${MODE}_${BACKEND}_${P}_${T}.log + head -1 ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_small_${MODE}_${BACKEND}_${P}_${T}.log + grep 'Test OK' ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_small_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" + echo " " + + echo ">>> [x] [ ] Testing grb::eWiseApply (matrices, Monoid / Operator)" + echo " using large matrices (100'000x100'000)" + $runner ${TEST_BIN_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND} 100000 &> ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_large_${MODE}_${BACKEND}_${P}_${T}.log + head -1 ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_large_${MODE}_${BACKEND}_${P}_${T}.log + grep 'Test OK' ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_large_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" + echo " " echo ">>> [x] [ ] Testing grb::zip on two vectors of doubles and" echo " ints of size 10 000 000." 
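
A short recap of the semantics these unit tests check: the operator variant of
grb::eWiseApply( C, A, B, op ) only produces entries whose coordinates hold a
nonzero in both A and B (intersection), whereas the monoid variant also
produces entries present in only one of the inputs, substituting the monoid
identity for the missing operand (union). The sketch below illustrates the
difference. It is illustrative only: the function name, the values 1 and 3,
and the plus monoid mirror the unit test, and it assumes n >= 2, the
top-level graphblas.hpp header, and execution through grb::Launcher as in the
other unit tests.

#include <cassert>
#include <cstddef>

#include <graphblas.hpp>

// Minimal sketch: A holds the value 1 at (0,0) only, B holds 3 at (0,1) only.
void union_vs_intersection( const size_t &n, grb::RC &rc ) {
	grb::Matrix< int > A( n, n, 1 ), B( n, n, 1 );
	grb::Matrix< int > C_op( n, n, 2 ), C_mon( n, n, 2 );
	const size_t rows[ 1 ] = { 0 }, colsA[ 1 ] = { 0 }, colsB[ 1 ] = { 1 };
	const int valsA[ 1 ] = { 1 }, valsB[ 1 ] = { 3 };
	rc = grb::buildMatrixUnique( A, rows, colsA, valsA, 1, grb::SEQUENTIAL );
	if( rc == grb::SUCCESS ) {
		rc = grb::buildMatrixUnique( B, rows, colsB, valsB, 1, grb::SEQUENTIAL );
	}
	const grb::Monoid<
		grb::operators::add< int >, grb::identities::zero
	> plus;
	// operator variant: only entries present in both A and B are produced,
	// so C_op stays empty
	if( rc == grb::SUCCESS ) {
		rc = grb::eWiseApply( C_op, A, B, plus.getOperator(), grb::Phase::RESIZE );
	}
	if( rc == grb::SUCCESS ) {
		rc = grb::eWiseApply( C_op, A, B, plus.getOperator(), grb::Phase::EXECUTE );
	}
	assert( rc != grb::SUCCESS || grb::nnz( C_op ) == 0 );
	// monoid variant: the union of both patterns is produced, with the
	// identity substituted for missing values: 1+0 at (0,0) and 0+3 at (0,1)
	if( rc == grb::SUCCESS ) {
		rc = grb::eWiseApply( C_mon, A, B, plus, grb::Phase::RESIZE );
	}
	if( rc == grb::SUCCESS ) {
		rc = grb::eWiseApply( C_mon, A, B, plus, grb::Phase::EXECUTE );
	}
	assert( rc != grb::SUCCESS || grb::nnz( C_mon ) == 2 );
}
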
From 098481f9838fcdee0e7adc312ba7fc4c5ef81388 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Fri, 2 Feb 2024 00:03:35 +0100 Subject: [PATCH 33/37] Minor fix from the merge --- tests/unit/eWiseApply_matrix.cpp | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/tests/unit/eWiseApply_matrix.cpp b/tests/unit/eWiseApply_matrix.cpp index 4fac4fa8d..01f92e9bf 100644 --- a/tests/unit/eWiseApply_matrix.cpp +++ b/tests/unit/eWiseApply_matrix.cpp @@ -42,14 +42,13 @@ void grb_program( const int &, grb::RC &rc ) { << "mixed-domain matrix check\n"; return; } - for( const auto &triple : C ) { - const auto &i = triple.first.first; - const auto &j = triple.first.second; - const auto &v = triple.second; - if( j != i+n ) { - std::cout << "Unexpected entry at position ( " << i << ", " << i+n << " ) " - << "-- only expected entries on the n-th diagonal\n"; + const size_t &i = triple.first.first; + const size_t &j = triple.first.second; + const size_t &v = triple.second; + if( i != j ) { + std::cout << "Unexpected entry at position ( " << i << ", " << j << " ) " + << "-- only expected entries on the diagonal\n"; rc = FAILED; } if( v != 4 ) { @@ -68,14 +67,15 @@ void grb_program( const int &, grb::RC &rc ) { int main( int argc, char ** argv ) { // defaults - size_t input = 1000; // unused + bool printUsage = false; + int input = 0; // unused // error checking if( argc > 1 ) { - input = std::strtoul( argv[ 1 ], nullptr, 10 ); + printUsage = true; } - if( argc > 2 ) { - std::cerr << "Usage: " << argv[ 0 ] << "[n]\n"; + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; return 1; } @@ -94,4 +94,3 @@ int main( int argc, char ** argv ) { return 0; } } - From 40e62c2280806664230756036f87983c60e217a5 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Fri, 2 Feb 2024 16:46:34 +0100 Subject: [PATCH 34/37] Minor changes --- include/graphblas/reference/blas3.hpp | 92 +++++++++++++++++++-------- tests/unit/unittests.sh | 4 +- 2 files changed, 67 insertions(+), 29 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index a113ec91a..445715873 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1358,18 +1358,60 @@ namespace grb { // symbolic phase if( phase == RESIZE ) { nzc = 0; - for( size_t i = 0; i < m; ++i ) { - coors1.clear(); - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; - if( !coors1.assign( k_col ) ) { - (void) ++nzc; + + +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp parallel default(none) \ + shared(coors1, vbuf1, coors2, vbuf3) \ + firstprivate(A_raw, identity_A, B_raw, identity_B, m) \ + reduction(+:nzc) +#endif + { + for( size_t i = 0; i < m; ++i ) { +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp single +#endif + { + coors1.clear(); } - } - for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { - const size_t l_col = B_raw.row_index[ l ]; - if( !coors1.assigned( l_col ) ) { - (void) ++nzc; +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp barrier +#endif +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + auto local_update1 = coors1.EMPTY_UPDATE(); + const size_t maxAsyncAssigns1 = coors1.maxAsyncAssigns(); + size_t assigns1 = 0; + #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait +#endif + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; +#ifdef 
_H_GRB_REFERENCE_OMP_BLAS3 + if( !coors1.asyncAssign( k_col, local_update1 ) ) { + assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); + if( ++assigns1 == maxAsyncAssigns1 ) { + coors1.joinUpdate( local_update1 ); + assigns1 = 0; + } + } +#else + (void)coors1.assign( k_col ); +#endif + (void)++nzc; + } +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + while( !coors1.joinUpdate( local_update1 ) ) {} +#endif + +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp barrier + + #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) +#endif + for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { + const size_t l_col = B_raw.row_index[ l ]; + if( !coors1.assigned( l_col ) ) { + (void)++nzc; + } } } } @@ -1392,9 +1434,9 @@ namespace grb { coors1.clear(); for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; - if( !coors1.assign( k_col ) ) { - (void) ++nzc; - } + (void)coors1.assign( k_col ); + (void)++nzc; + if( !crs_only ) { (void) ++CCS_raw.col_start[ k_col + 1 ]; } @@ -1402,9 +1444,9 @@ namespace grb { for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { const size_t l_col = B_raw.row_index[ l ]; if( !coors1.assigned( l_col ) ) { - (void) ++nzc; + (void)++nzc; if( !crs_only ) { - (void) ++CCS_raw.col_start[ l_col + 1 ]; + (void)++CCS_raw.col_start[ l_col + 1 ]; } } } @@ -1425,11 +1467,13 @@ namespace grb { // prefix sum for CCS_raw.col_start if( !crs_only ) { - assert( CCS_raw.col_start[ 0 ] == 0 ); for( size_t j = 1; j < n; ++j ) { CCS_raw.col_start[ j + 1 ] += CCS_raw.col_start[ j ]; } +#ifndef NDEBUG + assert( CCS_raw.col_start[ 0 ] == 0 ); assert( CCS_raw.col_start[ n ] == nzc ); +#endif } // set C_col_index to all zero @@ -1442,9 +1486,7 @@ namespace grb { } } - // do computations - nzc = 0; CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { @@ -1457,7 +1499,6 @@ namespace grb { firstprivate(i, A_raw, identity_A, B_raw, identity_B ) #endif { - #ifdef _H_GRB_REFERENCE_OMP_BLAS3 auto local_update1 = coors1.EMPTY_UPDATE(); const size_t maxAsyncAssigns1 = coors1.maxAsyncAssigns(); @@ -1469,17 +1510,15 @@ namespace grb { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 if( !coors1.asyncAssign( k_col, local_update1 ) ) { - assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); if( ++assigns1 == maxAsyncAssigns1 ) { coors1.joinUpdate( local_update1 ); assigns1 = 0; } } #else - if( !coors1.assign( k_col ) ) { - assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); - } + (void)coors1.assign( k_col ); #endif + assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); } #ifdef _H_GRB_REFERENCE_OMP_BLAS3 @@ -1505,10 +1544,9 @@ namespace grb { } } #else - if( !coors2.assign( k_col ) ) { - assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); - } + (void)coors2.assign( k_col ); #endif + assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); } #ifdef _H_GRB_REFERENCE_OMP_BLAS3 while( !coors2.joinUpdate( local_update2 )) {} diff --git a/tests/unit/unittests.sh b/tests/unit/unittests.sh index a578e00f5..3b43c7872 100755 --- a/tests/unit/unittests.sh +++ b/tests/unit/unittests.sh @@ -605,8 +605,8 @@ for MODE in ${MODES}; do echo " " echo ">>> [x] [ ] Testing grb::eWiseApply (matrices, Monoid / Operator)" - echo " using large matrices (100'000x100'000)" - $runner ${TEST_BIN_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND} 100000 &> ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_large_${MODE}_${BACKEND}_${P}_${T}.log + echo " using large matrices (10'000x10'000)" 
+ $runner ${TEST_BIN_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND} 10000 &> ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_large_${MODE}_${BACKEND}_${P}_${T}.log head -1 ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_large_${MODE}_${BACKEND}_${P}_${T}.log grep 'Test OK' ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_large_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" echo " " From b2965d939f2a2145bbd95b88196175e7fe144eb1 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Sun, 25 Feb 2024 14:51:30 +0100 Subject: [PATCH 35/37] Review fixes --- include/graphblas/bsp1d/blas3.hpp | 67 +++++++++++++++------ include/graphblas/nonblocking/blas3.hpp | 32 ++++++++-- include/graphblas/reference/blas3.hpp | 79 +++++++++++-------------- include/graphblas/utils.hpp | 18 ++++++ 4 files changed, 128 insertions(+), 68 deletions(-) diff --git a/include/graphblas/bsp1d/blas3.hpp b/include/graphblas/bsp1d/blas3.hpp index 0ddbdf41b..25f5902ab 100644 --- a/include/graphblas/bsp1d/blas3.hpp +++ b/include/graphblas/bsp1d/blas3.hpp @@ -140,14 +140,31 @@ namespace grb { >::type * const = nullptr ) { assert( phase != TRY ); - RC ret = eWiseApply< descr >( - internal::getLocal( C ), - internal::getLocal( A ), - internal::getLocal( B ), - mul, - phase - ); - return internal::checkGlobalErrorStateOrClear( C, ret ); + RC local_rc = SUCCESS; + if( phase == RESIZE ) { + RC ret = eWiseApply< descr >( + internal::getLocal( C ), + internal::getLocal( A ), + internal::getLocal( B ), + mul, + RESIZE + ); + if( collectives<>::allreduce( ret, operators::any_or< RC >() ) != SUCCESS ) { + return PANIC; + } else { + return ret; + } + } else { + assert( phase == EXECUTE ); + local_rc = eWiseApply< descr >( + internal::getLocal( C ), + internal::getLocal( A ), + internal::getLocal( B ), + mul, + EXECUTE + ); + } + return internal::checkGlobalErrorStateOrClear( C, local_rc ); } /** \internal Simply delegates to process-local backend */ @@ -173,15 +190,31 @@ namespace grb { >::type * const = nullptr ) { assert( phase != TRY ); - RC ret = eWiseApply< descr >( - internal::getLocal( C ), - internal::getLocal( A ), - internal::getLocal( B ), - op, - phase - ); - return internal::checkGlobalErrorStateOrClear( C, ret ); - } + RC local_rc = SUCCESS; + if( phase == RESIZE ) { + RC ret = eWiseApply< descr >( + internal::getLocal( C ), + internal::getLocal( A ), + internal::getLocal( B ), + op, + RESIZE + ); + if( collectives<>::allreduce( ret, operators::any_or< RC >() ) != SUCCESS ) { + return PANIC; + } else { + return ret; + } + } else { + assert( phase == EXECUTE ); + local_rc = eWiseApply< descr >( + internal::getLocal( C ), + internal::getLocal( A ), + internal::getLocal( B ), + op, + EXECUTE + ); + } + return internal::checkGlobalErrorStateOrClear( C, local_rc ); } // namespace grb diff --git a/include/graphblas/nonblocking/blas3.hpp b/include/graphblas/nonblocking/blas3.hpp index aaebe569c..50c640ae0 100644 --- a/include/graphblas/nonblocking/blas3.hpp +++ b/include/graphblas/nonblocking/blas3.hpp @@ -472,10 +472,10 @@ namespace grb { // second, delegate to the reference backend return eWiseApply< descr >( - internal::getRefMatrix( C ), - internal::getRefMatrix( A ), + internal::getRefMatrix( C ), + internal::getRefMatrix( A ), internal::getRefMatrix( B ), - mulmono, + mulmono, phase ); } @@ -511,16 +511,36 @@ namespace grb { std::cout << "In grb::eWiseApply (nonblocking, op)\n"; #endif + // static checks + NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || + std::is_same< typename Operator::D1, InputType1 >::value ), + 
"grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator)", + "called with a prefactor input matrix A that does not match the first " + "domain of the given multiplication operator" + ); + NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || + std::is_same< typename Operator::D2, InputType2 >::value ), + "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator)", + "called with a postfactor input matrix B that does not match the first " + "domain of the given multiplication operator" + ); + NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || + std::is_same< typename Operator::D3, OutputType >::value ), + "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator)", + "called with an output matrix C that does not match the output domain " + "of the given multiplication operator" + ); + // nonblocking execution is not supported // first, execute any computation that is not completed internal::le.execution(); // second, delegate to the reference backend return eWiseApply< descr >( - internal::getRefMatrix( C ), - internal::getRefMatrix( A ), + internal::getRefMatrix( C ), + internal::getRefMatrix( A ), internal::getRefMatrix( B ), - mulOp, + mulOp, phase ); } diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 445715873..70d7a3642 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -57,32 +57,6 @@ "********************************************************************" \ "******************************\n" ); -#ifndef _H_GRB_REFERENCE_BLAS3_ACCESSORS -#define _H_GRB_REFERENCE_BLAS3_ACCESSORS - -namespace grb::internal -{ - template< typename D, typename T > - static void assignValue( - D *array, size_t i, const T& value, - typename std::enable_if< !std::is_void< D >::value >::type * const = nullptr - ) { array[i] = value; } - - template< typename T > - static void assignValue( void *, size_t, const T& ) { /* do nothing */ } - - template< typename D, typename T > - static T getValue( - const D *array, size_t i, const T&, - typename std::enable_if< !std::is_void< D >::value >::type * const = nullptr - ) { return array[i]; } - - template< typename T > - static T getValue( const void *, size_t, const T& identity ) { return identity; } - -} // namespace grb::internal - -#endif // _H_GRB_REFERENCE_BLAS3_ACCESSORS namespace grb { @@ -1077,11 +1051,18 @@ namespace grb { if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel for simd default(none) \ - shared(C_col_index) firstprivate(n) +#pragma omp parallel #endif - for( size_t j = 0; j < n; ++j ) { - C_col_index[ j ] = 0; + { + size_t start = 0; + size_t end = n + 1; +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + config::OMP::localRange( start, end, 0, n + 1 ); + #pragma omp parallel for simd +#endif + for( size_t j = start; j < end; ++j ) { + C_col_index[ j ] = 0; + } } } @@ -1149,7 +1130,7 @@ namespace grb { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 if( !coors1.asyncAssign( k_col, local_update ) ) { - assignValue( valbuf, k_col , A_raw.getValue( k, dummy_identity ) ); + utils::assignValue( valbuf, k_col , A_raw.getValue( k, dummy_identity ) ); if( ++assigns == maxAsyncAssigns ) { coors1.joinUpdate( local_update ); assigns = 0; @@ -1157,7 +1138,7 @@ namespace grb { } #else if( !coors1.assign( k_col ) ) { - assignValue( valbuf, k_col, A_raw.getValue( k, dummy_identity ) ); + utils::assignValue( valbuf, k_col, A_raw.getValue( k, dummy_identity ) ); } #endif } @@ -1387,7 +1368,7 @@ namespace grb { const size_t k_col = 
A_raw.row_index[ k ]; #ifdef _H_GRB_REFERENCE_OMP_BLAS3 if( !coors1.asyncAssign( k_col, local_update1 ) ) { - assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); + utils::assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); if( ++assigns1 == maxAsyncAssigns1 ) { coors1.joinUpdate( local_update1 ); assigns1 = 0; @@ -1518,7 +1499,7 @@ namespace grb { #else (void)coors1.assign( k_col ); #endif - assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); + utils::assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); } #ifdef _H_GRB_REFERENCE_OMP_BLAS3 @@ -1537,7 +1518,7 @@ namespace grb { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 if( !coors2.asyncAssign( k_col, local_update2 ) ) { - assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); + utils::assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); if( ++assigns2 == maxAsyncAssigns2 ) { coors2.joinUpdate( local_update2 ); assigns2 = 0; @@ -1546,7 +1527,7 @@ namespace grb { #else (void)coors2.assign( k_col ); #endif - assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); + utils::assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); } #ifdef _H_GRB_REFERENCE_OMP_BLAS3 while( !coors2.joinUpdate( local_update2 )) {} @@ -1555,8 +1536,8 @@ namespace grb { for( size_t k = 0; k < coors1.nonzeroes(); ++k ) { const auto j = coors1.index( k ); - const auto A_val = getValue(vbuf1, j, identity_A); - const auto B_val = coors2.assigned(j) ? getValue(vbuf3, j, identity_B) : identity_B; + const auto A_val = utils::getValue(vbuf1, j, identity_A); + const auto B_val = coors2.assigned(j) ? utils::getValue(vbuf3, j, identity_B) : identity_B; OutputType result_value; (void)grb::apply( result_value, A_val, B_val, oper ); @@ -1584,8 +1565,8 @@ namespace grb { if( coors1.assigned(j) ) { // Intersection case: already handled continue; } - const auto A_val = coors1.assigned(j) ? getValue(vbuf1, j, identity_A) : identity_A; - const auto B_val = getValue(vbuf3, j, identity_B); + const auto A_val = coors1.assigned(j) ? 
utils::getValue(vbuf1, j, identity_A) : identity_A; + const auto B_val = utils::getValue(vbuf3, j, identity_B); OutputType result_value; (void)grb::apply( result_value, A_val, B_val, oper ); @@ -1753,8 +1734,14 @@ namespace grb { void >::type * const = nullptr ) { - typedef typename std::conditional::value, typename Operator::D1, InputType1>::type ActualInputType1; - typedef typename std::conditional::value, typename Operator::D2, InputType1>::type ActualInputType2; + typedef typename std::conditional< + std::is_void::value, + typename Operator::D1, + InputType1>::type ActualInputType1; + typedef typename std::conditional< + std::is_void::value, + typename Operator::D2, + InputType1>::type ActualInputType2; // static checks NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || std::is_same< typename Operator::D1, ActualInputType1 >::value ), @@ -1775,9 +1762,11 @@ namespace grb { "of the given multiplication operator" ); static_assert( - !std::is_void< OutputType >::value, - "grb::eWiseApply: the elementwise mxm cannot be used if the" - " output matrix is a pattern matrix (of type void)" + !std::is_void< OutputType >::value || + ( std::is_void< InputType1 >::value && std::is_void< InputType2 >::value ), + "grb::eWiseApply: the elementwise mxm only support" + " output pattern-matrix (of type void) if both" + " input matrices are also pattern matrices" ); #ifdef _DEBUG std::cout << "In grb::eWiseApply( reference, operator )\n"; diff --git a/include/graphblas/utils.hpp b/include/graphblas/utils.hpp index c5239afdc..a82aa3125 100644 --- a/include/graphblas/utils.hpp +++ b/include/graphblas/utils.hpp @@ -54,6 +54,24 @@ namespace grb { */ namespace utils { + template< typename D, typename T > + static void assignValue( + D *array, size_t i, const T& value, + typename std::enable_if< !std::is_void< D >::value >::type * const = nullptr + ) { array[i] = value; } + + template< typename T > + static void assignValue( void *, size_t, const T& ) { /* do nothing */ } + + template< typename D, typename T > + static T getValue( + const D *array, size_t i, const T&, + typename std::enable_if< !std::is_void< D >::value >::type * const = nullptr + ) { return array[i]; } + + template< typename T > + static T getValue( const void *, size_t, const T& identity ) { return identity; } + /** * Checks whether two values are equal. 
* From 35f3a9b202afb5f377c0891a216a55310f45ed60 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Mon, 26 Feb 2024 11:14:07 +0100 Subject: [PATCH 36/37] Coordinates assign bugfix --- include/graphblas/reference/blas3.hpp | 292 +++++--------------------- 1 file changed, 49 insertions(+), 243 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 70d7a3642..4167c076e 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1007,8 +1007,7 @@ namespace grb { if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel for simd default(none) \ - shared(CCS_raw) firstprivate(n) + #pragma omp parallel for simd #endif for( size_t j = 0; j <= n; ++j ) { CCS_raw.col_start[ j ] = 0; @@ -1047,22 +1046,15 @@ namespace grb { if( phase == EXECUTE ) { nzc = 0; // retrieve additional buffer - auto* const C_col_index = getReferenceBuffer< config::NonzeroIndexType >( n + 1 ); + config::NonzeroIndexType * const C_col_index = + getReferenceBuffer< config::NonzeroIndexType >( n + 1 ); if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel -#endif - { - size_t start = 0; - size_t end = n + 1; -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - config::OMP::localRange( start, end, 0, n + 1 ); - #pragma omp parallel for simd + #pragma omp parallel for simd #endif - for( size_t j = start; j < end; ++j ) { - C_col_index[ j ] = 0; - } + for( size_t j = 0; j < n+1; ++j ) { + C_col_index[ j ] = 0; } } @@ -1112,50 +1104,22 @@ namespace grb { CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { coors1.clear(); - -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel default(none) \ - shared(coors1, valbuf) \ - firstprivate(i, A_raw, dummy_identity) -#endif - { -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - auto local_update = coors1.EMPTY_UPDATE(); - const size_t maxAsyncAssigns = coors1.maxAsyncAssigns(); - size_t assigns = 0; - #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait -#endif - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; - -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - if( !coors1.asyncAssign( k_col, local_update ) ) { - utils::assignValue( valbuf, k_col , A_raw.getValue( k, dummy_identity ) ); - if( ++assigns == maxAsyncAssigns ) { - coors1.joinUpdate( local_update ); - assigns = 0; - } - } -#else - if( !coors1.assign( k_col ) ) { - utils::assignValue( valbuf, k_col, A_raw.getValue( k, dummy_identity ) ); - } -#endif + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const auto k_col = A_raw.row_index[ k ]; + if( !coors1.assign( k_col ) ) { + utils::assignValue( valbuf, k_col, A_raw.getValue( k, dummy_identity ) ); } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - while( !coors1.joinUpdate( local_update ) ) {} -#endif } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { - const size_t j = B_raw.row_index[ l ]; + const auto j = B_raw.row_index[ l ]; if( !coors1.assigned( j ) ) { // Union case: ignored continue; } const auto valbuf_value_before = valbuf[ j ]; OutputType result_value; - (void)grb::apply( result_value, valbuf_value_before, B_raw.getValue( l, dummy_identity ), oper ); + (void) grb::apply( result_value, valbuf_value_before, B_raw.getValue( l, dummy_identity ), oper ); // update CRS CRS_raw.row_index[ nzc ] = j; @@ -1170,51 +1134,17 @@ namespace grb { } // update count - (void)++nzc; + (void) ++nzc; } CRS_raw.col_start[ i + 1 ] = nzc; - - } - 
- -#ifdef _DEBUG - std::cout << "CRS_raw.col_start = [ "; - for( size_t j = 0; j <= m; ++j ) - std::cout << CRS_raw.col_start[ j ] << " "; - std::cout << "]\n"; - std::cout << "CRS_raw.row_index = [ "; - for( size_t j = 0; j < nzc; ++j ) - std::cout << CRS_raw.row_index[ j ] << " "; - std::cout << "]\n"; - std::cout << "CRS_raw.values = [ "; - for( size_t j = 0; j < nzc; ++j ) - std::cout << CRS_raw.values[ j ] << " "; - std::cout << "]\n"; - if( !crs_only ) { - std::cout << "C_col_index = [ "; - for( size_t j = 0; j < n; ++j ) - std::cout << C_col_index[ j ] << " "; - std::cout << "]\n"; - std::cout << "CCS_raw.col_start = [ "; - for( size_t j = 0; j <= n; ++j ) - std::cout << CCS_raw.col_start[ j ] << " "; - std::cout << "]\n"; - std::cout << "CCS_raw.row_index = [ "; - for( size_t j = 0; j < nzc; ++j ) - std::cout << CCS_raw.row_index[ j ] << " "; - std::cout << "]\n"; - std::cout << "CCS_raw.values = [ "; - for( size_t j = 0; j < nzc; ++j ) - std::cout << CCS_raw.values[ j ] << " "; - std::cout << "]\n"; } -#endif #ifndef NDEBUG if( !crs_only ) { - for( size_t j = 0; j < n; ++j ) + for( size_t j = 0; j < n; ++j ) { assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); + } } #endif @@ -1305,7 +1235,6 @@ namespace grb { auto &CRS_raw = internal::getCRS( C ); auto &CCS_raw = internal::getCCS( C ); - // retrieve buffers char *arr1 = nullptr, *arr3 = nullptr; char *buf1 = nullptr, *buf3 = nullptr; @@ -1325,7 +1254,7 @@ namespace grb { if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel for simd default(none) shared(CCS_raw) firstprivate(n) + #pragma omp parallel for simd #endif for( size_t j = 0; j < n + 1; ++j ) { CCS_raw.col_start[ j ] = 0; @@ -1339,60 +1268,18 @@ namespace grb { // symbolic phase if( phase == RESIZE ) { nzc = 0; - - -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel default(none) \ - shared(coors1, vbuf1, coors2, vbuf3) \ - firstprivate(A_raw, identity_A, B_raw, identity_B, m) \ - reduction(+:nzc) -#endif - { - for( size_t i = 0; i < m; ++i ) { -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp single -#endif - { - coors1.clear(); - } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp barrier -#endif -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - auto local_update1 = coors1.EMPTY_UPDATE(); - const size_t maxAsyncAssigns1 = coors1.maxAsyncAssigns(); - size_t assigns1 = 0; - #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait -#endif - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - if( !coors1.asyncAssign( k_col, local_update1 ) ) { - utils::assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); - if( ++assigns1 == maxAsyncAssigns1 ) { - coors1.joinUpdate( local_update1 ); - assigns1 = 0; - } - } -#else - (void)coors1.assign( k_col ); -#endif - (void)++nzc; + for( size_t i = 0; i < m; ++i ) { + coors1.clear(); + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const auto k_col = A_raw.row_index[ k ]; + if( !coors1.assign( k_col ) ) { + (void) ++nzc; } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - while( !coors1.joinUpdate( local_update1 ) ) {} -#endif - -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp barrier - - #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) -#endif - for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { - const size_t l_col = B_raw.row_index[ l ]; - if( !coors1.assigned( l_col ) ) { - 
(void)++nzc; - } + } + for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { + const size_t l_col = B_raw.row_index[ l ]; + if( !coors1.assigned( l_col ) ) { + (void) ++nzc; } } } @@ -1407,7 +1294,8 @@ namespace grb { // computational phase if( phase == EXECUTE ) { // retrieve additional buffer - auto* const C_col_index = getReferenceBuffer< config::NonzeroIndexType >( n + 1 ); + config::NonzeroIndexType * const C_col_index = + getReferenceBuffer< config::NonzeroIndexType >( n + 1 ); // perform column-wise nonzero count nzc = 0; @@ -1415,8 +1303,8 @@ namespace grb { coors1.clear(); for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; - (void)coors1.assign( k_col ); - (void)++nzc; + (void) coors1.assign( k_col ); + (void) ++nzc; if( !crs_only ) { (void) ++CCS_raw.col_start[ k_col + 1 ]; @@ -1425,9 +1313,9 @@ namespace grb { for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { const size_t l_col = B_raw.row_index[ l ]; if( !coors1.assigned( l_col ) ) { - (void)++nzc; + (void) ++nzc; if( !crs_only ) { - (void)++CCS_raw.col_start[ l_col + 1 ]; + (void) ++CCS_raw.col_start[ l_col + 1 ]; } } } @@ -1472,66 +1360,17 @@ namespace grb { CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { coors1.clear(); - coors2.clear(); - -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel default(none) \ - shared(coors1, vbuf1, coors2, vbuf3) \ - firstprivate(i, A_raw, identity_A, B_raw, identity_B ) -#endif - { -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - auto local_update1 = coors1.EMPTY_UPDATE(); - const size_t maxAsyncAssigns1 = coors1.maxAsyncAssigns(); - size_t assigns1 = 0; - #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait -#endif - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; - -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - if( !coors1.asyncAssign( k_col, local_update1 ) ) { - if( ++assigns1 == maxAsyncAssigns1 ) { - coors1.joinUpdate( local_update1 ); - assigns1 = 0; - } - } -#else - (void)coors1.assign( k_col ); -#endif - utils::assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); - } - -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - while( !coors1.joinUpdate( local_update1 )) {} -#endif - - -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - auto local_update2 = coors2.EMPTY_UPDATE(); - const size_t maxAsyncAssigns2 = coors2.maxAsyncAssigns(); - size_t assigns2 = 0; - #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait -#endif - for( size_t k = B_raw.col_start[ i ]; k < B_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = B_raw.row_index[ k ]; + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const auto k_col = A_raw.row_index[ k ]; + (void) coors1.assign( k_col ); + utils::assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); + } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - if( !coors2.asyncAssign( k_col, local_update2 ) ) { - utils::assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); - if( ++assigns2 == maxAsyncAssigns2 ) { - coors2.joinUpdate( local_update2 ); - assigns2 = 0; - } - } -#else - (void)coors2.assign( k_col ); -#endif - utils::assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); - } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - while( !coors2.joinUpdate( local_update2 )) {} -#endif + coors2.clear(); + for( size_t k = B_raw.col_start[ i ]; k < B_raw.col_start[ i + 1 ]; ++k ) { + const auto k_col = B_raw.row_index[ k ]; 
+ (void) coors2.assign( k_col ); + utils::assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); } for( size_t k = 0; k < coors1.nonzeroes(); ++k ) { @@ -1540,7 +1379,7 @@ namespace grb { const auto B_val = coors2.assigned(j) ? utils::getValue(vbuf3, j, identity_B) : identity_B; OutputType result_value; - (void)grb::apply( result_value, A_val, B_val, oper ); + (void) grb::apply( result_value, A_val, B_val, oper ); // update CRS CRS_raw.row_index[ nzc ] = j; @@ -1558,7 +1397,7 @@ namespace grb { CCS_raw.setValue( CCS_index, result_value ); } // update count - (void)++nzc; + (void) ++nzc; } for( size_t k = 0; k < coors2.nonzeroes(); ++k ) { const auto j = coors2.index( k ); @@ -1569,7 +1408,7 @@ namespace grb { const auto B_val = utils::getValue(vbuf3, j, identity_B); OutputType result_value; - (void)grb::apply( result_value, A_val, B_val, oper ); + (void) grb::apply( result_value, A_val, B_val, oper ); // update CRS CRS_raw.row_index[ nzc ] = j; @@ -1587,52 +1426,19 @@ namespace grb { CCS_raw.setValue( CCS_index, result_value ); } // update count - (void)++nzc; + (void) ++nzc; } CRS_raw.col_start[ i + 1 ] = nzc; } - if( !crs_only ) { -#ifdef _DEBUG - std::cout << "CRS_raw.col_start = [ "; - for( size_t j = 0; j <= m; ++j ) - std::cout << CRS_raw.col_start[ j ] << " "; - std::cout << "]\n"; - std::cout << "CRS_raw.row_index = [ "; - for( size_t j = 0; j < nzc; ++j ) - std::cout << CRS_raw.row_index[ j ] << " "; - std::cout << "]\n"; - std::cout << "CRS_raw.values = [ "; - for( size_t j = 0; j < nzc; ++j ) - std::cout << CRS_raw.values[ j ] << " "; - std::cout << "]\n"; - if( !crs_only ) { - std::cout << "C_col_index = [ "; - for( size_t j = 0; j < n; ++j ) - std::cout << C_col_index[ j ] << " "; - std::cout << "]\n"; - std::cout << "CCS_raw.col_start = [ "; - for( size_t j = 0; j <= n; ++j ) - std::cout << CCS_raw.col_start[ j ] << " "; - std::cout << "]\n"; - std::cout << "CCS_raw.row_index = [ "; - for( size_t j = 0; j < nzc; ++j ) - std::cout << CCS_raw.row_index[ j ] << " "; - std::cout << "]\n"; - std::cout << "CCS_raw.values = [ "; - for( size_t j = 0; j < nzc; ++j ) - std::cout << CCS_raw.values[ j ] << " "; - std::cout << "]\n"; - } -#endif - #ifndef NDEBUG + if( !crs_only ) { for( size_t j = 0; j < n; ++j ) { assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); } -#endif } +#endif // set final number of nonzeroes in output matrix #ifdef _DEBUG From 4eaaf9166e3f0563b340a5deaf4269474286c1cd Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Mon, 26 Feb 2024 11:18:06 +0100 Subject: [PATCH 37/37] Enabling little threadnig using two tasks --- include/graphblas/reference/blas3.hpp | 38 ++++++++++++++++++--------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 4167c076e..563241ca3 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1359,18 +1359,32 @@ namespace grb { nzc = 0; CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { - coors1.clear(); - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const auto k_col = A_raw.row_index[ k ]; - (void) coors1.assign( k_col ); - utils::assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); - } - - coors2.clear(); - for( size_t k = B_raw.col_start[ i ]; k < B_raw.col_start[ i + 1 ]; ++k ) { - const auto k_col = B_raw.row_index[ k ]; - (void) coors2.assign( k_col ); - utils::assignValue( vbuf3, k_col, B_raw.getValue( k, 
identity_B ) ); +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp parallel +#endif + { +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp single nowait +#endif + { + coors1.clear(); + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const auto k_col = A_raw.row_index[ k ]; + (void) coors1.assign( k_col ); + utils::assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); + } + } +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp single nowait +#endif + { + coors2.clear(); + for( size_t k = B_raw.col_start[ i ]; k < B_raw.col_start[ i + 1 ]; ++k ) { + const auto k_col = B_raw.row_index[ k ]; + (void) coors2.assign( k_col ); + utils::assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); + } + } } for( size_t k = 0; k < coors1.nonzeroes(); ++k ) {