From 06590c05a7db2b826a34d374a0e08065038a1e2a Mon Sep 17 00:00:00 2001
From: byjtew <benjamin.lozes@protonmail.com>
Date: Wed, 17 May 2023 14:59:26 +0200
Subject: [PATCH 01/63] base blas3::foldl + blas3::foldr API

---
 include/graphblas/base/blas3.hpp | 300 +++++++++++++++++++++++++++++++
 1 file changed, 300 insertions(+)
diff --git a/include/graphblas/base/blas3.hpp b/include/graphblas/base/blas3.hpp
index 425f7bc7a..bce17cb64 100644
--- a/include/graphblas/base/blas3.hpp
+++ b/include/graphblas/base/blas3.hpp
@@ -442,6 +442,306 @@ namespace grb {
 		return ret == SUCCESS ? UNSUPPORTED : ret;
 	}
 
+
+	/**
+	 * Reduces, or \em folds, a matrix into a scalar.
+	 *
+	 * Reduction takes place according a monoid \f$ (\oplus,1) \f$, where
+	 * \f$ \oplus:\ D_1 \times D_2 \to D_3 \f$ with associated identities
+	 * \f$ 1_k in D_k \f$. Usually, \f$ D_k \subseteq D_3, 1 \leq k < 3 \f$,
+	 * though other more exotic structures may be envisioned (and used).
+	 *
+	 * Let \f$ x_0 = 1 \f$ and let
+	 * \f$ x_{i+1} = \begin{cases}
+	 *   x_i \oplus y_i\text{ if }y_i\text{ is nonzero and }m_i\text{ evaluates true}
+	 *   x_i\text{ otherwise}
+	 * \end{cases},\f$
+	 * for all \f$ i \in \{ 0, 1, \ldots, n-1 \} \f$.
+	 *
+	 * \note Per this definition, the folding happens in a right-to-left direction.
+	 *       If another direction is wanted, which may have use in cases where
+	 *       \f$ D_1 \f$ differs from \f$ D_2 \f$, then either a monoid with those
+	 *       operator domains switched may be supplied, or #grb::foldr may be used
+	 *       instead.
+	 *
+	 * After a successfull call, \a x will be equal to \f$ x_n \f$.
+	 *
+	 * Note that the operator \f$ \oplus \f$ must be associative since it is part
+	 * of a monoid. This algebraic property is exploited when parallelising the
+	 * requested operation. The identity is required when parallelising over
+	 * multiple user processes.
+	 *
+	 * \warning In so doing, the order of the evaluation of the reduction operation
+	 *          should not be expected to be a serial, right-to-left, evaluation of
+	 *          the computation chain.
+	 *
+	 * @tparam descr     The descriptor to be used (descriptors::no_operation if
+	 *                   left unspecified).
+	 * @tparam Monoid    The monoid to use for reduction.
+	 * @tparam InputType The type of the elements in the supplied ALP/GraphBLAS
+	 *                   matrix \a y.
+	 * @tparam IOType    The type of the output scalar \a x.
+	 * @tparam MaskType  The type of the elements in the supplied ALP/GraphBLAS
+	 *                   matrix \a mask.
+	 *
+	 * @param[out]   x   The result of the reduction.
+	 * @param[in]    A   Any ALP/GraphBLAS matrix.
+	 * @param[in]  mask  Any ALP/GraphBLAS matrix.
+	 * @param[in] monoid The monoid under which to perform this reduction.
+	 *
+	 * @return grb::SUCCESS  When the call completed successfully.
+	 * @return grb::MISMATCH If a \a mask was not empty and does not have size
+	 *                       equal to \a y.
+	 * @return grb::ILLEGAL  If the provided input matrix \a y was not dense, while
+	 *                       #grb::descriptors::dense was given.
+	 *
+	 * @see grb::foldl provides similar in-place functionality.
+	 * @see grb::eWiseApply provides out-of-place semantics.
+	 *
+	 * \parblock
+	 * \par Valid descriptors
+	 * grb::descriptors::no_operation, grb::descriptors::no_casting,
+	 * grb::descriptors::dense, grb::descriptors::invert_mask,
+	 * grb::descriptors::structural, grb::descriptors::structural_complement
+	 *
+	 * \note Invalid descriptors will be ignored.
+	 *
+	 * If grb::descriptors::no_casting is given, then 1) the first domain of
+	 * \a monoid must match \a IOType, 2) the second domain of \a op must match
+	 * \a InputType, 3) the third domain must match \a IOType, and 4) the element type
+	 * of \a mask must be <tt>bool</tt>. If one of these is not true, the code
+	 * shall not compile.
+	 * \endparblock
+	 *
+	 * \par Performance semantics
+	 * Each backend must define performance semantics for this primitive.
+	 *
+	 * @see perfSemantics
+	 */
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Monoid,
+		typename InputType, typename IOType, typename MaskType,
+		Backend backend
+	>
+	RC foldr(
+		IOType &x,
+		const Matrix< InputType, backend > &A,
+		const Matrix< MaskType, backend > &mask,
+		const Monoid &monoid = Monoid(),
+		const typename std::enable_if< !grb::is_object< IOType >::value &&
+			!grb::is_object< InputType >::value &&
+			!grb::is_object< MaskType >::value &&
+			grb::is_monoid< Monoid >::value, void
+		>::type * const = nullptr
+	) {
+#ifndef NDEBUG
+		const bool should_not_call_base_scalar_masked_matrix_foldr = false;
+		assert( should_not_call_base_scalar_masked_matrix_foldr );
+#endif
+		(void) A;
+		(void) x;
+		(void) mask;
+		(void) monoid;
+		return UNSUPPORTED;
+	}
+
+	/**
+	 * Reduces, or \em folds, a matrix into a scalar. 
+	 * Right-to-left unmasked variant.
+	 * 
+	 * Please see the masked grb::foldr variant for a full description.
+	 * 
+	 * @tparam descr     The descriptor to be used (descriptors::no_operation if
+	 *                   left unspecified).
+	 * @tparam Operator  The operator to use for reduction.
+	 * @tparam InputType The type of the elements in the supplied ALP/GraphBLAS
+	 *                   matrix \a y.
+	 * @tparam IOType    The type of the output scalar \a x.
+	 *
+	 * @param[out]   x     The result of the reduction.
+	 * @param[in]    A     Any ALP/GraphBLAS matrix.
+	 * @param[in] operator The operator used for reduction.
+	 *
+	 * @return grb::SUCCESS  When the call completed successfully.
+	 * @return grb::ILLEGAL  If the provided input matrix \a y was not dense, while
+	 *                       #grb::descriptors::dense was given.
+	 */
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Operator,
+		typename InputType, typename IOType,
+		Backend backend
+	>
+	RC foldr(
+		IOType &x,
+		const Matrix< InputType, backend > &A,
+		const Operator &op,
+		const typename std::enable_if< !grb::is_object< IOType >::value &&
+			!grb::is_object< InputType >::value &&
+			grb::is_operator< Operator >::value, void
+		>::type * const = nullptr
+	) {
+#ifndef NDEBUG
+		const bool should_not_call_base_scalar_unmasked_matrix_foldr = false;
+		assert( should_not_call_base_scalar_unmasked_matrix_foldr );
+#endif
+		(void) A;
+		(void) x;
+		(void) op;
+		return UNSUPPORTED;
+	}
+
+
+	/**
+	 * Reduces, or \em folds, a matrix into a scalar.
+	 *
+	 * Reduction takes place according a monoid \f$ (\oplus,1) \f$, where
+	 * \f$ \oplus:\ D_1 \times D_2 \to D_3 \f$ with associated identities
+	 * \f$ 1_k in D_k \f$. Usually, \f$ D_k \subseteq D_3, 1 \leq k < 3 \f$,
+	 * though other more exotic structures may be envisioned (and used).
+	 *
+	 * Let \f$ x_0 = 1 \f$ and let
+	 * \f$ x_{i+1} = \begin{cases}
+	 *   x_i \oplus y_i\text{ if }y_i\text{ is nonzero and }m_i\text{ evaluates true}
+	 *   x_i\text{ otherwise}
+	 * \end{cases},\f$
+	 * for all \f$ i \in \{ 0, 1, \ldots, n-1 \} \f$.
+	 *
+	 * \note Per this definition, the folding happens in a left-to-right direction.
+	 *       If another direction is wanted, which may have use in cases where
+	 *       \f$ D_1 \f$ differs from \f$ D_2 \f$, then either a monoid with those
+	 *       operator domains switched may be supplied, or #grb::foldr may be used
+	 *       instead.
+	 *
+	 * After a successfull call, \a x will be equal to \f$ x_n \f$.
+	 *
+	 * Note that the operator \f$ \oplus \f$ must be associative since it is part
+	 * of a monoid. This algebraic property is exploited when parallelising the
+	 * requested operation. The identity is required when parallelising over
+	 * multiple user processes.
+	 *
+	 * \warning In so doing, the order of the evaluation of the reduction operation
+	 *          should not be expected to be a serial, left-to-right, evaluation of
+	 *          the computation chain.
+	 *
+	 * @tparam descr     The descriptor to be used (descriptors::no_operation if
+	 *                   left unspecified).
+	 * @tparam Monoid    The monoid to use for reduction.
+	 * @tparam InputType The type of the elements in the supplied ALP/GraphBLAS
+	 *                   matrix \a y.
+	 * @tparam IOType    The type of the output scalar \a x.
+	 * @tparam MaskType  The type of the elements in the supplied ALP/GraphBLAS
+	 *                   matrix \a mask.
+	 *
+	 * @param[out]   x   The result of the reduction.
+	 * @param[in]    A   Any ALP/GraphBLAS matrix.
+	 * @param[in]  mask  Any ALP/GraphBLAS matrix.
+	 * @param[in] monoid The monoid under which to perform this reduction.
+	 *
+	 * @return grb::SUCCESS  When the call completed successfully.
+	 * @return grb::MISMATCH If a \a mask was not empty and does not have size
+	 *                       equal to \a y.
+	 * @return grb::ILLEGAL  If the provided input matrix \a y was not dense, while
+	 *                       #grb::descriptors::dense was given.
+	 *
+	 * @see grb::foldr provides similar in-place functionality.
+	 * @see grb::eWiseApply provides out-of-place semantics.
+	 *
+	 * \parblock
+	 * \par Valid descriptors
+	 * grb::descriptors::no_operation, grb::descriptors::no_casting,
+	 * grb::descriptors::dense, grb::descriptors::invert_mask,
+	 * grb::descriptors::structural, grb::descriptors::structural_complement
+	 *
+	 * \note Invalid descriptors will be ignored.
+	 *
+	 * If grb::descriptors::no_casting is given, then 1) the first domain of
+	 * \a monoid must match \a InputType, 2) the second domain of \a op must match
+	 * \a IOType, 3) the third domain must match \a IOType, and 4) the element type
+	 * of \a mask must be <tt>bool</tt>. If one of these is not true, the code
+	 * shall not compile.
+	 * \endparblock
+	 *
+	 * \par Performance semantics
+	 * Each backend must define performance semantics for this primitive.
+	 *
+	 * @see perfSemantics
+	 */
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Monoid,
+		typename InputType, typename IOType, typename MaskType,
+		Backend backend
+	>
+	RC foldl(
+		IOType &x,
+		const Matrix< InputType, backend > &A,
+		const Matrix< MaskType, backend > &mask,
+		const Monoid &monoid = Monoid(),
+		const typename std::enable_if< !grb::is_object< IOType >::value &&
+			!grb::is_object< InputType >::value &&
+			!grb::is_object< MaskType >::value &&
+			grb::is_monoid< Monoid >::value, void
+		>::type * const = nullptr
+	) {
+#ifndef NDEBUG
+		const bool should_not_call_base_scalar_matrix_foldl = false;
+		assert( should_not_call_base_scalar_matrix_foldl );
+#endif
+		(void) A;
+		(void) x;
+		(void) mask;
+		(void) monoid;
+		return UNSUPPORTED;
+	}
+
+	/**
+	 * Reduces, or \em folds, a matrix into a scalar. 
+	 * Left-to-right unmasked variant.
+	 * 
+	 * Please see the masked grb::foldl variant for a full description.
+	 * 
+	 * @tparam descr     The descriptor to be used (descriptors::no_operation if
+	 *                   left unspecified).
+	 * @tparam Operator  The operator to use for reduction.
+	 * @tparam InputType The type of the elements in the supplied ALP/GraphBLAS
+	 *                   matrix \a y.
+	 * @tparam IOType    The type of the output scalar \a x.
+	 *
+	 * @param[out]   x     The result of the reduction.
+	 * @param[in]    A     Any ALP/GraphBLAS matrix.
+	 * @param[in] operator The operator used for reduction.
+	 *
+	 * @return grb::SUCCESS  When the call completed successfully.
+	 * @return grb::ILLEGAL  If the provided input matrix \a y was not dense, while
+	 *                       #grb::descriptors::dense was given.
+	 */
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Operator,
+		typename InputType, typename IOType,
+		Backend backend
+	>
+	RC foldl(
+		IOType &x,
+		const Matrix< InputType, backend > &A,
+		const Operator &op,
+		const typename std::enable_if< !grb::is_object< IOType >::value &&
+			!grb::is_object< InputType >::value &&
+			grb::is_operator< Operator >::value, void
+		>::type * const = nullptr
+	) {
+#ifndef NDEBUG
+		const bool should_not_call_base_scalar_unmasked_matrix_foldl = false;
+		assert( should_not_call_base_scalar_unmasked_matrix_foldl );
+#endif
+		(void) A;
+		(void) x;
+		(void) op;
+		return UNSUPPORTED;
+	}
+
 	/**
 	 * @}
 	 */

From 670e85eeb98c83ea63cd9f71c306a259a5393ef5 Mon Sep 17 00:00:00 2001
From: byjtew <benjamin.lozes@protonmail.com>
Date: Wed, 17 May 2023 15:03:13 +0200
Subject: [PATCH 02/63] foldl+foldr unit-test

---
 tests/unit/CMakeLists.txt            |   4 +
 tests/unit/fold_matrix_to_scalar.cpp | 284 +++++++++++++++++++++++++++
 tests/unit/unittests.sh              |   6 +
 3 files changed, 294 insertions(+)
 create mode 100644 tests/unit/fold_matrix_to_scalar.cpp

diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 16999fd42..2ee3de02e 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -121,6 +121,10 @@ add_grb_executables( matrixIterator matrixIterator.cpp
 	ADDITIONAL_LINK_LIBRARIES test_utils_headers
 )
 
+add_grb_executables( fold_matrix_to_scalar fold_matrix_to_scalar.cpp
+	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
+)
+
 add_grb_executables( doubleAssign doubleAssign.cpp
 	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
 )
diff --git a/tests/unit/fold_matrix_to_scalar.cpp b/tests/unit/fold_matrix_to_scalar.cpp
new file mode 100644
index 000000000..47106b361
--- /dev/null
+++ b/tests/unit/fold_matrix_to_scalar.cpp
@@ -0,0 +1,284 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Tests for the reduce( Matrix<D>, T, Operator<T,D,T> ) API call
+ *
+ * @author Benjamin Lozes
+ * @date 17/05/2023
+ *
+ * Tests whether the foldl and foldl API calls produce the expected results.
+ * 
+ * The test cases are focused on the following aspects:
+ *   * The types of the result, the matrix values and the operator
+ * 	 * The initial value of the reduction result
+ * 	 * The order of the operands (foldr, foldl)
+ */
+
+#include <iostream>
+#include <numeric>
+#include <sstream>
+#include <vector>
+
+#include <graphblas.hpp>
+
+using namespace grb;
+
+using nz_t = float;
+
+template< typename T, typename V, class Operator >
+RC foldl_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Operator & op ) {
+	T value = initial;
+	foldl( value, A, op );
+
+	std::cout << "foldl_test \"" << test_label << "\": ";
+	if( value == expected )
+		std::cout << "OK" << std::endl;
+	else
+		std::cerr << "Failed" << std::endl
+				  << test_description << std::endl
+				  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
+				  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
+				  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
+
+	return value == expected ? RC::SUCCESS : RC::FAILED;
+}
+
+template< typename T, typename V, class Operator >
+RC foldr_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Operator & op ) {
+	T value = initial;
+	foldr( value, A, op );
+
+	std::cout << "foldr_test \"" << test_label << "\": ";
+	if( value == expected )
+		std::cout << "OK" << std::endl;
+	else
+		std::cerr << "Failed" << std::endl
+				  << test_description << std::endl
+				  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
+				  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
+				  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
+
+	return value == expected ? RC::SUCCESS : RC::FAILED;
+}
+
+template< typename T, typename V, class Operator >
+RC foldLR_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Operator & op ) {
+	RC rc = foldl_test( test_label, test_description, A, initial, expected, op );
+	return rc ? rc : foldr_test( test_label, test_description, A, initial, expected, op );
+}
+
+void grb_program( const long & n, grb::RC & rc ) {
+	// Build an identity matrix
+	Matrix< nz_t > I( n, n );
+	std::vector< size_t > I_rows( n ), I_cols( n );
+	std::vector< nz_t > I_vals( n, 1 );
+	std::iota( I_rows.begin(), I_rows.end(), 0 );
+	std::iota( I_cols.begin(), I_cols.end(), 0 );
+	buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), n, PARALLEL );
+
+	/**    Test case 1:
+	 *  A simple additive reduction with the same types for the nnzs and the reduction result.
+	 *  * Initial value is 0
+	 *  * Expected result: n
+	 */
+	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)n, operators::add< nz_t >() );
+	if( rc )
+		return;
+
+	/**     Test case 2:
+	 *  A simple additive reduction with the same types for the nnzs and the reduction result.
+	 *  * Initial value is n
+	 *  * Expected result: 2*n
+	 */
+	rc = foldLR_test( "2", "A simple reduction(+) with the same types for the nnzs and the reduction result.", I, (nz_t)n, (nz_t)( 2 * n ), operators::add< nz_t >() );
+	if( rc )
+		return;
+
+	/**     Test case 3:
+	 *  A simple additive reduction with different types for the nnzs and the reduction result (size_t <- size_t + float).
+	 *  * Initial value is 0
+	 *  * Expected result: n
+	 */
+	rc = foldl_test(
+		"3", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n, operators::add< size_t, nz_t, size_t >() );
+	if( rc )
+		return;
+	rc = foldr_test(
+		"3", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n, operators::add< nz_t, size_t, size_t >() );
+	if( rc )
+		return;
+
+	/**     Test case 4:
+	 *  A simple additive reduction with different types for the nnzs and the reduction result (size_t <- size_t + float).
+	 *  * Initial value is n
+	 *  * Expected result: 2*n
+	 */
+	rc = foldl_test(
+		"4", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ), operators::add< size_t, nz_t, size_t >() );
+	if( rc )
+		return;
+	rc = foldr_test(
+		"4", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ), operators::add< nz_t, size_t, size_t >() );
+	if( rc )
+		return;
+
+	/**     Test case 5:
+	 * A simple multiplicative reduction with the same types for the nnzs and the reduction result.
+	 * * Initial value is 0
+	 * * Expected result: 0
+	 */
+	rc = foldLR_test( "5", "A simple reduction(*) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)0, operators::mul< nz_t >() );
+	if( rc )
+		return;
+
+	/**     Test case 6:
+	 * A simple multiplicative reduction with the same types for the nnzs and the reduction result.
+	 * * Initial value is 1
+	 * * Expected result: 1
+	 */
+	rc = foldLR_test( "6", "A simple reduction(*) with the same types for the nnzs and the reduction result.", I, (nz_t)1, (nz_t)1, operators::mul< nz_t >() );
+	if( rc )
+		return;
+
+	/**     Test case 7:
+	 * A simple multiplicative reduction with different types for the nnzs and the reduction result (size_t <- size_t * float).
+	 * * Initial value is 0
+	 * * Expected result: 0
+	 */
+	rc = foldl_test(
+		"7", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0, operators::mul< size_t, nz_t, size_t >() );
+	if( rc )
+		return;
+	rc = foldr_test(
+		"7", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0, operators::mul< nz_t, size_t, size_t >() );
+	if( rc )
+		return;
+
+	/**     Test case 8:
+	 * A simple multiplicative reduction with different types for the nnzs and the reduction result (size_t <- size_t * float).
+	 * * Initial value is 1
+	 * * Expected result: 1
+	 */
+	rc = foldl_test(
+		"8", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1, operators::mul< size_t, nz_t, size_t >() );
+	if( rc )
+		return;
+	rc = foldr_test(
+		"8", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1, operators::mul< nz_t, size_t, size_t >() );
+	if( rc )
+		return;
+
+	/**     Test case 9:
+	 * A simple binary equal reduction with different types for the nnzs and the reduction result (bool <- bool == float).
+	 * * Initial value is true
+	 * * Expected result: true
+	 */
+	rc = foldl_test(
+		"9", "A simple reduction(==) with different types for the nnzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true, operators::equal< bool, nz_t, bool >() );
+	if( rc )
+		return;
+	rc = foldr_test(
+		"9", "A simple reduction(==) with different types for the nnzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true, operators::equal< nz_t, bool, bool >() );
+	if( rc )
+		return;
+
+	/**     Test case 10:
+	 * A simple binary logical_or reduction with different types for the nnzs and the reduction result (bool <- bool || float).
+	 * * Initial value is false
+	 * * Expected result: true
+	 */
+	rc = foldl_test(
+		"10", "A simple reduction(||) with different types for the nnzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true, operators::logical_or< bool, nz_t, bool >() );
+	if( rc )
+		return;
+	rc = foldr_test(
+		"10", "A simple reduction(||) with different types for the nnzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true, operators::logical_or< nz_t, bool, bool >() );
+	if( rc )
+		return;
+
+	/**     Test case 11:  Non-commutative reduction
+	 * A simple substraction reduction with the same types for the nnzs and the reduction result.
+	 * * Initial value is for foldl is 0
+	 * * Expected result for foldl: -n
+	 * 
+	 * * Initial value is for foldr is 0
+	 * * Expected result for foldr: 0
+	 * 
+	 * * Initial value is for foldr is 1
+	 * * Expected result for foldr: 1
+	 */
+	rc = foldl_test( "11", "A non-commutative reduction(-) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)( -n ), operators::subtract< nz_t >() );
+	if( rc )
+		return;
+	rc = foldr_test( "11", "A non-commutative reduction(-) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)0, operators::subtract< nz_t >() );
+	if( rc )
+		return;
+	rc = foldr_test( "11", "A non-commutative reduction(-) with the same types for the nnzs and the reduction result.", I, (nz_t)1, (nz_t)1, operators::subtract< nz_t >() );
+	if( rc )
+		return;
+	
+}
+
+int main( int argc, char ** argv ) {
+	// defaults
+	bool printUsage = false;
+	size_t in = 10;
+
+	// error checking
+	if( argc > 2 ) {
+		printUsage = true;
+	}
+	if( argc == 2 ) {
+		size_t read;
+		std::istringstream ss( argv[ 1 ] );
+		if( ! ( ss >> read ) ) {
+			std::cerr << "Error parsing first argument\n";
+			printUsage = true;
+		} else if( ! ss.eof() ) {
+			std::cerr << "Error parsing first argument\n";
+			printUsage = true;
+		} else if( read % 2 != 0 ) {
+			std::cerr << "Given value for n is odd\n";
+			printUsage = true;
+		} else {
+			// all OK
+			in = read;
+		}
+	}
+	if( printUsage ) {
+		std::cerr << "Usage: " << argv[ 0 ] << " [n]\n";
+		std::cerr << "  -n (optional, default is 10): an even integer, the test "
+				  << "size.\n";
+		return 1;
+	}
+
+	std::cout << "This is functional test " << argv[ 0 ] << "\n";
+	grb::Launcher< AUTOMATIC > launcher;
+	grb::RC out = RC::SUCCESS;
+	if( launcher.exec( &grb_program, (long)in, out, true ) != SUCCESS ) {
+		std::cerr << "Launching test FAILED\n";
+		return 255;
+	}
+	if( out != SUCCESS ) {
+		std::cout << "Test FAILED (" << grb::toString( out ) << ")" << std::endl;
+		return out;
+	} else {
+		std::cout << "Test OK" << std::endl;
+		return 0;
+	}
+}
diff --git a/tests/unit/unittests.sh b/tests/unit/unittests.sh
index 3817164c8..8aa42d597 100755
--- a/tests/unit/unittests.sh
+++ b/tests/unit/unittests.sh
@@ -390,6 +390,12 @@ for MODE in ${MODES}; do
 				echo "Test OK" ${TEST_OUT_DIR}/matrixSet_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED"
 				echo " "
 
+				echo ">>>      [x]           [ ]       Testing grb::foldl+r (scalar, matrix, [mask], monoid)"
+				$runner ${TEST_BIN_DIR}/fold_matrix_to_scalar_${MODE}_${BACKEND} 2> ${TEST_OUT_DIR}/fold_matrix_to_scalar_${MODE}_${BACKEND}_${P}_${T}.err 1> ${TEST_OUT_DIR}/fold_matrix_to_scalar_${MODE}_${BACKEND}_${P}_${T}.log
+				head -1 ${TEST_OUT_DIR}/fold_matrix_to_scalar_${MODE}_${BACKEND}_${P}_${T}.log
+				echo "Test OK" ${TEST_OUT_DIR}/fold_matrix_to_scalar_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED"
+				echo " "
+
 				echo ">>>      [x]           [ ]       Tests the \`level-0' grb::collectives"
 				echo "Functional test executable: ${TEST_BIN_DIR}/collectives_blas0_${MODE}_${BACKEND}"
 				$runner ${TEST_BIN_DIR}/collectives_blas0_${MODE}_${BACKEND} ${P} &> ${TEST_OUT_DIR}/collectives_blas0_${MODE}_${BACKEND}_${P}_${T}.log

From 0d3fd9dee6f5e777b0faa48055f4d62da3d7f085 Mon Sep 17 00:00:00 2001
From: byjtew <benjamin.lozes@protonmail.com>
Date: Wed, 17 May 2023 15:03:21 +0200
Subject: [PATCH 03/63] Unmasked foldl+foldr implementations in reference

---
 include/graphblas/reference/blas3.hpp | 284 ++++++++++++++++++++++++++
 1 file changed, 284 insertions(+)

diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp
index f3f918734..6e4406fff 100644
--- a/include/graphblas/reference/blas3.hpp
+++ b/include/graphblas/reference/blas3.hpp
@@ -918,6 +918,97 @@ namespace grb {
 
 	namespace internal {
 
+		template<
+			Descriptor descr = descriptors::no_operation,
+			class Operator,
+			typename InputType, typename IOType
+		>
+		RC foldl_unmasked_generic(
+			IOType &x,
+			const Matrix< InputType, reference > &A,
+			const Operator &op
+		) {
+#ifdef _DEBUG
+			std::cout << "In grb::internal::foldl_unmasked_generic\n";
+#endif
+			RC rc = SUCCESS;
+
+			const typename grb::Monoid<
+				grb::operators::mul< double >,
+				grb::identities::one
+			> dummyMonoid;
+			const auto identity = dummyMonoid.template getIdentity< typename Operator::D1 >();
+
+			const auto &A_raw = internal::getCRS( A );
+
+			const size_t m = nrows( A );
+			for( size_t i = 0; i < m; ++i ) {
+				const size_t k_begin = A_raw.col_start[ i ];
+				const size_t k_end = A_raw.col_start[ i + 1 ];
+				for( size_t k = k_begin; k < k_end; ++k ) {
+					const InputType a = A_raw.getValue( k, identity);
+#ifdef _DEBUG
+					std::cout << "A( " << i << ", " << k << " ) = " << a << std::endl;
+#endif
+#ifdef _DEBUG
+					std::cout << "Computing: x = op(" << x << ", " << a << ")";
+#endif
+					rc = rc ? rc : grb::foldl( x, a, op );
+#ifdef _DEBUG
+					std::cout << " = " << x << std::endl;
+#endif
+				}
+			}
+		
+			return rc;
+		}
+
+		template<
+			Descriptor descr = descriptors::no_operation,
+			class Operator,
+			typename InputType, typename IOType
+		>
+		RC foldr_unmasked_generic(
+			IOType &x,
+			const Matrix< InputType, reference > &A,
+			const Operator &op
+		) {
+#ifdef _DEBUG
+			std::cout << "In grb::internal::foldr_unmasked_generic\n";
+#endif
+			RC rc = SUCCESS;
+
+			const typename grb::Monoid<
+				grb::operators::mul< double >,
+				grb::identities::one
+			> dummyMonoid;
+			const auto identity = dummyMonoid.template getIdentity< typename Operator::D1 >();
+
+			const auto &A_raw = internal::getCRS( A );
+
+			const size_t m = nrows( A );
+			for( size_t i = 0; i < m; ++i ) {
+				const size_t k_begin = A_raw.col_start[ i ];
+				const size_t k_end = A_raw.col_start[ i + 1 ];
+				for( size_t k = k_begin; k < k_end; ++k ) {
+					const InputType a = A_raw.getValue( k, identity);
+#ifdef _DEBUG
+					std::cout << "A( " << i << ", " << k << " ) = " << a << std::endl;
+#endif
+#ifdef _DEBUG
+					std::cout << "Computing: x = op(" << x << ", " << a << ")";
+#endif
+					rc = rc ? rc : grb::foldr( a, x, op );
+#ifdef _DEBUG
+					std::cout << " = " << x << std::endl;
+#endif
+				}
+			}
+
+			return rc;
+		}
+
+
 		/**
 		 * \internal general elementwise matrix application that all eWiseApply
 		 *           variants refer to.
@@ -1326,6 +1417,199 @@ namespace grb {
 		);
 	}
 
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Monoid,
+		typename InputType, typename IOType, typename MaskType
+	>
+	RC foldr(
+		IOType &x,
+		const Matrix< InputType, reference > &A,
+		const Matrix< MaskType, reference > &mask,
+		const Monoid &monoid = Monoid(),
+		const typename std::enable_if< !grb::is_object< IOType >::value &&
+			!grb::is_object< InputType >::value &&
+			!grb::is_object< MaskType >::value &&
+			grb::is_monoid< Monoid >::value, void
+		>::type * const = nullptr
+	) {
+		// static checks
+		static_assert( !std::is_same< InputType, void >::value,
+			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
+			"the operator version of foldr cannot be used if the "
+			"input matrix is a pattern matrix (of type void)"
+		);
+		static_assert( !std::is_same< IOType, void >::value,
+			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
+			"the operator version of foldr cannot be used if the "
+			"result is of type void"
+		);
+		static_assert( (std::is_same< typename Monoid::D1, IOType >::value),
+			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
+			"called with a prefactor input type that does not match the first domain of the given operator"
+		);
+		static_assert( (std::is_same< typename Monoid::D2, InputType >::value),
+			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
+			"called with a postfactor input type that does not match the first domain of the given operator"
+		);
+		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
+			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
+			"called with an output type that does not match the output domain of the given operator"
+		);
+
+#ifdef _DEBUG
+		std::cout << "In grb::foldr (reference,  mask, matrix, monoid)\n";
+#endif
+		// TODO: implement foldr with mask
+
+		return UNSUPPORTED;
+	}
+
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Operator,
+		typename InputType, typename IOType	
+	>
+	RC foldr(
+		IOType &x,
+		const Matrix< InputType, reference > &A,
+		const Operator &op,
+		const typename std::enable_if< !grb::is_object< IOType >::value &&
+			!grb::is_object< InputType >::value &&
+			grb::is_operator< Operator >::value, void
+		>::type * const = nullptr
+	) {
+		// static checks
+		static_assert( !std::is_same< InputType, void >::value,
+			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
+			"the operator version of foldr cannot be used if the "
+			"input matrix is a pattern matrix (of type void)"
+		);
+		static_assert( !std::is_same< IOType, void >::value,
+			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
+			"the operator version of foldr cannot be used if the "
+			"result is of type void"
+		);
+		static_assert( (std::is_same< typename Operator::D1, InputType >::value),
+			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
+			"called with a prefactor input type that does not match the first domain of the given operator"
+		);
+		static_assert( (std::is_same< typename Operator::D2, IOType >::value),
+			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
+			"called with a postfactor input type that does not match the first domain of the given operator"
+		);
+		static_assert( (std::is_same< typename Operator::D3, IOType >::value),
+			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
+			"called with an output type that does not match the output domain of the given operator"
+		);
+
+#ifdef _DEBUG
+		std::cout << "In grb::foldr (reference, matrix, op)\n";
+#endif
+
+		return internal::foldr_unmasked_generic(
+			x, A, op
+		);
+	}
+
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Monoid,
+		typename InputType, typename IOType, typename MaskType
+	>
+	RC foldl(
+		IOType &x,
+		const Matrix< InputType, reference > &A,
+		const Vector< MaskType, reference > &mask,
+		const Monoid &monoid,
+		const typename std::enable_if< 
+			!grb::is_object< IOType >::value &&
+			!grb::is_object< InputType >::value &&
+			!grb::is_object< MaskType >::value &&
+			grb::is_monoid< Monoid >::value, void
+		>::type * const = nullptr
+	) {
+		// static checks
+		static_assert( !std::is_same< InputType, void >::value,
+			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
+			"the operator version of foldl cannot be used if the "
+			"input matrix is a pattern matrix (of type void)"
+		);
+		static_assert( !std::is_same< IOType, void >::value,
+			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
+			"the operator version of foldl cannot be used if the "
+			"result is of type void"
+		);
+		static_assert( (std::is_same< typename Monoid::D1, IOType >::value),
+			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
+			"called with a prefactor input type that does not match the first domain of the given operator"
+		);
+		static_assert( (std::is_same< typename Monoid::D2, InputType >::value),
+			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
+			"called with a postfactor input type that does not match the first domain of the given operator"
+		);
+		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
+			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
+			"called with an output type that does not match the output domain of the given operator"
+		);
+
+#ifdef _DEBUG
+		std::cout << "In grb::foldl (reference, mask, matrix, monoid)\n";
+#endif
+
+		// TODO: implement foldl with mask
+
+		return UNSUPPORTED;
+	}
+
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Operator,
+		typename InputType, typename IOType	
+	>
+	RC foldl(
+		IOType &x,
+		const Matrix< InputType, reference > &A,
+		const Operator &op,
+		const typename std::enable_if< !grb::is_object< IOType >::value &&
+			!grb::is_object< InputType >::value &&
+			grb::is_operator< Operator >::value, void
+		>::type * const = nullptr
+	) {
+		// static checks
+		static_assert( !std::is_same< InputType, void >::value,
+			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
+			"the operator version of foldl cannot be used if the "
+			"input matrix is a pattern matrix (of type void)"
+		);
+		static_assert( !std::is_same< IOType, void >::value,
+			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
+			"the operator version of foldl cannot be used if the "
+			"result is of type void"
+		);
+		static_assert( (std::is_same< typename Operator::D1, IOType >::value),
+			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
+			"called with a prefactor input type that does not match the first domain of the given operator"
+		);
+		static_assert( (std::is_same< typename Operator::D2, InputType >::value),
+			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
+			"called with a postfactor input type that does not match the first domain of the given operator"
+		);
+		static_assert( (std::is_same< typename Operator::D3, IOType >::value),
+			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
+			"called with an output type that does not match the output domain of the given operator"
+		);
+
+#ifdef _DEBUG
+		std::cout << "In grb::foldl (reference, matrix, op)\n";
+#endif
+
+		return internal::foldl_unmasked_generic(
+			x, A, op
+		);
+	}
+
+
 } // namespace grb
 
 #undef NO_CAST_ASSERT

From 14a380b3aa58ddb6cb3923104d47a4294b06d8f4 Mon Sep 17 00:00:00 2001
From: byjtew <benjamin.lozes@protonmail.com>
Date: Wed, 17 May 2023 17:08:54 +0200
Subject: [PATCH 04/63] parserr bugfix in matrixReduce unit-test

---
 tests/unit/fold_matrix_to_scalar.cpp | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/tests/unit/fold_matrix_to_scalar.cpp b/tests/unit/fold_matrix_to_scalar.cpp
index 47106b361..2aca39581 100644
--- a/tests/unit/fold_matrix_to_scalar.cpp
+++ b/tests/unit/fold_matrix_to_scalar.cpp
@@ -244,21 +244,7 @@ int main( int argc, char ** argv ) {
 		printUsage = true;
 	}
 	if( argc == 2 ) {
-		size_t read;
-		std::istringstream ss( argv[ 1 ] );
-		if( ! ( ss >> read ) ) {
-			std::cerr << "Error parsing first argument\n";
-			printUsage = true;
-		} else if( ! ss.eof() ) {
-			std::cerr << "Error parsing first argument\n";
-			printUsage = true;
-		} else if( read % 2 != 0 ) {
-			std::cerr << "Given value for n is odd\n";
-			printUsage = true;
-		} else {
-			// all OK
-			in = read;
-		}
+		in = std::atol( argv[ 1 ] );
 	}
 	if( printUsage ) {
 		std::cerr << "Usage: " << argv[ 0 ] << " [n]\n";

From 3904c71ccf62078216684d4ae2b726f2c920d8b4 Mon Sep 17 00:00:00 2001
From: byjtew <benjamin.lozes@protonmail.com>
Date: Wed, 17 May 2023 17:09:34 +0200
Subject: [PATCH 05/63] Adapt matrixReduce test for OMP foldl+foldr

---
 tests/unit/fold_matrix_to_scalar.cpp | 121 +++++++++++++--------------
 1 file changed, 60 insertions(+), 61 deletions(-)

diff --git a/tests/unit/fold_matrix_to_scalar.cpp b/tests/unit/fold_matrix_to_scalar.cpp
index 2aca39581..f05ccfd0e 100644
--- a/tests/unit/fold_matrix_to_scalar.cpp
+++ b/tests/unit/fold_matrix_to_scalar.cpp
@@ -22,13 +22,14 @@
  * @date 17/05/2023
  *
  * Tests whether the foldl and foldl API calls produce the expected results.
- * 
+ *
  * The test cases are focused on the following aspects:
  *   * The types of the result, the matrix values and the operator
  * 	 * The initial value of the reduction result
  * 	 * The order of the operands (foldr, foldl)
  */
 
+#include <chrono>
 #include <iostream>
 #include <numeric>
 #include <sstream>
@@ -38,12 +39,24 @@
 
 using namespace grb;
 
+constexpr bool PRINT_TIMERS = false;
+constexpr bool SKIP_FOLDL = false;
+constexpr bool SKIP_FOLDR = false;
+
 using nz_t = float;
 
-template< typename T, typename V, class Operator >
-RC foldl_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Operator & op ) {
+template< typename T, typename V, class Monoid >
+RC foldl_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Monoid & monoid ) {
+	if( SKIP_FOLDL )
+		return RC::SUCCESS;
+
 	T value = initial;
-	foldl( value, A, op );
+	auto start_chrono = std::chrono::high_resolution_clock::now();
+	foldl( value, A, monoid );
+	auto end_chrono = std::chrono::high_resolution_clock::now();
+	auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
+	if( PRINT_TIMERS )
+		std::cout << "foldl_test \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
 
 	std::cout << "foldl_test \"" << test_label << "\": ";
 	if( value == expected )
@@ -58,10 +71,18 @@ RC foldl_test( const char * test_label, const char * test_description, const grb
 	return value == expected ? RC::SUCCESS : RC::FAILED;
 }
 
-template< typename T, typename V, class Operator >
-RC foldr_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Operator & op ) {
+template< typename T, typename V, class Monoid >
+RC foldr_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Monoid & monoid ) {
+	if( SKIP_FOLDR )
+		return RC::SUCCESS;
+
 	T value = initial;
-	foldr( value, A, op );
+	auto start_chrono = std::chrono::high_resolution_clock::now();
+	foldr( value, A, monoid );
+	auto end_chrono = std::chrono::high_resolution_clock::now();
+	auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
+	if( PRINT_TIMERS )
+		std::cout << "foldr_test \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
 
 	std::cout << "foldr_test \"" << test_label << "\": ";
 	if( value == expected )
@@ -76,10 +97,10 @@ RC foldr_test( const char * test_label, const char * test_description, const grb
 	return value == expected ? RC::SUCCESS : RC::FAILED;
 }
 
-template< typename T, typename V, class Operator >
-RC foldLR_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Operator & op ) {
-	RC rc = foldl_test( test_label, test_description, A, initial, expected, op );
-	return rc ? rc : foldr_test( test_label, test_description, A, initial, expected, op );
+template< typename T, typename V, class Monoid >
+RC foldLR_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Monoid & monoid ) {
+	RC rc = foldl_test( test_label, test_description, A, initial, expected, monoid );
+	return rc ? rc : foldr_test( test_label, test_description, A, initial, expected, monoid );
 }
 
 void grb_program( const long & n, grb::RC & rc ) {
@@ -96,7 +117,7 @@ void grb_program( const long & n, grb::RC & rc ) {
 	 *  * Initial value is 0
 	 *  * Expected result: n
 	 */
-	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)n, operators::add< nz_t >() );
+	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)n, Monoid< operators::add< nz_t >, identities::zero >() );
 	if( rc )
 		return;
 
@@ -105,7 +126,7 @@ void grb_program( const long & n, grb::RC & rc ) {
 	 *  * Initial value is n
 	 *  * Expected result: 2*n
 	 */
-	rc = foldLR_test( "2", "A simple reduction(+) with the same types for the nnzs and the reduction result.", I, (nz_t)n, (nz_t)( 2 * n ), operators::add< nz_t >() );
+	rc = foldLR_test( "2", "A simple reduction(+) with the same types for the nnzs and the reduction result.", I, (nz_t)n, (nz_t)( 2 * n ), Monoid< operators::add< nz_t >, identities::zero >() );
 	if( rc )
 		return;
 
@@ -114,12 +135,12 @@ void grb_program( const long & n, grb::RC & rc ) {
 	 *  * Initial value is 0
 	 *  * Expected result: n
 	 */
-	rc = foldl_test(
-		"3", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n, operators::add< size_t, nz_t, size_t >() );
+	rc = foldl_test( "3", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n,
+		Monoid< operators::add< size_t, nz_t, size_t >, identities::zero >() );
 	if( rc )
 		return;
-	rc = foldr_test(
-		"3", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n, operators::add< nz_t, size_t, size_t >() );
+	rc = foldr_test( "3", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n,
+		Monoid< operators::add< nz_t, size_t, size_t >, identities::zero >() );
 	if( rc )
 		return;
 
@@ -128,12 +149,12 @@ void grb_program( const long & n, grb::RC & rc ) {
 	 *  * Initial value is n
 	 *  * Expected result: 2*n
 	 */
-	rc = foldl_test(
-		"4", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ), operators::add< size_t, nz_t, size_t >() );
+	rc = foldl_test( "4", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ),
+		Monoid< operators::add< size_t, nz_t, size_t >, identities::zero >() );
 	if( rc )
 		return;
-	rc = foldr_test(
-		"4", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ), operators::add< nz_t, size_t, size_t >() );
+	rc = foldr_test( "4", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ),
+		Monoid< operators::add< nz_t, size_t, size_t >, identities::zero >() );
 	if( rc )
 		return;
 
@@ -142,7 +163,7 @@ void grb_program( const long & n, grb::RC & rc ) {
 	 * * Initial value is 0
 	 * * Expected result: 0
 	 */
-	rc = foldLR_test( "5", "A simple reduction(*) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)0, operators::mul< nz_t >() );
+	rc = foldLR_test( "5", "A simple reduction(*) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)0, Monoid< operators::mul< nz_t >, identities::one >() );
 	if( rc )
 		return;
 
@@ -151,7 +172,7 @@ void grb_program( const long & n, grb::RC & rc ) {
 	 * * Initial value is 1
 	 * * Expected result: 1
 	 */
-	rc = foldLR_test( "6", "A simple reduction(*) with the same types for the nnzs and the reduction result.", I, (nz_t)1, (nz_t)1, operators::mul< nz_t >() );
+	rc = foldLR_test( "6", "A simple reduction(*) with the same types for the nnzs and the reduction result.", I, (nz_t)1, (nz_t)1, Monoid< operators::mul< nz_t >, identities::one >() );
 	if( rc )
 		return;
 
@@ -160,12 +181,12 @@ void grb_program( const long & n, grb::RC & rc ) {
 	 * * Initial value is 0
 	 * * Expected result: 0
 	 */
-	rc = foldl_test(
-		"7", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0, operators::mul< size_t, nz_t, size_t >() );
+	rc = foldl_test( "7", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0,
+		Monoid< operators::mul< size_t, nz_t, size_t >, identities::one >() );
 	if( rc )
 		return;
-	rc = foldr_test(
-		"7", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0, operators::mul< nz_t, size_t, size_t >() );
+	rc = foldr_test( "7", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0,
+		Monoid< operators::mul< nz_t, size_t, size_t >, identities::one >() );
 	if( rc )
 		return;
 
@@ -174,12 +195,12 @@ void grb_program( const long & n, grb::RC & rc ) {
 	 * * Initial value is 1
 	 * * Expected result: 1
 	 */
-	rc = foldl_test(
-		"8", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1, operators::mul< size_t, nz_t, size_t >() );
+	rc = foldl_test( "8", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1,
+		Monoid< operators::mul< size_t, nz_t, size_t >, identities::one >() );
 	if( rc )
 		return;
-	rc = foldr_test(
-		"8", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1, operators::mul< nz_t, size_t, size_t >() );
+	rc = foldr_test( "8", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1,
+		Monoid< operators::mul< nz_t, size_t, size_t >, identities::one >() );
 	if( rc )
 		return;
 
@@ -188,12 +209,12 @@ void grb_program( const long & n, grb::RC & rc ) {
 	 * * Initial value is true
 	 * * Expected result: true
 	 */
-	rc = foldl_test(
-		"9", "A simple reduction(==) with different types for the nnzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true, operators::equal< bool, nz_t, bool >() );
+	rc = foldl_test( "9", "A simple reduction(==) with different types for the nnzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true,
+		Monoid< operators::equal< bool, nz_t, bool >, identities::logical_true >() );
 	if( rc )
 		return;
-	rc = foldr_test(
-		"9", "A simple reduction(==) with different types for the nnzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true, operators::equal< nz_t, bool, bool >() );
+	rc = foldr_test( "9", "A simple reduction(==) with different types for the nnzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true,
+		Monoid< operators::equal< nz_t, bool, bool >, identities::logical_true >() );
 	if( rc )
 		return;
 
@@ -202,36 +223,14 @@ void grb_program( const long & n, grb::RC & rc ) {
 	 * * Initial value is false
 	 * * Expected result: true
 	 */
-	rc = foldl_test(
-		"10", "A simple reduction(||) with different types for the nnzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true, operators::logical_or< bool, nz_t, bool >() );
-	if( rc )
-		return;
-	rc = foldr_test(
-		"10", "A simple reduction(||) with different types for the nnzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true, operators::logical_or< nz_t, bool, bool >() );
-	if( rc )
-		return;
-
-	/**     Test case 11:  Non-commutative reduction
-	 * A simple substraction reduction with the same types for the nnzs and the reduction result.
-	 * * Initial value is for foldl is 0
-	 * * Expected result for foldl: -n
-	 * 
-	 * * Initial value is for foldr is 0
-	 * * Expected result for foldr: 0
-	 * 
-	 * * Initial value is for foldr is 1
-	 * * Expected result for foldr: 1
-	 */
-	rc = foldl_test( "11", "A non-commutative reduction(-) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)( -n ), operators::subtract< nz_t >() );
-	if( rc )
-		return;
-	rc = foldr_test( "11", "A non-commutative reduction(-) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)0, operators::subtract< nz_t >() );
+	rc = foldl_test( "10", "A simple reduction(||) with different types for the nnzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true,
+		Monoid< operators::logical_or< bool, nz_t, bool >, identities::logical_false >() );
 	if( rc )
 		return;
-	rc = foldr_test( "11", "A non-commutative reduction(-) with the same types for the nnzs and the reduction result.", I, (nz_t)1, (nz_t)1, operators::subtract< nz_t >() );
+	rc = foldr_test( "10", "A simple reduction(||) with different types for the nnzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true,
+		Monoid< operators::logical_or< nz_t, bool, bool >, identities::logical_false >() );
 	if( rc )
 		return;
-	
 }
 
 int main( int argc, char ** argv ) {

From 5c1f5599224311ee9163d697f272d71ee6088da1 Mon Sep 17 00:00:00 2001
From: byjtew <benjamin.lozes@protonmail.com>
Date: Wed, 17 May 2023 17:10:44 +0200
Subject: [PATCH 06/63] Adapt base foldl+r signature: operator -> monoid

---
 include/graphblas/base/blas3.hpp | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/include/graphblas/base/blas3.hpp b/include/graphblas/base/blas3.hpp
index bce17cb64..371c0987a 100644
--- a/include/graphblas/base/blas3.hpp
+++ b/include/graphblas/base/blas3.hpp
@@ -569,17 +569,17 @@ namespace grb {
 	 */
 	template<
 		Descriptor descr = descriptors::no_operation,
-		class Operator,
+		class Monoid,
 		typename InputType, typename IOType,
 		Backend backend
 	>
 	RC foldr(
 		IOType &x,
 		const Matrix< InputType, backend > &A,
-		const Operator &op,
+		const Monoid &monoid,
 		const typename std::enable_if< !grb::is_object< IOType >::value &&
 			!grb::is_object< InputType >::value &&
-			grb::is_operator< Operator >::value, void
+			grb::is_monoid< Monoid >::value, void
 		>::type * const = nullptr
 	) {
 #ifndef NDEBUG
@@ -588,7 +588,7 @@ namespace grb {
 #endif
 		(void) A;
 		(void) x;
-		(void) op;
+		(void) monoid;
 		return UNSUPPORTED;
 	}
 
@@ -719,17 +719,18 @@ namespace grb {
 	 */
 	template<
 		Descriptor descr = descriptors::no_operation,
-		class Operator,
+		class Monoid,
 		typename InputType, typename IOType,
 		Backend backend
 	>
 	RC foldl(
 		IOType &x,
 		const Matrix< InputType, backend > &A,
-		const Operator &op,
-		const typename std::enable_if< !grb::is_object< IOType >::value &&
+		const Monoid &monoid,
+		const typename std::enable_if< 
+			!grb::is_object< IOType >::value &&
 			!grb::is_object< InputType >::value &&
-			grb::is_operator< Operator >::value, void
+			grb::is_monoid< Monoid >::value, void
 		>::type * const = nullptr
 	) {
 #ifndef NDEBUG
@@ -738,7 +739,7 @@ namespace grb {
 #endif
 		(void) A;
 		(void) x;
-		(void) op;
+		(void) monoid;
 		return UNSUPPORTED;
 	}
 

From 1cf4e29c33e1739123102f64c6ceb4384bf59399 Mon Sep 17 00:00:00 2001
From: byjtew <benjamin.lozes@protonmail.com>
Date: Wed, 17 May 2023 17:11:06 +0200
Subject: [PATCH 07/63] reference_omp version of unmasked foldl+r

---
 include/graphblas/reference/blas3.hpp | 170 ++++++++++++++++++--------
 1 file changed, 118 insertions(+), 52 deletions(-)

diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp
index 6e4406fff..c738ef885 100644
--- a/include/graphblas/reference/blas3.hpp
+++ b/include/graphblas/reference/blas3.hpp
@@ -918,93 +918,158 @@ namespace grb {
 
 	namespace internal {
 
+#ifdef _DEBUG
+#ifndef _DEBUG_THREADSAFE_PRINT
+#define _DEBUG_THREADSAFE_PRINT
+		//TODO: Shall and will be removed ;)
+		void debug_threadsafe_print( const std::string &str ) {
+#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
+	#pragma omp critical
+			{
+				std::cout << "[T" << omp_get_thread_num(); << "] - " << str;
+			}
+#else
+			std::cout << str;
+#endif
+		}
+#endif
+#endif
+
+		
 		template<
 			Descriptor descr = descriptors::no_operation,
-			class Operator,
+			class Monoid,
 			typename InputType, typename IOType
 		>
 		RC foldl_unmasked_generic(
 			IOType &x,
 			const Matrix< InputType, reference > &A,
-			const Operator &op
+			const Monoid &monoid
 		) {
+
 #ifdef _DEBUG
 			std::cout << "In grb::internal::foldl_unmasked_generic\n";
 #endif
 			RC rc = SUCCESS;
 
-			const typename grb::Monoid<
-				grb::operators::mul< double >,
-				grb::identities::one
-			> dummyMonoid;
-			const auto identity = dummyMonoid.template getIdentity< typename Operator::D1 >();
+			const auto& identity = monoid.template getIdentity< typename Monoid::D3 >();
+			const auto& op = monoid.getOperator();
 
 			const auto &A_raw = internal::getCRS( A );
-
 			const size_t m = nrows( A );
-			for( size_t i = 0; i < m; ++i ) {
-				const size_t k_begin = A_raw.col_start[ i ];
-				const size_t k_end = A_raw.col_start[ i + 1 ];
-				for( size_t k = k_begin; k < k_end; ++k ) {
-					const InputType a = A_raw.getValue( k, identity);
+			RC local_rc = rc;
+			auto local_x = identity;
+
+#ifdef _H_GRB_REFERENCE_OMP_BLAS3
+	#pragma omp parallel default(none) shared(A_raw, x, rc, std::cout) firstprivate(local_x, local_rc, m, op, identity) 
+#endif
+			{
+#ifdef _H_GRB_REFERENCE_OMP_BLAS3
+	#pragma omp	for schedule(static)
+#endif
+				for( size_t i = 0; i < m; ++i ) {
+					const size_t k_begin = A_raw.col_start[ i ];
+					const size_t k_end = A_raw.col_start[ i + 1 ];
+					for( size_t k = k_begin; k < k_end; ++k ) {
+						const InputType a = A_raw.getValue( k, identity);
+
 #ifdef _DEBUG
-					std::cout << "A( " << i << ", " << k << " ) = " << a << std::endl;
+						debug_threadsafe_print( "A( " + std::to_string( i ) + ", " + std::to_string( k ) + " ) = " + std::to_string( a ) + "\n" );		
+						auto x_before = local_x;	
 #endif
+						local_rc = local_rc ? local_rc : grb::foldl( local_x, a, op );
+
 #ifdef _DEBUG
-					std::cout << "Computing: x = op(" << x << ", " << a << ")";
+						debug_threadsafe_print( "Computing: local_x = op(" + std::to_string( x_before ) + ", " + std::to_string( a ) + ") = " + std::to_string( local_x ) + "\n" );
 #endif
-					rc = rc ? rc : grb::foldl( x, a, op );
+					}
+				}
+			
+
+#ifdef _H_GRB_REFERENCE_OMP_BLAS3
+	#pragma omp critical
+#endif
+				{	
+#ifdef _DEBUG
+					auto x_before = x;
+#endif
+					local_rc = local_rc ? local_rc : grb::foldl( x, local_x, op );
 #ifdef _DEBUG
-					std::cout << " = " << x << std::endl;
+					std::cout << "Computing x: op(" << x_before << ", " << local_x << ") = " << x << std::endl;
 #endif
+					rc = rc ? rc : local_rc;
 				}
 			}
-		
+
+#undef _DEBUG
 			return rc;
 		}
 
 		template<
 			Descriptor descr = descriptors::no_operation,
-			class Operator,
+			class Monoid,
 			typename InputType, typename IOType
 		>
 		RC foldr_unmasked_generic(
 			IOType &x,
 			const Matrix< InputType, reference > &A,
-			const Operator &op
+			const Monoid &monoid
 		) {
 #ifdef _DEBUG
 			std::cout << "In grb::internal::foldr_unmasked_generic\n";
 #endif
 			RC rc = SUCCESS;
 
-			const typename grb::Monoid<
-				grb::operators::mul< double >,
-				grb::identities::one
-			> dummyMonoid;
-			const auto identity = dummyMonoid.template getIdentity< typename Operator::D1 >();
+			const auto& identity = monoid.template getIdentity< typename Monoid::D3 >();
+			const auto& op = monoid.getOperator();
 
 			const auto &A_raw = internal::getCRS( A );
-
 			const size_t m = nrows( A );
-			for( size_t i = 0; i < m; ++i ) {
-				const size_t k_begin = A_raw.col_start[ i ];
-				const size_t k_end = A_raw.col_start[ i + 1 ];
-				for( size_t k = k_begin; k < k_end; ++k ) {
-					const InputType a = A_raw.getValue( k, identity);
+			RC local_rc = rc;
+			auto local_x = identity;
+
+#ifdef _H_GRB_REFERENCE_OMP_BLAS3
+	#pragma omp parallel default(none) shared(A_raw, x, rc, std::cout) firstprivate(local_x, local_rc, m, op, identity) 
+#endif
+			{
+#ifdef _H_GRB_REFERENCE_OMP_BLAS3
+	#pragma omp	for schedule(static)
+#endif
+				for( size_t i = 0; i < m; ++i ) {
+					const size_t k_begin = A_raw.col_start[ i ];
+					const size_t k_end = A_raw.col_start[ i + 1 ];
+					for( size_t k = k_begin; k < k_end; ++k ) {
+						const InputType a = A_raw.getValue( k, identity);
+
+#ifdef _DEBUG
+						debug_threadsafe_print( "A( " + std::to_string( i ) + ", " + std::to_string( k ) + " ) = " + std::to_string( a ) + "\n" );		
+						auto x_before = local_x;	
+#endif
+						local_rc = local_rc ? local_rc : grb::foldr( a, local_x, op );
+
 #ifdef _DEBUG
-					std::cout << "A( " << i << ", " << k << " ) = " << a << std::endl;
+						debug_threadsafe_print( "Computing: local_x = op(" + std::to_string( a ) + ", " + std::to_string( x_before ) + ") = " + std::to_string( local_x ) + "\n" );
+#endif
+					}
+				}
+			
+
+#ifdef _H_GRB_REFERENCE_OMP_BLAS3
+	#pragma omp critical
 #endif
+				{	
 #ifdef _DEBUG
-					std::cout << "Computing: x = op(" << x << ", " << a << ")";
+					auto x_before = x;
 #endif
-					rc = rc ? rc : grb::foldr( a, x, op );
+					local_rc = local_rc ? local_rc : grb::foldr( local_x, x, op );
 #ifdef _DEBUG
-					std::cout << " = " << x << std::endl;
+					std::cout << "Computing x: op(" << local_x << ", " << x_before << ") = " << x << std::endl;
 #endif
+					rc = rc ? rc : local_rc;
 				}
 			}
 
+#undef _DEBUG
 			return rc;
 		}
 
@@ -1467,16 +1532,16 @@ namespace grb {
 
 	template<
 		Descriptor descr = descriptors::no_operation,
-		class Operator,
+		class Monoid,
 		typename InputType, typename IOType	
 	>
 	RC foldr(
 		IOType &x,
 		const Matrix< InputType, reference > &A,
-		const Operator &op,
+		const Monoid &monoid,
 		const typename std::enable_if< !grb::is_object< IOType >::value &&
 			!grb::is_object< InputType >::value &&
-			grb::is_operator< Operator >::value, void
+			grb::is_monoid< Monoid >::value, void
 		>::type * const = nullptr
 	) {
 		// static checks
@@ -1490,15 +1555,15 @@ namespace grb {
 			"the operator version of foldr cannot be used if the "
 			"result is of type void"
 		);
-		static_assert( (std::is_same< typename Operator::D1, InputType >::value),
+		static_assert( (std::is_same< typename Monoid::D1, InputType >::value),
 			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
 			"called with a prefactor input type that does not match the first domain of the given operator"
 		);
-		static_assert( (std::is_same< typename Operator::D2, IOType >::value),
+		static_assert( (std::is_same< typename Monoid::D2, IOType >::value),
 			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
 			"called with a postfactor input type that does not match the first domain of the given operator"
 		);
-		static_assert( (std::is_same< typename Operator::D3, IOType >::value),
+		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
 			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
 			"called with an output type that does not match the output domain of the given operator"
 		);
@@ -1508,7 +1573,7 @@ namespace grb {
 #endif
 
 		return internal::foldr_unmasked_generic(
-			x, A, op
+			x, A, monoid
 		);
 	}
 
@@ -1520,7 +1585,7 @@ namespace grb {
 	RC foldl(
 		IOType &x,
 		const Matrix< InputType, reference > &A,
-		const Vector< MaskType, reference > &mask,
+		const Matrix< MaskType, reference > &mask,
 		const Monoid &monoid,
 		const typename std::enable_if< 
 			!grb::is_object< IOType >::value &&
@@ -1564,16 +1629,17 @@ namespace grb {
 
 	template<
 		Descriptor descr = descriptors::no_operation,
-		class Operator,
+		class Monoid,
 		typename InputType, typename IOType	
 	>
 	RC foldl(
 		IOType &x,
 		const Matrix< InputType, reference > &A,
-		const Operator &op,
-		const typename std::enable_if< !grb::is_object< IOType >::value &&
+		const Monoid &monoid,
+		const typename std::enable_if< 
+			!grb::is_object< IOType >::value &&
 			!grb::is_object< InputType >::value &&
-			grb::is_operator< Operator >::value, void
+			grb::is_monoid< Monoid >::value, void
 		>::type * const = nullptr
 	) {
 		// static checks
@@ -1587,25 +1653,25 @@ namespace grb {
 			"the operator version of foldl cannot be used if the "
 			"result is of type void"
 		);
-		static_assert( (std::is_same< typename Operator::D1, IOType >::value),
+		static_assert( (std::is_same< typename Monoid::D1, IOType >::value),
 			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
 			"called with a prefactor input type that does not match the first domain of the given operator"
 		);
-		static_assert( (std::is_same< typename Operator::D2, InputType >::value),
+		static_assert( (std::is_same< typename Monoid::D2, InputType >::value),
 			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
 			"called with a postfactor input type that does not match the first domain of the given operator"
 		);
-		static_assert( (std::is_same< typename Operator::D3, IOType >::value),
+		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
 			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
 			"called with an output type that does not match the output domain of the given operator"
 		);
 
 #ifdef _DEBUG
-		std::cout << "In grb::foldl (reference, matrix, op)\n";
+		std::cout << "In grb::foldl (reference, matrix, monoid)\n";
 #endif
 
 		return internal::foldl_unmasked_generic(
-			x, A, op
+			x, A, monoid
 		);
 	}
 

From e5fb254ea7f1ea601ec5a8439fe2209b4a09ddce Mon Sep 17 00:00:00 2001
From: byjtew <benjamin.lozes@protonmail.com>
Date: Mon, 22 May 2023 13:32:29 +0200
Subject: [PATCH 08/63] Extend matrixeReduce unit-test

---
 tests/unit/fold_matrix_to_scalar.cpp | 189 +++++++++++++++++++--------
 1 file changed, 132 insertions(+), 57 deletions(-)

diff --git a/tests/unit/fold_matrix_to_scalar.cpp b/tests/unit/fold_matrix_to_scalar.cpp
index f05ccfd0e..e67d02d0e 100644
--- a/tests/unit/fold_matrix_to_scalar.cpp
+++ b/tests/unit/fold_matrix_to_scalar.cpp
@@ -43,8 +43,6 @@ constexpr bool PRINT_TIMERS = false;
 constexpr bool SKIP_FOLDL = false;
 constexpr bool SKIP_FOLDR = false;
 
-using nz_t = float;
-
 template< typename T, typename V, class Monoid >
 RC foldl_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Monoid & monoid ) {
 	if( SKIP_FOLDL )
@@ -103,132 +101,126 @@ RC foldLR_test( const char * test_label, const char * test_description, const gr
 	return rc ? rc : foldr_test( test_label, test_description, A, initial, expected, monoid );
 }
 
-void grb_program( const long & n, grb::RC & rc ) {
-	// Build an identity matrix
-	Matrix< nz_t > I( n, n );
-	std::vector< size_t > I_rows( n ), I_cols( n );
-	std::vector< nz_t > I_vals( n, 1 );
-	std::iota( I_rows.begin(), I_rows.end(), 0 );
-	std::iota( I_cols.begin(), I_cols.end(), 0 );
-	buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), n, PARALLEL );
+void grb_program( const grb::Matrix< float > & I, grb::RC & rc ) {
+	const long n = grb::nnz( I );
 
 	/**    Test case 1:
-	 *  A simple additive reduction with the same types for the nnzs and the reduction result.
+	 *  A simple additive reduction with the same types for the nzs and the reduction result.
 	 *  * Initial value is 0
 	 *  * Expected result: n
 	 */
-	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)n, Monoid< operators::add< nz_t >, identities::zero >() );
+	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, (float)0, (float)n, Monoid< operators::add< float >, identities::zero >() );
 	if( rc )
 		return;
 
 	/**     Test case 2:
-	 *  A simple additive reduction with the same types for the nnzs and the reduction result.
+	 *  A simple additive reduction with the same types for the nzs and the reduction result.
 	 *  * Initial value is n
 	 *  * Expected result: 2*n
 	 */
-	rc = foldLR_test( "2", "A simple reduction(+) with the same types for the nnzs and the reduction result.", I, (nz_t)n, (nz_t)( 2 * n ), Monoid< operators::add< nz_t >, identities::zero >() );
+	rc = foldLR_test( "2", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, (float)n, (float)( 2 * n ), Monoid< operators::add< float >, identities::zero >() );
 	if( rc )
 		return;
 
 	/**     Test case 3:
-	 *  A simple additive reduction with different types for the nnzs and the reduction result (size_t <- size_t + float).
+	 *  A simple additive reduction with different types for the nzs and the reduction result (size_t <- size_t + float).
 	 *  * Initial value is 0
 	 *  * Expected result: n
 	 */
-	rc = foldl_test( "3", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n,
-		Monoid< operators::add< size_t, nz_t, size_t >, identities::zero >() );
+	rc = foldl_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n,
+		Monoid< operators::add< size_t, float, size_t >, identities::zero >() );
 	if( rc )
 		return;
-	rc = foldr_test( "3", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n,
-		Monoid< operators::add< nz_t, size_t, size_t >, identities::zero >() );
+	rc = foldr_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n,
+		Monoid< operators::add< float, size_t, size_t >, identities::zero >() );
 	if( rc )
 		return;
 
 	/**     Test case 4:
-	 *  A simple additive reduction with different types for the nnzs and the reduction result (size_t <- size_t + float).
+	 *  A simple additive reduction with different types for the nzs and the reduction result (size_t <- size_t + float).
 	 *  * Initial value is n
 	 *  * Expected result: 2*n
 	 */
-	rc = foldl_test( "4", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ),
-		Monoid< operators::add< size_t, nz_t, size_t >, identities::zero >() );
+	rc = foldl_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ),
+		Monoid< operators::add< size_t, float, size_t >, identities::zero >() );
 	if( rc )
 		return;
-	rc = foldr_test( "4", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ),
-		Monoid< operators::add< nz_t, size_t, size_t >, identities::zero >() );
+	rc = foldr_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ),
+		Monoid< operators::add< float, size_t, size_t >, identities::zero >() );
 	if( rc )
 		return;
 
 	/**     Test case 5:
-	 * A simple multiplicative reduction with the same types for the nnzs and the reduction result.
+	 * A simple multiplicative reduction with the same types for the nzs and the reduction result.
 	 * * Initial value is 0
 	 * * Expected result: 0
 	 */
-	rc = foldLR_test( "5", "A simple reduction(*) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)0, Monoid< operators::mul< nz_t >, identities::one >() );
+	rc = foldLR_test( "5", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, (float)0, (float)0, Monoid< operators::mul< float >, identities::one >() );
 	if( rc )
 		return;
 
 	/**     Test case 6:
-	 * A simple multiplicative reduction with the same types for the nnzs and the reduction result.
+	 * A simple multiplicative reduction with the same types for the nzs and the reduction result.
 	 * * Initial value is 1
 	 * * Expected result: 1
 	 */
-	rc = foldLR_test( "6", "A simple reduction(*) with the same types for the nnzs and the reduction result.", I, (nz_t)1, (nz_t)1, Monoid< operators::mul< nz_t >, identities::one >() );
+	rc = foldLR_test( "6", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, (float)1, (float)1, Monoid< operators::mul< float >, identities::one >() );
 	if( rc )
 		return;
 
 	/**     Test case 7:
-	 * A simple multiplicative reduction with different types for the nnzs and the reduction result (size_t <- size_t * float).
+	 * A simple multiplicative reduction with different types for the nzs and the reduction result (size_t <- size_t * float).
 	 * * Initial value is 0
 	 * * Expected result: 0
 	 */
-	rc = foldl_test( "7", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0,
-		Monoid< operators::mul< size_t, nz_t, size_t >, identities::one >() );
+	rc = foldl_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0,
+		Monoid< operators::mul< size_t, float, size_t >, identities::one >() );
 	if( rc )
 		return;
-	rc = foldr_test( "7", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0,
-		Monoid< operators::mul< nz_t, size_t, size_t >, identities::one >() );
+	rc = foldr_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0,
+		Monoid< operators::mul< float, size_t, size_t >, identities::one >() );
 	if( rc )
 		return;
 
 	/**     Test case 8:
-	 * A simple multiplicative reduction with different types for the nnzs and the reduction result (size_t <- size_t * float).
+	 * A simple multiplicative reduction with different types for the nzs and the reduction result (size_t <- size_t * float).
 	 * * Initial value is 1
 	 * * Expected result: 1
 	 */
-	rc = foldl_test( "8", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1,
-		Monoid< operators::mul< size_t, nz_t, size_t >, identities::one >() );
+	rc = foldl_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1,
+		Monoid< operators::mul< size_t, float, size_t >, identities::one >() );
 	if( rc )
 		return;
-	rc = foldr_test( "8", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1,
-		Monoid< operators::mul< nz_t, size_t, size_t >, identities::one >() );
+	rc = foldr_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1,
+		Monoid< operators::mul< float, size_t, size_t >, identities::one >() );
 	if( rc )
 		return;
 
 	/**     Test case 9:
-	 * A simple binary equal reduction with different types for the nnzs and the reduction result (bool <- bool == float).
+	 * A simple binary equal reduction with different types for the nzs and the reduction result (bool <- bool == float).
 	 * * Initial value is true
 	 * * Expected result: true
 	 */
-	rc = foldl_test( "9", "A simple reduction(==) with different types for the nnzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true,
-		Monoid< operators::equal< bool, nz_t, bool >, identities::logical_true >() );
+	rc = foldl_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true,
+		Monoid< operators::equal< bool, float, bool >, identities::logical_true >() );
 	if( rc )
 		return;
-	rc = foldr_test( "9", "A simple reduction(==) with different types for the nnzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true,
-		Monoid< operators::equal< nz_t, bool, bool >, identities::logical_true >() );
+	rc = foldr_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true,
+		Monoid< operators::equal< float, bool, bool >, identities::logical_true >() );
 	if( rc )
 		return;
 
 	/**     Test case 10:
-	 * A simple binary logical_or reduction with different types for the nnzs and the reduction result (bool <- bool || float).
+	 * A simple binary logical_or reduction with different types for the nzs and the reduction result (bool <- bool || float).
 	 * * Initial value is false
 	 * * Expected result: true
 	 */
-	rc = foldl_test( "10", "A simple reduction(||) with different types for the nnzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true,
-		Monoid< operators::logical_or< bool, nz_t, bool >, identities::logical_false >() );
+	rc = foldl_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true,
+		Monoid< operators::logical_or< bool, float, bool >, identities::logical_false >() );
 	if( rc )
 		return;
-	rc = foldr_test( "10", "A simple reduction(||) with different types for the nnzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true,
-		Monoid< operators::logical_or< nz_t, bool, bool >, identities::logical_false >() );
+	rc = foldr_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true,
+		Monoid< operators::logical_or< float, bool, bool >, identities::logical_false >() );
 	if( rc )
 		return;
 }
@@ -236,14 +228,14 @@ void grb_program( const long & n, grb::RC & rc ) {
 int main( int argc, char ** argv ) {
 	// defaults
 	bool printUsage = false;
-	size_t in = 10;
+	size_t n = 10;
 
 	// error checking
 	if( argc > 2 ) {
 		printUsage = true;
 	}
 	if( argc == 2 ) {
-		in = std::atol( argv[ 1 ] );
+		n = std::atol( argv[ 1 ] );
 	}
 	if( printUsage ) {
 		std::cerr << "Usage: " << argv[ 0 ] << " [n]\n";
@@ -254,14 +246,97 @@ int main( int argc, char ** argv ) {
 
 	std::cout << "This is functional test " << argv[ 0 ] << "\n";
 	grb::Launcher< AUTOMATIC > launcher;
-	grb::RC out = RC::SUCCESS;
-	if( launcher.exec( &grb_program, (long)in, out, true ) != SUCCESS ) {
-		std::cerr << "Launching test FAILED\n";
-		return 255;
+	grb::RC rc = RC::SUCCESS;
+
+	if( ! rc ) { // Build an identity square-matrix
+		Matrix< float > I( n, n );
+		std::vector< size_t > I_rows( n ), I_cols( n );
+		std::vector< float > I_vals( n, 1.f );
+		std::iota( I_rows.begin(), I_rows.end(), 0 );
+		std::iota( I_cols.begin(), I_cols.end(), 0 );
+		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		std::cout << "-- Running test 01: Identity square matrix of size n = " << n << std::endl;
+		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+			std::cerr << "Launching test 01 FAILED\n";
+			return 255;
+		}
+		std::cout << std::endl;
+	}
+
+	if( ! rc ) { // Build a square-matrix with n 1s on the first row
+		Matrix< float > I( n, n );
+		std::vector< size_t > I_rows( n, 0 ), I_cols( n );
+		std::vector< float > I_vals( n, 1.f );
+		std::iota( I_cols.begin(), I_cols.end(), 0 );
+		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		std::cout << "-- Running test 02: Square matrix of size n = " << n << ", with n 1s on the first row" << std::endl;
+		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+			std::cerr << "Launching test 02 FAILED\n";
+			return 255;
+		}
+		std::cout << std::endl;
 	}
-	if( out != SUCCESS ) {
-		std::cout << "Test FAILED (" << grb::toString( out ) << ")" << std::endl;
-		return out;
+
+	if( ! rc ) { // Build a square-matrix with n 1s on the first column
+		Matrix< float > I( n, n );
+		std::vector< size_t > I_rows( n ), I_cols( n, 0 );
+		std::vector< float > I_vals( n, 1.f );
+		std::iota( I_rows.begin(), I_rows.end(), 0 );
+		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		std::cout << "-- Running test 03: Square matrix of size n = " << n << ", with n 1s on the first column" << std::endl;
+		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+			std::cerr << "Launching test 03 FAILED\n";
+			return 255;
+		}
+		std::cout << std::endl;
+	}
+
+	if( ! rc ) { // Building a square-matrix with n 1s on the first row and column
+		Matrix< float > I( n, n );
+		std::vector< size_t > I_rows( 2 * n - 1, 0 ), I_cols( 2 * n - 1, 0 );
+		std::vector< float > I_vals( 2 * n - 1, 1.f );
+		std::iota( I_rows.begin() + n, I_rows.end(), 1 );
+		std::iota( I_cols.begin() + n, I_cols.end(), 1 );
+		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		std::cout << "-- Running test 04: Square matrix of size n = " << n << ", with n 1s on the first row and column" << std::endl;
+		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+			std::cerr << "Launching test 04 FAILED\n";
+			return 255;
+		}
+		std::cout << std::endl;
+	}
+
+	if( ! rc ) { // Building a [1 row, n columns] matrix filled with 1s
+		Matrix< float > I( 1, n );
+		std::vector< size_t > I_rows( n, 0 ), I_cols( n, 0 );
+		std::vector< float > I_vals( n, 1.f );
+		std::iota( I_cols.begin(), I_cols.end(), 0 );
+		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		std::cout << "-- Running test 05: [1-row, n = " << n << " columns] matrix, filled with 1s" << std::endl;
+		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+			std::cerr << "Launching test 04 FAILED\n";
+			return 255;
+		}
+		std::cout << std::endl;
+	}
+
+	if( ! rc ) { // Building a [n rows, 1 column] matrix filled with 1s
+		Matrix< float > I( n, 1 );
+		std::vector< size_t > I_rows( n, 0 ), I_cols( n, 0 );
+		std::vector< float > I_vals( n, 1.f );
+		std::iota( I_rows.begin(), I_rows.end(), 0 );
+		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		std::cout << "-- Running test 06: [n = " << n << " rows, 1 column] matrix, filled with 1s" << std::endl;
+		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+			std::cerr << "Launching test 06 FAILED\n";
+			return 255;
+		}
+		std::cout << std::endl;
+	}
+
+	if( rc != SUCCESS ) {
+		std::cout << "Test FAILED (" << grb::toString( rc ) << ")" << std::endl;
+		return rc;
 	} else {
 		std::cout << "Test OK" << std::endl;
 		return 0;

From c1b812f7f2bd52c776a0292675ad94573886d36e Mon Sep 17 00:00:00 2001
From: byjtew <benjamin.lozes@protonmail.com>
Date: Tue, 23 May 2023 11:34:14 +0200
Subject: [PATCH 09/63] Remove nested loop for foldl+r, as the order of the
 reduction is not important

---
 include/graphblas/reference/blas3.hpp | 125 +++++++++++++-------------
 1 file changed, 65 insertions(+), 60 deletions(-)

diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp
index c738ef885..08a79fc97 100644
--- a/include/graphblas/reference/blas3.hpp
+++ b/include/graphblas/reference/blas3.hpp
@@ -918,24 +918,6 @@ namespace grb {
 
 	namespace internal {
 
-#ifdef _DEBUG
-#ifndef _DEBUG_THREADSAFE_PRINT
-#define _DEBUG_THREADSAFE_PRINT
-		//TODO: Shall and will be removed ;)
-		void debug_threadsafe_print( const std::string &str ) {
-#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
-	#pragma omp critical
-			{
-				std::cout << "[T" << omp_get_thread_num(); << "] - " << str;
-			}
-#else
-			std::cout << str;
-#endif
-		}
-#endif
-#endif
-
-		
 		template<
 			Descriptor descr = descriptors::no_operation,
 			class Monoid,
@@ -946,7 +928,6 @@ namespace grb {
 			const Matrix< InputType, reference > &A,
 			const Monoid &monoid
 		) {
-
 #ifdef _DEBUG
 			std::cout << "In grb::internal::foldl_unmasked_generic\n";
 #endif
@@ -961,35 +942,48 @@ namespace grb {
 			auto local_x = identity;
 
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
-	#pragma omp parallel default(none) shared(A_raw, x, rc, std::cout) firstprivate(local_x, local_rc, m, op, identity) 
+	#pragma omp parallel default(none) shared(A_raw, x, rc, std::cout) firstprivate(local_x, local_rc, m, op, identity)
 #endif
 			{
+				size_t start, end;
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
-	#pragma omp	for schedule(static)
+				config::OMP::localRange( start, end, A_raw.col_start[ 0 ], A_raw.col_start[ m ] );
+#else
+				start = A_raw.col_start[ 0 ];
+				end = A_raw.col_start[ m ];
 #endif
-				for( size_t i = 0; i < m; ++i ) {
-					const size_t k_begin = A_raw.col_start[ i ];
-					const size_t k_end = A_raw.col_start[ i + 1 ];
-					for( size_t k = k_begin; k < k_end; ++k ) {
-						const InputType a = A_raw.getValue( k, identity);
-
+				for( size_t k = start; k < end; ++k ) {
+					const InputType a = A_raw.getValue( k, identity);
 #ifdef _DEBUG
-						debug_threadsafe_print( "A( " + std::to_string( i ) + ", " + std::to_string( k ) + " ) = " + std::to_string( a ) + "\n" );		
-						auto x_before = local_x;	
+					const std::string str( "A( " + std::to_string( k ) + " ) = " + std::to_string( a ) + "\n" );
+#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
+	#pragma omp critical
+					{
+						std::cout << "[T" << omp_get_thread_num() << "] - " << str;
+					}
+#else
+					std::cout << str;
 #endif
-						local_rc = local_rc ? local_rc : grb::foldl( local_x, a, op );
-
-#ifdef _DEBUG
-						debug_threadsafe_print( "Computing: local_x = op(" + std::to_string( x_before ) + ", " + std::to_string( a ) + ") = " + std::to_string( local_x ) + "\n" );
+					auto x_before = local_x;
 #endif
+					local_rc = local_rc ? local_rc : grb::foldl( local_x, a, op );
+#ifdef _DEBUG
+					const std::string str2( "Computing: local_x = op(" + std::to_string( x_before ) + ", " + std::to_string( a ) + ") = " + std::to_string( local_x ) + "\n" );
+#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
+	#pragma omp critical
+					{
+						std::cout << "[T" << omp_get_thread_num() << "] - " << str2;
 					}
+#else
+					std::cout << str2;
+#endif
+#endif
 				}
-			
 
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
 	#pragma omp critical
 #endif
-				{	
+				{
 #ifdef _DEBUG
 					auto x_before = x;
 #endif
@@ -1001,7 +995,6 @@ namespace grb {
 				}
 			}
 
-#undef _DEBUG
 			return rc;
 		}
 
@@ -1015,6 +1008,7 @@ namespace grb {
 			const Matrix< InputType, reference > &A,
 			const Monoid &monoid
 		) {
+
 #ifdef _DEBUG
 			std::cout << "In grb::internal::foldr_unmasked_generic\n";
 #endif
@@ -1029,47 +1023,59 @@ namespace grb {
 			auto local_x = identity;
 
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
-	#pragma omp parallel default(none) shared(A_raw, x, rc, std::cout) firstprivate(local_x, local_rc, m, op, identity) 
+	#pragma omp parallel default(none) shared(A_raw, x, rc, std::cout) firstprivate(local_x, local_rc, m, op, identity)
 #endif
 			{
+				size_t start, end;
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
-	#pragma omp	for schedule(static)
+				config::OMP::localRange( start, end, A_raw.col_start[ 0 ], A_raw.col_start[ m ] );
+#else
+				start = A_raw.col_start[ 0 ];
+				end = A_raw.col_start[ m ];
 #endif
-				for( size_t i = 0; i < m; ++i ) {
-					const size_t k_begin = A_raw.col_start[ i ];
-					const size_t k_end = A_raw.col_start[ i + 1 ];
-					for( size_t k = k_begin; k < k_end; ++k ) {
-						const InputType a = A_raw.getValue( k, identity);
-
+				for( size_t k = start; k < end; ++k ) {
+					const InputType a = A_raw.getValue( k, identity);
 #ifdef _DEBUG
-						debug_threadsafe_print( "A( " + std::to_string( i ) + ", " + std::to_string( k ) + " ) = " + std::to_string( a ) + "\n" );		
-						auto x_before = local_x;	
+					const std::string str( "A( " + std::to_string( k ) + " ) = " + std::to_string( a ) + "\n" );
+#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
+	#pragma omp critical
+					{
+						std::cout << "[T" << omp_get_thread_num() << "] - " << str;
+					}
+#else
+					std::cout << str;
 #endif
-						local_rc = local_rc ? local_rc : grb::foldr( a, local_x, op );
-
-#ifdef _DEBUG
-						debug_threadsafe_print( "Computing: local_x = op(" + std::to_string( a ) + ", " + std::to_string( x_before ) + ") = " + std::to_string( local_x ) + "\n" );
+					auto x_before = local_x;
 #endif
+					local_rc = local_rc ? local_rc : grb::foldr( a, local_x, op );
+#ifdef _DEBUG
+					const std::string str2( "Computing: local_x = op(" + std::to_string( x_before ) + ", " + std::to_string( a ) + ") = " + std::to_string( local_x ) + "\n" );
+#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
+	#pragma omp critical
+					{
+						std::cout << "[T" << omp_get_thread_num() << "] - " << str2;
 					}
+#else
+					std::cout << str2;
+#endif
+#endif
 				}
-			
 
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
 	#pragma omp critical
 #endif
-				{	
+				{
 #ifdef _DEBUG
 					auto x_before = x;
 #endif
 					local_rc = local_rc ? local_rc : grb::foldr( local_x, x, op );
 #ifdef _DEBUG
-					std::cout << "Computing x: op(" << local_x << ", " << x_before << ") = " << x << std::endl;
+					std::cout << "Computing x: op(" << x_before << ", " << local_x << ") = " << x << std::endl;
 #endif
 					rc = rc ? rc : local_rc;
 				}
 			}
-
-#undef _DEBUG
+		
 			return rc;
 		}
 
@@ -1533,7 +1539,7 @@ namespace grb {
 	template<
 		Descriptor descr = descriptors::no_operation,
 		class Monoid,
-		typename InputType, typename IOType	
+		typename InputType, typename IOType
 	>
 	RC foldr(
 		IOType &x,
@@ -1587,7 +1593,7 @@ namespace grb {
 		const Matrix< InputType, reference > &A,
 		const Matrix< MaskType, reference > &mask,
 		const Monoid &monoid,
-		const typename std::enable_if< 
+		const typename std::enable_if<
 			!grb::is_object< IOType >::value &&
 			!grb::is_object< InputType >::value &&
 			!grb::is_object< MaskType >::value &&
@@ -1630,13 +1636,13 @@ namespace grb {
 	template<
 		Descriptor descr = descriptors::no_operation,
 		class Monoid,
-		typename InputType, typename IOType	
+		typename InputType, typename IOType
 	>
 	RC foldl(
 		IOType &x,
 		const Matrix< InputType, reference > &A,
 		const Monoid &monoid,
-		const typename std::enable_if< 
+		const typename std::enable_if<
 			!grb::is_object< IOType >::value &&
 			!grb::is_object< InputType >::value &&
 			grb::is_monoid< Monoid >::value, void
@@ -1692,4 +1698,3 @@ namespace grb {
 #endif
 
 #endif // ``_H_GRB_REFERENCE_BLAS3''
-

From 624c193249f6ea3ac51362607653d431fd5ca656 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Sat, 10 Jun 2023 15:26:49 +0200
Subject: [PATCH 10/63] Documentation review

---
 include/graphblas/base/blas3.hpp | 66 +++++++++++++++++---------------
 1 file changed, 35 insertions(+), 31 deletions(-)

diff --git a/include/graphblas/base/blas3.hpp b/include/graphblas/base/blas3.hpp
index 371c0987a..0b652f4ac 100644
--- a/include/graphblas/base/blas3.hpp
+++ b/include/graphblas/base/blas3.hpp
@@ -484,10 +484,11 @@ namespace grb {
 	 * @tparam MaskType  The type of the elements in the supplied ALP/GraphBLAS
 	 *                   matrix \a mask.
 	 *
-	 * @param[out]   x   The result of the reduction.
-	 * @param[in]    A   Any ALP/GraphBLAS matrix.
-	 * @param[in]  mask  Any ALP/GraphBLAS matrix.
-	 * @param[in] monoid The monoid under which to perform this reduction.
+	 * @param[in, out] x   The result of the reduction.
+	 * 					   Prior value will be considered.
+	 * @param[in]    A     Any ALP/GraphBLAS matrix.
+	 * @param[in]  mask    Any ALP/GraphBLAS matrix.
+	 * @param[in] monoid   The monoid under which to perform this reduction.
 	 *
 	 * @return grb::SUCCESS  When the call completed successfully.
 	 * @return grb::MISMATCH If a \a mask was not empty and does not have size
@@ -559,8 +560,9 @@ namespace grb {
 	 *                   matrix \a y.
 	 * @tparam IOType    The type of the output scalar \a x.
 	 *
-	 * @param[out]   x     The result of the reduction.
-	 * @param[in]    A     Any ALP/GraphBLAS matrix.
+	 * @param[in, out] x   The result of the reduction.
+	 * 					   Prior value will be considered.
+	 * @param[in]      A   Any ALP/GraphBLAS matrix.
 	 * @param[in] operator The operator used for reduction.
 	 *
 	 * @return grb::SUCCESS  When the call completed successfully.
@@ -603,27 +605,27 @@ namespace grb {
 	 *
 	 * Let \f$ x_0 = 1 \f$ and let
 	 * \f$ x_{i+1} = \begin{cases}
-	 *   x_i \oplus y_i\text{ if }y_i\text{ is nonzero and }m_i\text{ evaluates true}
-	 *   x_i\text{ otherwise}
+	 *   x_i \oplus y_i\text{ if }y_i\text{ is nonzero and }
+	 * 	 m_i\text{ evaluates true}x_i\text{ otherwise}
 	 * \end{cases},\f$
 	 * for all \f$ i \in \{ 0, 1, \ldots, n-1 \} \f$.
 	 *
-	 * \note Per this definition, the folding happens in a left-to-right direction.
-	 *       If another direction is wanted, which may have use in cases where
-	 *       \f$ D_1 \f$ differs from \f$ D_2 \f$, then either a monoid with those
-	 *       operator domains switched may be supplied, or #grb::foldr may be used
-	 *       instead.
+	 * \note Per this definition, the folding happens in a left-to-right
+	 * 		 direction. If another direction is wanted, which may have use in
+	 *  	 cases where \f$ D_1 \f$ differs from \f$ D_2 \f$, then either a
+	 * 		 monoid with those operator domains switched may be supplied, or
+	 * 		 #grb::foldr may be used instead.
 	 *
 	 * After a successfull call, \a x will be equal to \f$ x_n \f$.
 	 *
-	 * Note that the operator \f$ \oplus \f$ must be associative since it is part
-	 * of a monoid. This algebraic property is exploited when parallelising the
-	 * requested operation. The identity is required when parallelising over
+	 * Note that the operator \f$ \oplus \f$ must be associative since it is
+	 * part of a monoid. This algebraic property is exploited when parallelising
+	 * the requested operation. The identity is required when parallelising over
 	 * multiple user processes.
 	 *
-	 * \warning In so doing, the order of the evaluation of the reduction operation
-	 *          should not be expected to be a serial, left-to-right, evaluation of
-	 *          the computation chain.
+	 * \warning In so doing, the order of the evaluation of the reduction
+	 * 			operation should not be expected to be a serial, left-to-right,
+	 * 			evaluation of the computation chain.
 	 *
 	 * @tparam descr     The descriptor to be used (descriptors::no_operation if
 	 *                   left unspecified).
@@ -634,16 +636,17 @@ namespace grb {
 	 * @tparam MaskType  The type of the elements in the supplied ALP/GraphBLAS
 	 *                   matrix \a mask.
 	 *
-	 * @param[out]   x   The result of the reduction.
-	 * @param[in]    A   Any ALP/GraphBLAS matrix.
-	 * @param[in]  mask  Any ALP/GraphBLAS matrix.
-	 * @param[in] monoid The monoid under which to perform this reduction.
+	 * @param[in, out] x  The result of the reduction. 
+	 * 					  Prior value will be considered.
+	 * @param[in] A       Any ALP/GraphBLAS matrix.
+	 * @param[in] mask    Any ALP/GraphBLAS matrix.
+	 * @param[in] monoid  The monoid under which to perform this reduction.
 	 *
 	 * @return grb::SUCCESS  When the call completed successfully.
 	 * @return grb::MISMATCH If a \a mask was not empty and does not have size
-	 *                       equal to \a y.
-	 * @return grb::ILLEGAL  If the provided input matrix \a y was not dense, while
-	 *                       #grb::descriptors::dense was given.
+	 *                       equal to \a A.
+	 * @return grb::ILLEGAL  If the provided input matrix \a A was not dense,
+	 * 						 while #grb::descriptors::dense was given.
 	 *
 	 * @see grb::foldr provides similar in-place functionality.
 	 * @see grb::eWiseApply provides out-of-place semantics.
@@ -657,10 +660,10 @@ namespace grb {
 	 * \note Invalid descriptors will be ignored.
 	 *
 	 * If grb::descriptors::no_casting is given, then 1) the first domain of
-	 * \a monoid must match \a InputType, 2) the second domain of \a op must match
-	 * \a IOType, 3) the third domain must match \a IOType, and 4) the element type
-	 * of \a mask must be <tt>bool</tt>. If one of these is not true, the code
-	 * shall not compile.
+	 * \a monoid must match \a InputType, 2) the second domain of \a op must
+	 * match \a IOType, 3) the third domain must match \a IOType, and 4) the
+	 * element type of \a mask must be <tt>bool</tt>. If one of these is not
+	 * true, the code shall not compile.
 	 * \endparblock
 	 *
 	 * \par Performance semantics
@@ -709,7 +712,8 @@ namespace grb {
 	 *                   matrix \a y.
 	 * @tparam IOType    The type of the output scalar \a x.
 	 *
-	 * @param[out]   x     The result of the reduction.
+	 * @param[in, out] x   The result of the reduction.
+	 * 					   Prior value will be considered.
 	 * @param[in]    A     Any ALP/GraphBLAS matrix.
 	 * @param[in] operator The operator used for reduction.
 	 *

From 1c097299e0851a7b6bb936566edae99d7f961a61 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Sat, 10 Jun 2023 16:20:21 +0200
Subject: [PATCH 11/63] Prepare generic folr+l functionfor masked version

---
 include/graphblas/reference/blas3.hpp | 29 +++++++++++++++++----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp
index 08a79fc97..060fc53f3 100644
--- a/include/graphblas/reference/blas3.hpp
+++ b/include/graphblas/reference/blas3.hpp
@@ -919,17 +919,20 @@ namespace grb {
 	namespace internal {
 
 		template<
+			bool masked,
 			Descriptor descr = descriptors::no_operation,
 			class Monoid,
-			typename InputType, typename IOType
+			typename InputType, typename IOType, typename MaskType
 		>
-		RC foldl_unmasked_generic(
+		RC foldl_generic(
 			IOType &x,
 			const Matrix< InputType, reference > &A,
+			const Matrix< MaskType, reference > &mask,
 			const Monoid &monoid
 		) {
+			(void) mask;
 #ifdef _DEBUG
-			std::cout << "In grb::internal::foldl_unmasked_generic\n";
+			std::cout << "In grb::internal::foldl_generic\n";
 #endif
 			RC rc = SUCCESS;
 
@@ -999,18 +1002,20 @@ namespace grb {
 		}
 
 		template<
+			bool masked,
 			Descriptor descr = descriptors::no_operation,
 			class Monoid,
-			typename InputType, typename IOType
+			typename InputType, typename IOType, typename MaskType
 		>
-		RC foldr_unmasked_generic(
+		RC foldr_generic(
 			IOType &x,
 			const Matrix< InputType, reference > &A,
+			const Matrix< MaskType, reference > &mask,
 			const Monoid &monoid
 		) {
-
+			(void) mask;
 #ifdef _DEBUG
-			std::cout << "In grb::internal::foldr_unmasked_generic\n";
+			std::cout << "In grb::internal::foldr_generic\n";
 #endif
 			RC rc = SUCCESS;
 
@@ -1578,8 +1583,9 @@ namespace grb {
 		std::cout << "In grb::foldr (reference, matrix, op)\n";
 #endif
 
-		return internal::foldr_unmasked_generic(
-			x, A, monoid
+		Matrix< void, reference > empty_mask( nrows( A ), ncols( A ) );
+		return internal::foldr_generic< false, descr, Monoid, InputType, IOType, void >(
+			x, A, empty_mask, monoid
 		);
 	}
 
@@ -1676,8 +1682,9 @@ namespace grb {
 		std::cout << "In grb::foldl (reference, matrix, monoid)\n";
 #endif
 
-		return internal::foldl_unmasked_generic(
-			x, A, monoid
+		Matrix< void, reference > empty_mask( nrows( A ), ncols( A ) );
+		return internal::foldl_generic< false, descr, Monoid, InputType, IOType, void >(
+			x, A, empty_mask, monoid
 		);
 	}
 

From 8ad6be6abdb82761ccd05ab15d2fef036ce0d4ab Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Sat, 10 Jun 2023 22:09:59 +0200
Subject: [PATCH 12/63] Matrix-to-scalar foldl+r masked version

---
 include/graphblas/reference/blas3.hpp | 263 +++++++++++++++++++-------
 tests/unit/fold_matrix_to_scalar.cpp  | 231 ++++++++++++++--------
 2 files changed, 352 insertions(+), 142 deletions(-)

diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp
index 060fc53f3..35b505445 100644
--- a/include/graphblas/reference/blas3.hpp
+++ b/include/graphblas/reference/blas3.hpp
@@ -930,9 +930,10 @@ namespace grb {
 			const Matrix< MaskType, reference > &mask,
 			const Monoid &monoid
 		) {
-			(void) mask;
+
 #ifdef _DEBUG
-			std::cout << "In grb::internal::foldl_generic\n";
+			std::cout << "In grb::internal::foldl_generic( reference, masked = "
+				<< ( masked ? "true" : "false" ) << " )" << std::endl;
 #endif
 			RC rc = SUCCESS;
 
@@ -940,47 +941,111 @@ namespace grb {
 			const auto& op = monoid.getOperator();
 
 			const auto &A_raw = internal::getCRS( A );
+			const auto &mask_raw = internal::getCRS( mask );
 			const size_t m = nrows( A );
+			const size_t n = ncols( A );
+			const size_t mask_k_increment = masked ? 1 : 0;
+
+			// Check mask dimensions
+			if( masked && ( m != nrows(mask) || n != ncols(mask) ) ) {
+#ifdef _DEBUG
+				std::cout << "Mask dimensions do not match input matrix dimensions\n";
+#endif
+				return MISMATCH;
+			}
+
 			RC local_rc = rc;
 			auto local_x = identity;
 
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
-	#pragma omp parallel default(none) shared(A_raw, x, rc, std::cout) firstprivate(local_x, local_rc, m, op, identity)
+	#pragma omp parallel default(none) shared(A_raw, mask_raw, x, rc, std::cout) firstprivate(local_x, local_rc, m, op, identity)
 #endif
 			{
-				size_t start, end;
+				size_t start_row, end_row;
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
-				config::OMP::localRange( start, end, A_raw.col_start[ 0 ], A_raw.col_start[ m ] );
+				config::OMP::localRange( start_row, end_row, 0, m );
 #else
-				start = A_raw.col_start[ 0 ];
-				end = A_raw.col_start[ m ];
+				start_row = 0;
+				end_row = m;
 #endif
-				for( size_t k = start; k < end; ++k ) {
-					const InputType a = A_raw.getValue( k, identity);
+				for( size_t i = start_row; i < end_row; ++i ) {
+					size_t mask_k = mask_raw.col_start[ i ];
+					for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
+						const size_t k_col = A_raw.row_index[ k ];
+						if( masked ) {
+							// Increment the mask pointer until we find the right column, or an higher one
+							while( mask_raw.row_index[ mask_k ] < k_col && mask_k < mask_raw.col_start[ i + 1 ] ) {
 #ifdef _DEBUG
-					const std::string str( "A( " + std::to_string( k ) + " ) = " + std::to_string( a ) + "\n" );
+								const std::string skip_str( "Skipping masked coordinate: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
 #if defined(_H_GRB_REFERENCE_OMP_BLAS3)
 	#pragma omp critical
-					{
-						std::cout << "[T" << omp_get_thread_num() << "] - " << str;
-					}
+								{
+									std::cout << "[T" << omp_get_thread_num() << "] - " << skip_str;
+								}
 #else
-					std::cout << str;
+								std::cout << skip_str;
 #endif
-					auto x_before = local_x;
 #endif
-					local_rc = local_rc ? local_rc : grb::foldl( local_x, a, op );
+								mask_k += mask_k_increment;
+							}
+							// if there is no value for this coordinate, skip it
+							if( mask_raw.row_index[ mask_k ] != k_col ) {
 #ifdef _DEBUG
-					const std::string str2( "Computing: local_x = op(" + std::to_string( x_before ) + ", " + std::to_string( a ) + ") = " + std::to_string( local_x ) + "\n" );
+								const std::string skip_str2( "Skipped masked coordinate: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
 #if defined(_H_GRB_REFERENCE_OMP_BLAS3)
 	#pragma omp critical
-					{
-						std::cout << "[T" << omp_get_thread_num() << "] - " << str2;
-					}
+								{
+									std::cout << "[T" << omp_get_thread_num() << "] - " << skip_str2;
+								}
+#else
+								std::cout << skip_str2;
+#endif
+#endif
+								continue;
+							}
+
+#ifdef _DEBUG
+							const std::string str( "Mask( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
+#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
+	#pragma omp critical
+							{
+								std::cout << "[T" << omp_get_thread_num() << "] - " << str;
+							}
 #else
-					std::cout << str2;
+							std::cout << str;
 #endif
 #endif
+						}
+
+						// Increment the mask pointer in order to skip the next while loop (best case)
+						mask_k += mask_k_increment;
+
+						const InputType a_val = A_raw.getValue( k, identity );
+#ifdef _DEBUG
+						const std::string str( "A( " + std::to_string( i ) + ";" + std::to_string( k_col ) + " ) = " + std::to_string( a_val ) + "\n" );
+#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
+	#pragma omp critical
+						{
+							std::cout << "[T" << omp_get_thread_num() << "] - " << str;
+						}
+#else
+						std::cout << str;
+#endif
+						auto x_before = local_x;
+#endif
+						local_rc = local_rc ? local_rc : grb::foldl( local_x, a_val, op );
+#ifdef _DEBUG
+						const std::string str2( "Computing: local_x = op(" + std::to_string( x_before ) + ", " + std::to_string( a_val ) + ") = " + std::to_string( local_x ) + "\n" );
+#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
+	#pragma omp critical
+						{
+							std::cout << "[T" << omp_get_thread_num() << "] - " << str2;
+						}
+#else
+						std::cout << str2;
+#endif
+#endif
+					}
 				}
 
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
@@ -1013,9 +1078,10 @@ namespace grb {
 			const Matrix< MaskType, reference > &mask,
 			const Monoid &monoid
 		) {
-			(void) mask;
+
 #ifdef _DEBUG
-			std::cout << "In grb::internal::foldr_generic\n";
+			std::cout << "In grb::internal::foldr_generic( reference, masked = "
+				<< ( masked ? "true" : "false" ) << " )" << std::endl;
 #endif
 			RC rc = SUCCESS;
 
@@ -1023,47 +1089,111 @@ namespace grb {
 			const auto& op = monoid.getOperator();
 
 			const auto &A_raw = internal::getCRS( A );
+			const auto &mask_raw = internal::getCRS( mask );
 			const size_t m = nrows( A );
+			const size_t n = ncols( A );
+			const size_t mask_k_increment = masked ? 1 : 0;
+
+			// Check mask dimensions
+			if( masked && ( m != nrows(mask) || n != ncols(mask) ) ) {
+#ifdef _DEBUG
+				std::cout << "Mask dimensions do not match input matrix dimensions\n";
+#endif
+				return MISMATCH;
+			}
+
 			RC local_rc = rc;
 			auto local_x = identity;
 
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
-	#pragma omp parallel default(none) shared(A_raw, x, rc, std::cout) firstprivate(local_x, local_rc, m, op, identity)
+	#pragma omp parallel default(none) shared(A_raw, mask_raw, x, rc, std::cout) firstprivate(local_x, local_rc, m, op, identity)
 #endif
 			{
-				size_t start, end;
+				size_t start_row, end_row;
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
-				config::OMP::localRange( start, end, A_raw.col_start[ 0 ], A_raw.col_start[ m ] );
+				config::OMP::localRange( start_row, end_row, 0, m );
 #else
-				start = A_raw.col_start[ 0 ];
-				end = A_raw.col_start[ m ];
+				start_row = 0;
+				end_row = m;
 #endif
-				for( size_t k = start; k < end; ++k ) {
-					const InputType a = A_raw.getValue( k, identity);
+				for( size_t i = start_row; i < end_row; ++i ) {
+					size_t mask_k = mask_raw.col_start[ i ];
+					for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
+						const size_t k_col = A_raw.row_index[ k ];
+						if( masked ) {
+							// Increment the mask pointer until we find the right column, or an higher one
+							while( mask_raw.row_index[ mask_k ] < k_col && mask_k < mask_raw.col_start[ i + 1 ] ) {
 #ifdef _DEBUG
-					const std::string str( "A( " + std::to_string( k ) + " ) = " + std::to_string( a ) + "\n" );
+								const std::string skip_str( "Skipping masked coordinate: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
 #if defined(_H_GRB_REFERENCE_OMP_BLAS3)
 	#pragma omp critical
-					{
-						std::cout << "[T" << omp_get_thread_num() << "] - " << str;
-					}
+								{
+									std::cout << "[T" << omp_get_thread_num() << "] - " << skip_str;
+								}
 #else
-					std::cout << str;
+								std::cout << skip_str;
 #endif
-					auto x_before = local_x;
 #endif
-					local_rc = local_rc ? local_rc : grb::foldr( a, local_x, op );
+								mask_k += mask_k_increment;
+							}
+							// if there is no value for this coordinate, skip it
+							if( mask_raw.row_index[ mask_k ] != k_col ) {
 #ifdef _DEBUG
-					const std::string str2( "Computing: local_x = op(" + std::to_string( x_before ) + ", " + std::to_string( a ) + ") = " + std::to_string( local_x ) + "\n" );
+								const std::string skip_str2( "Skipped masked coordinate: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
 #if defined(_H_GRB_REFERENCE_OMP_BLAS3)
 	#pragma omp critical
-					{
-						std::cout << "[T" << omp_get_thread_num() << "] - " << str2;
-					}
+								{
+									std::cout << "[T" << omp_get_thread_num() << "] - " << skip_str2;
+								}
 #else
-					std::cout << str2;
+								std::cout << skip_str2;
 #endif
 #endif
+								continue;
+							}
+
+#ifdef _DEBUG
+							const std::string str( "Mask( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
+#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
+	#pragma omp critical
+							{
+								std::cout << "[T" << omp_get_thread_num() << "] - " << str;
+							}
+#else
+							std::cout << str;
+#endif
+#endif
+						}
+
+						// Increment the mask pointer in order to skip the next while loop (best case)
+						mask_k += mask_k_increment;
+
+						const InputType a_val = A_raw.getValue( k, identity );
+#ifdef _DEBUG
+						const std::string str( "A( " + std::to_string( i ) + ";" + std::to_string( k_col ) + " ) = " + std::to_string( a_val ) + "\n" );
+#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
+	#pragma omp critical
+						{
+							std::cout << "[T" << omp_get_thread_num() << "] - " << str;
+						}
+#else
+						std::cout << str;
+#endif
+						auto x_before = local_x;
+#endif
+						local_rc = local_rc ? local_rc : grb::foldr( a_val, local_x, op );
+#ifdef _DEBUG
+						const std::string str2( "Computing: local_x = op(" + std::to_string( a_val ) + ", " + std::to_string( x_before ) + ") = " + std::to_string( local_x ) + "\n" );
+#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
+	#pragma omp critical
+						{
+							std::cout << "[T" << omp_get_thread_num() << "] - " << str2;
+						}
+#else
+						std::cout << str2;
+#endif
+#endif
+					}
 				}
 
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
@@ -1075,12 +1205,12 @@ namespace grb {
 #endif
 					local_rc = local_rc ? local_rc : grb::foldr( local_x, x, op );
 #ifdef _DEBUG
-					std::cout << "Computing x: op(" << x_before << ", " << local_x << ") = " << x << std::endl;
+					std::cout << "Computing x: op(" << local_x << ", " << x_before << ") = " << x << std::endl;
 #endif
 					rc = rc ? rc : local_rc;
 				}
 			}
-		
+
 			return rc;
 		}
 
@@ -1511,34 +1641,35 @@ namespace grb {
 	) {
 		// static checks
 		static_assert( !std::is_same< InputType, void >::value,
-			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
+			"grb::foldr ( reference, IOType <- op( InputType, IOType ): "
 			"the operator version of foldr cannot be used if the "
 			"input matrix is a pattern matrix (of type void)"
 		);
 		static_assert( !std::is_same< IOType, void >::value,
-			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
+			"grb::foldr ( reference, IOType <- op( InputType, IOType ): "
 			"the operator version of foldr cannot be used if the "
 			"result is of type void"
 		);
-		static_assert( (std::is_same< typename Monoid::D1, IOType >::value),
-			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
+		static_assert( (std::is_same< typename Monoid::D1, InputType >::value),
+			"grb::foldr ( reference, IOType <- op( InputType, IOType ): "
 			"called with a prefactor input type that does not match the first domain of the given operator"
 		);
-		static_assert( (std::is_same< typename Monoid::D2, InputType >::value),
-			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
+		static_assert( (std::is_same< typename Monoid::D2, IOType >::value),
+			"grb::foldr ( reference, IOType <- op( InputType, IOType ): "
 			"called with a postfactor input type that does not match the first domain of the given operator"
 		);
 		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
-			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
+			"grb::foldr ( reference, IOType <- op( InputType, IOType ): "
 			"called with an output type that does not match the output domain of the given operator"
 		);
 
 #ifdef _DEBUG
 		std::cout << "In grb::foldr (reference,  mask, matrix, monoid)\n";
 #endif
-		// TODO: implement foldr with mask
 
-		return UNSUPPORTED;
+		return internal::foldr_generic< true, descr, Monoid, InputType, IOType, MaskType >(
+			x, A, mask, monoid
+		);
 	}
 
 	template<
@@ -1608,25 +1739,25 @@ namespace grb {
 	) {
 		// static checks
 		static_assert( !std::is_same< InputType, void >::value,
-			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
+			"grb::foldl ( reference, IOType <- op( IOType, InputType ): "
 			"the operator version of foldl cannot be used if the "
 			"input matrix is a pattern matrix (of type void)"
 		);
 		static_assert( !std::is_same< IOType, void >::value,
-			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
+			"grb::foldl ( reference, IOType <- op( IOType, InputType ): "
 			"the operator version of foldl cannot be used if the "
 			"result is of type void"
 		);
 		static_assert( (std::is_same< typename Monoid::D1, IOType >::value),
-			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
+			"grb::foldl ( reference, IOType <- op( IOType, InputType ): "
 			"called with a prefactor input type that does not match the first domain of the given operator"
 		);
 		static_assert( (std::is_same< typename Monoid::D2, InputType >::value),
-			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
+			"grb::foldl ( reference, IOType <- op( IOType, InputType ): "
 			"called with a postfactor input type that does not match the first domain of the given operator"
 		);
 		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
-			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
+			"grb::foldl ( reference, IOType <- op( IOType, InputType ): "
 			"called with an output type that does not match the output domain of the given operator"
 		);
 
@@ -1634,9 +1765,9 @@ namespace grb {
 		std::cout << "In grb::foldl (reference, mask, matrix, monoid)\n";
 #endif
 
-		// TODO: implement foldl with mask
-
-		return UNSUPPORTED;
+		return internal::foldl_generic< true, descr, Monoid, InputType, IOType, MaskType >(
+			x, A, mask, monoid
+		);
 	}
 
 	template<
@@ -1656,25 +1787,25 @@ namespace grb {
 	) {
 		// static checks
 		static_assert( !std::is_same< InputType, void >::value,
-			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
+			"grb::foldl ( reference, IOType <- op( IOType, InputType ): "
 			"the operator version of foldl cannot be used if the "
 			"input matrix is a pattern matrix (of type void)"
 		);
 		static_assert( !std::is_same< IOType, void >::value,
-			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
+			"grb::foldl ( reference, IOType <- op( IOType, InputType ): "
 			"the operator version of foldl cannot be used if the "
 			"result is of type void"
 		);
 		static_assert( (std::is_same< typename Monoid::D1, IOType >::value),
-			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
+			"grb::foldl ( reference, IOType <- op( IOType, InputType ): "
 			"called with a prefactor input type that does not match the first domain of the given operator"
 		);
 		static_assert( (std::is_same< typename Monoid::D2, InputType >::value),
-			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
+			"grb::foldl ( reference, IOType <- op( IOType, InputType ): "
 			"called with a postfactor input type that does not match the first domain of the given operator"
 		);
 		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
-			"grb::foldl ( reference, IOType <- op( InputType, IOType ): "
+			"grb::foldl ( reference, IOType <- op( IOType, InputType ): "
 			"called with an output type that does not match the output domain of the given operator"
 		);
 
diff --git a/tests/unit/fold_matrix_to_scalar.cpp b/tests/unit/fold_matrix_to_scalar.cpp
index e67d02d0e..d27a1fd85 100644
--- a/tests/unit/fold_matrix_to_scalar.cpp
+++ b/tests/unit/fold_matrix_to_scalar.cpp
@@ -42,111 +42,178 @@ using namespace grb;
 constexpr bool PRINT_TIMERS = false;
 constexpr bool SKIP_FOLDL = false;
 constexpr bool SKIP_FOLDR = false;
+constexpr bool SKIP_UNMASKED = false;
+constexpr bool SKIP_MASKED = false;
 
 template< typename T, typename V, class Monoid >
-RC foldl_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Monoid & monoid ) {
+RC foldl_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, const grb::Matrix< void > & mask, T initial, T expected, const Monoid & monoid ) {
 	if( SKIP_FOLDL )
 		return RC::SUCCESS;
+	RC rc = RC::SUCCESS;
+
+	if( rc == RC::SUCCESS && ! SKIP_UNMASKED ) { // Unmasked
+		T value = initial;
+		auto start_chrono = std::chrono::high_resolution_clock::now();
+		foldl( value, A, monoid );
+		auto end_chrono = std::chrono::high_resolution_clock::now();
+		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
+		if( PRINT_TIMERS )
+			std::cout << "foldl (unmasked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
+
+		std::cout << "foldl (unmasked) \"" << test_label << "\": ";
+		if( value == expected )
+			std::cout << "OK" << std::endl;
+		else
+			std::cerr << "Failed" << std::endl
+					  << test_description << std::endl
+					  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
+					  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
+					  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
+
+		rc = rc ? rc : ( value == expected ? RC::SUCCESS : RC::FAILED );
+	}
+
+	if( rc == RC::SUCCESS && ! SKIP_MASKED ) { // Masked
+		T value = initial;
+		auto start_chrono = std::chrono::high_resolution_clock::now();
+		foldl( value, A, mask, monoid );
+		auto end_chrono = std::chrono::high_resolution_clock::now();
+		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
+		if( PRINT_TIMERS )
+			std::cout << "foldl (masked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
+
+		std::cout << "foldl (masked) \"" << test_label << "\": ";
+		if( value == expected )
+			std::cout << "OK" << std::endl;
+		else
+			std::cerr << "Failed" << std::endl
+					  << test_description << std::endl
+					  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
+					  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
+					  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
+
+		rc = rc ? rc : ( value == expected ? RC::SUCCESS : RC::FAILED );
+	}
 
-	T value = initial;
-	auto start_chrono = std::chrono::high_resolution_clock::now();
-	foldl( value, A, monoid );
-	auto end_chrono = std::chrono::high_resolution_clock::now();
-	auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
-	if( PRINT_TIMERS )
-		std::cout << "foldl_test \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
-
-	std::cout << "foldl_test \"" << test_label << "\": ";
-	if( value == expected )
-		std::cout << "OK" << std::endl;
-	else
-		std::cerr << "Failed" << std::endl
-				  << test_description << std::endl
-				  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
-				  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
-				  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
-
-	return value == expected ? RC::SUCCESS : RC::FAILED;
+	return rc;
 }
 
 template< typename T, typename V, class Monoid >
-RC foldr_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Monoid & monoid ) {
+RC foldr_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, const grb::Matrix< void > & mask, T initial, T expected, const Monoid & monoid ) {
 	if( SKIP_FOLDR )
 		return RC::SUCCESS;
+	RC rc = RC::SUCCESS;
+
+	if( rc == RC::SUCCESS ) { // Unmasked
+		T value = initial;
+		auto start_chrono = std::chrono::high_resolution_clock::now();
+		foldr( value, A, monoid );
+		auto end_chrono = std::chrono::high_resolution_clock::now();
+		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
+		if( PRINT_TIMERS )
+			std::cout << "foldr (unmasked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
+
+		std::cout << "foldr (unmasked) \"" << test_label << "\": ";
+		if( value == expected )
+			std::cout << "OK" << std::endl;
+		else
+			std::cerr << "Failed" << std::endl
+					  << test_description << std::endl
+					  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
+					  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
+					  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
+
+		rc = rc ? rc : ( value == expected ? RC::SUCCESS : RC::FAILED );
+	}
 
-	T value = initial;
-	auto start_chrono = std::chrono::high_resolution_clock::now();
-	foldr( value, A, monoid );
-	auto end_chrono = std::chrono::high_resolution_clock::now();
-	auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
-	if( PRINT_TIMERS )
-		std::cout << "foldr_test \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
-
-	std::cout << "foldr_test \"" << test_label << "\": ";
-	if( value == expected )
-		std::cout << "OK" << std::endl;
-	else
-		std::cerr << "Failed" << std::endl
-				  << test_description << std::endl
-				  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
-				  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
-				  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
-
-	return value == expected ? RC::SUCCESS : RC::FAILED;
+	if( rc == RC::SUCCESS ) { // Masked
+		T value = initial;
+		auto start_chrono = std::chrono::high_resolution_clock::now();
+		foldr( value, A, mask, monoid );
+		auto end_chrono = std::chrono::high_resolution_clock::now();
+		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
+		if( PRINT_TIMERS )
+			std::cout << "foldr (masked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
+
+		std::cout << "foldr (masked) \"" << test_label << "\": ";
+		if( value == expected )
+			std::cout << "OK" << std::endl;
+		else
+			std::cerr << "Failed" << std::endl
+					  << test_description << std::endl
+					  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
+					  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
+					  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
+
+		rc = rc ? rc : ( value == expected ? RC::SUCCESS : RC::FAILED );
+	}
+
+	return rc;
 }
 
 template< typename T, typename V, class Monoid >
-RC foldLR_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Monoid & monoid ) {
-	RC rc = foldl_test( test_label, test_description, A, initial, expected, monoid );
-	return rc ? rc : foldr_test( test_label, test_description, A, initial, expected, monoid );
+RC foldLR_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, const grb::Matrix< void > & mask, T initial, T expected, const Monoid & monoid ) {
+	RC rc = foldl_test( test_label, test_description, A, mask, initial, expected, monoid );
+	return rc ? rc : foldr_test( test_label, test_description, A, mask, initial, expected, monoid );
 }
 
-void grb_program( const grb::Matrix< float > & I, grb::RC & rc ) {
+struct input {
+	const grb::Matrix< float > & A;
+	const grb::Matrix< void > & mask;
+};
+
+void grb_program( const input & in, grb::RC & rc ) {
+	const grb::Matrix< float > & I = in.A;
+	const grb::Matrix< void > & mask = in.mask;
+
 	const long n = grb::nnz( I );
 
 	/**    Test case 1:
 	 *  A simple additive reduction with the same types for the nzs and the reduction result.
 	 *  * Initial value is 0
-	 *  * Expected result: n
+	 *  * Expected unmasked result: n
+	 *  * Expected masked result: 0
 	 */
-	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, (float)0, (float)n, Monoid< operators::add< float >, identities::zero >() );
+	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (float)0, (float)n, Monoid< operators::add< float >, identities::zero >() );
 	if( rc )
 		return;
+	return;
 
 	/**     Test case 2:
 	 *  A simple additive reduction with the same types for the nzs and the reduction result.
 	 *  * Initial value is n
 	 *  * Expected result: 2*n
 	 */
-	rc = foldLR_test( "2", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, (float)n, (float)( 2 * n ), Monoid< operators::add< float >, identities::zero >() );
+	rc = foldLR_test(
+		"2", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (float)n, (float)( 2 * n ), Monoid< operators::add< float >, identities::zero >() );
 	if( rc )
 		return;
 
 	/**     Test case 3:
-	 *  A simple additive reduction with different types for the nzs and the reduction result (size_t <- size_t + float).
+	 *  A simple additive reduction with different types for the nzs and the reduction result (int <- int + float).
 	 *  * Initial value is 0
 	 *  * Expected result: n
 	 */
-	rc = foldl_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n,
-		Monoid< operators::add< size_t, float, size_t >, identities::zero >() );
+	rc = foldl_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int + float).", I, mask, (int)0, (int)n,
+		Monoid< operators::add< int, float, int >, identities::zero >() );
 	if( rc )
 		return;
-	rc = foldr_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n,
-		Monoid< operators::add< float, size_t, size_t >, identities::zero >() );
+	rc = foldr_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- float + int).", I, mask, (int)0, (int)n,
+		Monoid< operators::add< float, int, int >, identities::zero >() );
 	if( rc )
 		return;
 
 	/**     Test case 4:
-	 *  A simple additive reduction with different types for the nzs and the reduction result (size_t <- size_t + float).
+	 *  A simple additive reduction with different types for the nzs and the reduction result (int <- int + float).
 	 *  * Initial value is n
 	 *  * Expected result: 2*n
 	 */
-	rc = foldl_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ),
-		Monoid< operators::add< size_t, float, size_t >, identities::zero >() );
+	rc = foldl_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int + float).", I, mask, (int)n, (int)( 2 * n ),
+		Monoid< operators::add< int, float, int >, identities::zero >() );
 	if( rc )
 		return;
-	rc = foldr_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ),
-		Monoid< operators::add< float, size_t, size_t >, identities::zero >() );
+	rc = foldr_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- float + int).", I, mask, (int)n, (int)( 2 * n ),
+		Monoid< operators::add< float, int, int >, identities::zero >() );
 	if( rc )
 		return;
 
@@ -155,7 +222,7 @@ void grb_program( const grb::Matrix< float > & I, grb::RC & rc ) {
 	 * * Initial value is 0
 	 * * Expected result: 0
 	 */
-	rc = foldLR_test( "5", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, (float)0, (float)0, Monoid< operators::mul< float >, identities::one >() );
+	rc = foldLR_test( "5", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, mask, (float)0, (float)0, Monoid< operators::mul< float >, identities::one >() );
 	if( rc )
 		return;
 
@@ -164,7 +231,7 @@ void grb_program( const grb::Matrix< float > & I, grb::RC & rc ) {
 	 * * Initial value is 1
 	 * * Expected result: 1
 	 */
-	rc = foldLR_test( "6", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, (float)1, (float)1, Monoid< operators::mul< float >, identities::one >() );
+	rc = foldLR_test( "6", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, mask, (float)1, (float)1, Monoid< operators::mul< float >, identities::one >() );
 	if( rc )
 		return;
 
@@ -173,11 +240,11 @@ void grb_program( const grb::Matrix< float > & I, grb::RC & rc ) {
 	 * * Initial value is 0
 	 * * Expected result: 0
 	 */
-	rc = foldl_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0,
+	rc = foldl_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, mask, (size_t)0, (size_t)0,
 		Monoid< operators::mul< size_t, float, size_t >, identities::one >() );
 	if( rc )
 		return;
-	rc = foldr_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0,
+	rc = foldr_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, mask, (size_t)0, (size_t)0,
 		Monoid< operators::mul< float, size_t, size_t >, identities::one >() );
 	if( rc )
 		return;
@@ -187,11 +254,11 @@ void grb_program( const grb::Matrix< float > & I, grb::RC & rc ) {
 	 * * Initial value is 1
 	 * * Expected result: 1
 	 */
-	rc = foldl_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1,
+	rc = foldl_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, mask, (size_t)1, (size_t)1,
 		Monoid< operators::mul< size_t, float, size_t >, identities::one >() );
 	if( rc )
 		return;
-	rc = foldr_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1,
+	rc = foldr_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, mask, (size_t)1, (size_t)1,
 		Monoid< operators::mul< float, size_t, size_t >, identities::one >() );
 	if( rc )
 		return;
@@ -201,11 +268,11 @@ void grb_program( const grb::Matrix< float > & I, grb::RC & rc ) {
 	 * * Initial value is true
 	 * * Expected result: true
 	 */
-	rc = foldl_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true,
+	rc = foldl_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == float).", I, mask, (bool)true, (bool)true,
 		Monoid< operators::equal< bool, float, bool >, identities::logical_true >() );
 	if( rc )
 		return;
-	rc = foldr_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true,
+	rc = foldr_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == float).", I, mask, (bool)true, (bool)true,
 		Monoid< operators::equal< float, bool, bool >, identities::logical_true >() );
 	if( rc )
 		return;
@@ -215,11 +282,11 @@ void grb_program( const grb::Matrix< float > & I, grb::RC & rc ) {
 	 * * Initial value is false
 	 * * Expected result: true
 	 */
-	rc = foldl_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true,
+	rc = foldl_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || float).", I, mask, (bool)false, (bool)true,
 		Monoid< operators::logical_or< bool, float, bool >, identities::logical_false >() );
 	if( rc )
 		return;
-	rc = foldr_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true,
+	rc = foldr_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || float).", I, mask, (bool)false, (bool)true,
 		Monoid< operators::logical_or< float, bool, bool >, identities::logical_false >() );
 	if( rc )
 		return;
@@ -248,15 +315,17 @@ int main( int argc, char ** argv ) {
 	grb::Launcher< AUTOMATIC > launcher;
 	grb::RC rc = RC::SUCCESS;
 
-	if( ! rc ) { // Build an identity square-matrix
+	if( ! rc ) { // Identity square-matrix
 		Matrix< float > I( n, n );
 		std::vector< size_t > I_rows( n ), I_cols( n );
 		std::vector< float > I_vals( n, 1.f );
 		std::iota( I_rows.begin(), I_rows.end(), 0 );
 		std::iota( I_cols.begin(), I_cols.end(), 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		Matrix< void > mask( n, n );
+		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
 		std::cout << "-- Running test 01: Identity square matrix of size n = " << n << std::endl;
-		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 01 FAILED\n";
 			return 255;
 		}
@@ -269,22 +338,26 @@ int main( int argc, char ** argv ) {
 		std::vector< float > I_vals( n, 1.f );
 		std::iota( I_cols.begin(), I_cols.end(), 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		Matrix< void > mask( n, n );
+		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
 		std::cout << "-- Running test 02: Square matrix of size n = " << n << ", with n 1s on the first row" << std::endl;
-		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 02 FAILED\n";
 			return 255;
 		}
 		std::cout << std::endl;
 	}
 
-	if( ! rc ) { // Build a square-matrix with n 1s on the first column
+	if( ! rc ) { // Square-matrix with n 1s on the first column
 		Matrix< float > I( n, n );
 		std::vector< size_t > I_rows( n ), I_cols( n, 0 );
 		std::vector< float > I_vals( n, 1.f );
 		std::iota( I_rows.begin(), I_rows.end(), 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		Matrix< void > mask( n, n );
+		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
 		std::cout << "-- Running test 03: Square matrix of size n = " << n << ", with n 1s on the first column" << std::endl;
-		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 03 FAILED\n";
 			return 255;
 		}
@@ -296,10 +369,12 @@ int main( int argc, char ** argv ) {
 		std::vector< size_t > I_rows( 2 * n - 1, 0 ), I_cols( 2 * n - 1, 0 );
 		std::vector< float > I_vals( 2 * n - 1, 1.f );
 		std::iota( I_rows.begin() + n, I_rows.end(), 1 );
-		std::iota( I_cols.begin() + n, I_cols.end(), 1 );
+		std::iota( I_cols.begin(), I_cols.begin() + n, 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		Matrix< void > mask( n, n );
+		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
 		std::cout << "-- Running test 04: Square matrix of size n = " << n << ", with n 1s on the first row and column" << std::endl;
-		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 04 FAILED\n";
 			return 255;
 		}
@@ -312,8 +387,10 @@ int main( int argc, char ** argv ) {
 		std::vector< float > I_vals( n, 1.f );
 		std::iota( I_cols.begin(), I_cols.end(), 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		Matrix< void > mask( 1, n );
+		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
 		std::cout << "-- Running test 05: [1-row, n = " << n << " columns] matrix, filled with 1s" << std::endl;
-		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 04 FAILED\n";
 			return 255;
 		}
@@ -326,8 +403,10 @@ int main( int argc, char ** argv ) {
 		std::vector< float > I_vals( n, 1.f );
 		std::iota( I_rows.begin(), I_rows.end(), 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		Matrix< void > mask( n, 1 );
+		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
 		std::cout << "-- Running test 06: [n = " << n << " rows, 1 column] matrix, filled with 1s" << std::endl;
-		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 06 FAILED\n";
 			return 255;
 		}

From 874690981edf9b9aa86a2664b39aab8c9d61a722 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Tue, 13 Jun 2023 12:02:49 +0200
Subject: [PATCH 13/63] Fuse foldlr_generic and foldl_generic

---
 include/graphblas/reference/blas3.hpp | 162 ++------------------------
 1 file changed, 7 insertions(+), 155 deletions(-)

diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp
index 35b505445..9e6de49f7 100644
--- a/include/graphblas/reference/blas3.hpp
+++ b/include/graphblas/reference/blas3.hpp
@@ -924,155 +924,7 @@ namespace grb {
 			class Monoid,
 			typename InputType, typename IOType, typename MaskType
 		>
-		RC foldl_generic(
-			IOType &x,
-			const Matrix< InputType, reference > &A,
-			const Matrix< MaskType, reference > &mask,
-			const Monoid &monoid
-		) {
-
-#ifdef _DEBUG
-			std::cout << "In grb::internal::foldl_generic( reference, masked = "
-				<< ( masked ? "true" : "false" ) << " )" << std::endl;
-#endif
-			RC rc = SUCCESS;
-
-			const auto& identity = monoid.template getIdentity< typename Monoid::D3 >();
-			const auto& op = monoid.getOperator();
-
-			const auto &A_raw = internal::getCRS( A );
-			const auto &mask_raw = internal::getCRS( mask );
-			const size_t m = nrows( A );
-			const size_t n = ncols( A );
-			const size_t mask_k_increment = masked ? 1 : 0;
-
-			// Check mask dimensions
-			if( masked && ( m != nrows(mask) || n != ncols(mask) ) ) {
-#ifdef _DEBUG
-				std::cout << "Mask dimensions do not match input matrix dimensions\n";
-#endif
-				return MISMATCH;
-			}
-
-			RC local_rc = rc;
-			auto local_x = identity;
-
-#ifdef _H_GRB_REFERENCE_OMP_BLAS3
-	#pragma omp parallel default(none) shared(A_raw, mask_raw, x, rc, std::cout) firstprivate(local_x, local_rc, m, op, identity)
-#endif
-			{
-				size_t start_row, end_row;
-#ifdef _H_GRB_REFERENCE_OMP_BLAS3
-				config::OMP::localRange( start_row, end_row, 0, m );
-#else
-				start_row = 0;
-				end_row = m;
-#endif
-				for( size_t i = start_row; i < end_row; ++i ) {
-					size_t mask_k = mask_raw.col_start[ i ];
-					for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
-						const size_t k_col = A_raw.row_index[ k ];
-						if( masked ) {
-							// Increment the mask pointer until we find the right column, or an higher one
-							while( mask_raw.row_index[ mask_k ] < k_col && mask_k < mask_raw.col_start[ i + 1 ] ) {
-#ifdef _DEBUG
-								const std::string skip_str( "Skipping masked coordinate: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
-#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
-	#pragma omp critical
-								{
-									std::cout << "[T" << omp_get_thread_num() << "] - " << skip_str;
-								}
-#else
-								std::cout << skip_str;
-#endif
-#endif
-								mask_k += mask_k_increment;
-							}
-							// if there is no value for this coordinate, skip it
-							if( mask_raw.row_index[ mask_k ] != k_col ) {
-#ifdef _DEBUG
-								const std::string skip_str2( "Skipped masked coordinate: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
-#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
-	#pragma omp critical
-								{
-									std::cout << "[T" << omp_get_thread_num() << "] - " << skip_str2;
-								}
-#else
-								std::cout << skip_str2;
-#endif
-#endif
-								continue;
-							}
-
-#ifdef _DEBUG
-							const std::string str( "Mask( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
-#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
-	#pragma omp critical
-							{
-								std::cout << "[T" << omp_get_thread_num() << "] - " << str;
-							}
-#else
-							std::cout << str;
-#endif
-#endif
-						}
-
-						// Increment the mask pointer in order to skip the next while loop (best case)
-						mask_k += mask_k_increment;
-
-						const InputType a_val = A_raw.getValue( k, identity );
-#ifdef _DEBUG
-						const std::string str( "A( " + std::to_string( i ) + ";" + std::to_string( k_col ) + " ) = " + std::to_string( a_val ) + "\n" );
-#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
-	#pragma omp critical
-						{
-							std::cout << "[T" << omp_get_thread_num() << "] - " << str;
-						}
-#else
-						std::cout << str;
-#endif
-						auto x_before = local_x;
-#endif
-						local_rc = local_rc ? local_rc : grb::foldl( local_x, a_val, op );
-#ifdef _DEBUG
-						const std::string str2( "Computing: local_x = op(" + std::to_string( x_before ) + ", " + std::to_string( a_val ) + ") = " + std::to_string( local_x ) + "\n" );
-#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
-	#pragma omp critical
-						{
-							std::cout << "[T" << omp_get_thread_num() << "] - " << str2;
-						}
-#else
-						std::cout << str2;
-#endif
-#endif
-					}
-				}
-
-#ifdef _H_GRB_REFERENCE_OMP_BLAS3
-	#pragma omp critical
-#endif
-				{
-#ifdef _DEBUG
-					auto x_before = x;
-#endif
-					local_rc = local_rc ? local_rc : grb::foldl( x, local_x, op );
-#ifdef _DEBUG
-					std::cout << "Computing x: op(" << x_before << ", " << local_x << ") = " << x << std::endl;
-#endif
-					rc = rc ? rc : local_rc;
-				}
-			}
-
-			return rc;
-		}
-
-		template<
-			bool masked,
-			Descriptor descr = descriptors::no_operation,
-			class Monoid,
-			typename InputType, typename IOType, typename MaskType
-		>
-		RC foldr_generic(
+		RC fold_generic(
 			IOType &x,
 			const Matrix< InputType, reference > &A,
 			const Matrix< MaskType, reference > &mask,
@@ -1181,7 +1033,7 @@ namespace grb {
 #endif
 						auto x_before = local_x;
 #endif
-						local_rc = local_rc ? local_rc : grb::foldr( a_val, local_x, op );
+						local_rc = local_rc ? local_rc : grb::apply< descr >( local_x, local_x, a_val, op );
 #ifdef _DEBUG
 						const std::string str2( "Computing: local_x = op(" + std::to_string( a_val ) + ", " + std::to_string( x_before ) + ") = " + std::to_string( local_x ) + "\n" );
 #if defined(_H_GRB_REFERENCE_OMP_BLAS3)
@@ -1203,7 +1055,7 @@ namespace grb {
 #ifdef _DEBUG
 					auto x_before = x;
 #endif
-					local_rc = local_rc ? local_rc : grb::foldr( local_x, x, op );
+					local_rc = local_rc ? local_rc : grb::apply< descr >( x, x, local_x, op );
 #ifdef _DEBUG
 					std::cout << "Computing x: op(" << local_x << ", " << x_before << ") = " << x << std::endl;
 #endif
@@ -1667,7 +1519,7 @@ namespace grb {
 		std::cout << "In grb::foldr (reference,  mask, matrix, monoid)\n";
 #endif
 
-		return internal::foldr_generic< true, descr, Monoid, InputType, IOType, MaskType >(
+		return internal::fold_generic< true, descr, Monoid, InputType, IOType, MaskType >(
 			x, A, mask, monoid
 		);
 	}
@@ -1715,7 +1567,7 @@ namespace grb {
 #endif
 
 		Matrix< void, reference > empty_mask( nrows( A ), ncols( A ) );
-		return internal::foldr_generic< false, descr, Monoid, InputType, IOType, void >(
+		return internal::fold_generic< false, descr, Monoid, InputType, IOType, void >(
 			x, A, empty_mask, monoid
 		);
 	}
@@ -1765,7 +1617,7 @@ namespace grb {
 		std::cout << "In grb::foldl (reference, mask, matrix, monoid)\n";
 #endif
 
-		return internal::foldl_generic< true, descr, Monoid, InputType, IOType, MaskType >(
+		return internal::fold_generic< true, descr, Monoid, InputType, IOType, MaskType >(
 			x, A, mask, monoid
 		);
 	}
@@ -1814,7 +1666,7 @@ namespace grb {
 #endif
 
 		Matrix< void, reference > empty_mask( nrows( A ), ncols( A ) );
-		return internal::foldl_generic< false, descr, Monoid, InputType, IOType, void >(
+		return internal::fold_generic< false, descr, Monoid, InputType, IOType, void >(
 			x, A, empty_mask, monoid
 		);
 	}

From 5db9121a59bfdfaaf4ae0935009d4781e6f666a8 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Tue, 13 Jun 2023 14:07:39 +0200
Subject: [PATCH 14/63] Taking into account mask values

---
 include/graphblas/blas0.hpp           | 32 +++++++++++++++++++++++++++
 include/graphblas/reference/blas3.hpp | 21 ++++++++++++++++--
 2 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/include/graphblas/blas0.hpp b/include/graphblas/blas0.hpp
index 751b2cf14..94fed7cdb 100644
--- a/include/graphblas/blas0.hpp
+++ b/include/graphblas/blas0.hpp
@@ -34,6 +34,7 @@
 #include "graphblas/descriptors.hpp"
 #include "graphblas/rc.hpp"
 #include "graphblas/type_traits.hpp"
+#include "graphblas/identities.hpp"
 
 #define NO_CAST_ASSERT( x, y, z )                                                  \
 	static_assert( x,                                                              \
@@ -604,6 +605,37 @@ namespace grb {
 
 		};
 
+		template< typename MaskType >
+		struct MaskHasValue {
+
+			public:
+				template < Descriptor descr = descriptors::no_operation, typename MaskStruct >
+				MaskHasValue( const MaskStruct& mask_raw, const size_t k ) {
+						bool hasValue = mask_raw.getValue( k, identities::logical_false<MaskType>() );
+						if (descr & grb::descriptors::invert_mask) {
+							hasValue = !hasValue;
+						}
+						value = hasValue;
+					}
+
+				const bool value;
+		};
+
+		template<>
+		struct MaskHasValue< void > {
+
+			public:
+				template < Descriptor descr = descriptors::no_operation, typename MaskStruct >
+				MaskHasValue( const MaskStruct& mask_raw, const size_t k ) :
+				value(not (descr & grb::descriptors::invert_mask)){
+					(void) mask_raw;
+					(void) k;
+				}
+
+				const bool value;
+
+		};
+
 	} // namespace internal
 
 } // namespace grb
diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp
index 9e6de49f7..ae0badf2f 100644
--- a/include/graphblas/reference/blas3.hpp
+++ b/include/graphblas/reference/blas3.hpp
@@ -930,7 +930,7 @@ namespace grb {
 			const Matrix< MaskType, reference > &mask,
 			const Monoid &monoid
 		) {
-
+#define _DEBUG
 #ifdef _DEBUG
 			std::cout << "In grb::internal::foldr_generic( reference, masked = "
 				<< ( masked ? "true" : "false" ) << " )" << std::endl;
@@ -1015,8 +1015,25 @@ namespace grb {
 							std::cout << str;
 #endif
 #endif
+							// Get mask value
+							if( not MaskHasValue< MaskType >( mask_raw, mask_k ).value ) {
+#ifdef _DEBUG
+								const std::string skip_str3( "Skipped masked value at: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
+#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
+	#pragma omp critical
+								{
+									std::cout << "[T" << omp_get_thread_num() << "] - " << skip_str3;
+								}
+#else
+								std::cout << skip_str3;
+#endif
+#endif
+								continue;
+							}
 						}
 
+						
+
 						// Increment the mask pointer in order to skip the next while loop (best case)
 						mask_k += mask_k_increment;
 
@@ -1062,7 +1079,7 @@ namespace grb {
 					rc = rc ? rc : local_rc;
 				}
 			}
-
+#undef _DEBUG
 			return rc;
 		}
 

From e6101b98802225f9375366991c2ed211facf9ad3 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Wed, 14 Jun 2023 10:58:26 +0200
Subject: [PATCH 15/63] Separate masked and unmasked versions to benchmark

---
 include/graphblas/reference/blas3.hpp | 229 +++++++++++++-------------
 1 file changed, 118 insertions(+), 111 deletions(-)

diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp
index ae0badf2f..eb8bcf9f3 100644
--- a/include/graphblas/reference/blas3.hpp
+++ b/include/graphblas/reference/blas3.hpp
@@ -57,6 +57,24 @@
 		"********************************************************************" \
 		"******************************\n" );
 
+#define OMP_CRITICAL _Pragma("omp critical")
+
+#ifndef _DEBUG_THREADESAFE_PRINT
+	#ifndef _DEBUG
+		#define _DEBUG_THREADESAFE_PRINT( msg )
+	#else
+		#ifdef _H_GRB_REFERENCE_OMP_BLAS3
+			#define _DEBUG_THREADESAFE_PRINT( msg ) \
+				OMP_CRITICAL \
+					{ \
+						std::cout << "[T" << omp_get_thread_num() << "] - " << msg << std::flush; \
+					}
+		#else
+			#define _DEBUG_THREADESAFE_PRINT( msg ) std::cout << msg << std::flush;
+		#endif
+	#endif
+#endif
+
 namespace grb {
 
 	namespace internal {
@@ -919,22 +937,81 @@ namespace grb {
 	namespace internal {
 
 		template<
-			bool masked,
 			Descriptor descr = descriptors::no_operation,
 			class Monoid,
 			typename InputType, typename IOType, typename MaskType
 		>
-		RC fold_generic(
+		RC fold_unmasked_generic(
 			IOType &x,
 			const Matrix< InputType, reference > &A,
-			const Matrix< MaskType, reference > &mask,
 			const Monoid &monoid
 		) {
-#define _DEBUG
+			_DEBUG_THREADESAFE_PRINT( "In grb::internal::foldr_unmasked_generic( reference )\n" );
+			RC rc = SUCCESS;
+
+			const auto& identity = monoid.template getIdentity< typename Monoid::D3 >();
+			const auto& op = monoid.getOperator();
+
+			const auto &A_raw = internal::getCRS( A );
+			const size_t A_nnz = nnz( A );
+			if( grb::nnz( A ) == 0 ) {
+				x = identity;
+				return rc;
+			}
+
+			RC local_rc = rc;
+			auto local_x = identity;
+
+#ifdef _H_GRB_REFERENCE_OMP_BLAS3
+	#pragma omp parallel default(none) shared(A_raw, x, rc, std::cout) firstprivate(local_x, local_rc, A_nnz, op, identity)
+#endif
+			{
+				size_t start = 0;
+				size_t end = A_nnz;
+#ifdef _H_GRB_REFERENCE_OMP_BLAS3
+				config::OMP::localRange( start, end, 0, A_nnz );
+#endif
+				
+				for( size_t idx = start; idx < end; ++idx ) {
+					// Get A value
+					const InputType a_val = A_raw.values[ idx ];
+					_DEBUG_THREADESAFE_PRINT( "A.CRS.values[ " + std::to_string( idx ) + " ] = " + std::to_string( a_val ) + "\n" );
+				
+					// Compute the fold for this coordinate
+					auto local_x_before = local_x;
+					local_rc = local_rc ? local_rc : grb::apply< descr >( local_x, local_x_before, a_val, op );
+					_DEBUG_THREADESAFE_PRINT( "Computing: local_x = op(" + std::to_string( a_val ) + ", " + std::to_string( local_x_before ) + ") = " + std::to_string( local_x ) + "\n" );
+				}
+				
+
+#ifdef _H_GRB_REFERENCE_OMP_BLAS3
+	#pragma omp critical
+#endif
+				{ // Reduction with the global result (critical section if OpenMP)
+					auto x_before = x;
+					local_rc = local_rc ? local_rc : grb::apply< descr >( x, x_before, local_x, op );
 #ifdef _DEBUG
-			std::cout << "In grb::internal::foldr_generic( reference, masked = "
-				<< ( masked ? "true" : "false" ) << " )" << std::endl;
+					std::cout << "Computing x: op(" << local_x << ", " << x_before << ") = " << x << std::endl;
 #endif
+					rc = rc ? rc : local_rc;
+				}
+			}
+
+			return rc;
+		}
+
+		template<
+			Descriptor descr = descriptors::no_operation,
+			class Monoid,
+			typename InputType, typename IOType, typename MaskType
+		>
+		RC fold_masked_generic(
+			IOType &x,
+			const Matrix< InputType, reference > &A,
+			const Matrix< MaskType, reference > &mask,
+			const Monoid &monoid
+		) {
+			_DEBUG_THREADESAFE_PRINT( "In grb::internal::foldr_masked_generic( reference )\n" );
 			RC rc = SUCCESS;
 
 			const auto& identity = monoid.template getIdentity< typename Monoid::D3 >();
@@ -944,13 +1021,10 @@ namespace grb {
 			const auto &mask_raw = internal::getCRS( mask );
 			const size_t m = nrows( A );
 			const size_t n = ncols( A );
-			const size_t mask_k_increment = masked ? 1 : 0;
 
 			// Check mask dimensions
-			if( masked && ( m != nrows(mask) || n != ncols(mask) ) ) {
-#ifdef _DEBUG
-				std::cout << "Mask dimensions do not match input matrix dimensions\n";
-#endif
+			if( m != nrows(mask) || n != ncols(mask) ) {
+				_DEBUG_THREADESAFE_PRINT( "Mask dimensions do not match input matrix dimensions\n" );
 				return MISMATCH;
 			}
 
@@ -961,125 +1035,60 @@ namespace grb {
 	#pragma omp parallel default(none) shared(A_raw, mask_raw, x, rc, std::cout) firstprivate(local_x, local_rc, m, op, identity)
 #endif
 			{
-				size_t start_row, end_row;
+				size_t start_row = 0;
+				size_t end_row = m;
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
 				config::OMP::localRange( start_row, end_row, 0, m );
-#else
-				start_row = 0;
-				end_row = m;
 #endif
 				for( size_t i = start_row; i < end_row; ++i ) {
 					size_t mask_k = mask_raw.col_start[ i ];
 					for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
 						const size_t k_col = A_raw.row_index[ k ];
-						if( masked ) {
-							// Increment the mask pointer until we find the right column, or an higher one
-							while( mask_raw.row_index[ mask_k ] < k_col && mask_k < mask_raw.col_start[ i + 1 ] ) {
-#ifdef _DEBUG
-								const std::string skip_str( "Skipping masked coordinate: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
-#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
-	#pragma omp critical
-								{
-									std::cout << "[T" << omp_get_thread_num() << "] - " << skip_str;
-								}
-#else
-								std::cout << skip_str;
-#endif
-#endif
-								mask_k += mask_k_increment;
-							}
-							// if there is no value for this coordinate, skip it
-							if( mask_raw.row_index[ mask_k ] != k_col ) {
-#ifdef _DEBUG
-								const std::string skip_str2( "Skipped masked coordinate: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
-#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
-	#pragma omp critical
-								{
-									std::cout << "[T" << omp_get_thread_num() << "] - " << skip_str2;
-								}
-#else
-								std::cout << skip_str2;
-#endif
-#endif
-								continue;
-							}
 
-#ifdef _DEBUG
-							const std::string str( "Mask( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
-#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
-	#pragma omp critical
-							{
-								std::cout << "[T" << omp_get_thread_num() << "] - " << str;
-							}
-#else
-							std::cout << str;
-#endif
-#endif
-							// Get mask value
-							if( not MaskHasValue< MaskType >( mask_raw, mask_k ).value ) {
-#ifdef _DEBUG
-								const std::string skip_str3( "Skipped masked value at: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
-#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
-	#pragma omp critical
-								{
-									std::cout << "[T" << omp_get_thread_num() << "] - " << skip_str3;
-								}
-#else
-								std::cout << skip_str3;
-#endif
-#endif
-								continue;
-							}
+						// Increment the mask pointer until we find the right column, or an higher one
+						while( mask_raw.row_index[ mask_k ] < k_col && mask_k < mask_raw.col_start[ i + 1 ] ) {
+							_DEBUG_THREADESAFE_PRINT( "Skipping masked coordinate: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
+							mask_k++;
+						}
+						// if there is no value for this coordinate, skip it
+						if( mask_raw.row_index[ mask_k ] != k_col ) {
+							_DEBUG_THREADESAFE_PRINT( "Skipped masked coordinate: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
+							continue;
 						}
 
-						
+						// Get mask value
+						if( not MaskHasValue< MaskType >( mask_raw, mask_k ).value ) {
+							_DEBUG_THREADESAFE_PRINT( "Skipped masked value at: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
+							continue;
+						}
 
 						// Increment the mask pointer in order to skip the next while loop (best case)
-						mask_k += mask_k_increment;
+						mask_k++;
 
+						// Get A value
 						const InputType a_val = A_raw.getValue( k, identity );
-#ifdef _DEBUG
-						const std::string str( "A( " + std::to_string( i ) + ";" + std::to_string( k_col ) + " ) = " + std::to_string( a_val ) + "\n" );
-#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
-	#pragma omp critical
-						{
-							std::cout << "[T" << omp_get_thread_num() << "] - " << str;
-						}
-#else
-						std::cout << str;
-#endif
-						auto x_before = local_x;
-#endif
-						local_rc = local_rc ? local_rc : grb::apply< descr >( local_x, local_x, a_val, op );
-#ifdef _DEBUG
-						const std::string str2( "Computing: local_x = op(" + std::to_string( a_val ) + ", " + std::to_string( x_before ) + ") = " + std::to_string( local_x ) + "\n" );
-#if defined(_H_GRB_REFERENCE_OMP_BLAS3)
-	#pragma omp critical
-						{
-							std::cout << "[T" << omp_get_thread_num() << "] - " << str2;
-						}
-#else
-						std::cout << str2;
-#endif
-#endif
+						_DEBUG_THREADESAFE_PRINT( "A( " + std::to_string( i ) + ";" + std::to_string( k_col ) + " ) = " + std::to_string( a_val ) + "\n" );
+						
+						// Compute the fold for this coordinate
+						auto local_x_before = local_x;
+						local_rc = local_rc ? local_rc : grb::apply< descr >( local_x, local_x_before, a_val, op );
+						_DEBUG_THREADESAFE_PRINT( "Computing: local_x = op(" + std::to_string( a_val ) + ", " + std::to_string( local_x_before ) + ") = " + std::to_string( local_x ) + "\n" );
 					}
 				}
 
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
 	#pragma omp critical
 #endif
-				{
-#ifdef _DEBUG
+				{ // Reduction with the global result (critical section if OpenMP)
 					auto x_before = x;
-#endif
-					local_rc = local_rc ? local_rc : grb::apply< descr >( x, x, local_x, op );
+					local_rc = local_rc ? local_rc : grb::apply< descr >( x, x_before, local_x, op );
 #ifdef _DEBUG
 					std::cout << "Computing x: op(" << local_x << ", " << x_before << ") = " << x << std::endl;
 #endif
 					rc = rc ? rc : local_rc;
 				}
 			}
-#undef _DEBUG
+
 			return rc;
 		}
 
@@ -1536,7 +1545,7 @@ namespace grb {
 		std::cout << "In grb::foldr (reference,  mask, matrix, monoid)\n";
 #endif
 
-		return internal::fold_generic< true, descr, Monoid, InputType, IOType, MaskType >(
+		return internal::fold_masked_generic< descr, Monoid, InputType, IOType, MaskType >(
 			x, A, mask, monoid
 		);
 	}
@@ -1583,9 +1592,8 @@ namespace grb {
 		std::cout << "In grb::foldr (reference, matrix, op)\n";
 #endif
 
-		Matrix< void, reference > empty_mask( nrows( A ), ncols( A ) );
-		return internal::fold_generic< false, descr, Monoid, InputType, IOType, void >(
-			x, A, empty_mask, monoid
+		return internal::fold_unmasked_generic< descr, Monoid, InputType, IOType, void >(
+			x, A, monoid
 		);
 	}
 
@@ -1634,7 +1642,7 @@ namespace grb {
 		std::cout << "In grb::foldl (reference, mask, matrix, monoid)\n";
 #endif
 
-		return internal::fold_generic< true, descr, Monoid, InputType, IOType, MaskType >(
+		return internal::fold_masked_generic< descr, Monoid, InputType, IOType, MaskType >(
 			x, A, mask, monoid
 		);
 	}
@@ -1682,9 +1690,8 @@ namespace grb {
 		std::cout << "In grb::foldl (reference, matrix, monoid)\n";
 #endif
 
-		Matrix< void, reference > empty_mask( nrows( A ), ncols( A ) );
-		return internal::fold_generic< false, descr, Monoid, InputType, IOType, void >(
-			x, A, empty_mask, monoid
+		return internal::fold_unmasked_generic< descr, Monoid, InputType, IOType, void >(
+			x, A, monoid
 		);
 	}
 

From a4bb7a53950611500dcd7a2a355b67b73bb91314 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Thu, 15 Jun 2023 09:44:52 +0200
Subject: [PATCH 16/63] Add benchmarking quick to matrixReduce test

---
 tests/unit/fold_matrix_to_scalar.cpp | 146 +++++++++++++++------------
 1 file changed, 81 insertions(+), 65 deletions(-)

diff --git a/tests/unit/fold_matrix_to_scalar.cpp b/tests/unit/fold_matrix_to_scalar.cpp
index d27a1fd85..08dbe3bed 100644
--- a/tests/unit/fold_matrix_to_scalar.cpp
+++ b/tests/unit/fold_matrix_to_scalar.cpp
@@ -39,11 +39,15 @@
 
 using namespace grb;
 
-constexpr bool PRINT_TIMERS = false;
+using NzType = double;
+
+
+constexpr bool PRINT_TIMERS = true;
 constexpr bool SKIP_FOLDL = false;
 constexpr bool SKIP_FOLDR = false;
 constexpr bool SKIP_UNMASKED = false;
 constexpr bool SKIP_MASKED = false;
+constexpr size_t ITERATIONS = 100;
 
 template< typename T, typename V, class Monoid >
 RC foldl_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, const grb::Matrix< void > & mask, T initial, T expected, const Monoid & monoid ) {
@@ -54,9 +58,12 @@ RC foldl_test( const char * test_label, const char * test_description, const grb
 	if( rc == RC::SUCCESS && ! SKIP_UNMASKED ) { // Unmasked
 		T value = initial;
 		auto start_chrono = std::chrono::high_resolution_clock::now();
-		foldl( value, A, monoid );
+		for( size_t _ = 0; _ < ITERATIONS; _++ ) {
+			value = initial;
+			foldl( value, A, monoid );
+		}
 		auto end_chrono = std::chrono::high_resolution_clock::now();
-		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
+		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono ) / ITERATIONS;
 		if( PRINT_TIMERS )
 			std::cout << "foldl (unmasked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
 
@@ -76,9 +83,12 @@ RC foldl_test( const char * test_label, const char * test_description, const grb
 	if( rc == RC::SUCCESS && ! SKIP_MASKED ) { // Masked
 		T value = initial;
 		auto start_chrono = std::chrono::high_resolution_clock::now();
-		foldl( value, A, mask, monoid );
+		for( size_t _ = 0; _ < ITERATIONS; _++ ) {
+			value = initial;
+			foldl( value, A, mask, monoid );
+		}
 		auto end_chrono = std::chrono::high_resolution_clock::now();
-		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
+		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono ) / ITERATIONS;
 		if( PRINT_TIMERS )
 			std::cout << "foldl (masked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
 
@@ -104,12 +114,15 @@ RC foldr_test( const char * test_label, const char * test_description, const grb
 		return RC::SUCCESS;
 	RC rc = RC::SUCCESS;
 
-	if( rc == RC::SUCCESS ) { // Unmasked
+	if( rc == RC::SUCCESS && ! SKIP_UNMASKED ) { // Unmasked
 		T value = initial;
 		auto start_chrono = std::chrono::high_resolution_clock::now();
-		foldr( value, A, monoid );
+		for( size_t _ = 0; _ < ITERATIONS; _++ ) {
+			value = initial;
+			foldr( value, A, monoid );
+		}
 		auto end_chrono = std::chrono::high_resolution_clock::now();
-		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
+		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono ) / ITERATIONS;
 		if( PRINT_TIMERS )
 			std::cout << "foldr (unmasked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
 
@@ -126,12 +139,15 @@ RC foldr_test( const char * test_label, const char * test_description, const grb
 		rc = rc ? rc : ( value == expected ? RC::SUCCESS : RC::FAILED );
 	}
 
-	if( rc == RC::SUCCESS ) { // Masked
+	if( rc == RC::SUCCESS && ! SKIP_MASKED ) { // Masked
 		T value = initial;
 		auto start_chrono = std::chrono::high_resolution_clock::now();
-		foldr( value, A, mask, monoid );
+		for( size_t _ = 0; _ < ITERATIONS; _++ ) {
+			value = initial;
+			foldr( value, A, mask, monoid );
+		}
 		auto end_chrono = std::chrono::high_resolution_clock::now();
-		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
+		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono ) / ITERATIONS;
 		if( PRINT_TIMERS )
 			std::cout << "foldr (masked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
 
@@ -158,12 +174,12 @@ RC foldLR_test( const char * test_label, const char * test_description, const gr
 }
 
 struct input {
-	const grb::Matrix< float > & A;
+	const grb::Matrix< NzType > & A;
 	const grb::Matrix< void > & mask;
 };
 
 void grb_program( const input & in, grb::RC & rc ) {
-	const grb::Matrix< float > & I = in.A;
+	const grb::Matrix< NzType > & I = in.A;
 	const grb::Matrix< void > & mask = in.mask;
 
 	const long n = grb::nnz( I );
@@ -174,7 +190,7 @@ void grb_program( const input & in, grb::RC & rc ) {
 	 *  * Expected unmasked result: n
 	 *  * Expected masked result: 0
 	 */
-	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (float)0, (float)n, Monoid< operators::add< float >, identities::zero >() );
+	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (NzType)0, (NzType)n, Monoid< operators::add< NzType >, identities::zero >() );
 	if( rc )
 		return;
 	return;
@@ -185,35 +201,35 @@ void grb_program( const input & in, grb::RC & rc ) {
 	 *  * Expected result: 2*n
 	 */
 	rc = foldLR_test(
-		"2", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (float)n, (float)( 2 * n ), Monoid< operators::add< float >, identities::zero >() );
+		"2", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (NzType)n, (NzType)( 2 * n ), Monoid< operators::add< NzType >, identities::zero >() );
 	if( rc )
 		return;
 
 	/**     Test case 3:
-	 *  A simple additive reduction with different types for the nzs and the reduction result (int <- int + float).
+	 *  A simple additive reduction with different types for the nzs and the reduction result (int <- int + NzType).
 	 *  * Initial value is 0
 	 *  * Expected result: n
 	 */
-	rc = foldl_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int + float).", I, mask, (int)0, (int)n,
-		Monoid< operators::add< int, float, int >, identities::zero >() );
+	rc = foldl_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int + NzType).", I, mask, (int)0, (int)n,
+		Monoid< operators::add< int, NzType, int >, identities::zero >() );
 	if( rc )
 		return;
-	rc = foldr_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- float + int).", I, mask, (int)0, (int)n,
-		Monoid< operators::add< float, int, int >, identities::zero >() );
+	rc = foldr_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- NzType + int).", I, mask, (int)0, (int)n,
+		Monoid< operators::add< NzType, int, int >, identities::zero >() );
 	if( rc )
 		return;
 
 	/**     Test case 4:
-	 *  A simple additive reduction with different types for the nzs and the reduction result (int <- int + float).
+	 *  A simple additive reduction with different types for the nzs and the reduction result (int <- int + NzType).
 	 *  * Initial value is n
 	 *  * Expected result: 2*n
 	 */
-	rc = foldl_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int + float).", I, mask, (int)n, (int)( 2 * n ),
-		Monoid< operators::add< int, float, int >, identities::zero >() );
+	rc = foldl_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int + NzType).", I, mask, (int)n, (int)( 2 * n ),
+		Monoid< operators::add< int, NzType, int >, identities::zero >() );
 	if( rc )
 		return;
-	rc = foldr_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- float + int).", I, mask, (int)n, (int)( 2 * n ),
-		Monoid< operators::add< float, int, int >, identities::zero >() );
+	rc = foldr_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- NzType + int).", I, mask, (int)n, (int)( 2 * n ),
+		Monoid< operators::add< NzType, int, int >, identities::zero >() );
 	if( rc )
 		return;
 
@@ -222,7 +238,7 @@ void grb_program( const input & in, grb::RC & rc ) {
 	 * * Initial value is 0
 	 * * Expected result: 0
 	 */
-	rc = foldLR_test( "5", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, mask, (float)0, (float)0, Monoid< operators::mul< float >, identities::one >() );
+	rc = foldLR_test( "5", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, mask, (NzType)0, (NzType)0, Monoid< operators::mul< NzType >, identities::one >() );
 	if( rc )
 		return;
 
@@ -231,63 +247,63 @@ void grb_program( const input & in, grb::RC & rc ) {
 	 * * Initial value is 1
 	 * * Expected result: 1
 	 */
-	rc = foldLR_test( "6", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, mask, (float)1, (float)1, Monoid< operators::mul< float >, identities::one >() );
+	rc = foldLR_test( "6", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, mask, (NzType)1, (NzType)1, Monoid< operators::mul< NzType >, identities::one >() );
 	if( rc )
 		return;
 
 	/**     Test case 7:
-	 * A simple multiplicative reduction with different types for the nzs and the reduction result (size_t <- size_t * float).
+	 * A simple multiplicative reduction with different types for the nzs and the reduction result (size_t <- size_t * NzType).
 	 * * Initial value is 0
 	 * * Expected result: 0
 	 */
-	rc = foldl_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, mask, (size_t)0, (size_t)0,
-		Monoid< operators::mul< size_t, float, size_t >, identities::one >() );
+	rc = foldl_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)0, (size_t)0,
+		Monoid< operators::mul< size_t, NzType, size_t >, identities::one >() );
 	if( rc )
 		return;
-	rc = foldr_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, mask, (size_t)0, (size_t)0,
-		Monoid< operators::mul< float, size_t, size_t >, identities::one >() );
+	rc = foldr_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)0, (size_t)0,
+		Monoid< operators::mul< NzType, size_t, size_t >, identities::one >() );
 	if( rc )
 		return;
 
 	/**     Test case 8:
-	 * A simple multiplicative reduction with different types for the nzs and the reduction result (size_t <- size_t * float).
+	 * A simple multiplicative reduction with different types for the nzs and the reduction result (size_t <- size_t * NzType).
 	 * * Initial value is 1
 	 * * Expected result: 1
 	 */
-	rc = foldl_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, mask, (size_t)1, (size_t)1,
-		Monoid< operators::mul< size_t, float, size_t >, identities::one >() );
+	rc = foldl_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)1, (size_t)1,
+		Monoid< operators::mul< size_t, NzType, size_t >, identities::one >() );
 	if( rc )
 		return;
-	rc = foldr_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, mask, (size_t)1, (size_t)1,
-		Monoid< operators::mul< float, size_t, size_t >, identities::one >() );
+	rc = foldr_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)1, (size_t)1,
+		Monoid< operators::mul< NzType, size_t, size_t >, identities::one >() );
 	if( rc )
 		return;
 
 	/**     Test case 9:
-	 * A simple binary equal reduction with different types for the nzs and the reduction result (bool <- bool == float).
+	 * A simple binary equal reduction with different types for the nzs and the reduction result (bool <- bool == NzType).
 	 * * Initial value is true
 	 * * Expected result: true
 	 */
-	rc = foldl_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == float).", I, mask, (bool)true, (bool)true,
-		Monoid< operators::equal< bool, float, bool >, identities::logical_true >() );
+	rc = foldl_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == NzType).", I, mask, (bool)true, (bool)true,
+		Monoid< operators::equal< bool, NzType, bool >, identities::logical_true >() );
 	if( rc )
 		return;
-	rc = foldr_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == float).", I, mask, (bool)true, (bool)true,
-		Monoid< operators::equal< float, bool, bool >, identities::logical_true >() );
+	rc = foldr_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == NzType).", I, mask, (bool)true, (bool)true,
+		Monoid< operators::equal< NzType, bool, bool >, identities::logical_true >() );
 	if( rc )
 		return;
 
 	/**     Test case 10:
-	 * A simple binary logical_or reduction with different types for the nzs and the reduction result (bool <- bool || float).
+	 * A simple binary logical_or reduction with different types for the nzs and the reduction result (bool <- bool || NzType).
 	 * * Initial value is false
 	 * * Expected result: true
 	 */
-	rc = foldl_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || float).", I, mask, (bool)false, (bool)true,
-		Monoid< operators::logical_or< bool, float, bool >, identities::logical_false >() );
+	rc = foldl_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || NzType).", I, mask, (bool)false, (bool)true,
+		Monoid< operators::logical_or< bool, NzType, bool >, identities::logical_false >() );
 	if( rc )
 		return;
-	rc = foldr_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || float).", I, mask, (bool)false, (bool)true,
-		Monoid< operators::logical_or< float, bool, bool >, identities::logical_false >() );
+	rc = foldr_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || NzType).", I, mask, (bool)false, (bool)true,
+		Monoid< operators::logical_or< NzType, bool, bool >, identities::logical_false >() );
 	if( rc )
 		return;
 }
@@ -316,9 +332,9 @@ int main( int argc, char ** argv ) {
 	grb::RC rc = RC::SUCCESS;
 
 	if( ! rc ) { // Identity square-matrix
-		Matrix< float > I( n, n );
+		Matrix< NzType > I( n, n );
 		std::vector< size_t > I_rows( n ), I_cols( n );
-		std::vector< float > I_vals( n, 1.f );
+		std::vector< NzType > I_vals( n, 1.f );
 		std::iota( I_rows.begin(), I_rows.end(), 0 );
 		std::iota( I_cols.begin(), I_cols.end(), 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
@@ -329,13 +345,13 @@ int main( int argc, char ** argv ) {
 			std::cerr << "Launching test 01 FAILED\n";
 			return 255;
 		}
-		std::cout << std::endl;
+		std::cout << std::endl << std::flush;
 	}
 
 	if( ! rc ) { // Build a square-matrix with n 1s on the first row
-		Matrix< float > I( n, n );
+		Matrix< NzType > I( n, n );
 		std::vector< size_t > I_rows( n, 0 ), I_cols( n );
-		std::vector< float > I_vals( n, 1.f );
+		std::vector< NzType > I_vals( n, 1.f );
 		std::iota( I_cols.begin(), I_cols.end(), 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
 		Matrix< void > mask( n, n );
@@ -345,13 +361,13 @@ int main( int argc, char ** argv ) {
 			std::cerr << "Launching test 02 FAILED\n";
 			return 255;
 		}
-		std::cout << std::endl;
+		std::cout << std::endl << std::flush;
 	}
 
 	if( ! rc ) { // Square-matrix with n 1s on the first column
-		Matrix< float > I( n, n );
+		Matrix< NzType > I( n, n );
 		std::vector< size_t > I_rows( n ), I_cols( n, 0 );
-		std::vector< float > I_vals( n, 1.f );
+		std::vector< NzType > I_vals( n, 1.f );
 		std::iota( I_rows.begin(), I_rows.end(), 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
 		Matrix< void > mask( n, n );
@@ -361,13 +377,13 @@ int main( int argc, char ** argv ) {
 			std::cerr << "Launching test 03 FAILED\n";
 			return 255;
 		}
-		std::cout << std::endl;
+		std::cout << std::endl << std::flush;
 	}
 
 	if( ! rc ) { // Building a square-matrix with n 1s on the first row and column
-		Matrix< float > I( n, n );
+		Matrix< NzType > I( n, n );
 		std::vector< size_t > I_rows( 2 * n - 1, 0 ), I_cols( 2 * n - 1, 0 );
-		std::vector< float > I_vals( 2 * n - 1, 1.f );
+		std::vector< NzType > I_vals( 2 * n - 1, 1.f );
 		std::iota( I_rows.begin() + n, I_rows.end(), 1 );
 		std::iota( I_cols.begin(), I_cols.begin() + n, 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
@@ -378,13 +394,13 @@ int main( int argc, char ** argv ) {
 			std::cerr << "Launching test 04 FAILED\n";
 			return 255;
 		}
-		std::cout << std::endl;
+		std::cout << std::endl << std::flush;
 	}
 
 	if( ! rc ) { // Building a [1 row, n columns] matrix filled with 1s
-		Matrix< float > I( 1, n );
+		Matrix< NzType > I( 1, n );
 		std::vector< size_t > I_rows( n, 0 ), I_cols( n, 0 );
-		std::vector< float > I_vals( n, 1.f );
+		std::vector< NzType > I_vals( n, 1.f );
 		std::iota( I_cols.begin(), I_cols.end(), 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
 		Matrix< void > mask( 1, n );
@@ -394,13 +410,13 @@ int main( int argc, char ** argv ) {
 			std::cerr << "Launching test 04 FAILED\n";
 			return 255;
 		}
-		std::cout << std::endl;
+		std::cout << std::endl << std::flush;
 	}
 
 	if( ! rc ) { // Building a [n rows, 1 column] matrix filled with 1s
-		Matrix< float > I( n, 1 );
+		Matrix< NzType > I( n, 1 );
 		std::vector< size_t > I_rows( n, 0 ), I_cols( n, 0 );
-		std::vector< float > I_vals( n, 1.f );
+		std::vector< NzType > I_vals( n, 1.f );
 		std::iota( I_rows.begin(), I_rows.end(), 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
 		Matrix< void > mask( n, 1 );
@@ -410,7 +426,7 @@ int main( int argc, char ** argv ) {
 			std::cerr << "Launching test 06 FAILED\n";
 			return 255;
 		}
-		std::cout << std::endl;
+		std::cout << std::endl << std::flush;
 	}
 
 	if( rc != SUCCESS ) {

From 28599ab9ce4bd19ebc6e0e71960b4ec666824e91 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Wed, 21 Jun 2023 10:28:32 +0200
Subject: [PATCH 17/63] Explicit templates declaration

---
 include/graphblas/base/blas3.hpp      | 18 ++++++----
 include/graphblas/reference/blas3.hpp | 47 ++++++++++++++++-----------
 2 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/include/graphblas/base/blas3.hpp b/include/graphblas/base/blas3.hpp
index 0b652f4ac..16a6a1575 100644
--- a/include/graphblas/base/blas3.hpp
+++ b/include/graphblas/base/blas3.hpp
@@ -523,12 +523,14 @@ namespace grb {
 		Descriptor descr = descriptors::no_operation,
 		class Monoid,
 		typename InputType, typename IOType, typename MaskType,
+		typename RIT_A, typename CIT_A, typename NIT_A,
+		typename RIT_M, typename CIT_M, typename NIT_M,
 		Backend backend
 	>
 	RC foldr(
 		IOType &x,
-		const Matrix< InputType, backend > &A,
-		const Matrix< MaskType, backend > &mask,
+		const Matrix< InputType, backend, RIT_A, CIT_A, NIT_A > &A,
+		const Matrix< MaskType, backend, RIT_M, CIT_M, NIT_M > &mask,
 		const Monoid &monoid = Monoid(),
 		const typename std::enable_if< !grb::is_object< IOType >::value &&
 			!grb::is_object< InputType >::value &&
@@ -573,11 +575,12 @@ namespace grb {
 		Descriptor descr = descriptors::no_operation,
 		class Monoid,
 		typename InputType, typename IOType,
+		typename RIT, typename CIT, typename NIT,
 		Backend backend
 	>
 	RC foldr(
 		IOType &x,
-		const Matrix< InputType, backend > &A,
+		const Matrix< InputType, backend, RIT, CIT, NIT > &A,
 		const Monoid &monoid,
 		const typename std::enable_if< !grb::is_object< IOType >::value &&
 			!grb::is_object< InputType >::value &&
@@ -675,12 +678,14 @@ namespace grb {
 		Descriptor descr = descriptors::no_operation,
 		class Monoid,
 		typename InputType, typename IOType, typename MaskType,
+		typename RIT_A, typename CIT_A, typename NIT_A,
+		typename RIT_M, typename CIT_M, typename NIT_M,
 		Backend backend
 	>
 	RC foldl(
 		IOType &x,
-		const Matrix< InputType, backend > &A,
-		const Matrix< MaskType, backend > &mask,
+		const Matrix< InputType, backend, RIT_A, CIT_A, NIT_A > &A,
+		const Matrix< MaskType, backend, RIT_M, CIT_M, NIT_M > &mask,
 		const Monoid &monoid = Monoid(),
 		const typename std::enable_if< !grb::is_object< IOType >::value &&
 			!grb::is_object< InputType >::value &&
@@ -725,11 +730,12 @@ namespace grb {
 		Descriptor descr = descriptors::no_operation,
 		class Monoid,
 		typename InputType, typename IOType,
+		typename RIT, typename CIT, typename NIT,
 		Backend backend
 	>
 	RC foldl(
 		IOType &x,
-		const Matrix< InputType, backend > &A,
+		const Matrix< InputType, backend, RIT, CIT, NIT > &A,
 		const Monoid &monoid,
 		const typename std::enable_if< 
 			!grb::is_object< IOType >::value &&
diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp
index eb8bcf9f3..12a2cb889 100644
--- a/include/graphblas/reference/blas3.hpp
+++ b/include/graphblas/reference/blas3.hpp
@@ -939,11 +939,12 @@ namespace grb {
 		template<
 			Descriptor descr = descriptors::no_operation,
 			class Monoid,
-			typename InputType, typename IOType, typename MaskType
+			typename InputType, typename IOType, typename MaskType,
+			typename RIT, typename CIT, typename NIT
 		>
 		RC fold_unmasked_generic(
 			IOType &x,
-			const Matrix< InputType, reference > &A,
+			const Matrix< InputType, reference, RIT, CIT, NIT > &A,
 			const Monoid &monoid
 		) {
 			_DEBUG_THREADESAFE_PRINT( "In grb::internal::foldr_unmasked_generic( reference )\n" );
@@ -1003,12 +1004,14 @@ namespace grb {
 		template<
 			Descriptor descr = descriptors::no_operation,
 			class Monoid,
-			typename InputType, typename IOType, typename MaskType
+			typename InputType, typename IOType, typename MaskType,
+			typename RIT_A, typename CIT_A, typename NIT_A,
+			typename RIT_M, typename CIT_M, typename NIT_M
 		>
 		RC fold_masked_generic(
 			IOType &x,
-			const Matrix< InputType, reference > &A,
-			const Matrix< MaskType, reference > &mask,
+			const Matrix< InputType, reference, RIT_A, CIT_A, NIT_A > &A,
+			const Matrix< MaskType, reference, RIT_M, CIT_M, NIT_M > &mask,
 			const Monoid &monoid
 		) {
 			_DEBUG_THREADESAFE_PRINT( "In grb::internal::foldr_masked_generic( reference )\n" );
@@ -1040,10 +1043,10 @@ namespace grb {
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
 				config::OMP::localRange( start_row, end_row, 0, m );
 #endif
-				for( size_t i = start_row; i < end_row; ++i ) {
-					size_t mask_k = mask_raw.col_start[ i ];
-					for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
-						const size_t k_col = A_raw.row_index[ k ];
+				for( auto i = start_row; i < end_row; ++i ) {
+					auto mask_k = mask_raw.col_start[ i ];
+					for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
+						auto k_col = A_raw.row_index[ k ];
 
 						// Increment the mask pointer until we find the right column, or an higher one
 						while( mask_raw.row_index[ mask_k ] < k_col && mask_k < mask_raw.col_start[ i + 1 ] ) {
@@ -1504,12 +1507,14 @@ namespace grb {
 	template<
 		Descriptor descr = descriptors::no_operation,
 		class Monoid,
-		typename InputType, typename IOType, typename MaskType
+		typename InputType, typename IOType, typename MaskType,
+		typename RIT_A, typename CIT_A, typename NIT_A,
+		typename RIT_M, typename CIT_M, typename NIT_M
 	>
 	RC foldr(
 		IOType &x,
-		const Matrix< InputType, reference > &A,
-		const Matrix< MaskType, reference > &mask,
+		const Matrix< InputType, reference, RIT_A, CIT_A, NIT_A > &A,
+		const Matrix< MaskType, reference, RIT_M, CIT_M, NIT_M > &mask,
 		const Monoid &monoid = Monoid(),
 		const typename std::enable_if< !grb::is_object< IOType >::value &&
 			!grb::is_object< InputType >::value &&
@@ -1553,11 +1558,12 @@ namespace grb {
 	template<
 		Descriptor descr = descriptors::no_operation,
 		class Monoid,
-		typename InputType, typename IOType
+		typename InputType, typename IOType,
+		typename RIT, typename CIT, typename NIT
 	>
 	RC foldr(
 		IOType &x,
-		const Matrix< InputType, reference > &A,
+		const Matrix< InputType, reference, RIT, CIT, NIT > &A,
 		const Monoid &monoid,
 		const typename std::enable_if< !grb::is_object< IOType >::value &&
 			!grb::is_object< InputType >::value &&
@@ -1600,12 +1606,14 @@ namespace grb {
 	template<
 		Descriptor descr = descriptors::no_operation,
 		class Monoid,
-		typename InputType, typename IOType, typename MaskType
+		typename InputType, typename IOType, typename MaskType,
+		typename RIT_A, typename CIT_A, typename NIT_A,
+		typename RIT_M, typename CIT_M, typename NIT_M
 	>
 	RC foldl(
 		IOType &x,
-		const Matrix< InputType, reference > &A,
-		const Matrix< MaskType, reference > &mask,
+		const Matrix< InputType, reference, RIT_A, CIT_A, NIT_A > &A,
+		const Matrix< MaskType, reference, RIT_M, CIT_M, NIT_M > &mask,
 		const Monoid &monoid,
 		const typename std::enable_if<
 			!grb::is_object< IOType >::value &&
@@ -1650,11 +1658,12 @@ namespace grb {
 	template<
 		Descriptor descr = descriptors::no_operation,
 		class Monoid,
-		typename InputType, typename IOType
+		typename InputType, typename IOType,
+		typename RIT, typename CIT, typename NIT
 	>
 	RC foldl(
 		IOType &x,
-		const Matrix< InputType, reference > &A,
+		const Matrix< InputType, reference, RIT, CIT, NIT > &A,
 		const Monoid &monoid,
 		const typename std::enable_if<
 			!grb::is_object< IOType >::value &&

From 0e59cface1f0516e03ef4e09f4992a697968a042 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Wed, 21 Jun 2023 15:29:51 +0200
Subject: [PATCH 18/63] grb::foldr+l documentation enhancement

---
 include/graphblas/base/blas3.hpp | 62 +++++++++++++++++++++++---------
 1 file changed, 46 insertions(+), 16 deletions(-)

diff --git a/include/graphblas/base/blas3.hpp b/include/graphblas/base/blas3.hpp
index 16a6a1575..e05d54d9f 100644
--- a/include/graphblas/base/blas3.hpp
+++ b/include/graphblas/base/blas3.hpp
@@ -501,17 +501,18 @@ namespace grb {
 	 *
 	 * \parblock
 	 * \par Valid descriptors
-	 * grb::descriptors::no_operation, grb::descriptors::no_casting,
-	 * grb::descriptors::dense, grb::descriptors::invert_mask,
-	 * grb::descriptors::structural, grb::descriptors::structural_complement
+	 * - descriptors::no_operation: the default descriptor.
+	 * - descriptors::no_casting: the first domain of
+	 * 	 	\a monoid must match \a InputType, the second domain of \a op
+	 * 		match \a IOType, the third domain must match \a IOType.
+	 * - descriptors::transpose_left: A^T will be considered instead 
+	 * 	 	of \a A.
+	 * - descriptors::transpose_right: mask^T will be considered 
+	 * 	 	instead of \a mask.
+	 * - descriptors::invert_mask: Not supported yet.
 	 *
 	 * \note Invalid descriptors will be ignored.
 	 *
-	 * If grb::descriptors::no_casting is given, then 1) the first domain of
-	 * \a monoid must match \a IOType, 2) the second domain of \a op must match
-	 * \a InputType, 3) the third domain must match \a IOType, and 4) the element type
-	 * of \a mask must be <tt>bool</tt>. If one of these is not true, the code
-	 * shall not compile.
 	 * \endparblock
 	 *
 	 * \par Performance semantics
@@ -570,6 +571,19 @@ namespace grb {
 	 * @return grb::SUCCESS  When the call completed successfully.
 	 * @return grb::ILLEGAL  If the provided input matrix \a y was not dense, while
 	 *                       #grb::descriptors::dense was given.
+	 * 
+	 * \parblock
+	 * \par Valid descriptors
+	 * - descriptors::no_operation: the default descriptor.
+	 * - descriptors::no_casting: the first domain of
+	 * 	 	\a monoid must match \a InputType, the second domain of \a op
+	 * 		match \a IOType, the third domain must match \a IOType.
+	 * - descriptors::transpose_matrix: A^T will be considered instead 
+	 * 	 	of \a A.
+	 *
+	 * \note Invalid descriptors will be ignored.
+	 *
+	 * \endparblock
 	 */
 	template<
 		Descriptor descr = descriptors::no_operation,
@@ -656,17 +670,19 @@ namespace grb {
 	 *
 	 * \parblock
 	 * \par Valid descriptors
-	 * grb::descriptors::no_operation, grb::descriptors::no_casting,
-	 * grb::descriptors::dense, grb::descriptors::invert_mask,
-	 * grb::descriptors::structural, grb::descriptors::structural_complement
+	 * - descriptors::no_operation: the default descriptor.
+	 * - descriptors::no_casting: the first domain of
+	 * 	 	\a monoid must match \a InputType, the second domain of \a op
+	 * 		match \a IOType, the third domain must match \a IOType, and the
+	 *   	element type of \a mask must be <tt>bool</tt>. 
+	 * - descriptors::transpose_left: A^T will be considered instead 
+	 * 	 	of \a A.
+	 * - descriptors::transpose_right: mask^T will be considered 
+	 * 	 	instead of \a mask.
+	 * - descriptors::invert_mask: Not supported yet.
 	 *
 	 * \note Invalid descriptors will be ignored.
 	 *
-	 * If grb::descriptors::no_casting is given, then 1) the first domain of
-	 * \a monoid must match \a InputType, 2) the second domain of \a op must
-	 * match \a IOType, 3) the third domain must match \a IOType, and 4) the
-	 * element type of \a mask must be <tt>bool</tt>. If one of these is not
-	 * true, the code shall not compile.
 	 * \endparblock
 	 *
 	 * \par Performance semantics
@@ -725,6 +741,20 @@ namespace grb {
 	 * @return grb::SUCCESS  When the call completed successfully.
 	 * @return grb::ILLEGAL  If the provided input matrix \a y was not dense, while
 	 *                       #grb::descriptors::dense was given.
+	 * 
+	 * \parblock
+	 * \par Valid descriptors
+	 * - descriptors::no_operation: the default descriptor.
+	 * - descriptors::no_casting: the first domain of
+	 * 	 	\a monoid must match \a InputType, the second domain of \a op
+	 * 		match \a IOType, the third domain must match \a IOType.
+	 * - descriptors::transpose_matrix: A^T will be considered instead 
+	 * 	 	of \a A.
+	 *
+	 * \note Invalid descriptors will be ignored.
+	 *
+	 * \endparblock
+	 * 
 	 */
 	template<
 		Descriptor descr = descriptors::no_operation,

From 4a0d63527adbaea827f6e255b35995a23b394e2a Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Wed, 21 Jun 2023 15:32:55 +0200
Subject: [PATCH 19/63] Implement transpose descriptors in foldl+r

---
 include/graphblas/reference/blas3.hpp | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp
index 12a2cb889..8d026a14f 100644
--- a/include/graphblas/reference/blas3.hpp
+++ b/include/graphblas/reference/blas3.hpp
@@ -953,7 +953,8 @@ namespace grb {
 			const auto& identity = monoid.template getIdentity< typename Monoid::D3 >();
 			const auto& op = monoid.getOperator();
 
-			const auto &A_raw = internal::getCRS( A );
+			const auto &A_raw = descr & grb::descriptors::transpose_matrix ?
+				internal::getCCS( A ) : internal::getCRS( A );
 			const size_t A_nnz = nnz( A );
 			if( grb::nnz( A ) == 0 ) {
 				x = identity;
@@ -1020,13 +1021,21 @@ namespace grb {
 			const auto& identity = monoid.template getIdentity< typename Monoid::D3 >();
 			const auto& op = monoid.getOperator();
 
-			const auto &A_raw = internal::getCRS( A );
-			const auto &mask_raw = internal::getCRS( mask );
-			const size_t m = nrows( A );
-			const size_t n = ncols( A );
+			const auto &A_raw = descr & grb::descriptors::transpose_left ?
+				internal::getCCS( A ) : internal::getCRS( A );
+			const auto &mask_raw = descr & grb::descriptors::transpose_right ?
+				internal::getCCS( mask ) : internal::getCRS( mask );
+			const size_t m = descr & grb::descriptors::transpose_right ?
+				ncols( A ) : nrows( A );
+			const size_t n = descr & grb::descriptors::transpose_right ?
+				nrows( A ) : ncols( A );
+			const size_t m_mask = descr & grb::descriptors::transpose_left ?
+				ncols( mask ) : nrows( mask );
+			const size_t n_mask = descr & grb::descriptors::transpose_left ?
+				nrows( mask ) : ncols( mask );
 
 			// Check mask dimensions
-			if( m != nrows(mask) || n != ncols(mask) ) {
+			if( m != m_mask || n != n_mask ) {
 				_DEBUG_THREADESAFE_PRINT( "Mask dimensions do not match input matrix dimensions\n" );
 				return MISMATCH;
 			}
@@ -1060,7 +1069,7 @@ namespace grb {
 						}
 
 						// Get mask value
-						if( not MaskHasValue< MaskType >( mask_raw, mask_k ).value ) {
+						if( MaskHasValue< MaskType >( mask_raw, mask_k ).value ) {
 							_DEBUG_THREADESAFE_PRINT( "Skipped masked value at: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
 							continue;
 						}

From 3a83956ba39fd10447a59c171f283fa00829c304 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Wed, 21 Jun 2023 15:47:35 +0200
Subject: [PATCH 20/63] Implementation of grb::foldl+r in hyperdags

---
 include/graphblas/hyperdags/blas3.hpp     | 166 ++++++++++++++++++++++
 include/graphblas/hyperdags/hyperdags.hpp |  17 ++-
 src/graphblas/hyperdags/hyperdags.cpp     |  12 ++
 3 files changed, 192 insertions(+), 3 deletions(-)

diff --git a/include/graphblas/hyperdags/blas3.hpp b/include/graphblas/hyperdags/blas3.hpp
index ee0c10f36..f5af0365d 100644
--- a/include/graphblas/hyperdags/blas3.hpp
+++ b/include/graphblas/hyperdags/blas3.hpp
@@ -332,6 +332,172 @@ namespace grb {
 		return ret;
 	}
 
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Monoid,
+		typename InputType, typename IOType, typename MaskType,
+		typename RIT_A, typename CIT_A, typename NIT_A,
+		typename RIT_M, typename CIT_M, typename NIT_M
+	>
+	RC foldr(
+		IOType &x,
+		const Matrix< InputType, hyperdags, RIT_A, CIT_A, NIT_A > &A,
+		const Matrix< MaskType, hyperdags, RIT_M, CIT_M, NIT_M > &mask,
+		const Monoid &monoid = Monoid(),
+		const typename std::enable_if< !grb::is_object< IOType >::value &&
+			!grb::is_object< InputType >::value &&
+			!grb::is_object< MaskType >::value &&
+			grb::is_monoid< Monoid >::value, void
+		>::type * const = nullptr
+	) {
+#ifdef _DEBUG
+		std::cout << "In grb::foldr (hyperdags, mask, matrix, monoid)\n";
+#endif
+
+		const RC ret = foldr< descr, Monoid >(
+			x, A, mask, monoid
+		);
+		if( ret != SUCCESS ) { return ret; }
+		if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+		std::array< const void *, 0 > sourcesP{};
+		std::array< uintptr_t, 2 > sourcesC{
+			getID( internal::getMatrix(A) ),
+			getID( internal::getMatrix(mask) )
+		};
+		std::array< uintptr_t, 0 > destinations{};
+		// NOTE scalar output is ignored
+		// std::array< uintptr_t, 1 > destinations{ &x };
+		internal::hyperdags::generator.addOperation(
+			internal::hyperdags::FOLDR_SCALAR_MATRIX_MASK_MONOID,
+			sourcesP.begin(), sourcesP.end(),
+			sourcesC.begin(), sourcesC.end(),
+			destinations.begin(), destinations.end()
+		);
+		return ret;
+	}
+
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Monoid,
+		typename InputType, typename IOType,
+		typename RIT, typename CIT, typename NIT
+	>
+	RC foldr(
+		IOType &x,
+		const Matrix< InputType, hyperdags, RIT, CIT, NIT > &A,
+		const Monoid &monoid,
+		const typename std::enable_if< !grb::is_object< IOType >::value &&
+			!grb::is_object< InputType >::value &&
+			grb::is_monoid< Monoid >::value, void
+		>::type * const = nullptr
+	) {
+#ifdef _DEBUG
+		std::cout << "In grb::foldr (hyperdags, matrix, monoid)\n";
+#endif
+
+		const RC ret = foldr< descr, Monoid >(
+			x, A, monoid
+		);
+		if( ret != SUCCESS ) { return ret; }
+		if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+		std::array< const void *, 0 > sourcesP{};
+		std::array< uintptr_t, 1 > sourcesC{ getID( internal::getMatrix(A) ) };
+		std::array< uintptr_t, 0 > destinations{};
+		// NOTE scalar output is ignored
+		// std::array< uintptr_t, 1 > destinations{ &x };
+		internal::hyperdags::generator.addOperation(
+			internal::hyperdags::FOLDR_SCALAR_MATRIX_MONOID,
+			sourcesP.begin(), sourcesP.end(),
+			sourcesC.begin(), sourcesC.end(),
+			destinations.begin(), destinations.end()
+		);
+		return ret;
+	}
+
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Monoid,
+		typename InputType, typename IOType, typename MaskType,
+		typename RIT_A, typename CIT_A, typename NIT_A,
+		typename RIT_M, typename CIT_M, typename NIT_M
+	>
+	RC foldl(
+		IOType &x,
+		const Matrix< InputType, hyperdags, RIT_A, CIT_A, NIT_A > &A,
+		const Matrix< MaskType, hyperdags, RIT_M, CIT_M, NIT_M > &mask,
+		const Monoid &monoid,
+		const typename std::enable_if<
+			!grb::is_object< IOType >::value &&
+			!grb::is_object< InputType >::value &&
+			!grb::is_object< MaskType >::value &&
+			grb::is_monoid< Monoid >::value, void
+		>::type * const = nullptr
+	) {
+		#ifdef _DEBUG
+		std::cout << "In grb::foldl (hyperdags, mask, matrix, monoid)\n";
+#endif
+
+		const RC ret = foldl< descr, Monoid >(
+			x, A, mask, monoid
+		);
+		if( ret != SUCCESS ) { return ret; }
+		if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+		std::array< const void *, 0 > sourcesP{};
+		std::array< uintptr_t, 2 > sourcesC{
+			getID( internal::getMatrix(A) ),
+			getID( internal::getMatrix(mask) )
+		};
+		std::array< uintptr_t, 0 > destinations{};
+		// NOTE scalar output is ignored
+		// std::array< uintptr_t, 1 > destinations{ &x };
+		internal::hyperdags::generator.addOperation(
+			internal::hyperdags::FOLDL_SCALAR_MATRIX_MASK_MONOID,
+			sourcesP.begin(), sourcesP.end(),
+			sourcesC.begin(), sourcesC.end(),
+			destinations.begin(), destinations.end()
+		);
+		return ret;
+	}
+
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Monoid,
+		typename InputType, typename IOType,
+		typename RIT, typename CIT, typename NIT
+	>
+	RC foldl(
+		IOType &x,
+		const Matrix< InputType, hyperdags, RIT, CIT, NIT > &A,
+		const Monoid &monoid,
+		const typename std::enable_if<
+			!grb::is_object< IOType >::value &&
+			!grb::is_object< InputType >::value &&
+			grb::is_monoid< Monoid >::value, void
+		>::type * const = nullptr
+	) {
+#ifdef _DEBUG
+		std::cout << "In grb::foldl (hyperdags, matrix, monoid)\n";
+#endif
+
+		const RC ret = foldl< descr, Monoid >(
+			x, A, monoid
+		);
+		if( ret != SUCCESS ) { return ret; }
+		if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+		std::array< const void *, 0 > sourcesP{};
+		std::array< uintptr_t, 1 > sourcesC{ getID( internal::getMatrix(A) ) };
+		std::array< uintptr_t, 0 > destinations{};
+		// NOTE scalar output is ignored
+		// std::array< uintptr_t, 1 > destinations{ &x };
+		internal::hyperdags::generator.addOperation(
+			internal::hyperdags::FOLDL_SCALAR_MATRIX_MONOID,
+			sourcesP.begin(), sourcesP.end(),
+			sourcesC.begin(), sourcesC.end(),
+			destinations.begin(), destinations.end()
+		);
+		return ret;
+	}
+
 } // end namespace grb
 
 #endif
diff --git a/include/graphblas/hyperdags/hyperdags.hpp b/include/graphblas/hyperdags/hyperdags.hpp
index 4ef0e0059..d032c8e38 100644
--- a/include/graphblas/hyperdags/hyperdags.hpp
+++ b/include/graphblas/hyperdags/hyperdags.hpp
@@ -488,12 +488,19 @@ namespace grb {
 
 				EWISEMUL_VECTOR_VECTOR_ALPHA_BETA_RING,
 
-				EWISELAMBDA_FUNC_VECTOR
+				EWISELAMBDA_FUNC_VECTOR,
 
+				FOLDL_SCALAR_MATRIX_MASK_MONOID,
+
+				FOLDL_SCALAR_MATRIX_MONOID,
+
+				FOLDR_SCALAR_MATRIX_MASK_MONOID,
+
+				FOLDR_SCALAR_MATRIX_MONOID,
 			};
 
 			/** \internal How many operation vertex types exist. */
-			const constexpr size_t numOperationVertexTypes = 106;
+			const constexpr size_t numOperationVertexTypes = 110;
 
 			/** \internal An array of all operation vertex types. */
 			const constexpr enum OperationVertexType
@@ -604,7 +611,11 @@ namespace grb {
 				EWISEMUL_VECTOR_VECTOR_ALPHA_VECTOR_RING,
 				EWISEMUL_VECTOR_VECTOR_VECTOR_BETA_RING,
 				EWISEMUL_VECTOR_VECTOR_ALPHA_BETA_RING,
-				EWISELAMBDA_FUNC_VECTOR
+				EWISELAMBDA_FUNC_VECTOR,
+				FOLDL_SCALAR_MATRIX_MASK_MONOID,
+				FOLDL_SCALAR_MATRIX_MONOID,
+				FOLDR_SCALAR_MATRIX_MASK_MONOID,
+				FOLDR_SCALAR_MATRIX_MONOID,
 			};
 
 			/** \internal @returns The operation vertex type as a string. */
diff --git a/src/graphblas/hyperdags/hyperdags.cpp b/src/graphblas/hyperdags/hyperdags.cpp
index 6000f3af7..90774d0e2 100644
--- a/src/graphblas/hyperdags/hyperdags.cpp
+++ b/src/graphblas/hyperdags/hyperdags.cpp
@@ -380,6 +380,18 @@ std::string grb::internal::hyperdags::toString(
 		case GETID_MATRIX:
 			return "getID( matrix )";
 
+		case FOLDL_SCALAR_MATRIX_MASK_MONOID:
+			return "foldl( scalar, matrix, matrix, monoid )";
+
+		case FOLDL_SCALAR_MATRIX_MONOID:
+			return "foldl( scalar, matrix, monoid )";
+
+		case FOLDR_SCALAR_MATRIX_MASK_MONOID:
+			return "foldr( scalar, matrix, matrix, monoid )";
+
+		case FOLDR_SCALAR_MATRIX_MONOID:
+			return "foldr( scalar, matrix, monoid )";
+
 	}
 	assert( false );
 	return "unknown operation";

From fa7676916144c23e163d441d18a662a0d3d0d026 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Wed, 21 Jun 2023 15:48:02 +0200
Subject: [PATCH 21/63] Restrict foldr+l test to implemented backends

---
 tests/unit/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 2ee3de02e..d4427d2d4 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -122,7 +122,7 @@ add_grb_executables( matrixIterator matrixIterator.cpp
 )
 
 add_grb_executables( fold_matrix_to_scalar fold_matrix_to_scalar.cpp
-	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
+	BACKENDS reference reference_omp hyperdags
 )
 
 add_grb_executables( doubleAssign doubleAssign.cpp

From b5dd0c59fc2e7a9acc459e3cb24a542c0ffac629 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Mon, 26 Jun 2023 22:29:45 +0200
Subject: [PATCH 22/63] Add empty+dense mask test cases

---
 tests/unit/fold_matrix_to_scalar.cpp | 151 ++++++++++++++++++++++-----
 1 file changed, 124 insertions(+), 27 deletions(-)

diff --git a/tests/unit/fold_matrix_to_scalar.cpp b/tests/unit/fold_matrix_to_scalar.cpp
index 08dbe3bed..afe9c29cc 100644
--- a/tests/unit/fold_matrix_to_scalar.cpp
+++ b/tests/unit/fold_matrix_to_scalar.cpp
@@ -41,21 +41,67 @@ using namespace grb;
 
 using NzType = double;
 
-
-constexpr bool PRINT_TIMERS = true;
 constexpr bool SKIP_FOLDL = false;
 constexpr bool SKIP_FOLDR = false;
 constexpr bool SKIP_UNMASKED = false;
 constexpr bool SKIP_MASKED = false;
-constexpr size_t ITERATIONS = 100;
+// Benchmarking
+constexpr bool PRINT_TIMERS = false;
+constexpr size_t ITERATIONS = 1;
+
+//#define _DEBUG
+
+template< class Iterator >
+void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) {
+#ifndef _DEBUG
+	return;
+#endif
+	std::cout << "Matrix \"" << name << "\" (" << rows << "x" << cols << "):" << std::endl << "[" << std::endl;
+	if( rows > 50 || cols > 50 ) {
+		os << "   Matrix too large to print" << std::endl;
+	} else {
+		// os.precision( 3 );
+		for( size_t y = 0; y < rows; y++ ) {
+			os << std::string( 3, ' ' );
+			for( size_t x = 0; x < cols; x++ ) {
+				auto nnz_val = std::find_if( begin, end, [ y, x ]( const typename std::iterator_traits< Iterator >::value_type & a ) {
+					return a.first.first == y && a.first.second == x;
+				} );
+				if( nnz_val != end )
+					os << std::fixed << ( *nnz_val ).second;
+				else
+					os << '_';
+				os << " ";
+			}
+			os << std::endl;
+		}
+	}
+	os << "]" << std::endl;
+	std::flush( os );
+}
+
+template< typename D >
+void printSparseMatrix( const grb::Matrix< D > & mat, const std::string & name = "", std::ostream & os = std::cout ) {
+	grb::wait( mat );
+	printSparseMatrixIterator( grb::nrows( mat ), grb::ncols( mat ), mat.cbegin(), mat.cend(), name, os );
+}
+
 
-template< typename T, typename V, class Monoid >
-RC foldl_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, const grb::Matrix< void > & mask, T initial, T expected, const Monoid & monoid ) {
+template< typename T, typename V, typename M, class Monoid >
+RC foldl_test( const char * test_label,
+	const char * test_description,
+	const grb::Matrix< V > & A,
+	const grb::Matrix< M > & mask,
+	T initial,
+	T expected,
+	const Monoid & monoid,
+	bool skip_masked = false,
+	bool skip_unmasked = false ) {
 	if( SKIP_FOLDL )
 		return RC::SUCCESS;
 	RC rc = RC::SUCCESS;
 
-	if( rc == RC::SUCCESS && ! SKIP_UNMASKED ) { // Unmasked
+	if( not skip_unmasked && rc == RC::SUCCESS && ! SKIP_UNMASKED ) { // Unmasked
 		T value = initial;
 		auto start_chrono = std::chrono::high_resolution_clock::now();
 		for( size_t _ = 0; _ < ITERATIONS; _++ ) {
@@ -80,7 +126,7 @@ RC foldl_test( const char * test_label, const char * test_description, const grb
 		rc = rc ? rc : ( value == expected ? RC::SUCCESS : RC::FAILED );
 	}
 
-	if( rc == RC::SUCCESS && ! SKIP_MASKED ) { // Masked
+	if( not skip_masked && rc == RC::SUCCESS && ! SKIP_MASKED ) { // Masked
 		T value = initial;
 		auto start_chrono = std::chrono::high_resolution_clock::now();
 		for( size_t _ = 0; _ < ITERATIONS; _++ ) {
@@ -108,13 +154,21 @@ RC foldl_test( const char * test_label, const char * test_description, const grb
 	return rc;
 }
 
-template< typename T, typename V, class Monoid >
-RC foldr_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, const grb::Matrix< void > & mask, T initial, T expected, const Monoid & monoid ) {
+template< typename T, typename V, typename M, class Monoid >
+RC foldr_test( const char * test_label,
+	const char * test_description,
+	const grb::Matrix< V > & A,
+	const grb::Matrix< M > & mask,
+	T initial,
+	T expected,
+	const Monoid & monoid,
+	bool skip_masked = false,
+	bool skip_unmasked = false ) {
 	if( SKIP_FOLDR )
 		return RC::SUCCESS;
 	RC rc = RC::SUCCESS;
 
-	if( rc == RC::SUCCESS && ! SKIP_UNMASKED ) { // Unmasked
+	if( not skip_unmasked && rc == RC::SUCCESS && ! SKIP_UNMASKED ) { // Unmasked
 		T value = initial;
 		auto start_chrono = std::chrono::high_resolution_clock::now();
 		for( size_t _ = 0; _ < ITERATIONS; _++ ) {
@@ -139,7 +193,7 @@ RC foldr_test( const char * test_label, const char * test_description, const grb
 		rc = rc ? rc : ( value == expected ? RC::SUCCESS : RC::FAILED );
 	}
 
-	if( rc == RC::SUCCESS && ! SKIP_MASKED ) { // Masked
+	if( not skip_masked && rc == RC::SUCCESS && ! SKIP_MASKED ) { // Masked
 		T value = initial;
 		auto start_chrono = std::chrono::high_resolution_clock::now();
 		for( size_t _ = 0; _ < ITERATIONS; _++ ) {
@@ -167,20 +221,30 @@ RC foldr_test( const char * test_label, const char * test_description, const grb
 	return rc;
 }
 
-template< typename T, typename V, class Monoid >
-RC foldLR_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, const grb::Matrix< void > & mask, T initial, T expected, const Monoid & monoid ) {
-	RC rc = foldl_test( test_label, test_description, A, mask, initial, expected, monoid );
-	return rc ? rc : foldr_test( test_label, test_description, A, mask, initial, expected, monoid );
+template< typename T, typename V, typename M, class Monoid >
+RC foldLR_test( const char * test_label,
+	const char * test_description,
+	const grb::Matrix< V > & A,
+	const grb::Matrix< M > & mask,
+	T initial,
+	T expected,
+	const Monoid & monoid,
+	bool skip_masked = false,
+	bool skip_unmasked = false ) {
+	RC rc = foldl_test( test_label, test_description, A, mask, initial, expected, monoid, skip_masked, skip_unmasked );
+	return rc ? rc : foldr_test( test_label, test_description, A, mask, initial, expected, monoid, skip_masked, skip_unmasked );
 }
 
+template< typename T, typename M >
 struct input {
-	const grb::Matrix< NzType > & A;
-	const grb::Matrix< void > & mask;
+	const grb::Matrix< T > & A;
+	const grb::Matrix< M > & mask;
 };
 
-void grb_program( const input & in, grb::RC & rc ) {
-	const grb::Matrix< NzType > & I = in.A;
-	const grb::Matrix< void > & mask = in.mask;
+template< typename T, typename M >
+void grb_program( const input< T, M > & in, grb::RC & rc ) {
+	const grb::Matrix< T > & I = in.A;
+	const grb::Matrix< M > & mask = in.mask;
 
 	const long n = grb::nnz( I );
 
@@ -193,7 +257,6 @@ void grb_program( const input & in, grb::RC & rc ) {
 	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (NzType)0, (NzType)n, Monoid< operators::add< NzType >, identities::zero >() );
 	if( rc )
 		return;
-	return;
 
 	/**     Test case 2:
 	 *  A simple additive reduction with the same types for the nzs and the reduction result.
@@ -306,6 +369,34 @@ void grb_program( const input & in, grb::RC & rc ) {
 		Monoid< operators::logical_or< NzType, bool, bool >, identities::logical_false >() );
 	if( rc )
 		return;
+
+	/**     Test case 11:
+	 * Reduction with an empty mask.
+	 * * Initial value is 4
+	 * * Expected result: 4
+	 */
+	Matrix< void > empty_mask( grb::nrows( I ), grb::ncols( I ), 0 );
+	rc = foldLR_test( "11", "Reduction with an empty mask.", I, empty_mask, (NzType)4, (NzType)4, Monoid< operators::add< NzType >, identities::zero >(), false, true );
+	if( rc )
+		return;
+
+	/**     Test case 12:
+	 * Reduction with a dense mask.
+	 * * Initial value is 0
+	 * * Expected result: n
+	 */
+	Matrix< bool > dense_mask( grb::nrows( I ), grb::ncols( I ), grb::nrows( I ) * grb::ncols( I ) );
+	std::vector< size_t > rows( grb::nrows( I ) * grb::ncols( I ) ), cols( grb::nrows( I ) * grb::ncols( I ) );
+	for( size_t x = 0; x < grb::nrows( I ); x++ ) {
+		std::fill( rows.begin() + x * grb::ncols( I ), rows.begin() + ( x + 1 ) * grb::ncols( I ), x );
+		std::iota( cols.begin() + x * grb::ncols( I ), cols.begin() + ( x + 1 ) * grb::ncols( I ), 0 );
+	}
+	std::vector< int > dense_mask_vals( grb::nrows( I ) * grb::ncols( I ), 1 );
+	buildMatrixUnique( dense_mask, rows.data(), cols.data(), dense_mask_vals.data(), grb::nrows( I ) * grb::ncols( I ), SEQUENTIAL );
+	printSparseMatrix( dense_mask, "dense_mask" );
+	rc = foldLR_test( "11", "Reduction with a dense mask.", I, dense_mask, (NzType)0, (NzType)n, Monoid< operators::add< NzType >, identities::zero >(), false, true );
+	if( rc )
+		return;
 }
 
 int main( int argc, char ** argv ) {
@@ -341,7 +432,8 @@ int main( int argc, char ** argv ) {
 		Matrix< void > mask( n, n );
 		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
 		std::cout << "-- Running test 01: Identity square matrix of size n = " << n << std::endl;
-		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
+		input< NzType, void > input = { I, mask };
+		if( launcher.exec( &grb_program, input, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 01 FAILED\n";
 			return 255;
 		}
@@ -357,7 +449,8 @@ int main( int argc, char ** argv ) {
 		Matrix< void > mask( n, n );
 		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
 		std::cout << "-- Running test 02: Square matrix of size n = " << n << ", with n 1s on the first row" << std::endl;
-		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
+		input< NzType, void > input = { I, mask };
+		if( launcher.exec( &grb_program, input, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 02 FAILED\n";
 			return 255;
 		}
@@ -373,7 +466,8 @@ int main( int argc, char ** argv ) {
 		Matrix< void > mask( n, n );
 		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
 		std::cout << "-- Running test 03: Square matrix of size n = " << n << ", with n 1s on the first column" << std::endl;
-		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
+		input< NzType, void > input = { I, mask };
+		if( launcher.exec( &grb_program, input, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 03 FAILED\n";
 			return 255;
 		}
@@ -390,7 +484,8 @@ int main( int argc, char ** argv ) {
 		Matrix< void > mask( n, n );
 		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
 		std::cout << "-- Running test 04: Square matrix of size n = " << n << ", with n 1s on the first row and column" << std::endl;
-		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
+		input< NzType, void > input = { I, mask };
+		if( launcher.exec( &grb_program, input, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 04 FAILED\n";
 			return 255;
 		}
@@ -406,7 +501,8 @@ int main( int argc, char ** argv ) {
 		Matrix< void > mask( 1, n );
 		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
 		std::cout << "-- Running test 05: [1-row, n = " << n << " columns] matrix, filled with 1s" << std::endl;
-		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
+		input< NzType, void > input = { I, mask };
+		if( launcher.exec( &grb_program, input, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 04 FAILED\n";
 			return 255;
 		}
@@ -422,7 +518,8 @@ int main( int argc, char ** argv ) {
 		Matrix< void > mask( n, 1 );
 		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
 		std::cout << "-- Running test 06: [n = " << n << " rows, 1 column] matrix, filled with 1s" << std::endl;
-		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
+		input< NzType, void > input = { I, mask };
+		if( launcher.exec( &grb_program, input, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 06 FAILED\n";
 			return 255;
 		}

From 45589eb3c43582e88cab83cf36edc88b4558cf41 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Mon, 26 Jun 2023 22:30:31 +0200
Subject: [PATCH 23/63] Fix in hyperdags

---
 include/graphblas/hyperdags/blas3.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/graphblas/hyperdags/blas3.hpp b/include/graphblas/hyperdags/blas3.hpp
index f5af0365d..ef51992fd 100644
--- a/include/graphblas/hyperdags/blas3.hpp
+++ b/include/graphblas/hyperdags/blas3.hpp
@@ -355,7 +355,7 @@ namespace grb {
 #endif
 
 		const RC ret = foldr< descr, Monoid >(
-			x, A, mask, monoid
+			x, internal::getMatrix( A ), internal::getMatrix( mask ), monoid
 		);
 		if( ret != SUCCESS ) { return ret; }
 		if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
@@ -396,7 +396,7 @@ namespace grb {
 #endif
 
 		const RC ret = foldr< descr, Monoid >(
-			x, A, monoid
+			x, internal::getMatrix( A ), monoid
 		);
 		if( ret != SUCCESS ) { return ret; }
 		if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
@@ -438,7 +438,7 @@ namespace grb {
 #endif
 
 		const RC ret = foldl< descr, Monoid >(
-			x, A, mask, monoid
+			x, internal::getMatrix( A ), internal::getMatrix( mask ), monoid
 		);
 		if( ret != SUCCESS ) { return ret; }
 		if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
@@ -480,7 +480,7 @@ namespace grb {
 #endif
 
 		const RC ret = foldl< descr, Monoid >(
-			x, A, monoid
+			x, internal::getMatrix( A ), monoid
 		);
 		if( ret != SUCCESS ) { return ret; }
 		if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }

From 6972be0c95dbbed1e0b54fb3c9a9d58c90400899 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Mon, 26 Jun 2023 22:30:44 +0200
Subject: [PATCH 24/63] Fix masked iteration pattern

---
 include/graphblas/blas0.hpp           |  4 +--
 include/graphblas/reference/blas3.hpp | 37 ++++++++++++---------------
 2 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/include/graphblas/blas0.hpp b/include/graphblas/blas0.hpp
index 94fed7cdb..dad228bbf 100644
--- a/include/graphblas/blas0.hpp
+++ b/include/graphblas/blas0.hpp
@@ -611,14 +611,14 @@ namespace grb {
 			public:
 				template < Descriptor descr = descriptors::no_operation, typename MaskStruct >
 				MaskHasValue( const MaskStruct& mask_raw, const size_t k ) {
-						bool hasValue = mask_raw.getValue( k, identities::logical_false<MaskType>() );
+						bool hasValue = (bool) mask_raw.values[ k ];
 						if (descr & grb::descriptors::invert_mask) {
 							hasValue = !hasValue;
 						}
 						value = hasValue;
 					}
 
-				const bool value;
+				bool value;
 		};
 
 		template<>
diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp
index 8d026a14f..15608d520 100644
--- a/include/graphblas/reference/blas3.hpp
+++ b/include/graphblas/reference/blas3.hpp
@@ -1018,6 +1018,10 @@ namespace grb {
 			_DEBUG_THREADESAFE_PRINT( "In grb::internal::foldr_masked_generic( reference )\n" );
 			RC rc = SUCCESS;
 
+			if( grb::nnz( mask ) == 0 ) {
+				return rc;
+			}
+
 			const auto& identity = monoid.template getIdentity< typename Monoid::D3 >();
 			const auto& op = monoid.getOperator();
 
@@ -1025,13 +1029,13 @@ namespace grb {
 				internal::getCCS( A ) : internal::getCRS( A );
 			const auto &mask_raw = descr & grb::descriptors::transpose_right ?
 				internal::getCCS( mask ) : internal::getCRS( mask );
-			const size_t m = descr & grb::descriptors::transpose_right ?
+			const size_t m = descr & grb::descriptors::transpose_left ?
 				ncols( A ) : nrows( A );
-			const size_t n = descr & grb::descriptors::transpose_right ?
+			const size_t n = descr & grb::descriptors::transpose_left ?
 				nrows( A ) : ncols( A );
-			const size_t m_mask = descr & grb::descriptors::transpose_left ?
+			const size_t m_mask = descr & grb::descriptors::transpose_right ?
 				ncols( mask ) : nrows( mask );
-			const size_t n_mask = descr & grb::descriptors::transpose_left ?
+			const size_t n_mask = descr & grb::descriptors::transpose_right ?
 				nrows( mask ) : ncols( mask );
 
 			// Check mask dimensions
@@ -1057,26 +1061,19 @@ namespace grb {
 					for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
 						auto k_col = A_raw.row_index[ k ];
 
-						// Increment the mask pointer until we find the right column, or an higher one
-						while( mask_raw.row_index[ mask_k ] < k_col && mask_k < mask_raw.col_start[ i + 1 ] ) {
-							_DEBUG_THREADESAFE_PRINT( "Skipping masked coordinate: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
+						// Increment the mask pointer until we find the right column, or a lower column (since the storage withing a row is sorted in a descending order)
+						while( mask_k < mask_raw.col_start[ i + 1 ] && mask_raw.row_index[ mask_k ] > k_col  ) {
+							_DEBUG_THREADESAFE_PRINT( "NEquals masked coordinate: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
 							mask_k++;
 						}
-						// if there is no value for this coordinate, skip it
-						if( mask_raw.row_index[ mask_k ] != k_col ) {
-							_DEBUG_THREADESAFE_PRINT( "Skipped masked coordinate: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
-							continue;
-						}
-
-						// Get mask value
-						if( MaskHasValue< MaskType >( mask_raw, mask_k ).value ) {
-							_DEBUG_THREADESAFE_PRINT( "Skipped masked value at: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
+							
+						if( mask_raw.row_index[ mask_k ] < k_col || not MaskHasValue< MaskType >( mask_raw, mask_k ).value ) {
+							mask_k++;
+							_DEBUG_THREADESAFE_PRINT( "Skip masked value at: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
 							continue;
 						}
 
-						// Increment the mask pointer in order to skip the next while loop (best case)
-						mask_k++;
-
+						_DEBUG_THREADESAFE_PRINT( "Found masked value at: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
 						// Get A value
 						const InputType a_val = A_raw.getValue( k, identity );
 						_DEBUG_THREADESAFE_PRINT( "A( " + std::to_string( i ) + ";" + std::to_string( k_col ) + " ) = " + std::to_string( a_val ) + "\n" );
@@ -1084,7 +1081,7 @@ namespace grb {
 						// Compute the fold for this coordinate
 						auto local_x_before = local_x;
 						local_rc = local_rc ? local_rc : grb::apply< descr >( local_x, local_x_before, a_val, op );
-						_DEBUG_THREADESAFE_PRINT( "Computing: local_x = op(" + std::to_string( a_val ) + ", " + std::to_string( local_x_before ) + ") = " + std::to_string( local_x ) + "\n" );
+						_DEBUG_THREADESAFE_PRINT( "Computing: local_x = op(" + std::to_string( a_val ) + ", " + std::to_string( local_x_before ) + ") = " + std::to_string( local_x ) + "\n" );	
 					}
 				}
 

From 765d3e3240cf59a3e3a6750ec7ede15a5ab66bc1 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Tue, 27 Jun 2023 10:36:51 +0200
Subject: [PATCH 25/63] Add more test cases for masked variant

---
 tests/unit/fold_matrix_to_scalar.cpp | 247 ++++++++++++++++++---------
 1 file changed, 168 insertions(+), 79 deletions(-)

diff --git a/tests/unit/fold_matrix_to_scalar.cpp b/tests/unit/fold_matrix_to_scalar.cpp
index afe9c29cc..1a2bd9e1a 100644
--- a/tests/unit/fold_matrix_to_scalar.cpp
+++ b/tests/unit/fold_matrix_to_scalar.cpp
@@ -49,7 +49,7 @@ constexpr bool SKIP_MASKED = false;
 constexpr bool PRINT_TIMERS = false;
 constexpr size_t ITERATIONS = 1;
 
-//#define _DEBUG
+// #define _DEBUG
 
 template< class Iterator >
 void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) {
@@ -86,7 +86,6 @@ void printSparseMatrix( const grb::Matrix< D > & mat, const std::string & name =
 	printSparseMatrixIterator( grb::nrows( mat ), grb::ncols( mat ), mat.cbegin(), mat.cend(), name, os );
 }
 
-
 template< typename T, typename V, typename M, class Monoid >
 RC foldl_test( const char * test_label,
 	const char * test_description,
@@ -254,149 +253,239 @@ void grb_program( const input< T, M > & in, grb::RC & rc ) {
 	 *  * Expected unmasked result: n
 	 *  * Expected masked result: 0
 	 */
-	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (NzType)0, (NzType)n, Monoid< operators::add< NzType >, identities::zero >() );
-	if( rc )
-		return;
+	{
+		rc = foldLR_test(
+			"1", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (NzType)0, (NzType)n, Monoid< operators::add< NzType >, identities::zero >() );
+		if( rc )
+			return;
+	}
 
 	/**     Test case 2:
 	 *  A simple additive reduction with the same types for the nzs and the reduction result.
 	 *  * Initial value is n
 	 *  * Expected result: 2*n
 	 */
-	rc = foldLR_test(
-		"2", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (NzType)n, (NzType)( 2 * n ), Monoid< operators::add< NzType >, identities::zero >() );
-	if( rc )
-		return;
+	{
+		rc = foldLR_test(
+			"2", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (NzType)n, (NzType)( 2 * n ), Monoid< operators::add< NzType >, identities::zero >() );
+		if( rc )
+			return;
+	}
 
 	/**     Test case 3:
 	 *  A simple additive reduction with different types for the nzs and the reduction result (int <- int + NzType).
 	 *  * Initial value is 0
 	 *  * Expected result: n
 	 */
-	rc = foldl_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int + NzType).", I, mask, (int)0, (int)n,
-		Monoid< operators::add< int, NzType, int >, identities::zero >() );
-	if( rc )
-		return;
-	rc = foldr_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- NzType + int).", I, mask, (int)0, (int)n,
-		Monoid< operators::add< NzType, int, int >, identities::zero >() );
-	if( rc )
-		return;
+	{
+		rc = foldl_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int + NzType).", I, mask, (int)0, (int)n,
+			Monoid< operators::add< int, NzType, int >, identities::zero >() );
+		if( rc )
+			return;
+		rc = foldr_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- NzType + int).", I, mask, (int)0, (int)n,
+			Monoid< operators::add< NzType, int, int >, identities::zero >() );
+		if( rc )
+			return;
+	}
 
 	/**     Test case 4:
 	 *  A simple additive reduction with different types for the nzs and the reduction result (int <- int + NzType).
 	 *  * Initial value is n
 	 *  * Expected result: 2*n
 	 */
-	rc = foldl_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int + NzType).", I, mask, (int)n, (int)( 2 * n ),
-		Monoid< operators::add< int, NzType, int >, identities::zero >() );
-	if( rc )
-		return;
-	rc = foldr_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- NzType + int).", I, mask, (int)n, (int)( 2 * n ),
-		Monoid< operators::add< NzType, int, int >, identities::zero >() );
-	if( rc )
-		return;
+	{
+		rc = foldl_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int + NzType).", I, mask, (int)n, (int)( 2 * n ),
+			Monoid< operators::add< int, NzType, int >, identities::zero >() );
+		if( rc )
+			return;
+		rc = foldr_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- NzType + int).", I, mask, (int)n, (int)( 2 * n ),
+			Monoid< operators::add< NzType, int, int >, identities::zero >() );
+		if( rc )
+			return;
+	}
 
 	/**     Test case 5:
 	 * A simple multiplicative reduction with the same types for the nzs and the reduction result.
 	 * * Initial value is 0
 	 * * Expected result: 0
 	 */
-	rc = foldLR_test( "5", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, mask, (NzType)0, (NzType)0, Monoid< operators::mul< NzType >, identities::one >() );
-	if( rc )
-		return;
+	{
+		rc = foldLR_test(
+			"5", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, mask, (NzType)0, (NzType)0, Monoid< operators::mul< NzType >, identities::one >() );
+		if( rc )
+			return;
+	}
 
 	/**     Test case 6:
 	 * A simple multiplicative reduction with the same types for the nzs and the reduction result.
 	 * * Initial value is 1
 	 * * Expected result: 1
 	 */
-	rc = foldLR_test( "6", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, mask, (NzType)1, (NzType)1, Monoid< operators::mul< NzType >, identities::one >() );
-	if( rc )
-		return;
+	{
+		rc = foldLR_test(
+			"6", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, mask, (NzType)1, (NzType)1, Monoid< operators::mul< NzType >, identities::one >() );
+		if( rc )
+			return;
+	}
 
 	/**     Test case 7:
 	 * A simple multiplicative reduction with different types for the nzs and the reduction result (size_t <- size_t * NzType).
 	 * * Initial value is 0
 	 * * Expected result: 0
 	 */
-	rc = foldl_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)0, (size_t)0,
-		Monoid< operators::mul< size_t, NzType, size_t >, identities::one >() );
-	if( rc )
-		return;
-	rc = foldr_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)0, (size_t)0,
-		Monoid< operators::mul< NzType, size_t, size_t >, identities::one >() );
-	if( rc )
-		return;
+	{
+		rc = foldl_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)0, (size_t)0,
+			Monoid< operators::mul< size_t, NzType, size_t >, identities::one >() );
+		if( rc )
+			return;
+		rc = foldr_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)0, (size_t)0,
+			Monoid< operators::mul< NzType, size_t, size_t >, identities::one >() );
+		if( rc )
+			return;
+	}
 
 	/**     Test case 8:
 	 * A simple multiplicative reduction with different types for the nzs and the reduction result (size_t <- size_t * NzType).
 	 * * Initial value is 1
 	 * * Expected result: 1
 	 */
-	rc = foldl_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)1, (size_t)1,
-		Monoid< operators::mul< size_t, NzType, size_t >, identities::one >() );
-	if( rc )
-		return;
-	rc = foldr_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)1, (size_t)1,
-		Monoid< operators::mul< NzType, size_t, size_t >, identities::one >() );
-	if( rc )
-		return;
+	{
+		rc = foldl_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)1, (size_t)1,
+			Monoid< operators::mul< size_t, NzType, size_t >, identities::one >() );
+		if( rc )
+			return;
+		rc = foldr_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)1, (size_t)1,
+			Monoid< operators::mul< NzType, size_t, size_t >, identities::one >() );
+		if( rc )
+			return;
+	}
 
 	/**     Test case 9:
 	 * A simple binary equal reduction with different types for the nzs and the reduction result (bool <- bool == NzType).
 	 * * Initial value is true
 	 * * Expected result: true
 	 */
-	rc = foldl_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == NzType).", I, mask, (bool)true, (bool)true,
-		Monoid< operators::equal< bool, NzType, bool >, identities::logical_true >() );
-	if( rc )
-		return;
-	rc = foldr_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == NzType).", I, mask, (bool)true, (bool)true,
-		Monoid< operators::equal< NzType, bool, bool >, identities::logical_true >() );
-	if( rc )
-		return;
+	{
+		rc = foldl_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == NzType).", I, mask, (bool)true, (bool)true,
+			Monoid< operators::equal< bool, NzType, bool >, identities::logical_true >() );
+		if( rc )
+			return;
+		rc = foldr_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == NzType).", I, mask, (bool)true, (bool)true,
+			Monoid< operators::equal< NzType, bool, bool >, identities::logical_true >() );
+		if( rc )
+			return;
+	}
 
 	/**     Test case 10:
 	 * A simple binary logical_or reduction with different types for the nzs and the reduction result (bool <- bool || NzType).
 	 * * Initial value is false
 	 * * Expected result: true
 	 */
-	rc = foldl_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || NzType).", I, mask, (bool)false, (bool)true,
-		Monoid< operators::logical_or< bool, NzType, bool >, identities::logical_false >() );
-	if( rc )
-		return;
-	rc = foldr_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || NzType).", I, mask, (bool)false, (bool)true,
-		Monoid< operators::logical_or< NzType, bool, bool >, identities::logical_false >() );
-	if( rc )
-		return;
+	{
+		rc = foldl_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || NzType).", I, mask, (bool)false, (bool)true,
+			Monoid< operators::logical_or< bool, NzType, bool >, identities::logical_false >() );
+		if( rc )
+			return;
+		rc = foldr_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || NzType).", I, mask, (bool)false, (bool)true,
+			Monoid< operators::logical_or< NzType, bool, bool >, identities::logical_false >() );
+		if( rc )
+			return;
+	}
 
 	/**     Test case 11:
 	 * Reduction with an empty mask.
 	 * * Initial value is 4
 	 * * Expected result: 4
 	 */
-	Matrix< void > empty_mask( grb::nrows( I ), grb::ncols( I ), 0 );
-	rc = foldLR_test( "11", "Reduction with an empty mask.", I, empty_mask, (NzType)4, (NzType)4, Monoid< operators::add< NzType >, identities::zero >(), false, true );
-	if( rc )
-		return;
+	{
+		Matrix< void > empty_mask( grb::nrows( I ), grb::ncols( I ), 0 );
+		rc = foldLR_test( "11", "Reduction with an empty mask.", I, empty_mask, (NzType)4, (NzType)4, Monoid< operators::add< NzType >, identities::zero >(), false, true );
+		if( rc )
+			return;
+	}
 
 	/**     Test case 12:
-	 * Reduction with a dense mask.
+	 * Reduction with a dense void mask.
+	 * * Initial value is 0
+	 * * Expected result: n
+	 */
+	{
+		Matrix< void > dense_mask( grb::nrows( I ), grb::ncols( I ), grb::nrows( I ) * grb::ncols( I ) );
+		std::vector< size_t > rows( grb::nrows( I ) * grb::ncols( I ) ), cols( grb::nrows( I ) * grb::ncols( I ) );
+		for( size_t x = 0; x < grb::nrows( I ); x++ ) {
+			std::fill( rows.begin() + x * grb::ncols( I ), rows.begin() + ( x + 1 ) * grb::ncols( I ), x );
+			std::iota( cols.begin() + x * grb::ncols( I ), cols.begin() + ( x + 1 ) * grb::ncols( I ), 0 );
+		}
+		buildMatrixUnique( dense_mask, rows.data(), cols.data(), grb::nrows( I ) * grb::ncols( I ), SEQUENTIAL );
+		rc = foldLR_test( "12", "Reduction with a dense void mask.", I, dense_mask, (NzType)0, (NzType)n, Monoid< operators::add< NzType >, identities::zero >(), false, true );
+		if( rc )
+			return;
+	}
+
+	/**     Test case 13:
+	 * Reduction with a dense int mask.
 	 * * Initial value is 0
 	 * * Expected result: n
 	 */
-	Matrix< bool > dense_mask( grb::nrows( I ), grb::ncols( I ), grb::nrows( I ) * grb::ncols( I ) );
-	std::vector< size_t > rows( grb::nrows( I ) * grb::ncols( I ) ), cols( grb::nrows( I ) * grb::ncols( I ) );
-	for( size_t x = 0; x < grb::nrows( I ); x++ ) {
-		std::fill( rows.begin() + x * grb::ncols( I ), rows.begin() + ( x + 1 ) * grb::ncols( I ), x );
-		std::iota( cols.begin() + x * grb::ncols( I ), cols.begin() + ( x + 1 ) * grb::ncols( I ), 0 );
+	{
+		Matrix< int > dense_mask( grb::nrows( I ), grb::ncols( I ), grb::nrows( I ) * grb::ncols( I ) );
+		std::vector< size_t > rows( grb::nrows( I ) * grb::ncols( I ) ), cols( grb::nrows( I ) * grb::ncols( I ) );
+		for( size_t x = 0; x < grb::nrows( I ); x++ ) {
+			std::fill( rows.begin() + x * grb::ncols( I ), rows.begin() + ( x + 1 ) * grb::ncols( I ), x );
+			std::iota( cols.begin() + x * grb::ncols( I ), cols.begin() + ( x + 1 ) * grb::ncols( I ), 0 );
+		}
+		std::vector< int > vals( grb::nrows( I ) * grb::ncols( I ), 1 );
+		buildMatrixUnique( dense_mask, rows.data(), cols.data(), vals.data(), vals.size(), SEQUENTIAL );
+		rc = foldLR_test( "13", "Reduction with a dense int mask.", I, dense_mask, (NzType)0, (NzType)n, Monoid< operators::add< NzType >, identities::zero >(), false, true );
+		if( rc )
+			return;
+	}
+
+	/**     Test case 14:
+	 * Reduction with a dense int mask, full of zero, except for the first nz.
+	 * * Initial value is 0
+	 * * Expected result: 1
+	 */
+	{
+		Matrix< int > dense_mask( grb::nrows( I ), grb::ncols( I ), grb::nrows( I ) * grb::ncols( I ) );
+		std::vector< size_t > rows( grb::nrows( I ) * grb::ncols( I ) ), cols( grb::nrows( I ) * grb::ncols( I ) );
+		for( size_t x = 0; x < grb::nrows( I ); x++ ) {
+			std::fill( rows.begin() + x * grb::ncols( I ), rows.begin() + ( x + 1 ) * grb::ncols( I ), x );
+			std::iota( cols.begin() + x * grb::ncols( I ), cols.begin() + ( x + 1 ) * grb::ncols( I ), 0 );
+		}
+		std::vector< int > vals( grb::nrows( I ) * grb::ncols( I ), 0 );
+		for( const auto e : I ) {
+			vals[ e.first.first * grb::ncols( I ) + e.first.second ] = 1;
+			break;
+		}
+		buildMatrixUnique( dense_mask, rows.data(), cols.data(), vals.data(), vals.size(), SEQUENTIAL );
+		rc = foldLR_test( "14", "Reduction with a dense int mask, matching only the first nz.", I, dense_mask, (NzType)0, (NzType)1, Monoid< operators::add< NzType >, identities::zero >(), false, true );
+		if( rc )
+			return;
+	}
+
+	/**     Test case 15:
+	 * Reduction with a dense int mask, full of zero, except for the last nz.
+	 * * Initial value is 0
+	 * * Expected result: 1
+	 */
+	{
+		Matrix< int > dense_mask( grb::nrows( I ), grb::ncols( I ), grb::nrows( I ) * grb::ncols( I ) );
+		std::vector< size_t > rows( grb::nrows( I ) * grb::ncols( I ) ), cols( grb::nrows( I ) * grb::ncols( I ) );
+		for( size_t x = 0; x < grb::nrows( I ); x++ ) {
+			std::fill( rows.begin() + x * grb::ncols( I ), rows.begin() + ( x + 1 ) * grb::ncols( I ), x );
+			std::iota( cols.begin() + x * grb::ncols( I ), cols.begin() + ( x + 1 ) * grb::ncols( I ), 0 );
+		}
+		std::vector< int > vals( grb::nrows( I ) * grb::ncols( I ), 0 );
+		size_t previous_idx = 0;
+		for( const auto e : I ) 
+			previous_idx = e.first.first * grb::ncols( I ) + e.first.second;
+		vals[ previous_idx ] = 1;
+		buildMatrixUnique( dense_mask, rows.data(), cols.data(), vals.data(), vals.size(), SEQUENTIAL );
+		rc = foldLR_test( "15", "Reduction with a dense int mask, matching only the last nz.", I, dense_mask, (NzType)0, (NzType)1, Monoid< operators::add< NzType >, identities::zero >(), false, true );
+		if( rc )
+			return;
 	}
-	std::vector< int > dense_mask_vals( grb::nrows( I ) * grb::ncols( I ), 1 );
-	buildMatrixUnique( dense_mask, rows.data(), cols.data(), dense_mask_vals.data(), grb::nrows( I ) * grb::ncols( I ), SEQUENTIAL );
-	printSparseMatrix( dense_mask, "dense_mask" );
-	rc = foldLR_test( "11", "Reduction with a dense mask.", I, dense_mask, (NzType)0, (NzType)n, Monoid< operators::add< NzType >, identities::zero >(), false, true );
-	if( rc )
-		return;
 }
 
 int main( int argc, char ** argv ) {

From 7253d934f84b2956c858dbd5332592dea03711f5 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Tue, 27 Jun 2023 11:33:27 +0200
Subject: [PATCH 26/63] Cleaning

---
 include/graphblas/reference/blas3.hpp | 91 ++++++++++++++++-----------
 1 file changed, 55 insertions(+), 36 deletions(-)

diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp
index 15608d520..50cdea4ef 100644
--- a/include/graphblas/reference/blas3.hpp
+++ b/include/graphblas/reference/blas3.hpp
@@ -950,16 +950,26 @@ namespace grb {
 			_DEBUG_THREADESAFE_PRINT( "In grb::internal::foldr_unmasked_generic( reference )\n" );
 			RC rc = SUCCESS;
 
+			if( grb::nnz( A ) == 0 ) {
+				return rc;
+			}
+
+			if ( descr & descriptors::force_row_major && descr & descriptors::transpose_left ) {
+				_DEBUG_THREADESAFE_PRINT( "Masked fold with force_row_major and transpose_left is not supported\n" );
+				return RC::ILLEGAL;
+			}
+			if ( descr & descriptors::force_row_major && descr & descriptors::transpose_matrix ) {
+				_DEBUG_THREADESAFE_PRINT( "Masked fold with force_row_major and transpose_matrix is not supported\n" );
+				return RC::ILLEGAL;
+			}
+
 			const auto& identity = monoid.template getIdentity< typename Monoid::D3 >();
 			const auto& op = monoid.getOperator();
 
-			const auto &A_raw = descr & grb::descriptors::transpose_matrix ?
+			const auto &A_raw = (descr & grb::descriptors::transpose_matrix || descr & grb::descriptors::transpose_left ) ?
 				internal::getCCS( A ) : internal::getCRS( A );
 			const size_t A_nnz = nnz( A );
-			if( grb::nnz( A ) == 0 ) {
-				x = identity;
-				return rc;
-			}
+
 
 			RC local_rc = rc;
 			auto local_x = identity;
@@ -973,18 +983,18 @@ namespace grb {
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
 				config::OMP::localRange( start, end, 0, A_nnz );
 #endif
-				
+
 				for( size_t idx = start; idx < end; ++idx ) {
 					// Get A value
 					const InputType a_val = A_raw.values[ idx ];
 					_DEBUG_THREADESAFE_PRINT( "A.CRS.values[ " + std::to_string( idx ) + " ] = " + std::to_string( a_val ) + "\n" );
-				
+
 					// Compute the fold for this coordinate
 					auto local_x_before = local_x;
 					local_rc = local_rc ? local_rc : grb::apply< descr >( local_x, local_x_before, a_val, op );
 					_DEBUG_THREADESAFE_PRINT( "Computing: local_x = op(" + std::to_string( a_val ) + ", " + std::to_string( local_x_before ) + ") = " + std::to_string( local_x ) + "\n" );
 				}
-				
+
 
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
 	#pragma omp critical
@@ -1018,10 +1028,19 @@ namespace grb {
 			_DEBUG_THREADESAFE_PRINT( "In grb::internal::foldr_masked_generic( reference )\n" );
 			RC rc = SUCCESS;
 
-			if( grb::nnz( mask ) == 0 ) {
+			if( grb::nnz( mask ) == 0 || grb::nnz( A ) == 0 ) {
 				return rc;
 			}
 
+			if ( descr & descriptors::force_row_major && descr & descriptors::transpose_left ) {
+				_DEBUG_THREADESAFE_PRINT( "Masked fold with force_row_major and transpose_left is not supported\n" );
+				return RC::ILLEGAL;
+			}
+			if ( descr & descriptors::force_row_major && descr & descriptors::transpose_right ) {
+				_DEBUG_THREADESAFE_PRINT( "Masked fold with force_row_major and transpose_right is not supported\n" );
+				return RC::ILLEGAL;
+			}
+
 			const auto& identity = monoid.template getIdentity< typename Monoid::D3 >();
 			const auto& op = monoid.getOperator();
 
@@ -1066,7 +1085,7 @@ namespace grb {
 							_DEBUG_THREADESAFE_PRINT( "NEquals masked coordinate: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
 							mask_k++;
 						}
-							
+
 						if( mask_raw.row_index[ mask_k ] < k_col || not MaskHasValue< MaskType >( mask_raw, mask_k ).value ) {
 							mask_k++;
 							_DEBUG_THREADESAFE_PRINT( "Skip masked value at: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
@@ -1077,11 +1096,11 @@ namespace grb {
 						// Get A value
 						const InputType a_val = A_raw.getValue( k, identity );
 						_DEBUG_THREADESAFE_PRINT( "A( " + std::to_string( i ) + ";" + std::to_string( k_col ) + " ) = " + std::to_string( a_val ) + "\n" );
-						
+
 						// Compute the fold for this coordinate
 						auto local_x_before = local_x;
 						local_rc = local_rc ? local_rc : grb::apply< descr >( local_x, local_x_before, a_val, op );
-						_DEBUG_THREADESAFE_PRINT( "Computing: local_x = op(" + std::to_string( a_val ) + ", " + std::to_string( local_x_before ) + ") = " + std::to_string( local_x ) + "\n" );	
+						_DEBUG_THREADESAFE_PRINT( "Computing: local_x = op(" + std::to_string( a_val ) + ", " + std::to_string( local_x_before ) + ") = " + std::to_string( local_x ) + "\n" );
 					}
 				}
 
@@ -1530,30 +1549,30 @@ namespace grb {
 	) {
 		// static checks
 		static_assert( !std::is_same< InputType, void >::value,
-			"grb::foldr ( reference, IOType <- op( InputType, IOType ): "
+			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
 			"the operator version of foldr cannot be used if the "
 			"input matrix is a pattern matrix (of type void)"
 		);
 		static_assert( !std::is_same< IOType, void >::value,
-			"grb::foldr ( reference, IOType <- op( InputType, IOType ): "
+			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
 			"the operator version of foldr cannot be used if the "
 			"result is of type void"
 		);
 		static_assert( (std::is_same< typename Monoid::D1, InputType >::value),
-			"grb::foldr ( reference, IOType <- op( InputType, IOType ): "
+			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
 			"called with a prefactor input type that does not match the first domain of the given operator"
 		);
 		static_assert( (std::is_same< typename Monoid::D2, IOType >::value),
-			"grb::foldr ( reference, IOType <- op( InputType, IOType ): "
+			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
 			"called with a postfactor input type that does not match the first domain of the given operator"
 		);
 		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
-			"grb::foldr ( reference, IOType <- op( InputType, IOType ): "
+			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
 			"called with an output type that does not match the output domain of the given operator"
 		);
 
 #ifdef _DEBUG
-		std::cout << "In grb::foldr (reference,  mask, matrix, monoid)\n";
+		std::cout << "In grb::foldr( reference, mask, matrix, monoid )\n";
 #endif
 
 		return internal::fold_masked_generic< descr, Monoid, InputType, IOType, MaskType >(
@@ -1578,30 +1597,30 @@ namespace grb {
 	) {
 		// static checks
 		static_assert( !std::is_same< InputType, void >::value,
-			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
+			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
 			"the operator version of foldr cannot be used if the "
 			"input matrix is a pattern matrix (of type void)"
 		);
 		static_assert( !std::is_same< IOType, void >::value,
-			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
+			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
 			"the operator version of foldr cannot be used if the "
 			"result is of type void"
 		);
 		static_assert( (std::is_same< typename Monoid::D1, InputType >::value),
-			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
+			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
 			"called with a prefactor input type that does not match the first domain of the given operator"
 		);
 		static_assert( (std::is_same< typename Monoid::D2, IOType >::value),
-			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
+			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
 			"called with a postfactor input type that does not match the first domain of the given operator"
 		);
 		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
-			"grb::foldr ( reference, IOType <- op( IOType, InputType ): "
+			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
 			"called with an output type that does not match the output domain of the given operator"
 		);
 
 #ifdef _DEBUG
-		std::cout << "In grb::foldr (reference, matrix, op)\n";
+		std::cout << "In grb::foldr( reference, matrix, monoid )\n";
 #endif
 
 		return internal::fold_unmasked_generic< descr, Monoid, InputType, IOType, void >(
@@ -1630,30 +1649,30 @@ namespace grb {
 	) {
 		// static checks
 		static_assert( !std::is_same< InputType, void >::value,
-			"grb::foldl ( reference, IOType <- op( IOType, InputType ): "
+			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
 			"the operator version of foldl cannot be used if the "
 			"input matrix is a pattern matrix (of type void)"
 		);
 		static_assert( !std::is_same< IOType, void >::value,
-			"grb::foldl ( reference, IOType <- op( IOType, InputType ): "
+			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
 			"the operator version of foldl cannot be used if the "
 			"result is of type void"
 		);
 		static_assert( (std::is_same< typename Monoid::D1, IOType >::value),
-			"grb::foldl ( reference, IOType <- op( IOType, InputType ): "
+			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
 			"called with a prefactor input type that does not match the first domain of the given operator"
 		);
 		static_assert( (std::is_same< typename Monoid::D2, InputType >::value),
-			"grb::foldl ( reference, IOType <- op( IOType, InputType ): "
+			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
 			"called with a postfactor input type that does not match the first domain of the given operator"
 		);
 		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
-			"grb::foldl ( reference, IOType <- op( IOType, InputType ): "
+			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
 			"called with an output type that does not match the output domain of the given operator"
 		);
 
 #ifdef _DEBUG
-		std::cout << "In grb::foldl (reference, mask, matrix, monoid)\n";
+		std::cout << "In grb::foldl( reference, mask, matrix, monoid )\n";
 #endif
 
 		return internal::fold_masked_generic< descr, Monoid, InputType, IOType, MaskType >(
@@ -1679,30 +1698,30 @@ namespace grb {
 	) {
 		// static checks
 		static_assert( !std::is_same< InputType, void >::value,
-			"grb::foldl ( reference, IOType <- op( IOType, InputType ): "
+			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
 			"the operator version of foldl cannot be used if the "
 			"input matrix is a pattern matrix (of type void)"
 		);
 		static_assert( !std::is_same< IOType, void >::value,
-			"grb::foldl ( reference, IOType <- op( IOType, InputType ): "
+			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
 			"the operator version of foldl cannot be used if the "
 			"result is of type void"
 		);
 		static_assert( (std::is_same< typename Monoid::D1, IOType >::value),
-			"grb::foldl ( reference, IOType <- op( IOType, InputType ): "
+			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
 			"called with a prefactor input type that does not match the first domain of the given operator"
 		);
 		static_assert( (std::is_same< typename Monoid::D2, InputType >::value),
-			"grb::foldl ( reference, IOType <- op( IOType, InputType ): "
+			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
 			"called with a postfactor input type that does not match the first domain of the given operator"
 		);
 		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
-			"grb::foldl ( reference, IOType <- op( IOType, InputType ): "
+			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
 			"called with an output type that does not match the output domain of the given operator"
 		);
 
 #ifdef _DEBUG
-		std::cout << "In grb::foldl (reference, matrix, monoid)\n";
+		std::cout << "In grb::foldl( reference, matrix, monoid )\n";
 #endif
 
 		return internal::fold_unmasked_generic< descr, Monoid, InputType, IOType, void >(

From 841aeec31997dd24af9be501dcb947549a8ff224 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Tue, 27 Jun 2023 11:36:17 +0200
Subject: [PATCH 27/63] Implementation for BSP1D backend

---
 include/graphblas/bsp1d/blas3.hpp    | 272 +++++++++++++++++++++++++++
 tests/unit/CMakeLists.txt            |   2 +-
 tests/unit/fold_matrix_to_scalar.cpp |  35 ++--
 3 files changed, 292 insertions(+), 17 deletions(-)

diff --git a/include/graphblas/bsp1d/blas3.hpp b/include/graphblas/bsp1d/blas3.hpp
index 386beb164..13e144f38 100644
--- a/include/graphblas/bsp1d/blas3.hpp
+++ b/include/graphblas/bsp1d/blas3.hpp
@@ -205,6 +205,278 @@ namespace grb {
 		return internal::checkGlobalErrorStateOrClear( C, ret );
 	}
 
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Monoid,
+		typename InputType, typename IOType, typename MaskType,
+		typename RIT_A, typename CIT_A, typename NIT_A,
+		typename RIT_M, typename CIT_M, typename NIT_M
+	>
+	RC foldr(
+		IOType &x,
+		const Matrix< InputType, BSP1D, RIT_A, CIT_A, NIT_A > &A,
+		const Matrix< MaskType, BSP1D, RIT_M, CIT_M, NIT_M > &mask,
+		const Monoid &monoid = Monoid(),
+		const typename std::enable_if< !grb::is_object< IOType >::value &&
+			!grb::is_object< InputType >::value &&
+			!grb::is_object< MaskType >::value &&
+			grb::is_monoid< Monoid >::value, void
+		>::type * const = nullptr
+	) {
+		// static checks
+		static_assert( !std::is_same< InputType, void >::value,
+			"grb::foldr( BSP1D, IOType <- op( InputType, IOType ): "
+			"the operator version of foldr cannot be used if the "
+			"input matrix is a pattern matrix (of type void)"
+		);
+		static_assert( !std::is_same< IOType, void >::value,
+			"grb::foldr( BSP1D, IOType <- op( InputType, IOType ): "
+			"the operator version of foldr cannot be used if the "
+			"result is of type void"
+		);
+		static_assert( (std::is_same< typename Monoid::D1, InputType >::value),
+			"grb::foldr( BSP1D, IOType <- op( InputType, IOType ): "
+			"called with a prefactor input type that does not match the first domain of the given operator"
+		);
+		static_assert( (std::is_same< typename Monoid::D2, IOType >::value),
+			"grb::foldr( BSP1D, IOType <- op( InputType, IOType ): "
+			"called with a postfactor input type that does not match the first domain of the given operator"
+		);
+		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
+			"grb::foldr( BSP1D, IOType <- op( InputType, IOType ): "
+			"called with an output type that does not match the output domain of the given operator"
+		);
+
+#ifdef _DEBUG
+		std::cout << "In grb::foldr( BSP1D, matrix, mask, monoid )\n";
+#endif
+		RC rc = SUCCESS;
+
+		if( grb::nnz( A ) == 0 ) {
+			return rc;
+		}
+
+		// Do local folding
+		IOType local = monoid.template getIdentity< IOType >();
+		rc = foldr< descr >( local, internal::getLocal( A ), internal::getLocal( mask ), monoid );
+
+#ifdef _DEBUG
+		std::cout << "After process-local delegation, local value has become "
+			<< local << ". Entering allreduce..." << std::endl;
+#endif
+
+		// All-reduce using \a op
+		rc = rc ? rc : collectives< BSP1D >::allreduce< descr >( local, monoid.getOperator() );
+
+		// Accumulate end result
+		rc = rc ? rc : foldr( x, local, monoid.getOperator() );
+
+		return SUCCESS;
+	}
+
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Monoid,
+		typename InputType, typename IOType,
+		typename RIT, typename CIT, typename NIT
+	>
+	RC foldr(
+		IOType &x,
+		const Matrix< InputType, BSP1D, RIT, CIT, NIT > &A,
+		const Monoid &monoid,
+		const typename std::enable_if< !grb::is_object< IOType >::value &&
+			!grb::is_object< InputType >::value &&
+			grb::is_monoid< Monoid >::value, void
+		>::type * const = nullptr
+	) {
+		// static checks
+		static_assert( !std::is_same< InputType, void >::value,
+			"grb::foldr( BSP1D, IOType <- op( IOType, InputType ): "
+			"the operator version of foldr cannot be used if the "
+			"input matrix is a pattern matrix (of type void)"
+		);
+		static_assert( !std::is_same< IOType, void >::value,
+			"grb::foldr( BSP1D, IOType <- op( IOType, InputType ): "
+			"the operator version of foldr cannot be used if the "
+			"result is of type void"
+		);
+		static_assert( (std::is_same< typename Monoid::D1, InputType >::value),
+			"grb::foldr( BSP1D, IOType <- op( IOType, InputType ): "
+			"called with a prefactor input type that does not match the first domain of the given operator"
+		);
+		static_assert( (std::is_same< typename Monoid::D2, IOType >::value),
+			"grb::foldr( BSP1D, IOType <- op( IOType, InputType ): "
+			"called with a postfactor input type that does not match the first domain of the given operator"
+		);
+		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
+			"grb::foldr( BSP1D, IOType <- op( IOType, InputType ): "
+			"called with an output type that does not match the output domain of the given operator"
+		);
+
+#ifdef _DEBUG
+		std::cout << "In grb::foldr( BSP1D, matrix, monoid )\n";
+#endif
+		RC rc = SUCCESS;
+
+		if( grb::nnz( A ) == 0 ) {
+			return rc;
+		}
+
+		// Do local folding
+		IOType local = monoid.template getIdentity< IOType >();
+		rc = foldr< descr >( local, internal::getLocal( A ), monoid );
+
+#ifdef _DEBUG
+		std::cout << "After process-local delegation, local value has become "
+			<< local << ". Entering allreduce..." << std::endl;
+#endif
+
+		// All-reduce using \a op
+		rc = rc ? rc : collectives< BSP1D >::allreduce< descr >( local, monoid.getOperator() );
+
+		// Accumulate end result
+		rc = rc ? rc : foldr( x, local, monoid.getOperator() );
+
+		return SUCCESS;
+	}
+
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Monoid,
+		typename InputType, typename IOType, typename MaskType,
+		typename RIT_A, typename CIT_A, typename NIT_A,
+		typename RIT_M, typename CIT_M, typename NIT_M
+	>
+	RC foldl(
+		IOType &x,
+		const Matrix< InputType, BSP1D, RIT_A, CIT_A, NIT_A > &A,
+		const Matrix< MaskType, BSP1D, RIT_M, CIT_M, NIT_M > &mask,
+		const Monoid &monoid,
+		const typename std::enable_if<
+			!grb::is_object< IOType >::value &&
+			!grb::is_object< InputType >::value &&
+			!grb::is_object< MaskType >::value &&
+			grb::is_monoid< Monoid >::value, void
+		>::type * const = nullptr
+	) {
+		// static checks
+		static_assert( !std::is_same< InputType, void >::value,
+			"grb::foldl( BSP1D, IOType <- op( IOType, InputType ): "
+			"the operator version of foldl cannot be used if the "
+			"input matrix is a pattern matrix (of type void)"
+		);
+		static_assert( !std::is_same< IOType, void >::value,
+			"grb::foldl( BSP1D, IOType <- op( IOType, InputType ): "
+			"the operator version of foldl cannot be used if the "
+			"result is of type void"
+		);
+		static_assert( (std::is_same< typename Monoid::D1, IOType >::value),
+			"grb::foldl( BSP1D, IOType <- op( IOType, InputType ): "
+			"called with a prefactor input type that does not match the first domain of the given operator"
+		);
+		static_assert( (std::is_same< typename Monoid::D2, InputType >::value),
+			"grb::foldl( BSP1D, IOType <- op( IOType, InputType ): "
+			"called with a postfactor input type that does not match the first domain of the given operator"
+		);
+		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
+			"grb::foldl( BSP1D, IOType <- op( IOType, InputType ): "
+			"called with an output type that does not match the output domain of the given operator"
+		);
+
+#ifdef _DEBUG
+		std::cout << "In grb::foldl( BSP1D, matrix, mask, monoid )\n";
+#endif
+		RC rc = SUCCESS;
+
+		if( grb::nnz( A ) == 0 ) {
+			return rc;
+		}
+
+		// Do local folding
+		IOType local = monoid.template getIdentity< IOType >();
+		rc = foldl< descr >( local, internal::getLocal( A ), internal::getLocal( mask ), monoid );
+
+#ifdef _DEBUG
+		std::cout << "After process-local delegation, local value has become "
+			<< local << ". Entering allreduce..." << std::endl;
+#endif
+
+		// All-reduce using \a op
+		rc = rc ? rc : collectives< BSP1D >::allreduce< descr >( local, monoid.getOperator() );
+
+		// Accumulate end result
+		rc = rc ? rc : foldl( x, local, monoid.getOperator() );
+
+		return SUCCESS;
+	}
+
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Monoid,
+		typename InputType, typename IOType,
+		typename RIT, typename CIT, typename NIT
+	>
+	RC foldl(
+		IOType &x,
+		const Matrix< InputType, BSP1D, RIT, CIT, NIT > &A,
+		const Monoid &monoid,
+		const typename std::enable_if<
+			!grb::is_object< IOType >::value &&
+			!grb::is_object< InputType >::value &&
+			grb::is_monoid< Monoid >::value, void
+		>::type * const = nullptr
+	) {
+		// static checks
+		static_assert( !std::is_same< InputType, void >::value,
+			"grb::foldl( BSP1D, IOType <- op( IOType, InputType ): "
+			"the operator version of foldl cannot be used if the "
+			"input matrix is a pattern matrix (of type void)"
+		);
+		static_assert( !std::is_same< IOType, void >::value,
+			"grb::foldl( BSP1D, IOType <- op( IOType, InputType ): "
+			"the operator version of foldl cannot be used if the "
+			"result is of type void"
+		);
+		static_assert( (std::is_same< typename Monoid::D1, IOType >::value),
+			"grb::foldl( BSP1D, IOType <- op( IOType, InputType ): "
+			"called with a prefactor input type that does not match the first domain of the given operator"
+		);
+		static_assert( (std::is_same< typename Monoid::D2, InputType >::value),
+			"grb::foldl( BSP1D, IOType <- op( IOType, InputType ): "
+			"called with a postfactor input type that does not match the first domain of the given operator"
+		);
+		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
+			"grb::foldl( BSP1D, IOType <- op( IOType, InputType ): "
+			"called with an output type that does not match the output domain of the given operator"
+		);
+
+#ifdef _DEBUG
+		std::cout << "In grb::foldl( BSP1D, matrix, monoid )\n";
+#endif
+		RC rc = SUCCESS;
+
+		if( grb::nnz( A ) == 0 ) {
+			return rc;
+		}
+
+		// Do local folding
+		IOType local = monoid.template getIdentity< IOType >();
+		rc = foldl< descr >( local, internal::getLocal( A ), monoid );
+
+#ifdef _DEBUG
+		std::cout << "After process-local delegation, local value has become "
+			<< local << ". Entering allreduce..." << std::endl;
+#endif
+
+		// All-reduce using \a op
+		rc = rc ? rc : collectives< BSP1D >::allreduce< descr >( local, monoid.getOperator() );
+
+		// Accumulate end result
+		rc = rc ? rc : foldl( x, local, monoid.getOperator() );
+
+		return SUCCESS;
+	}
+
 } // namespace grb
 
 #endif
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index d4427d2d4..6991c8b7d 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -122,7 +122,7 @@ add_grb_executables( matrixIterator matrixIterator.cpp
 )
 
 add_grb_executables( fold_matrix_to_scalar fold_matrix_to_scalar.cpp
-	BACKENDS reference reference_omp hyperdags
+	BACKENDS reference reference_omp hyperdags bsp1d hybrid
 )
 
 add_grb_executables( doubleAssign doubleAssign.cpp
diff --git a/tests/unit/fold_matrix_to_scalar.cpp b/tests/unit/fold_matrix_to_scalar.cpp
index 1a2bd9e1a..a22c7667e 100644
--- a/tests/unit/fold_matrix_to_scalar.cpp
+++ b/tests/unit/fold_matrix_to_scalar.cpp
@@ -238,6 +238,9 @@ template< typename T, typename M >
 struct input {
 	const grb::Matrix< T > & A;
 	const grb::Matrix< M > & mask;
+	
+	// Default constructor for distributed backends
+	input( const grb::Matrix< T > & A = {0,0}, const grb::Matrix< M > & mask = {0,0} ) : A( A ), mask( mask ) {}
 };
 
 template< typename T, typename M >
@@ -517,11 +520,11 @@ int main( int argc, char ** argv ) {
 		std::vector< NzType > I_vals( n, 1.f );
 		std::iota( I_rows.begin(), I_rows.end(), 0 );
 		std::iota( I_cols.begin(), I_cols.end(), 0 );
-		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), SEQUENTIAL );
 		Matrix< void > mask( n, n );
-		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
+		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), SEQUENTIAL );
 		std::cout << "-- Running test 01: Identity square matrix of size n = " << n << std::endl;
-		input< NzType, void > input = { I, mask };
+		input< NzType, void > input(I, mask);
 		if( launcher.exec( &grb_program, input, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 01 FAILED\n";
 			return 255;
@@ -534,11 +537,11 @@ int main( int argc, char ** argv ) {
 		std::vector< size_t > I_rows( n, 0 ), I_cols( n );
 		std::vector< NzType > I_vals( n, 1.f );
 		std::iota( I_cols.begin(), I_cols.end(), 0 );
-		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), SEQUENTIAL );
 		Matrix< void > mask( n, n );
-		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
+		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), SEQUENTIAL );
 		std::cout << "-- Running test 02: Square matrix of size n = " << n << ", with n 1s on the first row" << std::endl;
-		input< NzType, void > input = { I, mask };
+		input< NzType, void > input(I, mask);
 		if( launcher.exec( &grb_program, input, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 02 FAILED\n";
 			return 255;
@@ -555,7 +558,7 @@ int main( int argc, char ** argv ) {
 		Matrix< void > mask( n, n );
 		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
 		std::cout << "-- Running test 03: Square matrix of size n = " << n << ", with n 1s on the first column" << std::endl;
-		input< NzType, void > input = { I, mask };
+		input< NzType, void > input(I, mask);
 		if( launcher.exec( &grb_program, input, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 03 FAILED\n";
 			return 255;
@@ -569,11 +572,11 @@ int main( int argc, char ** argv ) {
 		std::vector< NzType > I_vals( 2 * n - 1, 1.f );
 		std::iota( I_rows.begin() + n, I_rows.end(), 1 );
 		std::iota( I_cols.begin(), I_cols.begin() + n, 0 );
-		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), SEQUENTIAL );
 		Matrix< void > mask( n, n );
-		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
+		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), SEQUENTIAL );
 		std::cout << "-- Running test 04: Square matrix of size n = " << n << ", with n 1s on the first row and column" << std::endl;
-		input< NzType, void > input = { I, mask };
+		input< NzType, void > input(I, mask);
 		if( launcher.exec( &grb_program, input, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 04 FAILED\n";
 			return 255;
@@ -586,11 +589,11 @@ int main( int argc, char ** argv ) {
 		std::vector< size_t > I_rows( n, 0 ), I_cols( n, 0 );
 		std::vector< NzType > I_vals( n, 1.f );
 		std::iota( I_cols.begin(), I_cols.end(), 0 );
-		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), SEQUENTIAL );
 		Matrix< void > mask( 1, n );
-		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
+		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), SEQUENTIAL );
 		std::cout << "-- Running test 05: [1-row, n = " << n << " columns] matrix, filled with 1s" << std::endl;
-		input< NzType, void > input = { I, mask };
+		input< NzType, void > input(I, mask);
 		if( launcher.exec( &grb_program, input, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 04 FAILED\n";
 			return 255;
@@ -603,11 +606,11 @@ int main( int argc, char ** argv ) {
 		std::vector< size_t > I_rows( n, 0 ), I_cols( n, 0 );
 		std::vector< NzType > I_vals( n, 1.f );
 		std::iota( I_rows.begin(), I_rows.end(), 0 );
-		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), SEQUENTIAL );
 		Matrix< void > mask( n, 1 );
-		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
+		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), SEQUENTIAL );
 		std::cout << "-- Running test 06: [n = " << n << " rows, 1 column] matrix, filled with 1s" << std::endl;
-		input< NzType, void > input = { I, mask };
+		input< NzType, void > input(I, mask);
 		if( launcher.exec( &grb_program, input, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 06 FAILED\n";
 			return 255;

From 7d5f34c6ee8eb51302e82c2b8136b6733445f9f3 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Wed, 28 Jun 2023 14:09:44 +0200
Subject: [PATCH 28/63] Implementation in nonblocking (delegating)

---
 include/graphblas/nonblocking/blas3.hpp | 112 ++++++++++++++++++++++++
 tests/unit/CMakeLists.txt               |   2 +-
 2 files changed, 113 insertions(+), 1 deletion(-)

diff --git a/include/graphblas/nonblocking/blas3.hpp b/include/graphblas/nonblocking/blas3.hpp
index 5a222c7f2..ace0ccbe0 100644
--- a/include/graphblas/nonblocking/blas3.hpp
+++ b/include/graphblas/nonblocking/blas3.hpp
@@ -571,6 +571,118 @@ namespace grb {
 		);
 	}
 
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Monoid,
+		typename InputType, typename IOType, typename MaskType,
+		typename RIT_A, typename CIT_A, typename NIT_A,
+		typename RIT_M, typename CIT_M, typename NIT_M
+	>
+	RC foldr(
+		IOType &x,
+		const Matrix< InputType, nonblocking, RIT_A, CIT_A, NIT_A > &A,
+		const Matrix< MaskType, nonblocking, RIT_M, CIT_M, NIT_M > &mask,
+		const Monoid &monoid = Monoid(),
+		const typename std::enable_if< !grb::is_object< IOType >::value &&
+			!grb::is_object< InputType >::value &&
+			!grb::is_object< MaskType >::value &&
+			grb::is_monoid< Monoid >::value, void
+		>::type * const = nullptr
+	) {
+#ifdef _DEBUG
+		std::cout << "In grb::foldr( nonblocking, matrix, mask, monoid )\n";
+#endif
+		// nonblocking execution is not supported
+		// first, execute any computation that is not completed
+		internal::le.execution();
+
+		// second, delegate to the reference backend
+		return foldr< descr, Monoid >( x, internal::getRefMatrix( A ), internal::getRefMatrix( mask ), monoid );
+	}
+
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Monoid,
+		typename InputType, typename IOType,
+		typename RIT, typename CIT, typename NIT
+	>
+	RC foldr(
+		IOType &x,
+		const Matrix< InputType, nonblocking, RIT, CIT, NIT > &A,
+		const Monoid &monoid,
+		const typename std::enable_if< !grb::is_object< IOType >::value &&
+			!grb::is_object< InputType >::value &&
+			grb::is_monoid< Monoid >::value, void
+		>::type * const = nullptr
+	) {
+#ifdef _DEBUG
+		std::cout << "In grb::foldr( nonblocking, matrix, monoid )\n";
+#endif
+		// nonblocking execution is not supported
+		// first, execute any computation that is not completed
+		internal::le.execution();
+
+		// second, delegate to the reference backend
+		return foldr< descr, Monoid >( x, internal::getRefMatrix( A ), monoid	);
+	}
+
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Monoid,
+		typename InputType, typename IOType, typename MaskType,
+		typename RIT_A, typename CIT_A, typename NIT_A,
+		typename RIT_M, typename CIT_M, typename NIT_M
+	>
+	RC foldl(
+		IOType &x,
+		const Matrix< InputType, nonblocking, RIT_A, CIT_A, NIT_A > &A,
+		const Matrix< MaskType, nonblocking, RIT_M, CIT_M, NIT_M > &mask,
+		const Monoid &monoid,
+		const typename std::enable_if<
+			!grb::is_object< IOType >::value &&
+			!grb::is_object< InputType >::value &&
+			!grb::is_object< MaskType >::value &&
+			grb::is_monoid< Monoid >::value, void
+		>::type * const = nullptr
+	) {
+#ifdef _DEBUG
+		std::cout << "In grb::foldl( nonblocking, matrix, mask, monoid )\n";
+#endif
+		// nonblocking execution is not supported
+		// first, execute any computation that is not completed
+		internal::le.execution();
+
+		// second, delegate to the reference backend
+		return foldl< descr, Monoid >( x, internal::getRefMatrix( A ), internal::getRefMatrix( mask ), monoid );
+	}
+
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Monoid,
+		typename InputType, typename IOType,
+		typename RIT, typename CIT, typename NIT
+	>
+	RC foldl(
+		IOType &x,
+		const Matrix< InputType, nonblocking, RIT, CIT, NIT > &A,
+		const Monoid &monoid,
+		const typename std::enable_if<
+			!grb::is_object< IOType >::value &&
+			!grb::is_object< InputType >::value &&
+			grb::is_monoid< Monoid >::value, void
+		>::type * const = nullptr
+	) {
+#ifdef _DEBUG
+		std::cout << "In grb::foldl( nonblocking, matrix, monoid )\n";
+#endif
+		// nonblocking execution is not supported
+		// first, execute any computation that is not completed
+		internal::le.execution();
+
+		// second, delegate to the reference backend
+		return foldl< descr, Monoid >( x, internal::getRefMatrix( A ), monoid	);
+	}
+
 } // namespace grb
 
 #undef NO_CAST_ASSERT
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 6991c8b7d..2ee3de02e 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -122,7 +122,7 @@ add_grb_executables( matrixIterator matrixIterator.cpp
 )
 
 add_grb_executables( fold_matrix_to_scalar fold_matrix_to_scalar.cpp
-	BACKENDS reference reference_omp hyperdags bsp1d hybrid
+	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
 )
 
 add_grb_executables( doubleAssign doubleAssign.cpp

From dccb53b01bf26fd01026120b277913befd75a367 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Wed, 28 Jun 2023 22:52:29 +0200
Subject: [PATCH 29/63] Cleaning for review

---
 include/graphblas/base/blas3.hpp      | 158 ++++----------------------
 include/graphblas/reference/blas3.hpp |  88 +++++++-------
 2 files changed, 66 insertions(+), 180 deletions(-)

diff --git a/include/graphblas/base/blas3.hpp b/include/graphblas/base/blas3.hpp
index e05d54d9f..d62d43330 100644
--- a/include/graphblas/base/blas3.hpp
+++ b/include/graphblas/base/blas3.hpp
@@ -445,6 +445,7 @@ namespace grb {
 
 	/**
 	 * Reduces, or \em folds, a matrix into a scalar.
+	 * Right-to-left masked variant.
 	 *
 	 * Reduction takes place according a monoid \f$ (\oplus,1) \f$, where
 	 * \f$ \oplus:\ D_1 \times D_2 \to D_3 \f$ with associated identities
@@ -464,8 +465,6 @@ namespace grb {
 	 *       operator domains switched may be supplied, or #grb::foldr may be used
 	 *       instead.
 	 *
-	 * After a successfull call, \a x will be equal to \f$ x_n \f$.
-	 *
 	 * Note that the operator \f$ \oplus \f$ must be associative since it is part
 	 * of a monoid. This algebraic property is exploited when parallelising the
 	 * requested operation. The identity is required when parallelising over
@@ -479,25 +478,27 @@ namespace grb {
 	 *                   left unspecified).
 	 * @tparam Monoid    The monoid to use for reduction.
 	 * @tparam InputType The type of the elements in the supplied ALP/GraphBLAS
-	 *                   matrix \a y.
+	 *                   matrix \a A.
 	 * @tparam IOType    The type of the output scalar \a x.
 	 * @tparam MaskType  The type of the elements in the supplied ALP/GraphBLAS
 	 *                   matrix \a mask.
 	 *
 	 * @param[in, out] x   The result of the reduction.
 	 * 					   Prior value will be considered.
-	 * @param[in]    A     Any ALP/GraphBLAS matrix.
-	 * @param[in]  mask    Any ALP/GraphBLAS matrix.
+	 * @param[in]      A   Any ALP/GraphBLAS matrix, will be reduced into \a x.
+	 * @param[in]   mask   Any ALP/GraphBLAS matrix, will mask the matrix \a A.
+	 * 					   Dimensions must match those of \a A.
 	 * @param[in] monoid   The monoid under which to perform this reduction.
+	 * 					   An identity element must be provided when using
+	 * 					   threads in order to perform the local reductions.
 	 *
 	 * @return grb::SUCCESS  When the call completed successfully.
 	 * @return grb::MISMATCH If a \a mask was not empty and does not have size
 	 *                       equal to \a y.
-	 * @return grb::ILLEGAL  If the provided input matrix \a y was not dense, while
-	 *                       #grb::descriptors::dense was given.
 	 *
-	 * @see grb::foldl provides similar in-place functionality.
-	 * @see grb::eWiseApply provides out-of-place semantics.
+	 * @see grb::foldl provides similar in-place functionality, but folds in a
+	 * 	left-to-right direction.
+	 * @see The same primitive but unmasked is also provided.
 	 *
 	 * \parblock
 	 * \par Valid descriptors
@@ -556,33 +557,15 @@ namespace grb {
 	 * 
 	 * Please see the masked grb::foldr variant for a full description.
 	 * 
-	 * @tparam descr     The descriptor to be used (descriptors::no_operation if
-	 *                   left unspecified).
-	 * @tparam Operator  The operator to use for reduction.
-	 * @tparam InputType The type of the elements in the supplied ALP/GraphBLAS
-	 *                   matrix \a y.
-	 * @tparam IOType    The type of the output scalar \a x.
-	 *
-	 * @param[in, out] x   The result of the reduction.
-	 * 					   Prior value will be considered.
-	 * @param[in]      A   Any ALP/GraphBLAS matrix.
-	 * @param[in] operator The operator used for reduction.
-	 *
-	 * @return grb::SUCCESS  When the call completed successfully.
-	 * @return grb::ILLEGAL  If the provided input matrix \a y was not dense, while
-	 *                       #grb::descriptors::dense was given.
-	 * 
 	 * \parblock
-	 * \par Valid descriptors
-	 * - descriptors::no_operation: the default descriptor.
-	 * - descriptors::no_casting: the first domain of
-	 * 	 	\a monoid must match \a InputType, the second domain of \a op
-	 * 		match \a IOType, the third domain must match \a IOType.
+	 * 
+	 * \par Valid descriptors specific to this variant
 	 * - descriptors::transpose_matrix: A^T will be considered instead 
 	 * 	 	of \a A.
-	 *
+	 * 
+	 * \note See other valid descriptors in the masked variant.
 	 * \note Invalid descriptors will be ignored.
-	 *
+	 * 
 	 * \endparblock
 	 */
 	template<
@@ -613,82 +596,10 @@ namespace grb {
 
 
 	/**
-	 * Reduces, or \em folds, a matrix into a scalar.
-	 *
-	 * Reduction takes place according a monoid \f$ (\oplus,1) \f$, where
-	 * \f$ \oplus:\ D_1 \times D_2 \to D_3 \f$ with associated identities
-	 * \f$ 1_k in D_k \f$. Usually, \f$ D_k \subseteq D_3, 1 \leq k < 3 \f$,
-	 * though other more exotic structures may be envisioned (and used).
-	 *
-	 * Let \f$ x_0 = 1 \f$ and let
-	 * \f$ x_{i+1} = \begin{cases}
-	 *   x_i \oplus y_i\text{ if }y_i\text{ is nonzero and }
-	 * 	 m_i\text{ evaluates true}x_i\text{ otherwise}
-	 * \end{cases},\f$
-	 * for all \f$ i \in \{ 0, 1, \ldots, n-1 \} \f$.
-	 *
-	 * \note Per this definition, the folding happens in a left-to-right
-	 * 		 direction. If another direction is wanted, which may have use in
-	 *  	 cases where \f$ D_1 \f$ differs from \f$ D_2 \f$, then either a
-	 * 		 monoid with those operator domains switched may be supplied, or
-	 * 		 #grb::foldr may be used instead.
-	 *
-	 * After a successfull call, \a x will be equal to \f$ x_n \f$.
-	 *
-	 * Note that the operator \f$ \oplus \f$ must be associative since it is
-	 * part of a monoid. This algebraic property is exploited when parallelising
-	 * the requested operation. The identity is required when parallelising over
-	 * multiple user processes.
-	 *
-	 * \warning In so doing, the order of the evaluation of the reduction
-	 * 			operation should not be expected to be a serial, left-to-right,
-	 * 			evaluation of the computation chain.
-	 *
-	 * @tparam descr     The descriptor to be used (descriptors::no_operation if
-	 *                   left unspecified).
-	 * @tparam Monoid    The monoid to use for reduction.
-	 * @tparam InputType The type of the elements in the supplied ALP/GraphBLAS
-	 *                   matrix \a y.
-	 * @tparam IOType    The type of the output scalar \a x.
-	 * @tparam MaskType  The type of the elements in the supplied ALP/GraphBLAS
-	 *                   matrix \a mask.
-	 *
-	 * @param[in, out] x  The result of the reduction. 
-	 * 					  Prior value will be considered.
-	 * @param[in] A       Any ALP/GraphBLAS matrix.
-	 * @param[in] mask    Any ALP/GraphBLAS matrix.
-	 * @param[in] monoid  The monoid under which to perform this reduction.
-	 *
-	 * @return grb::SUCCESS  When the call completed successfully.
-	 * @return grb::MISMATCH If a \a mask was not empty and does not have size
-	 *                       equal to \a A.
-	 * @return grb::ILLEGAL  If the provided input matrix \a A was not dense,
-	 * 						 while #grb::descriptors::dense was given.
-	 *
-	 * @see grb::foldr provides similar in-place functionality.
-	 * @see grb::eWiseApply provides out-of-place semantics.
-	 *
-	 * \parblock
-	 * \par Valid descriptors
-	 * - descriptors::no_operation: the default descriptor.
-	 * - descriptors::no_casting: the first domain of
-	 * 	 	\a monoid must match \a InputType, the second domain of \a op
-	 * 		match \a IOType, the third domain must match \a IOType, and the
-	 *   	element type of \a mask must be <tt>bool</tt>. 
-	 * - descriptors::transpose_left: A^T will be considered instead 
-	 * 	 	of \a A.
-	 * - descriptors::transpose_right: mask^T will be considered 
-	 * 	 	instead of \a mask.
-	 * - descriptors::invert_mask: Not supported yet.
-	 *
-	 * \note Invalid descriptors will be ignored.
-	 *
-	 * \endparblock
-	 *
-	 * \par Performance semantics
-	 * Each backend must define performance semantics for this primitive.
-	 *
-	 * @see perfSemantics
+	 * Reduces, or \em folds, a matrix into a scalar. 
+	 * Left-to-right masked variant.
+	 * 
+	 * Please see the masked grb::foldr variant for a full description.
 	 */
 	template<
 		Descriptor descr = descriptors::no_operation,
@@ -724,37 +635,18 @@ namespace grb {
 	 * Reduces, or \em folds, a matrix into a scalar. 
 	 * Left-to-right unmasked variant.
 	 * 
-	 * Please see the masked grb::foldl variant for a full description.
-	 * 
-	 * @tparam descr     The descriptor to be used (descriptors::no_operation if
-	 *                   left unspecified).
-	 * @tparam Operator  The operator to use for reduction.
-	 * @tparam InputType The type of the elements in the supplied ALP/GraphBLAS
-	 *                   matrix \a y.
-	 * @tparam IOType    The type of the output scalar \a x.
-	 *
-	 * @param[in, out] x   The result of the reduction.
-	 * 					   Prior value will be considered.
-	 * @param[in]    A     Any ALP/GraphBLAS matrix.
-	 * @param[in] operator The operator used for reduction.
-	 *
-	 * @return grb::SUCCESS  When the call completed successfully.
-	 * @return grb::ILLEGAL  If the provided input matrix \a y was not dense, while
-	 *                       #grb::descriptors::dense was given.
+	 * Please see the masked grb::foldr variant for a full description.
 	 * 
 	 * \parblock
-	 * \par Valid descriptors
-	 * - descriptors::no_operation: the default descriptor.
-	 * - descriptors::no_casting: the first domain of
-	 * 	 	\a monoid must match \a InputType, the second domain of \a op
-	 * 		match \a IOType, the third domain must match \a IOType.
+	 * 
+	 * \par Valid descriptors specific to this variant
 	 * - descriptors::transpose_matrix: A^T will be considered instead 
 	 * 	 	of \a A.
-	 *
+	 * 
+	 * \note See other valid descriptors in the masked variant.
 	 * \note Invalid descriptors will be ignored.
-	 *
-	 * \endparblock
 	 * 
+	 * \endparblock
 	 */
 	template<
 		Descriptor descr = descriptors::no_operation,
diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp
index 50cdea4ef..6a397dab2 100644
--- a/include/graphblas/reference/blas3.hpp
+++ b/include/graphblas/reference/blas3.hpp
@@ -57,20 +57,21 @@
 		"********************************************************************" \
 		"******************************\n" );
 
-#define OMP_CRITICAL _Pragma("omp critical")
-
-#ifndef _DEBUG_THREADESAFE_PRINT
+#ifndef _DEBUG_PRINT
 	#ifndef _DEBUG
-		#define _DEBUG_THREADESAFE_PRINT( msg )
+		#define _DEBUG_PRINT( msg )
 	#else
-		#ifdef _H_GRB_REFERENCE_OMP_BLAS3
-			#define _DEBUG_THREADESAFE_PRINT( msg ) \
-				OMP_CRITICAL \
-					{ \
-						std::cout << "[T" << omp_get_thread_num() << "] - " << msg << std::flush; \
-					}
+		#ifdef _GRB_WITH_OMP
+			#ifndef _GRB_DEBUG_CRITICAL_SECTION
+				#define _GRB_DEBUG_CRITICAL_SECTION _Pragma("omp critical(_GRB_DEBUG_CRITICAL_SECTION)")
+			#endif
+			#define _DEBUG_PRINT( msg ) \
+				_GRB_DEBUG_CRITICAL_SECTION \
+				{ \
+					std::cout << "[T" << omp_get_thread_num() << "] - " << msg << std::flush; \
+				}
 		#else
-			#define _DEBUG_THREADESAFE_PRINT( msg ) std::cout << msg << std::flush;
+			#define _DEBUG_PRINT( msg ) std::cout << msg << std::flush;
 		#endif
 	#endif
 #endif
@@ -939,7 +940,7 @@ namespace grb {
 		template<
 			Descriptor descr = descriptors::no_operation,
 			class Monoid,
-			typename InputType, typename IOType, typename MaskType,
+			typename InputType, typename IOType,
 			typename RIT, typename CIT, typename NIT
 		>
 		RC fold_unmasked_generic(
@@ -947,35 +948,33 @@ namespace grb {
 			const Matrix< InputType, reference, RIT, CIT, NIT > &A,
 			const Monoid &monoid
 		) {
-			_DEBUG_THREADESAFE_PRINT( "In grb::internal::foldr_unmasked_generic( reference )\n" );
+			_DEBUG_PRINT( "In grb::internal::foldr_unmasked_generic( reference )\n" );
 			RC rc = SUCCESS;
 
 			if( grb::nnz( A ) == 0 ) {
+				_DEBUG_PRINT( "The input matrix is empty, nothing to compute\n" );
 				return rc;
 			}
 
 			if ( descr & descriptors::force_row_major && descr & descriptors::transpose_left ) {
-				_DEBUG_THREADESAFE_PRINT( "Masked fold with force_row_major and transpose_left is not supported\n" );
+				_DEBUG_PRINT( "Masked fold with force_row_major and transpose_left is not supported\n" );
 				return RC::ILLEGAL;
 			}
 			if ( descr & descriptors::force_row_major && descr & descriptors::transpose_matrix ) {
-				_DEBUG_THREADESAFE_PRINT( "Masked fold with force_row_major and transpose_matrix is not supported\n" );
+				_DEBUG_PRINT( "Masked fold with force_row_major and transpose_matrix is not supported\n" );
 				return RC::ILLEGAL;
 			}
 
-			const auto& identity = monoid.template getIdentity< typename Monoid::D3 >();
-			const auto& op = monoid.getOperator();
-
 			const auto &A_raw = (descr & grb::descriptors::transpose_matrix || descr & grb::descriptors::transpose_left ) ?
 				internal::getCCS( A ) : internal::getCRS( A );
 			const size_t A_nnz = nnz( A );
 
-
+			const auto& op = monoid.getOperator();
 			RC local_rc = rc;
-			auto local_x = identity;
+			auto local_x = monoid.template getIdentity< typename Monoid::D3 >();
 
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
-	#pragma omp parallel default(none) shared(A_raw, x, rc, std::cout) firstprivate(local_x, local_rc, A_nnz, op, identity)
+	#pragma omp parallel default(none) shared(A_raw, x, rc, std::cout) firstprivate(local_x, local_rc, A_nnz, op)
 #endif
 			{
 				size_t start = 0;
@@ -987,12 +986,12 @@ namespace grb {
 				for( size_t idx = start; idx < end; ++idx ) {
 					// Get A value
 					const InputType a_val = A_raw.values[ idx ];
-					_DEBUG_THREADESAFE_PRINT( "A.CRS.values[ " + std::to_string( idx ) + " ] = " + std::to_string( a_val ) + "\n" );
+					_DEBUG_PRINT( "A.values[ " + std::to_string( idx ) + " ] = " + std::to_string( a_val ) + "\n" );
 
 					// Compute the fold for this coordinate
 					auto local_x_before = local_x;
 					local_rc = local_rc ? local_rc : grb::apply< descr >( local_x, local_x_before, a_val, op );
-					_DEBUG_THREADESAFE_PRINT( "Computing: local_x = op(" + std::to_string( a_val ) + ", " + std::to_string( local_x_before ) + ") = " + std::to_string( local_x ) + "\n" );
+					_DEBUG_PRINT( "Computing: local_x = op(" + std::to_string( a_val ) + ", " + std::to_string( local_x_before ) + ") = " + std::to_string( local_x ) + "\n" );
 				}
 
 
@@ -1002,9 +1001,7 @@ namespace grb {
 				{ // Reduction with the global result (critical section if OpenMP)
 					auto x_before = x;
 					local_rc = local_rc ? local_rc : grb::apply< descr >( x, x_before, local_x, op );
-#ifdef _DEBUG
-					std::cout << "Computing x: op(" << local_x << ", " << x_before << ") = " << x << std::endl;
-#endif
+					_DEBUG_PRINT( "Computing x: op(" + std::to_string( local_x ) + ", " + std::to_string( x_before ) + ") = " + std::to_string( x ) + "\n" );
 					rc = rc ? rc : local_rc;
 				}
 			}
@@ -1025,19 +1022,20 @@ namespace grb {
 			const Matrix< MaskType, reference, RIT_M, CIT_M, NIT_M > &mask,
 			const Monoid &monoid
 		) {
-			_DEBUG_THREADESAFE_PRINT( "In grb::internal::foldr_masked_generic( reference )\n" );
+			_DEBUG_PRINT( "In grb::internal::foldr_masked_generic( reference )\n" );
 			RC rc = SUCCESS;
 
 			if( grb::nnz( mask ) == 0 || grb::nnz( A ) == 0 ) {
+				_DEBUG_PRINT( "The mask and/or the input matrix are empty, nothing to compute\n" );
 				return rc;
 			}
 
 			if ( descr & descriptors::force_row_major && descr & descriptors::transpose_left ) {
-				_DEBUG_THREADESAFE_PRINT( "Masked fold with force_row_major and transpose_left is not supported\n" );
+				_DEBUG_PRINT( "Masked fold with force_row_major and transpose_left is not supported\n" );
 				return RC::ILLEGAL;
 			}
 			if ( descr & descriptors::force_row_major && descr & descriptors::transpose_right ) {
-				_DEBUG_THREADESAFE_PRINT( "Masked fold with force_row_major and transpose_right is not supported\n" );
+				_DEBUG_PRINT( "Masked fold with force_row_major and transpose_right is not supported\n" );
 				return RC::ILLEGAL;
 			}
 
@@ -1059,7 +1057,7 @@ namespace grb {
 
 			// Check mask dimensions
 			if( m != m_mask || n != n_mask ) {
-				_DEBUG_THREADESAFE_PRINT( "Mask dimensions do not match input matrix dimensions\n" );
+				_DEBUG_PRINT( "Mask dimensions do not match input matrix dimensions\n" );
 				return MISMATCH;
 			}
 
@@ -1082,25 +1080,24 @@ namespace grb {
 
 						// Increment the mask pointer until we find the right column, or a lower column (since the storage withing a row is sorted in a descending order)
 						while( mask_k < mask_raw.col_start[ i + 1 ] && mask_raw.row_index[ mask_k ] > k_col  ) {
-							_DEBUG_THREADESAFE_PRINT( "NEquals masked coordinate: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
+							_DEBUG_PRINT( "NEquals masked coordinate: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
 							mask_k++;
 						}
 
 						if( mask_raw.row_index[ mask_k ] < k_col || not MaskHasValue< MaskType >( mask_raw, mask_k ).value ) {
 							mask_k++;
-							_DEBUG_THREADESAFE_PRINT( "Skip masked value at: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
+							_DEBUG_PRINT( "Skip masked value at: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
 							continue;
 						}
 
-						_DEBUG_THREADESAFE_PRINT( "Found masked value at: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
 						// Get A value
 						const InputType a_val = A_raw.getValue( k, identity );
-						_DEBUG_THREADESAFE_PRINT( "A( " + std::to_string( i ) + ";" + std::to_string( k_col ) + " ) = " + std::to_string( a_val ) + "\n" );
+						_DEBUG_PRINT( "A( " + std::to_string( i ) + ";" + std::to_string( k_col ) + " ) = " + std::to_string( a_val ) + "\n" );
 
 						// Compute the fold for this coordinate
 						auto local_x_before = local_x;
 						local_rc = local_rc ? local_rc : grb::apply< descr >( local_x, local_x_before, a_val, op );
-						_DEBUG_THREADESAFE_PRINT( "Computing: local_x = op(" + std::to_string( a_val ) + ", " + std::to_string( local_x_before ) + ") = " + std::to_string( local_x ) + "\n" );
+						_DEBUG_PRINT( "Computing: local_x = op(" + std::to_string( a_val ) + ", " + std::to_string( local_x_before ) + ") = " + std::to_string( local_x ) + "\n" );
 					}
 				}
 
@@ -1110,9 +1107,7 @@ namespace grb {
 				{ // Reduction with the global result (critical section if OpenMP)
 					auto x_before = x;
 					local_rc = local_rc ? local_rc : grb::apply< descr >( x, x_before, local_x, op );
-#ifdef _DEBUG
-					std::cout << "Computing x: op(" << local_x << ", " << x_before << ") = " << x << std::endl;
-#endif
+					_DEBUG_PRINT( "Computing x: op(" + std::to_string( local_x ) + ", " + std::to_string( x_before ) + ") = " + std::to_string( x ) + "\n" );
 					rc = rc ? rc : local_rc;
 				}
 			}
@@ -1472,7 +1467,6 @@ namespace grb {
 	 *
 	 * \internal Dispatches to internal::eWiseApply_matrix_generic
 	 */
-
 	template<
 		Descriptor descr = grb::descriptors::no_operation,
 		class Operator,
@@ -1564,7 +1558,7 @@ namespace grb {
 		);
 		static_assert( (std::is_same< typename Monoid::D2, IOType >::value),
 			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
-			"called with a postfactor input type that does not match the first domain of the given operator"
+			"called with a postfactor input type that does not match the second domain of the given operator"
 		);
 		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
 			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
@@ -1575,7 +1569,7 @@ namespace grb {
 		std::cout << "In grb::foldr( reference, mask, matrix, monoid )\n";
 #endif
 
-		return internal::fold_masked_generic< descr, Monoid, InputType, IOType, MaskType >(
+		return internal::fold_masked_generic< descr, Monoid >(
 			x, A, mask, monoid
 		);
 	}
@@ -1612,7 +1606,7 @@ namespace grb {
 		);
 		static_assert( (std::is_same< typename Monoid::D2, IOType >::value),
 			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
-			"called with a postfactor input type that does not match the first domain of the given operator"
+			"called with a postfactor input type that does not match the second domain of the given operator"
 		);
 		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
 			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
@@ -1623,7 +1617,7 @@ namespace grb {
 		std::cout << "In grb::foldr( reference, matrix, monoid )\n";
 #endif
 
-		return internal::fold_unmasked_generic< descr, Monoid, InputType, IOType, void >(
+		return internal::fold_unmasked_generic< descr, Monoid >(
 			x, A, monoid
 		);
 	}
@@ -1664,7 +1658,7 @@ namespace grb {
 		);
 		static_assert( (std::is_same< typename Monoid::D2, InputType >::value),
 			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
-			"called with a postfactor input type that does not match the first domain of the given operator"
+			"called with a postfactor input type that does not match the second domain of the given operator"
 		);
 		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
 			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
@@ -1675,7 +1669,7 @@ namespace grb {
 		std::cout << "In grb::foldl( reference, mask, matrix, monoid )\n";
 #endif
 
-		return internal::fold_masked_generic< descr, Monoid, InputType, IOType, MaskType >(
+		return internal::fold_masked_generic< descr, Monoid >(
 			x, A, mask, monoid
 		);
 	}
@@ -1713,7 +1707,7 @@ namespace grb {
 		);
 		static_assert( (std::is_same< typename Monoid::D2, InputType >::value),
 			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
-			"called with a postfactor input type that does not match the first domain of the given operator"
+			"called with a postfactor input type that does not match the second domain of the given operator"
 		);
 		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
 			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
@@ -1724,7 +1718,7 @@ namespace grb {
 		std::cout << "In grb::foldl( reference, matrix, monoid )\n";
 #endif
 
-		return internal::fold_unmasked_generic< descr, Monoid, InputType, IOType, void >(
+		return internal::fold_unmasked_generic< descr, Monoid >(
 			x, A, monoid
 		);
 	}

From 218775e5165c08ad3224200865da91be066bb672 Mon Sep 17 00:00:00 2001
From: byjtew <benjamin.lozes@protonmail.com>
Date: Wed, 17 May 2023 15:03:13 +0200
Subject: [PATCH 30/63] matrixReduce using foldl+foldr unit-test

---
 tests/unit/matrixReduce.cpp | 284 ++++++++++++++++++++++++++++++++++++
 1 file changed, 284 insertions(+)
 create mode 100644 tests/unit/matrixReduce.cpp

diff --git a/tests/unit/matrixReduce.cpp b/tests/unit/matrixReduce.cpp
new file mode 100644
index 000000000..47106b361
--- /dev/null
+++ b/tests/unit/matrixReduce.cpp
@@ -0,0 +1,284 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Tests for the reduce( Matrix<D>, T, Operator<T,D,T> ) API call
+ *
+ * @author Benjamin Lozes
+ * @date 17/05/2023
+ *
+ * Tests whether the foldl and foldl API calls produce the expected results.
+ * 
+ * The test cases are focused on the following aspects:
+ *   * The types of the result, the matrix values and the operator
+ * 	 * The initial value of the reduction result
+ * 	 * The order of the operands (foldr, foldl)
+ */
+
+#include <iostream>
+#include <numeric>
+#include <sstream>
+#include <vector>
+
+#include <graphblas.hpp>
+
+using namespace grb;
+
+using nz_t = float;
+
+template< typename T, typename V, class Operator >
+RC foldl_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Operator & op ) {
+	T value = initial;
+	foldl( value, A, op );
+
+	std::cout << "foldl_test \"" << test_label << "\": ";
+	if( value == expected )
+		std::cout << "OK" << std::endl;
+	else
+		std::cerr << "Failed" << std::endl
+				  << test_description << std::endl
+				  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
+				  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
+				  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
+
+	return value == expected ? RC::SUCCESS : RC::FAILED;
+}
+
+template< typename T, typename V, class Operator >
+RC foldr_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Operator & op ) {
+	T value = initial;
+	foldr( value, A, op );
+
+	std::cout << "foldr_test \"" << test_label << "\": ";
+	if( value == expected )
+		std::cout << "OK" << std::endl;
+	else
+		std::cerr << "Failed" << std::endl
+				  << test_description << std::endl
+				  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
+				  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
+				  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
+
+	return value == expected ? RC::SUCCESS : RC::FAILED;
+}
+
+template< typename T, typename V, class Operator >
+RC foldLR_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Operator & op ) {
+	RC rc = foldl_test( test_label, test_description, A, initial, expected, op );
+	return rc ? rc : foldr_test( test_label, test_description, A, initial, expected, op );
+}
+
+void grb_program( const long & n, grb::RC & rc ) {
+	// Build an identity matrix
+	Matrix< nz_t > I( n, n );
+	std::vector< size_t > I_rows( n ), I_cols( n );
+	std::vector< nz_t > I_vals( n, 1 );
+	std::iota( I_rows.begin(), I_rows.end(), 0 );
+	std::iota( I_cols.begin(), I_cols.end(), 0 );
+	buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), n, PARALLEL );
+
+	/**    Test case 1:
+	 *  A simple additive reduction with the same types for the nnzs and the reduction result.
+	 *  * Initial value is 0
+	 *  * Expected result: n
+	 */
+	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)n, operators::add< nz_t >() );
+	if( rc )
+		return;
+
+	/**     Test case 2:
+	 *  A simple additive reduction with the same types for the nnzs and the reduction result.
+	 *  * Initial value is n
+	 *  * Expected result: 2*n
+	 */
+	rc = foldLR_test( "2", "A simple reduction(+) with the same types for the nnzs and the reduction result.", I, (nz_t)n, (nz_t)( 2 * n ), operators::add< nz_t >() );
+	if( rc )
+		return;
+
+	/**     Test case 3:
+	 *  A simple additive reduction with different types for the nnzs and the reduction result (size_t <- size_t + float).
+	 *  * Initial value is 0
+	 *  * Expected result: n
+	 */
+	rc = foldl_test(
+		"3", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n, operators::add< size_t, nz_t, size_t >() );
+	if( rc )
+		return;
+	rc = foldr_test(
+		"3", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n, operators::add< nz_t, size_t, size_t >() );
+	if( rc )
+		return;
+
+	/**     Test case 4:
+	 *  A simple additive reduction with different types for the nnzs and the reduction result (size_t <- size_t + float).
+	 *  * Initial value is n
+	 *  * Expected result: 2*n
+	 */
+	rc = foldl_test(
+		"4", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ), operators::add< size_t, nz_t, size_t >() );
+	if( rc )
+		return;
+	rc = foldr_test(
+		"4", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ), operators::add< nz_t, size_t, size_t >() );
+	if( rc )
+		return;
+
+	/**     Test case 5:
+	 * A simple multiplicative reduction with the same types for the nnzs and the reduction result.
+	 * * Initial value is 0
+	 * * Expected result: 0
+	 */
+	rc = foldLR_test( "5", "A simple reduction(*) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)0, operators::mul< nz_t >() );
+	if( rc )
+		return;
+
+	/**     Test case 6:
+	 * A simple multiplicative reduction with the same types for the nnzs and the reduction result.
+	 * * Initial value is 1
+	 * * Expected result: 1
+	 */
+	rc = foldLR_test( "6", "A simple reduction(*) with the same types for the nnzs and the reduction result.", I, (nz_t)1, (nz_t)1, operators::mul< nz_t >() );
+	if( rc )
+		return;
+
+	/**     Test case 7:
+	 * A simple multiplicative reduction with different types for the nnzs and the reduction result (size_t <- size_t * float).
+	 * * Initial value is 0
+	 * * Expected result: 0
+	 */
+	rc = foldl_test(
+		"7", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0, operators::mul< size_t, nz_t, size_t >() );
+	if( rc )
+		return;
+	rc = foldr_test(
+		"7", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0, operators::mul< nz_t, size_t, size_t >() );
+	if( rc )
+		return;
+
+	/**     Test case 8:
+	 * A simple multiplicative reduction with different types for the nnzs and the reduction result (size_t <- size_t * float).
+	 * * Initial value is 1
+	 * * Expected result: 1
+	 */
+	rc = foldl_test(
+		"8", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1, operators::mul< size_t, nz_t, size_t >() );
+	if( rc )
+		return;
+	rc = foldr_test(
+		"8", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1, operators::mul< nz_t, size_t, size_t >() );
+	if( rc )
+		return;
+
+	/**     Test case 9:
+	 * A simple binary equal reduction with different types for the nnzs and the reduction result (bool <- bool == float).
+	 * * Initial value is true
+	 * * Expected result: true
+	 */
+	rc = foldl_test(
+		"9", "A simple reduction(==) with different types for the nnzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true, operators::equal< bool, nz_t, bool >() );
+	if( rc )
+		return;
+	rc = foldr_test(
+		"9", "A simple reduction(==) with different types for the nnzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true, operators::equal< nz_t, bool, bool >() );
+	if( rc )
+		return;
+
+	/**     Test case 10:
+	 * A simple binary logical_or reduction with different types for the nnzs and the reduction result (bool <- bool || float).
+	 * * Initial value is false
+	 * * Expected result: true
+	 */
+	rc = foldl_test(
+		"10", "A simple reduction(||) with different types for the nnzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true, operators::logical_or< bool, nz_t, bool >() );
+	if( rc )
+		return;
+	rc = foldr_test(
+		"10", "A simple reduction(||) with different types for the nnzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true, operators::logical_or< nz_t, bool, bool >() );
+	if( rc )
+		return;
+
+	/**     Test case 11:  Non-commutative reduction
+	 * A simple substraction reduction with the same types for the nnzs and the reduction result.
+	 * * Initial value is for foldl is 0
+	 * * Expected result for foldl: -n
+	 * 
+	 * * Initial value is for foldr is 0
+	 * * Expected result for foldr: 0
+	 * 
+	 * * Initial value is for foldr is 1
+	 * * Expected result for foldr: 1
+	 */
+	rc = foldl_test( "11", "A non-commutative reduction(-) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)( -n ), operators::subtract< nz_t >() );
+	if( rc )
+		return;
+	rc = foldr_test( "11", "A non-commutative reduction(-) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)0, operators::subtract< nz_t >() );
+	if( rc )
+		return;
+	rc = foldr_test( "11", "A non-commutative reduction(-) with the same types for the nnzs and the reduction result.", I, (nz_t)1, (nz_t)1, operators::subtract< nz_t >() );
+	if( rc )
+		return;
+	
+}
+
+int main( int argc, char ** argv ) {
+	// defaults
+	bool printUsage = false;
+	size_t in = 10;
+
+	// error checking
+	if( argc > 2 ) {
+		printUsage = true;
+	}
+	if( argc == 2 ) {
+		size_t read;
+		std::istringstream ss( argv[ 1 ] );
+		if( ! ( ss >> read ) ) {
+			std::cerr << "Error parsing first argument\n";
+			printUsage = true;
+		} else if( ! ss.eof() ) {
+			std::cerr << "Error parsing first argument\n";
+			printUsage = true;
+		} else if( read % 2 != 0 ) {
+			std::cerr << "Given value for n is odd\n";
+			printUsage = true;
+		} else {
+			// all OK
+			in = read;
+		}
+	}
+	if( printUsage ) {
+		std::cerr << "Usage: " << argv[ 0 ] << " [n]\n";
+		std::cerr << "  -n (optional, default is 10): an even integer, the test "
+				  << "size.\n";
+		return 1;
+	}
+
+	std::cout << "This is functional test " << argv[ 0 ] << "\n";
+	grb::Launcher< AUTOMATIC > launcher;
+	grb::RC out = RC::SUCCESS;
+	if( launcher.exec( &grb_program, (long)in, out, true ) != SUCCESS ) {
+		std::cerr << "Launching test FAILED\n";
+		return 255;
+	}
+	if( out != SUCCESS ) {
+		std::cout << "Test FAILED (" << grb::toString( out ) << ")" << std::endl;
+		return out;
+	} else {
+		std::cout << "Test OK" << std::endl;
+		return 0;
+	}
+}

From 17b6d7da7a9381c46e0f03e95e9e25e35dd98075 Mon Sep 17 00:00:00 2001
From: byjtew <benjamin.lozes@protonmail.com>
Date: Wed, 17 May 2023 17:08:54 +0200
Subject: [PATCH 31/63] parserr bugfix in matrixReduce unit-test

---
 tests/unit/matrixReduce.cpp | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/tests/unit/matrixReduce.cpp b/tests/unit/matrixReduce.cpp
index 47106b361..2aca39581 100644
--- a/tests/unit/matrixReduce.cpp
+++ b/tests/unit/matrixReduce.cpp
@@ -244,21 +244,7 @@ int main( int argc, char ** argv ) {
 		printUsage = true;
 	}
 	if( argc == 2 ) {
-		size_t read;
-		std::istringstream ss( argv[ 1 ] );
-		if( ! ( ss >> read ) ) {
-			std::cerr << "Error parsing first argument\n";
-			printUsage = true;
-		} else if( ! ss.eof() ) {
-			std::cerr << "Error parsing first argument\n";
-			printUsage = true;
-		} else if( read % 2 != 0 ) {
-			std::cerr << "Given value for n is odd\n";
-			printUsage = true;
-		} else {
-			// all OK
-			in = read;
-		}
+		in = std::atol( argv[ 1 ] );
 	}
 	if( printUsage ) {
 		std::cerr << "Usage: " << argv[ 0 ] << " [n]\n";

From aea630dd54cde674944d82110ced6b2ab8606d28 Mon Sep 17 00:00:00 2001
From: byjtew <benjamin.lozes@protonmail.com>
Date: Wed, 17 May 2023 17:09:34 +0200
Subject: [PATCH 32/63] Adapt matrixReduce test for OMP foldl+foldr

---
 tests/unit/matrixReduce.cpp | 121 ++++++++++++++++++------------------
 1 file changed, 60 insertions(+), 61 deletions(-)

diff --git a/tests/unit/matrixReduce.cpp b/tests/unit/matrixReduce.cpp
index 2aca39581..f05ccfd0e 100644
--- a/tests/unit/matrixReduce.cpp
+++ b/tests/unit/matrixReduce.cpp
@@ -22,13 +22,14 @@
  * @date 17/05/2023
  *
  * Tests whether the foldl and foldl API calls produce the expected results.
- * 
+ *
  * The test cases are focused on the following aspects:
  *   * The types of the result, the matrix values and the operator
  * 	 * The initial value of the reduction result
  * 	 * The order of the operands (foldr, foldl)
  */
 
+#include <chrono>
 #include <iostream>
 #include <numeric>
 #include <sstream>
@@ -38,12 +39,24 @@
 
 using namespace grb;
 
+constexpr bool PRINT_TIMERS = false;
+constexpr bool SKIP_FOLDL = false;
+constexpr bool SKIP_FOLDR = false;
+
 using nz_t = float;
 
-template< typename T, typename V, class Operator >
-RC foldl_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Operator & op ) {
+template< typename T, typename V, class Monoid >
+RC foldl_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Monoid & monoid ) {
+	if( SKIP_FOLDL )
+		return RC::SUCCESS;
+
 	T value = initial;
-	foldl( value, A, op );
+	auto start_chrono = std::chrono::high_resolution_clock::now();
+	foldl( value, A, monoid );
+	auto end_chrono = std::chrono::high_resolution_clock::now();
+	auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
+	if( PRINT_TIMERS )
+		std::cout << "foldl_test \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
 
 	std::cout << "foldl_test \"" << test_label << "\": ";
 	if( value == expected )
@@ -58,10 +71,18 @@ RC foldl_test( const char * test_label, const char * test_description, const grb
 	return value == expected ? RC::SUCCESS : RC::FAILED;
 }
 
-template< typename T, typename V, class Operator >
-RC foldr_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Operator & op ) {
+template< typename T, typename V, class Monoid >
+RC foldr_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Monoid & monoid ) {
+	if( SKIP_FOLDR )
+		return RC::SUCCESS;
+
 	T value = initial;
-	foldr( value, A, op );
+	auto start_chrono = std::chrono::high_resolution_clock::now();
+	foldr( value, A, monoid );
+	auto end_chrono = std::chrono::high_resolution_clock::now();
+	auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
+	if( PRINT_TIMERS )
+		std::cout << "foldr_test \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
 
 	std::cout << "foldr_test \"" << test_label << "\": ";
 	if( value == expected )
@@ -76,10 +97,10 @@ RC foldr_test( const char * test_label, const char * test_description, const grb
 	return value == expected ? RC::SUCCESS : RC::FAILED;
 }
 
-template< typename T, typename V, class Operator >
-RC foldLR_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Operator & op ) {
-	RC rc = foldl_test( test_label, test_description, A, initial, expected, op );
-	return rc ? rc : foldr_test( test_label, test_description, A, initial, expected, op );
+template< typename T, typename V, class Monoid >
+RC foldLR_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Monoid & monoid ) {
+	RC rc = foldl_test( test_label, test_description, A, initial, expected, monoid );
+	return rc ? rc : foldr_test( test_label, test_description, A, initial, expected, monoid );
 }
 
 void grb_program( const long & n, grb::RC & rc ) {
@@ -96,7 +117,7 @@ void grb_program( const long & n, grb::RC & rc ) {
 	 *  * Initial value is 0
 	 *  * Expected result: n
 	 */
-	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)n, operators::add< nz_t >() );
+	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)n, Monoid< operators::add< nz_t >, identities::zero >() );
 	if( rc )
 		return;
 
@@ -105,7 +126,7 @@ void grb_program( const long & n, grb::RC & rc ) {
 	 *  * Initial value is n
 	 *  * Expected result: 2*n
 	 */
-	rc = foldLR_test( "2", "A simple reduction(+) with the same types for the nnzs and the reduction result.", I, (nz_t)n, (nz_t)( 2 * n ), operators::add< nz_t >() );
+	rc = foldLR_test( "2", "A simple reduction(+) with the same types for the nnzs and the reduction result.", I, (nz_t)n, (nz_t)( 2 * n ), Monoid< operators::add< nz_t >, identities::zero >() );
 	if( rc )
 		return;
 
@@ -114,12 +135,12 @@ void grb_program( const long & n, grb::RC & rc ) {
 	 *  * Initial value is 0
 	 *  * Expected result: n
 	 */
-	rc = foldl_test(
-		"3", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n, operators::add< size_t, nz_t, size_t >() );
+	rc = foldl_test( "3", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n,
+		Monoid< operators::add< size_t, nz_t, size_t >, identities::zero >() );
 	if( rc )
 		return;
-	rc = foldr_test(
-		"3", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n, operators::add< nz_t, size_t, size_t >() );
+	rc = foldr_test( "3", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n,
+		Monoid< operators::add< nz_t, size_t, size_t >, identities::zero >() );
 	if( rc )
 		return;
 
@@ -128,12 +149,12 @@ void grb_program( const long & n, grb::RC & rc ) {
 	 *  * Initial value is n
 	 *  * Expected result: 2*n
 	 */
-	rc = foldl_test(
-		"4", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ), operators::add< size_t, nz_t, size_t >() );
+	rc = foldl_test( "4", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ),
+		Monoid< operators::add< size_t, nz_t, size_t >, identities::zero >() );
 	if( rc )
 		return;
-	rc = foldr_test(
-		"4", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ), operators::add< nz_t, size_t, size_t >() );
+	rc = foldr_test( "4", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ),
+		Monoid< operators::add< nz_t, size_t, size_t >, identities::zero >() );
 	if( rc )
 		return;
 
@@ -142,7 +163,7 @@ void grb_program( const long & n, grb::RC & rc ) {
 	 * * Initial value is 0
 	 * * Expected result: 0
 	 */
-	rc = foldLR_test( "5", "A simple reduction(*) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)0, operators::mul< nz_t >() );
+	rc = foldLR_test( "5", "A simple reduction(*) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)0, Monoid< operators::mul< nz_t >, identities::one >() );
 	if( rc )
 		return;
 
@@ -151,7 +172,7 @@ void grb_program( const long & n, grb::RC & rc ) {
 	 * * Initial value is 1
 	 * * Expected result: 1
 	 */
-	rc = foldLR_test( "6", "A simple reduction(*) with the same types for the nnzs and the reduction result.", I, (nz_t)1, (nz_t)1, operators::mul< nz_t >() );
+	rc = foldLR_test( "6", "A simple reduction(*) with the same types for the nnzs and the reduction result.", I, (nz_t)1, (nz_t)1, Monoid< operators::mul< nz_t >, identities::one >() );
 	if( rc )
 		return;
 
@@ -160,12 +181,12 @@ void grb_program( const long & n, grb::RC & rc ) {
 	 * * Initial value is 0
 	 * * Expected result: 0
 	 */
-	rc = foldl_test(
-		"7", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0, operators::mul< size_t, nz_t, size_t >() );
+	rc = foldl_test( "7", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0,
+		Monoid< operators::mul< size_t, nz_t, size_t >, identities::one >() );
 	if( rc )
 		return;
-	rc = foldr_test(
-		"7", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0, operators::mul< nz_t, size_t, size_t >() );
+	rc = foldr_test( "7", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0,
+		Monoid< operators::mul< nz_t, size_t, size_t >, identities::one >() );
 	if( rc )
 		return;
 
@@ -174,12 +195,12 @@ void grb_program( const long & n, grb::RC & rc ) {
 	 * * Initial value is 1
 	 * * Expected result: 1
 	 */
-	rc = foldl_test(
-		"8", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1, operators::mul< size_t, nz_t, size_t >() );
+	rc = foldl_test( "8", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1,
+		Monoid< operators::mul< size_t, nz_t, size_t >, identities::one >() );
 	if( rc )
 		return;
-	rc = foldr_test(
-		"8", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1, operators::mul< nz_t, size_t, size_t >() );
+	rc = foldr_test( "8", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1,
+		Monoid< operators::mul< nz_t, size_t, size_t >, identities::one >() );
 	if( rc )
 		return;
 
@@ -188,12 +209,12 @@ void grb_program( const long & n, grb::RC & rc ) {
 	 * * Initial value is true
 	 * * Expected result: true
 	 */
-	rc = foldl_test(
-		"9", "A simple reduction(==) with different types for the nnzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true, operators::equal< bool, nz_t, bool >() );
+	rc = foldl_test( "9", "A simple reduction(==) with different types for the nnzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true,
+		Monoid< operators::equal< bool, nz_t, bool >, identities::logical_true >() );
 	if( rc )
 		return;
-	rc = foldr_test(
-		"9", "A simple reduction(==) with different types for the nnzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true, operators::equal< nz_t, bool, bool >() );
+	rc = foldr_test( "9", "A simple reduction(==) with different types for the nnzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true,
+		Monoid< operators::equal< nz_t, bool, bool >, identities::logical_true >() );
 	if( rc )
 		return;
 
@@ -202,36 +223,14 @@ void grb_program( const long & n, grb::RC & rc ) {
 	 * * Initial value is false
 	 * * Expected result: true
 	 */
-	rc = foldl_test(
-		"10", "A simple reduction(||) with different types for the nnzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true, operators::logical_or< bool, nz_t, bool >() );
-	if( rc )
-		return;
-	rc = foldr_test(
-		"10", "A simple reduction(||) with different types for the nnzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true, operators::logical_or< nz_t, bool, bool >() );
-	if( rc )
-		return;
-
-	/**     Test case 11:  Non-commutative reduction
-	 * A simple substraction reduction with the same types for the nnzs and the reduction result.
-	 * * Initial value is for foldl is 0
-	 * * Expected result for foldl: -n
-	 * 
-	 * * Initial value is for foldr is 0
-	 * * Expected result for foldr: 0
-	 * 
-	 * * Initial value is for foldr is 1
-	 * * Expected result for foldr: 1
-	 */
-	rc = foldl_test( "11", "A non-commutative reduction(-) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)( -n ), operators::subtract< nz_t >() );
-	if( rc )
-		return;
-	rc = foldr_test( "11", "A non-commutative reduction(-) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)0, operators::subtract< nz_t >() );
+	rc = foldl_test( "10", "A simple reduction(||) with different types for the nnzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true,
+		Monoid< operators::logical_or< bool, nz_t, bool >, identities::logical_false >() );
 	if( rc )
 		return;
-	rc = foldr_test( "11", "A non-commutative reduction(-) with the same types for the nnzs and the reduction result.", I, (nz_t)1, (nz_t)1, operators::subtract< nz_t >() );
+	rc = foldr_test( "10", "A simple reduction(||) with different types for the nnzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true,
+		Monoid< operators::logical_or< nz_t, bool, bool >, identities::logical_false >() );
 	if( rc )
 		return;
-	
 }
 
 int main( int argc, char ** argv ) {

From d80ccf56b549fec3c5180206f483ca92f4e803f3 Mon Sep 17 00:00:00 2001
From: byjtew <benjamin.lozes@protonmail.com>
Date: Mon, 22 May 2023 13:32:29 +0200
Subject: [PATCH 33/63] Extend matrixeReduce unit-test

---
 tests/unit/matrixReduce.cpp | 189 +++++++++++++++++++++++++-----------
 1 file changed, 132 insertions(+), 57 deletions(-)

diff --git a/tests/unit/matrixReduce.cpp b/tests/unit/matrixReduce.cpp
index f05ccfd0e..e67d02d0e 100644
--- a/tests/unit/matrixReduce.cpp
+++ b/tests/unit/matrixReduce.cpp
@@ -43,8 +43,6 @@ constexpr bool PRINT_TIMERS = false;
 constexpr bool SKIP_FOLDL = false;
 constexpr bool SKIP_FOLDR = false;
 
-using nz_t = float;
-
 template< typename T, typename V, class Monoid >
 RC foldl_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Monoid & monoid ) {
 	if( SKIP_FOLDL )
@@ -103,132 +101,126 @@ RC foldLR_test( const char * test_label, const char * test_description, const gr
 	return rc ? rc : foldr_test( test_label, test_description, A, initial, expected, monoid );
 }
 
-void grb_program( const long & n, grb::RC & rc ) {
-	// Build an identity matrix
-	Matrix< nz_t > I( n, n );
-	std::vector< size_t > I_rows( n ), I_cols( n );
-	std::vector< nz_t > I_vals( n, 1 );
-	std::iota( I_rows.begin(), I_rows.end(), 0 );
-	std::iota( I_cols.begin(), I_cols.end(), 0 );
-	buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), n, PARALLEL );
+void grb_program( const grb::Matrix< float > & I, grb::RC & rc ) {
+	const long n = grb::nnz( I );
 
 	/**    Test case 1:
-	 *  A simple additive reduction with the same types for the nnzs and the reduction result.
+	 *  A simple additive reduction with the same types for the nzs and the reduction result.
 	 *  * Initial value is 0
 	 *  * Expected result: n
 	 */
-	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)n, Monoid< operators::add< nz_t >, identities::zero >() );
+	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, (float)0, (float)n, Monoid< operators::add< float >, identities::zero >() );
 	if( rc )
 		return;
 
 	/**     Test case 2:
-	 *  A simple additive reduction with the same types for the nnzs and the reduction result.
+	 *  A simple additive reduction with the same types for the nzs and the reduction result.
 	 *  * Initial value is n
 	 *  * Expected result: 2*n
 	 */
-	rc = foldLR_test( "2", "A simple reduction(+) with the same types for the nnzs and the reduction result.", I, (nz_t)n, (nz_t)( 2 * n ), Monoid< operators::add< nz_t >, identities::zero >() );
+	rc = foldLR_test( "2", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, (float)n, (float)( 2 * n ), Monoid< operators::add< float >, identities::zero >() );
 	if( rc )
 		return;
 
 	/**     Test case 3:
-	 *  A simple additive reduction with different types for the nnzs and the reduction result (size_t <- size_t + float).
+	 *  A simple additive reduction with different types for the nzs and the reduction result (size_t <- size_t + float).
 	 *  * Initial value is 0
 	 *  * Expected result: n
 	 */
-	rc = foldl_test( "3", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n,
-		Monoid< operators::add< size_t, nz_t, size_t >, identities::zero >() );
+	rc = foldl_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n,
+		Monoid< operators::add< size_t, float, size_t >, identities::zero >() );
 	if( rc )
 		return;
-	rc = foldr_test( "3", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n,
-		Monoid< operators::add< nz_t, size_t, size_t >, identities::zero >() );
+	rc = foldr_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n,
+		Monoid< operators::add< float, size_t, size_t >, identities::zero >() );
 	if( rc )
 		return;
 
 	/**     Test case 4:
-	 *  A simple additive reduction with different types for the nnzs and the reduction result (size_t <- size_t + float).
+	 *  A simple additive reduction with different types for the nzs and the reduction result (size_t <- size_t + float).
 	 *  * Initial value is n
 	 *  * Expected result: 2*n
 	 */
-	rc = foldl_test( "4", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ),
-		Monoid< operators::add< size_t, nz_t, size_t >, identities::zero >() );
+	rc = foldl_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ),
+		Monoid< operators::add< size_t, float, size_t >, identities::zero >() );
 	if( rc )
 		return;
-	rc = foldr_test( "4", "A simple reduction(+) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ),
-		Monoid< operators::add< nz_t, size_t, size_t >, identities::zero >() );
+	rc = foldr_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ),
+		Monoid< operators::add< float, size_t, size_t >, identities::zero >() );
 	if( rc )
 		return;
 
 	/**     Test case 5:
-	 * A simple multiplicative reduction with the same types for the nnzs and the reduction result.
+	 * A simple multiplicative reduction with the same types for the nzs and the reduction result.
 	 * * Initial value is 0
 	 * * Expected result: 0
 	 */
-	rc = foldLR_test( "5", "A simple reduction(*) with the same types for the nnzs and the reduction result.", I, (nz_t)0, (nz_t)0, Monoid< operators::mul< nz_t >, identities::one >() );
+	rc = foldLR_test( "5", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, (float)0, (float)0, Monoid< operators::mul< float >, identities::one >() );
 	if( rc )
 		return;
 
 	/**     Test case 6:
-	 * A simple multiplicative reduction with the same types for the nnzs and the reduction result.
+	 * A simple multiplicative reduction with the same types for the nzs and the reduction result.
 	 * * Initial value is 1
 	 * * Expected result: 1
 	 */
-	rc = foldLR_test( "6", "A simple reduction(*) with the same types for the nnzs and the reduction result.", I, (nz_t)1, (nz_t)1, Monoid< operators::mul< nz_t >, identities::one >() );
+	rc = foldLR_test( "6", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, (float)1, (float)1, Monoid< operators::mul< float >, identities::one >() );
 	if( rc )
 		return;
 
 	/**     Test case 7:
-	 * A simple multiplicative reduction with different types for the nnzs and the reduction result (size_t <- size_t * float).
+	 * A simple multiplicative reduction with different types for the nzs and the reduction result (size_t <- size_t * float).
 	 * * Initial value is 0
 	 * * Expected result: 0
 	 */
-	rc = foldl_test( "7", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0,
-		Monoid< operators::mul< size_t, nz_t, size_t >, identities::one >() );
+	rc = foldl_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0,
+		Monoid< operators::mul< size_t, float, size_t >, identities::one >() );
 	if( rc )
 		return;
-	rc = foldr_test( "7", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0,
-		Monoid< operators::mul< nz_t, size_t, size_t >, identities::one >() );
+	rc = foldr_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0,
+		Monoid< operators::mul< float, size_t, size_t >, identities::one >() );
 	if( rc )
 		return;
 
 	/**     Test case 8:
-	 * A simple multiplicative reduction with different types for the nnzs and the reduction result (size_t <- size_t * float).
+	 * A simple multiplicative reduction with different types for the nzs and the reduction result (size_t <- size_t * float).
 	 * * Initial value is 1
 	 * * Expected result: 1
 	 */
-	rc = foldl_test( "8", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1,
-		Monoid< operators::mul< size_t, nz_t, size_t >, identities::one >() );
+	rc = foldl_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1,
+		Monoid< operators::mul< size_t, float, size_t >, identities::one >() );
 	if( rc )
 		return;
-	rc = foldr_test( "8", "A simple reduction(*) with different types for the nnzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1,
-		Monoid< operators::mul< nz_t, size_t, size_t >, identities::one >() );
+	rc = foldr_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1,
+		Monoid< operators::mul< float, size_t, size_t >, identities::one >() );
 	if( rc )
 		return;
 
 	/**     Test case 9:
-	 * A simple binary equal reduction with different types for the nnzs and the reduction result (bool <- bool == float).
+	 * A simple binary equal reduction with different types for the nzs and the reduction result (bool <- bool == float).
 	 * * Initial value is true
 	 * * Expected result: true
 	 */
-	rc = foldl_test( "9", "A simple reduction(==) with different types for the nnzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true,
-		Monoid< operators::equal< bool, nz_t, bool >, identities::logical_true >() );
+	rc = foldl_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true,
+		Monoid< operators::equal< bool, float, bool >, identities::logical_true >() );
 	if( rc )
 		return;
-	rc = foldr_test( "9", "A simple reduction(==) with different types for the nnzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true,
-		Monoid< operators::equal< nz_t, bool, bool >, identities::logical_true >() );
+	rc = foldr_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true,
+		Monoid< operators::equal< float, bool, bool >, identities::logical_true >() );
 	if( rc )
 		return;
 
 	/**     Test case 10:
-	 * A simple binary logical_or reduction with different types for the nnzs and the reduction result (bool <- bool || float).
+	 * A simple binary logical_or reduction with different types for the nzs and the reduction result (bool <- bool || float).
 	 * * Initial value is false
 	 * * Expected result: true
 	 */
-	rc = foldl_test( "10", "A simple reduction(||) with different types for the nnzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true,
-		Monoid< operators::logical_or< bool, nz_t, bool >, identities::logical_false >() );
+	rc = foldl_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true,
+		Monoid< operators::logical_or< bool, float, bool >, identities::logical_false >() );
 	if( rc )
 		return;
-	rc = foldr_test( "10", "A simple reduction(||) with different types for the nnzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true,
-		Monoid< operators::logical_or< nz_t, bool, bool >, identities::logical_false >() );
+	rc = foldr_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true,
+		Monoid< operators::logical_or< float, bool, bool >, identities::logical_false >() );
 	if( rc )
 		return;
 }
@@ -236,14 +228,14 @@ void grb_program( const long & n, grb::RC & rc ) {
 int main( int argc, char ** argv ) {
 	// defaults
 	bool printUsage = false;
-	size_t in = 10;
+	size_t n = 10;
 
 	// error checking
 	if( argc > 2 ) {
 		printUsage = true;
 	}
 	if( argc == 2 ) {
-		in = std::atol( argv[ 1 ] );
+		n = std::atol( argv[ 1 ] );
 	}
 	if( printUsage ) {
 		std::cerr << "Usage: " << argv[ 0 ] << " [n]\n";
@@ -254,14 +246,97 @@ int main( int argc, char ** argv ) {
 
 	std::cout << "This is functional test " << argv[ 0 ] << "\n";
 	grb::Launcher< AUTOMATIC > launcher;
-	grb::RC out = RC::SUCCESS;
-	if( launcher.exec( &grb_program, (long)in, out, true ) != SUCCESS ) {
-		std::cerr << "Launching test FAILED\n";
-		return 255;
+	grb::RC rc = RC::SUCCESS;
+
+	if( ! rc ) { // Build an identity square-matrix
+		Matrix< float > I( n, n );
+		std::vector< size_t > I_rows( n ), I_cols( n );
+		std::vector< float > I_vals( n, 1.f );
+		std::iota( I_rows.begin(), I_rows.end(), 0 );
+		std::iota( I_cols.begin(), I_cols.end(), 0 );
+		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		std::cout << "-- Running test 01: Identity square matrix of size n = " << n << std::endl;
+		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+			std::cerr << "Launching test 01 FAILED\n";
+			return 255;
+		}
+		std::cout << std::endl;
+	}
+
+	if( ! rc ) { // Build a square-matrix with n 1s on the first row
+		Matrix< float > I( n, n );
+		std::vector< size_t > I_rows( n, 0 ), I_cols( n );
+		std::vector< float > I_vals( n, 1.f );
+		std::iota( I_cols.begin(), I_cols.end(), 0 );
+		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		std::cout << "-- Running test 02: Square matrix of size n = " << n << ", with n 1s on the first row" << std::endl;
+		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+			std::cerr << "Launching test 02 FAILED\n";
+			return 255;
+		}
+		std::cout << std::endl;
 	}
-	if( out != SUCCESS ) {
-		std::cout << "Test FAILED (" << grb::toString( out ) << ")" << std::endl;
-		return out;
+
+	if( ! rc ) { // Build a square-matrix with n 1s on the first column
+		Matrix< float > I( n, n );
+		std::vector< size_t > I_rows( n ), I_cols( n, 0 );
+		std::vector< float > I_vals( n, 1.f );
+		std::iota( I_rows.begin(), I_rows.end(), 0 );
+		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		std::cout << "-- Running test 03: Square matrix of size n = " << n << ", with n 1s on the first column" << std::endl;
+		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+			std::cerr << "Launching test 03 FAILED\n";
+			return 255;
+		}
+		std::cout << std::endl;
+	}
+
+	if( ! rc ) { // Building a square-matrix with n 1s on the first row and column
+		Matrix< float > I( n, n );
+		std::vector< size_t > I_rows( 2 * n - 1, 0 ), I_cols( 2 * n - 1, 0 );
+		std::vector< float > I_vals( 2 * n - 1, 1.f );
+		std::iota( I_rows.begin() + n, I_rows.end(), 1 );
+		std::iota( I_cols.begin() + n, I_cols.end(), 1 );
+		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		std::cout << "-- Running test 04: Square matrix of size n = " << n << ", with n 1s on the first row and column" << std::endl;
+		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+			std::cerr << "Launching test 04 FAILED\n";
+			return 255;
+		}
+		std::cout << std::endl;
+	}
+
+	if( ! rc ) { // Building a [1 row, n columns] matrix filled with 1s
+		Matrix< float > I( 1, n );
+		std::vector< size_t > I_rows( n, 0 ), I_cols( n, 0 );
+		std::vector< float > I_vals( n, 1.f );
+		std::iota( I_cols.begin(), I_cols.end(), 0 );
+		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		std::cout << "-- Running test 05: [1-row, n = " << n << " columns] matrix, filled with 1s" << std::endl;
+		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+			std::cerr << "Launching test 04 FAILED\n";
+			return 255;
+		}
+		std::cout << std::endl;
+	}
+
+	if( ! rc ) { // Building a [n rows, 1 column] matrix filled with 1s
+		Matrix< float > I( n, 1 );
+		std::vector< size_t > I_rows( n, 0 ), I_cols( n, 0 );
+		std::vector< float > I_vals( n, 1.f );
+		std::iota( I_rows.begin(), I_rows.end(), 0 );
+		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		std::cout << "-- Running test 06: [n = " << n << " rows, 1 column] matrix, filled with 1s" << std::endl;
+		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+			std::cerr << "Launching test 06 FAILED\n";
+			return 255;
+		}
+		std::cout << std::endl;
+	}
+
+	if( rc != SUCCESS ) {
+		std::cout << "Test FAILED (" << grb::toString( rc ) << ")" << std::endl;
+		return rc;
 	} else {
 		std::cout << "Test OK" << std::endl;
 		return 0;

From 96ea3fa52eb88546dc72e8bb5acd0572c3142362 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Sat, 10 Jun 2023 22:09:59 +0200
Subject: [PATCH 34/63] Matrix-to-scalar foldl+r masked version

---
 tests/unit/matrixReduce.cpp | 231 ++++++++++++++++++++++++------------
 1 file changed, 155 insertions(+), 76 deletions(-)

diff --git a/tests/unit/matrixReduce.cpp b/tests/unit/matrixReduce.cpp
index e67d02d0e..d27a1fd85 100644
--- a/tests/unit/matrixReduce.cpp
+++ b/tests/unit/matrixReduce.cpp
@@ -42,111 +42,178 @@ using namespace grb;
 constexpr bool PRINT_TIMERS = false;
 constexpr bool SKIP_FOLDL = false;
 constexpr bool SKIP_FOLDR = false;
+constexpr bool SKIP_UNMASKED = false;
+constexpr bool SKIP_MASKED = false;
 
 template< typename T, typename V, class Monoid >
-RC foldl_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Monoid & monoid ) {
+RC foldl_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, const grb::Matrix< void > & mask, T initial, T expected, const Monoid & monoid ) {
 	if( SKIP_FOLDL )
 		return RC::SUCCESS;
+	RC rc = RC::SUCCESS;
+
+	if( rc == RC::SUCCESS && ! SKIP_UNMASKED ) { // Unmasked
+		T value = initial;
+		auto start_chrono = std::chrono::high_resolution_clock::now();
+		foldl( value, A, monoid );
+		auto end_chrono = std::chrono::high_resolution_clock::now();
+		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
+		if( PRINT_TIMERS )
+			std::cout << "foldl (unmasked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
+
+		std::cout << "foldl (unmasked) \"" << test_label << "\": ";
+		if( value == expected )
+			std::cout << "OK" << std::endl;
+		else
+			std::cerr << "Failed" << std::endl
+					  << test_description << std::endl
+					  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
+					  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
+					  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
+
+		rc = rc ? rc : ( value == expected ? RC::SUCCESS : RC::FAILED );
+	}
+
+	if( rc == RC::SUCCESS && ! SKIP_MASKED ) { // Masked
+		T value = initial;
+		auto start_chrono = std::chrono::high_resolution_clock::now();
+		foldl( value, A, mask, monoid );
+		auto end_chrono = std::chrono::high_resolution_clock::now();
+		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
+		if( PRINT_TIMERS )
+			std::cout << "foldl (masked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
+
+		std::cout << "foldl (masked) \"" << test_label << "\": ";
+		if( value == expected )
+			std::cout << "OK" << std::endl;
+		else
+			std::cerr << "Failed" << std::endl
+					  << test_description << std::endl
+					  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
+					  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
+					  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
+
+		rc = rc ? rc : ( value == expected ? RC::SUCCESS : RC::FAILED );
+	}
 
-	T value = initial;
-	auto start_chrono = std::chrono::high_resolution_clock::now();
-	foldl( value, A, monoid );
-	auto end_chrono = std::chrono::high_resolution_clock::now();
-	auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
-	if( PRINT_TIMERS )
-		std::cout << "foldl_test \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
-
-	std::cout << "foldl_test \"" << test_label << "\": ";
-	if( value == expected )
-		std::cout << "OK" << std::endl;
-	else
-		std::cerr << "Failed" << std::endl
-				  << test_description << std::endl
-				  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
-				  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
-				  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
-
-	return value == expected ? RC::SUCCESS : RC::FAILED;
+	return rc;
 }
 
 template< typename T, typename V, class Monoid >
-RC foldr_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Monoid & monoid ) {
+RC foldr_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, const grb::Matrix< void > & mask, T initial, T expected, const Monoid & monoid ) {
 	if( SKIP_FOLDR )
 		return RC::SUCCESS;
+	RC rc = RC::SUCCESS;
+
+	if( rc == RC::SUCCESS ) { // Unmasked
+		T value = initial;
+		auto start_chrono = std::chrono::high_resolution_clock::now();
+		foldr( value, A, monoid );
+		auto end_chrono = std::chrono::high_resolution_clock::now();
+		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
+		if( PRINT_TIMERS )
+			std::cout << "foldr (unmasked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
+
+		std::cout << "foldr (unmasked) \"" << test_label << "\": ";
+		if( value == expected )
+			std::cout << "OK" << std::endl;
+		else
+			std::cerr << "Failed" << std::endl
+					  << test_description << std::endl
+					  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
+					  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
+					  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
+
+		rc = rc ? rc : ( value == expected ? RC::SUCCESS : RC::FAILED );
+	}
 
-	T value = initial;
-	auto start_chrono = std::chrono::high_resolution_clock::now();
-	foldr( value, A, monoid );
-	auto end_chrono = std::chrono::high_resolution_clock::now();
-	auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
-	if( PRINT_TIMERS )
-		std::cout << "foldr_test \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
-
-	std::cout << "foldr_test \"" << test_label << "\": ";
-	if( value == expected )
-		std::cout << "OK" << std::endl;
-	else
-		std::cerr << "Failed" << std::endl
-				  << test_description << std::endl
-				  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
-				  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
-				  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
-
-	return value == expected ? RC::SUCCESS : RC::FAILED;
+	if( rc == RC::SUCCESS ) { // Masked
+		T value = initial;
+		auto start_chrono = std::chrono::high_resolution_clock::now();
+		foldr( value, A, mask, monoid );
+		auto end_chrono = std::chrono::high_resolution_clock::now();
+		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
+		if( PRINT_TIMERS )
+			std::cout << "foldr (masked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
+
+		std::cout << "foldr (masked) \"" << test_label << "\": ";
+		if( value == expected )
+			std::cout << "OK" << std::endl;
+		else
+			std::cerr << "Failed" << std::endl
+					  << test_description << std::endl
+					  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
+					  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
+					  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
+
+		rc = rc ? rc : ( value == expected ? RC::SUCCESS : RC::FAILED );
+	}
+
+	return rc;
 }
 
 template< typename T, typename V, class Monoid >
-RC foldLR_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, T initial, T expected, const Monoid & monoid ) {
-	RC rc = foldl_test( test_label, test_description, A, initial, expected, monoid );
-	return rc ? rc : foldr_test( test_label, test_description, A, initial, expected, monoid );
+RC foldLR_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, const grb::Matrix< void > & mask, T initial, T expected, const Monoid & monoid ) {
+	RC rc = foldl_test( test_label, test_description, A, mask, initial, expected, monoid );
+	return rc ? rc : foldr_test( test_label, test_description, A, mask, initial, expected, monoid );
 }
 
-void grb_program( const grb::Matrix< float > & I, grb::RC & rc ) {
+struct input {
+	const grb::Matrix< float > & A;
+	const grb::Matrix< void > & mask;
+};
+
+void grb_program( const input & in, grb::RC & rc ) {
+	const grb::Matrix< float > & I = in.A;
+	const grb::Matrix< void > & mask = in.mask;
+
 	const long n = grb::nnz( I );
 
 	/**    Test case 1:
 	 *  A simple additive reduction with the same types for the nzs and the reduction result.
 	 *  * Initial value is 0
-	 *  * Expected result: n
+	 *  * Expected unmasked result: n
+	 *  * Expected masked result: 0
 	 */
-	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, (float)0, (float)n, Monoid< operators::add< float >, identities::zero >() );
+	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (float)0, (float)n, Monoid< operators::add< float >, identities::zero >() );
 	if( rc )
 		return;
+	return;
 
 	/**     Test case 2:
 	 *  A simple additive reduction with the same types for the nzs and the reduction result.
 	 *  * Initial value is n
 	 *  * Expected result: 2*n
 	 */
-	rc = foldLR_test( "2", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, (float)n, (float)( 2 * n ), Monoid< operators::add< float >, identities::zero >() );
+	rc = foldLR_test(
+		"2", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (float)n, (float)( 2 * n ), Monoid< operators::add< float >, identities::zero >() );
 	if( rc )
 		return;
 
 	/**     Test case 3:
-	 *  A simple additive reduction with different types for the nzs and the reduction result (size_t <- size_t + float).
+	 *  A simple additive reduction with different types for the nzs and the reduction result (int <- int + float).
 	 *  * Initial value is 0
 	 *  * Expected result: n
 	 */
-	rc = foldl_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n,
-		Monoid< operators::add< size_t, float, size_t >, identities::zero >() );
+	rc = foldl_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int + float).", I, mask, (int)0, (int)n,
+		Monoid< operators::add< int, float, int >, identities::zero >() );
 	if( rc )
 		return;
-	rc = foldr_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)n,
-		Monoid< operators::add< float, size_t, size_t >, identities::zero >() );
+	rc = foldr_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- float + int).", I, mask, (int)0, (int)n,
+		Monoid< operators::add< float, int, int >, identities::zero >() );
 	if( rc )
 		return;
 
 	/**     Test case 4:
-	 *  A simple additive reduction with different types for the nzs and the reduction result (size_t <- size_t + float).
+	 *  A simple additive reduction with different types for the nzs and the reduction result (int <- int + float).
 	 *  * Initial value is n
 	 *  * Expected result: 2*n
 	 */
-	rc = foldl_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ),
-		Monoid< operators::add< size_t, float, size_t >, identities::zero >() );
+	rc = foldl_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int + float).", I, mask, (int)n, (int)( 2 * n ),
+		Monoid< operators::add< int, float, int >, identities::zero >() );
 	if( rc )
 		return;
-	rc = foldr_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)n, (size_t)( 2 * n ),
-		Monoid< operators::add< float, size_t, size_t >, identities::zero >() );
+	rc = foldr_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- float + int).", I, mask, (int)n, (int)( 2 * n ),
+		Monoid< operators::add< float, int, int >, identities::zero >() );
 	if( rc )
 		return;
 
@@ -155,7 +222,7 @@ void grb_program( const grb::Matrix< float > & I, grb::RC & rc ) {
 	 * * Initial value is 0
 	 * * Expected result: 0
 	 */
-	rc = foldLR_test( "5", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, (float)0, (float)0, Monoid< operators::mul< float >, identities::one >() );
+	rc = foldLR_test( "5", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, mask, (float)0, (float)0, Monoid< operators::mul< float >, identities::one >() );
 	if( rc )
 		return;
 
@@ -164,7 +231,7 @@ void grb_program( const grb::Matrix< float > & I, grb::RC & rc ) {
 	 * * Initial value is 1
 	 * * Expected result: 1
 	 */
-	rc = foldLR_test( "6", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, (float)1, (float)1, Monoid< operators::mul< float >, identities::one >() );
+	rc = foldLR_test( "6", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, mask, (float)1, (float)1, Monoid< operators::mul< float >, identities::one >() );
 	if( rc )
 		return;
 
@@ -173,11 +240,11 @@ void grb_program( const grb::Matrix< float > & I, grb::RC & rc ) {
 	 * * Initial value is 0
 	 * * Expected result: 0
 	 */
-	rc = foldl_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0,
+	rc = foldl_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, mask, (size_t)0, (size_t)0,
 		Monoid< operators::mul< size_t, float, size_t >, identities::one >() );
 	if( rc )
 		return;
-	rc = foldr_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)0, (size_t)0,
+	rc = foldr_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, mask, (size_t)0, (size_t)0,
 		Monoid< operators::mul< float, size_t, size_t >, identities::one >() );
 	if( rc )
 		return;
@@ -187,11 +254,11 @@ void grb_program( const grb::Matrix< float > & I, grb::RC & rc ) {
 	 * * Initial value is 1
 	 * * Expected result: 1
 	 */
-	rc = foldl_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1,
+	rc = foldl_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, mask, (size_t)1, (size_t)1,
 		Monoid< operators::mul< size_t, float, size_t >, identities::one >() );
 	if( rc )
 		return;
-	rc = foldr_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, (size_t)1, (size_t)1,
+	rc = foldr_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, mask, (size_t)1, (size_t)1,
 		Monoid< operators::mul< float, size_t, size_t >, identities::one >() );
 	if( rc )
 		return;
@@ -201,11 +268,11 @@ void grb_program( const grb::Matrix< float > & I, grb::RC & rc ) {
 	 * * Initial value is true
 	 * * Expected result: true
 	 */
-	rc = foldl_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true,
+	rc = foldl_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == float).", I, mask, (bool)true, (bool)true,
 		Monoid< operators::equal< bool, float, bool >, identities::logical_true >() );
 	if( rc )
 		return;
-	rc = foldr_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == float).", I, (bool)true, (bool)true,
+	rc = foldr_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == float).", I, mask, (bool)true, (bool)true,
 		Monoid< operators::equal< float, bool, bool >, identities::logical_true >() );
 	if( rc )
 		return;
@@ -215,11 +282,11 @@ void grb_program( const grb::Matrix< float > & I, grb::RC & rc ) {
 	 * * Initial value is false
 	 * * Expected result: true
 	 */
-	rc = foldl_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true,
+	rc = foldl_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || float).", I, mask, (bool)false, (bool)true,
 		Monoid< operators::logical_or< bool, float, bool >, identities::logical_false >() );
 	if( rc )
 		return;
-	rc = foldr_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || float).", I, (bool)false, (bool)true,
+	rc = foldr_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || float).", I, mask, (bool)false, (bool)true,
 		Monoid< operators::logical_or< float, bool, bool >, identities::logical_false >() );
 	if( rc )
 		return;
@@ -248,15 +315,17 @@ int main( int argc, char ** argv ) {
 	grb::Launcher< AUTOMATIC > launcher;
 	grb::RC rc = RC::SUCCESS;
 
-	if( ! rc ) { // Build an identity square-matrix
+	if( ! rc ) { // Identity square-matrix
 		Matrix< float > I( n, n );
 		std::vector< size_t > I_rows( n ), I_cols( n );
 		std::vector< float > I_vals( n, 1.f );
 		std::iota( I_rows.begin(), I_rows.end(), 0 );
 		std::iota( I_cols.begin(), I_cols.end(), 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		Matrix< void > mask( n, n );
+		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
 		std::cout << "-- Running test 01: Identity square matrix of size n = " << n << std::endl;
-		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 01 FAILED\n";
 			return 255;
 		}
@@ -269,22 +338,26 @@ int main( int argc, char ** argv ) {
 		std::vector< float > I_vals( n, 1.f );
 		std::iota( I_cols.begin(), I_cols.end(), 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		Matrix< void > mask( n, n );
+		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
 		std::cout << "-- Running test 02: Square matrix of size n = " << n << ", with n 1s on the first row" << std::endl;
-		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 02 FAILED\n";
 			return 255;
 		}
 		std::cout << std::endl;
 	}
 
-	if( ! rc ) { // Build a square-matrix with n 1s on the first column
+	if( ! rc ) { // Square-matrix with n 1s on the first column
 		Matrix< float > I( n, n );
 		std::vector< size_t > I_rows( n ), I_cols( n, 0 );
 		std::vector< float > I_vals( n, 1.f );
 		std::iota( I_rows.begin(), I_rows.end(), 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		Matrix< void > mask( n, n );
+		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
 		std::cout << "-- Running test 03: Square matrix of size n = " << n << ", with n 1s on the first column" << std::endl;
-		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 03 FAILED\n";
 			return 255;
 		}
@@ -296,10 +369,12 @@ int main( int argc, char ** argv ) {
 		std::vector< size_t > I_rows( 2 * n - 1, 0 ), I_cols( 2 * n - 1, 0 );
 		std::vector< float > I_vals( 2 * n - 1, 1.f );
 		std::iota( I_rows.begin() + n, I_rows.end(), 1 );
-		std::iota( I_cols.begin() + n, I_cols.end(), 1 );
+		std::iota( I_cols.begin(), I_cols.begin() + n, 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		Matrix< void > mask( n, n );
+		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
 		std::cout << "-- Running test 04: Square matrix of size n = " << n << ", with n 1s on the first row and column" << std::endl;
-		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 04 FAILED\n";
 			return 255;
 		}
@@ -312,8 +387,10 @@ int main( int argc, char ** argv ) {
 		std::vector< float > I_vals( n, 1.f );
 		std::iota( I_cols.begin(), I_cols.end(), 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		Matrix< void > mask( 1, n );
+		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
 		std::cout << "-- Running test 05: [1-row, n = " << n << " columns] matrix, filled with 1s" << std::endl;
-		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 04 FAILED\n";
 			return 255;
 		}
@@ -326,8 +403,10 @@ int main( int argc, char ** argv ) {
 		std::vector< float > I_vals( n, 1.f );
 		std::iota( I_rows.begin(), I_rows.end(), 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
+		Matrix< void > mask( n, 1 );
+		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
 		std::cout << "-- Running test 06: [n = " << n << " rows, 1 column] matrix, filled with 1s" << std::endl;
-		if( launcher.exec( &grb_program, I, rc, true ) != SUCCESS ) {
+		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
 			std::cerr << "Launching test 06 FAILED\n";
 			return 255;
 		}

From 1d3064646bcde2fc5d3df06953ea8dd9cc29f71b Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Thu, 15 Jun 2023 09:44:52 +0200
Subject: [PATCH 35/63] Add benchmarking quick to matrixReduce test

---
 tests/unit/matrixReduce.cpp | 146 ++++++++++++++++++++----------------
 1 file changed, 81 insertions(+), 65 deletions(-)

diff --git a/tests/unit/matrixReduce.cpp b/tests/unit/matrixReduce.cpp
index d27a1fd85..08dbe3bed 100644
--- a/tests/unit/matrixReduce.cpp
+++ b/tests/unit/matrixReduce.cpp
@@ -39,11 +39,15 @@
 
 using namespace grb;
 
-constexpr bool PRINT_TIMERS = false;
+using NzType = double;
+
+
+constexpr bool PRINT_TIMERS = true;
 constexpr bool SKIP_FOLDL = false;
 constexpr bool SKIP_FOLDR = false;
 constexpr bool SKIP_UNMASKED = false;
 constexpr bool SKIP_MASKED = false;
+constexpr size_t ITERATIONS = 100;
 
 template< typename T, typename V, class Monoid >
 RC foldl_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, const grb::Matrix< void > & mask, T initial, T expected, const Monoid & monoid ) {
@@ -54,9 +58,12 @@ RC foldl_test( const char * test_label, const char * test_description, const grb
 	if( rc == RC::SUCCESS && ! SKIP_UNMASKED ) { // Unmasked
 		T value = initial;
 		auto start_chrono = std::chrono::high_resolution_clock::now();
-		foldl( value, A, monoid );
+		for( size_t _ = 0; _ < ITERATIONS; _++ ) {
+			value = initial;
+			foldl( value, A, monoid );
+		}
 		auto end_chrono = std::chrono::high_resolution_clock::now();
-		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
+		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono ) / ITERATIONS;
 		if( PRINT_TIMERS )
 			std::cout << "foldl (unmasked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
 
@@ -76,9 +83,12 @@ RC foldl_test( const char * test_label, const char * test_description, const grb
 	if( rc == RC::SUCCESS && ! SKIP_MASKED ) { // Masked
 		T value = initial;
 		auto start_chrono = std::chrono::high_resolution_clock::now();
-		foldl( value, A, mask, monoid );
+		for( size_t _ = 0; _ < ITERATIONS; _++ ) {
+			value = initial;
+			foldl( value, A, mask, monoid );
+		}
 		auto end_chrono = std::chrono::high_resolution_clock::now();
-		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
+		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono ) / ITERATIONS;
 		if( PRINT_TIMERS )
 			std::cout << "foldl (masked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
 
@@ -104,12 +114,15 @@ RC foldr_test( const char * test_label, const char * test_description, const grb
 		return RC::SUCCESS;
 	RC rc = RC::SUCCESS;
 
-	if( rc == RC::SUCCESS ) { // Unmasked
+	if( rc == RC::SUCCESS && ! SKIP_UNMASKED ) { // Unmasked
 		T value = initial;
 		auto start_chrono = std::chrono::high_resolution_clock::now();
-		foldr( value, A, monoid );
+		for( size_t _ = 0; _ < ITERATIONS; _++ ) {
+			value = initial;
+			foldr( value, A, monoid );
+		}
 		auto end_chrono = std::chrono::high_resolution_clock::now();
-		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
+		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono ) / ITERATIONS;
 		if( PRINT_TIMERS )
 			std::cout << "foldr (unmasked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
 
@@ -126,12 +139,15 @@ RC foldr_test( const char * test_label, const char * test_description, const grb
 		rc = rc ? rc : ( value == expected ? RC::SUCCESS : RC::FAILED );
 	}
 
-	if( rc == RC::SUCCESS ) { // Masked
+	if( rc == RC::SUCCESS && ! SKIP_MASKED ) { // Masked
 		T value = initial;
 		auto start_chrono = std::chrono::high_resolution_clock::now();
-		foldr( value, A, mask, monoid );
+		for( size_t _ = 0; _ < ITERATIONS; _++ ) {
+			value = initial;
+			foldr( value, A, mask, monoid );
+		}
 		auto end_chrono = std::chrono::high_resolution_clock::now();
-		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono );
+		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono ) / ITERATIONS;
 		if( PRINT_TIMERS )
 			std::cout << "foldr (masked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
 
@@ -158,12 +174,12 @@ RC foldLR_test( const char * test_label, const char * test_description, const gr
 }
 
 struct input {
-	const grb::Matrix< float > & A;
+	const grb::Matrix< NzType > & A;
 	const grb::Matrix< void > & mask;
 };
 
 void grb_program( const input & in, grb::RC & rc ) {
-	const grb::Matrix< float > & I = in.A;
+	const grb::Matrix< NzType > & I = in.A;
 	const grb::Matrix< void > & mask = in.mask;
 
 	const long n = grb::nnz( I );
@@ -174,7 +190,7 @@ void grb_program( const input & in, grb::RC & rc ) {
 	 *  * Expected unmasked result: n
 	 *  * Expected masked result: 0
 	 */
-	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (float)0, (float)n, Monoid< operators::add< float >, identities::zero >() );
+	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (NzType)0, (NzType)n, Monoid< operators::add< NzType >, identities::zero >() );
 	if( rc )
 		return;
 	return;
@@ -185,35 +201,35 @@ void grb_program( const input & in, grb::RC & rc ) {
 	 *  * Expected result: 2*n
 	 */
 	rc = foldLR_test(
-		"2", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (float)n, (float)( 2 * n ), Monoid< operators::add< float >, identities::zero >() );
+		"2", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (NzType)n, (NzType)( 2 * n ), Monoid< operators::add< NzType >, identities::zero >() );
 	if( rc )
 		return;
 
 	/**     Test case 3:
-	 *  A simple additive reduction with different types for the nzs and the reduction result (int <- int + float).
+	 *  A simple additive reduction with different types for the nzs and the reduction result (int <- int + NzType).
 	 *  * Initial value is 0
 	 *  * Expected result: n
 	 */
-	rc = foldl_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int + float).", I, mask, (int)0, (int)n,
-		Monoid< operators::add< int, float, int >, identities::zero >() );
+	rc = foldl_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int + NzType).", I, mask, (int)0, (int)n,
+		Monoid< operators::add< int, NzType, int >, identities::zero >() );
 	if( rc )
 		return;
-	rc = foldr_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- float + int).", I, mask, (int)0, (int)n,
-		Monoid< operators::add< float, int, int >, identities::zero >() );
+	rc = foldr_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- NzType + int).", I, mask, (int)0, (int)n,
+		Monoid< operators::add< NzType, int, int >, identities::zero >() );
 	if( rc )
 		return;
 
 	/**     Test case 4:
-	 *  A simple additive reduction with different types for the nzs and the reduction result (int <- int + float).
+	 *  A simple additive reduction with different types for the nzs and the reduction result (int <- int + NzType).
 	 *  * Initial value is n
 	 *  * Expected result: 2*n
 	 */
-	rc = foldl_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int + float).", I, mask, (int)n, (int)( 2 * n ),
-		Monoid< operators::add< int, float, int >, identities::zero >() );
+	rc = foldl_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int + NzType).", I, mask, (int)n, (int)( 2 * n ),
+		Monoid< operators::add< int, NzType, int >, identities::zero >() );
 	if( rc )
 		return;
-	rc = foldr_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- float + int).", I, mask, (int)n, (int)( 2 * n ),
-		Monoid< operators::add< float, int, int >, identities::zero >() );
+	rc = foldr_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- NzType + int).", I, mask, (int)n, (int)( 2 * n ),
+		Monoid< operators::add< NzType, int, int >, identities::zero >() );
 	if( rc )
 		return;
 
@@ -222,7 +238,7 @@ void grb_program( const input & in, grb::RC & rc ) {
 	 * * Initial value is 0
 	 * * Expected result: 0
 	 */
-	rc = foldLR_test( "5", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, mask, (float)0, (float)0, Monoid< operators::mul< float >, identities::one >() );
+	rc = foldLR_test( "5", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, mask, (NzType)0, (NzType)0, Monoid< operators::mul< NzType >, identities::one >() );
 	if( rc )
 		return;
 
@@ -231,63 +247,63 @@ void grb_program( const input & in, grb::RC & rc ) {
 	 * * Initial value is 1
 	 * * Expected result: 1
 	 */
-	rc = foldLR_test( "6", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, mask, (float)1, (float)1, Monoid< operators::mul< float >, identities::one >() );
+	rc = foldLR_test( "6", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, mask, (NzType)1, (NzType)1, Monoid< operators::mul< NzType >, identities::one >() );
 	if( rc )
 		return;
 
 	/**     Test case 7:
-	 * A simple multiplicative reduction with different types for the nzs and the reduction result (size_t <- size_t * float).
+	 * A simple multiplicative reduction with different types for the nzs and the reduction result (size_t <- size_t * NzType).
 	 * * Initial value is 0
 	 * * Expected result: 0
 	 */
-	rc = foldl_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, mask, (size_t)0, (size_t)0,
-		Monoid< operators::mul< size_t, float, size_t >, identities::one >() );
+	rc = foldl_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)0, (size_t)0,
+		Monoid< operators::mul< size_t, NzType, size_t >, identities::one >() );
 	if( rc )
 		return;
-	rc = foldr_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, mask, (size_t)0, (size_t)0,
-		Monoid< operators::mul< float, size_t, size_t >, identities::one >() );
+	rc = foldr_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)0, (size_t)0,
+		Monoid< operators::mul< NzType, size_t, size_t >, identities::one >() );
 	if( rc )
 		return;
 
 	/**     Test case 8:
-	 * A simple multiplicative reduction with different types for the nzs and the reduction result (size_t <- size_t * float).
+	 * A simple multiplicative reduction with different types for the nzs and the reduction result (size_t <- size_t * NzType).
 	 * * Initial value is 1
 	 * * Expected result: 1
 	 */
-	rc = foldl_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, mask, (size_t)1, (size_t)1,
-		Monoid< operators::mul< size_t, float, size_t >, identities::one >() );
+	rc = foldl_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)1, (size_t)1,
+		Monoid< operators::mul< size_t, NzType, size_t >, identities::one >() );
 	if( rc )
 		return;
-	rc = foldr_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * float).", I, mask, (size_t)1, (size_t)1,
-		Monoid< operators::mul< float, size_t, size_t >, identities::one >() );
+	rc = foldr_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)1, (size_t)1,
+		Monoid< operators::mul< NzType, size_t, size_t >, identities::one >() );
 	if( rc )
 		return;
 
 	/**     Test case 9:
-	 * A simple binary equal reduction with different types for the nzs and the reduction result (bool <- bool == float).
+	 * A simple binary equal reduction with different types for the nzs and the reduction result (bool <- bool == NzType).
 	 * * Initial value is true
 	 * * Expected result: true
 	 */
-	rc = foldl_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == float).", I, mask, (bool)true, (bool)true,
-		Monoid< operators::equal< bool, float, bool >, identities::logical_true >() );
+	rc = foldl_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == NzType).", I, mask, (bool)true, (bool)true,
+		Monoid< operators::equal< bool, NzType, bool >, identities::logical_true >() );
 	if( rc )
 		return;
-	rc = foldr_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == float).", I, mask, (bool)true, (bool)true,
-		Monoid< operators::equal< float, bool, bool >, identities::logical_true >() );
+	rc = foldr_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == NzType).", I, mask, (bool)true, (bool)true,
+		Monoid< operators::equal< NzType, bool, bool >, identities::logical_true >() );
 	if( rc )
 		return;
 
 	/**     Test case 10:
-	 * A simple binary logical_or reduction with different types for the nzs and the reduction result (bool <- bool || float).
+	 * A simple binary logical_or reduction with different types for the nzs and the reduction result (bool <- bool || NzType).
 	 * * Initial value is false
 	 * * Expected result: true
 	 */
-	rc = foldl_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || float).", I, mask, (bool)false, (bool)true,
-		Monoid< operators::logical_or< bool, float, bool >, identities::logical_false >() );
+	rc = foldl_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || NzType).", I, mask, (bool)false, (bool)true,
+		Monoid< operators::logical_or< bool, NzType, bool >, identities::logical_false >() );
 	if( rc )
 		return;
-	rc = foldr_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || float).", I, mask, (bool)false, (bool)true,
-		Monoid< operators::logical_or< float, bool, bool >, identities::logical_false >() );
+	rc = foldr_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || NzType).", I, mask, (bool)false, (bool)true,
+		Monoid< operators::logical_or< NzType, bool, bool >, identities::logical_false >() );
 	if( rc )
 		return;
 }
@@ -316,9 +332,9 @@ int main( int argc, char ** argv ) {
 	grb::RC rc = RC::SUCCESS;
 
 	if( ! rc ) { // Identity square-matrix
-		Matrix< float > I( n, n );
+		Matrix< NzType > I( n, n );
 		std::vector< size_t > I_rows( n ), I_cols( n );
-		std::vector< float > I_vals( n, 1.f );
+		std::vector< NzType > I_vals( n, 1.f );
 		std::iota( I_rows.begin(), I_rows.end(), 0 );
 		std::iota( I_cols.begin(), I_cols.end(), 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
@@ -329,13 +345,13 @@ int main( int argc, char ** argv ) {
 			std::cerr << "Launching test 01 FAILED\n";
 			return 255;
 		}
-		std::cout << std::endl;
+		std::cout << std::endl << std::flush;
 	}
 
 	if( ! rc ) { // Build a square-matrix with n 1s on the first row
-		Matrix< float > I( n, n );
+		Matrix< NzType > I( n, n );
 		std::vector< size_t > I_rows( n, 0 ), I_cols( n );
-		std::vector< float > I_vals( n, 1.f );
+		std::vector< NzType > I_vals( n, 1.f );
 		std::iota( I_cols.begin(), I_cols.end(), 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
 		Matrix< void > mask( n, n );
@@ -345,13 +361,13 @@ int main( int argc, char ** argv ) {
 			std::cerr << "Launching test 02 FAILED\n";
 			return 255;
 		}
-		std::cout << std::endl;
+		std::cout << std::endl << std::flush;
 	}
 
 	if( ! rc ) { // Square-matrix with n 1s on the first column
-		Matrix< float > I( n, n );
+		Matrix< NzType > I( n, n );
 		std::vector< size_t > I_rows( n ), I_cols( n, 0 );
-		std::vector< float > I_vals( n, 1.f );
+		std::vector< NzType > I_vals( n, 1.f );
 		std::iota( I_rows.begin(), I_rows.end(), 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
 		Matrix< void > mask( n, n );
@@ -361,13 +377,13 @@ int main( int argc, char ** argv ) {
 			std::cerr << "Launching test 03 FAILED\n";
 			return 255;
 		}
-		std::cout << std::endl;
+		std::cout << std::endl << std::flush;
 	}
 
 	if( ! rc ) { // Building a square-matrix with n 1s on the first row and column
-		Matrix< float > I( n, n );
+		Matrix< NzType > I( n, n );
 		std::vector< size_t > I_rows( 2 * n - 1, 0 ), I_cols( 2 * n - 1, 0 );
-		std::vector< float > I_vals( 2 * n - 1, 1.f );
+		std::vector< NzType > I_vals( 2 * n - 1, 1.f );
 		std::iota( I_rows.begin() + n, I_rows.end(), 1 );
 		std::iota( I_cols.begin(), I_cols.begin() + n, 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
@@ -378,13 +394,13 @@ int main( int argc, char ** argv ) {
 			std::cerr << "Launching test 04 FAILED\n";
 			return 255;
 		}
-		std::cout << std::endl;
+		std::cout << std::endl << std::flush;
 	}
 
 	if( ! rc ) { // Building a [1 row, n columns] matrix filled with 1s
-		Matrix< float > I( 1, n );
+		Matrix< NzType > I( 1, n );
 		std::vector< size_t > I_rows( n, 0 ), I_cols( n, 0 );
-		std::vector< float > I_vals( n, 1.f );
+		std::vector< NzType > I_vals( n, 1.f );
 		std::iota( I_cols.begin(), I_cols.end(), 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
 		Matrix< void > mask( 1, n );
@@ -394,13 +410,13 @@ int main( int argc, char ** argv ) {
 			std::cerr << "Launching test 04 FAILED\n";
 			return 255;
 		}
-		std::cout << std::endl;
+		std::cout << std::endl << std::flush;
 	}
 
 	if( ! rc ) { // Building a [n rows, 1 column] matrix filled with 1s
-		Matrix< float > I( n, 1 );
+		Matrix< NzType > I( n, 1 );
 		std::vector< size_t > I_rows( n, 0 ), I_cols( n, 0 );
-		std::vector< float > I_vals( n, 1.f );
+		std::vector< NzType > I_vals( n, 1.f );
 		std::iota( I_rows.begin(), I_rows.end(), 0 );
 		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
 		Matrix< void > mask( n, 1 );
@@ -410,7 +426,7 @@ int main( int argc, char ** argv ) {
 			std::cerr << "Launching test 06 FAILED\n";
 			return 255;
 		}
-		std::cout << std::endl;
+		std::cout << std::endl << std::flush;
 	}
 
 	if( rc != SUCCESS ) {

From 90380a6250e04cde6cb39bbeab3cca0cf653d537 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Wed, 21 Jun 2023 15:48:02 +0200
Subject: [PATCH 36/63] Restrict foldr+l test to implemented backends

---
 tests/unit/matrixReduce.cpp | 439 ------------------------------------
 1 file changed, 439 deletions(-)
 delete mode 100644 tests/unit/matrixReduce.cpp

diff --git a/tests/unit/matrixReduce.cpp b/tests/unit/matrixReduce.cpp
deleted file mode 100644
index 08dbe3bed..000000000
--- a/tests/unit/matrixReduce.cpp
+++ /dev/null
@@ -1,439 +0,0 @@
-
-/*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Tests for the reduce( Matrix<D>, T, Operator<T,D,T> ) API call
- *
- * @author Benjamin Lozes
- * @date 17/05/2023
- *
- * Tests whether the foldl and foldl API calls produce the expected results.
- *
- * The test cases are focused on the following aspects:
- *   * The types of the result, the matrix values and the operator
- * 	 * The initial value of the reduction result
- * 	 * The order of the operands (foldr, foldl)
- */
-
-#include <chrono>
-#include <iostream>
-#include <numeric>
-#include <sstream>
-#include <vector>
-
-#include <graphblas.hpp>
-
-using namespace grb;
-
-using NzType = double;
-
-
-constexpr bool PRINT_TIMERS = true;
-constexpr bool SKIP_FOLDL = false;
-constexpr bool SKIP_FOLDR = false;
-constexpr bool SKIP_UNMASKED = false;
-constexpr bool SKIP_MASKED = false;
-constexpr size_t ITERATIONS = 100;
-
-template< typename T, typename V, class Monoid >
-RC foldl_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, const grb::Matrix< void > & mask, T initial, T expected, const Monoid & monoid ) {
-	if( SKIP_FOLDL )
-		return RC::SUCCESS;
-	RC rc = RC::SUCCESS;
-
-	if( rc == RC::SUCCESS && ! SKIP_UNMASKED ) { // Unmasked
-		T value = initial;
-		auto start_chrono = std::chrono::high_resolution_clock::now();
-		for( size_t _ = 0; _ < ITERATIONS; _++ ) {
-			value = initial;
-			foldl( value, A, monoid );
-		}
-		auto end_chrono = std::chrono::high_resolution_clock::now();
-		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono ) / ITERATIONS;
-		if( PRINT_TIMERS )
-			std::cout << "foldl (unmasked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
-
-		std::cout << "foldl (unmasked) \"" << test_label << "\": ";
-		if( value == expected )
-			std::cout << "OK" << std::endl;
-		else
-			std::cerr << "Failed" << std::endl
-					  << test_description << std::endl
-					  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
-					  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
-					  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
-
-		rc = rc ? rc : ( value == expected ? RC::SUCCESS : RC::FAILED );
-	}
-
-	if( rc == RC::SUCCESS && ! SKIP_MASKED ) { // Masked
-		T value = initial;
-		auto start_chrono = std::chrono::high_resolution_clock::now();
-		for( size_t _ = 0; _ < ITERATIONS; _++ ) {
-			value = initial;
-			foldl( value, A, mask, monoid );
-		}
-		auto end_chrono = std::chrono::high_resolution_clock::now();
-		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono ) / ITERATIONS;
-		if( PRINT_TIMERS )
-			std::cout << "foldl (masked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
-
-		std::cout << "foldl (masked) \"" << test_label << "\": ";
-		if( value == expected )
-			std::cout << "OK" << std::endl;
-		else
-			std::cerr << "Failed" << std::endl
-					  << test_description << std::endl
-					  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
-					  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
-					  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
-
-		rc = rc ? rc : ( value == expected ? RC::SUCCESS : RC::FAILED );
-	}
-
-	return rc;
-}
-
-template< typename T, typename V, class Monoid >
-RC foldr_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, const grb::Matrix< void > & mask, T initial, T expected, const Monoid & monoid ) {
-	if( SKIP_FOLDR )
-		return RC::SUCCESS;
-	RC rc = RC::SUCCESS;
-
-	if( rc == RC::SUCCESS && ! SKIP_UNMASKED ) { // Unmasked
-		T value = initial;
-		auto start_chrono = std::chrono::high_resolution_clock::now();
-		for( size_t _ = 0; _ < ITERATIONS; _++ ) {
-			value = initial;
-			foldr( value, A, monoid );
-		}
-		auto end_chrono = std::chrono::high_resolution_clock::now();
-		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono ) / ITERATIONS;
-		if( PRINT_TIMERS )
-			std::cout << "foldr (unmasked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
-
-		std::cout << "foldr (unmasked) \"" << test_label << "\": ";
-		if( value == expected )
-			std::cout << "OK" << std::endl;
-		else
-			std::cerr << "Failed" << std::endl
-					  << test_description << std::endl
-					  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
-					  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
-					  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
-
-		rc = rc ? rc : ( value == expected ? RC::SUCCESS : RC::FAILED );
-	}
-
-	if( rc == RC::SUCCESS && ! SKIP_MASKED ) { // Masked
-		T value = initial;
-		auto start_chrono = std::chrono::high_resolution_clock::now();
-		for( size_t _ = 0; _ < ITERATIONS; _++ ) {
-			value = initial;
-			foldr( value, A, mask, monoid );
-		}
-		auto end_chrono = std::chrono::high_resolution_clock::now();
-		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono ) / ITERATIONS;
-		if( PRINT_TIMERS )
-			std::cout << "foldr (masked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
-
-		std::cout << "foldr (masked) \"" << test_label << "\": ";
-		if( value == expected )
-			std::cout << "OK" << std::endl;
-		else
-			std::cerr << "Failed" << std::endl
-					  << test_description << std::endl
-					  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
-					  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
-					  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
-
-		rc = rc ? rc : ( value == expected ? RC::SUCCESS : RC::FAILED );
-	}
-
-	return rc;
-}
-
-template< typename T, typename V, class Monoid >
-RC foldLR_test( const char * test_label, const char * test_description, const grb::Matrix< V > & A, const grb::Matrix< void > & mask, T initial, T expected, const Monoid & monoid ) {
-	RC rc = foldl_test( test_label, test_description, A, mask, initial, expected, monoid );
-	return rc ? rc : foldr_test( test_label, test_description, A, mask, initial, expected, monoid );
-}
-
-struct input {
-	const grb::Matrix< NzType > & A;
-	const grb::Matrix< void > & mask;
-};
-
-void grb_program( const input & in, grb::RC & rc ) {
-	const grb::Matrix< NzType > & I = in.A;
-	const grb::Matrix< void > & mask = in.mask;
-
-	const long n = grb::nnz( I );
-
-	/**    Test case 1:
-	 *  A simple additive reduction with the same types for the nzs and the reduction result.
-	 *  * Initial value is 0
-	 *  * Expected unmasked result: n
-	 *  * Expected masked result: 0
-	 */
-	rc = foldLR_test( "1", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (NzType)0, (NzType)n, Monoid< operators::add< NzType >, identities::zero >() );
-	if( rc )
-		return;
-	return;
-
-	/**     Test case 2:
-	 *  A simple additive reduction with the same types for the nzs and the reduction result.
-	 *  * Initial value is n
-	 *  * Expected result: 2*n
-	 */
-	rc = foldLR_test(
-		"2", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (NzType)n, (NzType)( 2 * n ), Monoid< operators::add< NzType >, identities::zero >() );
-	if( rc )
-		return;
-
-	/**     Test case 3:
-	 *  A simple additive reduction with different types for the nzs and the reduction result (int <- int + NzType).
-	 *  * Initial value is 0
-	 *  * Expected result: n
-	 */
-	rc = foldl_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int + NzType).", I, mask, (int)0, (int)n,
-		Monoid< operators::add< int, NzType, int >, identities::zero >() );
-	if( rc )
-		return;
-	rc = foldr_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- NzType + int).", I, mask, (int)0, (int)n,
-		Monoid< operators::add< NzType, int, int >, identities::zero >() );
-	if( rc )
-		return;
-
-	/**     Test case 4:
-	 *  A simple additive reduction with different types for the nzs and the reduction result (int <- int + NzType).
-	 *  * Initial value is n
-	 *  * Expected result: 2*n
-	 */
-	rc = foldl_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int + NzType).", I, mask, (int)n, (int)( 2 * n ),
-		Monoid< operators::add< int, NzType, int >, identities::zero >() );
-	if( rc )
-		return;
-	rc = foldr_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- NzType + int).", I, mask, (int)n, (int)( 2 * n ),
-		Monoid< operators::add< NzType, int, int >, identities::zero >() );
-	if( rc )
-		return;
-
-	/**     Test case 5:
-	 * A simple multiplicative reduction with the same types for the nzs and the reduction result.
-	 * * Initial value is 0
-	 * * Expected result: 0
-	 */
-	rc = foldLR_test( "5", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, mask, (NzType)0, (NzType)0, Monoid< operators::mul< NzType >, identities::one >() );
-	if( rc )
-		return;
-
-	/**     Test case 6:
-	 * A simple multiplicative reduction with the same types for the nzs and the reduction result.
-	 * * Initial value is 1
-	 * * Expected result: 1
-	 */
-	rc = foldLR_test( "6", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, mask, (NzType)1, (NzType)1, Monoid< operators::mul< NzType >, identities::one >() );
-	if( rc )
-		return;
-
-	/**     Test case 7:
-	 * A simple multiplicative reduction with different types for the nzs and the reduction result (size_t <- size_t * NzType).
-	 * * Initial value is 0
-	 * * Expected result: 0
-	 */
-	rc = foldl_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)0, (size_t)0,
-		Monoid< operators::mul< size_t, NzType, size_t >, identities::one >() );
-	if( rc )
-		return;
-	rc = foldr_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)0, (size_t)0,
-		Monoid< operators::mul< NzType, size_t, size_t >, identities::one >() );
-	if( rc )
-		return;
-
-	/**     Test case 8:
-	 * A simple multiplicative reduction with different types for the nzs and the reduction result (size_t <- size_t * NzType).
-	 * * Initial value is 1
-	 * * Expected result: 1
-	 */
-	rc = foldl_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)1, (size_t)1,
-		Monoid< operators::mul< size_t, NzType, size_t >, identities::one >() );
-	if( rc )
-		return;
-	rc = foldr_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)1, (size_t)1,
-		Monoid< operators::mul< NzType, size_t, size_t >, identities::one >() );
-	if( rc )
-		return;
-
-	/**     Test case 9:
-	 * A simple binary equal reduction with different types for the nzs and the reduction result (bool <- bool == NzType).
-	 * * Initial value is true
-	 * * Expected result: true
-	 */
-	rc = foldl_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == NzType).", I, mask, (bool)true, (bool)true,
-		Monoid< operators::equal< bool, NzType, bool >, identities::logical_true >() );
-	if( rc )
-		return;
-	rc = foldr_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == NzType).", I, mask, (bool)true, (bool)true,
-		Monoid< operators::equal< NzType, bool, bool >, identities::logical_true >() );
-	if( rc )
-		return;
-
-	/**     Test case 10:
-	 * A simple binary logical_or reduction with different types for the nzs and the reduction result (bool <- bool || NzType).
-	 * * Initial value is false
-	 * * Expected result: true
-	 */
-	rc = foldl_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || NzType).", I, mask, (bool)false, (bool)true,
-		Monoid< operators::logical_or< bool, NzType, bool >, identities::logical_false >() );
-	if( rc )
-		return;
-	rc = foldr_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || NzType).", I, mask, (bool)false, (bool)true,
-		Monoid< operators::logical_or< NzType, bool, bool >, identities::logical_false >() );
-	if( rc )
-		return;
-}
-
-int main( int argc, char ** argv ) {
-	// defaults
-	bool printUsage = false;
-	size_t n = 10;
-
-	// error checking
-	if( argc > 2 ) {
-		printUsage = true;
-	}
-	if( argc == 2 ) {
-		n = std::atol( argv[ 1 ] );
-	}
-	if( printUsage ) {
-		std::cerr << "Usage: " << argv[ 0 ] << " [n]\n";
-		std::cerr << "  -n (optional, default is 10): an even integer, the test "
-				  << "size.\n";
-		return 1;
-	}
-
-	std::cout << "This is functional test " << argv[ 0 ] << "\n";
-	grb::Launcher< AUTOMATIC > launcher;
-	grb::RC rc = RC::SUCCESS;
-
-	if( ! rc ) { // Identity square-matrix
-		Matrix< NzType > I( n, n );
-		std::vector< size_t > I_rows( n ), I_cols( n );
-		std::vector< NzType > I_vals( n, 1.f );
-		std::iota( I_rows.begin(), I_rows.end(), 0 );
-		std::iota( I_cols.begin(), I_cols.end(), 0 );
-		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
-		Matrix< void > mask( n, n );
-		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
-		std::cout << "-- Running test 01: Identity square matrix of size n = " << n << std::endl;
-		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
-			std::cerr << "Launching test 01 FAILED\n";
-			return 255;
-		}
-		std::cout << std::endl << std::flush;
-	}
-
-	if( ! rc ) { // Build a square-matrix with n 1s on the first row
-		Matrix< NzType > I( n, n );
-		std::vector< size_t > I_rows( n, 0 ), I_cols( n );
-		std::vector< NzType > I_vals( n, 1.f );
-		std::iota( I_cols.begin(), I_cols.end(), 0 );
-		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
-		Matrix< void > mask( n, n );
-		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
-		std::cout << "-- Running test 02: Square matrix of size n = " << n << ", with n 1s on the first row" << std::endl;
-		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
-			std::cerr << "Launching test 02 FAILED\n";
-			return 255;
-		}
-		std::cout << std::endl << std::flush;
-	}
-
-	if( ! rc ) { // Square-matrix with n 1s on the first column
-		Matrix< NzType > I( n, n );
-		std::vector< size_t > I_rows( n ), I_cols( n, 0 );
-		std::vector< NzType > I_vals( n, 1.f );
-		std::iota( I_rows.begin(), I_rows.end(), 0 );
-		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
-		Matrix< void > mask( n, n );
-		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
-		std::cout << "-- Running test 03: Square matrix of size n = " << n << ", with n 1s on the first column" << std::endl;
-		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
-			std::cerr << "Launching test 03 FAILED\n";
-			return 255;
-		}
-		std::cout << std::endl << std::flush;
-	}
-
-	if( ! rc ) { // Building a square-matrix with n 1s on the first row and column
-		Matrix< NzType > I( n, n );
-		std::vector< size_t > I_rows( 2 * n - 1, 0 ), I_cols( 2 * n - 1, 0 );
-		std::vector< NzType > I_vals( 2 * n - 1, 1.f );
-		std::iota( I_rows.begin() + n, I_rows.end(), 1 );
-		std::iota( I_cols.begin(), I_cols.begin() + n, 0 );
-		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
-		Matrix< void > mask( n, n );
-		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
-		std::cout << "-- Running test 04: Square matrix of size n = " << n << ", with n 1s on the first row and column" << std::endl;
-		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
-			std::cerr << "Launching test 04 FAILED\n";
-			return 255;
-		}
-		std::cout << std::endl << std::flush;
-	}
-
-	if( ! rc ) { // Building a [1 row, n columns] matrix filled with 1s
-		Matrix< NzType > I( 1, n );
-		std::vector< size_t > I_rows( n, 0 ), I_cols( n, 0 );
-		std::vector< NzType > I_vals( n, 1.f );
-		std::iota( I_cols.begin(), I_cols.end(), 0 );
-		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
-		Matrix< void > mask( 1, n );
-		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
-		std::cout << "-- Running test 05: [1-row, n = " << n << " columns] matrix, filled with 1s" << std::endl;
-		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
-			std::cerr << "Launching test 04 FAILED\n";
-			return 255;
-		}
-		std::cout << std::endl << std::flush;
-	}
-
-	if( ! rc ) { // Building a [n rows, 1 column] matrix filled with 1s
-		Matrix< NzType > I( n, 1 );
-		std::vector< size_t > I_rows( n, 0 ), I_cols( n, 0 );
-		std::vector< NzType > I_vals( n, 1.f );
-		std::iota( I_rows.begin(), I_rows.end(), 0 );
-		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
-		Matrix< void > mask( n, 1 );
-		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
-		std::cout << "-- Running test 06: [n = " << n << " rows, 1 column] matrix, filled with 1s" << std::endl;
-		if( launcher.exec( &grb_program, { I, mask }, rc, true ) != SUCCESS ) {
-			std::cerr << "Launching test 06 FAILED\n";
-			return 255;
-		}
-		std::cout << std::endl << std::flush;
-	}
-
-	if( rc != SUCCESS ) {
-		std::cout << "Test FAILED (" << grb::toString( rc ) << ")" << std::endl;
-		return rc;
-	} else {
-		std::cout << "Test OK" << std::endl;
-		return 0;
-	}
-}

From d6c785d71c40f36cf77e4592ba4b4476be8bb6c0 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Tue, 20 Jun 2023 14:34:11 +0200
Subject: [PATCH 37/63] Add signature for grb::tril in base

---
 include/graphblas/base/blas3.hpp | 86 ++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/include/graphblas/base/blas3.hpp b/include/graphblas/base/blas3.hpp
index d62d43330..9faac188d 100644
--- a/include/graphblas/base/blas3.hpp
+++ b/include/graphblas/base/blas3.hpp
@@ -675,6 +675,92 @@ namespace grb {
 		return UNSUPPORTED;
 	}
 
+	/**
+	 * Return the lower triangular portion of a matrix, below the k-th diagonal.
+	 *
+	 * @tparam descr      The descriptor to be used (descriptors::no_operation
+	 * 					  if left unspecified).
+	 * @tparam InputType  The type of the elements in the supplied ALP/GraphBLAS
+	 *                    matrix \a A.
+	 * @tparam OutputType The type of the elements in the supplied ALP/GraphBLAS
+	 *                    matrix \a L.
+	 *
+	 * @param[out] L       The lower triangular portion of \a A, below the k-th
+	 * 					   diagonal.
+	 * @param[in]  A       Any ALP/GraphBLAS matrix.
+	 * @param[in]  k       The diagonal above which to zero out \a A.
+	 * @param[in]  phase   The #grb::Phase in which the primitive is to proceed.
+	 *
+	 * @return grb::SUCCESS  When the call completed successfully.
+	 * @return grb::MISMATCH If the dimensions of \a L and \a A do not match.
+	 *
+ 	 * \parblock
+	 * \par Allowed descriptors
+	 * - transpose_matrix: Consider A^T instead of A.
+	 * - no_casting: If the types of \a L and \a A differ, the primitive
+	 * 				 will fail.
+	 * \endparblock
+	 */
+	template<
+		Descriptor descr = descriptors::no_operation,
+		typename InputType,
+		typename OutputType,
+		typename RIT, typename CIT, typename NIT,
+		Backend backend
+	>
+	RC tril(
+		Matrix< OutputType, backend, RIT, CIT, NIT > &L,
+		const Matrix< InputType, backend, RIT, CIT, NIT > &A,
+		const long int k,
+		const Phase &phase = Phase::EXECUTE,
+		const typename std::enable_if<
+			!grb::is_object< OutputType >::value &&
+			!grb::is_object< InputType >::value &&
+			std::is_convertible< InputType, OutputType >::value
+		>::type * const = nullptr
+	) {
+		(void) L;
+		(void) A;
+		(void) phase;
+#ifdef _DEBUG
+		std::cerr << "Selected backend does not implement grb::tril()\n";
+#endif
+#ifndef NDEBUG
+		const bool selected_backend_does_not_support_tril = false;
+		assert( selected_backend_does_not_support_tril );
+#endif
+		const RC ret = grb::clear( A );
+		return ret == SUCCESS ? UNSUPPORTED : ret;
+	}
+
+	/**
+	 * Return the lower triangular portion of a matrix,
+	 * below main diagonal (excluded).
+	 *
+	 * This primitive is strictly equivalent to calling
+	 * grb::tril( L, A, 0, phase ).
+	 * see grb::tril( L, A, k, phase ) for full description.
+	 */
+	template<
+		Descriptor descr = descriptors::no_operation,
+		typename InputType,
+		typename OutputType,
+		typename RIT, typename CIT, typename NIT,
+		Backend backend
+	>
+	RC tril(
+		Matrix< OutputType, backend, RIT, CIT, NIT > &L,
+		const Matrix< InputType, backend, RIT, CIT, NIT > &A,
+		const Phase &phase = Phase::EXECUTE,
+		const typename std::enable_if<
+			!grb::is_object< OutputType >::value &&
+			!grb::is_object< InputType >::value &&
+			std::is_convertible< InputType, OutputType >::value
+		>::type * const = nullptr
+	) {
+		return tril< descr >( L, A, 0, phase );
+	}
+
 	/**
 	 * @}
 	 */

From 4d9ba81019d006ff1cfc303f484faaa128a83ac6 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Tue, 20 Jun 2023 14:55:47 +0200
Subject: [PATCH 38/63] Add unit-test for grb::tril

---
 tests/unit/CMakeLists.txt |   4 ++
 tests/unit/tril.cpp       | 124 ++++++++++++++++++++++++++++++++++++++
 tests/unit/unittests.sh   |   6 ++
 3 files changed, 134 insertions(+)
 create mode 100644 tests/unit/tril.cpp

diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 2ee3de02e..14bf424a2 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -85,6 +85,10 @@ add_grb_executables( ewiseapply ewiseapply.cpp
 	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
 )
 
+add_grb_executables( tril tril.cpp
+	BACKENDS reference reference_omp
+)
+
 add_grb_executables( eWiseMatrix eWiseMatrix.cpp
 	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
 )
diff --git a/tests/unit/tril.cpp b/tests/unit/tril.cpp
new file mode 100644
index 000000000..5d6deffdb
--- /dev/null
+++ b/tests/unit/tril.cpp
@@ -0,0 +1,124 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <sstream>
+
+#include <graphblas.hpp>
+
+using namespace grb;
+
+template< typename D >
+RC is_lower_triangle( const grb::Matrix< D > & L ) {
+	for( const auto & triple : L ) {
+		const size_t & i = triple.first.first;
+		const size_t & j = triple.first.second;
+		const size_t & v = triple.second;
+		if( i < j ) {
+			std::cout << "Unexpected entry at position ( " << i << ", " << j << " ) "
+					  << "-- only expected entries on the diagonal\n";
+			return RC::FAILED;
+		}
+		if( v != 2 ) {
+			std::cout << "Unexpected value at position ( " << i << ", " << j << " ) "
+					  << "-- expected 2, found " << v << "\n";
+			return RC::FAILED;
+		}
+	}
+	return RC::SUCCESS;
+}
+
+void grb_program( const size_t & n, grb::RC & rc ) {
+	rc = RC::SUCCESS;
+
+	// Matrix initialisation
+	grb::Matrix< int > A( n, n );
+	grb::Matrix< size_t > L_A( n, n );  // L_A is the lower triangular matrix of A
+	grb::Matrix< size_t > L_At( n, n ); // L_At is the lower triangular matrix of A^T
+	size_t * I = new size_t[ n ];
+	size_t * J = new size_t[ n ];
+	double * V = new double[ n ];
+	for( size_t k = 0; k < n; ++k ) {
+		I[ k ] = k % 3 == 0 ? k : k - 1;
+		J[ k ] = std::rand() % n;
+		V[ k ] = 2;
+	}
+	assert( not grb::buildMatrixUnique( A, I, J, V, n, SEQUENTIAL ) );
+
+	{ // Mixed-domain matrix, should be successful
+		rc = grb::tril( L_A, A, Phase::RESIZE );
+		rc = rc ? rc : grb::tril( L_A, A, Phase::EXECUTE );
+
+		if( rc != SUCCESS ) {
+			std::cerr << "Error on test: mixed-domain matrix" << std::endl;
+			std::cerr << "Error on executing: " << grb::toString( rc ) << std::endl;
+			return;
+		}
+		rc = is_lower_triangle( L_A );
+		if( rc != SUCCESS ) {
+			std::cerr << "Error on test: mixed-domain matrix" << std::endl;
+			std::cerr << "Error on result, not a lower-triangle" << std::endl;
+			return;
+		}
+	}
+	{ // Transpose_matrix descriptor, should be successful
+		rc = grb::tril< descriptors::transpose_matrix >( L_At, A, Phase::RESIZE );
+		rc = rc ? rc : grb::tril< descriptors::transpose_matrix >( L_At, A, Phase::EXECUTE );
+
+		if( rc != SUCCESS ) {
+			std::cerr << "Error on test: Transpose_matrix descriptor" << std::endl;
+			std::cerr << "Error on executing: " << grb::toString( rc ) << std::endl;
+			return;
+		}
+		rc = is_lower_triangle< size_t >( L_At );
+		if( rc != SUCCESS ) {
+			std::cerr << "Error on test: Transpose_matrix descriptor" << std::endl;
+			std::cerr << "Error on result, not a lower-triangle" << std::endl;
+			return;
+		}
+	}
+	
+}
+
+int main( int argc, char ** argv ) {
+	// defaults
+	size_t n = 1000000;
+
+	// error checking
+	if( argc == 2 ) {
+		n = std::strtoul( argv[ 1 ], nullptr, 10 );
+	}
+	if( argc > 3 ) {
+		std::cerr << "Usage: " << argv[ 0 ] << "[n = " << n << "]\n";
+		return 1;
+	}
+
+	std::cout << "This is functional test " << argv[ 0 ] << "\n";
+	grb::Launcher< AUTOMATIC > launcher;
+	grb::RC out;
+	if( launcher.exec( &grb_program, n, out, false ) != SUCCESS ) {
+		std::cerr << "Launching test FAILED\n";
+		return 255;
+	}
+	if( out != SUCCESS ) {
+		std::cout << "Test FAILED (" << grb::toString( out ) << ")" << std::endl;
+		return out;
+	} else {
+		std::cout << "Test OK" << std::endl;
+		return 0;
+	}
+}
diff --git a/tests/unit/unittests.sh b/tests/unit/unittests.sh
index 8aa42d597..13ed2186d 100755
--- a/tests/unit/unittests.sh
+++ b/tests/unit/unittests.sh
@@ -208,6 +208,12 @@ for MODE in ${MODES}; do
 				grep 'Test OK' ${TEST_OUT_DIR}/ewiseapply_small_${MODE}_${BACKEND}_${P}_${T} || echo "Test FAILED"
 				echo " "
 
+				echo ">>>      [x]           [ ]       Testing grb::tril on matrices of doubles of size 100."
+				$runner ${TEST_BIN_DIR}/tril_${MODE}_${BACKEND} 14 &> ${TEST_OUT_DIR}/tril_${MODE}_${BACKEND}_${P}_${T}
+				head -1 ${TEST_OUT_DIR}/tril_${MODE}_${BACKEND}_${P}_${T}
+				grep 'Test OK' ${TEST_OUT_DIR}/tril_${MODE}_${BACKEND}_${P}_${T} || echo "Test FAILED"
+				echo " "
+
 				echo ">>>      [x]           [ ]       Testing grb::eWiseApply using (+,0) on vectors"
 				echo "                                 of doubles of size 100."
 				$runner ${TEST_BIN_DIR}/ewiseapply_${MODE}_${BACKEND} 100 &> ${TEST_OUT_DIR}/ewiseapply_${MODE}_${BACKEND}_${P}_${T}

From fee2add916443a447c5932c3a22c87c8bca4bc27 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Tue, 20 Jun 2023 14:56:13 +0200
Subject: [PATCH 39/63] Implementation of grb::tril for reference+omp

---
 include/graphblas/reference/blas3.hpp | 165 ++++++++++++++++++++++++++
 1 file changed, 165 insertions(+)

diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp
index 6a397dab2..54fb052ea 100644
--- a/include/graphblas/reference/blas3.hpp
+++ b/include/graphblas/reference/blas3.hpp
@@ -1402,6 +1402,121 @@ namespace grb {
 			return SUCCESS;
 		}
 
+		template< Descriptor descr, typename OutputType, typename InputType, typename RIT, typename CIT, typename NIT >
+		RC tril_generic( Matrix< OutputType, reference, RIT, CIT, NIT > & L, const Matrix< InputType, reference, RIT, CIT, NIT > & A, const long int k, const Phase & phase ) {
+			const size_t m = descr & descriptors::transpose_matrix ? ncols( A ) : nrows( A );
+			const size_t n = descr & descriptors::transpose_matrix ? nrows( A ) : ncols( A );
+
+			// Run-time checks
+			if( m != nrows( L ) || n != ncols( L ) ) {
+				return RC::MISMATCH;
+			}
+
+#ifdef _DEBUG
+			std::cout << "In grb::internal::tril_generic( reference )\n";
+#endif
+			const auto & A_raw = descr & descriptors::transpose_matrix ? internal::getCCS( A ) : internal::getCRS( A );
+
+			if( phase == Phase::RESIZE ) {
+				size_t nzc = 0;
+#ifdef _H_GRB_REFERENCE_OMP_BLAS3
+#pragma omp parallel for reduction( + : nzc ) default( none ) shared( A_raw ) firstprivate( k, m )
+#endif
+				for( size_t i = 0; i < m; ++i ) {
+					for( size_t A_k = A_raw.col_start[ i ]; A_k < A_raw.col_start[ i + 1 ]; ++A_k ) {
+						const size_t A_j = A_raw.row_index[ A_k ];
+						// If the value is in the lower triangle, increment the count
+						if( A_j <= i + k ) {
+							nzc += 1;
+						}
+					}
+				}
+#ifdef _DEBUG
+				std::cout << "RESIZE phase: resize( L, " << nzc << " )\n";
+#endif
+				return resize( L, nzc );
+			} 
+
+			if( phase == Phase::EXECUTE ) {
+
+				const auto & L_crs_raw = internal::getCRS( L );
+				const auto & L_ccs_raw = internal::getCCS( L );
+				const size_t nzc = capacity( L );
+				
+				L_crs_raw.col_start[ 0 ] = 0;
+				L_ccs_raw.col_start[ 0 ] = 0;
+				
+				// Prefix sum computation into L.CRS.col_start
+#ifdef _H_GRB_REFERENCE_OMP_BLAS3
+#pragma omp parallel for simd default( none ) shared( A_raw, L_crs_raw, L_ccs_raw ) firstprivate( k, m )
+#endif
+				for( size_t i = 0; i < m; i++ ) {
+					size_t cumul = 0UL;
+					for( size_t A_k = A_raw.col_start[ i ]; A_k < A_raw.col_start[ i + 1 ]; ++A_k ) {
+						const size_t A_j = A_raw.row_index[ A_k ];
+						// If the value is in the lower triangle, increment the sum
+						if( A_j > i + k ) {
+							continue;
+						}
+						cumul += 1;
+					}
+					L_crs_raw.col_start[ i + 1 ] = cumul;
+				}
+
+				// Apply the prefix sum
+				for( size_t i = 1; i <= m; i++ ) {
+					L_crs_raw.col_start[ i ] += L_crs_raw.col_start[ i - 1 ];
+					L_ccs_raw.col_start[ i ] = L_crs_raw.col_start[ i ];
+				}
+
+				// Check if the number of nonzeros is greater than the capacity
+				if( L_crs_raw.col_start[ m ] > nzc ) {
+#ifdef _DEBUG
+					std::cout << "EXECUTE phase: detected insufficient capacity for requested operation.\n"
+							  << "Requested " << L_crs_raw.col_start[ m ] << " nonzeros, but capacity is " << nzc << "\n";
+#endif
+					return RC::MISMATCH;
+				}
+
+#ifdef _H_GRB_REFERENCE_OMP_BLAS3
+#pragma omp parallel default( none ) shared( A_raw, L_crs_raw, L_ccs_raw ) firstprivate( k, m )
+#endif
+				{
+					size_t start_row = 0;
+					size_t end_row = m;
+#ifdef _H_GRB_REFERENCE_OMP_BLAS3
+					config::OMP::localRange( start_row, end_row, 0, m );
+#endif
+					// Update the CRS and CCS row indices and values
+					for( size_t i = start_row; i < end_row; i++ ) {
+						size_t L_k = L_crs_raw.col_start[ i ];
+						for( size_t A_k = A_raw.col_start[ i ]; A_k < A_raw.col_start[ i + 1 ]; ++A_k ) {
+							const size_t A_j = A_raw.row_index[ A_k ];
+							// If the value is in the upper triangle, skip it
+							if( A_j > i + k ) {
+								continue;
+							}
+
+							L_crs_raw.row_index[ L_k ] = A_j;
+							L_crs_raw.values[ L_k ] = A_raw.values[ A_k ];
+							L_ccs_raw.row_index[ L_k ] = i;
+							L_ccs_raw.values[ L_k ] = A_raw.values[ A_k ];
+							L_k += 1;
+						}
+					}
+				}
+
+#ifdef _DEBUG
+				std::cout << "EXECUTE phase: setCurrentNonzeroes( L, " << nzc << " )\n";
+#endif
+				internal::setCurrentNonzeroes( L, nzc );
+
+				return RC::SUCCESS;
+			}
+
+			return RC::SUCCESS;;
+		}
+
 	} // namespace internal
 
 	/**
@@ -1724,6 +1839,56 @@ namespace grb {
 	}
 
 
+	/**
+	 * Return the lower triangular portion of a matrix, below the k-th diagonal.
+	 *
+	 * @param[out] L       The lower triangular portion of \a A, below the k-th
+	 * 					   diagonal.
+	 * @param[in]  A       Any ALP/GraphBLAS matrix.
+	 * @param[in]  k       The diagonal above which to zero out \a A.
+	 * @param[in]  phase   The #grb::Phase in which the primitive is to proceed.
+	 *
+	 * \internal Pattern matrices are allowed
+	 *
+	 * \internal Dispatches to internal::tril_generic
+	 */
+	template< Descriptor descr = descriptors::no_operation, typename InputType, typename OutputType, typename RIT, typename CIT, typename NIT >
+	RC tril( Matrix< OutputType, reference, RIT, CIT, NIT > & L,
+		const Matrix< InputType, reference, RIT, CIT, NIT > & A,
+		const long int k,
+		const Phase & phase = Phase::EXECUTE,
+		const typename std::enable_if< ! grb::is_object< OutputType >::value && ! grb::is_object< InputType >::value && std::is_convertible< InputType, OutputType >::value >::type * const =
+			nullptr ) {
+		(void)L;
+		(void)A;
+		(void)phase;
+#ifdef _DEBUG
+		std::cerr << "In grb::tril (reference)\n";
+#endif
+
+		// Static checks
+		NO_CAST_ASSERT( ( ! ( descr & descriptors::no_casting ) || std::is_same< InputType, OutputType >::value ), "grb::tril (reference)",
+			"input matrix and output matrix are incompatible for implicit casting" );
+
+		return internal::tril_generic< descr >( L, A, k, phase );
+	}
+
+	/**
+	 * Return the lower triangular portion of a matrix, below main diagonal.
+	 *
+	 * This primitive is strictly equivalent to calling grb::tril( L, A, 0, phase ).
+	 * see grb::tril( L, A, k, phase ) for full description.
+	 */
+	template< Descriptor descr = descriptors::no_operation, typename InputType, typename OutputType, typename RIT, typename CIT, typename NIT >
+	RC tril( Matrix< OutputType, reference, RIT, CIT, NIT > & L,
+		const Matrix< InputType, reference, RIT, CIT, NIT > & A,
+		const Phase & phase = Phase::EXECUTE,
+		const typename std::enable_if< ! grb::is_object< OutputType >::value && ! grb::is_object< InputType >::value && std::is_convertible< InputType, OutputType >::value >::type * const =
+			nullptr ) {
+		return tril< descr >( L, A, 0, phase );
+		
+	}
+
 } // namespace grb
 
 #undef NO_CAST_ASSERT

From 725b81f84ee283b0750c2a89361d7c1d3497855b Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Tue, 20 Jun 2023 14:56:50 +0200
Subject: [PATCH 40/63] Implementation of grb::tril for hyperdags

---
 include/graphblas/hyperdags/blas3.hpp     | 64 +++++++++++++++++++++++
 include/graphblas/hyperdags/hyperdags.hpp |  5 +-
 src/graphblas/hyperdags/hyperdags.cpp     |  3 ++
 tests/unit/CMakeLists.txt                 |  2 +-
 4 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/include/graphblas/hyperdags/blas3.hpp b/include/graphblas/hyperdags/blas3.hpp
index ef51992fd..5258fe608 100644
--- a/include/graphblas/hyperdags/blas3.hpp
+++ b/include/graphblas/hyperdags/blas3.hpp
@@ -497,6 +497,70 @@ namespace grb {
 		);
 		return ret;
 	}
+	
+
+	/**
+	 * Return the lower triangular portion of a matrix, below the k-th diagonal.
+	 *
+	 * @param[out] L       The lower triangular portion of \a A, below the k-th
+	 * 					   diagonal.
+	 * @param[in]  A       Any ALP/GraphBLAS matrix.
+	 * @param[in]  k       The diagonal above which to zero out \a A.
+	 * @param[in]  phase   The #grb::Phase in which the primitive is to proceed.
+	 *
+	 * \internal Pattern matrices are allowed
+	 *
+	 * \internal Dispatches to internal::tril_generic
+	 */
+
+	template< Descriptor descr = descriptors::no_operation, typename InputType, typename OutputType, typename RIT, typename CIT, typename NIT >
+	RC tril( Matrix< OutputType, hyperdags, RIT, CIT, NIT > & L,
+		const Matrix< InputType, hyperdags, RIT, CIT, NIT > & A,
+		const long int k,
+		const Phase & phase = Phase::EXECUTE,
+		const typename std::enable_if< ! grb::is_object< OutputType >::value && ! grb::is_object< InputType >::value && std::is_convertible< InputType, OutputType >::value >::type * const =
+			nullptr ) {
+		
+#ifdef _DEBUG
+		std::cerr << "In grb::tril (hyperdags)\n";
+#endif
+
+		const RC ret = tril< descr >( 
+			internal::getMatrix( L ), 
+			internal::getMatrix( A ), 
+			k, phase 
+		);
+		if( ret != SUCCESS ) { return ret; }
+		if( phase != EXECUTE ) { return ret; }
+		if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+		std::array< const void *, 0 > sourcesP{};
+		std::array< uintptr_t, 1 > sourcesL{
+			getID( internal::getMatrix(A) )
+		};
+		std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(L) ) };
+		internal::hyperdags::generator.addOperation(
+			internal::hyperdags::TRIL_MATRIX,
+			sourcesP.begin(), sourcesP.end(),
+			sourcesL.begin(), sourcesL.end(),
+			destinations.begin(), destinations.end()
+		);
+		return ret;
+	}
+
+	/**
+	 * Return the lower triangular portion of a matrix, below main diagonal.
+	 *
+	 * This primitive is strictly equivalent to calling grb::tril( L, A, 0, phase ).
+	 * see grb::tril( L, A, k, phase ) for full description.
+	 */
+	template< Descriptor descr = descriptors::no_operation, typename InputType, typename OutputType, typename RIT, typename CIT, typename NIT >
+	RC tril( Matrix< OutputType, hyperdags, RIT, CIT, NIT > & L,
+		const Matrix< InputType, hyperdags, RIT, CIT, NIT > & A,
+		const Phase & phase = Phase::EXECUTE,
+		const typename std::enable_if< ! grb::is_object< OutputType >::value && ! grb::is_object< InputType >::value && std::is_convertible< InputType, OutputType >::value >::type * const =
+			nullptr ) {
+		return tril< descr >( L, A, 0, phase );
+	}
 
 } // end namespace grb
 
diff --git a/include/graphblas/hyperdags/hyperdags.hpp b/include/graphblas/hyperdags/hyperdags.hpp
index d032c8e38..432ee9965 100644
--- a/include/graphblas/hyperdags/hyperdags.hpp
+++ b/include/graphblas/hyperdags/hyperdags.hpp
@@ -497,10 +497,12 @@ namespace grb {
 				FOLDR_SCALAR_MATRIX_MASK_MONOID,
 
 				FOLDR_SCALAR_MATRIX_MONOID,
+
+				TRIL_MATRIX
 			};
 
 			/** \internal How many operation vertex types exist. */
-			const constexpr size_t numOperationVertexTypes = 110;
+			const constexpr size_t numOperationVertexTypes = 111;
 
 			/** \internal An array of all operation vertex types. */
 			const constexpr enum OperationVertexType
@@ -616,6 +618,7 @@ namespace grb {
 				FOLDL_SCALAR_MATRIX_MONOID,
 				FOLDR_SCALAR_MATRIX_MASK_MONOID,
 				FOLDR_SCALAR_MATRIX_MONOID,
+				TRIL_MATRIX
 			};
 
 			/** \internal @returns The operation vertex type as a string. */
diff --git a/src/graphblas/hyperdags/hyperdags.cpp b/src/graphblas/hyperdags/hyperdags.cpp
index 90774d0e2..746244741 100644
--- a/src/graphblas/hyperdags/hyperdags.cpp
+++ b/src/graphblas/hyperdags/hyperdags.cpp
@@ -391,6 +391,9 @@ std::string grb::internal::hyperdags::toString(
 
 		case FOLDR_SCALAR_MATRIX_MONOID:
 			return "foldr( scalar, matrix, monoid )";
+			
+		case TRIL_MATRIX:
+			return "tril( matrix, matrix )";
 
 	}
 	assert( false );
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 14bf424a2..1f9ea7a23 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -86,7 +86,7 @@ add_grb_executables( ewiseapply ewiseapply.cpp
 )
 
 add_grb_executables( tril tril.cpp
-	BACKENDS reference reference_omp
+	BACKENDS reference reference_omp hyperdags
 )
 
 add_grb_executables( eWiseMatrix eWiseMatrix.cpp

From ba7c80e39aea5c73d58e6cc4d887ee149f34b83b Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Tue, 20 Jun 2023 16:22:38 +0200
Subject: [PATCH 41/63] Explicit templates declaration

---
 include/graphblas/base/blas3.hpp      | 18 ++++----
 include/graphblas/hyperdags/blas3.hpp | 39 +++++++++++-----
 include/graphblas/reference/blas3.hpp | 65 +++++++++++++++++++--------
 3 files changed, 84 insertions(+), 38 deletions(-)

diff --git a/include/graphblas/base/blas3.hpp b/include/graphblas/base/blas3.hpp
index 9faac188d..6779884c6 100644
--- a/include/graphblas/base/blas3.hpp
+++ b/include/graphblas/base/blas3.hpp
@@ -705,12 +705,13 @@ namespace grb {
 		Descriptor descr = descriptors::no_operation,
 		typename InputType,
 		typename OutputType,
-		typename RIT, typename CIT, typename NIT,
-		Backend backend
+		typename RIT_L, typename CIT_L, typename NIT_L,
+		typename RIT_A, typename CIT_A, typename NIT_A,
+		Backend implementation
 	>
 	RC tril(
-		Matrix< OutputType, backend, RIT, CIT, NIT > &L,
-		const Matrix< InputType, backend, RIT, CIT, NIT > &A,
+		Matrix< OutputType, implementation, RIT_L, CIT_L, NIT_L > & L,
+		const Matrix< InputType, implementation, RIT_A, CIT_A, NIT_A > & A,
 		const long int k,
 		const Phase &phase = Phase::EXECUTE,
 		const typename std::enable_if<
@@ -745,12 +746,13 @@ namespace grb {
 		Descriptor descr = descriptors::no_operation,
 		typename InputType,
 		typename OutputType,
-		typename RIT, typename CIT, typename NIT,
-		Backend backend
+		typename RIT_L, typename CIT_L, typename NIT_L,
+		typename RIT_A, typename CIT_A, typename NIT_A,
+		Backend implementation
 	>
 	RC tril(
-		Matrix< OutputType, backend, RIT, CIT, NIT > &L,
-		const Matrix< InputType, backend, RIT, CIT, NIT > &A,
+		Matrix< OutputType, implementation, RIT_L, CIT_L, NIT_L > & L,
+		const Matrix< InputType, implementation, RIT_A, CIT_A, NIT_A > & A,
 		const Phase &phase = Phase::EXECUTE,
 		const typename std::enable_if<
 			!grb::is_object< OutputType >::value &&
diff --git a/include/graphblas/hyperdags/blas3.hpp b/include/graphblas/hyperdags/blas3.hpp
index 5258fe608..341fbef6c 100644
--- a/include/graphblas/hyperdags/blas3.hpp
+++ b/include/graphblas/hyperdags/blas3.hpp
@@ -513,14 +513,22 @@ namespace grb {
 	 * \internal Dispatches to internal::tril_generic
 	 */
 
-	template< Descriptor descr = descriptors::no_operation, typename InputType, typename OutputType, typename RIT, typename CIT, typename NIT >
-	RC tril( Matrix< OutputType, hyperdags, RIT, CIT, NIT > & L,
-		const Matrix< InputType, hyperdags, RIT, CIT, NIT > & A,
+	template<
+		Descriptor descr = descriptors::no_operation,
+		typename InputType, typename OutputType,
+		typename RIT_L, typename CIT_L, typename NIT_L,
+		typename RIT_A, typename CIT_A, typename NIT_A
+	>
+	RC tril(
+		Matrix< OutputType, hyperdags, RIT_L, CIT_L, NIT_L > & L,
+		const Matrix< InputType, hyperdags, RIT_A, CIT_A, NIT_A > & A,
 		const long int k,
 		const Phase & phase = Phase::EXECUTE,
-		const typename std::enable_if< ! grb::is_object< OutputType >::value && ! grb::is_object< InputType >::value && std::is_convertible< InputType, OutputType >::value >::type * const =
-			nullptr ) {
-		
+		const typename std::enable_if< 
+			! grb::is_object< OutputType >::value && 
+			! grb::is_object< InputType >::value && 
+			std::is_convertible< InputType, OutputType >::value 
+			>::type * const = nullptr ) {
 #ifdef _DEBUG
 		std::cerr << "In grb::tril (hyperdags)\n";
 #endif
@@ -553,12 +561,21 @@ namespace grb {
 	 * This primitive is strictly equivalent to calling grb::tril( L, A, 0, phase ).
 	 * see grb::tril( L, A, k, phase ) for full description.
 	 */
-	template< Descriptor descr = descriptors::no_operation, typename InputType, typename OutputType, typename RIT, typename CIT, typename NIT >
-	RC tril( Matrix< OutputType, hyperdags, RIT, CIT, NIT > & L,
-		const Matrix< InputType, hyperdags, RIT, CIT, NIT > & A,
+	template<
+		Descriptor descr = descriptors::no_operation,
+		typename InputType, typename OutputType,
+		typename RIT_L, typename CIT_L, typename NIT_L,
+		typename RIT_A, typename CIT_A, typename NIT_A
+	>
+	RC tril( 
+		Matrix< OutputType, hyperdags, RIT_L, CIT_L, NIT_L > & L,
+		const Matrix< InputType, hyperdags, RIT_A, CIT_A, NIT_A > & A,
 		const Phase & phase = Phase::EXECUTE,
-		const typename std::enable_if< ! grb::is_object< OutputType >::value && ! grb::is_object< InputType >::value && std::is_convertible< InputType, OutputType >::value >::type * const =
-			nullptr ) {
+		const typename std::enable_if< 
+			! grb::is_object< OutputType >::value && 
+			! grb::is_object< InputType >::value && 
+			std::is_convertible< InputType, OutputType >::value 
+			>::type * const = nullptr ) {
 		return tril< descr >( L, A, 0, phase );
 	}
 
diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp
index 54fb052ea..82750b449 100644
--- a/include/graphblas/reference/blas3.hpp
+++ b/include/graphblas/reference/blas3.hpp
@@ -1402,8 +1402,18 @@ namespace grb {
 			return SUCCESS;
 		}
 
-		template< Descriptor descr, typename OutputType, typename InputType, typename RIT, typename CIT, typename NIT >
-		RC tril_generic( Matrix< OutputType, reference, RIT, CIT, NIT > & L, const Matrix< InputType, reference, RIT, CIT, NIT > & A, const long int k, const Phase & phase ) {
+		template<
+			Descriptor descr = descriptors::no_operation,
+			typename InputType, typename OutputType,
+			typename RIT_L, typename CIT_L, typename NIT_L,
+			typename RIT_A, typename CIT_A, typename NIT_A
+		>
+		RC tril_generic(
+			Matrix< OutputType, reference, RIT_L, CIT_L, NIT_L > & L,
+			const Matrix< InputType, reference, RIT_A, CIT_A, NIT_A > & A,
+			const long int k,
+			const Phase & phase ) {
+
 			const size_t m = descr & descriptors::transpose_matrix ? ncols( A ) : nrows( A );
 			const size_t n = descr & descriptors::transpose_matrix ? nrows( A ) : ncols( A );
 
@@ -1435,17 +1445,17 @@ namespace grb {
 				std::cout << "RESIZE phase: resize( L, " << nzc << " )\n";
 #endif
 				return resize( L, nzc );
-			} 
+			}
 
 			if( phase == Phase::EXECUTE ) {
 
 				const auto & L_crs_raw = internal::getCRS( L );
 				const auto & L_ccs_raw = internal::getCCS( L );
 				const size_t nzc = capacity( L );
-				
+
 				L_crs_raw.col_start[ 0 ] = 0;
 				L_ccs_raw.col_start[ 0 ] = 0;
-				
+
 				// Prefix sum computation into L.CRS.col_start
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
 #pragma omp parallel for simd default( none ) shared( A_raw, L_crs_raw, L_ccs_raw ) firstprivate( k, m )
@@ -1852,23 +1862,34 @@ namespace grb {
 	 *
 	 * \internal Dispatches to internal::tril_generic
 	 */
-	template< Descriptor descr = descriptors::no_operation, typename InputType, typename OutputType, typename RIT, typename CIT, typename NIT >
-	RC tril( Matrix< OutputType, reference, RIT, CIT, NIT > & L,
-		const Matrix< InputType, reference, RIT, CIT, NIT > & A,
+	template<
+		Descriptor descr = descriptors::no_operation,
+		typename InputType, typename OutputType,
+		typename RIT_L, typename CIT_L, typename NIT_L,
+		typename RIT_A, typename CIT_A, typename NIT_A
+	>
+	RC tril(
+		Matrix< OutputType, reference, RIT_L, CIT_L, NIT_L > & L,
+		const Matrix< InputType, reference, RIT_A, CIT_A, NIT_A > & A,
 		const long int k,
 		const Phase & phase = Phase::EXECUTE,
-		const typename std::enable_if< ! grb::is_object< OutputType >::value && ! grb::is_object< InputType >::value && std::is_convertible< InputType, OutputType >::value >::type * const =
-			nullptr ) {
-		(void)L;
-		(void)A;
-		(void)phase;
+		const typename std::enable_if<
+			! grb::is_object< OutputType >::value &&
+			! grb::is_object< InputType >::value &&
+			std::is_convertible< InputType, OutputType >::value
+		>::type * const = nullptr ) {
+
 #ifdef _DEBUG
 		std::cerr << "In grb::tril (reference)\n";
 #endif
 
 		// Static checks
-		NO_CAST_ASSERT( ( ! ( descr & descriptors::no_casting ) || std::is_same< InputType, OutputType >::value ), "grb::tril (reference)",
-			"input matrix and output matrix are incompatible for implicit casting" );
+		NO_CAST_ASSERT(
+			(   not ( descr & descriptors::no_casting )
+				|| std::is_same< InputType, OutputType >::value
+			), "grb::tril (reference)",
+			"input matrix and output matrix are incompatible for implicit casting"
+		);
 
 		return internal::tril_generic< descr >( L, A, k, phase );
 	}
@@ -1879,14 +1900,20 @@ namespace grb {
 	 * This primitive is strictly equivalent to calling grb::tril( L, A, 0, phase ).
 	 * see grb::tril( L, A, k, phase ) for full description.
 	 */
-	template< Descriptor descr = descriptors::no_operation, typename InputType, typename OutputType, typename RIT, typename CIT, typename NIT >
-	RC tril( Matrix< OutputType, reference, RIT, CIT, NIT > & L,
-		const Matrix< InputType, reference, RIT, CIT, NIT > & A,
+	template<
+		Descriptor descr = descriptors::no_operation,
+		typename InputType, typename OutputType,
+		typename RIT_L, typename CIT_L, typename NIT_L,
+		typename RIT_A, typename CIT_A, typename NIT_A
+	>
+	RC tril(
+		Matrix< OutputType, reference, RIT_L, CIT_L, NIT_L > & L,
+		const Matrix< InputType, reference, RIT_A, CIT_A, NIT_A > & A,
 		const Phase & phase = Phase::EXECUTE,
 		const typename std::enable_if< ! grb::is_object< OutputType >::value && ! grb::is_object< InputType >::value && std::is_convertible< InputType, OutputType >::value >::type * const =
 			nullptr ) {
 		return tril< descr >( L, A, 0, phase );
-		
+
 	}
 
 } // namespace grb

From a005d6495e90d1e63ff2c307a8b0127724b645fb Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Wed, 21 Jun 2023 11:30:51 +0200
Subject: [PATCH 42/63] More robust test with values verification

---
 tests/unit/tril.cpp | 65 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 53 insertions(+), 12 deletions(-)

diff --git a/tests/unit/tril.cpp b/tests/unit/tril.cpp
index 5d6deffdb..8acd32534 100644
--- a/tests/unit/tril.cpp
+++ b/tests/unit/tril.cpp
@@ -22,20 +22,56 @@
 
 using namespace grb;
 
+template< class Iterator >
+void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) {
+	if( rows < 64 || cols > 64 ) {
+		return;
+	}
+	std::cout << "Matrix \"" << name << "\" (" << rows << "x" << cols << "):" << std::endl << "[" << std::endl;
+	// os.precision( 3 );
+	for( size_t y = 0; y < rows; y++ ) {
+		os << std::string( 3, ' ' );
+		for( size_t x = 0; x < cols; x++ ) {
+			auto nnz_val = std::find_if( begin, end, [ y, x ]( const typename std::iterator_traits< Iterator >::value_type & a ) {
+				return a.first.first == y && a.first.second == x;
+			} );
+			if( nnz_val != end )
+				os << std::fixed << ( *nnz_val ).second;
+			else
+				os << '_';
+			os << " ";
+		}
+		os << std::endl;
+	}
+	os << "]" << std::endl;
+}
+
 template< typename D >
-RC is_lower_triangle( const grb::Matrix< D > & L ) {
+void printSparseMatrix( const grb::Matrix< D > & mat, const std::string & name = "", std::ostream & os = std::cout ) {
+	grb::wait( mat );
+	printSparseMatrixIterator( grb::nrows( mat ), grb::ncols( mat ), mat.cbegin(), mat.cend(), name, os );
+}
+
+template< typename D, Descriptor descr = descriptors::no_operation >
+D compute_value( size_t i, size_t j ) {
+	return descr & descriptors::transpose_matrix ? i + 2 * j : 2 * i + j;
+}
+
+template< Descriptor descr = descriptors::no_operation, typename D >
+RC check_obtained( const grb::Matrix< D > & L ) {
 	for( const auto & triple : L ) {
 		const size_t & i = triple.first.first;
 		const size_t & j = triple.first.second;
 		const size_t & v = triple.second;
 		if( i < j ) {
 			std::cout << "Unexpected entry at position ( " << i << ", " << j << " ) "
-					  << "-- only expected entries on the diagonal\n";
+					  << "-- only expected entries on the lower triangular part\n";
 			return RC::FAILED;
 		}
-		if( v != 2 ) {
+		const D expected_value = compute_value< D, descr >( i, j );
+		if( v != expected_value ) {
 			std::cout << "Unexpected value at position ( " << i << ", " << j << " ) "
-					  << "-- expected 2, found " << v << "\n";
+					  << "-- expected " << expected_value << ", found " << v << "\n";
 			return RC::FAILED;
 		}
 	}
@@ -55,43 +91,48 @@ void grb_program( const size_t & n, grb::RC & rc ) {
 	for( size_t k = 0; k < n; ++k ) {
 		I[ k ] = k % 3 == 0 ? k : k - 1;
 		J[ k ] = std::rand() % n;
-		V[ k ] = 2;
+		V[ k ] = compute_value< int >( I[ k ], J[ k ] );
 	}
 	assert( not grb::buildMatrixUnique( A, I, J, V, n, SEQUENTIAL ) );
 
 	{ // Mixed-domain matrix, should be successful
+		printSparseMatrix( A, "A" );
 		rc = grb::tril( L_A, A, Phase::RESIZE );
 		rc = rc ? rc : grb::tril( L_A, A, Phase::EXECUTE );
+		printSparseMatrix( L_A, "L_A" );
 
 		if( rc != SUCCESS ) {
 			std::cerr << "Error on test: mixed-domain matrix" << std::endl;
 			std::cerr << "Error on executing: " << grb::toString( rc ) << std::endl;
 			return;
 		}
-		rc = is_lower_triangle( L_A );
+		rc = check_obtained( L_A );
 		if( rc != SUCCESS ) {
 			std::cerr << "Error on test: mixed-domain matrix" << std::endl;
-			std::cerr << "Error on result, not a lower-triangle" << std::endl;
+			std::cerr << "Error on result, incorrect result" << std::endl;
 			return;
 		}
+		std::cout << std::flush << " -- Test passed: mixed-domain matrix" << std::flush << std::endl;
 	}
 	{ // Transpose_matrix descriptor, should be successful
+		printSparseMatrix( A, "A" );
 		rc = grb::tril< descriptors::transpose_matrix >( L_At, A, Phase::RESIZE );
 		rc = rc ? rc : grb::tril< descriptors::transpose_matrix >( L_At, A, Phase::EXECUTE );
+		printSparseMatrix( L_At, "L_At" );
 
 		if( rc != SUCCESS ) {
-			std::cerr << "Error on test: Transpose_matrix descriptor" << std::endl;
+			std::cerr << "Error on test: transpose_matrix descriptor" << std::endl;
 			std::cerr << "Error on executing: " << grb::toString( rc ) << std::endl;
 			return;
 		}
-		rc = is_lower_triangle< size_t >( L_At );
+		rc = check_obtained< descriptors::transpose_matrix >( L_At );
 		if( rc != SUCCESS ) {
-			std::cerr << "Error on test: Transpose_matrix descriptor" << std::endl;
-			std::cerr << "Error on result, not a lower-triangle" << std::endl;
+			std::cerr << "Error on test: transpose_matrix descriptor" << std::endl;
+			std::cerr << "Error on result, incorrect result" << std::endl;
 			return;
 		}
+		std::cout << std::flush << " -- Test passed: transpose_matrix descriptor" << std::flush << std::endl;
 	}
-	
 }
 
 int main( int argc, char ** argv ) {

From 37c3ef4800988ff18e53139c50a6cfa2816d49b3 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Wed, 21 Jun 2023 14:11:33 +0200
Subject: [PATCH 43/63] Add unit-test for grb::triu

---
 tests/unit/CMakeLists.txt |   4 +
 tests/unit/triu.cpp       | 165 ++++++++++++++++++++++++++++++++++++++
 tests/unit/unittests.sh   |  18 +++--
 3 files changed, 182 insertions(+), 5 deletions(-)
 create mode 100644 tests/unit/triu.cpp

diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 1f9ea7a23..b6fcbb4f6 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -89,6 +89,10 @@ add_grb_executables( tril tril.cpp
 	BACKENDS reference reference_omp hyperdags
 )
 
+add_grb_executables( triu triu.cpp
+	BACKENDS reference reference_omp hyperdags
+)
+
 add_grb_executables( eWiseMatrix eWiseMatrix.cpp
 	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
 )
diff --git a/tests/unit/triu.cpp b/tests/unit/triu.cpp
new file mode 100644
index 000000000..4cf66d97c
--- /dev/null
+++ b/tests/unit/triu.cpp
@@ -0,0 +1,165 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <sstream>
+
+#include <graphblas.hpp>
+
+using namespace grb;
+
+template< class Iterator >
+void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) {
+	if( rows > 64 || cols > 64 ) {
+		return;
+	}
+	std::cout << "Matrix \"" << name << "\" (" << rows << "x" << cols << "):" << std::endl << "[" << std::endl;
+	// os.precision( 3 );
+	for( size_t y = 0; y < rows; y++ ) {
+		os << std::string( 3, ' ' );
+		for( size_t x = 0; x < cols; x++ ) {
+			auto nnz_val = std::find_if( begin, end, [ y, x ]( const typename std::iterator_traits< Iterator >::value_type & a ) {
+				return a.first.first == y && a.first.second == x;
+			} );
+			if( nnz_val != end )
+				os << std::fixed << ( *nnz_val ).second;
+			else
+				os << '_';
+			os << " ";
+		}
+		os << std::endl;
+	}
+	os << "]" << std::endl;
+}
+
+template< typename D >
+void printSparseMatrix( const grb::Matrix< D > & mat, const std::string & name = "", std::ostream & os = std::cout ) {
+	grb::wait( mat );
+	printSparseMatrixIterator( grb::nrows( mat ), grb::ncols( mat ), mat.cbegin(), mat.cend(), name, os );
+}
+
+template< typename D, Descriptor descr = descriptors::no_operation >
+D compute_value( size_t i, size_t j ) {
+	return descr & descriptors::transpose_matrix ? i + 2 * j : 2 * i + j;
+}
+
+template< Descriptor descr = descriptors::no_operation, typename D >
+RC check_obtained( const grb::Matrix< D > & U ) {
+	for( const auto & triple : U ) {
+		const size_t & i = triple.first.first;
+		const size_t & j = triple.first.second;
+		const size_t & v = triple.second;
+		if( i > j ) {
+			std::cout << "Unexpected entry at position ( " << i << ", " << j << " ) "
+					  << "-- only expected entries on the upper triangular part\n";
+			return RC::FAILED;
+		}
+		const D expected_value = compute_value< D, descr >( i, j );
+		if( v != expected_value ) {
+			std::cout << "Unexpected value at position ( " << i << ", " << j << " ) "
+					  << "-- expected " << expected_value << ", found " << v << "\n";
+			return RC::FAILED;
+		}
+	}
+	return RC::SUCCESS;
+}
+
+void grb_program( const size_t & n, grb::RC & rc ) {
+	rc = RC::SUCCESS;
+
+	// Matrix initialisation
+	grb::Matrix< int > A( n, n );
+	grb::Matrix< size_t > U_A( n, n );  // U_A is the upper triangular matrix of A
+	grb::Matrix< size_t > U_At( n, n ); // U_At is the upper triangular matrix of A^T
+	size_t * I = new size_t[ n ];
+	size_t * J = new size_t[ n ];
+	double * V = new double[ n ];
+	for( size_t k = 0; k < n; ++k ) {
+		I[ k ] = k % 3 == 0 ? k : k - 1;
+		J[ k ] = std::rand() % n;
+		V[ k ] = compute_value< int >( I[ k ], J[ k ] );
+	}
+	assert( not grb::buildMatrixUnique( A, I, J, V, n, SEQUENTIAL ) );
+
+	{ // Mixed-domain matrix, should be successful
+		printSparseMatrix( A, "A" );
+		rc = grb::triu( U_A, A, Phase::RESIZE );
+		rc = rc ? rc : grb::triu( U_A, A, Phase::EXECUTE );
+		printSparseMatrix( U_A, "U_A" );
+
+		if( rc != SUCCESS ) {
+			std::cerr << "Error on test: mixed-domain matrix" << std::endl;
+			std::cerr << "Error on executing: " << grb::toString( rc ) << std::endl;
+			return;
+		}
+		rc = check_obtained( U_A );
+		if( rc != SUCCESS ) {
+			std::cerr << "Error on test: mixed-domain matrix" << std::endl;
+			std::cerr << "Error on result, incorrect result" << std::endl;
+			return;
+		}
+		std::cout << std::flush << " -- Test passed: mixed-domain matrix" << std::flush << std::endl;
+	}
+	{ // Transpose_matrix descriptor, should be successful
+		printSparseMatrix( A, "A" );
+		rc = grb::triu< descriptors::transpose_matrix >( U_At, A, Phase::RESIZE );
+		rc = rc ? rc : grb::triu< descriptors::transpose_matrix >( U_At, A, Phase::EXECUTE );
+		printSparseMatrix( U_At, "U_At" );
+
+		if( rc != SUCCESS ) {
+			std::cerr << "Error on test: transpose_matrix descriptor" << std::endl;
+			std::cerr << "Error on executing: " << grb::toString( rc ) << std::endl;
+			return;
+		}
+		rc = check_obtained< descriptors::transpose_matrix >( U_At );
+		if( rc != SUCCESS ) {
+			std::cerr << "Error on test: transpose_matrix descriptor" << std::endl;
+			std::cerr << "Error on result, incorrect result" << std::endl;
+			return;
+		}
+		std::cout << std::flush << " -- Test passed: transpose_matrix descriptor" << std::flush << std::endl;
+	}
+}
+
+int main( int argc, char ** argv ) {
+	// defaults
+	size_t n = 1000000;
+
+	// error checking
+	if( argc == 2 ) {
+		n = std::strtoul( argv[ 1 ], nullptr, 10 );
+	}
+	if( argc > 3 ) {
+		std::cerr << "Usage: " << argv[ 0 ] << "[n = " << n << "]\n";
+		return 1;
+	}
+
+	std::cout << "This is functional test " << argv[ 0 ] << "\n";
+	grb::Launcher< AUTOMATIC > launcher;
+	grb::RC out;
+	if( launcher.exec( &grb_program, n, out, false ) != SUCCESS ) {
+		std::cerr << "Launching test FAILED\n";
+		return 255;
+	}
+	if( out != SUCCESS ) {
+		std::cout << "Test FAILED (" << grb::toString( out ) << ")" << std::endl;
+		return out;
+	} else {
+		std::cout << "Test OK" << std::endl;
+		return 0;
+	}
+}
diff --git a/tests/unit/unittests.sh b/tests/unit/unittests.sh
index 13ed2186d..c2fa33110 100755
--- a/tests/unit/unittests.sh
+++ b/tests/unit/unittests.sh
@@ -208,11 +208,19 @@ for MODE in ${MODES}; do
 				grep 'Test OK' ${TEST_OUT_DIR}/ewiseapply_small_${MODE}_${BACKEND}_${P}_${T} || echo "Test FAILED"
 				echo " "
 
-				echo ">>>      [x]           [ ]       Testing grb::tril on matrices of doubles of size 100."
-				$runner ${TEST_BIN_DIR}/tril_${MODE}_${BACKEND} 14 &> ${TEST_OUT_DIR}/tril_${MODE}_${BACKEND}_${P}_${T}
-				head -1 ${TEST_OUT_DIR}/tril_${MODE}_${BACKEND}_${P}_${T}
-				grep 'Test OK' ${TEST_OUT_DIR}/tril_${MODE}_${BACKEND}_${P}_${T} || echo "Test FAILED"
-				echo " "
+				if [ "$BACKEND" = "reference" ] || [ "$BACKEND" = "reference_omp" ]|| [ "$BACKEND" = "hyperdags" ]; then
+					echo ">>>      [x]           [ ]       Testing grb::tril"
+					$runner ${TEST_BIN_DIR}/tril_${MODE}_${BACKEND} &> ${TEST_OUT_DIR}/tril_${MODE}_${BACKEND}_${P}_${T}
+					head -1 ${TEST_OUT_DIR}/tril_${MODE}_${BACKEND}_${P}_${T}
+					grep 'Test OK' ${TEST_OUT_DIR}/tril_${MODE}_${BACKEND}_${P}_${T} || echo "Test FAILED"
+					echo " "
+
+					echo ">>>      [x]           [ ]       Testing grb::triu"
+					$runner ${TEST_BIN_DIR}/triu_${MODE}_${BACKEND} &> ${TEST_OUT_DIR}/triu_${MODE}_${BACKEND}_${P}_${T}
+					head -1 ${TEST_OUT_DIR}/triu_${MODE}_${BACKEND}_${P}_${T}
+					grep 'Test OK' ${TEST_OUT_DIR}/triu_${MODE}_${BACKEND}_${P}_${T} || echo "Test FAILED"
+					echo " "
+				fi
 
 				echo ">>>      [x]           [ ]       Testing grb::eWiseApply using (+,0) on vectors"
 				echo "                                 of doubles of size 100."

From aaafcace87d7ab39e3c165decbe5ff6f95f63860 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Wed, 21 Jun 2023 14:35:58 +0200
Subject: [PATCH 44/63] Typo in unit-test tril.cpp

---
 tests/unit/tril.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/tril.cpp b/tests/unit/tril.cpp
index 8acd32534..c57b1aeab 100644
--- a/tests/unit/tril.cpp
+++ b/tests/unit/tril.cpp
@@ -24,7 +24,7 @@ using namespace grb;
 
 template< class Iterator >
 void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) {
-	if( rows < 64 || cols > 64 ) {
+	if( rows > 64 || cols > 64 ) {
 		return;
 	}
 	std::cout << "Matrix \"" << name << "\" (" << rows << "x" << cols << "):" << std::endl << "[" << std::endl;

From e0cd59684bbb22ab4c5d7711382e14c57d1761a1 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Wed, 21 Jun 2023 14:36:29 +0200
Subject: [PATCH 45/63] grb::tril documentation enhancement

---
 include/graphblas/base/blas3.hpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/graphblas/base/blas3.hpp b/include/graphblas/base/blas3.hpp
index 6779884c6..580887fe6 100644
--- a/include/graphblas/base/blas3.hpp
+++ b/include/graphblas/base/blas3.hpp
@@ -676,7 +676,8 @@ namespace grb {
 	}
 
 	/**
-	 * Return the lower triangular portion of a matrix, below the k-th diagonal.
+	 * Return the lower triangular portion of a matrix, strictly below the k-th 
+	 * diagonal (excluded).
 	 *
 	 * @tparam descr      The descriptor to be used (descriptors::no_operation
 	 * 					  if left unspecified).
@@ -685,8 +686,8 @@ namespace grb {
 	 * @tparam OutputType The type of the elements in the supplied ALP/GraphBLAS
 	 *                    matrix \a L.
 	 *
-	 * @param[out] L       The lower triangular portion of \a A, below the k-th
-	 * 					   diagonal.
+	 * @param[out] L       The lower triangular portion of \a A, strictly below
+	 * 					   the k-th diagonal.
 	 * @param[in]  A       Any ALP/GraphBLAS matrix.
 	 * @param[in]  k       The diagonal above which to zero out \a A.
 	 * @param[in]  phase   The #grb::Phase in which the primitive is to proceed.
@@ -736,10 +737,11 @@ namespace grb {
 
 	/**
 	 * Return the lower triangular portion of a matrix,
-	 * below main diagonal (excluded).
+	 * strictly below main diagonal (excluded).
 	 *
 	 * This primitive is strictly equivalent to calling
 	 * grb::tril( L, A, 0, phase ).
+	 * 
 	 * see grb::tril( L, A, k, phase ) for full description.
 	 */
 	template<

From 8ce27afca37133c48d1b04218ae6a7b8caa8042f Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Wed, 21 Jun 2023 14:37:12 +0200
Subject: [PATCH 46/63] grb::triu signature and documentation in base

---
 include/graphblas/base/blas3.hpp | 92 ++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)

diff --git a/include/graphblas/base/blas3.hpp b/include/graphblas/base/blas3.hpp
index 580887fe6..9f61d3b2e 100644
--- a/include/graphblas/base/blas3.hpp
+++ b/include/graphblas/base/blas3.hpp
@@ -723,6 +723,7 @@ namespace grb {
 	) {
 		(void) L;
 		(void) A;
+		(void) k;
 		(void) phase;
 #ifdef _DEBUG
 		std::cerr << "Selected backend does not implement grb::tril()\n";
@@ -765,6 +766,97 @@ namespace grb {
 		return tril< descr >( L, A, 0, phase );
 	}
 
+	/**
+	 * Return the upper triangular portion of a matrix, strictly above the k-th 
+	 * diagonal (excluded).
+	 *
+	 * @tparam descr      The descriptor to be used (descriptors::no_operation
+	 * 					  if left unspecified).
+	 * @tparam InputType  The type of the elements in the supplied ALP/GraphBLAS
+	 *                    matrix \a A.
+	 * @tparam OutputType The type of the elements in the supplied ALP/GraphBLAS
+	 *                    matrix \a U.
+	 *
+	 * @param[out] U       The upper triangular portion of \a A, strictly above 
+	 * 					   the k-th diagonal.
+	 * @param[in]  A       Any ALP/GraphBLAS matrix.
+	 * @param[in]  k       The diagonal above which to zero out \a A.
+	 * @param[in]  phase   The #grb::Phase in which the primitive is to proceed.
+	 *
+	 * @return grb::SUCCESS  When the call completed successfully.
+	 * @return grb::MISMATCH If the dimensions of \a U and \a A do not match.
+	 *
+ 	 * \parblock
+	 * \par Allowed descriptors
+	 * - transpose_matrix: Consider A^T instead of A.
+	 * - no_casting: If the types of \a T and \a A differ, the primitive
+	 * 				 will fail.
+	 * \endparblock
+	 */
+	template<
+		Descriptor descr = descriptors::no_operation,
+		typename InputType,
+		typename OutputType,
+		typename RIT_U, typename CIT_U, typename NIT_U,
+		typename RIT_A, typename CIT_A, typename NIT_A,
+		Backend implementation
+	>
+	RC triu(
+		Matrix< OutputType, implementation, RIT_U, CIT_U, NIT_U > & U,
+		const Matrix< InputType, implementation, RIT_A, CIT_A, NIT_A > & A,
+		const long int k,
+		const Phase &phase = Phase::EXECUTE,
+		const typename std::enable_if<
+			!grb::is_object< OutputType >::value &&
+			!grb::is_object< InputType >::value &&
+			std::is_convertible< InputType, OutputType >::value
+		>::type * const = nullptr
+	) {
+		(void) U;
+		(void) A;
+		(void) k;
+		(void) phase;
+#ifdef _DEBUG
+		std::cerr << "Selected backend does not implement grb::triu()\n";
+#endif
+#ifndef NDEBUG
+		const bool selected_backend_does_not_support_triu = false;
+		assert( selected_backend_does_not_support_triu );
+#endif
+		const RC ret = grb::clear( A );
+		return ret == SUCCESS ? UNSUPPORTED : ret;
+	}
+
+	/**
+	 * Return the upper triangular portion of a matrix,
+	 * strictly above main diagonal (excluded).
+	 *
+	 * This primitive is strictly equivalent to calling
+	 * grb::triu( U, A, 0, phase ) 
+	 * 
+	 * see grb::triu( U, A, k, phase ) for full description.
+	 */
+	template<
+		Descriptor descr = descriptors::no_operation,
+		typename InputType,
+		typename OutputType,
+		typename RIT_U, typename CIT_U, typename NIT_U,
+		typename RIT_A, typename CIT_A, typename NIT_A,
+		Backend implementation
+	>
+	RC triu(
+		Matrix< OutputType, implementation, RIT_U, CIT_U, NIT_U > & U,
+		const Matrix< InputType, implementation, RIT_A, CIT_A, NIT_A > & A,
+		const Phase &phase = Phase::EXECUTE,
+		const typename std::enable_if<
+			!grb::is_object< OutputType >::value &&
+			!grb::is_object< InputType >::value &&
+			std::is_convertible< InputType, OutputType >::value
+		>::type * const = nullptr
+	) {
+		return triu< descr >( U, A, 0, phase );
+	}
+
 	/**
 	 * @}
 	 */

From ce3a176acbe5ef8d7404caf61dcbf020dfdac7e6 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Wed, 21 Jun 2023 14:42:51 +0200
Subject: [PATCH 47/63] grb::triu implementation for reference+omp

---
 include/graphblas/base/blas3.hpp      |   4 +-
 include/graphblas/reference/blas3.hpp | 155 ++++++++++++++++++++------
 2 files changed, 123 insertions(+), 36 deletions(-)

diff --git a/include/graphblas/base/blas3.hpp b/include/graphblas/base/blas3.hpp
index 9f61d3b2e..841ba981a 100644
--- a/include/graphblas/base/blas3.hpp
+++ b/include/graphblas/base/blas3.hpp
@@ -732,7 +732,7 @@ namespace grb {
 		const bool selected_backend_does_not_support_tril = false;
 		assert( selected_backend_does_not_support_tril );
 #endif
-		const RC ret = grb::clear( A );
+		const RC ret = grb::clear( L );
 		return ret == SUCCESS ? UNSUPPORTED : ret;
 	}
 
@@ -823,7 +823,7 @@ namespace grb {
 		const bool selected_backend_does_not_support_triu = false;
 		assert( selected_backend_does_not_support_triu );
 #endif
-		const RC ret = grb::clear( A );
+		const RC ret = grb::clear( U );
 		return ret == SUCCESS ? UNSUPPORTED : ret;
 	}
 
diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp
index 82750b449..3961a7556 100644
--- a/include/graphblas/reference/blas3.hpp
+++ b/include/graphblas/reference/blas3.hpp
@@ -1403,12 +1403,13 @@ namespace grb {
 		}
 
 		template<
+			bool upper,
 			Descriptor descr = descriptors::no_operation,
 			typename InputType, typename OutputType,
 			typename RIT_L, typename CIT_L, typename NIT_L,
 			typename RIT_A, typename CIT_A, typename NIT_A
 		>
-		RC tril_generic(
+		RC trilu_generic(
 			Matrix< OutputType, reference, RIT_L, CIT_L, NIT_L > & L,
 			const Matrix< InputType, reference, RIT_A, CIT_A, NIT_A > & A,
 			const long int k,
@@ -1423,7 +1424,7 @@ namespace grb {
 			}
 
 #ifdef _DEBUG
-			std::cout << "In grb::internal::tril_generic( reference )\n";
+			std::cout << "In grb::internal::trilu_generic( reference )\n";
 #endif
 			const auto & A_raw = descr & descriptors::transpose_matrix ? internal::getCCS( A ) : internal::getCRS( A );
 
@@ -1433,12 +1434,16 @@ namespace grb {
 #pragma omp parallel for reduction( + : nzc ) default( none ) shared( A_raw ) firstprivate( k, m )
 #endif
 				for( size_t i = 0; i < m; ++i ) {
-					for( size_t A_k = A_raw.col_start[ i ]; A_k < A_raw.col_start[ i + 1 ]; ++A_k ) {
-						const size_t A_j = A_raw.row_index[ A_k ];
-						// If the value is in the lower triangle, increment the count
-						if( A_j <= i + k ) {
-							nzc += 1;
+					for( auto A_k = A_raw.col_start[ i ]; A_k < A_raw.col_start[ i + 1 ]; ++A_k ) {
+						const auto A_j = A_raw.row_index[ A_k ];
+						// If the value is in the appropriate triangle, skip it
+						if( not upper && A_j > i + k ) {
+							continue;
+						} 
+						if( upper && A_j < i - k ) {
+							continue;
 						}
+						nzc += 1;
 					}
 				}
 #ifdef _DEBUG
@@ -1462,10 +1467,13 @@ namespace grb {
 #endif
 				for( size_t i = 0; i < m; i++ ) {
 					size_t cumul = 0UL;
-					for( size_t A_k = A_raw.col_start[ i ]; A_k < A_raw.col_start[ i + 1 ]; ++A_k ) {
-						const size_t A_j = A_raw.row_index[ A_k ];
-						// If the value is in the lower triangle, increment the sum
-						if( A_j > i + k ) {
+					for( auto A_k = A_raw.col_start[ i ]; A_k < A_raw.col_start[ i + 1 ]; ++A_k ) {
+						const auto A_j = A_raw.row_index[ A_k ];
+						// If the value is in the appropriate triangle, skip it
+						if( not upper && A_j > i + k ) {
+							continue;
+						} 
+						if( upper && A_j < i - k ) {
 							continue;
 						}
 						cumul += 1;
@@ -1499,11 +1507,14 @@ namespace grb {
 #endif
 					// Update the CRS and CCS row indices and values
 					for( size_t i = start_row; i < end_row; i++ ) {
-						size_t L_k = L_crs_raw.col_start[ i ];
-						for( size_t A_k = A_raw.col_start[ i ]; A_k < A_raw.col_start[ i + 1 ]; ++A_k ) {
-							const size_t A_j = A_raw.row_index[ A_k ];
-							// If the value is in the upper triangle, skip it
-							if( A_j > i + k ) {
+						auto L_k = L_crs_raw.col_start[ i ];
+						for( auto A_k = A_raw.col_start[ i ]; A_k < A_raw.col_start[ i + 1 ]; ++A_k ) {
+							const auto A_j = A_raw.row_index[ A_k ];
+							// If the value is in the appropriate triangle, skip it
+							if( not upper && A_j > i + k ) {
+								continue;
+							} 
+							if( upper && A_j < i - k ) {
 								continue;
 							}
 
@@ -1850,17 +1861,18 @@ namespace grb {
 
 
 	/**
-	 * Return the lower triangular portion of a matrix, below the k-th diagonal.
+	 * Return the lower triangular portion of a matrix, strictly below 
+	 * the k-th diagonal.
 	 *
-	 * @param[out] L       The lower triangular portion of \a A, below the k-th
-	 * 					   diagonal.
+	 * @param[out] L       The lower triangular portion of \a A, strictly
+	 * 					   below the k-th diagonal.
 	 * @param[in]  A       Any ALP/GraphBLAS matrix.
 	 * @param[in]  k       The diagonal above which to zero out \a A.
 	 * @param[in]  phase   The #grb::Phase in which the primitive is to proceed.
 	 *
 	 * \internal Pattern matrices are allowed
 	 *
-	 * \internal Dispatches to internal::tril_generic
+	 * \internal Dispatches to internal::trilu_generic
 	 */
 	template<
 		Descriptor descr = descriptors::no_operation,
@@ -1873,29 +1885,29 @@ namespace grb {
 		const Matrix< InputType, reference, RIT_A, CIT_A, NIT_A > & A,
 		const long int k,
 		const Phase & phase = Phase::EXECUTE,
-		const typename std::enable_if<
-			! grb::is_object< OutputType >::value &&
-			! grb::is_object< InputType >::value &&
-			std::is_convertible< InputType, OutputType >::value
-		>::type * const = nullptr ) {
-
+		const typename std::enable_if< 
+			not grb::is_object< OutputType >::value && 
+			not grb::is_object< InputType >::value && 
+			std::is_convertible< InputType, OutputType >::value 
+		>::type * const = nullptr ) 
+	{
 #ifdef _DEBUG
 		std::cerr << "In grb::tril (reference)\n";
 #endif
 
 		// Static checks
-		NO_CAST_ASSERT(
-			(   not ( descr & descriptors::no_casting )
-				|| std::is_same< InputType, OutputType >::value
-			), "grb::tril (reference)",
+		NO_CAST_ASSERT( 
+			( not ( descr & descriptors::no_casting ) || 
+			std::is_same< InputType, OutputType >::value ), 
+			"grb::tril (reference)",
 			"input matrix and output matrix are incompatible for implicit casting"
 		);
 
-		return internal::tril_generic< descr >( L, A, k, phase );
+		return internal::trilu_generic< false, descr >( L, A, k, phase );
 	}
 
 	/**
-	 * Return the lower triangular portion of a matrix, below main diagonal.
+	 * Return the lower triangular portion of a matrix, strictly below main diagonal.
 	 *
 	 * This primitive is strictly equivalent to calling grb::tril( L, A, 0, phase ).
 	 * see grb::tril( L, A, k, phase ) for full description.
@@ -1910,10 +1922,85 @@ namespace grb {
 		Matrix< OutputType, reference, RIT_L, CIT_L, NIT_L > & L,
 		const Matrix< InputType, reference, RIT_A, CIT_A, NIT_A > & A,
 		const Phase & phase = Phase::EXECUTE,
-		const typename std::enable_if< ! grb::is_object< OutputType >::value && ! grb::is_object< InputType >::value && std::is_convertible< InputType, OutputType >::value >::type * const =
-			nullptr ) {
+		const typename std::enable_if< 
+			not grb::is_object< OutputType >::value && 
+			not grb::is_object< InputType >::value && 
+			std::is_convertible< InputType, OutputType >::value 
+		>::type * const = nullptr )
+	{
 		return tril< descr >( L, A, 0, phase );
+	}
 
+	/**
+	 * Return the upper triangular portion of a matrix, strictly above 
+	 * the k-th diagonal.
+	 *
+	 * @param[out] U       The upper triangular portion of \a A, strictly 
+	 * 					   above the k-th diagonal.
+	 * @param[in]  A       Any ALP/GraphBLAS matrix.
+	 * @param[in]  k       The diagonal above which to zero out \a A.
+	 * @param[in]  phase   The #grb::Phase in which the primitive is to proceed.
+	 *
+	 * \internal Pattern matrices are allowed
+	 *
+	 * \internal Dispatches to internal::trilu_generic
+	 */
+	template< 
+		Descriptor descr = descriptors::no_operation, 
+		typename InputType, typename OutputType, 
+		typename RIT_U, typename CIT_U, typename NIT_U, 
+		typename RIT_A, typename CIT_A, typename NIT_A
+	>
+	RC triu(
+		Matrix< OutputType, reference, RIT_U, CIT_U, NIT_U > & U,
+		const Matrix< InputType, reference, RIT_A, CIT_A, NIT_A > & A,
+		const long int k,
+		const Phase & phase = Phase::EXECUTE,
+		const typename std::enable_if< 
+			not grb::is_object< OutputType >::value && 
+			not grb::is_object< InputType >::value && 
+			std::is_convertible< InputType, OutputType >::value 
+		>::type * const = nullptr )
+	{
+#ifdef _DEBUG
+		std::cerr << "In grb::triu (reference)\n";
+#endif
+
+		// Static checks
+		NO_CAST_ASSERT( 
+			( not ( descr & descriptors::no_casting ) || 
+			std::is_same< InputType, OutputType >::value ), 
+			"grb::triu (reference)",
+			"input matrix and output matrix are incompatible for implicit casting"
+		);
+
+		// Add descriptors::transpose_matrix to descr
+		return internal::trilu_generic< true, descr >( U, A, k, phase );
+	}
+
+	/**
+	 * Return the upper triangular portion of a matrix, stricly above the main diagonal.
+	 *
+	 * This primitive is strictly equivalent to calling grb::triu( L, A, 0, phase ).
+	 * see grb::triu( L, A, k, phase ) for full description.
+	 */
+	template< 
+		Descriptor descr = descriptors::no_operation, 
+		typename InputType, typename OutputType, 
+		typename RIT_U, typename CIT_U, typename NIT_U, 
+		typename RIT_A, typename CIT_A, typename NIT_A
+	>
+	RC triu(
+		Matrix< OutputType, reference, RIT_U, CIT_U, NIT_U > & U,
+		const Matrix< InputType, reference, RIT_A, CIT_A, NIT_A > & A,
+		const Phase & phase = Phase::EXECUTE,
+		const typename std::enable_if< 
+			not grb::is_object< OutputType >::value && 
+			not grb::is_object< InputType >::value && 
+			std::is_convertible< InputType, OutputType >::value 
+		>::type * const = nullptr )
+	{
+		return triu< descr >( U, A, 0, phase );
 	}
 
 } // namespace grb

From 8efbcbf8613a2af4c404ced778729c8051cbcdf9 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Wed, 21 Jun 2023 17:11:48 +0200
Subject: [PATCH 48/63] Implementation of grb::triu for hyperdags

---
 include/graphblas/hyperdags/blas3.hpp     | 108 ++++++++++++++++++++--
 include/graphblas/hyperdags/hyperdags.hpp |   9 +-
 src/graphblas/hyperdags/hyperdags.cpp     |   3 +
 3 files changed, 108 insertions(+), 12 deletions(-)

diff --git a/include/graphblas/hyperdags/blas3.hpp b/include/graphblas/hyperdags/blas3.hpp
index 341fbef6c..75785ee17 100644
--- a/include/graphblas/hyperdags/blas3.hpp
+++ b/include/graphblas/hyperdags/blas3.hpp
@@ -500,17 +500,17 @@ namespace grb {
 	
 
 	/**
-	 * Return the lower triangular portion of a matrix, below the k-th diagonal.
+	 * Return the lower triangular portion of a matrix, strictly 
+	 * below the k-th diagonal.
 	 *
-	 * @param[out] L       The lower triangular portion of \a A, below the k-th
-	 * 					   diagonal.
+	 * @param[out] L       The lower triangular portion of \a A, strictly 
+	 * 				 	   below the k-th diagonal.
 	 * @param[in]  A       Any ALP/GraphBLAS matrix.
 	 * @param[in]  k       The diagonal above which to zero out \a A.
-	 * @param[in]  phase   The #grb::Phase in which the primitive is to proceed.
+	 * @param[in]  phase   The #grb::Phase in which the primitive 
+	 * 					   is to proceed.
 	 *
 	 * \internal Pattern matrices are allowed
-	 *
-	 * \internal Dispatches to internal::tril_generic
 	 */
 
 	template<
@@ -545,7 +545,9 @@ namespace grb {
 		std::array< uintptr_t, 1 > sourcesL{
 			getID( internal::getMatrix(A) )
 		};
-		std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(L) ) };
+		std::array< uintptr_t, 1 > destinations{ 
+			getID( internal::getMatrix(L) )
+		};
 		internal::hyperdags::generator.addOperation(
 			internal::hyperdags::TRIL_MATRIX,
 			sourcesP.begin(), sourcesP.end(),
@@ -556,9 +558,12 @@ namespace grb {
 	}
 
 	/**
-	 * Return the lower triangular portion of a matrix, below main diagonal.
+	 * Return the lower triangular portion of a matrix, strictly 
+	 * below the main diagonal.
 	 *
-	 * This primitive is strictly equivalent to calling grb::tril( L, A, 0, phase ).
+	 * This primitive is strictly equivalent to calling 
+	 * grb::tril( L, A, 0, phase ).
+	 * 
 	 * see grb::tril( L, A, k, phase ) for full description.
 	 */
 	template<
@@ -579,6 +584,91 @@ namespace grb {
 		return tril< descr >( L, A, 0, phase );
 	}
 
+	/**
+	 * Return the upper triangular portion of a matrix, strictly
+	 * above the k-th diagonal.
+	 *
+	 * @param[out] U       The upper triangular portion of \a A, strictly 
+	 * 					   above the k-th diagonal.
+	 * @param[in]  A       Any ALP/GraphBLAS matrix.
+	 * @param[in]  k       The diagonal above which to zero out \a A.
+	 * @param[in]  phase   The #grb::Phase in which the primitive 
+	 * 					   is to proceed.
+	 *
+	 * \internal Pattern matrices are allowed
+	 */
+
+	template<
+		Descriptor descr = descriptors::no_operation,
+		typename InputType, typename OutputType,
+		typename RIT_U, typename CIT_U, typename NIT_U,
+		typename RIT_A, typename CIT_A, typename NIT_A
+	>
+	RC triu(
+		Matrix< OutputType, hyperdags, RIT_U, CIT_U, NIT_U > & U,
+		const Matrix< InputType, hyperdags, RIT_A, CIT_A, NIT_A > & A,
+		const long int k,
+		const Phase & phase = Phase::EXECUTE,
+		const typename std::enable_if< 
+			! grb::is_object< OutputType >::value && 
+			! grb::is_object< InputType >::value && 
+			std::is_convertible< InputType, OutputType >::value 
+			>::type * const = nullptr ) {
+#ifdef _DEBUG
+		std::cerr << "In grb::triu (hyperdags)\n";
+#endif
+
+		const RC ret = triu< descr >( 
+			internal::getMatrix( U ), 
+			internal::getMatrix( A ), 
+			k, phase 
+		);
+		if( ret != SUCCESS ) { return ret; }
+		if( phase != EXECUTE ) { return ret; }
+		if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+		std::array< const void *, 0 > sourcesP{};
+		std::array< uintptr_t, 1 > sourcesL{
+			getID( internal::getMatrix(A) )
+		};
+		std::array< uintptr_t, 1 > destinations{ 
+			getID( internal::getMatrix(U) )
+		};
+		internal::hyperdags::generator.addOperation(
+			internal::hyperdags::TRIU_MATRIX,
+			sourcesP.begin(), sourcesP.end(),
+			sourcesL.begin(), sourcesL.end(),
+			destinations.begin(), destinations.end()
+		);
+		return ret;
+	}
+
+	/**
+	 * Return the lower triangular portion of a matrix, strictly 
+	 * above the main diagonal.
+	 *
+	 * This primitive is strictly equivalent to 
+	 * calling grb::triu( U, A, 0, phase ).
+	 * 
+	 * see grb::tril( U, A, k, phase ) for full description.
+	 */
+	template<
+		Descriptor descr = descriptors::no_operation,
+		typename InputType, typename OutputType,
+		typename RIT_U, typename CIT_U, typename NIT_U,
+		typename RIT_A, typename CIT_A, typename NIT_A
+	>
+	RC triu( 
+		Matrix< OutputType, hyperdags, RIT_U, CIT_U, NIT_U > & U,
+		const Matrix< InputType, hyperdags, RIT_A, CIT_A, NIT_A > & A,
+		const Phase & phase = Phase::EXECUTE,
+		const typename std::enable_if< 
+			! grb::is_object< OutputType >::value && 
+			! grb::is_object< InputType >::value && 
+			std::is_convertible< InputType, OutputType >::value 
+			>::type * const = nullptr ) {
+		return triu< descr >( U, A, 0, phase );
+	}
+
 } // end namespace grb
 
 #endif
diff --git a/include/graphblas/hyperdags/hyperdags.hpp b/include/graphblas/hyperdags/hyperdags.hpp
index 432ee9965..416fe4cfc 100644
--- a/include/graphblas/hyperdags/hyperdags.hpp
+++ b/include/graphblas/hyperdags/hyperdags.hpp
@@ -498,11 +498,13 @@ namespace grb {
 
 				FOLDR_SCALAR_MATRIX_MONOID,
 
-				TRIL_MATRIX
+				TRIL_MATRIX,
+
+				TRIU_MATRIX
 			};
 
 			/** \internal How many operation vertex types exist. */
-			const constexpr size_t numOperationVertexTypes = 111;
+			const constexpr size_t numOperationVertexTypes = 112;
 
 			/** \internal An array of all operation vertex types. */
 			const constexpr enum OperationVertexType
@@ -618,7 +620,8 @@ namespace grb {
 				FOLDL_SCALAR_MATRIX_MONOID,
 				FOLDR_SCALAR_MATRIX_MASK_MONOID,
 				FOLDR_SCALAR_MATRIX_MONOID,
-				TRIL_MATRIX
+				TRIL_MATRIX,
+				TRIU_MATRIX
 			};
 
 			/** \internal @returns The operation vertex type as a string. */
diff --git a/src/graphblas/hyperdags/hyperdags.cpp b/src/graphblas/hyperdags/hyperdags.cpp
index 746244741..bf574515d 100644
--- a/src/graphblas/hyperdags/hyperdags.cpp
+++ b/src/graphblas/hyperdags/hyperdags.cpp
@@ -395,6 +395,9 @@ std::string grb::internal::hyperdags::toString(
 		case TRIL_MATRIX:
 			return "tril( matrix, matrix )";
 
+		case TRIU_MATRIX:
+			return "triu( matrix, matrix )";
+
 	}
 	assert( false );
 	return "unknown operation";

From 671da323dbfdea7dc085d0811cd402de4a95bb1e Mon Sep 17 00:00:00 2001
From: byjtew <benjamin.lozes@protonmail.com>
Date: Tue, 23 May 2023 10:53:38 +0200
Subject: [PATCH 49/63] Triangle counting smoke test

---
 tests/smoke/CMakeLists.txt     |   5 +
 tests/smoke/smoketests.sh      |  17 ++
 tests/smoke/triangle_count.cpp | 337 +++++++++++++++++++++++++++++++++
 3 files changed, 359 insertions(+)
 create mode 100644 tests/smoke/triangle_count.cpp

diff --git a/tests/smoke/CMakeLists.txt b/tests/smoke/CMakeLists.txt
index 1f99446ee..091d0fe10 100644
--- a/tests/smoke/CMakeLists.txt
+++ b/tests/smoke/CMakeLists.txt
@@ -180,6 +180,11 @@ add_grb_executables( kcore_decomposition kcore_decomposition.cpp
 	BACKENDS reference reference_omp hyperdags nonblocking bsp1d hybrid
 )
 
+add_grb_executables( triangle_count triangle_count.cpp
+	ADDITIONAL_LINK_LIBRARIES test_utils_headers
+	BACKENDS reference reference_omp
+)
+
 # targets to list and build the test for this category
 get_property( smoke_tests_list GLOBAL PROPERTY tests_category_smoke )
 add_custom_target( "list_tests_category_smoke"
diff --git a/tests/smoke/smoketests.sh b/tests/smoke/smoketests.sh
index cd2025dab..5d2512869 100755
--- a/tests/smoke/smoketests.sh
+++ b/tests/smoke/smoketests.sh
@@ -366,6 +366,23 @@ for BACKEND in ${BACKENDS[@]}; do
 			fi
 			echo " "
 
+			echo ">>>      [x]           [ ]       Testing the Triangle couting algorithm."
+			if [ -f ${INPUT_DIR}/dwt_59.mtx ]; then
+				$runner ${TEST_BIN_DIR}/triangle_count_${BACKEND} ${INPUT_DIR}/dwt_59.mtx 30 &> ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log
+				head -1 ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log
+				if ! grep -q 'Test OK' ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log; then
+					echo "Test FAILED"
+				elif ! grep -q '11 iterations to converge' ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log; then
+					echo "Verification FAILED"
+					echo "Test FAILED"
+				else
+					echo "Test OK"
+				fi
+			else
+				echo "Test DISABLED: dwt_59.mtx was not found. To enable, please provide ${INPUT_DIR}/dwt_59.mtx"
+			fi
+			echo " "
+
 			if [ "$BACKEND" = "bsp1d" ] || [ "$BACKEND" = "hybrid" ]; then
 				echo "Additional standardised smoke tests not yet supported for the ${BACKEND} backend"
 				echo
diff --git a/tests/smoke/triangle_count.cpp b/tests/smoke/triangle_count.cpp
new file mode 100644
index 000000000..a1d7f5644
--- /dev/null
+++ b/tests/smoke/triangle_count.cpp
@@ -0,0 +1,337 @@
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <exception>
+#include <iostream>
+#include <vector>
+
+#include <inttypes.h>
+
+#include <graphblas/algorithms/triangle_count.hpp>
+#include <graphblas/utils/Timer.hpp>
+#include <graphblas/utils/parser.hpp>
+
+#include <graphblas.hpp>
+#include <utils/output_verification.hpp>
+
+/** Must be an integer type (int, long, unsigned, etc.) */
+using nonzeroval_t = long;
+
+typedef struct {
+	grb::Matrix< nonzeroval_t > A;
+	size_t expected_triangle_count;
+} input_t;
+
+typedef struct {
+	grb::RC rc = grb::RC::SUCCESS;
+	std::vector< size_t > triangleCounts; // Per algorithm
+	grb::utils::TimerResults times;
+	size_t data_in_local;
+} output_t;
+
+void grbProgram( const input_t & input, output_t & output ) {
+
+	// get user process ID.
+	const size_t s = grb::spmd< grb::config::default_backend >::pid();
+	assert( s < grb::spmd<>::nprocs() );
+
+	// get input n
+	grb::utils::Timer timer;
+	timer.reset();
+
+	std::cout << std::endl << "Running triangle counting with Burkhardt algorithm" << std::endl;
+	output.triangleCounts.push_back( 0 );
+	triangle_count( grb::algorithms::TriangleCountAlgorithm::Burkhardt, output.triangleCounts.back(), input.A );
+	timer.reset();
+	if( output.triangleCounts.back() != input.expected_triangle_count ) {
+		std::cerr << "ERROR: Burkhardt triangle count failed: expected " << input.expected_triangle_count << " but got " << output.triangleCounts.back() << std::endl;
+		output.rc = output.rc ? output.rc : grb::RC::FAILED;
+	} else {
+		std::cout << "Burkhardt triangle count succeeded: " << output.triangleCounts.back() << std::endl;
+	}
+
+	std::cout << std::endl << "Running triangle counting with Cohen algorithm" << std::endl;
+	output.triangleCounts.push_back( 0 );
+	output.rc = output.rc ? output.rc : output.rc ? output.rc : grb::algorithms::triangle_count( grb::algorithms::TriangleCountAlgorithm::Cohen, output.triangleCounts.back(), input.A );
+	timer.reset();
+	if( output.triangleCounts.back() != input.expected_triangle_count ) {
+		std::cerr << "ERROR: Cohen triangle count failed: expected " << input.expected_triangle_count << " but got " << output.triangleCounts.back() << std::endl;
+		output.rc = output.rc ? output.rc : grb::RC::FAILED;
+	} else {
+		std::cout << "Cohen triangle count succeeded: " << output.triangleCounts.back() << std::endl;
+	}
+
+	std::cout << std::endl << "Running triangle counting with Sandia_LL algorithm" << std::endl;
+	output.triangleCounts.push_back( 0 );
+	triangle_count( grb::algorithms::TriangleCountAlgorithm::Sandia_LL, output.triangleCounts.back(), input.A );
+	timer.reset();
+	if( output.triangleCounts.back() != input.expected_triangle_count ) {
+		std::cerr << "ERROR: Sandia_LL triangle count failed: expected " << input.expected_triangle_count << " but got " << output.triangleCounts.back() << std::endl;
+		output.rc = output.rc ? output.rc : grb::RC::FAILED;
+	} else {
+		std::cout << "Sandia_LL triangle count succeeded: " << output.triangleCounts.back() << std::endl;
+	}
+
+	std::cout << std::endl << "Running triangle counting with Sandia_LUT algorithm" << std::endl;
+	output.triangleCounts.push_back( 0 );
+	grb::algorithms::triangle_count( grb::algorithms::TriangleCountAlgorithm::Sandia_LUT, output.triangleCounts.back(), input.A );
+	timer.reset();
+	if( output.triangleCounts.back() != input.expected_triangle_count ) {
+		std::cerr << "ERROR: Sandia_LUT triangle count failed: expected " << input.expected_triangle_count << " but got " << output.triangleCounts.back() << std::endl;
+		output.rc = output.rc ? output.rc : grb::RC::FAILED;
+	} else {
+		std::cout << "Sandia_LUT triangle count succeeded: " << output.triangleCounts.back() << std::endl;
+	}
+
+	std::cout << std::endl << "Running triangle counting with Sandia_ULT algorithm" << std::endl;
+	output.triangleCounts.push_back( 0 );
+	triangle_count( grb::algorithms::TriangleCountAlgorithm::Sandia_ULT, output.triangleCounts.back(), input.A );
+	timer.reset();
+	if( output.triangleCounts.back() != input.expected_triangle_count ) {
+		std::cerr << "ERROR: Sandia_ULT triangle count failed: expected " << input.expected_triangle_count << " but got " << output.triangleCounts.back() << std::endl;
+		output.rc = output.rc ? output.rc : grb::RC::FAILED;
+	} else {
+		std::cout << "Sandia_ULT triangle count succeeded: " << output.triangleCounts.back() << std::endl;
+	}
+
+	std::cout << std::endl << "Running triangle counting with Sandia_UU algorithm" << std::endl;
+	output.triangleCounts.push_back( 0 );
+	triangle_count( grb::algorithms::TriangleCountAlgorithm::Sandia_UU, output.triangleCounts.back(), input.A );
+	timer.reset();
+	if( output.triangleCounts.back() != input.expected_triangle_count ) {
+		std::cerr << "ERROR: Sandia_UU triangle count failed: expected " << input.expected_triangle_count << " but got " << output.triangleCounts.back() << std::endl;
+		output.rc = output.rc ? output.rc : grb::RC::FAILED;
+	} else {
+		std::cout << "Sandia_UU triangle count succeeded: " << output.triangleCounts.back() << std::endl;
+	}
+
+	std::cout << std::endl;
+}
+
+int main( int argc, char ** argv ) {
+	(void)argc;
+	(void)argv;
+	constexpr size_t niterations = 1;
+
+	grb::Benchmarker< grb::EXEC_MODE::AUTOMATIC > benchmarker;
+	std::cout << "Test executable: " << argv[ 0 ] << std::endl;
+
+	// Check if we are testing on a file
+	if( argc != 1 && argc != 3 ) {
+		std::cerr << "Usage: \n\t" << argv[ 0 ] << " [ <graph_path> <expected_triangle_count> ]" << std::endl;
+		return 1;
+	}
+	bool test_on_file = argc == 3;
+	std::string file_to_test( test_on_file ? argv[ 1 ] : "" );
+	size_t expected_file_triangles = test_on_file ? std::stoul( argv[ 2 ] ) : 0;
+
+	/** Matrix A0:
+	 *    0  1  2  3
+	 * 0  _  X  X  _
+	 * 1  X  _  X  X
+	 * 2  X  X  _  X
+	 * 3  _  X  X  _
+	 *
+	 * Schema:
+	 *  0 ------ 1
+	 *  |      /
+	 *  |    /
+	 *  |  /
+	 *  2        3
+	 *
+	 * => 1 triangle
+	 */
+	{ // Undirected version
+		size_t expected_triangle_count = 1;
+		grb::Matrix< nonzeroval_t > A0_undirected( 4, 4 );
+		std::vector< size_t > A0_undirected_rows { { 0, 0, 1, 1, 2, 2 } };
+		std::vector< size_t > A0_undirected_cols { { 1, 2, 0, 2, 0, 1 } };
+		std::vector< nonzeroval_t > A0_undirected_values( A0_undirected_rows.size(), 1 );
+		grb::buildMatrixUnique( A0_undirected, A0_undirected_rows.data(), A0_undirected_cols.data(), A0_undirected_values.data(), A0_undirected_values.size(), grb::IOMode::PARALLEL );
+		input_t input_A0_undirected { A0_undirected, expected_triangle_count };
+		output_t output_A0_undirected;
+		std::cout << "-- Running test on A0_undirected" << std::endl;
+		grb::RC bench_rc = benchmarker.exec( &grbProgram, input_A0_undirected, output_A0_undirected, niterations, 1 );
+		if( bench_rc ) {
+			std::cerr << "ERROR during execution of A0_undirected: rc = " << bench_rc << std::endl;
+			return bench_rc;
+		} else if( output_A0_undirected.rc ) {
+			std::cerr << "Test failed: rc = " << output_A0_undirected.rc << std::endl;
+			return output_A0_undirected.rc;
+		}
+		std::cout << std::endl;
+	}
+
+	/** Matrix A1:
+	 *    0  1  2  3
+	 * 0  _  X  X  _
+	 * 1  X  _  X  X
+	 * 2  X  X  _  X
+	 * 3  _  X  X  _
+	 *
+	 * Schema:
+	 *  0 ------ 1
+	 *  |      / |
+	 *  |    /   |
+	 *  |  /     |
+	 *  2 ------ 3
+	 *
+	 * => 2 triangles
+	 */
+	{ // Undirected version
+		size_t expected_triangle_count = 2;
+		grb::Matrix< nonzeroval_t > A1_undirected( 4, 4 );
+		std::vector< size_t > A1_undirected_rows { { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3 } };
+		std::vector< size_t > A1_undirected_cols { { 1, 2, 0, 2, 3, 0, 1, 3, 1, 2 } };
+		std::vector< nonzeroval_t > A1_undirected_values( A1_undirected_rows.size(), 1 );
+		grb::buildMatrixUnique( A1_undirected, A1_undirected_rows.data(), A1_undirected_cols.data(), A1_undirected_values.data(), A1_undirected_values.size(), grb::IOMode::PARALLEL );
+		input_t input_A1_undirected { A1_undirected, expected_triangle_count };
+		output_t output_A1_undirected;
+		std::cout << "-- Running test on A1_undirected" << std::endl;
+		grb::RC bench_rc = benchmarker.exec( &grbProgram, input_A1_undirected, output_A1_undirected, niterations, 1 );
+		if( bench_rc ) {
+			std::cerr << "ERROR during execution of A1_undirected: rc = " << bench_rc << std::endl;
+			return bench_rc;
+		} else if( output_A1_undirected.rc ) {
+			std::cerr << "Test failed: rc = " << output_A1_undirected.rc << std::endl;
+			return output_A1_undirected.rc;
+		}
+		std::cout << std::endl;
+	}
+
+	/** Matrix A2:
+	 *    0  1  2  3
+	 * 0  _  X  X  X
+	 * 1  X  _  X  X
+	 * 2  X  X  _  X
+	 * 3  X  X  X  _
+	 *
+	 * Schema:
+	 *  0 ----- 1
+	 *  |  \  / |
+	 *  |   X   |
+	 *  | /  \  |
+	 *  2 ----- 3
+	 *
+	 * => 4 triangles
+	 */
+	{ // Undirected version
+		size_t expected_triangle_count = 4;
+		grb::Matrix< nonzeroval_t > A2_undirected( 4, 4 );
+		std::vector< size_t > A2_undirected_rows { { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 } };
+		std::vector< size_t > A2_undirected_cols { { 1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2 } };
+		std::vector< nonzeroval_t > A2_undirected_values( A2_undirected_rows.size(), 1 );
+		grb::buildMatrixUnique( A2_undirected, A2_undirected_rows.data(), A2_undirected_cols.data(), A2_undirected_values.data(), A2_undirected_values.size(), grb::IOMode::PARALLEL );
+		input_t input_A2_undirected { A2_undirected, expected_triangle_count };
+		output_t output_A2_undirected;
+		std::cout << "-- Running test on A2_undirected" << std::endl;
+		grb::RC bench_rc = benchmarker.exec( &grbProgram, input_A2_undirected, output_A2_undirected, niterations, 1 );
+		if( bench_rc ) {
+			std::cerr << "ERROR during execution of A2_undirected: rc = " << bench_rc << std::endl;
+			return bench_rc;
+		} else if( output_A2_undirected.rc ) {
+			std::cerr << "Test failed: rc = " << output_A2_undirected.rc << std::endl;
+			return output_A2_undirected.rc;
+		}
+		std::cout << std::endl;
+	}
+
+	/** Matrix A3:
+	 *
+	 * Schema:
+	 * 0 ----- 1 ----- 2
+	 * |  \  / |  \  / |
+	 * |   X   |   X   |
+	 * | /  \  | /  \  |
+	 * 3 ----- 4 ----- 5
+	 * |  \  / |  \  / |
+	 * |   X   |   X   |
+	 * | /  \  | /  \  |
+	 * 6 ----- 7 ----- 8
+	 *
+	 * note: 1-7, 3-5 are not connected
+	 *
+	 * => 24 triangles
+	 */
+	{ // Undirected version
+		size_t expected_triangle_count = 24;
+		grb::Matrix< nonzeroval_t > A3_undirected( 9, 9 );
+		std::vector< size_t > A3_undirected_rows { { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8 } };
+		std::vector< size_t > A3_undirected_cols { { 1, 2, 3, 4, 6, 0, 2, 3, 4, 5, 0, 1, 4, 5, 8, 0, 1, 4, 6, 7, 0, 1, 2, 3, 5, 6, 7, 8, 1, 2, 4, 7, 8, 0, 3, 4, 7, 8, 3, 4, 5, 6, 8, 2, 4, 5, 6, 7 } };
+		std::vector< nonzeroval_t > A3_undirected_values( A3_undirected_rows.size(), 1 );
+		grb::buildMatrixUnique( A3_undirected, A3_undirected_rows.data(), A3_undirected_cols.data(), A3_undirected_values.data(), A3_undirected_values.size(), grb::IOMode::PARALLEL );
+		input_t input_A3_undirected { A3_undirected, expected_triangle_count };
+		output_t output_A3_undirected;
+		std::cout << "-- Running test on A3_undirected" << std::endl;
+		grb::RC bench_rc = benchmarker.exec( &grbProgram, input_A3_undirected, output_A3_undirected, niterations, 1 );
+		if( bench_rc ) {
+			std::cerr << "ERROR during execution of A3_undirected: rc = " << bench_rc << std::endl;
+			return bench_rc;
+		} else if( output_A3_undirected.rc ) {
+			std::cerr << "Test failed: rc = " << output_A3_undirected.rc << std::endl;
+			return output_A3_undirected.rc;
+		}
+		std::cout << std::endl;
+	}
+
+	/** Given matrix in input **/
+	if( test_on_file ) {
+		std::cout << "-- Running test on file " << file_to_test << std::endl;
+
+		// Read matrix from file as a pattern matrix (i.e. no values), then convert it to a nonzeroval_t matrix
+		grb::utils::MatrixFileReader< void > reader( file_to_test, false, true );
+		size_t r = reader.n(), c = reader.m();
+		if( r != c ) {
+			std::cerr << "ERROR: matrix needs to be square" << std::endl;
+			return 1;
+		}
+		grb::Matrix< void > A_pattern( r, r );
+		grb::RC rc_build = buildMatrixUnique( A_pattern, reader.cbegin( grb::IOMode::PARALLEL ), reader.cend( grb::IOMode::PARALLEL ), grb::IOMode::PARALLEL );
+		if( rc_build != grb::RC::SUCCESS ) {
+			std::cerr << "ERROR during buildMatrixUnique of the pattern matrix: rc = " << rc_build << std::endl;
+			return 1;
+		}
+		grb::Matrix< nonzeroval_t > A( r, r );
+		std::vector< size_t > A_rows, A_cols;
+		A_rows.reserve( grb::nnz( A_pattern ) );
+		A_cols.reserve( grb::nnz( A_pattern ) );
+		for( const std::pair< size_t, size_t > & p : A_pattern ) {
+			A_rows.push_back( p.first );
+			A_cols.push_back( p.second );
+		}
+		std::vector< nonzeroval_t > A_values( grb::nnz( A_pattern ), static_cast< nonzeroval_t >( 1 ) );
+		rc_build = grb::buildMatrixUnique( A, A_rows.data(), A_cols.data(), A_values.data(), A_values.size(), grb::IOMode::PARALLEL );
+		if( rc_build != grb::RC::SUCCESS ) {
+			std::cerr << "ERROR during buildMatrixUnique of the integer matrix: rc = " << rc_build << std::endl;
+			return 1;
+		}
+		std::cout << "Matrix read successfully" << std::endl;
+		input_t input { A, expected_file_triangles };
+		output_t output;
+		grb::RC bench_rc = benchmarker.exec( &grbProgram, input, output, niterations, 1 );
+		if( bench_rc ) {
+			std::cerr << "ERROR during execution of file " << file_to_test << ": rc = " << bench_rc << std::endl;
+			return bench_rc;
+		} else if( output.rc ) {
+			std::cerr << "Test failed: rc = " << output.rc << std::endl;
+			return output.rc;
+		}
+	}
+
+	std::cout << "Test OK" << std::endl;
+
+	return 0;
+}

From be712156e7dcf7d514a40fbb509d788a5df29da2 Mon Sep 17 00:00:00 2001
From: byjtew <benjamin.lozes@protonmail.com>
Date: Tue, 23 May 2023 10:53:56 +0200
Subject: [PATCH 50/63] Triangle counting algorithms - stable

---
 .../graphblas/algorithms/triangle_count.hpp   | 589 ++++++++++++++++++
 1 file changed, 589 insertions(+)
 create mode 100644 include/graphblas/algorithms/triangle_count.hpp

diff --git a/include/graphblas/algorithms/triangle_count.hpp b/include/graphblas/algorithms/triangle_count.hpp
new file mode 100644
index 000000000..aa641c3ae
--- /dev/null
+++ b/include/graphblas/algorithms/triangle_count.hpp
@@ -0,0 +1,589 @@
+
+/*
+ *   Copyright 2023 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Implements the triangle counting and triangle enumeration algorithms.
+ *
+ * @author B. Lozes
+ * @date: May 10th, 2023
+ */
+
+#ifndef _H_GRB_TRIANGLE_ENUMERATION
+#define _H_GRB_TRIANGLE_ENUMERATION
+
+#include <numeric>
+#include <vector>
+
+#include <graphblas/utils/iterators/NonzeroIterator.hpp>
+
+#include <graphblas.hpp>
+
+constexpr bool DEBUG = false;
+
+namespace grb {
+
+	namespace algorithms {
+
+		namespace utils {
+			template< typename D >
+			bool is_diagonal_null( const grb::Matrix< D > & A ) {
+				return std::count_if( A.cbegin(), A.cend(), []( const std::pair< std::pair< size_t, size_t >, D > & e ) {
+					return e.first.first == e.first.second && e.second != static_cast< D >( 0 );
+				} ) == 0;
+			}
+
+			template< class Iterator >
+			void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) {
+				if( ! DEBUG )
+					return;
+				std::cout << "Matrix \"" << name << "\" (" << rows << "x" << cols << "):" << std::endl << "[" << std::endl;
+				if( rows > 1000 || cols > 1000 ) {
+					os << "   Matrix too large to print" << std::endl;
+				} else {
+					// os.precision( 3 );
+					for( size_t y = 0; y < rows; y++ ) {
+						os << std::string( 3, ' ' );
+						for( size_t x = 0; x < cols; x++ ) {
+							auto nnz_val = std::find_if( begin, end, [ y, x ]( const typename std::iterator_traits< Iterator >::value_type & a ) {
+								return a.first.first == y && a.first.second == x;
+							} );
+							if( nnz_val != end )
+								os << std::fixed << ( *nnz_val ).second;
+							else
+								os << '_';
+							os << " ";
+						}
+						os << std::endl;
+					}
+				}
+				os << "]" << std::endl;
+			}
+
+			template< typename D >
+			void printSparseMatrix( const grb::Matrix< D > & mat, const std::string & name = "", std::ostream & os = std::cout ) {
+				grb::wait( mat );
+				printSparseMatrixIterator( grb::nrows( mat ), grb::ncols( mat ), mat.cbegin(), mat.cend(), name, os );
+			}
+
+			void debugPrint( const std::string & msg, std::ostream & os = std::cout ) {
+				if( ! DEBUG )
+					return;
+				os << msg;
+			}
+
+			template< typename D >
+			bool tryGet( const grb::Matrix< D > & A, size_t i, size_t j, D & val ) {
+				auto found = std::find_if( A.cbegin(), A.cend(), [ i, j ]( const std::pair< std::pair< size_t, size_t >, D > & a ) {
+					return a.first.first == i && a.first.second == j;
+				} );
+				if( found == A.cend() )
+					return false;
+				val = ( *found ).second;
+				return true;
+			}
+		} // namespace utils
+
+		namespace {
+
+			template< typename Iterator >
+			class ConditionalIterator : public std::iterator< std::input_iterator_tag, typename std::iterator_traits< Iterator >::value_type > {
+
+			public:
+				typedef typename std::iterator_traits< Iterator >::value_type value_type;
+				typedef typename std::iterator_traits< Iterator >::pointer pointer;
+				typedef typename std::iterator_traits< Iterator >::reference reference;
+				typedef typename std::iterator_traits< Iterator >::iterator_category iterator_category;
+				typedef typename std::iterator_traits< Iterator >::difference_type difference_type;
+
+				ConditionalIterator( std::function< bool( typename Iterator::value_type ) > func, Iterator it, Iterator endbound ) : _iterator( it ), _endbound( endbound ), _condition( func ) {
+					while( _iterator != _endbound && ! _condition( *_iterator ) )
+						++( *this );
+				}
+
+				ConditionalIterator( const ConditionalIterator & other ) : _iterator( other._iterator ), _endbound( other._endbound ), _condition( other._condition ) {}
+
+				// Overload the dereference operator
+				value_type operator*() const {
+					return *_iterator;
+				}
+
+				// Overload the arrow operator
+				value_type operator->() const {
+					return *_iterator;
+				}
+
+				// Overload the increment operator
+				ConditionalIterator & operator++() {
+					do
+						++_iterator;
+					while( _iterator != _endbound && ! _condition( *_iterator ) );
+					return *this;
+				}
+
+				// Overload the inequality operator
+				bool operator!=( const ConditionalIterator & other ) const {
+					return _iterator != other._iterator;
+				}
+
+				// Overload the equality operator
+				bool operator==( const ConditionalIterator & other ) const {
+					return _iterator == other._iterator;
+				}
+
+			private:
+				Iterator _iterator, _endbound;
+				std::function< bool( typename Iterator::value_type ) > _condition;
+			};
+
+			template< typename D >
+			class MatrixConditionalAccessor {
+				typedef ConditionalIterator< typename grb::Matrix< D >::const_iterator > iterator_type;
+
+			public:
+				MatrixConditionalAccessor( const std::function< bool( std::pair< std::pair< size_t, size_t >, D > ) > & f, const grb::Matrix< D > & A ) :
+					_begin( f, A.cbegin(), A.cend() ), _end( f, A.cend(), A.cend() ) {}
+
+				MatrixConditionalAccessor( const MatrixConditionalAccessor & other ) = delete;
+
+				MatrixConditionalAccessor & operator=( const MatrixConditionalAccessor & other ) = delete;
+
+				virtual ~MatrixConditionalAccessor() {}
+
+				iterator_type cbegin() const {
+					return _begin;
+				}
+
+				iterator_type begin() const {
+					return cbegin();
+				}
+
+				iterator_type cend() const {
+					return _end;
+				}
+
+				iterator_type end() const {
+					return cend();
+				}
+
+			private:
+				iterator_type _begin, _end;
+			};
+
+			template< typename D >
+			class LUMatrixAccessor {
+			public:
+				LUMatrixAccessor( const grb::Matrix< D > & A ) :
+					_lower(
+						[]( const std::pair< std::pair< size_t, size_t >, D > & a ) {
+							return a.first.first > a.first.second;
+						},
+						A ),
+					_upper(
+						[]( const std::pair< std::pair< size_t, size_t >, D > & a ) {
+							return a.first.first < a.first.second;
+						},
+						A ) {}
+
+				MatrixConditionalAccessor< D > & lower() {
+					return _lower;
+				}
+
+				MatrixConditionalAccessor< D > & upper() {
+					return _upper;
+				}
+
+			private:
+				MatrixConditionalAccessor< D > _lower, _upper;
+			};
+
+			template< typename D, typename I, typename J >
+			grb::RC trilu( const grb::Matrix< D, grb::config::default_backend, I, J > & A,
+				grb::Matrix< D, grb::config::default_backend, I, J > & L,
+				grb::Matrix< D, grb::config::default_backend, I, J > & U ) {
+				//
+				grb::RC rc = grb::RC::SUCCESS;
+
+				// Create the custom accessor
+				grb::wait( A );
+				LUMatrixAccessor< D > luAccesor( A );
+
+				// Create the lower and upper matrices from the accessor
+				const std::vector< std::pair< std::pair< I, J >, D > > nnzs_lower( luAccesor.lower().cbegin(), luAccesor.lower().cend() );
+				grb::buildMatrixUnique( L, grb::utils::makeNonzeroIterator< I, J, D >( nnzs_lower.cbegin() ), grb::utils::makeNonzeroIterator< I, J, D >( nnzs_lower.cend() ), IOMode::PARALLEL );
+				const std::vector< std::pair< std::pair< I, J >, D > > nnzs_upper( luAccesor.upper().cbegin(), luAccesor.upper().cend() );
+				grb::buildMatrixUnique( U, grb::utils::makeNonzeroIterator< I, J, D >( nnzs_upper.cbegin() ), grb::utils::makeNonzeroIterator< I, J, D >( nnzs_upper.cend() ), IOMode::PARALLEL );
+
+				return rc;
+			}
+
+			template< typename InputType1, typename InputType2, typename OutputType >
+			RC _eWiseMul( Matrix< OutputType > & C,
+				const Matrix< InputType1 > & A,
+				const Matrix< InputType2 > & B,
+				const typename std::enable_if< ! grb::is_object< OutputType >::value && ! grb::is_object< InputType1 >::value && ! grb::is_object< InputType2 >::value >::type * const = nullptr ) {
+				grb::wait( A );
+				grb::wait( B );
+				grb::wait( C );
+				RC rc = grb::eWiseApply( C, A, B, grb::operators::mul< InputType1, InputType2, OutputType >(), RESIZE );
+				return rc ? rc : grb::eWiseApply( C, A, B, grb::operators::mul< InputType1, InputType2, OutputType >(), EXECUTE );
+			}
+
+			/**
+			 * @brief Reduce operation over a matrix.
+			 *
+			 * @tparam D         The type of the matrix.
+			 * @param A          The matrix to reduce.
+			 * @param result     The result of the reduction. Initial value taken from here.
+			 * @param op         The binary operator to use.
+			 * @return grb::RC   Returns #grb::SUCCESS upon succesful completion.
+			 */
+			template< typename D, typename T, typename Func >
+			grb::RC matrixReduce( const grb::Matrix< D > & A, T & result, const Func op ) {
+				std::pair< std::pair< size_t, size_t >, T > init = std::make_pair( std::make_pair( 0ul, 0ul ), result );
+				std::pair< std::pair< size_t, size_t >, T > accumulator = std::accumulate(
+					A.cbegin(), A.cend(), init, [ op ]( const std::pair< std::pair< size_t, size_t >, T > & a, const std::pair< std::pair< size_t, size_t >, D > & b ) {
+						return std::make_pair( a.first, op( a.second, b.second ) );
+					} );
+				grb::wait( A );
+				result = accumulator.second;
+				return grb::RC::SUCCESS;
+			}
+
+			template< typename D, typename T >
+			grb::RC matrixSumReduce( const grb::Matrix< D > & A, T & result ) {
+				return matrixReduce( A, result, []( T a, D b ) -> T {
+					return a + b;
+				} );
+			}
+
+			template< Descriptor descr = descriptors::no_operation, typename OutputType, typename InputType1, typename InputType2, class Semiring >
+			RC _mxm( Matrix< OutputType > & C, const Matrix< InputType1 > & A, const Matrix< InputType2 > & B, const Semiring & ring ) {
+				grb::wait( A );
+				grb::wait( B );
+				grb::wait( C );
+				auto rc = mxm< descr >( C, A, B, ring, RESIZE );
+				return rc ? rc : mxm< descr >( C, A, B, ring, EXECUTE );
+			}
+
+		} // namespace
+
+		enum class TriangleCountAlgorithm { Burkhardt, Cohen, Sandia_LL, Sandia_UU, Sandia_LUT, Sandia_ULT };
+
+		template< typename D >
+		RC triangle_count_burkhardt( size_t & u, const Matrix< D > & A ) {
+			static_assert( std::is_integral< D >::value, "Type D must be integral" );
+			RC rc = RC::SUCCESS;
+
+			utils::printSparseMatrix( A, "A" );
+			size_t rows = nrows( A ), cols = ncols( A );
+
+			// Compute B = A^2
+			const Semiring< grb::operators::add< D >, grb::operators::mul< D >, grb::identities::zero, grb::identities::one > semiring;
+			Matrix< D > B( rows, cols );
+
+			// FIXME: A-squared is not working
+			_mxm< descriptors::transpose_right >( B, A, A, semiring );
+			utils::printSparseMatrix( B, "A^2" );
+
+			// Compute C = A .* B
+			Matrix< D > C( rows, cols );
+			_eWiseMul( C, A, B );
+			utils::printSparseMatrix( C, "(A^2) .* A" );
+
+			D tmpU = static_cast< D >( 0 );
+			matrixSumReduce( C, tmpU );
+			utils::debugPrint( "sum (sum ((L * U) .* A)) = " + std::to_string( tmpU ) + "\n" );
+
+			tmpU /= 6;
+			utils::debugPrint( "sum (sum ((L * U) .* A)) / 6 = " + std::to_string( tmpU ) + "\n" );
+
+			u = (size_t)tmpU;
+
+			// done
+			return rc;
+		}
+
+		template< typename D >
+		RC triangle_count_cohen( size_t & u, const Matrix< D > & A ) {
+			static_assert( std::is_integral< D >::value, "Type D must be integral" );
+			RC rc = RC::SUCCESS;
+
+			utils::printSparseMatrix( A, "A" );
+			size_t rows = nrows( A ), cols = ncols( A );
+
+			// Split A into L (lower) and U (upper) triangular matrices
+			Matrix< D > L( rows, cols ), U( rows, cols );
+			rc = rc ? rc : trilu( A, L, U );
+			utils::printSparseMatrix( L, "L" );
+			utils::printSparseMatrix( U, "U" );
+
+			// Compute B = L * U
+			Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one > semiring;
+			Matrix< D > B( rows, cols, nnz( A ) );
+			_mxm( B, L, U, semiring );
+			utils::printSparseMatrix( B, "L * U" );
+
+			// Compute C = B .* A
+			Matrix< D > C( rows, cols );
+			_eWiseMul( C, B, A );
+			utils::printSparseMatrix( C, "(L * U) .* A" );
+
+			D tmpU = static_cast< D >( 0 );
+			matrixSumReduce( C, tmpU );
+			utils::debugPrint( "sum (sum ((L * U) .* A)) = " + std::to_string( tmpU ) + "\n" );
+
+			tmpU /= 2;
+			utils::debugPrint( "sum (sum ((L * U) .* A)) / 2 = " + std::to_string( tmpU ) + "\n" );
+
+			u = (size_t)tmpU;
+
+			// done
+			return rc;
+		}
+
+		template< typename D >
+		RC triangle_count_sandia_ll( size_t & u, const Matrix< D > & A ) {
+			static_assert( std::is_integral< D >::value, "Type D must be integral" );
+			RC rc = RC::SUCCESS;
+
+			utils::printSparseMatrix( A, "A" );
+			size_t rows = nrows( A ), cols = ncols( A );
+
+			// Split A into L (lower) and U (upper) triangular matrices
+			Matrix< D > L( rows, cols ), _( rows, cols );
+			rc = rc ? rc : trilu( A, L, _ );
+			utils::printSparseMatrix( L, "L" );
+
+			// Compute B = L * L
+			Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one > semiring;
+			Matrix< D > B( rows, cols, nnz( A ) );
+			_mxm( B, L, L, semiring );
+			utils::printSparseMatrix( B, "L * L" );
+
+			// Compute C = L .* B
+			Matrix< D > C( rows, cols );
+			_eWiseMul( C, B, L );
+			utils::printSparseMatrix( C, "(L * L) .* L" );
+
+			D tmpU = static_cast< D >( 0 );
+			matrixSumReduce( C, tmpU );
+			utils::debugPrint( "sum (sum ((L * L) .* L)) = " + std::to_string( tmpU ) + "\n" );
+
+			u = (size_t)tmpU;
+
+			// done
+			return rc;
+		}
+
+		template< typename D >
+		RC triangle_count_sandia_uu( size_t & u, const Matrix< D > & A ) {
+			static_assert( std::is_integral< D >::value, "Type D must be integral" );
+			RC rc = RC::SUCCESS;
+
+			utils::printSparseMatrix( A, "A" );
+			size_t rows = nrows( A ), cols = ncols( A );
+
+			// Split A into L (lower) and U (upper) triangular matrices
+			Matrix< D > _( rows, cols ), U( rows, cols );
+			rc = rc ? rc : trilu( A, _, U );
+			utils::printSparseMatrix( U, "U" );
+
+			// Compute B = U * U
+			Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one > semiring;
+			Matrix< D > B( rows, cols, nnz( A ) );
+			_mxm( B, U, U, semiring );
+			utils::printSparseMatrix( B, "U * U" );
+
+			// Compute C = U .* B
+			Matrix< D > C( rows, cols );
+			_eWiseMul( C, B, U );
+			utils::printSparseMatrix( C, "(U * U) .* U" );
+
+			D tmpU = static_cast< D >( 0 );
+			matrixSumReduce( C, tmpU );
+			utils::debugPrint( "sum (sum ((U * U) .* U)) = " + std::to_string( tmpU ) + "\n" );
+
+			u = (size_t)tmpU;
+
+			// done
+			return rc;
+		}
+
+		template< typename D >
+		RC triangle_count_sandia_lut( size_t & u, const Matrix< D > & A ) {
+			static_assert( std::is_integral< D >::value, "Type D must be integral" );
+			RC rc = RC::SUCCESS;
+
+			utils::printSparseMatrix( A, "A" );
+			size_t rows = nrows( A ), cols = ncols( A );
+
+			// Split A into L (lower) and U (upper) triangular matrices
+			Matrix< D > L( rows, cols ), U( rows, cols );
+			rc = rc ? rc : trilu( A, L, U );
+			utils::printSparseMatrix( L, "L" );
+			utils::printSparseMatrix( U, "U" );
+
+			// Compute B = L * U
+			Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one > semiring;
+			Matrix< D > B( rows, cols, nnz( A ) );
+			_mxm< descriptors::transpose_right >( B, L, U, semiring );
+			utils::printSparseMatrix( B, "L * U" );
+
+			// Compute C = L .* B
+			Matrix< D > C( rows, cols );
+			_eWiseMul( C, B, L );
+			utils::printSparseMatrix( C, "(L * U) .* L" );
+
+			D tmpU = static_cast< D >( 0 );
+			matrixSumReduce( C, tmpU );
+			utils::debugPrint( "sum (sum ((L * U) .* L)) = " + std::to_string( tmpU ) + "\n" );
+
+			u = (size_t)tmpU;
+
+			// done
+			return rc;
+		}
+
+		template< typename D >
+		RC triangle_count_sandia_ult( size_t & u, const Matrix< D > & A ) {
+			static_assert( std::is_integral< D >::value, "Type D must be integral" );
+			RC rc = RC::SUCCESS;
+
+			utils::printSparseMatrix( A, "A" );
+			size_t rows = nrows( A ), cols = ncols( A );
+
+			// Split A into L (lower) and U (upper) triangular matrices
+			Matrix< D > L( rows, cols ), U( rows, cols );
+			rc = rc ? rc : trilu( A, L, U );
+			utils::printSparseMatrix( L, "L" );
+			utils::printSparseMatrix( U, "U" );
+
+			// Compute B = U * L
+			Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one > semiring;
+			Matrix< D > B( rows, cols, nnz( A ) );
+			_mxm( B, U, L, semiring );
+			utils::printSparseMatrix( B, "U * L" );
+
+			// Compute C = U .* B
+			Matrix< D > C( rows, cols );
+			_eWiseMul( C, B, U );
+			utils::printSparseMatrix( C, "(L * U) .* U" );
+
+			D tmpU = static_cast< D >( 0 );
+			matrixSumReduce( C, tmpU );
+			utils::debugPrint( "sum (sum ((L * U) .* U)) = " + std::to_string( tmpU ) + "\n" );
+
+			u = (size_t)tmpU;
+
+			// done
+			return rc;
+		}
+
+		/**
+		 * Given a graph, indicates how many triangles are contained within.
+		 *
+		 * This implementation is based on the masked matrix multiplication kernel.
+		 *
+		 * @param[out]    n    The number of triangles. Any prior contents will be ignored.
+		 * @param[in]     A    The input graph.
+		 *
+		 *
+		 * @returns #grb::SUCCESS  When the computation completes successfully.
+		 * @returns #grb::MISMATCH ?
+		 * @returns #grb::ILLEGAL  ?
+		 * @returns #grb::PANIC    If an unrecoverable error has been encountered. The
+		 *                         output as well as the state of ALP/GraphBLAS is
+		 *                         undefined.
+		 *
+		 * \par Performance semantics
+		 *
+		 *   -# This function does not allocate nor free dynamic memory, nor shall it
+		 *      make any system calls.
+		 *
+		 * For performance semantics regarding work, inter-process data movement,
+		 * intra-process data movement, synchronisations, and memory use, please see
+		 * the specification of the ALP primitives this function relies on. These
+		 * performance semantics, with the exception of getters such as #grb::nnz, are
+		 * specific to the backend selected during compilation.
+		 */
+		template< typename D >
+		RC triangle_count( TriangleCountAlgorithm Algo, size_t & u, const Matrix< D > & A_constant ) {
+			auto A = A_constant;
+			// Static assertions
+			static_assert( std::is_integral< D >::value, "Type D must be integral" );
+			// Dynamic assertions
+			if( grb::nrows( A ) != grb::ncols( A ) ) {
+				std::cerr << "A must be square" << std::endl;
+				return RC::ILLEGAL;
+			}
+			if( ! utils::is_diagonal_null( A ) ) {
+				// Create a mask with null values on the diagonal, and ones everywhere else
+				grb::Matrix< D > M( grb::nrows( A ), grb::ncols( A ) );
+				size_t nnz_mask = grb::nrows( A ) * grb::ncols( A ) - grb::nrows( A );
+				std::vector< size_t > I(nnz_mask), J( nnz_mask );
+				std::vector< D > V( nnz_mask, static_cast< D >(1) );
+				for( size_t i = 0, k = 0; i < grb::nrows( A ); ++i ) {
+					for( size_t j = 0; j < grb::ncols( A ); ++j ) {
+						if( i == j ) continue;
+						I[k] = i;
+						J[k] = j;
+						++k;
+					}
+				}
+
+				buildMatrixUnique( M, I.data(), J.data(), V.data(), V.size(), grb::IOMode::PARALLEL );
+				// Multiply A with the mask
+				Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one > semiring;
+				utils::printSparseMatrix( A, "A before diagonal annihilation" );
+				utils::printSparseMatrix( M, "Mask" );
+				_eWiseMul( A, A_constant, M );
+				utils::printSparseMatrix( A, "A after diagonal annihilation" );
+				assert( utils::is_diagonal_null( A ) );
+			}
+
+			switch( Algo ) {
+				case TriangleCountAlgorithm::Burkhardt:
+					utils::debugPrint( "-- Burkhardt\n" );
+					return triangle_count_burkhardt( u, A );
+				case TriangleCountAlgorithm::Cohen:
+					utils::debugPrint( "-- Cohen\n" );
+					return triangle_count_cohen( u, A );
+				case TriangleCountAlgorithm::Sandia_LL:
+					utils::debugPrint( "-- Sandia LL\n" );
+					return triangle_count_sandia_ll( u, A );
+				case TriangleCountAlgorithm::Sandia_UU:
+					utils::debugPrint( "-- Sandia UU\n" );
+					return triangle_count_sandia_uu( u, A );
+				case TriangleCountAlgorithm::Sandia_LUT:
+					utils::debugPrint( "-- Sandia LUT\n" );
+					return triangle_count_sandia_lut( u, A );
+				case TriangleCountAlgorithm::Sandia_ULT:
+					utils::debugPrint( "-- Sandia ULT\n" );
+					return triangle_count_sandia_ult( u, A );
+				default:
+					utils::debugPrint( "-- Unknown\n", std::cerr );
+					return RC::FAILED;
+			}
+		}
+
+	} // namespace algorithms
+
+} // namespace grb
+
+#endif // _H_GRB_TRIANGLE_ENUMERATION

From 8cef1ab03162e6c5f608ef8f8d34c9e0b9435fd0 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Sat, 10 Jun 2023 11:53:45 +0200
Subject: [PATCH 51/63] Test and algorithm signature refactoring

---
 .../graphblas/algorithms/triangle_count.hpp   | 477 ++++++------------
 tests/smoke/triangle_count.cpp                | 473 ++++++++---------
 2 files changed, 351 insertions(+), 599 deletions(-)

diff --git a/include/graphblas/algorithms/triangle_count.hpp b/include/graphblas/algorithms/triangle_count.hpp
index aa641c3ae..d2800c906 100644
--- a/include/graphblas/algorithms/triangle_count.hpp
+++ b/include/graphblas/algorithms/triangle_count.hpp
@@ -27,6 +27,7 @@
 #ifndef _H_GRB_TRIANGLE_ENUMERATION
 #define _H_GRB_TRIANGLE_ENUMERATION
 
+#include <map>
 #include <numeric>
 #include <vector>
 
@@ -34,24 +35,16 @@
 
 #include <graphblas.hpp>
 
-constexpr bool DEBUG = false;
+constexpr bool Debug = true;
 
 namespace grb {
 
 	namespace algorithms {
 
 		namespace utils {
-			template< typename D >
-			bool is_diagonal_null( const grb::Matrix< D > & A ) {
-				return std::count_if( A.cbegin(), A.cend(), []( const std::pair< std::pair< size_t, size_t >, D > & e ) {
-					return e.first.first == e.first.second && e.second != static_cast< D >( 0 );
-				} ) == 0;
-			}
 
 			template< class Iterator >
 			void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) {
-				if( ! DEBUG )
-					return;
 				std::cout << "Matrix \"" << name << "\" (" << rows << "x" << cols << "):" << std::endl << "[" << std::endl;
 				if( rows > 1000 || cols > 1000 ) {
 					os << "   Matrix too large to print" << std::endl;
@@ -75,28 +68,20 @@ namespace grb {
 				os << "]" << std::endl;
 			}
 
-			template< typename D >
+			template< bool debug, typename D >
 			void printSparseMatrix( const grb::Matrix< D > & mat, const std::string & name = "", std::ostream & os = std::cout ) {
+				if( ! debug )
+					return;
 				grb::wait( mat );
 				printSparseMatrixIterator( grb::nrows( mat ), grb::ncols( mat ), mat.cbegin(), mat.cend(), name, os );
 			}
 
-			void debugPrint( const std::string & msg, std::ostream & os = std::cout ) {
-				if( ! DEBUG )
+			template< bool debug >
+			void printf( const std::string & msg, std::ostream & os = std::cout ) {
+				if( ! debug )
 					return;
 				os << msg;
 			}
-
-			template< typename D >
-			bool tryGet( const grb::Matrix< D > & A, size_t i, size_t j, D & val ) {
-				auto found = std::find_if( A.cbegin(), A.cend(), [ i, j ]( const std::pair< std::pair< size_t, size_t >, D > & a ) {
-					return a.first.first == i && a.first.second == j;
-				} );
-				if( found == A.cend() )
-					return false;
-				val = ( *found ).second;
-				return true;
-			}
 		} // namespace utils
 
 		namespace {
@@ -232,281 +217,78 @@ namespace grb {
 				return rc;
 			}
 
-			template< typename InputType1, typename InputType2, typename OutputType >
-			RC _eWiseMul( Matrix< OutputType > & C,
-				const Matrix< InputType1 > & A,
-				const Matrix< InputType2 > & B,
-				const typename std::enable_if< ! grb::is_object< OutputType >::value && ! grb::is_object< InputType1 >::value && ! grb::is_object< InputType2 >::value >::type * const = nullptr ) {
-				grb::wait( A );
-				grb::wait( B );
-				grb::wait( C );
-				RC rc = grb::eWiseApply( C, A, B, grb::operators::mul< InputType1, InputType2, OutputType >(), RESIZE );
-				return rc ? rc : grb::eWiseApply( C, A, B, grb::operators::mul< InputType1, InputType2, OutputType >(), EXECUTE );
-			}
-
-			/**
-			 * @brief Reduce operation over a matrix.
-			 *
-			 * @tparam D         The type of the matrix.
-			 * @param A          The matrix to reduce.
-			 * @param result     The result of the reduction. Initial value taken from here.
-			 * @param op         The binary operator to use.
-			 * @return grb::RC   Returns #grb::SUCCESS upon succesful completion.
-			 */
-			template< typename D, typename T, typename Func >
-			grb::RC matrixReduce( const grb::Matrix< D > & A, T & result, const Func op ) {
-				std::pair< std::pair< size_t, size_t >, T > init = std::make_pair( std::make_pair( 0ul, 0ul ), result );
-				std::pair< std::pair< size_t, size_t >, T > accumulator = std::accumulate(
-					A.cbegin(), A.cend(), init, [ op ]( const std::pair< std::pair< size_t, size_t >, T > & a, const std::pair< std::pair< size_t, size_t >, D > & b ) {
-						return std::make_pair( a.first, op( a.second, b.second ) );
-					} );
-				grb::wait( A );
-				result = accumulator.second;
-				return grb::RC::SUCCESS;
-			}
-
-			template< typename D, typename T >
-			grb::RC matrixSumReduce( const grb::Matrix< D > & A, T & result ) {
-				return matrixReduce( A, result, []( T a, D b ) -> T {
-					return a + b;
-				} );
-			}
-
-			template< Descriptor descr = descriptors::no_operation, typename OutputType, typename InputType1, typename InputType2, class Semiring >
-			RC _mxm( Matrix< OutputType > & C, const Matrix< InputType1 > & A, const Matrix< InputType2 > & B, const Semiring & ring ) {
-				grb::wait( A );
-				grb::wait( B );
-				grb::wait( C );
-				auto rc = mxm< descr >( C, A, B, ring, RESIZE );
-				return rc ? rc : mxm< descr >( C, A, B, ring, EXECUTE );
-			}
-
 		} // namespace
 
-		enum class TriangleCountAlgorithm { Burkhardt, Cohen, Sandia_LL, Sandia_UU, Sandia_LUT, Sandia_ULT };
-
-		template< typename D >
-		RC triangle_count_burkhardt( size_t & u, const Matrix< D > & A ) {
-			static_assert( std::is_integral< D >::value, "Type D must be integral" );
-			RC rc = RC::SUCCESS;
-
-			utils::printSparseMatrix( A, "A" );
-			size_t rows = nrows( A ), cols = ncols( A );
-
-			// Compute B = A^2
-			const Semiring< grb::operators::add< D >, grb::operators::mul< D >, grb::identities::zero, grb::identities::one > semiring;
-			Matrix< D > B( rows, cols );
-
-			// FIXME: A-squared is not working
-			_mxm< descriptors::transpose_right >( B, A, A, semiring );
-			utils::printSparseMatrix( B, "A^2" );
-
-			// Compute C = A .* B
-			Matrix< D > C( rows, cols );
-			_eWiseMul( C, A, B );
-			utils::printSparseMatrix( C, "(A^2) .* A" );
-
-			D tmpU = static_cast< D >( 0 );
-			matrixSumReduce( C, tmpU );
-			utils::debugPrint( "sum (sum ((L * U) .* A)) = " + std::to_string( tmpU ) + "\n" );
-
-			tmpU /= 6;
-			utils::debugPrint( "sum (sum ((L * U) .* A)) / 6 = " + std::to_string( tmpU ) + "\n" );
-
-			u = (size_t)tmpU;
-
-			// done
-			return rc;
-		}
-
-		template< typename D >
-		RC triangle_count_cohen( size_t & u, const Matrix< D > & A ) {
-			static_assert( std::is_integral< D >::value, "Type D must be integral" );
-			RC rc = RC::SUCCESS;
-
-			utils::printSparseMatrix( A, "A" );
-			size_t rows = nrows( A ), cols = ncols( A );
-
-			// Split A into L (lower) and U (upper) triangular matrices
-			Matrix< D > L( rows, cols ), U( rows, cols );
-			rc = rc ? rc : trilu( A, L, U );
-			utils::printSparseMatrix( L, "L" );
-			utils::printSparseMatrix( U, "U" );
-
-			// Compute B = L * U
-			Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one > semiring;
-			Matrix< D > B( rows, cols, nnz( A ) );
-			_mxm( B, L, U, semiring );
-			utils::printSparseMatrix( B, "L * U" );
-
-			// Compute C = B .* A
-			Matrix< D > C( rows, cols );
-			_eWiseMul( C, B, A );
-			utils::printSparseMatrix( C, "(L * U) .* A" );
-
-			D tmpU = static_cast< D >( 0 );
-			matrixSumReduce( C, tmpU );
-			utils::debugPrint( "sum (sum ((L * U) .* A)) = " + std::to_string( tmpU ) + "\n" );
-
-			tmpU /= 2;
-			utils::debugPrint( "sum (sum ((L * U) .* A)) / 2 = " + std::to_string( tmpU ) + "\n" );
-
-			u = (size_t)tmpU;
-
-			// done
-			return rc;
-		}
-
-		template< typename D >
-		RC triangle_count_sandia_ll( size_t & u, const Matrix< D > & A ) {
-			static_assert( std::is_integral< D >::value, "Type D must be integral" );
-			RC rc = RC::SUCCESS;
-
-			utils::printSparseMatrix( A, "A" );
-			size_t rows = nrows( A ), cols = ncols( A );
-
-			// Split A into L (lower) and U (upper) triangular matrices
-			Matrix< D > L( rows, cols ), _( rows, cols );
-			rc = rc ? rc : trilu( A, L, _ );
-			utils::printSparseMatrix( L, "L" );
-
-			// Compute B = L * L
-			Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one > semiring;
-			Matrix< D > B( rows, cols, nnz( A ) );
-			_mxm( B, L, L, semiring );
-			utils::printSparseMatrix( B, "L * L" );
-
-			// Compute C = L .* B
-			Matrix< D > C( rows, cols );
-			_eWiseMul( C, B, L );
-			utils::printSparseMatrix( C, "(L * L) .* L" );
-
-			D tmpU = static_cast< D >( 0 );
-			matrixSumReduce( C, tmpU );
-			utils::debugPrint( "sum (sum ((L * L) .* L)) = " + std::to_string( tmpU ) + "\n" );
-
-			u = (size_t)tmpU;
-
-			// done
-			return rc;
-		}
-
-		template< typename D >
-		RC triangle_count_sandia_uu( size_t & u, const Matrix< D > & A ) {
-			static_assert( std::is_integral< D >::value, "Type D must be integral" );
-			RC rc = RC::SUCCESS;
-
-			utils::printSparseMatrix( A, "A" );
-			size_t rows = nrows( A ), cols = ncols( A );
-
-			// Split A into L (lower) and U (upper) triangular matrices
-			Matrix< D > _( rows, cols ), U( rows, cols );
-			rc = rc ? rc : trilu( A, _, U );
-			utils::printSparseMatrix( U, "U" );
-
-			// Compute B = U * U
-			Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one > semiring;
-			Matrix< D > B( rows, cols, nnz( A ) );
-			_mxm( B, U, U, semiring );
-			utils::printSparseMatrix( B, "U * U" );
-
-			// Compute C = U .* B
-			Matrix< D > C( rows, cols );
-			_eWiseMul( C, B, U );
-			utils::printSparseMatrix( C, "(U * U) .* U" );
-
-			D tmpU = static_cast< D >( 0 );
-			matrixSumReduce( C, tmpU );
-			utils::debugPrint( "sum (sum ((U * U) .* U)) = " + std::to_string( tmpU ) + "\n" );
-
-			u = (size_t)tmpU;
-
-			// done
-			return rc;
-		}
-
-		template< typename D >
-		RC triangle_count_sandia_lut( size_t & u, const Matrix< D > & A ) {
-			static_assert( std::is_integral< D >::value, "Type D must be integral" );
-			RC rc = RC::SUCCESS;
-
-			utils::printSparseMatrix( A, "A" );
-			size_t rows = nrows( A ), cols = ncols( A );
-
-			// Split A into L (lower) and U (upper) triangular matrices
-			Matrix< D > L( rows, cols ), U( rows, cols );
-			rc = rc ? rc : trilu( A, L, U );
-			utils::printSparseMatrix( L, "L" );
-			utils::printSparseMatrix( U, "U" );
-
-			// Compute B = L * U
-			Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one > semiring;
-			Matrix< D > B( rows, cols, nnz( A ) );
-			_mxm< descriptors::transpose_right >( B, L, U, semiring );
-			utils::printSparseMatrix( B, "L * U" );
-
-			// Compute C = L .* B
-			Matrix< D > C( rows, cols );
-			_eWiseMul( C, B, L );
-			utils::printSparseMatrix( C, "(L * U) .* L" );
-
-			D tmpU = static_cast< D >( 0 );
-			matrixSumReduce( C, tmpU );
-			utils::debugPrint( "sum (sum ((L * U) .* L)) = " + std::to_string( tmpU ) + "\n" );
-
-			u = (size_t)tmpU;
-
-			// done
-			return rc;
-		}
-
-		template< typename D >
-		RC triangle_count_sandia_ult( size_t & u, const Matrix< D > & A ) {
-			static_assert( std::is_integral< D >::value, "Type D must be integral" );
+		enum class TriangleCountAlgorithm { Burkhardt, Cohen, Sandia_TT };
+
+		std::map< TriangleCountAlgorithm, std::string > TriangleCountAlgorithmNames = { { TriangleCountAlgorithm::Burkhardt, "Burkhardt" }, { TriangleCountAlgorithm::Cohen, "Cohen" },
+			{ TriangleCountAlgorithm::Sandia_TT, "Sandia_TT" } };
+
+		template< Descriptor descr = descriptors::no_operation, typename D, typename I, typename J, class Semiring, class MulMonoid, class SumMonoid >
+		RC triangle_count_generic( size_t & count,
+			Matrix< D, grb::config::default_backend, I, J > & MXM_out,
+			const Matrix< D, grb::config::default_backend, I, J > & MXM_lhs,
+			const Matrix< D, grb::config::default_backend, I, J > & MXM_rhs,
+			Matrix< D, grb::config::default_backend, I, J > & EWA_out,
+			const Matrix< D, grb::config::default_backend, I, J > & EWA_rhs,
+			const D div_factor,
+			const Semiring mxm_semiring = Semiring(),
+			const MulMonoid ewiseapply_monoid = MulMonoid(),
+			const SumMonoid sumreduce_monoid = SumMonoid() ) {
 			RC rc = RC::SUCCESS;
 
-			utils::printSparseMatrix( A, "A" );
-			size_t rows = nrows( A ), cols = ncols( A );
-
-			// Split A into L (lower) and U (upper) triangular matrices
-			Matrix< D > L( rows, cols ), U( rows, cols );
-			rc = rc ? rc : trilu( A, L, U );
-			utils::printSparseMatrix( L, "L" );
-			utils::printSparseMatrix( U, "U" );
-
-			// Compute B = U * L
-			Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one > semiring;
-			Matrix< D > B( rows, cols, nnz( A ) );
-			_mxm( B, U, L, semiring );
-			utils::printSparseMatrix( B, "U * L" );
+			rc = ( &MXM_out == &MXM_lhs ) ? RC::ILLEGAL : rc;
+			rc = ( &MXM_out == &MXM_rhs ) ? RC::ILLEGAL : rc;
+
+			// Compute MXM_out = Mlhs * Mrhs
+			utils::printSparseMatrix< Debug >( MXM_lhs, "MXM_lhs" );
+			utils::printSparseMatrix< Debug >( MXM_rhs, "MXM_rhs" );
+			rc = rc ? rc : mxm< descr >( MXM_out, MXM_lhs, MXM_rhs, mxm_semiring, Phase::RESIZE );
+			rc = rc ? rc : mxm< descr >( MXM_out, MXM_lhs, MXM_rhs, mxm_semiring, Phase::EXECUTE );
+			utils::printSparseMatrix< Debug >( MXM_out, "MXM_out = mxm( MXM_lhs, MXM_rhs )" );
+
+			// Compute MXM_out .*= EWA_rhs
+			utils::printSparseMatrix< Debug >( EWA_rhs, "EWA_rhs" );
+
+			// FIXME: Replace by a foldl( Matrix[in,out], Matrix[in], Monoid ) - not implemented yet
+			// Will then become:
+			// rc = rc ? rc : eWiseApply< descr >( MXM_out, MXM_out, EWA_rhs, ewiseapply_monoid, Phase::RESIZE );
+			// rc = rc ? rc : eWiseApply< descr >( MXM_out, MXM_out, EWA_rhs, ewiseapply_monoid, Phase::EXECUTE );
+			// Instead of:
+			rc = rc ? rc : eWiseApply< descr >( EWA_out, MXM_out, EWA_rhs, ewiseapply_monoid, Phase::RESIZE );
+			rc = rc ? rc : eWiseApply< descr >( EWA_out, MXM_out, EWA_rhs, ewiseapply_monoid, Phase::EXECUTE );
+			utils::printSparseMatrix< Debug >( EWA_out, "EWA_out = ewiseapply( MXM_out, EWA_rhs )" );
+
+			// Compute a sum reduction over <EWA_out> in <count>
+			count = 0;
+			rc = rc ? rc : foldl< descr >( count, EWA_out, sumreduce_monoid );
+			utils::printf< Debug >( "count = foldl(EWA_out) = " + std::to_string( count ) + "\n" );
+
+			// Apply the div_factor to the reduction result
+			count /= div_factor;
+			utils::printf< Debug >( "count = count / div_factor = " + std::to_string( count ) + "\n" );
 
-			// Compute C = U .* B
-			Matrix< D > C( rows, cols );
-			_eWiseMul( C, B, U );
-			utils::printSparseMatrix( C, "(L * U) .* U" );
-
-			D tmpU = static_cast< D >( 0 );
-			matrixSumReduce( C, tmpU );
-			utils::debugPrint( "sum (sum ((L * U) .* U)) = " + std::to_string( tmpU ) + "\n" );
-
-			u = (size_t)tmpU;
-
-			// done
 			return rc;
 		}
 
 		/**
 		 * Given a graph, indicates how many triangles are contained within.
 		 *
-		 * This implementation is based on the masked matrix multiplication kernel.
+		 * @tparam D 				The type of the matrix non-zero values.
 		 *
-		 * @param[out]    n    The number of triangles. Any prior contents will be ignored.
-		 * @param[in]     A    The input graph.
+		 * @param[out]    count     The number of triangles.
+		 * 						    Any prior contents will be ignored.
+		 * @param[in]     A         The input graph.
+		 * @param[in,out] MXM_out    Buffer matrix with the same dimensions as the input
+		 * 							graph. Any prior contents will be ignored.
+		 * @param[in] L 		Lower triangular matrix of the input graph (optional)
+		 * @param[in] U 		Lower triangular matrix of the input graph (optional)
 		 *
 		 *
 		 * @returns #grb::SUCCESS  When the computation completes successfully.
-		 * @returns #grb::MISMATCH ?
-		 * @returns #grb::ILLEGAL  ?
+		 * @returns #grb::MISMATCH If the dimensions of the input matrices/buffers
+		 * 						   are incompatible.
+		 * @returns #grb::ILLEGAL  If the given algorithm does not exist.
 		 * @returns #grb::PANIC    If an unrecoverable error has been encountered. The
 		 *                         output as well as the state of ALP/GraphBLAS is
 		 *                         undefined.
@@ -522,64 +304,95 @@ namespace grb {
 		 * performance semantics, with the exception of getters such as #grb::nnz, are
 		 * specific to the backend selected during compilation.
 		 */
-		template< typename D >
-		RC triangle_count( TriangleCountAlgorithm Algo, size_t & u, const Matrix< D > & A_constant ) {
-			auto A = A_constant;
+		template< Descriptor descr = descriptors::no_operation,
+			typename D,
+			typename I,
+			typename J,
+			class Semiring = grb::Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+			class MulMonoid = grb::Monoid< grb::operators::mul< D >, identities::one >,
+			class SumMonoid = grb::Monoid< operators::add< size_t, D, size_t >, identities::zero > >
+		RC triangle_count( const TriangleCountAlgorithm algo,
+			size_t & count,
+			const Matrix< D, grb::config::default_backend, I, J > & A,
+			Matrix< D, grb::config::default_backend, I, J > & MXM_out,
+			Matrix< D, grb::config::default_backend, I, J > & EWA_out,
+			Matrix< D, grb::config::default_backend, I, J > & L = { 0, 0 },
+			Matrix< D, grb::config::default_backend, I, J > & U = { 0, 0 } ) {
 			// Static assertions
 			static_assert( std::is_integral< D >::value, "Type D must be integral" );
-			// Dynamic assertions
-			if( grb::nrows( A ) != grb::ncols( A ) ) {
-				std::cerr << "A must be square" << std::endl;
-				return RC::ILLEGAL;
+
+			// Sanity checks
+			if( nrows( A ) != ncols( A ) ) {
+				std::cerr << "Matrix A must be square" << std::endl;
+				return RC::MISMATCH;
+			}
+			if( ncols( L ) != nrows( L ) ) {
+				std::cerr << "Matrix L must be square" << std::endl;
+				return RC::MISMATCH;
+			}
+			if( nrows( A ) != ncols( L ) ) {
+				std::cerr << "Matrices A and L must have the same dimensions" << std::endl;
+				return RC::MISMATCH;
 			}
-			if( ! utils::is_diagonal_null( A ) ) {
-				// Create a mask with null values on the diagonal, and ones everywhere else
-				grb::Matrix< D > M( grb::nrows( A ), grb::ncols( A ) );
-				size_t nnz_mask = grb::nrows( A ) * grb::ncols( A ) - grb::nrows( A );
-				std::vector< size_t > I(nnz_mask), J( nnz_mask );
-				std::vector< D > V( nnz_mask, static_cast< D >(1) );
-				for( size_t i = 0, k = 0; i < grb::nrows( A ); ++i ) {
-					for( size_t j = 0; j < grb::ncols( A ); ++j ) {
-						if( i == j ) continue;
-						I[k] = i;
-						J[k] = j;
-						++k;
+			if( ncols( U ) != nrows( U ) ) {
+				std::cerr << "Matrix U must be square" << std::endl;
+				return RC::MISMATCH;
+			}
+			if( nrows( A ) != ncols( U ) ) {
+				std::cerr << "Matrices A and U must have the same dimensions" << std::endl;
+				return RC::MISMATCH;
+			}
+			if( ncols( MXM_out ) != nrows( MXM_out ) ) {
+				std::cerr << "Matrix MXM_out must be square" << std::endl;
+				return RC::MISMATCH;
+			}
+			if( nrows( A ) != ncols( MXM_out ) ) {
+				std::cerr << "Matrices A and MXM_out must have the same dimensions" << std::endl;
+				return RC::MISMATCH;
+			}
+			if( ncols( EWA_out ) != nrows( EWA_out ) ) {
+				std::cerr << "Matrix EWA_out must be square" << std::endl;
+				return RC::MISMATCH;
+			}
+			if( nrows( A ) != ncols( EWA_out ) ) {
+				std::cerr << "Matrices A and EWA_out must have the same dimensions" << std::endl;
+				return RC::MISMATCH;
+			}
+
+			// Dispatch to the appropriate algorithm
+			switch( algo ) {
+				case TriangleCountAlgorithm::Burkhardt: {
+					return triangle_count_generic< descr | descriptors::transpose_right, D, I, J, Semiring, MulMonoid, SumMonoid >( count, MXM_out, A, A, EWA_out, A, 6 );
+				}
+
+				case TriangleCountAlgorithm::Cohen: {
+					trilu( A, L, U );
+					if( nrows( L ) + ncols( L ) == 0 ) {
+						std::cerr << "Matrix L must be provided for the Cohen algorithm" << std::endl;
+						return RC::MISMATCH;
+					} else if( nrows( U ) + ncols( U ) == 0 ) {
+						std::cerr << "Matrix U must be provided for the Cohen algorithm" << std::endl;
+						return RC::MISMATCH;
 					}
+					return triangle_count_generic< descr, D, I, J, Semiring, MulMonoid, SumMonoid >( count, MXM_out, L, U, EWA_out,  A, 2 );
 				}
 
-				buildMatrixUnique( M, I.data(), J.data(), V.data(), V.size(), grb::IOMode::PARALLEL );
-				// Multiply A with the mask
-				Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one > semiring;
-				utils::printSparseMatrix( A, "A before diagonal annihilation" );
-				utils::printSparseMatrix( M, "Mask" );
-				_eWiseMul( A, A_constant, M );
-				utils::printSparseMatrix( A, "A after diagonal annihilation" );
-				assert( utils::is_diagonal_null( A ) );
-			}
+				case TriangleCountAlgorithm::Sandia_TT: {
+					trilu( A, L, U );
+					if( ( nrows( U ) == 0 || ncols( U ) == 0 ) && ( nrows( L ) == 0 || ncols( L ) == 0 ) ) {
+						std::cerr << "Matrix L or U must be provided for the Sandia_TT algorithm" << std::endl;
+						return RC::MISMATCH;
+					}
+					const Matrix< D, grb::config::default_backend, I, J > & T = ( nrows( U ) == 0 || ncols( U ) == 0 ) ? L : U;
+					return triangle_count_generic< descr, D, I, J, Semiring, MulMonoid, SumMonoid >( count, MXM_out, T, T, EWA_out, T, 1 );
+				}
 
-			switch( Algo ) {
-				case TriangleCountAlgorithm::Burkhardt:
-					utils::debugPrint( "-- Burkhardt\n" );
-					return triangle_count_burkhardt( u, A );
-				case TriangleCountAlgorithm::Cohen:
-					utils::debugPrint( "-- Cohen\n" );
-					return triangle_count_cohen( u, A );
-				case TriangleCountAlgorithm::Sandia_LL:
-					utils::debugPrint( "-- Sandia LL\n" );
-					return triangle_count_sandia_ll( u, A );
-				case TriangleCountAlgorithm::Sandia_UU:
-					utils::debugPrint( "-- Sandia UU\n" );
-					return triangle_count_sandia_uu( u, A );
-				case TriangleCountAlgorithm::Sandia_LUT:
-					utils::debugPrint( "-- Sandia LUT\n" );
-					return triangle_count_sandia_lut( u, A );
-				case TriangleCountAlgorithm::Sandia_ULT:
-					utils::debugPrint( "-- Sandia ULT\n" );
-					return triangle_count_sandia_ult( u, A );
 				default:
-					utils::debugPrint( "-- Unknown\n", std::cerr );
-					return RC::FAILED;
+					std::cerr << "Unknown TriangleCountAlgorithm enum value" << std::endl;
+					return RC::ILLEGAL;
 			}
+
+			return RC::SUCCESS;
 		}
 
 	} // namespace algorithms
diff --git a/tests/smoke/triangle_count.cpp b/tests/smoke/triangle_count.cpp
index a1d7f5644..aa598b2fa 100644
--- a/tests/smoke/triangle_count.cpp
+++ b/tests/smoke/triangle_count.cpp
@@ -18,6 +18,10 @@
 #include <iostream>
 #include <vector>
 
+#ifdef _CG_COMPLEX
+#include <complex>
+#endif
+
 #include <inttypes.h>
 
 #include <graphblas/algorithms/triangle_count.hpp>
@@ -28,310 +32,245 @@
 #include <utils/output_verification.hpp>
 
 /** Must be an integer type (int, long, unsigned, etc.) */
-using nonzeroval_t = long;
+using BaseScalarType = int;
+#ifdef _CG_COMPLEX
+using IntegerType = std::complex< BaseScalarType >;
+#else
+using IntegerType = BaseScalarType;
+#endif
 
-typedef struct {
-	grb::Matrix< nonzeroval_t > A;
-	size_t expected_triangle_count;
-} input_t;
+constexpr BaseScalarType TOL = 0;
+constexpr size_t MAX_ITERS = 10000;
 
-typedef struct {
-	grb::RC rc = grb::RC::SUCCESS;
-	std::vector< size_t > triangleCounts; // Per algorithm
-	grb::utils::TimerResults times;
-	size_t data_in_local;
-} output_t;
+using namespace grb;
+using namespace algorithms;
 
-void grbProgram( const input_t & input, output_t & output ) {
+struct input {
+	size_t inner_rep;
+	size_t outer_rep;
+	TriangleCountAlgorithm algorithm;
+	size_t expectedTriangleCount;
+	char filename[ 1024 ];
+	bool direct;
+};
+
+struct output {
+	RC rc = RC::SUCCESS;
+	size_t inner_rep;
+	size_t outer_rep;
+	size_t iterations;
+	size_t triangleCount;
+	grb::utils::TimerResults times;
+};
 
-	// get user process ID.
-	const size_t s = grb::spmd< grb::config::default_backend >::pid();
-	assert( s < grb::spmd<>::nprocs() );
+bool parse_arguments( int argc, char ** argv, input & in, int& err );
 
-	// get input n
+void grbProgram( const input & data_in, output & out ) {
+	// get user process ID
+	const size_t s = spmd<>::pid();
+	assert( s < spmd<>::nprocs() );
 	grb::utils::Timer timer;
-	timer.reset();
 
-	std::cout << std::endl << "Running triangle counting with Burkhardt algorithm" << std::endl;
-	output.triangleCounts.push_back( 0 );
-	triangle_count( grb::algorithms::TriangleCountAlgorithm::Burkhardt, output.triangleCounts.back(), input.A );
-	timer.reset();
-	if( output.triangleCounts.back() != input.expected_triangle_count ) {
-		std::cerr << "ERROR: Burkhardt triangle count failed: expected " << input.expected_triangle_count << " but got " << output.triangleCounts.back() << std::endl;
-		output.rc = output.rc ? output.rc : grb::RC::FAILED;
-	} else {
-		std::cout << "Burkhardt triangle count succeeded: " << output.triangleCounts.back() << std::endl;
+	// Sanity checks on input
+	if( data_in.filename[ 0 ] == '\0' ) {
+		std::cerr << s << ": no file name given as input." << std::endl;
+		out.rc = ILLEGAL;
+		return;
 	}
 
-	std::cout << std::endl << "Running triangle counting with Cohen algorithm" << std::endl;
-	output.triangleCounts.push_back( 0 );
-	output.rc = output.rc ? output.rc : output.rc ? output.rc : grb::algorithms::triangle_count( grb::algorithms::TriangleCountAlgorithm::Cohen, output.triangleCounts.back(), input.A );
-	timer.reset();
-	if( output.triangleCounts.back() != input.expected_triangle_count ) {
-		std::cerr << "ERROR: Cohen triangle count failed: expected " << input.expected_triangle_count << " but got " << output.triangleCounts.back() << std::endl;
-		output.rc = output.rc ? output.rc : grb::RC::FAILED;
-	} else {
-		std::cout << "Cohen triangle count succeeded: " << output.triangleCounts.back() << std::endl;
-	}
 
-	std::cout << std::endl << "Running triangle counting with Sandia_LL algorithm" << std::endl;
-	output.triangleCounts.push_back( 0 );
-	triangle_count( grb::algorithms::TriangleCountAlgorithm::Sandia_LL, output.triangleCounts.back(), input.A );
 	timer.reset();
-	if( output.triangleCounts.back() != input.expected_triangle_count ) {
-		std::cerr << "ERROR: Sandia_LL triangle count failed: expected " << input.expected_triangle_count << " but got " << output.triangleCounts.back() << std::endl;
-		output.rc = output.rc ? output.rc : grb::RC::FAILED;
-	} else {
-		std::cout << "Sandia_LL triangle count succeeded: " << output.triangleCounts.back() << std::endl;
+	// Create a local parser
+	grb::utils::MatrixFileReader< 
+		void,
+		std::conditional<
+			( sizeof( grb::config::RowIndexType ) > sizeof( grb::config::ColIndexType ) ),
+			grb::config::RowIndexType,
+			grb::config::ColIndexType 
+		>::type
+	> parser( data_in.filename, data_in.direct );
+	assert( parser.m() == parser.n() );
+	const size_t n = parser.n();
+	// Load the matrix, first as a pattern, then copy it into a matrix with integer values
+	Matrix< IntegerType > A( n, n );
+	{
+		Matrix< void > A_pattern( n, n );
+		{
+			const RC rc = buildMatrixUnique( A_pattern,
+				parser.begin( SEQUENTIAL ), parser.end( SEQUENTIAL),
+				SEQUENTIAL
+			);
+			/* Once internal issue #342 is resolved this can be re-enabled
+			const RC rc = buildMatrixUnique( A_pattern,
+				parser.begin( PARALLEL ), parser.end( PARALLEL),
+				PARALLEL
+			);*/
+			if( rc != SUCCESS ) {
+				std::cerr << "Failure: call to buildMatrixUnique did not succeed "
+					<< "(" << toString( rc ) << ")." << std::endl;
+				return;
+			}
+		}
+		// Check number of non-zero entries between the parser and the matrix A_pattern
+		try {
+			const size_t global_nnz = nnz( A_pattern );
+			const size_t parser_nnz = parser.nz();
+			if( global_nnz != parser_nnz ) {
+				std::cerr << "Failure: global nnz (" << global_nnz << ") does not equal "
+					<< "parser nnz (" << parser_nnz << ")." << std::endl;
+				return;
+			}
+		} catch( const std::runtime_error & ) {
+			std::cout << "Info: nonzero check skipped as the number of nonzeroes "
+				<< "cannot be derived from the matrix file header. The "
+				<< "grb::Matrix reports " << nnz( A ) << " nonzeroes.\n";
+		}
+		// Build A from A_pattern, filled with static_cast< IntegerType>( 1 )
+		std::vector< size_t > rows, cols;
+		std::vector< IntegerType > values( nnz( A_pattern ), static_cast< IntegerType >( 1 ) );
+		rows.reserve( nnz( A_pattern ) );
+		cols.reserve( nnz( A_pattern ) );
+		for( const std::pair< size_t, size_t > p : A_pattern ) {
+			rows.push_back( p.first );
+			cols.push_back( p.second );
+		}
+		buildMatrixUnique( A, rows.data(), cols.data(), values.data(), values.size(), IOMode::SEQUENTIAL );
 	}
+	out.times.io = timer.time();
 
-	std::cout << std::endl << "Running triangle counting with Sandia_LUT algorithm" << std::endl;
-	output.triangleCounts.push_back( 0 );
-	grb::algorithms::triangle_count( grb::algorithms::TriangleCountAlgorithm::Sandia_LUT, output.triangleCounts.back(), input.A );
-	timer.reset();
-	if( output.triangleCounts.back() != input.expected_triangle_count ) {
-		std::cerr << "ERROR: Sandia_LUT triangle count failed: expected " << input.expected_triangle_count << " but got " << output.triangleCounts.back() << std::endl;
-		output.rc = output.rc ? output.rc : grb::RC::FAILED;
-	} else {
-		std::cout << "Sandia_LUT triangle count succeeded: " << output.triangleCounts.back() << std::endl;
-	}
 
-	std::cout << std::endl << "Running triangle counting with Sandia_ULT algorithm" << std::endl;
-	output.triangleCounts.push_back( 0 );
-	triangle_count( grb::algorithms::TriangleCountAlgorithm::Sandia_ULT, output.triangleCounts.back(), input.A );
 	timer.reset();
-	if( output.triangleCounts.back() != input.expected_triangle_count ) {
-		std::cerr << "ERROR: Sandia_ULT triangle count failed: expected " << input.expected_triangle_count << " but got " << output.triangleCounts.back() << std::endl;
-		output.rc = output.rc ? output.rc : grb::RC::FAILED;
-	} else {
-		std::cout << "Sandia_ULT triangle count succeeded: " << output.triangleCounts.back() << std::endl;
-	}
+	// Allocate the buffers
+	Matrix< IntegerType > buffer( n, n );
+	Matrix< IntegerType > buffer2( n, n );
+	Matrix< IntegerType > L( n, n );
+	Matrix< IntegerType > U( n, n );
+	// Split A into L and U
+	// TODO:
+	out.times.preamble = timer.time();
 
-	std::cout << std::endl << "Running triangle counting with Sandia_UU algorithm" << std::endl;
-	output.triangleCounts.push_back( 0 );
-	triangle_count( grb::algorithms::TriangleCountAlgorithm::Sandia_UU, output.triangleCounts.back(), input.A );
 	timer.reset();
-	if( output.triangleCounts.back() != input.expected_triangle_count ) {
-		std::cerr << "ERROR: Sandia_UU triangle count failed: expected " << input.expected_triangle_count << " but got " << output.triangleCounts.back() << std::endl;
-		output.rc = output.rc ? output.rc : grb::RC::FAILED;
-	} else {
-		std::cout << "Sandia_UU triangle count succeeded: " << output.triangleCounts.back() << std::endl;
-	}
-
-	std::cout << std::endl;
+	out.rc = triangle_count( data_in.algorithm, out.triangleCount, A, buffer, buffer2, L, U );
+	out.times.useful = timer.time();
 }
 
 int main( int argc, char ** argv ) {
 	(void)argc;
 	(void)argv;
-	constexpr size_t niterations = 1;
 
-	grb::Benchmarker< grb::EXEC_MODE::AUTOMATIC > benchmarker;
 	std::cout << "Test executable: " << argv[ 0 ] << std::endl;
 
-	// Check if we are testing on a file
-	if( argc != 1 && argc != 3 ) {
-		std::cerr << "Usage: \n\t" << argv[ 0 ] << " [ <graph_path> <expected_triangle_count> ]" << std::endl;
-		return 1;
+	// Input struct
+	struct input in;
+	int err;
+	if( !parse_arguments( argc, argv, in, err ) ) {
+		return err;
 	}
-	bool test_on_file = argc == 3;
-	std::string file_to_test( test_on_file ? argv[ 1 ] : "" );
-	size_t expected_file_triangles = test_on_file ? std::stoul( argv[ 2 ] ) : 0;
-
-	/** Matrix A0:
-	 *    0  1  2  3
-	 * 0  _  X  X  _
-	 * 1  X  _  X  X
-	 * 2  X  X  _  X
-	 * 3  _  X  X  _
-	 *
-	 * Schema:
-	 *  0 ------ 1
-	 *  |      /
-	 *  |    /
-	 *  |  /
-	 *  2        3
-	 *
-	 * => 1 triangle
-	 */
-	{ // Undirected version
-		size_t expected_triangle_count = 1;
-		grb::Matrix< nonzeroval_t > A0_undirected( 4, 4 );
-		std::vector< size_t > A0_undirected_rows { { 0, 0, 1, 1, 2, 2 } };
-		std::vector< size_t > A0_undirected_cols { { 1, 2, 0, 2, 0, 1 } };
-		std::vector< nonzeroval_t > A0_undirected_values( A0_undirected_rows.size(), 1 );
-		grb::buildMatrixUnique( A0_undirected, A0_undirected_rows.data(), A0_undirected_cols.data(), A0_undirected_values.data(), A0_undirected_values.size(), grb::IOMode::PARALLEL );
-		input_t input_A0_undirected { A0_undirected, expected_triangle_count };
-		output_t output_A0_undirected;
-		std::cout << "-- Running test on A0_undirected" << std::endl;
-		grb::RC bench_rc = benchmarker.exec( &grbProgram, input_A0_undirected, output_A0_undirected, niterations, 1 );
-		if( bench_rc ) {
-			std::cerr << "ERROR during execution of A0_undirected: rc = " << bench_rc << std::endl;
-			return bench_rc;
-		} else if( output_A0_undirected.rc ) {
-			std::cerr << "Test failed: rc = " << output_A0_undirected.rc << std::endl;
-			return output_A0_undirected.rc;
+	
+	std::cout << "Executable called with parameters " << in.filename << ", "
+		<< "inner repititions = " << in.inner_rep << ", and outer reptitions = " 
+		<< in.outer_rep	<< std::endl;
+
+	// Run the test for all algorithms
+	RC all_algorithms_rc = RC::SUCCESS;
+	for( const std::pair< TriangleCountAlgorithm, std::string > & algo : TriangleCountAlgorithmNames ) {
+		in.algorithm = algo.first;
+		std::cout << "  -- Running algorithm " << algo.second << std::endl;
+
+		// Output struct
+		struct output out;
+		RC rc = RC::SUCCESS;
+
+		// Launch the estimator (if requested)
+		if( in.inner_rep == 0 ) {
+			grb::Launcher< AUTOMATIC > launcher;
+			rc = launcher.exec( &grbProgram, in, out, true );
+			if( rc == RC::SUCCESS ) {
+				in.inner_rep = out.inner_rep;
+			}
+			if( rc != RC::SUCCESS ) {
+				std::cerr << "launcher.exec returns with non-SUCCESS error code "
+					<< (int)rc << std::endl;
+				return 6;
+			}
 		}
-		std::cout << std::endl;
-	}
 
-	/** Matrix A1:
-	 *    0  1  2  3
-	 * 0  _  X  X  _
-	 * 1  X  _  X  X
-	 * 2  X  X  _  X
-	 * 3  _  X  X  _
-	 *
-	 * Schema:
-	 *  0 ------ 1
-	 *  |      / |
-	 *  |    /   |
-	 *  |  /     |
-	 *  2 ------ 3
-	 *
-	 * => 2 triangles
-	 */
-	{ // Undirected version
-		size_t expected_triangle_count = 2;
-		grb::Matrix< nonzeroval_t > A1_undirected( 4, 4 );
-		std::vector< size_t > A1_undirected_rows { { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3 } };
-		std::vector< size_t > A1_undirected_cols { { 1, 2, 0, 2, 3, 0, 1, 3, 1, 2 } };
-		std::vector< nonzeroval_t > A1_undirected_values( A1_undirected_rows.size(), 1 );
-		grb::buildMatrixUnique( A1_undirected, A1_undirected_rows.data(), A1_undirected_cols.data(), A1_undirected_values.data(), A1_undirected_values.size(), grb::IOMode::PARALLEL );
-		input_t input_A1_undirected { A1_undirected, expected_triangle_count };
-		output_t output_A1_undirected;
-		std::cout << "-- Running test on A1_undirected" << std::endl;
-		grb::RC bench_rc = benchmarker.exec( &grbProgram, input_A1_undirected, output_A1_undirected, niterations, 1 );
-		if( bench_rc ) {
-			std::cerr << "ERROR during execution of A1_undirected: rc = " << bench_rc << std::endl;
-			return bench_rc;
-		} else if( output_A1_undirected.rc ) {
-			std::cerr << "Test failed: rc = " << output_A1_undirected.rc << std::endl;
-			return output_A1_undirected.rc;
+		// Launch the benchmarker
+		grb::Benchmarker< EXEC_MODE::AUTOMATIC > benchmarker;
+		rc = benchmarker.exec( &grbProgram, in, out, 1, in.outer_rep, true );
+		if( rc != RC::SUCCESS ) {
+			std::cerr << "benchmarker.exec returns with non-SUCCESS error code "
+				<< grb::toString( rc ) << std::endl;
+			return 8;
+		}
+		if( out.rc == RC::SUCCESS ) {
+			std::cout << "Benchmark completed successfully.\n";
+			std::cout << "** Obtained " << out.triangleCount << " triangles.\n";
+			std::cout << "** Expected " << in.expectedTriangleCount << " triangles.\n";
+			if( out.triangleCount != in.expectedTriangleCount ) {
+				all_algorithms_rc = RC::FAILED;
+			}
+		} else {
+			std::cerr << "Benchmark failed with error code "
+				<< grb::toString( out.rc ) << std::endl;
+			std::cerr << std::flush;
+			all_algorithms_rc = RC::FAILED;
 		}
 		std::cout << std::endl;
 	}
 
-	/** Matrix A2:
-	 *    0  1  2  3
-	 * 0  _  X  X  X
-	 * 1  X  _  X  X
-	 * 2  X  X  _  X
-	 * 3  X  X  X  _
-	 *
-	 * Schema:
-	 *  0 ----- 1
-	 *  |  \  / |
-	 *  |   X   |
-	 *  | /  \  |
-	 *  2 ----- 3
-	 *
-	 * => 4 triangles
-	 */
-	{ // Undirected version
-		size_t expected_triangle_count = 4;
-		grb::Matrix< nonzeroval_t > A2_undirected( 4, 4 );
-		std::vector< size_t > A2_undirected_rows { { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 } };
-		std::vector< size_t > A2_undirected_cols { { 1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2 } };
-		std::vector< nonzeroval_t > A2_undirected_values( A2_undirected_rows.size(), 1 );
-		grb::buildMatrixUnique( A2_undirected, A2_undirected_rows.data(), A2_undirected_cols.data(), A2_undirected_values.data(), A2_undirected_values.size(), grb::IOMode::PARALLEL );
-		input_t input_A2_undirected { A2_undirected, expected_triangle_count };
-		output_t output_A2_undirected;
-		std::cout << "-- Running test on A2_undirected" << std::endl;
-		grb::RC bench_rc = benchmarker.exec( &grbProgram, input_A2_undirected, output_A2_undirected, niterations, 1 );
-		if( bench_rc ) {
-			std::cerr << "ERROR during execution of A2_undirected: rc = " << bench_rc << std::endl;
-			return bench_rc;
-		} else if( output_A2_undirected.rc ) {
-			std::cerr << "Test failed: rc = " << output_A2_undirected.rc << std::endl;
-			return output_A2_undirected.rc;
-		}
-		std::cout << std::endl;
+	if( all_algorithms_rc == RC::SUCCESS ) {
+		std::cout << "Test OK" << std::endl;
+	} else {
+		std::cout << "Test FAILED" << std::endl;
 	}
 
-	/** Matrix A3:
-	 *
-	 * Schema:
-	 * 0 ----- 1 ----- 2
-	 * |  \  / |  \  / |
-	 * |   X   |   X   |
-	 * | /  \  | /  \  |
-	 * 3 ----- 4 ----- 5
-	 * |  \  / |  \  / |
-	 * |   X   |   X   |
-	 * | /  \  | /  \  |
-	 * 6 ----- 7 ----- 8
-	 *
-	 * note: 1-7, 3-5 are not connected
-	 *
-	 * => 24 triangles
-	 */
-	{ // Undirected version
-		size_t expected_triangle_count = 24;
-		grb::Matrix< nonzeroval_t > A3_undirected( 9, 9 );
-		std::vector< size_t > A3_undirected_rows { { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8 } };
-		std::vector< size_t > A3_undirected_cols { { 1, 2, 3, 4, 6, 0, 2, 3, 4, 5, 0, 1, 4, 5, 8, 0, 1, 4, 6, 7, 0, 1, 2, 3, 5, 6, 7, 8, 1, 2, 4, 7, 8, 0, 3, 4, 7, 8, 3, 4, 5, 6, 8, 2, 4, 5, 6, 7 } };
-		std::vector< nonzeroval_t > A3_undirected_values( A3_undirected_rows.size(), 1 );
-		grb::buildMatrixUnique( A3_undirected, A3_undirected_rows.data(), A3_undirected_cols.data(), A3_undirected_values.data(), A3_undirected_values.size(), grb::IOMode::PARALLEL );
-		input_t input_A3_undirected { A3_undirected, expected_triangle_count };
-		output_t output_A3_undirected;
-		std::cout << "-- Running test on A3_undirected" << std::endl;
-		grb::RC bench_rc = benchmarker.exec( &grbProgram, input_A3_undirected, output_A3_undirected, niterations, 1 );
-		if( bench_rc ) {
-			std::cerr << "ERROR during execution of A3_undirected: rc = " << bench_rc << std::endl;
-			return bench_rc;
-		} else if( output_A3_undirected.rc ) {
-			std::cerr << "Test failed: rc = " << output_A3_undirected.rc << std::endl;
-			return output_A3_undirected.rc;
-		}
-		std::cout << std::endl;
+	return 0;
+}
+
+bool parse_arguments( int argc, char ** argv, input & in, int& err ) {
+	// Check if we are testing on a file
+	if( argc < 4 || argc > 6 ) {
+		std::cerr << "Usages: \n\t" 
+			<< argv[ 0 ] << " <graph_filepath> <direct/indirect> <expected_triangle_count> (inner iterations) (outer iterations)" 
+			<< std::endl;
+		err = 1;
+		return false;
 	}
 
-	/** Given matrix in input **/
-	if( test_on_file ) {
-		std::cout << "-- Running test on file " << file_to_test << std::endl;
+	// Get file name
+	(void)strncpy( in.filename, argv[ 1 ], 1023 );
+	in.filename[ 1023 ] = '\0';
 
-		// Read matrix from file as a pattern matrix (i.e. no values), then convert it to a nonzeroval_t matrix
-		grb::utils::MatrixFileReader< void > reader( file_to_test, false, true );
-		size_t r = reader.n(), c = reader.m();
-		if( r != c ) {
-			std::cerr << "ERROR: matrix needs to be square" << std::endl;
-			return 1;
-		}
-		grb::Matrix< void > A_pattern( r, r );
-		grb::RC rc_build = buildMatrixUnique( A_pattern, reader.cbegin( grb::IOMode::PARALLEL ), reader.cend( grb::IOMode::PARALLEL ), grb::IOMode::PARALLEL );
-		if( rc_build != grb::RC::SUCCESS ) {
-			std::cerr << "ERROR during buildMatrixUnique of the pattern matrix: rc = " << rc_build << std::endl;
-			return 1;
-		}
-		grb::Matrix< nonzeroval_t > A( r, r );
-		std::vector< size_t > A_rows, A_cols;
-		A_rows.reserve( grb::nnz( A_pattern ) );
-		A_cols.reserve( grb::nnz( A_pattern ) );
-		for( const std::pair< size_t, size_t > & p : A_pattern ) {
-			A_rows.push_back( p.first );
-			A_cols.push_back( p.second );
-		}
-		std::vector< nonzeroval_t > A_values( grb::nnz( A_pattern ), static_cast< nonzeroval_t >( 1 ) );
-		rc_build = grb::buildMatrixUnique( A, A_rows.data(), A_cols.data(), A_values.data(), A_values.size(), grb::IOMode::PARALLEL );
-		if( rc_build != grb::RC::SUCCESS ) {
-			std::cerr << "ERROR during buildMatrixUnique of the integer matrix: rc = " << rc_build << std::endl;
-			return 1;
-		}
-		std::cout << "Matrix read successfully" << std::endl;
-		input_t input { A, expected_file_triangles };
-		output_t output;
-		grb::RC bench_rc = benchmarker.exec( &grbProgram, input, output, niterations, 1 );
-		if( bench_rc ) {
-			std::cerr << "ERROR during execution of file " << file_to_test << ": rc = " << bench_rc << std::endl;
-			return bench_rc;
-		} else if( output.rc ) {
-			std::cerr << "Test failed: rc = " << output.rc << std::endl;
-			return output.rc;
+	// Get direct or indirect addressing
+	in.direct = ( strncmp( argv[ 2 ], "direct", 6 ) == 0 );
+
+	// Get the expected number of triangles
+	in.expectedTriangleCount = std::stoul( argv[ 3 ] );
+
+	// Get the inner number of iterations
+	in.inner_rep = grb::config::BENCHMARKING::inner();
+	char * end = nullptr;
+	if( argc > 4 ) {
+		in.inner_rep = strtoumax( argv[ 4 ], &end, 10 );
+		if( argv[ 4 ] == end ) {
+			std::cerr << "Could not parse argument " << argv[ 4 ] << " "
+				<< "for number of inner experiment repititions." << std::endl;
+			err = 4;
+			return false;
 		}
 	}
 
-	std::cout << "Test OK" << std::endl;
-
-	return 0;
-}
+	// Get the outer number of iterations
+	in.outer_rep = grb::config::BENCHMARKING::outer();
+	if( argc > 5 ) {
+		in.outer_rep = strtoumax( argv[ 5 ], &end, 10 );
+		if( argv[ 5 ] == end ) {
+			std::cerr << "Could not parse argument " << argv[ 5 ] << " "
+				<< "for number of outer experiment repititions." << std::endl;
+			err = 5;
+			return false;
+		}
+	}
+	return true;
+}
\ No newline at end of file

From 22319fb5e403f21eb3fef511a9517f5fc44a63b2 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Sat, 10 Jun 2023 12:04:47 +0200
Subject: [PATCH 52/63] Check for non-zero values on the diagonal

---
 tests/smoke/triangle_count.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/smoke/triangle_count.cpp b/tests/smoke/triangle_count.cpp
index aa598b2fa..c845d365f 100644
--- a/tests/smoke/triangle_count.cpp
+++ b/tests/smoke/triangle_count.cpp
@@ -138,6 +138,14 @@ void grbProgram( const input & data_in, output & out ) {
 	}
 	out.times.io = timer.time();
 
+	// Check that the input matrix does not contains self-loops
+	for( const auto & p : A ) {
+		if( p.first.first == p.first.second ) {
+			std::cerr << "Failure: input matrix contains self-loops." << std::endl;
+			return;
+		}
+	}
+
 
 	timer.reset();
 	// Allocate the buffers

From dd8e90110e7261b927a38966efc27ba9a794cbf6 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Sat, 10 Jun 2023 12:15:55 +0200
Subject: [PATCH 53/63] Skip diagonal values while reading matrix

---
 tests/smoke/triangle_count.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/smoke/triangle_count.cpp b/tests/smoke/triangle_count.cpp
index c845d365f..e4631e2bc 100644
--- a/tests/smoke/triangle_count.cpp
+++ b/tests/smoke/triangle_count.cpp
@@ -127,13 +127,18 @@ void grbProgram( const input & data_in, output & out ) {
 		}
 		// Build A from A_pattern, filled with static_cast< IntegerType>( 1 )
 		std::vector< size_t > rows, cols;
-		std::vector< IntegerType > values( nnz( A_pattern ), static_cast< IntegerType >( 1 ) );
 		rows.reserve( nnz( A_pattern ) );
 		cols.reserve( nnz( A_pattern ) );
 		for( const std::pair< size_t, size_t > p : A_pattern ) {
+			// FIXME: this is a workaround while waiting for a masked version of mxm
+			if( p.first == p.second ) {
+				continue;
+			}
+			
 			rows.push_back( p.first );
 			cols.push_back( p.second );
 		}
+		std::vector< IntegerType > values( rows.size(), static_cast< IntegerType >( 1 ) );
 		buildMatrixUnique( A, rows.data(), cols.data(), values.data(), values.size(), IOMode::SEQUENTIAL );
 	}
 	out.times.io = timer.time();

From dd5f7ee471e181cb3fe2cd39791f509514feb31e Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Sat, 10 Jun 2023 12:16:40 +0200
Subject: [PATCH 54/63] Adding triangle_count tests for gyro_m & dwt_59

---
 .../graphblas/algorithms/triangle_count.hpp   |  4 ++--
 tests/smoke/smoketests.sh                     | 21 +++++++++++++++++--
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/include/graphblas/algorithms/triangle_count.hpp b/include/graphblas/algorithms/triangle_count.hpp
index d2800c906..062e927f5 100644
--- a/include/graphblas/algorithms/triangle_count.hpp
+++ b/include/graphblas/algorithms/triangle_count.hpp
@@ -35,7 +35,7 @@
 
 #include <graphblas.hpp>
 
-constexpr bool Debug = true;
+constexpr bool Debug = false;
 
 namespace grb {
 
@@ -46,7 +46,7 @@ namespace grb {
 			template< class Iterator >
 			void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) {
 				std::cout << "Matrix \"" << name << "\" (" << rows << "x" << cols << "):" << std::endl << "[" << std::endl;
-				if( rows > 1000 || cols > 1000 ) {
+				if( rows > 100 || cols > 100 ) {
 					os << "   Matrix too large to print" << std::endl;
 				} else {
 					// os.precision( 3 );
diff --git a/tests/smoke/smoketests.sh b/tests/smoke/smoketests.sh
index 5d2512869..2421ba7cb 100755
--- a/tests/smoke/smoketests.sh
+++ b/tests/smoke/smoketests.sh
@@ -366,9 +366,9 @@ for BACKEND in ${BACKENDS[@]}; do
 			fi
 			echo " "
 
-			echo ">>>      [x]           [ ]       Testing the Triangle couting algorithm."
+			echo ">>>      [x]           [ ]       Testing the Triangle couting algorithm on the dwt_59.mtx"
 			if [ -f ${INPUT_DIR}/dwt_59.mtx ]; then
-				$runner ${TEST_BIN_DIR}/triangle_count_${BACKEND} ${INPUT_DIR}/dwt_59.mtx 30 &> ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log
+				$runner ${TEST_BIN_DIR}/triangle_count_${BACKEND} ${INPUT_DIR}/dwt_59.mtx direct 30 1 1 &> ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log
 				head -1 ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log
 				if ! grep -q 'Test OK' ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log; then
 					echo "Test FAILED"
@@ -383,6 +383,23 @@ for BACKEND in ${BACKENDS[@]}; do
 			fi
 			echo " "
 
+			echo ">>>      [x]           [ ]       Testing the Triangle couting algorithm on the gyro_m.mtx"
+			if [ -f ${INPUT_DIR}/gyro_m.mtx ]; then
+				$runner ${TEST_BIN_DIR}/triangle_count_${BACKEND} ${INPUT_DIR}/gyro_m.mtx direct 598470 1 1 &> ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log
+				head -1 ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log
+				if ! grep -q 'Test OK' ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log; then
+					echo "Test FAILED"
+				elif ! grep -q '11 iterations to converge' ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log; then
+					echo "Verification FAILED"
+					echo "Test FAILED"
+				else
+					echo "Test OK"
+				fi
+			else
+				echo "Test DISABLED: gyro_m.mtx was not found. To enable, please provide ${INPUT_DIR}/gyro_m.mtx"
+			fi
+			echo " "
+
 			if [ "$BACKEND" = "bsp1d" ] || [ "$BACKEND" = "hybrid" ]; then
 				echo "Additional standardised smoke tests not yet supported for the ${BACKEND} backend"
 				echo

From 888cb3fa7fdb378aa9c466e91d5cc7c7bf2dcadf Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Thu, 22 Jun 2023 14:20:07 +0200
Subject: [PATCH 55/63] Integration of grb::tril+u

---
 .../graphblas/algorithms/triangle_count.hpp   | 235 +++++-------------
 tests/smoke/triangle_count.cpp                |   6 +-
 2 files changed, 66 insertions(+), 175 deletions(-)

diff --git a/include/graphblas/algorithms/triangle_count.hpp b/include/graphblas/algorithms/triangle_count.hpp
index 062e927f5..98bb1eec3 100644
--- a/include/graphblas/algorithms/triangle_count.hpp
+++ b/include/graphblas/algorithms/triangle_count.hpp
@@ -84,154 +84,31 @@ namespace grb {
 			}
 		} // namespace utils
 
-		namespace {
-
-			template< typename Iterator >
-			class ConditionalIterator : public std::iterator< std::input_iterator_tag, typename std::iterator_traits< Iterator >::value_type > {
-
-			public:
-				typedef typename std::iterator_traits< Iterator >::value_type value_type;
-				typedef typename std::iterator_traits< Iterator >::pointer pointer;
-				typedef typename std::iterator_traits< Iterator >::reference reference;
-				typedef typename std::iterator_traits< Iterator >::iterator_category iterator_category;
-				typedef typename std::iterator_traits< Iterator >::difference_type difference_type;
-
-				ConditionalIterator( std::function< bool( typename Iterator::value_type ) > func, Iterator it, Iterator endbound ) : _iterator( it ), _endbound( endbound ), _condition( func ) {
-					while( _iterator != _endbound && ! _condition( *_iterator ) )
-						++( *this );
-				}
-
-				ConditionalIterator( const ConditionalIterator & other ) : _iterator( other._iterator ), _endbound( other._endbound ), _condition( other._condition ) {}
-
-				// Overload the dereference operator
-				value_type operator*() const {
-					return *_iterator;
-				}
-
-				// Overload the arrow operator
-				value_type operator->() const {
-					return *_iterator;
-				}
-
-				// Overload the increment operator
-				ConditionalIterator & operator++() {
-					do
-						++_iterator;
-					while( _iterator != _endbound && ! _condition( *_iterator ) );
-					return *this;
-				}
-
-				// Overload the inequality operator
-				bool operator!=( const ConditionalIterator & other ) const {
-					return _iterator != other._iterator;
-				}
-
-				// Overload the equality operator
-				bool operator==( const ConditionalIterator & other ) const {
-					return _iterator == other._iterator;
-				}
-
-			private:
-				Iterator _iterator, _endbound;
-				std::function< bool( typename Iterator::value_type ) > _condition;
-			};
-
-			template< typename D >
-			class MatrixConditionalAccessor {
-				typedef ConditionalIterator< typename grb::Matrix< D >::const_iterator > iterator_type;
-
-			public:
-				MatrixConditionalAccessor( const std::function< bool( std::pair< std::pair< size_t, size_t >, D > ) > & f, const grb::Matrix< D > & A ) :
-					_begin( f, A.cbegin(), A.cend() ), _end( f, A.cend(), A.cend() ) {}
-
-				MatrixConditionalAccessor( const MatrixConditionalAccessor & other ) = delete;
-
-				MatrixConditionalAccessor & operator=( const MatrixConditionalAccessor & other ) = delete;
-
-				virtual ~MatrixConditionalAccessor() {}
-
-				iterator_type cbegin() const {
-					return _begin;
-				}
-
-				iterator_type begin() const {
-					return cbegin();
-				}
-
-				iterator_type cend() const {
-					return _end;
-				}
-
-				iterator_type end() const {
-					return cend();
-				}
-
-			private:
-				iterator_type _begin, _end;
-			};
-
-			template< typename D >
-			class LUMatrixAccessor {
-			public:
-				LUMatrixAccessor( const grb::Matrix< D > & A ) :
-					_lower(
-						[]( const std::pair< std::pair< size_t, size_t >, D > & a ) {
-							return a.first.first > a.first.second;
-						},
-						A ),
-					_upper(
-						[]( const std::pair< std::pair< size_t, size_t >, D > & a ) {
-							return a.first.first < a.first.second;
-						},
-						A ) {}
-
-				MatrixConditionalAccessor< D > & lower() {
-					return _lower;
-				}
-
-				MatrixConditionalAccessor< D > & upper() {
-					return _upper;
-				}
-
-			private:
-				MatrixConditionalAccessor< D > _lower, _upper;
-			};
-
-			template< typename D, typename I, typename J >
-			grb::RC trilu( const grb::Matrix< D, grb::config::default_backend, I, J > & A,
-				grb::Matrix< D, grb::config::default_backend, I, J > & L,
-				grb::Matrix< D, grb::config::default_backend, I, J > & U ) {
-				//
-				grb::RC rc = grb::RC::SUCCESS;
-
-				// Create the custom accessor
-				grb::wait( A );
-				LUMatrixAccessor< D > luAccesor( A );
-
-				// Create the lower and upper matrices from the accessor
-				const std::vector< std::pair< std::pair< I, J >, D > > nnzs_lower( luAccesor.lower().cbegin(), luAccesor.lower().cend() );
-				grb::buildMatrixUnique( L, grb::utils::makeNonzeroIterator< I, J, D >( nnzs_lower.cbegin() ), grb::utils::makeNonzeroIterator< I, J, D >( nnzs_lower.cend() ), IOMode::PARALLEL );
-				const std::vector< std::pair< std::pair< I, J >, D > > nnzs_upper( luAccesor.upper().cbegin(), luAccesor.upper().cend() );
-				grb::buildMatrixUnique( U, grb::utils::makeNonzeroIterator< I, J, D >( nnzs_upper.cbegin() ), grb::utils::makeNonzeroIterator< I, J, D >( nnzs_upper.cend() ), IOMode::PARALLEL );
-
-				return rc;
-			}
-
-		} // namespace
-
 		enum class TriangleCountAlgorithm { Burkhardt, Cohen, Sandia_TT };
 
 		std::map< TriangleCountAlgorithm, std::string > TriangleCountAlgorithmNames = { { TriangleCountAlgorithm::Burkhardt, "Burkhardt" }, { TriangleCountAlgorithm::Cohen, "Cohen" },
 			{ TriangleCountAlgorithm::Sandia_TT, "Sandia_TT" } };
 
-		template< Descriptor descr = descriptors::no_operation, typename D, typename I, typename J, class Semiring, class MulMonoid, class SumMonoid >
-		RC triangle_count_generic( size_t & count,
-			Matrix< D, grb::config::default_backend, I, J > & MXM_out,
-			const Matrix< D, grb::config::default_backend, I, J > & MXM_lhs,
-			const Matrix< D, grb::config::default_backend, I, J > & MXM_rhs,
-			Matrix< D, grb::config::default_backend, I, J > & EWA_out,
-			const Matrix< D, grb::config::default_backend, I, J > & EWA_rhs,
-			const D div_factor,
+		template<
+			class Semiring, class MulMonoid, class SumMonoid,
+			Descriptor descr_mxm = descriptors::no_operation,
+			Descriptor descr_ewa = descriptors::no_operation,
+			Descriptor descr_reduce = descriptors::no_operation,
+			typename D1, typename RIT1, typename CIT1, typename NIT1,
+			typename D2, typename RIT2, typename CIT2, typename NIT2,
+			typename D3, typename RIT3, typename CIT3, typename NIT3,
+			typename D4, typename RIT4, typename CIT4, typename NIT4,
+			typename D5, typename RIT5, typename CIT5, typename NIT5,
+			typename D6
+		>
+		RC triangle_count_generic(
+			size_t & count,
+			Matrix< D1, grb::config::default_backend, RIT1, CIT1, NIT1 > & MXM_out,
+			const Matrix< D2, grb::config::default_backend, RIT2, CIT2, NIT2 > & MXM_lhs,
+			const Matrix< D3, grb::config::default_backend, RIT3, CIT3, NIT3 > & MXM_rhs,
+			Matrix< D4, grb::config::default_backend, RIT4, CIT4, NIT4 > & EWA_out,
+			const Matrix< D5, grb::config::default_backend, RIT5, CIT5, NIT5 > & EWA_rhs,
+			const D6 div_factor,
 			const Semiring mxm_semiring = Semiring(),
 			const MulMonoid ewiseapply_monoid = MulMonoid(),
 			const SumMonoid sumreduce_monoid = SumMonoid() ) {
@@ -243,8 +120,8 @@ namespace grb {
 			// Compute MXM_out = Mlhs * Mrhs
 			utils::printSparseMatrix< Debug >( MXM_lhs, "MXM_lhs" );
 			utils::printSparseMatrix< Debug >( MXM_rhs, "MXM_rhs" );
-			rc = rc ? rc : mxm< descr >( MXM_out, MXM_lhs, MXM_rhs, mxm_semiring, Phase::RESIZE );
-			rc = rc ? rc : mxm< descr >( MXM_out, MXM_lhs, MXM_rhs, mxm_semiring, Phase::EXECUTE );
+			rc = rc ? rc : mxm< descr_mxm >( MXM_out, MXM_lhs, MXM_rhs, mxm_semiring, Phase::RESIZE );
+			rc = rc ? rc : mxm< descr_mxm >( MXM_out, MXM_lhs, MXM_rhs, mxm_semiring, Phase::EXECUTE );
 			utils::printSparseMatrix< Debug >( MXM_out, "MXM_out = mxm( MXM_lhs, MXM_rhs )" );
 
 			// Compute MXM_out .*= EWA_rhs
@@ -252,16 +129,16 @@ namespace grb {
 
 			// FIXME: Replace by a foldl( Matrix[in,out], Matrix[in], Monoid ) - not implemented yet
 			// Will then become:
-			// rc = rc ? rc : eWiseApply< descr >( MXM_out, MXM_out, EWA_rhs, ewiseapply_monoid, Phase::RESIZE );
-			// rc = rc ? rc : eWiseApply< descr >( MXM_out, MXM_out, EWA_rhs, ewiseapply_monoid, Phase::EXECUTE );
+			// rc = rc ? rc : eWiseApply< descr_ewa >( MXM_out, MXM_out, EWA_rhs, ewiseapply_monoid, Phase::RESIZE );
+			// rc = rc ? rc : eWiseApply< descr_ewa >( MXM_out, MXM_out, EWA_rhs, ewiseapply_monoid, Phase::EXECUTE );
 			// Instead of:
-			rc = rc ? rc : eWiseApply< descr >( EWA_out, MXM_out, EWA_rhs, ewiseapply_monoid, Phase::RESIZE );
-			rc = rc ? rc : eWiseApply< descr >( EWA_out, MXM_out, EWA_rhs, ewiseapply_monoid, Phase::EXECUTE );
+			rc = rc ? rc : eWiseApply< descr_ewa >( EWA_out, MXM_out, EWA_rhs, ewiseapply_monoid, Phase::RESIZE );
+			rc = rc ? rc : eWiseApply< descr_ewa >( EWA_out, MXM_out, EWA_rhs, ewiseapply_monoid, Phase::EXECUTE );
 			utils::printSparseMatrix< Debug >( EWA_out, "EWA_out = ewiseapply( MXM_out, EWA_rhs )" );
 
 			// Compute a sum reduction over <EWA_out> in <count>
-			count = 0;
-			rc = rc ? rc : foldl< descr >( count, EWA_out, sumreduce_monoid );
+			count = static_cast< size_t >( 0 );
+			rc = rc ? rc : foldl< descr_reduce >( count, EWA_out, sumreduce_monoid );
 			utils::printf< Debug >( "count = foldl(EWA_out) = " + std::to_string( count ) + "\n" );
 
 			// Apply the div_factor to the reduction result
@@ -304,22 +181,26 @@ namespace grb {
 		 * performance semantics, with the exception of getters such as #grb::nnz, are
 		 * specific to the backend selected during compilation.
 		 */
-		template< Descriptor descr = descriptors::no_operation,
-			typename D,
-			typename I,
-			typename J,
-			class Semiring = grb::Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
-			class MulMonoid = grb::Monoid< grb::operators::mul< D >, identities::one >,
-			class SumMonoid = grb::Monoid< operators::add< size_t, D, size_t >, identities::zero > >
-		RC triangle_count( const TriangleCountAlgorithm algo,
+		template<
+			Descriptor descr = descriptors::no_operation,
+			typename D1, typename RIT1, typename CIT1, typename NIT1,
+			typename D2, typename RIT2, typename CIT2, typename NIT2,
+			typename D3, typename RIT3, typename CIT3, typename NIT3,
+			typename D4, typename RIT4, typename CIT4, typename NIT4,
+			class Semiring = grb::Semiring< operators::add< D1 >, operators::mul< D1 >, identities::zero, identities::one >,
+			class MulMonoid = grb::Monoid< grb::operators::mul< D1 >, identities::one >,
+			class SumMonoid = grb::Monoid< operators::add< size_t, D1, size_t >, identities::zero > >
+		RC triangle_count(
+			const TriangleCountAlgorithm algo,
 			size_t & count,
-			const Matrix< D, grb::config::default_backend, I, J > & A,
-			Matrix< D, grb::config::default_backend, I, J > & MXM_out,
-			Matrix< D, grb::config::default_backend, I, J > & EWA_out,
-			Matrix< D, grb::config::default_backend, I, J > & L = { 0, 0 },
-			Matrix< D, grb::config::default_backend, I, J > & U = { 0, 0 } ) {
+			const Matrix< D1, grb::config::default_backend, RIT1, CIT1, NIT1 > & A,
+			Matrix< D2, grb::config::default_backend, RIT2, CIT2, NIT2 > & MXM_out,
+			Matrix< D3, grb::config::default_backend, RIT3, CIT3, NIT3 > & EWA_out,
+			Matrix< D4, grb::config::default_backend, RIT4, CIT4, NIT4 > & L = { 0, 0 },
+			Matrix< D4, grb::config::default_backend, RIT4, CIT4, NIT4 > & U = { 0, 0 }
+		) {
 			// Static assertions
-			static_assert( std::is_integral< D >::value, "Type D must be integral" );
+			static_assert( std::is_integral< D1 >::value, "Type D1 must be integral" );
 
 			// Sanity checks
 			if( nrows( A ) != ncols( A ) ) {
@@ -362,29 +243,37 @@ namespace grb {
 			// Dispatch to the appropriate algorithm
 			switch( algo ) {
 				case TriangleCountAlgorithm::Burkhardt: {
-					return triangle_count_generic< descr | descriptors::transpose_right, D, I, J, Semiring, MulMonoid, SumMonoid >( count, MXM_out, A, A, EWA_out, A, 6 );
+					return triangle_count_generic<
+						Semiring, MulMonoid, SumMonoid,
+						descr | descriptors::transpose_right
+					>( count, MXM_out, A, A, EWA_out, A, 6UL );
 				}
 
 				case TriangleCountAlgorithm::Cohen: {
-					trilu( A, L, U );
-					if( nrows( L ) + ncols( L ) == 0 ) {
+					if( nrows( L ) == 0 || ncols( L ) == 0 ) {
 						std::cerr << "Matrix L must be provided for the Cohen algorithm" << std::endl;
 						return RC::MISMATCH;
-					} else if( nrows( U ) + ncols( U ) == 0 ) {
+					}
+					if( nrows( U ) == 0 || ncols( U ) == 0 ) {
 						std::cerr << "Matrix U must be provided for the Cohen algorithm" << std::endl;
 						return RC::MISMATCH;
 					}
-					return triangle_count_generic< descr, D, I, J, Semiring, MulMonoid, SumMonoid >( count, MXM_out, L, U, EWA_out,  A, 2 );
+
+					return triangle_count_generic<
+						Semiring, MulMonoid, SumMonoid
+					>( count, MXM_out, L, U, EWA_out,  A, 2UL );
 				}
 
 				case TriangleCountAlgorithm::Sandia_TT: {
-					trilu( A, L, U );
 					if( ( nrows( U ) == 0 || ncols( U ) == 0 ) && ( nrows( L ) == 0 || ncols( L ) == 0 ) ) {
 						std::cerr << "Matrix L or U must be provided for the Sandia_TT algorithm" << std::endl;
 						return RC::MISMATCH;
 					}
-					const Matrix< D, grb::config::default_backend, I, J > & T = ( nrows( U ) == 0 || ncols( U ) == 0 ) ? L : U;
-					return triangle_count_generic< descr, D, I, J, Semiring, MulMonoid, SumMonoid >( count, MXM_out, T, T, EWA_out, T, 1 );
+
+					const Matrix< D4, grb::config::default_backend, RIT4, CIT4, NIT4 > & T = ( nrows( U ) == 0 || ncols( U ) == 0 ) ? L : U;
+					return triangle_count_generic<
+						Semiring, MulMonoid, SumMonoid
+					>( count, MXM_out, T, T, EWA_out, T, 1UL );
 				}
 
 				default:
diff --git a/tests/smoke/triangle_count.cpp b/tests/smoke/triangle_count.cpp
index e4631e2bc..942bbaf71 100644
--- a/tests/smoke/triangle_count.cpp
+++ b/tests/smoke/triangle_count.cpp
@@ -151,7 +151,6 @@ void grbProgram( const input & data_in, output & out ) {
 		}
 	}
 
-
 	timer.reset();
 	// Allocate the buffers
 	Matrix< IntegerType > buffer( n, n );
@@ -159,7 +158,10 @@ void grbProgram( const input & data_in, output & out ) {
 	Matrix< IntegerType > L( n, n );
 	Matrix< IntegerType > U( n, n );
 	// Split A into L and U
-	// TODO:
+	grb::tril( L, A, Phase::RESIZE );
+	grb::triu( U, A, Phase::RESIZE );
+	grb::tril( L, A, Phase::EXECUTE );
+	grb::triu( U, A, Phase::EXECUTE );
 	out.times.preamble = timer.time();
 
 	timer.reset();

From a89b6ebeab2b614ca6155c488a92b826d19c3b72 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Thu, 22 Jun 2023 15:38:46 +0200
Subject: [PATCH 56/63] Restrict triangle_count test to reference+omp &
 hyperdags backends

---
 tests/smoke/CMakeLists.txt |  2 +-
 tests/smoke/smoketests.sh  | 56 ++++++++++++++++++++------------------
 2 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/tests/smoke/CMakeLists.txt b/tests/smoke/CMakeLists.txt
index 091d0fe10..8a8404f08 100644
--- a/tests/smoke/CMakeLists.txt
+++ b/tests/smoke/CMakeLists.txt
@@ -182,7 +182,7 @@ add_grb_executables( kcore_decomposition kcore_decomposition.cpp
 
 add_grb_executables( triangle_count triangle_count.cpp
 	ADDITIONAL_LINK_LIBRARIES test_utils_headers
-	BACKENDS reference reference_omp
+	BACKENDS reference reference_omp hyperdags
 )
 
 # targets to list and build the test for this category
diff --git a/tests/smoke/smoketests.sh b/tests/smoke/smoketests.sh
index 2421ba7cb..567aef7dd 100755
--- a/tests/smoke/smoketests.sh
+++ b/tests/smoke/smoketests.sh
@@ -366,39 +366,41 @@ for BACKEND in ${BACKENDS[@]}; do
 			fi
 			echo " "
 
-			echo ">>>      [x]           [ ]       Testing the Triangle couting algorithm on the dwt_59.mtx"
-			if [ -f ${INPUT_DIR}/dwt_59.mtx ]; then
-				$runner ${TEST_BIN_DIR}/triangle_count_${BACKEND} ${INPUT_DIR}/dwt_59.mtx direct 30 1 1 &> ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log
-				head -1 ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log
-				if ! grep -q 'Test OK' ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log; then
-					echo "Test FAILED"
-				elif ! grep -q '11 iterations to converge' ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log; then
-					echo "Verification FAILED"
-					echo "Test FAILED"
+			if [ "$BACKEND" = "reference" ] || [ "$BACKEND" = "reference_omp" ] || [ "$BACKEND" = "hyperdags" ]; then
+				echo ">>>      [x]           [ ]       Testing the Triangle couting algorithm on the dwt_59.mtx"
+				if [ -f ${INPUT_DIR}/dwt_59.mtx ]; then
+					$runner ${TEST_BIN_DIR}/triangle_count_${BACKEND} ${INPUT_DIR}/dwt_59.mtx direct 30 1 1 &> ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log
+					head -1 ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log
+					if ! grep -q 'Test OK' ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log; then
+						echo "Test FAILED"
+					elif ! grep -q '11 iterations to converge' ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log; then
+						echo "Verification FAILED"
+						echo "Test FAILED"
+					else
+						echo "Test OK"
+					fi
 				else
-					echo "Test OK"
+					echo "Test DISABLED: dwt_59.mtx was not found. To enable, please provide ${INPUT_DIR}/dwt_59.mtx"
 				fi
-			else
-				echo "Test DISABLED: dwt_59.mtx was not found. To enable, please provide ${INPUT_DIR}/dwt_59.mtx"
-			fi
-			echo " "
+				echo " "
 
-			echo ">>>      [x]           [ ]       Testing the Triangle couting algorithm on the gyro_m.mtx"
-			if [ -f ${INPUT_DIR}/gyro_m.mtx ]; then
-				$runner ${TEST_BIN_DIR}/triangle_count_${BACKEND} ${INPUT_DIR}/gyro_m.mtx direct 598470 1 1 &> ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log
-				head -1 ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log
-				if ! grep -q 'Test OK' ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log; then
-					echo "Test FAILED"
-				elif ! grep -q '11 iterations to converge' ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log; then
-					echo "Verification FAILED"
-					echo "Test FAILED"
+				echo ">>>      [x]           [ ]       Testing the Triangle couting algorithm on the gyro_m.mtx"
+				if [ -f ${INPUT_DIR}/gyro_m.mtx ]; then
+					$runner ${TEST_BIN_DIR}/triangle_count_${BACKEND} ${INPUT_DIR}/gyro_m.mtx direct 598470 1 1 &> ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log
+					head -1 ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log
+					if ! grep -q 'Test OK' ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log; then
+						echo "Test FAILED"
+					elif ! grep -q '11 iterations to converge' ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log; then
+						echo "Verification FAILED"
+						echo "Test FAILED"
+					else
+						echo "Test OK"
+					fi
 				else
-					echo "Test OK"
+					echo "Test DISABLED: gyro_m.mtx was not found. To enable, please provide ${INPUT_DIR}/gyro_m.mtx"
 				fi
-			else
-				echo "Test DISABLED: gyro_m.mtx was not found. To enable, please provide ${INPUT_DIR}/gyro_m.mtx"
+				echo " "
 			fi
-			echo " "
 
 			if [ "$BACKEND" = "bsp1d" ] || [ "$BACKEND" = "hybrid" ]; then
 				echo "Additional standardised smoke tests not yet supported for the ${BACKEND} backend"

From 9e3007eaf3ca6f114386dbc3b04cad6727e25b43 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Thu, 22 Jun 2023 17:16:35 +0200
Subject: [PATCH 57/63] Fix triangle_count test verification

---
 tests/smoke/smoketests.sh | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/tests/smoke/smoketests.sh b/tests/smoke/smoketests.sh
index 567aef7dd..a28bac753 100755
--- a/tests/smoke/smoketests.sh
+++ b/tests/smoke/smoketests.sh
@@ -373,9 +373,6 @@ for BACKEND in ${BACKENDS[@]}; do
 					head -1 ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log
 					if ! grep -q 'Test OK' ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log; then
 						echo "Test FAILED"
-					elif ! grep -q '11 iterations to converge' ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log; then
-						echo "Verification FAILED"
-						echo "Test FAILED"
 					else
 						echo "Test OK"
 					fi
@@ -390,9 +387,6 @@ for BACKEND in ${BACKENDS[@]}; do
 					head -1 ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log
 					if ! grep -q 'Test OK' ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log; then
 						echo "Test FAILED"
-					elif ! grep -q '11 iterations to converge' ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log; then
-						echo "Verification FAILED"
-						echo "Test FAILED"
 					else
 						echo "Test OK"
 					fi

From 2ecdf1ec8b4dcda1fe110f2800a7e77450ccc41d Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Thu, 29 Jun 2023 11:20:41 +0200
Subject: [PATCH 58/63] Allow triangle_count test to run on all backends

---
 tests/smoke/CMakeLists.txt |  2 +-
 tests/smoke/smoketests.sh  | 44 ++++++++++++++++++--------------------
 2 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/tests/smoke/CMakeLists.txt b/tests/smoke/CMakeLists.txt
index 8a8404f08..3e0cf830d 100644
--- a/tests/smoke/CMakeLists.txt
+++ b/tests/smoke/CMakeLists.txt
@@ -182,7 +182,7 @@ add_grb_executables( kcore_decomposition kcore_decomposition.cpp
 
 add_grb_executables( triangle_count triangle_count.cpp
 	ADDITIONAL_LINK_LIBRARIES test_utils_headers
-	BACKENDS reference reference_omp hyperdags
+	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
 )
 
 # targets to list and build the test for this category
diff --git a/tests/smoke/smoketests.sh b/tests/smoke/smoketests.sh
index a28bac753..fc83ba4c8 100755
--- a/tests/smoke/smoketests.sh
+++ b/tests/smoke/smoketests.sh
@@ -366,35 +366,33 @@ for BACKEND in ${BACKENDS[@]}; do
 			fi
 			echo " "
 
-			if [ "$BACKEND" = "reference" ] || [ "$BACKEND" = "reference_omp" ] || [ "$BACKEND" = "hyperdags" ]; then
-				echo ">>>      [x]           [ ]       Testing the Triangle couting algorithm on the dwt_59.mtx"
-				if [ -f ${INPUT_DIR}/dwt_59.mtx ]; then
-					$runner ${TEST_BIN_DIR}/triangle_count_${BACKEND} ${INPUT_DIR}/dwt_59.mtx direct 30 1 1 &> ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log
-					head -1 ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log
-					if ! grep -q 'Test OK' ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log; then
-						echo "Test FAILED"
-					else
-						echo "Test OK"
-					fi
+			echo ">>>      [x]           [ ]       Testing the Triangle couting algorithm on the dwt_59.mtx"
+			if [ -f ${INPUT_DIR}/dwt_59.mtx ]; then
+				$runner ${TEST_BIN_DIR}/triangle_count_${BACKEND} ${INPUT_DIR}/dwt_59.mtx direct 30 1 1 &> ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log
+				head -1 ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log
+				if ! grep -q 'Test OK' ${TEST_OUT_DIR}/triangle_count_dwt_59_${BACKEND}_${P}_${T}.log; then
+					echo "Test FAILED"
 				else
-					echo "Test DISABLED: dwt_59.mtx was not found. To enable, please provide ${INPUT_DIR}/dwt_59.mtx"
+					echo "Test OK"
 				fi
-				echo " "
+			else
+				echo "Test DISABLED: dwt_59.mtx was not found. To enable, please provide ${INPUT_DIR}/dwt_59.mtx"
+			fi
+			echo " "
 
-				echo ">>>      [x]           [ ]       Testing the Triangle couting algorithm on the gyro_m.mtx"
-				if [ -f ${INPUT_DIR}/gyro_m.mtx ]; then
-					$runner ${TEST_BIN_DIR}/triangle_count_${BACKEND} ${INPUT_DIR}/gyro_m.mtx direct 598470 1 1 &> ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log
-					head -1 ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log
-					if ! grep -q 'Test OK' ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log; then
-						echo "Test FAILED"
-					else
-						echo "Test OK"
-					fi
+			echo ">>>      [x]           [ ]       Testing the Triangle couting algorithm on the gyro_m.mtx"
+			if [ -f ${INPUT_DIR}/gyro_m.mtx ]; then
+				$runner ${TEST_BIN_DIR}/triangle_count_${BACKEND} ${INPUT_DIR}/gyro_m.mtx direct 598470 1 1 &> ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log
+				head -1 ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log
+				if ! grep -q 'Test OK' ${TEST_OUT_DIR}/triangle_count_gyro_m_${BACKEND}_${P}_${T}.log; then
+					echo "Test FAILED"
 				else
-					echo "Test DISABLED: gyro_m.mtx was not found. To enable, please provide ${INPUT_DIR}/gyro_m.mtx"
+					echo "Test OK"
 				fi
-				echo " "
+			else
+				echo "Test DISABLED: gyro_m.mtx was not found. To enable, please provide ${INPUT_DIR}/gyro_m.mtx"
 			fi
+			echo " "
 
 			if [ "$BACKEND" = "bsp1d" ] || [ "$BACKEND" = "hybrid" ]; then
 				echo "Additional standardised smoke tests not yet supported for the ${BACKEND} backend"

From 8933de16391f5b940e41a2eef72a780f53864110 Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Thu, 29 Jun 2023 11:22:41 +0200
Subject: [PATCH 59/63] Rename algorithm header

---
 include/graphblas/algorithms/triangle_count.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/graphblas/algorithms/triangle_count.hpp b/include/graphblas/algorithms/triangle_count.hpp
index 98bb1eec3..950023f72 100644
--- a/include/graphblas/algorithms/triangle_count.hpp
+++ b/include/graphblas/algorithms/triangle_count.hpp
@@ -18,14 +18,14 @@
 /**
  * @file
  *
- * Implements the triangle counting and triangle enumeration algorithms.
+ * Implements the triangle counting algorithm, using different methods.
  *
  * @author B. Lozes
  * @date: May 10th, 2023
  */
 
-#ifndef _H_GRB_TRIANGLE_ENUMERATION
-#define _H_GRB_TRIANGLE_ENUMERATION
+#ifndef _H_GRB_TRIANGLE_COUNT
+#define _H_GRB_TRIANGLE_COUNT
 
 #include <map>
 #include <numeric>
@@ -288,4 +288,4 @@ namespace grb {
 
 } // namespace grb
 
-#endif // _H_GRB_TRIANGLE_ENUMERATION
+#endif // _H_GRB_TRIANGLE_COUNT

From 5064f1b94ca99285831e4633ce1517c8ec35120c Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Tue, 11 Jul 2023 18:09:18 +0200
Subject: [PATCH 60/63] Cleaning

---
 .../graphblas/algorithms/triangle_count.hpp   | 136 ++++++------------
 tests/smoke/triangle_count.cpp                |   4 +-
 2 files changed, 50 insertions(+), 90 deletions(-)

diff --git a/include/graphblas/algorithms/triangle_count.hpp b/include/graphblas/algorithms/triangle_count.hpp
index 950023f72..43632d05d 100644
--- a/include/graphblas/algorithms/triangle_count.hpp
+++ b/include/graphblas/algorithms/triangle_count.hpp
@@ -20,7 +20,7 @@
  *
  * Implements the triangle counting algorithm, using different methods.
  *
- * @author B. Lozes
+ * @author Benjamin Lozes
  * @date: May 10th, 2023
  */
 
@@ -35,59 +35,17 @@
 
 #include <graphblas.hpp>
 
-constexpr bool Debug = false;
-
 namespace grb {
 
 	namespace algorithms {
 
-		namespace utils {
-
-			template< class Iterator >
-			void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) {
-				std::cout << "Matrix \"" << name << "\" (" << rows << "x" << cols << "):" << std::endl << "[" << std::endl;
-				if( rows > 100 || cols > 100 ) {
-					os << "   Matrix too large to print" << std::endl;
-				} else {
-					// os.precision( 3 );
-					for( size_t y = 0; y < rows; y++ ) {
-						os << std::string( 3, ' ' );
-						for( size_t x = 0; x < cols; x++ ) {
-							auto nnz_val = std::find_if( begin, end, [ y, x ]( const typename std::iterator_traits< Iterator >::value_type & a ) {
-								return a.first.first == y && a.first.second == x;
-							} );
-							if( nnz_val != end )
-								os << std::fixed << ( *nnz_val ).second;
-							else
-								os << '_';
-							os << " ";
-						}
-						os << std::endl;
-					}
-				}
-				os << "]" << std::endl;
-			}
-
-			template< bool debug, typename D >
-			void printSparseMatrix( const grb::Matrix< D > & mat, const std::string & name = "", std::ostream & os = std::cout ) {
-				if( ! debug )
-					return;
-				grb::wait( mat );
-				printSparseMatrixIterator( grb::nrows( mat ), grb::ncols( mat ), mat.cbegin(), mat.cend(), name, os );
-			}
-
-			template< bool debug >
-			void printf( const std::string & msg, std::ostream & os = std::cout ) {
-				if( ! debug )
-					return;
-				os << msg;
-			}
-		} // namespace utils
-
 		enum class TriangleCountAlgorithm { Burkhardt, Cohen, Sandia_TT };
 
-		std::map< TriangleCountAlgorithm, std::string > TriangleCountAlgorithmNames = { { TriangleCountAlgorithm::Burkhardt, "Burkhardt" }, { TriangleCountAlgorithm::Cohen, "Cohen" },
-			{ TriangleCountAlgorithm::Sandia_TT, "Sandia_TT" } };
+		std::map< TriangleCountAlgorithm, std::string > TriangleCountAlgorithmNames = {
+			{ TriangleCountAlgorithm::Burkhardt, "Burkhardt" },
+			{ TriangleCountAlgorithm::Cohen, "Cohen" },
+			{ TriangleCountAlgorithm::Sandia_TT, "Sandia_TT" }
+		};
 
 		template<
 			class Semiring, class MulMonoid, class SumMonoid,
@@ -103,30 +61,27 @@ namespace grb {
 		>
 		RC triangle_count_generic(
 			size_t & count,
-			Matrix< D1, grb::config::default_backend, RIT1, CIT1, NIT1 > & MXM_out,
-			const Matrix< D2, grb::config::default_backend, RIT2, CIT2, NIT2 > & MXM_lhs,
-			const Matrix< D3, grb::config::default_backend, RIT3, CIT3, NIT3 > & MXM_rhs,
-			Matrix< D4, grb::config::default_backend, RIT4, CIT4, NIT4 > & EWA_out,
-			const Matrix< D5, grb::config::default_backend, RIT5, CIT5, NIT5 > & EWA_rhs,
+			Matrix< D1, config::default_backend, RIT1, CIT1, NIT1 > & MXM_out,
+			const Matrix< D2, config::default_backend, RIT2, CIT2, NIT2 > & MXM_lhs,
+			const Matrix< D3, config::default_backend, RIT3, CIT3, NIT3 > & MXM_rhs,
+			Matrix< D4, config::default_backend, RIT4, CIT4, NIT4 > & EWA_out,
+			const Matrix< D5, config::default_backend, RIT5, CIT5, NIT5 > & EWA_rhs,
 			const D6 div_factor,
 			const Semiring mxm_semiring = Semiring(),
 			const MulMonoid ewiseapply_monoid = MulMonoid(),
-			const SumMonoid sumreduce_monoid = SumMonoid() ) {
-			RC rc = RC::SUCCESS;
+			const SumMonoid sumreduce_monoid = SumMonoid()
+		) {
+			if( ( &MXM_out == &MXM_lhs ) || ( &MXM_out == &MXM_rhs ) ) {
+				return ILLEGAL;
+			}
 
-			rc = ( &MXM_out == &MXM_lhs ) ? RC::ILLEGAL : rc;
-			rc = ( &MXM_out == &MXM_rhs ) ? RC::ILLEGAL : rc;
+			RC rc = SUCCESS;
 
 			// Compute MXM_out = Mlhs * Mrhs
-			utils::printSparseMatrix< Debug >( MXM_lhs, "MXM_lhs" );
-			utils::printSparseMatrix< Debug >( MXM_rhs, "MXM_rhs" );
 			rc = rc ? rc : mxm< descr_mxm >( MXM_out, MXM_lhs, MXM_rhs, mxm_semiring, Phase::RESIZE );
 			rc = rc ? rc : mxm< descr_mxm >( MXM_out, MXM_lhs, MXM_rhs, mxm_semiring, Phase::EXECUTE );
-			utils::printSparseMatrix< Debug >( MXM_out, "MXM_out = mxm( MXM_lhs, MXM_rhs )" );
 
 			// Compute MXM_out .*= EWA_rhs
-			utils::printSparseMatrix< Debug >( EWA_rhs, "EWA_rhs" );
-
 			// FIXME: Replace by a foldl( Matrix[in,out], Matrix[in], Monoid ) - not implemented yet
 			// Will then become:
 			// rc = rc ? rc : eWiseApply< descr_ewa >( MXM_out, MXM_out, EWA_rhs, ewiseapply_monoid, Phase::RESIZE );
@@ -134,16 +89,13 @@ namespace grb {
 			// Instead of:
 			rc = rc ? rc : eWiseApply< descr_ewa >( EWA_out, MXM_out, EWA_rhs, ewiseapply_monoid, Phase::RESIZE );
 			rc = rc ? rc : eWiseApply< descr_ewa >( EWA_out, MXM_out, EWA_rhs, ewiseapply_monoid, Phase::EXECUTE );
-			utils::printSparseMatrix< Debug >( EWA_out, "EWA_out = ewiseapply( MXM_out, EWA_rhs )" );
 
-			// Compute a sum reduction over <EWA_out> in <count>
+			// Compute a sum reduction over <EWA_out> into <count>
 			count = static_cast< size_t >( 0 );
 			rc = rc ? rc : foldl< descr_reduce >( count, EWA_out, sumreduce_monoid );
-			utils::printf< Debug >( "count = foldl(EWA_out) = " + std::to_string( count ) + "\n" );
 
 			// Apply the div_factor to the reduction result
 			count /= div_factor;
-			utils::printf< Debug >( "count = count / div_factor = " + std::to_string( count ) + "\n" );
 
 			return rc;
 		}
@@ -187,17 +139,23 @@ namespace grb {
 			typename D2, typename RIT2, typename CIT2, typename NIT2,
 			typename D3, typename RIT3, typename CIT3, typename NIT3,
 			typename D4, typename RIT4, typename CIT4, typename NIT4,
-			class Semiring = grb::Semiring< operators::add< D1 >, operators::mul< D1 >, identities::zero, identities::one >,
-			class MulMonoid = grb::Monoid< grb::operators::mul< D1 >, identities::one >,
-			class SumMonoid = grb::Monoid< operators::add< size_t, D1, size_t >, identities::zero > >
+			class Semiring = Semiring< operators::add< D1 >,
+									   operators::mul< D1 >,
+									   identities::zero,
+									   identities::one >,
+			class MulMonoid = Monoid< operators::mul< D1 >, 
+									  identities::one >,
+			class SumMonoid = Monoid< operators::add< size_t, D1, size_t >, 
+									  identities::zero > 
+		>
 		RC triangle_count(
 			const TriangleCountAlgorithm algo,
 			size_t & count,
-			const Matrix< D1, grb::config::default_backend, RIT1, CIT1, NIT1 > & A,
-			Matrix< D2, grb::config::default_backend, RIT2, CIT2, NIT2 > & MXM_out,
-			Matrix< D3, grb::config::default_backend, RIT3, CIT3, NIT3 > & EWA_out,
-			Matrix< D4, grb::config::default_backend, RIT4, CIT4, NIT4 > & L = { 0, 0 },
-			Matrix< D4, grb::config::default_backend, RIT4, CIT4, NIT4 > & U = { 0, 0 }
+			const Matrix< D1, config::default_backend, RIT1, CIT1, NIT1 > & A,
+			Matrix< D2, config::default_backend, RIT2, CIT2, NIT2 > & MXM_out,
+			Matrix< D3, config::default_backend, RIT3, CIT3, NIT3 > & EWA_out,
+			Matrix< D4, config::default_backend, RIT4, CIT4, NIT4 > & L = { 0, 0 },
+			Matrix< D4, config::default_backend, RIT4, CIT4, NIT4 > & U = { 0, 0 }
 		) {
 			// Static assertions
 			static_assert( std::is_integral< D1 >::value, "Type D1 must be integral" );
@@ -205,39 +163,39 @@ namespace grb {
 			// Sanity checks
 			if( nrows( A ) != ncols( A ) ) {
 				std::cerr << "Matrix A must be square" << std::endl;
-				return RC::MISMATCH;
+				return MISMATCH;
 			}
 			if( ncols( L ) != nrows( L ) ) {
 				std::cerr << "Matrix L must be square" << std::endl;
-				return RC::MISMATCH;
+				return MISMATCH;
 			}
 			if( nrows( A ) != ncols( L ) ) {
 				std::cerr << "Matrices A and L must have the same dimensions" << std::endl;
-				return RC::MISMATCH;
+				return MISMATCH;
 			}
 			if( ncols( U ) != nrows( U ) ) {
 				std::cerr << "Matrix U must be square" << std::endl;
-				return RC::MISMATCH;
+				return MISMATCH;
 			}
 			if( nrows( A ) != ncols( U ) ) {
 				std::cerr << "Matrices A and U must have the same dimensions" << std::endl;
-				return RC::MISMATCH;
+				return MISMATCH;
 			}
 			if( ncols( MXM_out ) != nrows( MXM_out ) ) {
 				std::cerr << "Matrix MXM_out must be square" << std::endl;
-				return RC::MISMATCH;
+				return MISMATCH;
 			}
 			if( nrows( A ) != ncols( MXM_out ) ) {
 				std::cerr << "Matrices A and MXM_out must have the same dimensions" << std::endl;
-				return RC::MISMATCH;
+				return MISMATCH;
 			}
 			if( ncols( EWA_out ) != nrows( EWA_out ) ) {
 				std::cerr << "Matrix EWA_out must be square" << std::endl;
-				return RC::MISMATCH;
+				return MISMATCH;
 			}
 			if( nrows( A ) != ncols( EWA_out ) ) {
 				std::cerr << "Matrices A and EWA_out must have the same dimensions" << std::endl;
-				return RC::MISMATCH;
+				return MISMATCH;
 			}
 
 			// Dispatch to the appropriate algorithm
@@ -252,11 +210,11 @@ namespace grb {
 				case TriangleCountAlgorithm::Cohen: {
 					if( nrows( L ) == 0 || ncols( L ) == 0 ) {
 						std::cerr << "Matrix L must be provided for the Cohen algorithm" << std::endl;
-						return RC::MISMATCH;
+						return MISMATCH;
 					}
 					if( nrows( U ) == 0 || ncols( U ) == 0 ) {
 						std::cerr << "Matrix U must be provided for the Cohen algorithm" << std::endl;
-						return RC::MISMATCH;
+						return MISMATCH;
 					}
 
 					return triangle_count_generic<
@@ -267,10 +225,10 @@ namespace grb {
 				case TriangleCountAlgorithm::Sandia_TT: {
 					if( ( nrows( U ) == 0 || ncols( U ) == 0 ) && ( nrows( L ) == 0 || ncols( L ) == 0 ) ) {
 						std::cerr << "Matrix L or U must be provided for the Sandia_TT algorithm" << std::endl;
-						return RC::MISMATCH;
+						return MISMATCH;
 					}
 
-					const Matrix< D4, grb::config::default_backend, RIT4, CIT4, NIT4 > & T = ( nrows( U ) == 0 || ncols( U ) == 0 ) ? L : U;
+					const Matrix< D4, config::default_backend, RIT4, CIT4, NIT4 > & T = ( nrows( U ) == 0 || ncols( U ) == 0 ) ? L : U;
 					return triangle_count_generic<
 						Semiring, MulMonoid, SumMonoid
 					>( count, MXM_out, T, T, EWA_out, T, 1UL );
@@ -278,10 +236,10 @@ namespace grb {
 
 				default:
 					std::cerr << "Unknown TriangleCountAlgorithm enum value" << std::endl;
-					return RC::ILLEGAL;
+					return ILLEGAL;
 			}
 
-			return RC::SUCCESS;
+			return SUCCESS;
 		}
 
 	} // namespace algorithms
diff --git a/tests/smoke/triangle_count.cpp b/tests/smoke/triangle_count.cpp
index 942bbaf71..c1b25924d 100644
--- a/tests/smoke/triangle_count.cpp
+++ b/tests/smoke/triangle_count.cpp
@@ -139,7 +139,9 @@ void grbProgram( const input & data_in, output & out ) {
 			cols.push_back( p.second );
 		}
 		std::vector< IntegerType > values( rows.size(), static_cast< IntegerType >( 1 ) );
-		buildMatrixUnique( A, rows.data(), cols.data(), values.data(), values.size(), IOMode::SEQUENTIAL );
+		assert( SUCCESS == 
+			buildMatrixUnique( A, rows.data(), cols.data(), values.data(), values.size(), IOMode::SEQUENTIAL )
+		);
 	}
 	out.times.io = timer.time();
 

From 331158eb80e725b09051634837a9248ebd14d7db Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Fri, 9 Feb 2024 12:13:31 +0100
Subject: [PATCH 61/63] Fix capitalization of include file names

---
 .../graphblas/algorithms/triangle_count.hpp    |  8 ++++----
 tests/smoke/triangle_count.cpp                 | 18 +++++++++---------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/include/graphblas/algorithms/triangle_count.hpp b/include/graphblas/algorithms/triangle_count.hpp
index 43632d05d..6cb9f2ca7 100644
--- a/include/graphblas/algorithms/triangle_count.hpp
+++ b/include/graphblas/algorithms/triangle_count.hpp
@@ -31,7 +31,7 @@
 #include <numeric>
 #include <vector>
 
-#include <graphblas/utils/iterators/NonzeroIterator.hpp>
+#include <graphblas/utils/iterators/nonzeroIterator.hpp>
 
 #include <graphblas.hpp>
 
@@ -143,10 +143,10 @@ namespace grb {
 									   operators::mul< D1 >,
 									   identities::zero,
 									   identities::one >,
-			class MulMonoid = Monoid< operators::mul< D1 >, 
+			class MulMonoid = Monoid< operators::mul< D1 >,
 									  identities::one >,
-			class SumMonoid = Monoid< operators::add< size_t, D1, size_t >, 
-									  identities::zero > 
+			class SumMonoid = Monoid< operators::add< size_t, D1, size_t >,
+									  identities::zero >
 		>
 		RC triangle_count(
 			const TriangleCountAlgorithm algo,
diff --git a/tests/smoke/triangle_count.cpp b/tests/smoke/triangle_count.cpp
index c1b25924d..f7c8892db 100644
--- a/tests/smoke/triangle_count.cpp
+++ b/tests/smoke/triangle_count.cpp
@@ -25,7 +25,7 @@
 #include <inttypes.h>
 
 #include <graphblas/algorithms/triangle_count.hpp>
-#include <graphblas/utils/Timer.hpp>
+#include <graphblas/utils/timer.hpp>
 #include <graphblas/utils/parser.hpp>
 
 #include <graphblas.hpp>
@@ -81,12 +81,12 @@ void grbProgram( const input & data_in, output & out ) {
 
 	timer.reset();
 	// Create a local parser
-	grb::utils::MatrixFileReader< 
+	grb::utils::MatrixFileReader<
 		void,
 		std::conditional<
 			( sizeof( grb::config::RowIndexType ) > sizeof( grb::config::ColIndexType ) ),
 			grb::config::RowIndexType,
-			grb::config::ColIndexType 
+			grb::config::ColIndexType
 		>::type
 	> parser( data_in.filename, data_in.direct );
 	assert( parser.m() == parser.n() );
@@ -134,12 +134,12 @@ void grbProgram( const input & data_in, output & out ) {
 			if( p.first == p.second ) {
 				continue;
 			}
-			
+
 			rows.push_back( p.first );
 			cols.push_back( p.second );
 		}
 		std::vector< IntegerType > values( rows.size(), static_cast< IntegerType >( 1 ) );
-		assert( SUCCESS == 
+		assert( SUCCESS ==
 			buildMatrixUnique( A, rows.data(), cols.data(), values.data(), values.size(), IOMode::SEQUENTIAL )
 		);
 	}
@@ -183,9 +183,9 @@ int main( int argc, char ** argv ) {
 	if( !parse_arguments( argc, argv, in, err ) ) {
 		return err;
 	}
-	
+
 	std::cout << "Executable called with parameters " << in.filename << ", "
-		<< "inner repititions = " << in.inner_rep << ", and outer reptitions = " 
+		<< "inner repititions = " << in.inner_rep << ", and outer reptitions = "
 		<< in.outer_rep	<< std::endl;
 
 	// Run the test for all algorithms
@@ -248,8 +248,8 @@ int main( int argc, char ** argv ) {
 bool parse_arguments( int argc, char ** argv, input & in, int& err ) {
 	// Check if we are testing on a file
 	if( argc < 4 || argc > 6 ) {
-		std::cerr << "Usages: \n\t" 
-			<< argv[ 0 ] << " <graph_filepath> <direct/indirect> <expected_triangle_count> (inner iterations) (outer iterations)" 
+		std::cerr << "Usages: \n\t"
+			<< argv[ 0 ] << " <graph_filepath> <direct/indirect> <expected_triangle_count> (inner iterations) (outer iterations)"
 			<< std::endl;
 		err = 1;
 		return false;

From 194a303374558819541192f09a51fd6d2881341c Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Fri, 9 Feb 2024 14:59:34 +0100
Subject: [PATCH 62/63] Reveret to a clean state

---
 .../algorithms/conjugate_gradient.hpp         |   24 +-
 include/graphblas/backends.hpp                |   39 -
 include/graphblas/base/blas3.hpp              |  415 -----
 include/graphblas/base/vector.hpp             |   22 -
 include/graphblas/blas0.hpp                   |   32 -
 include/graphblas/bsp1d/blas3.hpp             |  272 ---
 include/graphblas/bsp1d/vector.hpp            |   61 -
 include/graphblas/hyperdags/blas3.hpp         |  337 ----
 include/graphblas/hyperdags/hyperdags.hpp     |   23 +-
 include/graphblas/hyperdags/io.hpp            |    3 +-
 include/graphblas/hyperdags/vector.hpp        |    9 -
 include/graphblas/nonblocking/blas1.hpp       |    4 +-
 include/graphblas/nonblocking/blas3.hpp       |  112 --
 include/graphblas/nonblocking/coordinates.hpp |   33 +-
 include/graphblas/nonblocking/matrix.hpp      |   26 -
 include/graphblas/nonblocking/vector.hpp      |   53 +-
 include/graphblas/reference/blas2.hpp         |   48 +-
 include/graphblas/reference/blas3.hpp         |  680 +-------
 include/graphblas/reference/vector.hpp        |   24 -
 src/graphblas/hyperdags/hyperdags.cpp         |   18 -
 src/transition/CMakeLists.txt                 |   83 +-
 src/transition/sparseblas.cpp                 | 1456 +++++++++++------
 tests/unit/fold_matrix_to_scalar.cpp          |  628 -------
 tests/unit/tril.cpp                           |  165 --
 tests/unit/triu.cpp                           |  165 --
 tests/unit/unittests.sh                       |   32 -
 26 files changed, 1013 insertions(+), 3751 deletions(-)
 delete mode 100644 tests/unit/fold_matrix_to_scalar.cpp
 delete mode 100644 tests/unit/tril.cpp
 delete mode 100644 tests/unit/triu.cpp

diff --git a/include/graphblas/algorithms/conjugate_gradient.hpp b/include/graphblas/algorithms/conjugate_gradient.hpp
index 27616ec77..b87cfa080 100644
--- a/include/graphblas/algorithms/conjugate_gradient.hpp
+++ b/include/graphblas/algorithms/conjugate_gradient.hpp
@@ -27,7 +27,7 @@
 #define _H_GRB_ALGORITHMS_CONJUGATE_GRADIENT
 
 #include <cstdio>
-#include <cmath>
+#include <complex>
 
 #include <graphblas.hpp>
 #include <graphblas/utils/iscomplex.hpp>
@@ -144,8 +144,7 @@ namespace grb {
 		 * performance semantics, with the exception of getters such as #grb::nnz, are
 		 * specific to the backend selected during compilation.
 		 */
-		template<
-			Descriptor descr = descriptors::no_operation,
+		template< Descriptor descr = descriptors::no_operation,
 			typename IOType,
 			typename ResidualType,
 			typename NonzeroType,
@@ -155,20 +154,19 @@ namespace grb {
 				grb::identities::zero, grb::identities::one
 			>,
 			class Minus = operators::subtract< IOType >,
-			class Divide = operators::divide< IOType >,
-			typename RSI, typename NZI, Backend backend
+			class Divide = operators::divide< IOType >
 		>
 		grb::RC conjugate_gradient(
-			grb::Vector< IOType, backend > &x,
-			const grb::Matrix< NonzeroType, backend, RSI, RSI, NZI > &A,
-			const grb::Vector< InputType, backend > &b,
+			grb::Vector< IOType > &x,
+			const grb::Matrix< NonzeroType > &A,
+			const grb::Vector< InputType > &b,
 			const size_t max_iterations,
 			ResidualType tol,
 			size_t &iterations,
 			ResidualType &residual,
-			grb::Vector< IOType, backend > &r,
-			grb::Vector< IOType, backend > &u,
-			grb::Vector< IOType, backend > &temp,
+			grb::Vector< IOType > &r,
+			grb::Vector< IOType > &u,
+			grb::Vector< IOType > &temp,
 			const Ring &ring = Ring(),
 			const Minus &minus = Minus(),
 			const Divide &divide = Divide()
@@ -326,7 +324,7 @@ namespace grb {
 			assert( ret == SUCCESS );
 
 			if( ret == SUCCESS ) {
-				tol *= std::sqrt( grb::utils::is_complex< IOType >::modulus( bnorm ) );
+				tol *= sqrt( grb::utils::is_complex< IOType >::modulus( bnorm ) );
 			}
 
 			size_t iter = 0;
@@ -419,7 +417,7 @@ namespace grb {
 
 			// return correct error code
 			if( ret == SUCCESS ) {
-				if( std::sqrt( residual ) >= tol ) {
+				if( sqrt( residual ) >= tol ) {
 					// did not converge within iterations
 					return FAILED;
 				}
diff --git a/include/graphblas/backends.hpp b/include/graphblas/backends.hpp
index 653348112..3fd2f0ec1 100644
--- a/include/graphblas/backends.hpp
+++ b/include/graphblas/backends.hpp
@@ -29,9 +29,6 @@
 #ifndef _H_GRB_BACKENDS
 #define _H_GRB_BACKENDS
 
-#include <string>
-#include <iostream>
-
 
 namespace grb {
 
@@ -221,42 +218,6 @@ namespace grb {
 
 	};
 
-	/**
-	 * Converts a backend identifier to a human-readable string.
-	 *
-	 * @param[in] backend The backend whose string to return.
-	 *
-	 * @return The name of the given \a backend as a C++ string.
-	 */
-	static inline std::string toString( const enum grb::Backend backend ) {
-		switch( backend ) {
-			case grb::Backend::reference:     return "reference";
-			case grb::Backend::reference_omp: return "reference_omp";
-			case grb::Backend::hyperdags:     return "hyperdags";
-			case grb::Backend::nonblocking:   return "nonblocking";
-			case grb::Backend::shmem1D:       return "shmem1D";
-			case grb::Backend::NUMA1D:        return "NUMA1D";
-			case grb::Backend::GENERIC_BSP:   return "GENERIC_BSP";
-			case grb::Backend::BSP1D:         return "BSP1D";
-			case grb::Backend::doublyBSP1D:   return "doublyBSP1D";
-			case grb::Backend::BSP2D:         return "BSP2D";
-			case grb::Backend::autoBSP:       return "autoBSP";
-			case grb::Backend::optBSP:        return "optBSP";
-			case grb::Backend::hybrid:        return "hybrid";
-			case grb::Backend::hybridSmall:   return "hybridSmall";
-			case grb::Backend::hybridMid:     return "hybridMid";
-			case grb::Backend::hybridLarge:   return "hybridLarge";
-			case grb::Backend::minFootprint:  return "minFootprint";
-			case grb::Backend::banshee:       return "banshee";
-			case grb::Backend::banshee_ssr:   return "banshee_ssr";
-			default:
-				const int backend_id = static_cast< int >( backend );
-				std::cerr << "Warning, std::string( const grb::Backend ): unknown backend "
-					<< backend_id << " encountered, please submit a bug report.\n";
-				return "unknown_backend(id=" + std::to_string( backend_id ) + ")";
-		}
-	}
-
 } // namespace grb
 
 #endif
diff --git a/include/graphblas/base/blas3.hpp b/include/graphblas/base/blas3.hpp
index 841ba981a..425f7bc7a 100644
--- a/include/graphblas/base/blas3.hpp
+++ b/include/graphblas/base/blas3.hpp
@@ -442,421 +442,6 @@ namespace grb {
 		return ret == SUCCESS ? UNSUPPORTED : ret;
 	}
 
-
-	/**
-	 * Reduces, or \em folds, a matrix into a scalar.
-	 * Right-to-left masked variant.
-	 *
-	 * Reduction takes place according a monoid \f$ (\oplus,1) \f$, where
-	 * \f$ \oplus:\ D_1 \times D_2 \to D_3 \f$ with associated identities
-	 * \f$ 1_k in D_k \f$. Usually, \f$ D_k \subseteq D_3, 1 \leq k < 3 \f$,
-	 * though other more exotic structures may be envisioned (and used).
-	 *
-	 * Let \f$ x_0 = 1 \f$ and let
-	 * \f$ x_{i+1} = \begin{cases}
-	 *   x_i \oplus y_i\text{ if }y_i\text{ is nonzero and }m_i\text{ evaluates true}
-	 *   x_i\text{ otherwise}
-	 * \end{cases},\f$
-	 * for all \f$ i \in \{ 0, 1, \ldots, n-1 \} \f$.
-	 *
-	 * \note Per this definition, the folding happens in a right-to-left direction.
-	 *       If another direction is wanted, which may have use in cases where
-	 *       \f$ D_1 \f$ differs from \f$ D_2 \f$, then either a monoid with those
-	 *       operator domains switched may be supplied, or #grb::foldr may be used
-	 *       instead.
-	 *
-	 * Note that the operator \f$ \oplus \f$ must be associative since it is part
-	 * of a monoid. This algebraic property is exploited when parallelising the
-	 * requested operation. The identity is required when parallelising over
-	 * multiple user processes.
-	 *
-	 * \warning In so doing, the order of the evaluation of the reduction operation
-	 *          should not be expected to be a serial, right-to-left, evaluation of
-	 *          the computation chain.
-	 *
-	 * @tparam descr     The descriptor to be used (descriptors::no_operation if
-	 *                   left unspecified).
-	 * @tparam Monoid    The monoid to use for reduction.
-	 * @tparam InputType The type of the elements in the supplied ALP/GraphBLAS
-	 *                   matrix \a A.
-	 * @tparam IOType    The type of the output scalar \a x.
-	 * @tparam MaskType  The type of the elements in the supplied ALP/GraphBLAS
-	 *                   matrix \a mask.
-	 *
-	 * @param[in, out] x   The result of the reduction.
-	 * 					   Prior value will be considered.
-	 * @param[in]      A   Any ALP/GraphBLAS matrix, will be reduced into \a x.
-	 * @param[in]   mask   Any ALP/GraphBLAS matrix, will mask the matrix \a A.
-	 * 					   Dimensions must match those of \a A.
-	 * @param[in] monoid   The monoid under which to perform this reduction.
-	 * 					   An identity element must be provided when using
-	 * 					   threads in order to perform the local reductions.
-	 *
-	 * @return grb::SUCCESS  When the call completed successfully.
-	 * @return grb::MISMATCH If a \a mask was not empty and does not have size
-	 *                       equal to \a y.
-	 *
-	 * @see grb::foldl provides similar in-place functionality, but folds in a
-	 * 	left-to-right direction.
-	 * @see The same primitive but unmasked is also provided.
-	 *
-	 * \parblock
-	 * \par Valid descriptors
-	 * - descriptors::no_operation: the default descriptor.
-	 * - descriptors::no_casting: the first domain of
-	 * 	 	\a monoid must match \a InputType, the second domain of \a op
-	 * 		match \a IOType, the third domain must match \a IOType.
-	 * - descriptors::transpose_left: A^T will be considered instead 
-	 * 	 	of \a A.
-	 * - descriptors::transpose_right: mask^T will be considered 
-	 * 	 	instead of \a mask.
-	 * - descriptors::invert_mask: Not supported yet.
-	 *
-	 * \note Invalid descriptors will be ignored.
-	 *
-	 * \endparblock
-	 *
-	 * \par Performance semantics
-	 * Each backend must define performance semantics for this primitive.
-	 *
-	 * @see perfSemantics
-	 */
-	template<
-		Descriptor descr = descriptors::no_operation,
-		class Monoid,
-		typename InputType, typename IOType, typename MaskType,
-		typename RIT_A, typename CIT_A, typename NIT_A,
-		typename RIT_M, typename CIT_M, typename NIT_M,
-		Backend backend
-	>
-	RC foldr(
-		IOType &x,
-		const Matrix< InputType, backend, RIT_A, CIT_A, NIT_A > &A,
-		const Matrix< MaskType, backend, RIT_M, CIT_M, NIT_M > &mask,
-		const Monoid &monoid = Monoid(),
-		const typename std::enable_if< !grb::is_object< IOType >::value &&
-			!grb::is_object< InputType >::value &&
-			!grb::is_object< MaskType >::value &&
-			grb::is_monoid< Monoid >::value, void
-		>::type * const = nullptr
-	) {
-#ifndef NDEBUG
-		const bool should_not_call_base_scalar_masked_matrix_foldr = false;
-		assert( should_not_call_base_scalar_masked_matrix_foldr );
-#endif
-		(void) A;
-		(void) x;
-		(void) mask;
-		(void) monoid;
-		return UNSUPPORTED;
-	}
-
-	/**
-	 * Reduces, or \em folds, a matrix into a scalar. 
-	 * Right-to-left unmasked variant.
-	 * 
-	 * Please see the masked grb::foldr variant for a full description.
-	 * 
-	 * \parblock
-	 * 
-	 * \par Valid descriptors specific to this variant
-	 * - descriptors::transpose_matrix: A^T will be considered instead 
-	 * 	 	of \a A.
-	 * 
-	 * \note See other valid descriptors in the masked variant.
-	 * \note Invalid descriptors will be ignored.
-	 * 
-	 * \endparblock
-	 */
-	template<
-		Descriptor descr = descriptors::no_operation,
-		class Monoid,
-		typename InputType, typename IOType,
-		typename RIT, typename CIT, typename NIT,
-		Backend backend
-	>
-	RC foldr(
-		IOType &x,
-		const Matrix< InputType, backend, RIT, CIT, NIT > &A,
-		const Monoid &monoid,
-		const typename std::enable_if< !grb::is_object< IOType >::value &&
-			!grb::is_object< InputType >::value &&
-			grb::is_monoid< Monoid >::value, void
-		>::type * const = nullptr
-	) {
-#ifndef NDEBUG
-		const bool should_not_call_base_scalar_unmasked_matrix_foldr = false;
-		assert( should_not_call_base_scalar_unmasked_matrix_foldr );
-#endif
-		(void) A;
-		(void) x;
-		(void) monoid;
-		return UNSUPPORTED;
-	}
-
-
-	/**
-	 * Reduces, or \em folds, a matrix into a scalar. 
-	 * Left-to-right masked variant.
-	 * 
-	 * Please see the masked grb::foldr variant for a full description.
-	 */
-	template<
-		Descriptor descr = descriptors::no_operation,
-		class Monoid,
-		typename InputType, typename IOType, typename MaskType,
-		typename RIT_A, typename CIT_A, typename NIT_A,
-		typename RIT_M, typename CIT_M, typename NIT_M,
-		Backend backend
-	>
-	RC foldl(
-		IOType &x,
-		const Matrix< InputType, backend, RIT_A, CIT_A, NIT_A > &A,
-		const Matrix< MaskType, backend, RIT_M, CIT_M, NIT_M > &mask,
-		const Monoid &monoid = Monoid(),
-		const typename std::enable_if< !grb::is_object< IOType >::value &&
-			!grb::is_object< InputType >::value &&
-			!grb::is_object< MaskType >::value &&
-			grb::is_monoid< Monoid >::value, void
-		>::type * const = nullptr
-	) {
-#ifndef NDEBUG
-		const bool should_not_call_base_scalar_matrix_foldl = false;
-		assert( should_not_call_base_scalar_matrix_foldl );
-#endif
-		(void) A;
-		(void) x;
-		(void) mask;
-		(void) monoid;
-		return UNSUPPORTED;
-	}
-
-	/**
-	 * Reduces, or \em folds, a matrix into a scalar. 
-	 * Left-to-right unmasked variant.
-	 * 
-	 * Please see the masked grb::foldr variant for a full description.
-	 * 
-	 * \parblock
-	 * 
-	 * \par Valid descriptors specific to this variant
-	 * - descriptors::transpose_matrix: A^T will be considered instead 
-	 * 	 	of \a A.
-	 * 
-	 * \note See other valid descriptors in the masked variant.
-	 * \note Invalid descriptors will be ignored.
-	 * 
-	 * \endparblock
-	 */
-	template<
-		Descriptor descr = descriptors::no_operation,
-		class Monoid,
-		typename InputType, typename IOType,
-		typename RIT, typename CIT, typename NIT,
-		Backend backend
-	>
-	RC foldl(
-		IOType &x,
-		const Matrix< InputType, backend, RIT, CIT, NIT > &A,
-		const Monoid &monoid,
-		const typename std::enable_if< 
-			!grb::is_object< IOType >::value &&
-			!grb::is_object< InputType >::value &&
-			grb::is_monoid< Monoid >::value, void
-		>::type * const = nullptr
-	) {
-#ifndef NDEBUG
-		const bool should_not_call_base_scalar_unmasked_matrix_foldl = false;
-		assert( should_not_call_base_scalar_unmasked_matrix_foldl );
-#endif
-		(void) A;
-		(void) x;
-		(void) monoid;
-		return UNSUPPORTED;
-	}
-
-	/**
-	 * Return the lower triangular portion of a matrix, strictly below the k-th 
-	 * diagonal (excluded).
-	 *
-	 * @tparam descr      The descriptor to be used (descriptors::no_operation
-	 * 					  if left unspecified).
-	 * @tparam InputType  The type of the elements in the supplied ALP/GraphBLAS
-	 *                    matrix \a A.
-	 * @tparam OutputType The type of the elements in the supplied ALP/GraphBLAS
-	 *                    matrix \a L.
-	 *
-	 * @param[out] L       The lower triangular portion of \a A, strictly below
-	 * 					   the k-th diagonal.
-	 * @param[in]  A       Any ALP/GraphBLAS matrix.
-	 * @param[in]  k       The diagonal above which to zero out \a A.
-	 * @param[in]  phase   The #grb::Phase in which the primitive is to proceed.
-	 *
-	 * @return grb::SUCCESS  When the call completed successfully.
-	 * @return grb::MISMATCH If the dimensions of \a L and \a A do not match.
-	 *
- 	 * \parblock
-	 * \par Allowed descriptors
-	 * - transpose_matrix: Consider A^T instead of A.
-	 * - no_casting: If the types of \a L and \a A differ, the primitive
-	 * 				 will fail.
-	 * \endparblock
-	 */
-	template<
-		Descriptor descr = descriptors::no_operation,
-		typename InputType,
-		typename OutputType,
-		typename RIT_L, typename CIT_L, typename NIT_L,
-		typename RIT_A, typename CIT_A, typename NIT_A,
-		Backend implementation
-	>
-	RC tril(
-		Matrix< OutputType, implementation, RIT_L, CIT_L, NIT_L > & L,
-		const Matrix< InputType, implementation, RIT_A, CIT_A, NIT_A > & A,
-		const long int k,
-		const Phase &phase = Phase::EXECUTE,
-		const typename std::enable_if<
-			!grb::is_object< OutputType >::value &&
-			!grb::is_object< InputType >::value &&
-			std::is_convertible< InputType, OutputType >::value
-		>::type * const = nullptr
-	) {
-		(void) L;
-		(void) A;
-		(void) k;
-		(void) phase;
-#ifdef _DEBUG
-		std::cerr << "Selected backend does not implement grb::tril()\n";
-#endif
-#ifndef NDEBUG
-		const bool selected_backend_does_not_support_tril = false;
-		assert( selected_backend_does_not_support_tril );
-#endif
-		const RC ret = grb::clear( L );
-		return ret == SUCCESS ? UNSUPPORTED : ret;
-	}
-
-	/**
-	 * Return the lower triangular portion of a matrix,
-	 * strictly below main diagonal (excluded).
-	 *
-	 * This primitive is strictly equivalent to calling
-	 * grb::tril( L, A, 0, phase ).
-	 * 
-	 * see grb::tril( L, A, k, phase ) for full description.
-	 */
-	template<
-		Descriptor descr = descriptors::no_operation,
-		typename InputType,
-		typename OutputType,
-		typename RIT_L, typename CIT_L, typename NIT_L,
-		typename RIT_A, typename CIT_A, typename NIT_A,
-		Backend implementation
-	>
-	RC tril(
-		Matrix< OutputType, implementation, RIT_L, CIT_L, NIT_L > & L,
-		const Matrix< InputType, implementation, RIT_A, CIT_A, NIT_A > & A,
-		const Phase &phase = Phase::EXECUTE,
-		const typename std::enable_if<
-			!grb::is_object< OutputType >::value &&
-			!grb::is_object< InputType >::value &&
-			std::is_convertible< InputType, OutputType >::value
-		>::type * const = nullptr
-	) {
-		return tril< descr >( L, A, 0, phase );
-	}
-
-	/**
-	 * Return the upper triangular portion of a matrix, strictly above the k-th 
-	 * diagonal (excluded).
-	 *
-	 * @tparam descr      The descriptor to be used (descriptors::no_operation
-	 * 					  if left unspecified).
-	 * @tparam InputType  The type of the elements in the supplied ALP/GraphBLAS
-	 *                    matrix \a A.
-	 * @tparam OutputType The type of the elements in the supplied ALP/GraphBLAS
-	 *                    matrix \a U.
-	 *
-	 * @param[out] U       The upper triangular portion of \a A, strictly above 
-	 * 					   the k-th diagonal.
-	 * @param[in]  A       Any ALP/GraphBLAS matrix.
-	 * @param[in]  k       The diagonal above which to zero out \a A.
-	 * @param[in]  phase   The #grb::Phase in which the primitive is to proceed.
-	 *
-	 * @return grb::SUCCESS  When the call completed successfully.
-	 * @return grb::MISMATCH If the dimensions of \a U and \a A do not match.
-	 *
- 	 * \parblock
-	 * \par Allowed descriptors
-	 * - transpose_matrix: Consider A^T instead of A.
-	 * - no_casting: If the types of \a T and \a A differ, the primitive
-	 * 				 will fail.
-	 * \endparblock
-	 */
-	template<
-		Descriptor descr = descriptors::no_operation,
-		typename InputType,
-		typename OutputType,
-		typename RIT_U, typename CIT_U, typename NIT_U,
-		typename RIT_A, typename CIT_A, typename NIT_A,
-		Backend implementation
-	>
-	RC triu(
-		Matrix< OutputType, implementation, RIT_U, CIT_U, NIT_U > & U,
-		const Matrix< InputType, implementation, RIT_A, CIT_A, NIT_A > & A,
-		const long int k,
-		const Phase &phase = Phase::EXECUTE,
-		const typename std::enable_if<
-			!grb::is_object< OutputType >::value &&
-			!grb::is_object< InputType >::value &&
-			std::is_convertible< InputType, OutputType >::value
-		>::type * const = nullptr
-	) {
-		(void) U;
-		(void) A;
-		(void) k;
-		(void) phase;
-#ifdef _DEBUG
-		std::cerr << "Selected backend does not implement grb::triu()\n";
-#endif
-#ifndef NDEBUG
-		const bool selected_backend_does_not_support_triu = false;
-		assert( selected_backend_does_not_support_triu );
-#endif
-		const RC ret = grb::clear( U );
-		return ret == SUCCESS ? UNSUPPORTED : ret;
-	}
-
-	/**
-	 * Return the upper triangular portion of a matrix,
-	 * strictly above main diagonal (excluded).
-	 *
-	 * This primitive is strictly equivalent to calling
-	 * grb::triu( U, A, 0, phase ) 
-	 * 
-	 * see grb::triu( U, A, k, phase ) for full description.
-	 */
-	template<
-		Descriptor descr = descriptors::no_operation,
-		typename InputType,
-		typename OutputType,
-		typename RIT_U, typename CIT_U, typename NIT_U,
-		typename RIT_A, typename CIT_A, typename NIT_A,
-		Backend implementation
-	>
-	RC triu(
-		Matrix< OutputType, implementation, RIT_U, CIT_U, NIT_U > & U,
-		const Matrix< InputType, implementation, RIT_A, CIT_A, NIT_A > & A,
-		const Phase &phase = Phase::EXECUTE,
-		const typename std::enable_if<
-			!grb::is_object< OutputType >::value &&
-			!grb::is_object< InputType >::value &&
-			std::is_convertible< InputType, OutputType >::value
-		>::type * const = nullptr
-	) {
-		return triu< descr >( U, A, 0, phase );
-	}
-
 	/**
 	 * @}
 	 */
diff --git a/include/graphblas/base/vector.hpp b/include/graphblas/base/vector.hpp
index 5eb4ad83a..c00ca6e53 100644
--- a/include/graphblas/base/vector.hpp
+++ b/include/graphblas/base/vector.hpp
@@ -236,28 +236,6 @@ namespace grb {
 				(void) n;
 			}
 
-			/**
-			 * Creates a dense ALP/GraphBLAS vector.
-			 *
-			 * This constructor takes an initialiser list of values that will be copied
-			 * into this vector. The size of the vector will be equal to the number of
-			 * elements in the initialiser list.
-			 *
-			 * For backends with more than one user process, the size of \a vals is the
-			 * global vector size, and the contents of \a vals are processed using
-			 * sequential I/O semantics.
-			 *
-			 * @see #grb::IOMode For the difference between sequential and parallel I/O
-			 *                   modes.
-			 *
-			 * \note There is only a difference if there are more than one user process.
-			 *
-			 * @param[in] vals The values to be copied into this vector.
-			 */
-			Vector( const std::initializer_list< D > &vals ) {
-				(void) vals;
-			}
-
 			/**
 			 * Move constructor.
 			 *
diff --git a/include/graphblas/blas0.hpp b/include/graphblas/blas0.hpp
index dad228bbf..751b2cf14 100644
--- a/include/graphblas/blas0.hpp
+++ b/include/graphblas/blas0.hpp
@@ -34,7 +34,6 @@
 #include "graphblas/descriptors.hpp"
 #include "graphblas/rc.hpp"
 #include "graphblas/type_traits.hpp"
-#include "graphblas/identities.hpp"
 
 #define NO_CAST_ASSERT( x, y, z )                                                  \
 	static_assert( x,                                                              \
@@ -605,37 +604,6 @@ namespace grb {
 
 		};
 
-		template< typename MaskType >
-		struct MaskHasValue {
-
-			public:
-				template < Descriptor descr = descriptors::no_operation, typename MaskStruct >
-				MaskHasValue( const MaskStruct& mask_raw, const size_t k ) {
-						bool hasValue = (bool) mask_raw.values[ k ];
-						if (descr & grb::descriptors::invert_mask) {
-							hasValue = !hasValue;
-						}
-						value = hasValue;
-					}
-
-				bool value;
-		};
-
-		template<>
-		struct MaskHasValue< void > {
-
-			public:
-				template < Descriptor descr = descriptors::no_operation, typename MaskStruct >
-				MaskHasValue( const MaskStruct& mask_raw, const size_t k ) :
-				value(not (descr & grb::descriptors::invert_mask)){
-					(void) mask_raw;
-					(void) k;
-				}
-
-				const bool value;
-
-		};
-
 	} // namespace internal
 
 } // namespace grb
diff --git a/include/graphblas/bsp1d/blas3.hpp b/include/graphblas/bsp1d/blas3.hpp
index 13e144f38..386beb164 100644
--- a/include/graphblas/bsp1d/blas3.hpp
+++ b/include/graphblas/bsp1d/blas3.hpp
@@ -205,278 +205,6 @@ namespace grb {
 		return internal::checkGlobalErrorStateOrClear( C, ret );
 	}
 
-	template<
-		Descriptor descr = descriptors::no_operation,
-		class Monoid,
-		typename InputType, typename IOType, typename MaskType,
-		typename RIT_A, typename CIT_A, typename NIT_A,
-		typename RIT_M, typename CIT_M, typename NIT_M
-	>
-	RC foldr(
-		IOType &x,
-		const Matrix< InputType, BSP1D, RIT_A, CIT_A, NIT_A > &A,
-		const Matrix< MaskType, BSP1D, RIT_M, CIT_M, NIT_M > &mask,
-		const Monoid &monoid = Monoid(),
-		const typename std::enable_if< !grb::is_object< IOType >::value &&
-			!grb::is_object< InputType >::value &&
-			!grb::is_object< MaskType >::value &&
-			grb::is_monoid< Monoid >::value, void
-		>::type * const = nullptr
-	) {
-		// static checks
-		static_assert( !std::is_same< InputType, void >::value,
-			"grb::foldr( BSP1D, IOType <- op( InputType, IOType ): "
-			"the operator version of foldr cannot be used if the "
-			"input matrix is a pattern matrix (of type void)"
-		);
-		static_assert( !std::is_same< IOType, void >::value,
-			"grb::foldr( BSP1D, IOType <- op( InputType, IOType ): "
-			"the operator version of foldr cannot be used if the "
-			"result is of type void"
-		);
-		static_assert( (std::is_same< typename Monoid::D1, InputType >::value),
-			"grb::foldr( BSP1D, IOType <- op( InputType, IOType ): "
-			"called with a prefactor input type that does not match the first domain of the given operator"
-		);
-		static_assert( (std::is_same< typename Monoid::D2, IOType >::value),
-			"grb::foldr( BSP1D, IOType <- op( InputType, IOType ): "
-			"called with a postfactor input type that does not match the first domain of the given operator"
-		);
-		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
-			"grb::foldr( BSP1D, IOType <- op( InputType, IOType ): "
-			"called with an output type that does not match the output domain of the given operator"
-		);
-
-#ifdef _DEBUG
-		std::cout << "In grb::foldr( BSP1D, matrix, mask, monoid )\n";
-#endif
-		RC rc = SUCCESS;
-
-		if( grb::nnz( A ) == 0 ) {
-			return rc;
-		}
-
-		// Do local folding
-		IOType local = monoid.template getIdentity< IOType >();
-		rc = foldr< descr >( local, internal::getLocal( A ), internal::getLocal( mask ), monoid );
-
-#ifdef _DEBUG
-		std::cout << "After process-local delegation, local value has become "
-			<< local << ". Entering allreduce..." << std::endl;
-#endif
-
-		// All-reduce using \a op
-		rc = rc ? rc : collectives< BSP1D >::allreduce< descr >( local, monoid.getOperator() );
-
-		// Accumulate end result
-		rc = rc ? rc : foldr( x, local, monoid.getOperator() );
-
-		return SUCCESS;
-	}
-
-	template<
-		Descriptor descr = descriptors::no_operation,
-		class Monoid,
-		typename InputType, typename IOType,
-		typename RIT, typename CIT, typename NIT
-	>
-	RC foldr(
-		IOType &x,
-		const Matrix< InputType, BSP1D, RIT, CIT, NIT > &A,
-		const Monoid &monoid,
-		const typename std::enable_if< !grb::is_object< IOType >::value &&
-			!grb::is_object< InputType >::value &&
-			grb::is_monoid< Monoid >::value, void
-		>::type * const = nullptr
-	) {
-		// static checks
-		static_assert( !std::is_same< InputType, void >::value,
-			"grb::foldr( BSP1D, IOType <- op( IOType, InputType ): "
-			"the operator version of foldr cannot be used if the "
-			"input matrix is a pattern matrix (of type void)"
-		);
-		static_assert( !std::is_same< IOType, void >::value,
-			"grb::foldr( BSP1D, IOType <- op( IOType, InputType ): "
-			"the operator version of foldr cannot be used if the "
-			"result is of type void"
-		);
-		static_assert( (std::is_same< typename Monoid::D1, InputType >::value),
-			"grb::foldr( BSP1D, IOType <- op( IOType, InputType ): "
-			"called with a prefactor input type that does not match the first domain of the given operator"
-		);
-		static_assert( (std::is_same< typename Monoid::D2, IOType >::value),
-			"grb::foldr( BSP1D, IOType <- op( IOType, InputType ): "
-			"called with a postfactor input type that does not match the first domain of the given operator"
-		);
-		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
-			"grb::foldr( BSP1D, IOType <- op( IOType, InputType ): "
-			"called with an output type that does not match the output domain of the given operator"
-		);
-
-#ifdef _DEBUG
-		std::cout << "In grb::foldr( BSP1D, matrix, monoid )\n";
-#endif
-		RC rc = SUCCESS;
-
-		if( grb::nnz( A ) == 0 ) {
-			return rc;
-		}
-
-		// Do local folding
-		IOType local = monoid.template getIdentity< IOType >();
-		rc = foldr< descr >( local, internal::getLocal( A ), monoid );
-
-#ifdef _DEBUG
-		std::cout << "After process-local delegation, local value has become "
-			<< local << ". Entering allreduce..." << std::endl;
-#endif
-
-		// All-reduce using \a op
-		rc = rc ? rc : collectives< BSP1D >::allreduce< descr >( local, monoid.getOperator() );
-
-		// Accumulate end result
-		rc = rc ? rc : foldr( x, local, monoid.getOperator() );
-
-		return SUCCESS;
-	}
-
-	template<
-		Descriptor descr = descriptors::no_operation,
-		class Monoid,
-		typename InputType, typename IOType, typename MaskType,
-		typename RIT_A, typename CIT_A, typename NIT_A,
-		typename RIT_M, typename CIT_M, typename NIT_M
-	>
-	RC foldl(
-		IOType &x,
-		const Matrix< InputType, BSP1D, RIT_A, CIT_A, NIT_A > &A,
-		const Matrix< MaskType, BSP1D, RIT_M, CIT_M, NIT_M > &mask,
-		const Monoid &monoid,
-		const typename std::enable_if<
-			!grb::is_object< IOType >::value &&
-			!grb::is_object< InputType >::value &&
-			!grb::is_object< MaskType >::value &&
-			grb::is_monoid< Monoid >::value, void
-		>::type * const = nullptr
-	) {
-		// static checks
-		static_assert( !std::is_same< InputType, void >::value,
-			"grb::foldl( BSP1D, IOType <- op( IOType, InputType ): "
-			"the operator version of foldl cannot be used if the "
-			"input matrix is a pattern matrix (of type void)"
-		);
-		static_assert( !std::is_same< IOType, void >::value,
-			"grb::foldl( BSP1D, IOType <- op( IOType, InputType ): "
-			"the operator version of foldl cannot be used if the "
-			"result is of type void"
-		);
-		static_assert( (std::is_same< typename Monoid::D1, IOType >::value),
-			"grb::foldl( BSP1D, IOType <- op( IOType, InputType ): "
-			"called with a prefactor input type that does not match the first domain of the given operator"
-		);
-		static_assert( (std::is_same< typename Monoid::D2, InputType >::value),
-			"grb::foldl( BSP1D, IOType <- op( IOType, InputType ): "
-			"called with a postfactor input type that does not match the first domain of the given operator"
-		);
-		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
-			"grb::foldl( BSP1D, IOType <- op( IOType, InputType ): "
-			"called with an output type that does not match the output domain of the given operator"
-		);
-
-#ifdef _DEBUG
-		std::cout << "In grb::foldl( BSP1D, matrix, mask, monoid )\n";
-#endif
-		RC rc = SUCCESS;
-
-		if( grb::nnz( A ) == 0 ) {
-			return rc;
-		}
-
-		// Do local folding
-		IOType local = monoid.template getIdentity< IOType >();
-		rc = foldl< descr >( local, internal::getLocal( A ), internal::getLocal( mask ), monoid );
-
-#ifdef _DEBUG
-		std::cout << "After process-local delegation, local value has become "
-			<< local << ". Entering allreduce..." << std::endl;
-#endif
-
-		// All-reduce using \a op
-		rc = rc ? rc : collectives< BSP1D >::allreduce< descr >( local, monoid.getOperator() );
-
-		// Accumulate end result
-		rc = rc ? rc : foldl( x, local, monoid.getOperator() );
-
-		return SUCCESS;
-	}
-
-	template<
-		Descriptor descr = descriptors::no_operation,
-		class Monoid,
-		typename InputType, typename IOType,
-		typename RIT, typename CIT, typename NIT
-	>
-	RC foldl(
-		IOType &x,
-		const Matrix< InputType, BSP1D, RIT, CIT, NIT > &A,
-		const Monoid &monoid,
-		const typename std::enable_if<
-			!grb::is_object< IOType >::value &&
-			!grb::is_object< InputType >::value &&
-			grb::is_monoid< Monoid >::value, void
-		>::type * const = nullptr
-	) {
-		// static checks
-		static_assert( !std::is_same< InputType, void >::value,
-			"grb::foldl( BSP1D, IOType <- op( IOType, InputType ): "
-			"the operator version of foldl cannot be used if the "
-			"input matrix is a pattern matrix (of type void)"
-		);
-		static_assert( !std::is_same< IOType, void >::value,
-			"grb::foldl( BSP1D, IOType <- op( IOType, InputType ): "
-			"the operator version of foldl cannot be used if the "
-			"result is of type void"
-		);
-		static_assert( (std::is_same< typename Monoid::D1, IOType >::value),
-			"grb::foldl( BSP1D, IOType <- op( IOType, InputType ): "
-			"called with a prefactor input type that does not match the first domain of the given operator"
-		);
-		static_assert( (std::is_same< typename Monoid::D2, InputType >::value),
-			"grb::foldl( BSP1D, IOType <- op( IOType, InputType ): "
-			"called with a postfactor input type that does not match the first domain of the given operator"
-		);
-		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
-			"grb::foldl( BSP1D, IOType <- op( IOType, InputType ): "
-			"called with an output type that does not match the output domain of the given operator"
-		);
-
-#ifdef _DEBUG
-		std::cout << "In grb::foldl( BSP1D, matrix, monoid )\n";
-#endif
-		RC rc = SUCCESS;
-
-		if( grb::nnz( A ) == 0 ) {
-			return rc;
-		}
-
-		// Do local folding
-		IOType local = monoid.template getIdentity< IOType >();
-		rc = foldl< descr >( local, internal::getLocal( A ), monoid );
-
-#ifdef _DEBUG
-		std::cout << "After process-local delegation, local value has become "
-			<< local << ". Entering allreduce..." << std::endl;
-#endif
-
-		// All-reduce using \a op
-		rc = rc ? rc : collectives< BSP1D >::allreduce< descr >( local, monoid.getOperator() );
-
-		// Accumulate end result
-		rc = rc ? rc : foldl( x, local, monoid.getOperator() );
-
-		return SUCCESS;
-	}
-
 } // namespace grb
 
 #endif
diff --git a/include/graphblas/bsp1d/vector.hpp b/include/graphblas/bsp1d/vector.hpp
index a0d44829f..1e85db74e 100644
--- a/include/graphblas/bsp1d/vector.hpp
+++ b/include/graphblas/bsp1d/vector.hpp
@@ -2377,67 +2377,6 @@ namespace grb {
 #endif
 		}
 
-		/**
-		 * Constructs a BSP1D vector.
-		 *
-		 * @see Full description in base backend.
-		 *
-		 * \internal
-		 * This constructor initialises the local vector and synchronises the global
-		 * vector once.
-		 *
-		 * TODO rewrite below logic using an iterator filter (GitHub PR 233, issue
-		 * 228)
-		 * \endinternal
-		 */
-		Vector( const std::initializer_list< D > &vals )
-			: Vector( vals.size(), vals.size() )
-		{
-#ifdef _DEBUG
-			std::cerr << "In Vector< BSP1D >::Vector( initializer_list ) constructor\n";
-#endif
-			RC ret = SUCCESS;
-			const size_t n = vals.size();
-			const internal::BSP1D_Data &data = internal::grb_BSP1D.cload();
-
-			// Set all the local values
-			for( size_t i = 0; i < vals.size(); i++ ) {
-				const D val = *( vals.begin() + i );
-
-				// check if local
-				// if( (i / x._b) % data.P != data.s ) {
-				if( data.s !=
-					internal::Distribution< BSP1D >::global_index_to_process_id(
-						i, n, data.P
-					)
-				) {
-					continue;
-				}
-
-				// local, so translate index and perform requested operation
-				const size_t local_index =
-					internal::Distribution< BSP1D >::global_index_to_local( i, n, data.P );
-#ifdef _DEBUG
-				std::cout << data.s << ", grb::setElement translates global index "
-					<< i << " to " << local_index << "\n";
-#endif
-				ret = ret
-					? ret
-					: setElement( _local, val, local_index, EXECUTE );
-			}
-
-			// Synchronise once between all processes
-			if( SUCCESS !=
-				collectives< BSP1D >::allreduce( ret, operators::any_or< RC >() )
-			) {
-				throw std::runtime_error( "grb::Vector< BSP1D >::Vector( initializer_list ): "
-					"collective::allreduce failed." );
-			}
-
-			// on successful execute, sync new nnz count
-			updateNnz();
-		}
-
 		/**
 		 * Copy constructor.
 		 *
diff --git a/include/graphblas/hyperdags/blas3.hpp b/include/graphblas/hyperdags/blas3.hpp
index 75785ee17..ee0c10f36 100644
--- a/include/graphblas/hyperdags/blas3.hpp
+++ b/include/graphblas/hyperdags/blas3.hpp
@@ -332,343 +332,6 @@ namespace grb {
 		return ret;
 	}
 
-	template<
-		Descriptor descr = descriptors::no_operation,
-		class Monoid,
-		typename InputType, typename IOType, typename MaskType,
-		typename RIT_A, typename CIT_A, typename NIT_A,
-		typename RIT_M, typename CIT_M, typename NIT_M
-	>
-	RC foldr(
-		IOType &x,
-		const Matrix< InputType, hyperdags, RIT_A, CIT_A, NIT_A > &A,
-		const Matrix< MaskType, hyperdags, RIT_M, CIT_M, NIT_M > &mask,
-		const Monoid &monoid = Monoid(),
-		const typename std::enable_if< !grb::is_object< IOType >::value &&
-			!grb::is_object< InputType >::value &&
-			!grb::is_object< MaskType >::value &&
-			grb::is_monoid< Monoid >::value, void
-		>::type * const = nullptr
-	) {
-#ifdef _DEBUG
-		std::cout << "In grb::foldr (hyperdags, mask, matrix, monoid)\n";
-#endif
-
-		const RC ret = foldr< descr, Monoid >(
-			x, internal::getMatrix( A ), internal::getMatrix( mask ), monoid
-		);
-		if( ret != SUCCESS ) { return ret; }
-		if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
-		std::array< const void *, 0 > sourcesP{};
-		std::array< uintptr_t, 2 > sourcesC{
-			getID( internal::getMatrix(A) ),
-			getID( internal::getMatrix(mask) )
-		};
-		std::array< uintptr_t, 0 > destinations{};
-		// NOTE scalar output is ignored
-		// std::array< uintptr_t, 1 > destinations{ &x };
-		internal::hyperdags::generator.addOperation(
-			internal::hyperdags::FOLDR_SCALAR_MATRIX_MASK_MONOID,
-			sourcesP.begin(), sourcesP.end(),
-			sourcesC.begin(), sourcesC.end(),
-			destinations.begin(), destinations.end()
-		);
-		return ret;
-	}
-
-	template<
-		Descriptor descr = descriptors::no_operation,
-		class Monoid,
-		typename InputType, typename IOType,
-		typename RIT, typename CIT, typename NIT
-	>
-	RC foldr(
-		IOType &x,
-		const Matrix< InputType, hyperdags, RIT, CIT, NIT > &A,
-		const Monoid &monoid,
-		const typename std::enable_if< !grb::is_object< IOType >::value &&
-			!grb::is_object< InputType >::value &&
-			grb::is_monoid< Monoid >::value, void
-		>::type * const = nullptr
-	) {
-#ifdef _DEBUG
-		std::cout << "In grb::foldr (hyperdags, matrix, monoid)\n";
-#endif
-
-		const RC ret = foldr< descr, Monoid >(
-			x, internal::getMatrix( A ), monoid
-		);
-		if( ret != SUCCESS ) { return ret; }
-		if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
-		std::array< const void *, 0 > sourcesP{};
-		std::array< uintptr_t, 1 > sourcesC{ getID( internal::getMatrix(A) ) };
-		std::array< uintptr_t, 0 > destinations{};
-		// NOTE scalar output is ignored
-		// std::array< uintptr_t, 1 > destinations{ &x };
-		internal::hyperdags::generator.addOperation(
-			internal::hyperdags::FOLDR_SCALAR_MATRIX_MONOID,
-			sourcesP.begin(), sourcesP.end(),
-			sourcesC.begin(), sourcesC.end(),
-			destinations.begin(), destinations.end()
-		);
-		return ret;
-	}
-
-	template<
-		Descriptor descr = descriptors::no_operation,
-		class Monoid,
-		typename InputType, typename IOType, typename MaskType,
-		typename RIT_A, typename CIT_A, typename NIT_A,
-		typename RIT_M, typename CIT_M, typename NIT_M
-	>
-	RC foldl(
-		IOType &x,
-		const Matrix< InputType, hyperdags, RIT_A, CIT_A, NIT_A > &A,
-		const Matrix< MaskType, hyperdags, RIT_M, CIT_M, NIT_M > &mask,
-		const Monoid &monoid,
-		const typename std::enable_if<
-			!grb::is_object< IOType >::value &&
-			!grb::is_object< InputType >::value &&
-			!grb::is_object< MaskType >::value &&
-			grb::is_monoid< Monoid >::value, void
-		>::type * const = nullptr
-	) {
-		#ifdef _DEBUG
-		std::cout << "In grb::foldl (hyperdags, mask, matrix, monoid)\n";
-#endif
-
-		const RC ret = foldl< descr, Monoid >(
-			x, internal::getMatrix( A ), internal::getMatrix( mask ), monoid
-		);
-		if( ret != SUCCESS ) { return ret; }
-		if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
-		std::array< const void *, 0 > sourcesP{};
-		std::array< uintptr_t, 2 > sourcesC{
-			getID( internal::getMatrix(A) ),
-			getID( internal::getMatrix(mask) )
-		};
-		std::array< uintptr_t, 0 > destinations{};
-		// NOTE scalar output is ignored
-		// std::array< uintptr_t, 1 > destinations{ &x };
-		internal::hyperdags::generator.addOperation(
-			internal::hyperdags::FOLDL_SCALAR_MATRIX_MASK_MONOID,
-			sourcesP.begin(), sourcesP.end(),
-			sourcesC.begin(), sourcesC.end(),
-			destinations.begin(), destinations.end()
-		);
-		return ret;
-	}
-
-	template<
-		Descriptor descr = descriptors::no_operation,
-		class Monoid,
-		typename InputType, typename IOType,
-		typename RIT, typename CIT, typename NIT
-	>
-	RC foldl(
-		IOType &x,
-		const Matrix< InputType, hyperdags, RIT, CIT, NIT > &A,
-		const Monoid &monoid,
-		const typename std::enable_if<
-			!grb::is_object< IOType >::value &&
-			!grb::is_object< InputType >::value &&
-			grb::is_monoid< Monoid >::value, void
-		>::type * const = nullptr
-	) {
-#ifdef _DEBUG
-		std::cout << "In grb::foldl (hyperdags, matrix, monoid)\n";
-#endif
-
-		const RC ret = foldl< descr, Monoid >(
-			x, internal::getMatrix( A ), monoid
-		);
-		if( ret != SUCCESS ) { return ret; }
-		if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
-		std::array< const void *, 0 > sourcesP{};
-		std::array< uintptr_t, 1 > sourcesC{ getID( internal::getMatrix(A) ) };
-		std::array< uintptr_t, 0 > destinations{};
-		// NOTE scalar output is ignored
-		// std::array< uintptr_t, 1 > destinations{ &x };
-		internal::hyperdags::generator.addOperation(
-			internal::hyperdags::FOLDL_SCALAR_MATRIX_MONOID,
-			sourcesP.begin(), sourcesP.end(),
-			sourcesC.begin(), sourcesC.end(),
-			destinations.begin(), destinations.end()
-		);
-		return ret;
-	}
-	
-
-	/**
-	 * Return the lower triangular portion of a matrix, strictly 
-	 * below the k-th diagonal.
-	 *
-	 * @param[out] L       The lower triangular portion of \a A, strictly 
-	 * 				 	   below the k-th diagonal.
-	 * @param[in]  A       Any ALP/GraphBLAS matrix.
-	 * @param[in]  k       The diagonal above which to zero out \a A.
-	 * @param[in]  phase   The #grb::Phase in which the primitive 
-	 * 					   is to proceed.
-	 *
-	 * \internal Pattern matrices are allowed
-	 */
-
-	template<
-		Descriptor descr = descriptors::no_operation,
-		typename InputType, typename OutputType,
-		typename RIT_L, typename CIT_L, typename NIT_L,
-		typename RIT_A, typename CIT_A, typename NIT_A
-	>
-	RC tril(
-		Matrix< OutputType, hyperdags, RIT_L, CIT_L, NIT_L > & L,
-		const Matrix< InputType, hyperdags, RIT_A, CIT_A, NIT_A > & A,
-		const long int k,
-		const Phase & phase = Phase::EXECUTE,
-		const typename std::enable_if< 
-			! grb::is_object< OutputType >::value && 
-			! grb::is_object< InputType >::value && 
-			std::is_convertible< InputType, OutputType >::value 
-			>::type * const = nullptr ) {
-#ifdef _DEBUG
-		std::cerr << "In grb::tril (hyperdags)\n";
-#endif
-
-		const RC ret = tril< descr >( 
-			internal::getMatrix( L ), 
-			internal::getMatrix( A ), 
-			k, phase 
-		);
-		if( ret != SUCCESS ) { return ret; }
-		if( phase != EXECUTE ) { return ret; }
-		if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
-		std::array< const void *, 0 > sourcesP{};
-		std::array< uintptr_t, 1 > sourcesL{
-			getID( internal::getMatrix(A) )
-		};
-		std::array< uintptr_t, 1 > destinations{ 
-			getID( internal::getMatrix(L) )
-		};
-		internal::hyperdags::generator.addOperation(
-			internal::hyperdags::TRIL_MATRIX,
-			sourcesP.begin(), sourcesP.end(),
-			sourcesL.begin(), sourcesL.end(),
-			destinations.begin(), destinations.end()
-		);
-		return ret;
-	}
-
-	/**
-	 * Return the lower triangular portion of a matrix, strictly 
-	 * below the main diagonal.
-	 *
-	 * This primitive is strictly equivalent to calling 
-	 * grb::tril( L, A, 0, phase ).
-	 * 
-	 * see grb::tril( L, A, k, phase ) for full description.
-	 */
-	template<
-		Descriptor descr = descriptors::no_operation,
-		typename InputType, typename OutputType,
-		typename RIT_L, typename CIT_L, typename NIT_L,
-		typename RIT_A, typename CIT_A, typename NIT_A
-	>
-	RC tril( 
-		Matrix< OutputType, hyperdags, RIT_L, CIT_L, NIT_L > & L,
-		const Matrix< InputType, hyperdags, RIT_A, CIT_A, NIT_A > & A,
-		const Phase & phase = Phase::EXECUTE,
-		const typename std::enable_if< 
-			! grb::is_object< OutputType >::value && 
-			! grb::is_object< InputType >::value && 
-			std::is_convertible< InputType, OutputType >::value 
-			>::type * const = nullptr ) {
-		return tril< descr >( L, A, 0, phase );
-	}
-
-	/**
-	 * Return the upper triangular portion of a matrix, strictly
-	 * above the k-th diagonal.
-	 *
-	 * @param[out] U       The upper triangular portion of \a A, strictly 
-	 * 					   above the k-th diagonal.
-	 * @param[in]  A       Any ALP/GraphBLAS matrix.
-	 * @param[in]  k       The diagonal above which to zero out \a A.
-	 * @param[in]  phase   The #grb::Phase in which the primitive 
-	 * 					   is to proceed.
-	 *
-	 * \internal Pattern matrices are allowed
-	 */
-
-	template<
-		Descriptor descr = descriptors::no_operation,
-		typename InputType, typename OutputType,
-		typename RIT_U, typename CIT_U, typename NIT_U,
-		typename RIT_A, typename CIT_A, typename NIT_A
-	>
-	RC triu(
-		Matrix< OutputType, hyperdags, RIT_U, CIT_U, NIT_U > & U,
-		const Matrix< InputType, hyperdags, RIT_A, CIT_A, NIT_A > & A,
-		const long int k,
-		const Phase & phase = Phase::EXECUTE,
-		const typename std::enable_if< 
-			! grb::is_object< OutputType >::value && 
-			! grb::is_object< InputType >::value && 
-			std::is_convertible< InputType, OutputType >::value 
-			>::type * const = nullptr ) {
-#ifdef _DEBUG
-		std::cerr << "In grb::triu (hyperdags)\n";
-#endif
-
-		const RC ret = triu< descr >( 
-			internal::getMatrix( U ), 
-			internal::getMatrix( A ), 
-			k, phase 
-		);
-		if( ret != SUCCESS ) { return ret; }
-		if( phase != EXECUTE ) { return ret; }
-		if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
-		std::array< const void *, 0 > sourcesP{};
-		std::array< uintptr_t, 1 > sourcesL{
-			getID( internal::getMatrix(A) )
-		};
-		std::array< uintptr_t, 1 > destinations{ 
-			getID( internal::getMatrix(U) )
-		};
-		internal::hyperdags::generator.addOperation(
-			internal::hyperdags::TRIU_MATRIX,
-			sourcesP.begin(), sourcesP.end(),
-			sourcesL.begin(), sourcesL.end(),
-			destinations.begin(), destinations.end()
-		);
-		return ret;
-	}
-
-	/**
-	 * Return the lower triangular portion of a matrix, strictly 
-	 * above the main diagonal.
-	 *
-	 * This primitive is strictly equivalent to 
-	 * calling grb::triu( U, A, 0, phase ).
-	 * 
-	 * see grb::tril( U, A, k, phase ) for full description.
-	 */
-	template<
-		Descriptor descr = descriptors::no_operation,
-		typename InputType, typename OutputType,
-		typename RIT_U, typename CIT_U, typename NIT_U,
-		typename RIT_A, typename CIT_A, typename NIT_A
-	>
-	RC triu( 
-		Matrix< OutputType, hyperdags, RIT_U, CIT_U, NIT_U > & U,
-		const Matrix< InputType, hyperdags, RIT_A, CIT_A, NIT_A > & A,
-		const Phase & phase = Phase::EXECUTE,
-		const typename std::enable_if< 
-			! grb::is_object< OutputType >::value && 
-			! grb::is_object< InputType >::value && 
-			std::is_convertible< InputType, OutputType >::value 
-			>::type * const = nullptr ) {
-		return triu< descr >( U, A, 0, phase );
-	}
-
 } // end namespace grb
 
 #endif
diff --git a/include/graphblas/hyperdags/hyperdags.hpp b/include/graphblas/hyperdags/hyperdags.hpp
index 416fe4cfc..4ef0e0059 100644
--- a/include/graphblas/hyperdags/hyperdags.hpp
+++ b/include/graphblas/hyperdags/hyperdags.hpp
@@ -488,23 +488,12 @@ namespace grb {
 
 				EWISEMUL_VECTOR_VECTOR_ALPHA_BETA_RING,
 
-				EWISELAMBDA_FUNC_VECTOR,
+				EWISELAMBDA_FUNC_VECTOR
 
-				FOLDL_SCALAR_MATRIX_MASK_MONOID,
-
-				FOLDL_SCALAR_MATRIX_MONOID,
-
-				FOLDR_SCALAR_MATRIX_MASK_MONOID,
-
-				FOLDR_SCALAR_MATRIX_MONOID,
-
-				TRIL_MATRIX,
-
-				TRIU_MATRIX
 			};
 
 			/** \internal How many operation vertex types exist. */
-			const constexpr size_t numOperationVertexTypes = 112;
+			const constexpr size_t numOperationVertexTypes = 106;
 
 			/** \internal An array of all operation vertex types. */
 			const constexpr enum OperationVertexType
@@ -615,13 +604,7 @@ namespace grb {
 				EWISEMUL_VECTOR_VECTOR_ALPHA_VECTOR_RING,
 				EWISEMUL_VECTOR_VECTOR_VECTOR_BETA_RING,
 				EWISEMUL_VECTOR_VECTOR_ALPHA_BETA_RING,
-				EWISELAMBDA_FUNC_VECTOR,
-				FOLDL_SCALAR_MATRIX_MASK_MONOID,
-				FOLDL_SCALAR_MATRIX_MONOID,
-				FOLDR_SCALAR_MATRIX_MASK_MONOID,
-				FOLDR_SCALAR_MATRIX_MONOID,
-				TRIL_MATRIX,
-				TRIU_MATRIX
+				EWISELAMBDA_FUNC_VECTOR
 			};
 
 			/** \internal @returns The operation vertex type as a string. */
diff --git a/include/graphblas/hyperdags/io.hpp b/include/graphblas/hyperdags/io.hpp
index 09b16634a..db1f09e54 100644
--- a/include/graphblas/hyperdags/io.hpp
+++ b/include/graphblas/hyperdags/io.hpp
@@ -180,8 +180,7 @@ namespace grb {
 		typename T
 	>
 	RC set(
-		Vector< DataType, hyperdags, Coords > &x, 
-		const T val,
+		Vector< DataType, hyperdags, Coords > &x, const T val,
 		const Phase &phase = EXECUTE,
 		const typename std::enable_if<
 			!grb::is_object< DataType >::value &&
diff --git a/include/graphblas/hyperdags/vector.hpp b/include/graphblas/hyperdags/vector.hpp
index 02c0f7734..5f422399e 100644
--- a/include/graphblas/hyperdags/vector.hpp
+++ b/include/graphblas/hyperdags/vector.hpp
@@ -161,15 +161,6 @@ namespace grb {
 				register_vector();
 			}
 
-			Vector( const std::initializer_list< T > vals ) : vector( vals )
-			{
-#ifdef _DEBUG
-				std::cout << "In Vector< hyperdags >::Vector( initializer_list )"
-					<< " constructor\n";
-#endif
-				register_vector();
-			}
-
 			~Vector() {
 #ifdef _DEBUG
 				std::cout << "Vector (hyperdags) destructor\n";
diff --git a/include/graphblas/nonblocking/blas1.hpp b/include/graphblas/nonblocking/blas1.hpp
index f970da41a..f9f14cafc 100644
--- a/include/graphblas/nonblocking/blas1.hpp
+++ b/include/graphblas/nonblocking/blas1.hpp
@@ -548,7 +548,7 @@ namespace grb {
 			typename Monoid::D3 global =
 				monoid.template getIdentity< typename Monoid::D3 >();
 
-			size_t local_reduced_size = NONBLOCKING::numThreads() *
+			size_t local_reduced_size = sysconf( _SC_NPROCESSORS_ONLN ) *
 				config::CACHE_LINE_SIZE::value();
 			IOType local_reduced[ local_reduced_size ];
 
@@ -10550,7 +10550,7 @@ namespace grb {
 				typename AddMonoid::D3 reduced =
 					addMonoid.template getIdentity< typename AddMonoid::D3 >();
 
-				size_t reduced_size = NONBLOCKING::numThreads() *
+				size_t reduced_size = sysconf( _SC_NPROCESSORS_ONLN ) *
 					config::CACHE_LINE_SIZE::value();
 				typename AddMonoid::D3 array_reduced[ reduced_size ];
 
diff --git a/include/graphblas/nonblocking/blas3.hpp b/include/graphblas/nonblocking/blas3.hpp
index 27d633900..c52c9aaff 100644
--- a/include/graphblas/nonblocking/blas3.hpp
+++ b/include/graphblas/nonblocking/blas3.hpp
@@ -571,118 +571,6 @@ namespace grb {
 		);
 	}
 
-	template<
-		Descriptor descr = descriptors::no_operation,
-		class Monoid,
-		typename InputType, typename IOType, typename MaskType,
-		typename RIT_A, typename CIT_A, typename NIT_A,
-		typename RIT_M, typename CIT_M, typename NIT_M
-	>
-	RC foldr(
-		IOType &x,
-		const Matrix< InputType, nonblocking, RIT_A, CIT_A, NIT_A > &A,
-		const Matrix< MaskType, nonblocking, RIT_M, CIT_M, NIT_M > &mask,
-		const Monoid &monoid = Monoid(),
-		const typename std::enable_if< !grb::is_object< IOType >::value &&
-			!grb::is_object< InputType >::value &&
-			!grb::is_object< MaskType >::value &&
-			grb::is_monoid< Monoid >::value, void
-		>::type * const = nullptr
-	) {
-#ifdef _DEBUG
-		std::cout << "In grb::foldr( nonblocking, matrix, mask, monoid )\n";
-#endif
-		// nonblocking execution is not supported
-		// first, execute any computation that is not completed
-		internal::le.execution();
-
-		// second, delegate to the reference backend
-		return foldr< descr, Monoid >( x, internal::getRefMatrix( A ), internal::getRefMatrix( mask ), monoid );
-	}
-
-	template<
-		Descriptor descr = descriptors::no_operation,
-		class Monoid,
-		typename InputType, typename IOType,
-		typename RIT, typename CIT, typename NIT
-	>
-	RC foldr(
-		IOType &x,
-		const Matrix< InputType, nonblocking, RIT, CIT, NIT > &A,
-		const Monoid &monoid,
-		const typename std::enable_if< !grb::is_object< IOType >::value &&
-			!grb::is_object< InputType >::value &&
-			grb::is_monoid< Monoid >::value, void
-		>::type * const = nullptr
-	) {
-#ifdef _DEBUG
-		std::cout << "In grb::foldr( nonblocking, matrix, monoid )\n";
-#endif
-		// nonblocking execution is not supported
-		// first, execute any computation that is not completed
-		internal::le.execution();
-
-		// second, delegate to the reference backend
-		return foldr< descr, Monoid >( x, internal::getRefMatrix( A ), monoid	);
-	}
-
-	template<
-		Descriptor descr = descriptors::no_operation,
-		class Monoid,
-		typename InputType, typename IOType, typename MaskType,
-		typename RIT_A, typename CIT_A, typename NIT_A,
-		typename RIT_M, typename CIT_M, typename NIT_M
-	>
-	RC foldl(
-		IOType &x,
-		const Matrix< InputType, nonblocking, RIT_A, CIT_A, NIT_A > &A,
-		const Matrix< MaskType, nonblocking, RIT_M, CIT_M, NIT_M > &mask,
-		const Monoid &monoid,
-		const typename std::enable_if<
-			!grb::is_object< IOType >::value &&
-			!grb::is_object< InputType >::value &&
-			!grb::is_object< MaskType >::value &&
-			grb::is_monoid< Monoid >::value, void
-		>::type * const = nullptr
-	) {
-#ifdef _DEBUG
-		std::cout << "In grb::foldl( nonblocking, matrix, mask, monoid )\n";
-#endif
-		// nonblocking execution is not supported
-		// first, execute any computation that is not completed
-		internal::le.execution();
-
-		// second, delegate to the reference backend
-		return foldl< descr, Monoid >( x, internal::getRefMatrix( A ), internal::getRefMatrix( mask ), monoid );
-	}
-
-	template<
-		Descriptor descr = descriptors::no_operation,
-		class Monoid,
-		typename InputType, typename IOType,
-		typename RIT, typename CIT, typename NIT
-	>
-	RC foldl(
-		IOType &x,
-		const Matrix< InputType, nonblocking, RIT, CIT, NIT > &A,
-		const Monoid &monoid,
-		const typename std::enable_if<
-			!grb::is_object< IOType >::value &&
-			!grb::is_object< InputType >::value &&
-			grb::is_monoid< Monoid >::value, void
-		>::type * const = nullptr
-	) {
-#ifdef _DEBUG
-		std::cout << "In grb::foldl( nonblocking, matrix, monoid )\n";
-#endif
-		// nonblocking execution is not supported
-		// first, execute any computation that is not completed
-		internal::le.execution();
-
-		// second, delegate to the reference backend
-		return foldl< descr, Monoid >( x, internal::getRefMatrix( A ), monoid	);
-	}
-
 } // namespace grb
 
 #undef NO_CAST_ASSERT
diff --git a/include/graphblas/nonblocking/coordinates.hpp b/include/graphblas/nonblocking/coordinates.hpp
index ad9e3c670..bcb4cf42a 100644
--- a/include/graphblas/nonblocking/coordinates.hpp
+++ b/include/graphblas/nonblocking/coordinates.hpp
@@ -232,21 +232,6 @@ namespace grb {
 					}
 				}
 
-				/**
-				 * Sets this data structure to a dummy placeholder for a dense structure.
-				 *
-				 * This structure will be immutable, and does not support the majority of
-				 * operations this class defines; use dense coordinates with care.
-				 */
-				void setDense( const size_t dim ) noexcept {
-					_assigned = nullptr;
-					_stack = nullptr;
-					_buffer = nullptr;
-					_n = dim;
-					_cap = dim;
-					_buf = 0;
-				}
-
 				inline bool assign( const size_t i ) noexcept {
 					if( _n == _cap ) {
 						return true;
@@ -265,7 +250,7 @@ namespace grb {
 				}
 
 				template< bool maybe_invalid = false >
-				inline void local_assignAll() noexcept {
+				inline void local_assignAll( ) noexcept {
 					if( maybe_invalid || _n != _cap ) {
 						if( _assigned != nullptr ) {
 							assert( _stack != nullptr );
@@ -318,21 +303,8 @@ namespace grb {
 					}
 				}
 
-				inline void assignAll() noexcept {
-					// this operates on the global coordinates, not on a local view of it
-					#pragma omp parallel
-					{
-						size_t start, end;
-						config::OMP::localRange( start, end, 0, _cap );
-						for( size_t i = start; i < end; ++i ) {
-							_assigned[ i ] = true;
-							_stack[ i ] = i;
-						}
-					}
-					_n = _cap;
-				}
-
 				inline void clear() noexcept {
+
 					if( _n == _cap ) {
 #ifndef NDEBUG
 						if( _assigned == nullptr && _cap > 0 ) {
@@ -340,6 +312,7 @@ namespace grb {
 							assert( dense_coordinates_may_not_call_clear );
 						}
 #endif
+
 						#pragma omp parallel for schedule( dynamic, config::CACHE_LINE_SIZE::value() )
 						for( size_t i = 0; i < _cap; ++i ) {
 							_assigned[ i ] = false;
diff --git a/include/graphblas/nonblocking/matrix.hpp b/include/graphblas/nonblocking/matrix.hpp
index 00d0e5ebb..251e2037d 100644
--- a/include/graphblas/nonblocking/matrix.hpp
+++ b/include/graphblas/nonblocking/matrix.hpp
@@ -419,32 +419,6 @@ namespace grb {
 			const Matrix< InputType, nonblocking, RIT, CIT, NIT > &
 		);
 
-		// Native interface friends
-
-		friend const grb::Matrix<
-			D, nonblocking,
-			ColIndexType, ColIndexType, NonzeroIndexType
-		>
-		internal::wrapCRSMatrix< D, ColIndexType, NonzeroIndexType, nonblocking >(
-			const D *__restrict__ const,
-			const ColIndexType *__restrict__ const,
-			const NonzeroIndexType *__restrict__ const,
-			const size_t, const size_t
-		);
-
-		friend grb::Matrix<
-			D, nonblocking,
-			ColIndexType, ColIndexType, NonzeroIndexType
-		>
-		internal::wrapCRSMatrix< D, ColIndexType, NonzeroIndexType, nonblocking >(
-			D *__restrict__ const,
-			ColIndexType *__restrict__ const,
-			NonzeroIndexType *__restrict__ const,
-			const size_t, const size_t, const size_t,
-			char * const, char * const,
-			D *__restrict__ const
-		);
-
 
 		private:
 
diff --git a/include/graphblas/nonblocking/vector.hpp b/include/graphblas/nonblocking/vector.hpp
index a70c48521..9b57d8c24 100644
--- a/include/graphblas/nonblocking/vector.hpp
+++ b/include/graphblas/nonblocking/vector.hpp
@@ -197,46 +197,12 @@ namespace grb {
 
 		friend class PinnedVector< D, nonblocking >;
 
-		// Native interface friends
-
-		template< typename ValueType, Backend backend >
-		friend Vector<
-			ValueType, backend,
-			internal::Coordinates<
-				config::IMPLEMENTATION< backend >::coordinatesBackend()
-			>
-		> internal::wrapRawVector( const size_t n, ValueType *__restrict__ const
-			raw );
-
-		template< typename ValueType, Backend backend >
-		friend const Vector<
-			ValueType, backend,
-			internal::Coordinates<
-				config::IMPLEMENTATION< backend >::coordinatesBackend()
-			>
-		> internal::wrapRawVector( const size_t n, const ValueType *__restrict__ const raw );
-
 
 		private:
 
 			Vector< D, reference, MyCoordinates > ref;
 
 
-		protected:
-
-			/**
-			 * \internal Internal constructor that wraps around an existing raw dense
-			 *           vector. This constructor results in a dense vector whose
-			 *           structure is immutable. Any invalid use incurs UB; use with care.
-			 */
-			Vector( const size_t n, D *__restrict__ const raw ) : ref( n, raw ) {
-#ifdef _DEBUG
-				std::cerr << "In Vector< nonblocking > constructor that wraps around an "
-					<< "external raw array.\n";
-#endif
-			}
-
-
 		public:
 
 			/** @see Vector::value_type. */
@@ -255,14 +221,8 @@ namespace grb {
 			typedef typename Vector< D, reference, MyCoordinates >::const_iterator
 				const_iterator;
 
-			Vector( const size_t n, const size_t nz ) : ref( n, nz ) {
-				// pipeline execution is not required here as this is a grb::Vector
-				// declaration
-#ifdef _DEBUG
-				std::cerr << "In Vector< nonblocking >::Vector( size_t, size_t )"
-					<< " constructor\n";				
-#endif
-			}
+
+			Vector( const size_t n, const size_t nz ) : ref( n, nz ) {}
 
 			Vector( const size_t n ) : Vector( n, n ) {
 
@@ -273,15 +233,6 @@ namespace grb {
 #endif
 			}
 
-			Vector( const std::initializer_list< D > vals ) : ref( vals ) {
-				// pipeline execution is not required here as this is a grb::Vector
-				// declaration
-#ifdef _DEBUG
-				std::cerr << "In Vector< nonblocking >::Vector( initializer_list )"
-					<< " constructor\n";				
-#endif
-			}
-
 			Vector() : Vector( 0 ) {}
 
 			Vector( const Vector< D, nonblocking, MyCoordinates > &x ) :
diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index 865f7bc51..40acf6531 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2573,10 +2573,9 @@ namespace grb {
 				j_start = A.n / 2;
 				assert( A.n > 0 );
 				while( j_start < A.n && !(
-						A.CCS.col_start[ j_start ] <= start &&
-						start < A.CCS.col_start[ j_start + 1 ]
-					)
-				) {
+					A.CCS.col_start[ j_start ] <=
+						start && start < A.CCS.col_start[ j_start + 1 ]
+				) ) {
 #ifdef _DEBUG
  #ifdef _H_GRB_REFERENCE_OMP_BLAS2
 					#pragma omp critical
@@ -2609,32 +2608,33 @@ namespace grb {
 				j_left_range = 0;
 				j_right_range = A.n;
 				j_end = A.n / 2;
-				while( j_end < A.n && !(
+				if( j_end < A.CCS.col_start[ A.n ] ) {
+					while( j_end < A.n && !(
 						A.CCS.col_start[ j_end ] <= end &&
 						end < A.CCS.col_start[ j_end + 1 ]
-					)
-				) {
+					) ) {
 #ifdef _DEBUG
  #ifdef _H_GRB_REFERENCE_OMP_BLAS2
-					#pragma omp critical
+						#pragma omp critical
  #endif
-					std::cout << "\t binary search for " << end << " in [ " << j_left_range
-						<< ", " << j_right_range << " ) = [ " << A.CCS.col_start[ j_left_range ]
-						<< ", " << A.CCS.col_start[ j_right_range ] << " ). "
-						<< "Currently tried and failed at " << j_end << "\n";
-#endif
-					if( j_right_range == j_left_range ) {
-						assert( false );
-						break;
-					} else if( A.CCS.col_start[ j_end ] > end ) {
-						j_right_range = j_end;
-					} else {
-						j_left_range = j_end + 1;
+						std::cout << "\t binary search for " << end << " in [ " << j_left_range
+							<< ", " << j_right_range << " ) = [ " << A.CCS.col_start[ j_left_range ]
+							<< ", " << A.CCS.col_start[ j_right_range ] << " ). "
+							<< "Currently tried and failed at " << j_end << "\n";
+#endif
+						if( j_right_range == j_left_range ) {
+							assert( false );
+							break;
+						} else if( A.CCS.col_start[ j_end ] > end ) {
+							j_right_range = j_end;
+						} else {
+							j_left_range = j_end + 1;
+						}
+						assert( j_right_range >= j_left_range );
+						j_end = j_right_range - j_left_range;
+						j_end /= 2;
+						j_end += j_left_range;
 					}
-					assert( j_right_range >= j_left_range );
-					j_end = j_right_range - j_left_range;
-					j_end /= 2;
-					j_end += j_left_range;
 				}
 				if( j_start > j_end ) {
 					j_start = j_end;
diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp
index 8bc18f32a..868547183 100644
--- a/include/graphblas/reference/blas3.hpp
+++ b/include/graphblas/reference/blas3.hpp
@@ -57,25 +57,6 @@
 		"********************************************************************" \
 		"******************************\n" );
 
-#ifndef _DEBUG_PRINT
-	#ifndef _DEBUG
-		#define _DEBUG_PRINT( msg )
-	#else
-		#ifdef _GRB_WITH_OMP
-			#ifndef _GRB_DEBUG_CRITICAL_SECTION
-				#define _GRB_DEBUG_CRITICAL_SECTION _Pragma("omp critical(_GRB_DEBUG_CRITICAL_SECTION)")
-			#endif
-			#define _DEBUG_PRINT( msg ) \
-				_GRB_DEBUG_CRITICAL_SECTION \
-				{ \
-					std::cout << "[T" << omp_get_thread_num() << "] - " << msg << std::flush; \
-				}
-		#else
-			#define _DEBUG_PRINT( msg ) std::cout << msg << std::flush;
-		#endif
-	#endif
-#endif
-
 namespace grb {
 
 	namespace internal {
@@ -937,185 +918,6 @@ namespace grb {
 
 	namespace internal {
 
-		template<
-			Descriptor descr = descriptors::no_operation,
-			class Monoid,
-			typename InputType, typename IOType,
-			typename RIT, typename CIT, typename NIT
-		>
-		RC fold_unmasked_generic(
-			IOType &x,
-			const Matrix< InputType, reference, RIT, CIT, NIT > &A,
-			const Monoid &monoid
-		) {
-			_DEBUG_PRINT( "In grb::internal::foldr_unmasked_generic( reference )\n" );
-			RC rc = SUCCESS;
-
-			if( grb::nnz( A ) == 0 ) {
-				_DEBUG_PRINT( "The input matrix is empty, nothing to compute\n" );
-				return rc;
-			}
-
-			if ( descr & descriptors::force_row_major && descr & descriptors::transpose_left ) {
-				_DEBUG_PRINT( "Masked fold with force_row_major and transpose_left is not supported\n" );
-				return RC::ILLEGAL;
-			}
-			if ( descr & descriptors::force_row_major && descr & descriptors::transpose_matrix ) {
-				_DEBUG_PRINT( "Masked fold with force_row_major and transpose_matrix is not supported\n" );
-				return RC::ILLEGAL;
-			}
-
-			const auto &A_raw = (descr & grb::descriptors::transpose_matrix || descr & grb::descriptors::transpose_left ) ?
-				internal::getCCS( A ) : internal::getCRS( A );
-			const size_t A_nnz = nnz( A );
-
-			const auto& op = monoid.getOperator();
-			RC local_rc = rc;
-			auto local_x = monoid.template getIdentity< typename Monoid::D3 >();
-
-#ifdef _H_GRB_REFERENCE_OMP_BLAS3
-	#pragma omp parallel default(none) shared(A_raw, x, rc, std::cout) firstprivate(local_x, local_rc, A_nnz, op)
-#endif
-			{
-				size_t start = 0;
-				size_t end = A_nnz;
-#ifdef _H_GRB_REFERENCE_OMP_BLAS3
-				config::OMP::localRange( start, end, 0, A_nnz );
-#endif
-
-				for( size_t idx = start; idx < end; ++idx ) {
-					// Get A value
-					const InputType a_val = A_raw.values[ idx ];
-					_DEBUG_PRINT( "A.values[ " + std::to_string( idx ) + " ] = " + std::to_string( a_val ) + "\n" );
-
-					// Compute the fold for this coordinate
-					auto local_x_before = local_x;
-					local_rc = local_rc ? local_rc : grb::apply< descr >( local_x, local_x_before, a_val, op );
-					_DEBUG_PRINT( "Computing: local_x = op(" + std::to_string( a_val ) + ", " + std::to_string( local_x_before ) + ") = " + std::to_string( local_x ) + "\n" );
-				}
-
-
-#ifdef _H_GRB_REFERENCE_OMP_BLAS3
-	#pragma omp critical
-#endif
-				{ // Reduction with the global result (critical section if OpenMP)
-					auto x_before = x;
-					local_rc = local_rc ? local_rc : grb::apply< descr >( x, x_before, local_x, op );
-					_DEBUG_PRINT( "Computing x: op(" + std::to_string( local_x ) + ", " + std::to_string( x_before ) + ") = " + std::to_string( x ) + "\n" );
-					rc = rc ? rc : local_rc;
-				}
-			}
-
-			return rc;
-		}
-
-		template<
-			Descriptor descr = descriptors::no_operation,
-			class Monoid,
-			typename InputType, typename IOType, typename MaskType,
-			typename RIT_A, typename CIT_A, typename NIT_A,
-			typename RIT_M, typename CIT_M, typename NIT_M
-		>
-		RC fold_masked_generic(
-			IOType &x,
-			const Matrix< InputType, reference, RIT_A, CIT_A, NIT_A > &A,
-			const Matrix< MaskType, reference, RIT_M, CIT_M, NIT_M > &mask,
-			const Monoid &monoid
-		) {
-			_DEBUG_PRINT( "In grb::internal::foldr_masked_generic( reference )\n" );
-			RC rc = SUCCESS;
-
-			if( grb::nnz( mask ) == 0 || grb::nnz( A ) == 0 ) {
-				_DEBUG_PRINT( "The mask and/or the input matrix are empty, nothing to compute\n" );
-				return rc;
-			}
-
-			if ( descr & descriptors::force_row_major && descr & descriptors::transpose_left ) {
-				_DEBUG_PRINT( "Masked fold with force_row_major and transpose_left is not supported\n" );
-				return RC::ILLEGAL;
-			}
-			if ( descr & descriptors::force_row_major && descr & descriptors::transpose_right ) {
-				_DEBUG_PRINT( "Masked fold with force_row_major and transpose_right is not supported\n" );
-				return RC::ILLEGAL;
-			}
-
-			const auto& identity = monoid.template getIdentity< typename Monoid::D3 >();
-			const auto& op = monoid.getOperator();
-
-			const auto &A_raw = descr & grb::descriptors::transpose_left ?
-				internal::getCCS( A ) : internal::getCRS( A );
-			const auto &mask_raw = descr & grb::descriptors::transpose_right ?
-				internal::getCCS( mask ) : internal::getCRS( mask );
-			const size_t m = descr & grb::descriptors::transpose_left ?
-				ncols( A ) : nrows( A );
-			const size_t n = descr & grb::descriptors::transpose_left ?
-				nrows( A ) : ncols( A );
-			const size_t m_mask = descr & grb::descriptors::transpose_right ?
-				ncols( mask ) : nrows( mask );
-			const size_t n_mask = descr & grb::descriptors::transpose_right ?
-				nrows( mask ) : ncols( mask );
-
-			// Check mask dimensions
-			if( m != m_mask || n != n_mask ) {
-				_DEBUG_PRINT( "Mask dimensions do not match input matrix dimensions\n" );
-				return MISMATCH;
-			}
-
-			RC local_rc = rc;
-			auto local_x = identity;
-
-#ifdef _H_GRB_REFERENCE_OMP_BLAS3
-	#pragma omp parallel default(none) shared(A_raw, mask_raw, x, rc, std::cout) firstprivate(local_x, local_rc, m, op, identity)
-#endif
-			{
-				size_t start_row = 0;
-				size_t end_row = m;
-#ifdef _H_GRB_REFERENCE_OMP_BLAS3
-				config::OMP::localRange( start_row, end_row, 0, m );
-#endif
-				for( auto i = start_row; i < end_row; ++i ) {
-					auto mask_k = mask_raw.col_start[ i ];
-					for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
-						auto k_col = A_raw.row_index[ k ];
-
-						// Increment the mask pointer until we find the right column, or a lower column (since the storage withing a row is sorted in a descending order)
-						while( mask_k < mask_raw.col_start[ i + 1 ] && mask_raw.row_index[ mask_k ] > k_col  ) {
-							_DEBUG_PRINT( "NEquals masked coordinate: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
-							mask_k++;
-						}
-
-						if( mask_raw.row_index[ mask_k ] < k_col || not MaskHasValue< MaskType >( mask_raw, mask_k ).value ) {
-							mask_k++;
-							_DEBUG_PRINT( "Skip masked value at: ( " + std::to_string( i ) + ";" + std::to_string( mask_raw.row_index[ mask_k ] ) + " )\n" );
-							continue;
-						}
-
-						// Get A value
-						const InputType a_val = A_raw.getValue( k, identity );
-						_DEBUG_PRINT( "A( " + std::to_string( i ) + ";" + std::to_string( k_col ) + " ) = " + std::to_string( a_val ) + "\n" );
-
-						// Compute the fold for this coordinate
-						auto local_x_before = local_x;
-						local_rc = local_rc ? local_rc : grb::apply< descr >( local_x, local_x_before, a_val, op );
-						_DEBUG_PRINT( "Computing: local_x = op(" + std::to_string( a_val ) + ", " + std::to_string( local_x_before ) + ") = " + std::to_string( local_x ) + "\n" );
-					}
-				}
-
-#ifdef _H_GRB_REFERENCE_OMP_BLAS3
-	#pragma omp critical
-#endif
-				{ // Reduction with the global result (critical section if OpenMP)
-					auto x_before = x;
-					local_rc = local_rc ? local_rc : grb::apply< descr >( x, x_before, local_x, op );
-					_DEBUG_PRINT( "Computing x: op(" + std::to_string( local_x ) + ", " + std::to_string( x_before ) + ") = " + std::to_string( x ) + "\n" );
-					rc = rc ? rc : local_rc;
-				}
-			}
-
-			return rc;
-		}
-
-
 		/**
 		 * \internal general elementwise matrix application that all eWiseApply
 		 *           variants refer to.
@@ -1402,142 +1204,6 @@ namespace grb {
 			return SUCCESS;
 		}
 
-		template<
-			bool upper,
-			Descriptor descr = descriptors::no_operation,
-			typename InputType, typename OutputType,
-			typename RIT_L, typename CIT_L, typename NIT_L,
-			typename RIT_A, typename CIT_A, typename NIT_A
-		>
-		RC trilu_generic(
-			Matrix< OutputType, reference, RIT_L, CIT_L, NIT_L > & L,
-			const Matrix< InputType, reference, RIT_A, CIT_A, NIT_A > & A,
-			const long int k,
-			const Phase & phase ) {
-
-			const size_t m = descr & descriptors::transpose_matrix ? ncols( A ) : nrows( A );
-			const size_t n = descr & descriptors::transpose_matrix ? nrows( A ) : ncols( A );
-
-			// Run-time checks
-			if( m != nrows( L ) || n != ncols( L ) ) {
-				return RC::MISMATCH;
-			}
-
-#ifdef _DEBUG
-			std::cout << "In grb::internal::trilu_generic( reference )\n";
-#endif
-			const auto & A_raw = descr & descriptors::transpose_matrix ? internal::getCCS( A ) : internal::getCRS( A );
-
-			if( phase == Phase::RESIZE ) {
-				size_t nzc = 0;
-#ifdef _H_GRB_REFERENCE_OMP_BLAS3
-#pragma omp parallel for reduction( + : nzc ) default( none ) shared( A_raw ) firstprivate( k, m )
-#endif
-				for( size_t i = 0; i < m; ++i ) {
-					for( auto A_k = A_raw.col_start[ i ]; A_k < A_raw.col_start[ i + 1 ]; ++A_k ) {
-						const auto A_j = A_raw.row_index[ A_k ];
-						// If the value is in the appropriate triangle, skip it
-						if( not upper && A_j > i + k ) {
-							continue;
-						} 
-						if( upper && A_j < i - k ) {
-							continue;
-						}
-						nzc += 1;
-					}
-				}
-#ifdef _DEBUG
-				std::cout << "RESIZE phase: resize( L, " << nzc << " )\n";
-#endif
-				return resize( L, nzc );
-			}
-
-			if( phase == Phase::EXECUTE ) {
-
-				const auto & L_crs_raw = internal::getCRS( L );
-				const auto & L_ccs_raw = internal::getCCS( L );
-				const size_t nzc = capacity( L );
-
-				L_crs_raw.col_start[ 0 ] = 0;
-				L_ccs_raw.col_start[ 0 ] = 0;
-
-				// Prefix sum computation into L.CRS.col_start
-#ifdef _H_GRB_REFERENCE_OMP_BLAS3
-#pragma omp parallel for simd default( none ) shared( A_raw, L_crs_raw, L_ccs_raw ) firstprivate( k, m )
-#endif
-				for( size_t i = 0; i < m; i++ ) {
-					size_t cumul = 0UL;
-					for( auto A_k = A_raw.col_start[ i ]; A_k < A_raw.col_start[ i + 1 ]; ++A_k ) {
-						const auto A_j = A_raw.row_index[ A_k ];
-						// If the value is in the appropriate triangle, skip it
-						if( not upper && A_j > i + k ) {
-							continue;
-						} 
-						if( upper && A_j < i - k ) {
-							continue;
-						}
-						cumul += 1;
-					}
-					L_crs_raw.col_start[ i + 1 ] = cumul;
-				}
-
-				// Apply the prefix sum
-				for( size_t i = 1; i <= m; i++ ) {
-					L_crs_raw.col_start[ i ] += L_crs_raw.col_start[ i - 1 ];
-					L_ccs_raw.col_start[ i ] = L_crs_raw.col_start[ i ];
-				}
-
-				// Check if the number of nonzeros is greater than the capacity
-				if( L_crs_raw.col_start[ m ] > nzc ) {
-#ifdef _DEBUG
-					std::cout << "EXECUTE phase: detected insufficient capacity for requested operation.\n"
-							  << "Requested " << L_crs_raw.col_start[ m ] << " nonzeros, but capacity is " << nzc << "\n";
-#endif
-					return RC::MISMATCH;
-				}
-
-#ifdef _H_GRB_REFERENCE_OMP_BLAS3
-#pragma omp parallel default( none ) shared( A_raw, L_crs_raw, L_ccs_raw ) firstprivate( k, m )
-#endif
-				{
-					size_t start_row = 0;
-					size_t end_row = m;
-#ifdef _H_GRB_REFERENCE_OMP_BLAS3
-					config::OMP::localRange( start_row, end_row, 0, m );
-#endif
-					// Update the CRS and CCS row indices and values
-					for( size_t i = start_row; i < end_row; i++ ) {
-						auto L_k = L_crs_raw.col_start[ i ];
-						for( auto A_k = A_raw.col_start[ i ]; A_k < A_raw.col_start[ i + 1 ]; ++A_k ) {
-							const auto A_j = A_raw.row_index[ A_k ];
-							// If the value is in the appropriate triangle, skip it
-							if( not upper && A_j > i + k ) {
-								continue;
-							} 
-							if( upper && A_j < i - k ) {
-								continue;
-							}
-
-							L_crs_raw.row_index[ L_k ] = A_j;
-							L_crs_raw.values[ L_k ] = A_raw.values[ A_k ];
-							L_ccs_raw.row_index[ L_k ] = i;
-							L_ccs_raw.values[ L_k ] = A_raw.values[ A_k ];
-							L_k += 1;
-						}
-					}
-				}
-
-#ifdef _DEBUG
-				std::cout << "EXECUTE phase: setCurrentNonzeroes( L, " << nzc << " )\n";
-#endif
-				internal::setCurrentNonzeroes( L, nzc );
-
-				return RC::SUCCESS;
-			}
-
-			return RC::SUCCESS;;
-		}
-
 	} // namespace internal
 
 	/**
@@ -1603,6 +1269,7 @@ namespace grb {
 	 *
 	 * \internal Dispatches to internal::eWiseApply_matrix_generic
 	 */
+
 	template<
 		Descriptor descr = grb::descriptors::no_operation,
 		class Operator,
@@ -1659,350 +1326,6 @@ namespace grb {
 		);
 	}
 
-	template<
-		Descriptor descr = descriptors::no_operation,
-		class Monoid,
-		typename InputType, typename IOType, typename MaskType,
-		typename RIT_A, typename CIT_A, typename NIT_A,
-		typename RIT_M, typename CIT_M, typename NIT_M
-	>
-	RC foldr(
-		IOType &x,
-		const Matrix< InputType, reference, RIT_A, CIT_A, NIT_A > &A,
-		const Matrix< MaskType, reference, RIT_M, CIT_M, NIT_M > &mask,
-		const Monoid &monoid = Monoid(),
-		const typename std::enable_if< !grb::is_object< IOType >::value &&
-			!grb::is_object< InputType >::value &&
-			!grb::is_object< MaskType >::value &&
-			grb::is_monoid< Monoid >::value, void
-		>::type * const = nullptr
-	) {
-		// static checks
-		static_assert( !std::is_same< InputType, void >::value,
-			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
-			"the operator version of foldr cannot be used if the "
-			"input matrix is a pattern matrix (of type void)"
-		);
-		static_assert( !std::is_same< IOType, void >::value,
-			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
-			"the operator version of foldr cannot be used if the "
-			"result is of type void"
-		);
-		static_assert( (std::is_same< typename Monoid::D1, InputType >::value),
-			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
-			"called with a prefactor input type that does not match the first domain of the given operator"
-		);
-		static_assert( (std::is_same< typename Monoid::D2, IOType >::value),
-			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
-			"called with a postfactor input type that does not match the second domain of the given operator"
-		);
-		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
-			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
-			"called with an output type that does not match the output domain of the given operator"
-		);
-
-#ifdef _DEBUG
-		std::cout << "In grb::foldr( reference, mask, matrix, monoid )\n";
-#endif
-
-		return internal::fold_masked_generic< descr, Monoid >(
-			x, A, mask, monoid
-		);
-	}
-
-	template<
-		Descriptor descr = descriptors::no_operation,
-		class Monoid,
-		typename InputType, typename IOType,
-		typename RIT, typename CIT, typename NIT
-	>
-	RC foldr(
-		IOType &x,
-		const Matrix< InputType, reference, RIT, CIT, NIT > &A,
-		const Monoid &monoid,
-		const typename std::enable_if< !grb::is_object< IOType >::value &&
-			!grb::is_object< InputType >::value &&
-			grb::is_monoid< Monoid >::value, void
-		>::type * const = nullptr
-	) {
-		// static checks
-		static_assert( !std::is_same< InputType, void >::value,
-			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
-			"the operator version of foldr cannot be used if the "
-			"input matrix is a pattern matrix (of type void)"
-		);
-		static_assert( !std::is_same< IOType, void >::value,
-			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
-			"the operator version of foldr cannot be used if the "
-			"result is of type void"
-		);
-		static_assert( (std::is_same< typename Monoid::D1, InputType >::value),
-			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
-			"called with a prefactor input type that does not match the first domain of the given operator"
-		);
-		static_assert( (std::is_same< typename Monoid::D2, IOType >::value),
-			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
-			"called with a postfactor input type that does not match the second domain of the given operator"
-		);
-		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
-			"grb::foldr( reference, IOType <- op( InputType, IOType ): "
-			"called with an output type that does not match the output domain of the given operator"
-		);
-
-#ifdef _DEBUG
-		std::cout << "In grb::foldr( reference, matrix, monoid )\n";
-#endif
-
-		return internal::fold_unmasked_generic< descr, Monoid >(
-			x, A, monoid
-		);
-	}
-
-	template<
-		Descriptor descr = descriptors::no_operation,
-		class Monoid,
-		typename InputType, typename IOType, typename MaskType,
-		typename RIT_A, typename CIT_A, typename NIT_A,
-		typename RIT_M, typename CIT_M, typename NIT_M
-	>
-	RC foldl(
-		IOType &x,
-		const Matrix< InputType, reference, RIT_A, CIT_A, NIT_A > &A,
-		const Matrix< MaskType, reference, RIT_M, CIT_M, NIT_M > &mask,
-		const Monoid &monoid,
-		const typename std::enable_if<
-			!grb::is_object< IOType >::value &&
-			!grb::is_object< InputType >::value &&
-			!grb::is_object< MaskType >::value &&
-			grb::is_monoid< Monoid >::value, void
-		>::type * const = nullptr
-	) {
-		// static checks
-		static_assert( !std::is_same< InputType, void >::value,
-			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
-			"the operator version of foldl cannot be used if the "
-			"input matrix is a pattern matrix (of type void)"
-		);
-		static_assert( !std::is_same< IOType, void >::value,
-			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
-			"the operator version of foldl cannot be used if the "
-			"result is of type void"
-		);
-		static_assert( (std::is_same< typename Monoid::D1, IOType >::value),
-			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
-			"called with a prefactor input type that does not match the first domain of the given operator"
-		);
-		static_assert( (std::is_same< typename Monoid::D2, InputType >::value),
-			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
-			"called with a postfactor input type that does not match the second domain of the given operator"
-		);
-		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
-			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
-			"called with an output type that does not match the output domain of the given operator"
-		);
-
-#ifdef _DEBUG
-		std::cout << "In grb::foldl( reference, mask, matrix, monoid )\n";
-#endif
-
-		return internal::fold_masked_generic< descr, Monoid >(
-			x, A, mask, monoid
-		);
-	}
-
-	template<
-		Descriptor descr = descriptors::no_operation,
-		class Monoid,
-		typename InputType, typename IOType,
-		typename RIT, typename CIT, typename NIT
-	>
-	RC foldl(
-		IOType &x,
-		const Matrix< InputType, reference, RIT, CIT, NIT > &A,
-		const Monoid &monoid,
-		const typename std::enable_if<
-			!grb::is_object< IOType >::value &&
-			!grb::is_object< InputType >::value &&
-			grb::is_monoid< Monoid >::value, void
-		>::type * const = nullptr
-	) {
-		// static checks
-		static_assert( !std::is_same< InputType, void >::value,
-			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
-			"the operator version of foldl cannot be used if the "
-			"input matrix is a pattern matrix (of type void)"
-		);
-		static_assert( !std::is_same< IOType, void >::value,
-			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
-			"the operator version of foldl cannot be used if the "
-			"result is of type void"
-		);
-		static_assert( (std::is_same< typename Monoid::D1, IOType >::value),
-			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
-			"called with a prefactor input type that does not match the first domain of the given operator"
-		);
-		static_assert( (std::is_same< typename Monoid::D2, InputType >::value),
-			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
-			"called with a postfactor input type that does not match the second domain of the given operator"
-		);
-		static_assert( (std::is_same< typename Monoid::D3, IOType >::value),
-			"grb::foldl( reference, IOType <- op( IOType, InputType ): "
-			"called with an output type that does not match the output domain of the given operator"
-		);
-
-#ifdef _DEBUG
-		std::cout << "In grb::foldl( reference, matrix, monoid )\n";
-#endif
-
-		return internal::fold_unmasked_generic< descr, Monoid >(
-			x, A, monoid
-		);
-	}
-
-
-	/**
-	 * Return the lower triangular portion of a matrix, strictly below 
-	 * the k-th diagonal.
-	 *
-	 * @param[out] L       The lower triangular portion of \a A, strictly
-	 * 					   below the k-th diagonal.
-	 * @param[in]  A       Any ALP/GraphBLAS matrix.
-	 * @param[in]  k       The diagonal above which to zero out \a A.
-	 * @param[in]  phase   The #grb::Phase in which the primitive is to proceed.
-	 *
-	 * \internal Pattern matrices are allowed
-	 *
-	 * \internal Dispatches to internal::trilu_generic
-	 */
-	template<
-		Descriptor descr = descriptors::no_operation,
-		typename InputType, typename OutputType,
-		typename RIT_L, typename CIT_L, typename NIT_L,
-		typename RIT_A, typename CIT_A, typename NIT_A
-	>
-	RC tril(
-		Matrix< OutputType, reference, RIT_L, CIT_L, NIT_L > & L,
-		const Matrix< InputType, reference, RIT_A, CIT_A, NIT_A > & A,
-		const long int k,
-		const Phase & phase = Phase::EXECUTE,
-		const typename std::enable_if< 
-			not grb::is_object< OutputType >::value && 
-			not grb::is_object< InputType >::value && 
-			std::is_convertible< InputType, OutputType >::value 
-		>::type * const = nullptr ) 
-	{
-#ifdef _DEBUG
-		std::cerr << "In grb::tril (reference)\n";
-#endif
-
-		// Static checks
-		NO_CAST_ASSERT( 
-			( not ( descr & descriptors::no_casting ) || 
-			std::is_same< InputType, OutputType >::value ), 
-			"grb::tril (reference)",
-			"input matrix and output matrix are incompatible for implicit casting"
-		);
-
-		return internal::trilu_generic< false, descr >( L, A, k, phase );
-	}
-
-	/**
-	 * Return the lower triangular portion of a matrix, strictly below main diagonal.
-	 *
-	 * This primitive is strictly equivalent to calling grb::tril( L, A, 0, phase ).
-	 * see grb::tril( L, A, k, phase ) for full description.
-	 */
-	template<
-		Descriptor descr = descriptors::no_operation,
-		typename InputType, typename OutputType,
-		typename RIT_L, typename CIT_L, typename NIT_L,
-		typename RIT_A, typename CIT_A, typename NIT_A
-	>
-	RC tril(
-		Matrix< OutputType, reference, RIT_L, CIT_L, NIT_L > & L,
-		const Matrix< InputType, reference, RIT_A, CIT_A, NIT_A > & A,
-		const Phase & phase = Phase::EXECUTE,
-		const typename std::enable_if< 
-			not grb::is_object< OutputType >::value && 
-			not grb::is_object< InputType >::value && 
-			std::is_convertible< InputType, OutputType >::value 
-		>::type * const = nullptr )
-	{
-		return tril< descr >( L, A, 0, phase );
-	}
-
-	/**
-	 * Return the upper triangular portion of a matrix, strictly above 
-	 * the k-th diagonal.
-	 *
-	 * @param[out] U       The upper triangular portion of \a A, strictly 
-	 * 					   above the k-th diagonal.
-	 * @param[in]  A       Any ALP/GraphBLAS matrix.
-	 * @param[in]  k       The diagonal above which to zero out \a A.
-	 * @param[in]  phase   The #grb::Phase in which the primitive is to proceed.
-	 *
-	 * \internal Pattern matrices are allowed
-	 *
-	 * \internal Dispatches to internal::trilu_generic
-	 */
-	template< 
-		Descriptor descr = descriptors::no_operation, 
-		typename InputType, typename OutputType, 
-		typename RIT_U, typename CIT_U, typename NIT_U, 
-		typename RIT_A, typename CIT_A, typename NIT_A
-	>
-	RC triu(
-		Matrix< OutputType, reference, RIT_U, CIT_U, NIT_U > & U,
-		const Matrix< InputType, reference, RIT_A, CIT_A, NIT_A > & A,
-		const long int k,
-		const Phase & phase = Phase::EXECUTE,
-		const typename std::enable_if< 
-			not grb::is_object< OutputType >::value && 
-			not grb::is_object< InputType >::value && 
-			std::is_convertible< InputType, OutputType >::value 
-		>::type * const = nullptr )
-	{
-#ifdef _DEBUG
-		std::cerr << "In grb::triu (reference)\n";
-#endif
-
-		// Static checks
-		NO_CAST_ASSERT( 
-			( not ( descr & descriptors::no_casting ) || 
-			std::is_same< InputType, OutputType >::value ), 
-			"grb::triu (reference)",
-			"input matrix and output matrix are incompatible for implicit casting"
-		);
-
-		// Add descriptors::transpose_matrix to descr
-		return internal::trilu_generic< true, descr >( U, A, k, phase );
-	}
-
-	/**
-	 * Return the upper triangular portion of a matrix, stricly above the main diagonal.
-	 *
-	 * This primitive is strictly equivalent to calling grb::triu( L, A, 0, phase ).
-	 * see grb::triu( L, A, k, phase ) for full description.
-	 */
-	template< 
-		Descriptor descr = descriptors::no_operation, 
-		typename InputType, typename OutputType, 
-		typename RIT_U, typename CIT_U, typename NIT_U, 
-		typename RIT_A, typename CIT_A, typename NIT_A
-	>
-	RC triu(
-		Matrix< OutputType, reference, RIT_U, CIT_U, NIT_U > & U,
-		const Matrix< InputType, reference, RIT_A, CIT_A, NIT_A > & A,
-		const Phase & phase = Phase::EXECUTE,
-		const typename std::enable_if< 
-			not grb::is_object< OutputType >::value && 
-			not grb::is_object< InputType >::value && 
-			std::is_convertible< InputType, OutputType >::value 
-		>::type * const = nullptr )
-	{
-		return triu< descr >( U, A, 0, phase );
-	}
-
 } // namespace grb
 
 #undef NO_CAST_ASSERT
@@ -2019,3 +1342,4 @@ namespace grb {
 #endif
 
 #endif // ``_H_GRB_REFERENCE_BLAS3''
+
diff --git a/include/graphblas/reference/vector.hpp b/include/graphblas/reference/vector.hpp
index fcd2516d8..f0db908b2 100644
--- a/include/graphblas/reference/vector.hpp
+++ b/include/graphblas/reference/vector.hpp
@@ -244,8 +244,6 @@ namespace grb {
 
 		friend class PinnedVector< D, BSP1D >;
 
-		friend class Vector< D, nonblocking, internal::Coordinates< nonblocking > >;
-
 		template< typename ValueType, Backend backend >
 		friend Vector<
 			ValueType, backend,
@@ -857,28 +855,6 @@ namespace grb {
 #endif
 			}
 
-			/**
-			 * Constructs a reference vector.
-			 *
-			 * @see Full description in base backend.
-			 */
-			Vector( const std::initializer_list< D > &vals )
-				: Vector( vals.size(), vals.size() )
-			{
-#ifdef _DEBUG
-				std::cerr << "In Vector< reference >::Vector( initializer_list )"
-					<< " constructor\n";
-#endif
-
-#ifdef _H_GRB_REFERENCE_OMP_VECTOR
-				#pragma omp parallel for simd
-#endif
-				for( size_t i = 0; i < vals.size(); ++i ) {
-					_raw[ i ] = *( vals.begin() + i );
-				}
-				_coordinates.assignAll();
-			}
-
 			/**
 			 * The default constructor creates an empty vector and should never be
 			 * used explicitly.
diff --git a/src/graphblas/hyperdags/hyperdags.cpp b/src/graphblas/hyperdags/hyperdags.cpp
index bf574515d..6000f3af7 100644
--- a/src/graphblas/hyperdags/hyperdags.cpp
+++ b/src/graphblas/hyperdags/hyperdags.cpp
@@ -380,24 +380,6 @@ std::string grb::internal::hyperdags::toString(
 		case GETID_MATRIX:
 			return "getID( matrix )";
 
-		case FOLDL_SCALAR_MATRIX_MASK_MONOID:
-			return "foldl( scalar, matrix, matrix, monoid )";
-
-		case FOLDL_SCALAR_MATRIX_MONOID:
-			return "foldl( scalar, matrix, monoid )";
-
-		case FOLDR_SCALAR_MATRIX_MASK_MONOID:
-			return "foldr( scalar, matrix, matrix, monoid )";
-
-		case FOLDR_SCALAR_MATRIX_MONOID:
-			return "foldr( scalar, matrix, monoid )";
-			
-		case TRIL_MATRIX:
-			return "tril( matrix, matrix )";
-
-		case TRIU_MATRIX:
-			return "triu( matrix, matrix )";
-
 	}
 	assert( false );
 	return "unknown operation";
diff --git a/src/transition/CMakeLists.txt b/src/transition/CMakeLists.txt
index 5c6abcc72..daf7252e0 100644
--- a/src/transition/CMakeLists.txt
+++ b/src/transition/CMakeLists.txt
@@ -18,79 +18,48 @@
 # This file creates the basic target(s) needed by all backends
 #
 
-assert_defined_variables( WITH_REFERENCE_BACKEND WITH_OMP_BACKEND WITH_NONBLOCKING_BACKEND )
+assert_defined_variables( WITH_REFERENCE_BACKEND WITH_OMP_BACKEND )
 
-function( add_transition_library target_name lib_type lib_name src1 )
-
-	set( multiValueArgs
-		"SOURCES"
-		"PUBLIC_LINK_LIBRARIES"
-		"PRIVATE_LINK_LIBRARIES"
+if( WITH_REFERENCE_BACKEND )
+	add_library( sparseblas_static STATIC
+		${CMAKE_CURRENT_SOURCE_DIR}/sparseblas.cpp
 	)
-	cmake_parse_arguments( parsed "" "" "${multiValueArgs}" "SOURCES;${src1};${ARGN}" )
 
-	add_library( ${target_name} ${lib_type} ${parsed_SOURCES} )
-	set_target_properties( ${target_name} PROPERTIES
-		OUTPUT_NAME ${lib_name}
+	set_target_properties( sparseblas_static PROPERTIES
+		OUTPUT_NAME "sparseblas"
 		ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/shmem"
 	)
 
-	target_link_libraries( ${target_name} PUBLIC transition_headers ${parsed_PUBLIC_LINK_LIBRARIES} )
-	target_link_libraries( ${target_name} PRIVATE backend_flags ${parsed_PRIVATE_LINK_LIBRARIES} )
-	add_dependencies( libs ${target_name} )
-	install( TARGETS ${target_name} EXPORT GraphBLASTargets
-		ARCHIVE DESTINATION "${SHMEM_BACKEND_INSTALL_DIR}"
-	)
+	target_link_libraries( sparseblas_static PUBLIC backend_reference transition )
 
-endfunction( add_transition_library )
+	target_link_libraries( sparseblas_static PRIVATE backend_flags )
 
-if( WITH_REFERENCE_BACKEND )
-	add_transition_library( sparseblas_sequential_static STATIC "sparseblas_sequential" ${CMAKE_CURRENT_SOURCE_DIR}/sparseblas.cpp
-		PUBLIC_LINK_LIBRARIES backend_reference
-	)
+	add_dependencies( libs sparseblas_static )
 
-	# this is the version for sequantial execution only
-	add_transition_library( ${_SPBLAS_PREFIX}sequential STATIC "${_SPBLAS_PREFIX}sequential"
-		${CMAKE_CURRENT_SOURCE_DIR}/spblas.cpp PUBLIC_LINK_LIBRARIES backend_reference
+	install( TARGETS sparseblas_static
+		EXPORT GraphBLASTargets
+		ARCHIVE DESTINATION "${SHMEM_BACKEND_INSTALL_DIR}"
 	)
-	target_compile_definitions( ${_SPBLAS_PREFIX}sequential PUBLIC SPBLAS_PREFIX=${_SPBLAS_PREFIX} )
-
-	if( ENABLE_SOLVER_LIB )
-		add_transition_library( spsolver_sequential STATIC "spsolver_sequential" ${CMAKE_CURRENT_SOURCE_DIR}/solver.cpp
-			PRIVATE_LINK_LIBRARIES backend_reference
-		)
-	endif()
-endif()
+endif( WITH_REFERENCE_BACKEND )
 
 if( WITH_OMP_BACKEND )
-	add_transition_library( sparseblas_shmem_parallel_static STATIC "sparseblas_shmem_parallel" ${CMAKE_CURRENT_SOURCE_DIR}/sparseblas.cpp
-		PRIVATE_LINK_LIBRARIES backend_reference_omp
+	add_library( sparseblas_omp_static STATIC
+		${CMAKE_CURRENT_SOURCE_DIR}/sparseblas.cpp
 	)
 
-	# this is the "default" version (parallel)
-	add_transition_library( ${_SPBLAS_PREFIX}shmem_parallel STATIC "${_SPBLAS_PREFIX}shmem_parallel"
-		${CMAKE_CURRENT_SOURCE_DIR}/spblas.cpp PRIVATE_LINK_LIBRARIES backend_reference_omp
+	set_target_properties( sparseblas_omp_static PROPERTIES
+		OUTPUT_NAME "sparseblas_omp"
+		ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/shmem"
 	)
-	target_compile_definitions( ${_SPBLAS_PREFIX}shmem_parallel PUBLIC SPBLAS_PREFIX=${_SPBLAS_PREFIX} )
 
-	if( ENABLE_EXTRA_SOLVER_LIBS )
-		add_transition_library( spsolver_shmem_blocking STATIC "spsolver_shmem_blocking" ${CMAKE_CURRENT_SOURCE_DIR}/solver.cpp
-			PRIVATE_LINK_LIBRARIES backend_reference_omp
-		)
-	endif()
-endif()
+	target_link_libraries( sparseblas_omp_static PUBLIC backend_reference_omp transition )
 
-if( WITH_NONBLOCKING_BACKEND )
-	if( ENABLE_SOLVER_LIB )
-		add_transition_library( spsolver_shmem_parallel STATIC "spsolver_shmem_parallel" ${CMAKE_CURRENT_SOURCE_DIR}/solver.cpp
-			PRIVATE_LINK_LIBRARIES backend_nonblocking
-		)
+	target_link_libraries( sparseblas_omp_static PRIVATE backend_flags )
 
-		# same binary name of KML library
-		# https://www.hikunpeng.com/document/detail/en/kunpengaccel/math-lib/devg-kml/kunpengaccel_kml_16_0011.html
-		add_transition_library( ksolver STATIC "ksolver" ${CMAKE_CURRENT_SOURCE_DIR}/kml_iss.cpp
-			PRIVATE_LINK_LIBRARIES spsolver_nonblocking_static
-		)
-	endif()
-endif()
+	add_dependencies( libs sparseblas_omp_static )
 
+	install( TARGETS sparseblas_omp_static
+		EXPORT GraphBLASTargets
+		ARCHIVE DESTINATION "${SHMEM_BACKEND_INSTALL_DIR}"
+	)
+endif()
diff --git a/src/transition/sparseblas.cpp b/src/transition/sparseblas.cpp
index 6d3471258..60fb1f055 100644
--- a/src/transition/sparseblas.cpp
+++ b/src/transition/sparseblas.cpp
@@ -15,17 +15,7 @@
  * limitations under the License.
  */
 
-/**
- * @file
- *
- * Implements the Sparse BLAS standard using ALP/GraphBLAS.
- *
- * @author A. N. Yzelman
- * @date 2023
- */
-
 #include "blas_sparse.h"
-#include "sparse_vector_impl.hpp"
 
 #include <limits>
 #include <vector>
@@ -252,6 +242,76 @@ namespace sparseblas {
 
 	};
 
+	/**
+	 * \internal A sparse vector that is either under construction, or finalized as
+	 *           an ALP/GraphBLAS vector.
+	 */
+	template< typename T >
+	class SparseVector {
+
+		public:
+
+			int n;
+			bool finalized;
+			grb::Vector< T > * vector;
+			typename grb::Vector< T >::const_iterator start, end;
+
+		private:
+
+			std::vector< T > uc_vals;
+			std::vector< int > uc_inds;
+
+		public:
+
+			SparseVector( const int &_n ) :
+				n( _n ), finalized( false ), vector( nullptr )
+			{}
+
+			~SparseVector() {
+				if( finalized ) {
+					assert( vector != nullptr );
+					delete vector;
+				} else {
+					assert( vector == nullptr );
+				}
+			}
+
+			void add( const T &val, const int &index ) {
+				assert( !finalized );
+				uc_vals.push_back( val );
+				uc_inds.push_back( index );
+			}
+
+			void finalize() {
+				assert( uc_vals.size() == uc_inds.size() );
+				const size_t nz = uc_vals.size();
+				vector = new grb::Vector< T >( n, nz );
+				if( vector == nullptr ) {
+					std::cerr << "Could not create ALP/GraphBLAS vector of size " << n
+						<< " and capacity " << nz << "\n";
+					throw std::runtime_error( "Could not create ALP/GraphBLAS vector" );
+				}
+				if( grb::capacity( *vector ) < nz ) {
+					throw std::runtime_error( "ALP/GraphBLAS vector has insufficient "
+						"capacity" );
+				}
+				const grb::RC rc = grb::buildVector(
+					*vector,
+					uc_inds.cbegin(), uc_inds.cend(),
+					uc_vals.cbegin(), uc_vals.cend(),
+					grb::SEQUENTIAL
+				);
+				if( rc != grb::SUCCESS ) {
+					throw std::runtime_error( "Could not ingest nonzeroes into ALP/GraphBLAS "
+						"vector" );
+				}
+				uc_vals.clear();
+				uc_inds.clear();
+				finalized = true;
+			}
+
+	};
+
 	/**
 	 * \internal SparseBLAS allows a matrix to be under construction or finalized.
 	 *           This class matches that concept -- for non-finalized matrices, it
@@ -331,20 +391,95 @@ namespace sparseblas {
 	 * \internal Utility function that converts a #extblas_sparse_vector to a
 	 *           sparseblas::SparseVector. This is for vectors of doubles.
 	 */
-	static native::SparseVector< double > * getDoubleVector(
-		EXTBLAS_TYPE( sparse_vector ) x
-	) {
-		return static_cast< native::SparseVector< double >* >( x );
+	SparseVector< double > * getDoubleVector( extblas_sparse_vector x ) {
+		return static_cast< SparseVector< double >* >( x );
 	}
 
 	/**
 	 * \internal Utility function that converts a #blas_sparse_matrix to a
 	 *           sparseblas::SparseMatrix. This is for matrices of doubles.
 	 */
-	static SparseMatrix< double > * getDoubleMatrix( blas_sparse_matrix A ) {
+	SparseMatrix< double > * getDoubleMatrix( blas_sparse_matrix A ) {
 		return static_cast< SparseMatrix< double >* >( A );
 	}
 
+	/**
+	 * \internal Internal buffer used for output matrix containers.
+	 */
+	char * buffer = nullptr;
+
+	/**
+	 * \internal The size of #buffer.
+	 */
+	size_t buffer_size = 0;
+
+	/**
+	 * @returns false if and only if buffer allocation failed.
+	 * @returns true on success.
+	 */
+	template< typename T >
+	bool getBuffer(
+		char * &bitmask, char * &stack, T * &valbuf,
+		const size_t size
+	) {
+		typedef typename grb::internal::Coordinates< grb::config::default_backend >
+			Coors;
+		constexpr const size_t b = grb::config::CACHE_LINE_SIZE::value();
+
+		// catch trivial case
+		if( size == 0 ) {
+			bitmask = stack = nullptr;
+			valbuf = nullptr;
+			return true;
+		}
+
+		// compute required size
+		size_t reqSize = Coors::arraySize( size ) + Coors::stackSize( size ) +
+			(size * sizeof(T)) + 3 * b;
+
+		// ensure buffer is at least the required size
+		if( buffer == nullptr ) {
+			assert( buffer_size == 0 );
+			buffer_size = reqSize;
+			buffer = static_cast< char * >( malloc( buffer_size ) );
+			if( buffer == nullptr ) {
+				buffer_size = 0;
+				return false;
+			}
+		} else if( buffer_size < reqSize ) {
+			free( buffer );
+			buffer_size = std::max( reqSize, 2 * buffer_size );
+			buffer = static_cast< char * >( malloc( buffer_size ) );
+			if( buffer == nullptr ) {
+				buffer_size = 0;
+				return false;
+			}
+		}
+
+		// set buffers and make sure they are aligned
+		char * walk = buffer;
+		uintptr_t cur_mod = reinterpret_cast< uintptr_t >(walk) % b;
+		if( cur_mod > 0 ) {
+			walk += (b - cur_mod);
+		}
+		bitmask = walk;
+		walk += Coors::arraySize( size );
+		cur_mod = reinterpret_cast< uintptr_t >(walk) % b;
+		if( cur_mod > 0 ) {
+			walk += (b - cur_mod);
+		}
+		stack = walk;
+		walk += Coors::stackSize( size );
+		cur_mod = reinterpret_cast< uintptr_t >(walk) % b;
+		if( cur_mod > 0 ) {
+			walk += (b - cur_mod);
+		}
+		valbuf = reinterpret_cast< T * >( walk);
+
+		// done
+		return true;
+	}
+
 } // end namespace sparseblas
 
 namespace std {
@@ -374,536 +509,823 @@ namespace std {
 
 // implementation of the SparseBLAS API follows
 
-EXTBLAS_TYPE( sparse_vector ) EXTBLAS_FUN( dusv_begin )( const int n ) {
-	return new native::SparseVector< double >( n );
-}
-
-int EXTBLAS_FUN( dusv_insert_entry )(
-	EXTBLAS_TYPE( sparse_vector ) x,
-	const double val,
-	const int index
-) {
-	auto vector = sparseblas::getDoubleVector( x );
-	assert( !(vector->finalized) );
-	try {
-		vector->add( val, index );
-	} catch( ... ) {
-		return 20;
-	}
-	return 0;
-}
-
-int EXTBLAS_FUN( dusv_end )( EXTBLAS_TYPE( sparse_vector ) x ) {
-	auto vector = sparseblas::getDoubleVector( x );
-	assert( !(vector->finalized) );
-	try {
-		vector->finalize();
-	} catch( ... ) {
-		return 30;
-	}
-	return 0;
-}
-
-int EXTBLAS_FUN( dusvds )( EXTBLAS_TYPE( sparse_vector ) x ) {
-	auto vector = sparseblas::getDoubleVector( x );
-	delete vector;
-	return 0;
-}
-
-int EXTBLAS_FUN( dusv_nz )(
-	const EXTBLAS_TYPE( sparse_vector ) x,
-	int * const nz
-) {
-	auto vector = sparseblas::getDoubleVector( x );
-	assert( vector->finalized );
-	const size_t nnz = grb::nnz( *(vector->vector) );
-	if( nnz > static_cast< size_t >( std::numeric_limits< int >::max() ) ) {
-		std::cerr << "Number of nonzeroes is larger than what can be represented by "
-			<< "a SparseBLAS int!\n";
-		return 10;
-	}
-	*nz = static_cast< int >(nnz);
-	return 0;
-}
-
-int EXTBLAS_FUN( dusv_clear )( EXTBLAS_TYPE( sparse_vector ) x ) {
-	auto vector = sparseblas::getDoubleVector( x );
-	assert( vector->finalized );
-	const grb::RC rc = grb::clear( *(vector->vector) );
-	if( rc != grb::SUCCESS ) {
-		return 10;
-	}
-	return 0;
-}
-
-int EXTBLAS_FUN( dusv_open )( const EXTBLAS_TYPE( sparse_vector ) x ) {
-	auto vector = sparseblas::getDoubleVector( x );
-	assert( vector->finalized );
-	try {
-		vector->start = vector->vector->cbegin();
-		vector->end = vector->vector->cend();
-	} catch( ... ) {
-		return 10;
-	}
-	return 0;
-}
-
-int EXTBLAS_FUN( dusv_get )(
-	const EXTBLAS_TYPE( sparse_vector ) x,
-	double * const val, int * const ind
-) {
-	auto vector = sparseblas::getDoubleVector( x );
-	assert( vector->finalized );
-	assert( vector->start != vector->end );
-	assert( val != nullptr );
-	assert( ind != nullptr );
-	*val = vector->start->second;
-	*ind = vector->start->first;
-	try {
-		(void) ++(vector->start);
-	} catch( ... ) {
-		return 2;
-	}
-	if( vector->start == vector->end ) {
+extern "C" {
+
+	extblas_sparse_vector EXTBLAS_dusv_begin( const int n ) {
+		return new sparseblas::SparseVector< double >( n );
+	}
+
+	int EXTBLAS_dusv_insert_entry(
+		extblas_sparse_vector x,
+		const double val,
+		const int index
+	) {
+		auto vector = sparseblas::getDoubleVector( x );
+		assert( !(vector->finalized) );
+		try {
+			vector->add( val, index );
+		} catch( ... ) {
+			return 20;
+		}
 		return 0;
-	} else {
-		return 1;
-	}
-}
-
-int EXTBLAS_FUN( dusv_close )( const EXTBLAS_TYPE( sparse_vector ) x ) {
-	auto vector = sparseblas::getDoubleVector( x );
-	assert( vector->finalized );
-	vector->start = vector->end;
-	return 0;
-}
-
-blas_sparse_matrix BLAS_duscr_begin( const int m, const int n ) {
-	return new sparseblas::SparseMatrix< double >( m, n );
-}
-
-int BLAS_duscr_insert_entry(
-	blas_sparse_matrix A,
-	const double val, const int row, const int col
-) {
-	auto matrix = sparseblas::getDoubleMatrix( A );
-	assert( matrix->finalized == false );
-	assert( matrix->ingest != nullptr );
-	try {
-		matrix->ingest->add( val, row, col );
-	} catch( ... ) {
-		return 2;
-	}
-	return 0;
-}
-
-int BLAS_duscr_insert_entries(
-	blas_sparse_matrix A,
-	const int nnz,
-	const double * vals, const int * rows, const int * cols
-) {
-	auto matrix = sparseblas::getDoubleMatrix( A );
-	assert( matrix->finalized == false );
-	assert( matrix->ingest != nullptr );
-	try {
-		for( int k = 0; k < nnz; ++k ) {
-			matrix->ingest->add( vals[ k ], rows[ k ], cols[ k ] );
-		}
-	} catch( ... ) {
-		return 3;
-	}
-	return 0;
-}
-
-int BLAS_duscr_insert_col(
-	blas_sparse_matrix A,
-	const int j, const int nnz,
-	const double * vals, const int * rows
-) {
-	auto matrix = sparseblas::getDoubleMatrix( A );
-	assert( matrix->finalized == false );
-	assert( matrix->ingest != nullptr );
-	try {
-		for( int k = 0; k < nnz; ++k ) {
-			matrix->ingest->add( vals[ k ], rows[ k ], j );
-		}
-	} catch( ... ) {
-		return 4;
-	}
-	return 0;
-}
-
-int BLAS_duscr_insert_row(
-	blas_sparse_matrix A,
-	const int i, const int nnz,
-	const double * vals, const int * cols
-) {
-	auto matrix = sparseblas::getDoubleMatrix( A );
-	assert( matrix->finalized == false );
-	assert( matrix->ingest != nullptr );
-	try {
-		for( int k = 0; k < nnz; ++k ) {
-			matrix->ingest->add( vals[ k ], i, cols[ k ] );
-		}
-	} catch( ... ) {
-		return 5;
-	}
-	return 0;
-}
-
-int BLAS_duscr_end( blas_sparse_matrix A ) {
-	auto matrix = sparseblas::getDoubleMatrix( A );
-	assert( matrix->finalized == false );
-	assert( matrix->ingest != nullptr );
-	try {
-		matrix->finalize();
-	} catch( const std::runtime_error &e ) {
-		std::cerr << "Caught error: " << e.what() << "\n";
-		return 1;
-	}
-	return 0;
-}
-
-int EXTBLAS_dusm_clear( blas_sparse_matrix A ) {
-	auto matrix = sparseblas::getDoubleMatrix( A );
-	assert( matrix->finalized );
-	const grb::RC rc = grb::clear( *(matrix->A) );
-	if( rc != grb::SUCCESS ) {
-		return 10;
-	}
-	return 0;
-}
-
-int BLAS_usds( blas_sparse_matrix A ) {
-	delete sparseblas::getDoubleMatrix( A );
-	return 0;
-}
-
-int BLAS_dusmv(
-	const enum blas_trans_type transa,
-	const double alpha, const blas_sparse_matrix A,
-	const double * x, int incx,
-	double * const y, const int incy
-) {
-	grb::Semiring<
-		grb::operators::add< double >, grb::operators::mul< double >,
-		grb::identities::zero, grb::identities::one
-	> ring;
-	auto matrix = sparseblas::getDoubleMatrix( A );
-	if( alpha != 1.0 ) {
-		grb::Vector< double > output = grb::internal::template
-			wrapRawVector< double >( matrix->m, y );
-		const grb::RC rc = grb::foldl< grb::descriptors::dense >(
-			output, 1.0 / alpha, ring.getMultiplicativeOperator() );
-		if( rc != grb::SUCCESS ) {
-			std::cerr << "Error during pre-scaling during SpMV\n";
-			return 50;
+	}
+
+	int EXTBLAS_dusv_end( extblas_sparse_vector x ) {
+		auto vector = sparseblas::getDoubleVector( x );
+		assert( !(vector->finalized) );
+		try {
+			vector->finalize();
+		} catch( ... ) {
+			return 30;
 		}
+		return 0;
 	}
-	if( incx != 1 || incy != 1 ) {
-		// TODO: requires ALP views
-		std::cerr << "Strided input and/or output vectors are not supported.\n";
-		return 255;
+
+	int EXTBLAS_dusvds( extblas_sparse_vector x ) {
+		auto vector = sparseblas::getDoubleVector( x );
+		delete vector;
+		return 0;
 	}
-	if( !(matrix->finalized) ) {
-		std::cerr << "Input matrix was not yet finalised; see BLAS_duscr_end.\n";
-		return 100;
+
+	int EXTBLAS_dusv_nz( const extblas_sparse_vector x, int * const nz ) {
+		auto vector = sparseblas::getDoubleVector( x );
+		assert( vector->finalized );
+		const size_t nnz = grb::nnz( *(vector->vector) );
+		if( nnz > static_cast< size_t >( std::numeric_limits< int >::max() ) ) {
+			std::cerr << "Number of nonzeroes is larger than what can be represented by "
+				<< "a SparseBLAS int!\n";
+			return 10;
+		}
+		*nz = static_cast< int >(nnz);
+		return 0;
 	}
-	assert( matrix->finalized );
-	if( transa == blas_no_trans ) {
-		const grb::Vector< double > input = grb::internal::template
-			wrapRawVector< double >( matrix->n, x );
-		grb::Vector< double > output = grb::internal::template
-			wrapRawVector< double >( matrix->m, y );
-		const grb::RC rc = grb::mxv< grb::descriptors::dense >(
-			output, *(matrix->A), input, ring
-		);
+
+	int EXTBLAS_dusv_clear( extblas_sparse_vector x ) {
+		auto vector = sparseblas::getDoubleVector( x );
+		assert( vector->finalized );
+		const grb::RC rc = grb::clear( *(vector->vector) );
 		if( rc != grb::SUCCESS ) {
-			std::cerr << "ALP/GraphBLAS returns error during SpMV: "
-				<< grb::toString( rc ) << ".\n";
-			return 200;
+			return 10;
 		}
-	} else {
-		const grb::Vector< double > input = grb::internal::template
-			wrapRawVector< double >( matrix->m, x );
-		grb::Vector< double > output = grb::internal::template
-			wrapRawVector< double >( matrix->n, y );
-		const grb::RC rc = grb::mxv<
-			grb::descriptors::dense |
-			grb::descriptors::transpose_matrix
-		>(
-			output, *(matrix->A), input, ring
-		);
+		return 0;
+	}
+
+	int EXTBLAS_dusv_open( const extblas_sparse_vector x ) {
+		auto vector = sparseblas::getDoubleVector( x );
+		assert( vector->finalized );
+		try {
+			vector->start = vector->vector->cbegin();
+			vector->end = vector->vector->cend();
+		} catch( ... ) {
+			return 10;
+		}
+		return 0;
+	}
+
+	int EXTBLAS_dusv_get(
+		const extblas_sparse_vector x,
+		double * const val, int * const ind
+	) {
+		auto vector = sparseblas::getDoubleVector( x );
+		assert( vector->finalized );
+		assert( vector->start != vector->end );
+		assert( val != nullptr );
+		assert( ind != nullptr );
+		*val = vector->start->second;
+		*ind = vector->start->first;
+		try {
+			(void) ++(vector->start);
+		} catch( ... ) {
+			return 2;
+		}
+		if( vector->start == vector->end ) {
+			return 0;
+		} else {
+			return 1;
+		}
+	}
+
+	int EXTBLAS_dusv_close( const extblas_sparse_vector x ) {
+		auto vector = sparseblas::getDoubleVector( x );
+		assert( vector->finalized );
+		vector->start = vector->end;
+		return 0;
+	}
+
+	blas_sparse_matrix BLAS_duscr_begin( const int m, const int n ) {
+		return new sparseblas::SparseMatrix< double >( m, n );
+	}
+
+	int BLAS_duscr_insert_entry(
+		blas_sparse_matrix A,
+		const double val, const int row, const int col
+	) {
+		auto matrix = sparseblas::getDoubleMatrix( A );
+		assert( matrix->finalized == false );
+		assert( matrix->ingest != nullptr );
+		try {
+			matrix->ingest->add( val, row, col );
+		} catch( ... ) {
+			return 2;
+		}
+		return 0;
+	}
+
+	int BLAS_duscr_insert_entries(
+		blas_sparse_matrix A,
+		const int nnz,
+		const double * vals, const int * rows, const int * cols
+	) {
+		auto matrix = sparseblas::getDoubleMatrix( A );
+		assert( matrix->finalized == false );
+		assert( matrix->ingest != nullptr );
+		try {
+			for( int k = 0; k < nnz; ++k ) {
+				matrix->ingest->add( vals[ k ], rows[ k ], cols[ k ] );
+			}
+		} catch( ... ) {
+			return 3;
+		}
+		return 0;
+	}
+
+	int BLAS_duscr_insert_col(
+		blas_sparse_matrix A,
+		const int j, const int nnz,
+		const double * vals, const int * rows
+	) {
+		auto matrix = sparseblas::getDoubleMatrix( A );
+		assert( matrix->finalized == false );
+		assert( matrix->ingest != nullptr );
+		try {
+			for( int k = 0; k < nnz; ++k ) {
+				matrix->ingest->add( vals[ k ], rows[ k ], j );
+			}
+		} catch( ... ) {
+			return 4;
+		}
+		return 0;
+	}
+
+	int BLAS_duscr_insert_row(
+		blas_sparse_matrix A,
+		const int i, const int nnz,
+		const double * vals, const int * cols
+	) {
+		auto matrix = sparseblas::getDoubleMatrix( A );
+		assert( matrix->finalized == false );
+		assert( matrix->ingest != nullptr );
+		try {
+			for( int k = 0; k < nnz; ++k ) {
+				matrix->ingest->add( vals[ k ], i, cols[ k ] );
+			}
+		} catch( ... ) {
+			return 5;
+		}
+		return 0;
+	}
+
+	int BLAS_duscr_end( blas_sparse_matrix A ) {
+		auto matrix = sparseblas::getDoubleMatrix( A );
+		assert( matrix->finalized == false );
+		assert( matrix->ingest != nullptr );
+		try {
+			matrix->finalize();
+		} catch( const std::runtime_error &e ) {
+			std::cerr << "Caught error: " << e.what() << "\n";
+			return 1;
+		}
+		return 0;
+	}
+
+	int EXTBLAS_dusm_clear( blas_sparse_matrix A ) {
+		auto matrix = sparseblas::getDoubleMatrix( A );
+		assert( matrix->finalized );
+		const grb::RC rc = grb::clear( *(matrix->A) );
 		if( rc != grb::SUCCESS ) {
-			std::cerr << "ALP/GraphBLAS returns error during transposed SpMV: "
-				<< grb::toString( rc ) << ".\n";
-			return 200;
+			return 10;
 		}
+		return 0;
 	}
-	if( alpha != 1.0 ) {
-		grb::Vector< double > output = grb::internal::template
-			wrapRawVector< double >( matrix->m, y );
-		const grb::RC rc = grb::foldl< grb::descriptors::dense >(
-			output, alpha, ring.getMultiplicativeOperator() );
+
+	int BLAS_usds( blas_sparse_matrix A ) {
+		delete sparseblas::getDoubleMatrix( A );
+		return 0;
+	}
+
+	int BLAS_dusmv(
+		const enum blas_trans_type transa,
+		const double alpha, const blas_sparse_matrix A,
+		const double * x, int incx,
+		double * const y, const int incy
+	) {
+		grb::Semiring<
+			grb::operators::add< double >, grb::operators::mul< double >,
+			grb::identities::zero, grb::identities::one
+		> ring;
+		auto matrix = sparseblas::getDoubleMatrix( A );
+		if( alpha != 1.0 ) {
+			grb::Vector< double > output = grb::internal::template
+				wrapRawVector< double >( matrix->m, y );
+			const grb::RC rc = grb::foldl< grb::descriptors::dense >(
+				output, 1.0 / alpha, ring.getMultiplicativeOperator() );
+			if( rc != grb::SUCCESS ) {
+				std::cerr << "Error during pre-scaling during SpMV\n";
+				return 50;
+			}
+		}
+		if( incx != 1 || incy != 1 ) {
+			// TODO: requires ALP views
+			std::cerr << "Strided input and/or output vectors are not supported.\n";
+			return 255;
+		}
+		if( !(matrix->finalized) ) {
+			std::cerr << "Input matrix was not yet finalised; see BLAS_duscr_end.\n";
+			return 100;
+		}
+		assert( matrix->finalized );
+		if( transa == blas_no_trans ) {
+			const grb::Vector< double > input = grb::internal::template
+				wrapRawVector< double >( matrix->n, x );
+			grb::Vector< double > output = grb::internal::template
+				wrapRawVector< double >( matrix->m, y );
+			const grb::RC rc = grb::mxv< grb::descriptors::dense >(
+				output, *(matrix->A), input, ring
+			);
+			if( rc != grb::SUCCESS ) {
+				std::cerr << "ALP/GraphBLAS returns error during SpMV: "
+					<< grb::toString( rc ) << ".\n";
+				return 200;
+			}
+		} else {
+			const grb::Vector< double > input = grb::internal::template
+				wrapRawVector< double >( matrix->m, x );
+			grb::Vector< double > output = grb::internal::template
+				wrapRawVector< double >( matrix->n, y );
+			const grb::RC rc = grb::mxv<
+				grb::descriptors::dense |
+				grb::descriptors::transpose_matrix
+			>(
+				output, *(matrix->A), input, ring
+			);
+			if( rc != grb::SUCCESS ) {
+				std::cerr << "ALP/GraphBLAS returns error during transposed SpMV: "
+					<< grb::toString( rc ) << ".\n";
+				return 200;
+			}
+		}
+		if( alpha != 1.0 ) {
+			grb::Vector< double > output = grb::internal::template
+				wrapRawVector< double >( matrix->m, y );
+			const grb::RC rc = grb::foldl< grb::descriptors::dense >(
+				output, alpha, ring.getMultiplicativeOperator() );
+			if( rc != grb::SUCCESS ) {
+				std::cerr << "Error during post-scaling during SpMV\n";
+				return 250;
+			}
+		}
+		return 0;
+	}
+
+	void spblas_dcsrgemv(
+		const char * transa,
+		const int * m_p,
+		const double * a, const int * ia, const int * ja,
+		const double * x,
+		double * y
+	) {
+		// declare algebraic structures
+		grb::Semiring<
+			grb::operators::add< double >, grb::operators::mul< double >,
+			grb::identities::zero, grb::identities::one
+		> ring;
+		grb::Monoid<
+			grb::operators::max< int >, grb::identities::negative_infinity
+		> maxMonoid;
+
+		// declare minimum necessary descriptors
+		constexpr grb::Descriptor minDescr = grb::descriptors::dense |
+			grb::descriptors::force_row_major;
+
+		// determine matrix size
+		const int m = *m_p;
+		const grb::Vector< int > columnIndices =
+			grb::internal::template wrapRawVector< int >( ia[ m ], ja );
+		int n = 0;
+		grb::RC rc = foldl( n, columnIndices, maxMonoid );
 		if( rc != grb::SUCCESS ) {
-			std::cerr << "Error during post-scaling during SpMV\n";
-			return 250;
-		}
-	}
-	return 0;
-}
-
-int BLAS_dusmm(
-	const enum blas_order_type order,
-	const enum blas_trans_type transa,
-	const int nrhs,
-	const double alpha, const blas_sparse_matrix A,
-	const double * B, const int ldb,
-	const double * C, const int ldc
-) {
-	(void) order;
-	(void) transa;
-	(void) nrhs;
-	(void) alpha;
-	(void) A;
-	(void) B;
-	(void) ldb;
-	(void) C;
-	(void) ldc;
-	// TODO requires dense ALP and mixed sparse/dense ALP operations
-	std::cerr << "BLAS_dusmm (sparse matrix times dense matrix) has not yet "
-		<< "been implemented.\n";
-	assert( false );
-	return 255;
-}
-
-int EXTBLAS_dusmsv(
-	const enum blas_trans_type transa,
-	const double alpha, const blas_sparse_matrix A,
-	const EXTBLAS_TYPE( sparse_vector ) x,
-	EXTBLAS_TYPE( sparse_vector ) y
-) {
-	grb::Semiring<
-		grb::operators::add< double >, grb::operators::mul< double >,
-		grb::identities::zero, grb::identities::one
-	> ring;
-	auto matrix = sparseblas::getDoubleMatrix( A );
-	auto input  = sparseblas::getDoubleVector( x );
-	auto output = sparseblas::getDoubleVector( y );
-	if( !(matrix->finalized) ) {
-		std::cerr << "Uninitialised input matrix during SpMSpV\n";
-		return 10;
-	}
-	if( !(input->finalized) ) {
-		std::cerr << "Uninitialised input vector during SpMSpV\n";
-		return 20;
-	}
-	if( !(output->finalized) ) {
-		std::cerr << "Uninitialised output vector during SpMSpV\n";
-		return 30;
-	}
-	grb::RC rc = grb::SUCCESS;
-	if( alpha != 1.0 ) {
-		rc = grb::foldl( *(output->vector), 1.0 / alpha,
-			ring.getMultiplicativeOperator() );
+			std::cerr << "Could not determine matrix column size\n";
+			assert( false );
+			return;
+		}
+
+		// retrieve buffers (only when A needs to be output also)
+		//char * const bitmask = sparseblas::getBitmask( n );
+		//char * const stack = sparseblas::getStack( n );
+		//double * const buffer = sparseblas::template getBuffer< double >( n );
+
+		// retrieve necessary ALP/GraphBLAS container wrappers
+		const grb::Matrix< double, grb::config::default_backend, int, int, int > A =
+			grb::internal::wrapCRSMatrix( a, ja, ia, m, n );
+		const grb::Vector< double > input = grb::internal::template
+			wrapRawVector< double >( n, x );
+		grb::Vector< double > output = grb::internal::template
+			wrapRawVector< double >( m, y );
+
+		// set output vector to zero
+		rc = grb::set( output, ring.template getZero< double >() );
 		if( rc != grb::SUCCESS ) {
-			std::cerr << "Error during pre-scaling of SpMSpV\n";
-			return 40;
+			std::cerr << "Could not set output vector to zero\n";
+			assert( false );
+			return;
 		}
+
+		// do either y=Ax or y=A^Tx
+		if( transa[0] == 'N' ) {
+			rc = grb::mxv< minDescr >(
+				output, A, input, ring
+			);
+			if( rc != grb::SUCCESS ) {
+				std::cerr << "ALP/GraphBLAS returns error during SpMV: "
+					<< grb::toString( rc ) << ".\n";
+				assert( false );
+				return;
+			}
+		} else {
+			// Hermitian is not supported
+			assert( transa[0] == 'T' );
+			rc = grb::mxv<
+				minDescr |
+				grb::descriptors::transpose_matrix
+			>(
+				output, A, input, ring
+			);
+			if( rc != grb::SUCCESS ) {
+				std::cerr << "ALP/GraphBLAS returns error during transposed SpMV: "
+					<< grb::toString( rc ) << ".\n";
+				assert( false );
+				return;
+			}
+		}
+
+		// done
 	}
-	if( transa == blas_no_trans ) {
-		rc = grb::mxv( *(output->vector), *(matrix->A), *(input->vector), ring );
-	} else {
-		rc = grb::mxv< grb::descriptors::transpose_matrix >(
-			*(output->vector), *(matrix->A), *(input->vector), ring );
+
+	int BLAS_dusmm(
+		const enum blas_order_type order,
+		const enum blas_trans_type transa,
+		const int nrhs,
+		const double alpha, const blas_sparse_matrix A,
+		const double * B, const int ldb,
+		const double * C, const int ldc
+	) {
+		(void) order;
+		(void) transa;
+		(void) nrhs;
+		(void) alpha;
+		(void) A;
+		(void) B;
+		(void) ldb;
+		(void) C;
+		(void) ldc;
+		// TODO requires dense ALP and mixed sparse/dense ALP operations
+		std::cerr << "BLAS_dusmm (sparse matrix times dense matrix) has not yet "
+			<< "been implemented.\n";
+		assert( false );
+		return 255;
 	}
-	if( rc != grb::SUCCESS ) {
-		std::cerr << "Error during call to grb::mxv (SpMSpV)\n";
-		return 50;
+
+	void spblas_dcsrmm(
+		const char * const transa,
+		const int * m, const int * n, const int * k,
+		const double * alpha,
+		const char * matdescra, const double * val, const int * indx,
+		const int * pntrb, const int * pntre,
+		const double * b, const int * ldb,
+		const double * beta,
+		double * c, const int * ldc
+	) {
+		assert( transa[0] == 'N' || transa[0] == 'T' );
+		assert( m != NULL );
+		assert( n != NULL );
+		assert( k != NULL );
+		assert( alpha != NULL );
+		// not sure yet what constraints if any on matdescra
+		if( *m > 0 && *k > 0 ) {
+			assert( pntrb != NULL );
+			assert( pntre != NULL );
+		}
+		// val and indx could potentially be NULL if there are no nonzeroes
+		assert( b != NULL );
+		assert( ldb != NULL );
+		assert( beta != NULL );
+		assert( c != NULL );
+		assert( ldc != NULL );
+		(void) transa;
+		(void) m; (void) n; (void) k;
+		(void) alpha;
+		(void) matdescra; (void) val; (void) indx; (void) pntrb; (void) pntre;
+		(void) b; (void) ldb;
+		(void) beta;
+		(void) c; (void) ldc;
+		// requires dense ALP and mixed sparse/dense operations
+		assert( false );
 	}
-	if( alpha != 1.0 ) {
-		rc = grb::foldl( *(output->vector), alpha,
-			ring.getMultiplicativeOperator() );
+
+	int EXTBLAS_dusmsv(
+		const enum blas_trans_type transa,
+		const double alpha, const blas_sparse_matrix A,
+		const extblas_sparse_vector x,
+		extblas_sparse_vector y
+	) {
+		grb::Semiring<
+			grb::operators::add< double >, grb::operators::mul< double >,
+			grb::identities::zero, grb::identities::one
+		> ring;
+		auto matrix = sparseblas::getDoubleMatrix( A );
+		auto input  = sparseblas::getDoubleVector( x );
+		auto output = sparseblas::getDoubleVector( y );
+		if( !(matrix->finalized) ) {
+			std::cerr << "Uninitialised input matrix during SpMSpV\n";
+			return 10;
+		}
+		if( !(input->finalized) ) {
+			std::cerr << "Uninitialised input vector during SpMSpV\n";
+			return 20;
+		}
+		if( !(output->finalized) ) {
+			std::cerr << "Uninitialised output vector during SpMSpV\n";
+			return 30;
+		}
+		grb::RC rc = grb::SUCCESS;
+		if( alpha != 1.0 ) {
+			rc = grb::foldl( *(output->vector), 1.0 / alpha,
+				ring.getMultiplicativeOperator() );
+			if( rc != grb::SUCCESS ) {
+				std::cerr << "Error during pre-scaling of SpMSpV\n";
+				return 40;
+			}
+		}
+		if( transa == blas_no_trans ) {
+			rc = grb::mxv( *(output->vector), *(matrix->A), *(input->vector), ring );
+		} else {
+			rc = grb::mxv< grb::descriptors::transpose_matrix >(
+				*(output->vector), *(matrix->A), *(input->vector), ring );
+		}
 		if( rc != grb::SUCCESS ) {
-			std::cerr << "Error during post-scaling of SpMSpV\n";
-			return 60;
+			std::cerr << "Error during call to grb::mxv (SpMSpV)\n";
+			return 50;
 		}
+		if( alpha != 1.0 ) {
+			rc = grb::foldl( *(output->vector), alpha,
+				ring.getMultiplicativeOperator() );
+			if( rc != grb::SUCCESS ) {
+				std::cerr << "Error during post-scaling of SpMSpV\n";
+				return 60;
+			}
+		}
+		return 0;
 	}
-	return 0;
-}
-
-int EXTBLAS_dusmsm(
-	const enum blas_trans_type transa,
-	const double alpha, const blas_sparse_matrix A,
-	const enum blas_trans_type transb, const blas_sparse_matrix B,
-	blas_sparse_matrix C
-) {
-	grb::Semiring<
-		grb::operators::add< double >, grb::operators::mul< double >,
-		grb::identities::zero, grb::identities::one
-	> ring;
-	auto matA = sparseblas::getDoubleMatrix( A );
-	auto matB = sparseblas::getDoubleMatrix( B );
-	auto matC = sparseblas::getDoubleMatrix( C );
-	if( !(matA->finalized) ) {
-		std::cerr << "Uninitialised left-hand input matrix during SpMSpM\n";
-		return 10;
-	}
-	if( !(matB->finalized) ) {
-		std::cerr << "Uninitialised right-hand input matrix during SpMSpM\n";
-		return 20;
-	}
-	if( !(matC->finalized) ) {
-		std::cerr << "Uninitialised output matrix during SpMSpM\n";
-		return 30;
-	}
-
-	grb::RC rc = grb::SUCCESS;
-	if( alpha != 1.0 ) {
-		/*const grb::RC rc = grb::foldl( *(matC->A), 1.0 / alpha,
-			ring.getMultiplicativeOperator() );
+
+	void extspblas_dcsrmultsv(
+		const char * trans, const int * request,
+		const int * m, const int * n,
+		const double * a, const int * ja, const int * ia,
+		const extblas_sparse_vector x,
+		extblas_sparse_vector y
+	) {
+		grb::Semiring<
+			grb::operators::add< double >, grb::operators::mul< double >,
+			grb::identities::zero, grb::identities::one
+		> ring;
+		const grb::Matrix< double, grb::config::default_backend, int, int, int > A =
+			grb::internal::wrapCRSMatrix( a, ja, ia, *m, *n );
+		auto input  = sparseblas::getDoubleVector( x );
+		auto output = sparseblas::getDoubleVector( y );
+		if( !(input->finalized) ) {
+			throw std::runtime_error( "Uninitialised input vector during SpMSpV\n" );
+		}
+		if( !(output->finalized) ) {
+			throw std::runtime_error( "Uninitialised output vector during SpMSpV\n" );
+		}
+		if( request[ 0 ] != 0 && request[ 1 ] != 1 ) {
+			throw std::runtime_error( "Illegal request during call to dcsrmultsv\n" );
+		}
+		grb::Phase phase = grb::EXECUTE;
+		if( request[ 0 ] == 1 ) {
+			phase = grb::RESIZE;
+		}
+		grb::RC rc;
+		if( trans[0] == 'N' ) {
+			rc = grb::mxv< grb::descriptors::force_row_major >( *(output->vector), A,
+				*(input->vector), ring, phase );
+		} else {
+			if( trans[1] != 'T' ) {
+				throw std::runtime_error( "Illegal trans argument to dcsrmultsv\n" );
+			}
+			rc = grb::mxv<
+				grb::descriptors::force_row_major |
+				grb::descriptors::transpose_matrix
+			>( *(output->vector), A, *(input->vector), ring, phase );
+		}
 		if( rc != grb::SUCCESS ) {
-			std::cerr << "Error during pre-scaling for SpMSpM\n";
-			return 40;
+			throw std::runtime_error( "ALP/GraphBLAS returns error during call to "
+				"SpMSpV: " + grb::toString( rc ) );
+		}
+	}
+
+	int EXTBLAS_dusmsm(
+		const enum blas_trans_type transa,
+		const double alpha, const blas_sparse_matrix A,
+		const enum blas_trans_type transb, const blas_sparse_matrix B,
+		blas_sparse_matrix C
+	) {
+		grb::Semiring<
+			grb::operators::add< double >, grb::operators::mul< double >,
+			grb::identities::zero, grb::identities::one
+		> ring;
+		auto matA = sparseblas::getDoubleMatrix( A );
+		auto matB = sparseblas::getDoubleMatrix( B );
+		auto matC = sparseblas::getDoubleMatrix( C );
+		if( !(matA->finalized) ) {
+			std::cerr << "Uninitialised left-hand input matrix during SpMSpM\n";
+			return 10;
+		}
+		if( !(matB->finalized) ) {
+			std::cerr << "Uninitialised right-hand input matrix during SpMSpM\n";
+			return 20;
+		}
+		if( !(matC->finalized) ) {
+			std::cerr << "Uninitialised output matrix during SpMSpM\n";
+			return 30;
+		}
+
+		grb::RC rc = grb::SUCCESS;
+		if( alpha != 1.0 ) {
+			/*const grb::RC rc = grb::foldl( *(matC->A), 1.0 / alpha,
+				ring.getMultiplicativeOperator() );
+			if( rc != grb::SUCCESS ) {
+				std::cerr << "Error during pre-scaling for SpMSpM\n";
+				return 40;
+			}*/
+			// TODO requires level-3 fold in ALP/GraphBLAS
+			std::cerr << "Any other alpha from 1.0 is currently not supported for "
+				<< "SpMSpM multiplication\n";
+			return 255;
+		}
+
+		// resize phase
+		if( transa == blas_no_trans && transb == blas_no_trans ) {
+			rc = grb::mxm( *(matC->A), *(matA->A), *(matB->A), ring, grb::RESIZE );
+		} else if( transa != blas_no_trans && transb == blas_no_trans ) {
+			rc = grb::mxm< grb::descriptors::transpose_left >(
+				*(matC->A), *(matA->A), *(matB->A), ring, grb::RESIZE );
+		} else if( transa == blas_no_trans && transb != blas_no_trans ) {
+			rc = grb::mxm< grb::descriptors::transpose_right >(
+				*(matC->A), *(matA->A), *(matB->A), ring, grb::RESIZE );
+		} else {
+			assert( transa != blas_no_trans );
+			assert( transb != blas_no_trans );
+			rc = grb::mxm<
+				grb::descriptors::transpose_left |
+				grb::descriptors::transpose_right
+			>( *(matC->A), *(matA->A), *(matB->A), ring, grb::RESIZE );
+		}
+		if( rc != grb::SUCCESS ) {
+			std::cerr << "Error during call to ALP/GraphBLAS mxm (RESIZE phase): "
+				<< grb::toString( rc ) << "\n";
+			return 50;
+		}
+
+		// execute phase
+		if( transa == blas_no_trans && transb == blas_no_trans ) {
+			rc = grb::mxm( *(matC->A), *(matA->A), *(matB->A), ring );
+		} else if( transa != blas_no_trans && transb == blas_no_trans ) {
+			rc = grb::mxm< grb::descriptors::transpose_left >(
+				*(matC->A), *(matA->A), *(matB->A), ring );
+		} else if( transa == blas_no_trans && transb != blas_no_trans ) {
+			rc = grb::mxm< grb::descriptors::transpose_right >(
+				*(matC->A), *(matA->A), *(matB->A), ring );
+		} else {
+			assert( transa != blas_no_trans );
+			assert( transb != blas_no_trans );
+			rc = grb::mxm<
+				grb::descriptors::transpose_left |
+				grb::descriptors::transpose_right
+			>( *(matC->A), *(matA->A), *(matB->A), ring );
+		}
+		if( rc != grb::SUCCESS ) {
+			std::cerr << "Error during call to ALP/GraphBLAS mxm (EXECUTE phase): \n"
+				<< grb::toString( rc ) << "\n";
+			return 60;
+		}
+
+		/*TODO see above
+		if( alpha != 1.0 ) {
+			rc = grb::foldl( *(matC->A), 1.0 / alpha,
+				ring.getMultiplicativeOperator() );
+			if( rc != grb::SUCCESS ) {
+				std::cerr << "Error during post-scaling for SpMSpM\n";
+				return 70;
+			}
 		}*/
-		// TODO requires level-3 fold in ALP/GraphBLAS
-		std::cerr << "Any other alpha from 1.0 is currently not supported for "
-			<< "SpMSpM multiplication\n";
-		return 255;
+		return 0;
 	}
 
-	// resize phase
-	if( transa == blas_no_trans && transb == blas_no_trans ) {
-		rc = grb::mxm( *(matC->A), *(matA->A), *(matB->A), ring, grb::RESIZE );
-	} else if( transa != blas_no_trans && transb == blas_no_trans ) {
-		rc = grb::mxm< grb::descriptors::transpose_left >(
-			*(matC->A), *(matA->A), *(matB->A), ring, grb::RESIZE );
-	} else if( transa == blas_no_trans && transb != blas_no_trans ) {
-		rc = grb::mxm< grb::descriptors::transpose_right >(
-			*(matC->A), *(matA->A), *(matB->A), ring, grb::RESIZE );
-	} else {
-		assert( transa != blas_no_trans );
-		assert( transb != blas_no_trans );
-		rc = grb::mxm<
-			grb::descriptors::transpose_left |
-			grb::descriptors::transpose_right
-		>( *(matC->A), *(matA->A), *(matB->A), ring, grb::RESIZE );
-	}
-	if( rc != grb::SUCCESS ) {
-		std::cerr << "Error during call to ALP/GraphBLAS mxm (RESIZE phase): "
-			<< grb::toString( rc ) << "\n";
-		return 50;
-	}
-
-	// execute phase
-	if( transa == blas_no_trans && transb == blas_no_trans ) {
-		rc = grb::mxm( *(matC->A), *(matA->A), *(matB->A), ring );
-	} else if( transa != blas_no_trans && transb == blas_no_trans ) {
-		rc = grb::mxm< grb::descriptors::transpose_left >(
-			*(matC->A), *(matA->A), *(matB->A), ring );
-	} else if( transa == blas_no_trans && transb != blas_no_trans ) {
-		rc = grb::mxm< grb::descriptors::transpose_right >(
-			*(matC->A), *(matA->A), *(matB->A), ring );
-	} else {
-		assert( transa != blas_no_trans );
-		assert( transb != blas_no_trans );
-		rc = grb::mxm<
-			grb::descriptors::transpose_left |
-			grb::descriptors::transpose_right
-		>( *(matC->A), *(matA->A), *(matB->A), ring );
-	}
-	if( rc != grb::SUCCESS ) {
-		std::cerr << "Error during call to ALP/GraphBLAS mxm (EXECUTE phase): \n"
-			<< grb::toString( rc ) << "\n";
-		return 60;
-	}
-
-	/*TODO see above
-	if( alpha != 1.0 ) {
-		rc = grb::foldl( *(matC->A), 1.0 / alpha,
-			ring.getMultiplicativeOperator() );
+	void spblas_dcsrmultcsr(
+		const char * trans, const int * request, const int * sort,
+		const int * m_p, const int * n_p, const int * k_p,
+		double * a, int * ja, int * ia,
+		double * b, int * jb, int * ib,
+		double * c, int * jc, int * ic,
+		const int * nzmax, int * info
+	) {
+		assert( trans[0] == 'N' );
+		assert( sort != NULL && sort[0] == 7 );
+		assert( m_p != NULL );
+		assert( n_p != NULL );
+		assert( k_p != NULL );
+		assert( a != NULL ); assert( ja != NULL ); assert( ia != NULL );
+		assert( b != NULL ); assert( jb != NULL ); assert( ib != NULL );
+		assert( c != NULL ); assert( jc != NULL ); assert( ic != NULL );
+		assert( nzmax != NULL );
+		assert( info != NULL );
+
+		// declare algebraic structures
+		grb::Semiring<
+			grb::operators::add< double >, grb::operators::mul< double >,
+			grb::identities::zero, grb::identities::one
+		> ring;
+
+		// check support
+		if( trans[ 0 ] != 'N' ) {
+			std::cerr << "ALP/SparseBLAS, error: illegal trans argument to dcsrmultcsr\n";
+			*info = 4;
+		}
+		if( sort[ 0 ] != 7 ) {
+			std::cerr << "ALP/SparseBLAS, error: illegal sort argument to dcsrmultcsr\n";
+			*info = 5;
+			return;
+		}
+
+		// declare minimum necessary descriptors
+		constexpr const grb::Descriptor minDescr = grb::descriptors::dense |
+			grb::descriptors::force_row_major;
+
+		// determine matrix size
+		const int m = *m_p;
+		const int n = *n_p;
+		const int k = *k_p;
+
+		// retrieve buffers (only when A needs to be output also)
+		char * bitmask = nullptr;
+		char * stack = nullptr;
+		double * valbuf = nullptr;
+		if( sparseblas::template getBuffer< double >(
+				bitmask, stack, valbuf, n
+			) == false
+		) {
+			std::cerr << "ALP/SparseBLAS, error: could not allocate buffer for "
+				<< "computations on an output matrix\n";
+			*info = 10;
+			return;
+		}
+
+		// retrieve necessary ALP/GraphBLAS container wrappers
+		const grb::Matrix< double, grb::config::default_backend, int, int, int > A =
+			grb::internal::wrapCRSMatrix( a, ja, ia, m, k );
+		const grb::Matrix< double, grb::config::default_backend, int, int, int > B =
+			grb::internal::wrapCRSMatrix( b, jb, ib, k, n );
+		grb::Matrix< double, grb::config::default_backend, int, int, int > C =
+			grb::internal::wrapCRSMatrix(
+				c, jc, ic,
+				m, n, *nzmax,
+				bitmask, stack, valbuf
+			);
+
+		// set output vector to zero
+		grb::RC rc = grb::clear( C );
 		if( rc != grb::SUCCESS ) {
-			std::cerr << "Error during post-scaling for SpMSpM\n";
-			return 70;
-		}
-	}*/
-	return 0;
-}
-
-int EXTBLAS_dusm_nz( const blas_sparse_matrix A, int * nz ) {
-	auto matA = sparseblas::getDoubleMatrix( A );
-	if( !(matA->finalized) ) {
-		std::cerr << "Uninitialised left-hand input matrix during dusm_nz\n";
-		return 10;
-	}
-	const size_t grb_nz = grb::nnz( *(matA->A) );
-	if( grb_nz > static_cast< size_t >(std::numeric_limits< int >::max()) ) {
-		std::cerr << "Number of nonzeroes in given sparse matrix is larger than "
-			<< "what can be represented by a SparseBLAS int\n";
-		return 20;
-	}
-	*nz = static_cast< int >( grb_nz );
-	return 0;
-}
-
-int EXTBLAS_dusm_open( const blas_sparse_matrix A ) {
-	auto matA = sparseblas::getDoubleMatrix( A );
-	if( !(matA->finalized) ) {
-		std::cerr << "Uninitialised left-hand input matrix during dusm_nz\n";
-		return 10;
-	}
-	try{
-		matA->start = matA->A->cbegin();
-		matA->end = matA->A->cend();
-	} catch( ... ) {
-		std::cerr << "Could not retrieve matrix iterators\n";
-		return 20;
-	}
-	return 0;
-}
-
-int EXTBLAS_dusm_get(
-	const blas_sparse_matrix A,
-	double * value, int * row, int * col
-) {
-	auto matA = sparseblas::getDoubleMatrix( A );
-	if( !(matA->finalized) ) {
-		std::cerr << "Uninitialised left-hand input matrix during dusm_nz\n";
-		return 10;
-	}
-	assert( matA->start != matA->end );
-	const auto &triplet = *(matA->start);
-	*value = triplet.second;
-	*row = triplet.first.first;
-	*col = triplet.first.second;
-	try {
-		(void) ++(matA->start);
-	} catch( ... ) {
-		return 2;
-	}
-	if( matA->start == matA->end ) {
+			std::cerr << "ALP/SparseBLAS, error: Could not clear output matrix\n";
+			assert( false );
+			*info = 20;
+			return;
+		}
+
+		// do either C=AB or C=A^TB
+		if( trans[0] == 'N' ) {
+			if( *request == 1 ) {
+				rc = grb::mxm< minDescr >( C, A, B, ring, grb::RESIZE );
+			} else {
+				assert( *request == 0 || *request == 2 );
+				rc = grb::mxm< minDescr >( C, A, B, ring );
+			}
+			if( rc != grb::SUCCESS ) {
+				std::cerr << "ALP/SparseBLAS, error during call to SpMSpM: "
+					<< grb::toString( rc ) << ".\n";
+				assert( false );
+				*info = 30;
+				return;
+			}
+		} else {
+			// this case is not supported
+			assert( false );
+		}
+
+		// done
+		if( *request == 1 ) {
+			*info = -1;
+		} else {
+			*info = 0;
+		}
+	}
+
+	int EXTBLAS_dusm_nz( const blas_sparse_matrix A, int * nz ) {
+		auto matA = sparseblas::getDoubleMatrix( A );
+		if( !(matA->finalized) ) {
+			std::cerr << "Uninitialised left-hand input matrix during dusm_nz\n";
+			return 10;
+		}
+		const size_t grb_nz = grb::nnz( *(matA->A) );
+		if( grb_nz > static_cast< size_t >(std::numeric_limits< int >::max()) ) {
+			std::cerr << "Number of nonzeroes in given sparse matrix is larger than "
+				<< "what can be represented by a SparseBLAS int\n";
+			return 20;
+		}
+		*nz = static_cast< int >( grb_nz );
+		return 0;
+	}
+
+	int EXTBLAS_dusm_open( const blas_sparse_matrix A ) {
+		auto matA = sparseblas::getDoubleMatrix( A );
+		if( !(matA->finalized) ) {
+			std::cerr << "Uninitialised left-hand input matrix during dusm_nz\n";
+			return 10;
+		}
+		try{
+			matA->start = matA->A->cbegin();
+			matA->end = matA->A->cend();
+		} catch( ... ) {
+			std::cerr << "Could not retrieve matrix iterators\n";
+			return 20;
+		}
 		return 0;
-	} else {
-		return 1;
 	}
-}
 
-int EXTBLAS_dusm_close( const blas_sparse_matrix A ) {
-	auto matA = sparseblas::getDoubleMatrix( A );
-	if( !(matA->finalized) ) {
-		std::cerr << "Uninitialised left-hand input matrix during dusm_nz\n";
-		return 10;
+	int EXTBLAS_dusm_get(
+		const blas_sparse_matrix A,
+		double * value, int * row, int * col
+	) {
+		auto matA = sparseblas::getDoubleMatrix( A );
+		if( !(matA->finalized) ) {
+			std::cerr << "Uninitialised left-hand input matrix during dusm_nz\n";
+			return 10;
+		}
+		assert( matA->start != matA->end );
+		const auto &triplet = *(matA->start);
+		*value = triplet.second;
+		*row = triplet.first.first;
+		*col = triplet.first.second;
+		try {
+			(void) ++(matA->start);
+		} catch( ... ) {
+			return 2;
+		}
+		if( matA->start == matA->end ) {
+			return 0;
+		} else {
+			return 1;
+		}
 	}
-	matA->start = matA->end;
-	return 0;
-}
 
-int EXTBLAS_free() {
-	const grb::RC rc = grb::finalize();
-	if( rc != grb::SUCCESS ) {
-		std::cerr << "Error during call to EXTBLAS_free\n";
-		return 10;
+	int EXTBLAS_dusm_close( const blas_sparse_matrix A ) {
+		auto matA = sparseblas::getDoubleMatrix( A );
+		if( !(matA->finalized) ) {
+			std::cerr << "Uninitialised left-hand input matrix during dusm_nz\n";
+			return 10;
+		}
+		matA->start = matA->end;
+		return 0;
 	}
-	return 0;
-}
+
+	int EXTBLAS_free() {
+		if( sparseblas::buffer != nullptr || sparseblas::buffer_size > 0 ) {
+			assert( sparseblas::buffer != nullptr );
+			assert( sparseblas::buffer_size > 0 );
+			free( sparseblas::buffer );
+			sparseblas::buffer_size = 0;
+		}
+		const grb::RC rc = grb::finalize();
+		if( rc != grb::SUCCESS ) {
+			std::cerr << "Error during call to EXTBLAS_free\n";
+			return 10;
+		}
+		return 0;
+	}
+
+	void extspblas_free() {
+		(void) EXTBLAS_free();
+	}
+
+} // end extern "C"
 
diff --git a/tests/unit/fold_matrix_to_scalar.cpp b/tests/unit/fold_matrix_to_scalar.cpp
deleted file mode 100644
index a22c7667e..000000000
--- a/tests/unit/fold_matrix_to_scalar.cpp
+++ /dev/null
@@ -1,628 +0,0 @@
-
-/*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Tests for the reduce( Matrix<D>, T, Operator<T,D,T> ) API call
- *
- * @author Benjamin Lozes
- * @date 17/05/2023
- *
- * Tests whether the foldl and foldl API calls produce the expected results.
- *
- * The test cases are focused on the following aspects:
- *   * The types of the result, the matrix values and the operator
- * 	 * The initial value of the reduction result
- * 	 * The order of the operands (foldr, foldl)
- */
-
-#include <chrono>
-#include <iostream>
-#include <numeric>
-#include <sstream>
-#include <vector>
-
-#include <graphblas.hpp>
-
-using namespace grb;
-
-using NzType = double;
-
-constexpr bool SKIP_FOLDL = false;
-constexpr bool SKIP_FOLDR = false;
-constexpr bool SKIP_UNMASKED = false;
-constexpr bool SKIP_MASKED = false;
-// Benchmarking
-constexpr bool PRINT_TIMERS = false;
-constexpr size_t ITERATIONS = 1;
-
-// #define _DEBUG
-
-template< class Iterator >
-void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) {
-#ifndef _DEBUG
-	return;
-#endif
-	std::cout << "Matrix \"" << name << "\" (" << rows << "x" << cols << "):" << std::endl << "[" << std::endl;
-	if( rows > 50 || cols > 50 ) {
-		os << "   Matrix too large to print" << std::endl;
-	} else {
-		// os.precision( 3 );
-		for( size_t y = 0; y < rows; y++ ) {
-			os << std::string( 3, ' ' );
-			for( size_t x = 0; x < cols; x++ ) {
-				auto nnz_val = std::find_if( begin, end, [ y, x ]( const typename std::iterator_traits< Iterator >::value_type & a ) {
-					return a.first.first == y && a.first.second == x;
-				} );
-				if( nnz_val != end )
-					os << std::fixed << ( *nnz_val ).second;
-				else
-					os << '_';
-				os << " ";
-			}
-			os << std::endl;
-		}
-	}
-	os << "]" << std::endl;
-	std::flush( os );
-}
-
-template< typename D >
-void printSparseMatrix( const grb::Matrix< D > & mat, const std::string & name = "", std::ostream & os = std::cout ) {
-	grb::wait( mat );
-	printSparseMatrixIterator( grb::nrows( mat ), grb::ncols( mat ), mat.cbegin(), mat.cend(), name, os );
-}
-
-template< typename T, typename V, typename M, class Monoid >
-RC foldl_test( const char * test_label,
-	const char * test_description,
-	const grb::Matrix< V > & A,
-	const grb::Matrix< M > & mask,
-	T initial,
-	T expected,
-	const Monoid & monoid,
-	bool skip_masked = false,
-	bool skip_unmasked = false ) {
-	if( SKIP_FOLDL )
-		return RC::SUCCESS;
-	RC rc = RC::SUCCESS;
-
-	if( not skip_unmasked && rc == RC::SUCCESS && ! SKIP_UNMASKED ) { // Unmasked
-		T value = initial;
-		auto start_chrono = std::chrono::high_resolution_clock::now();
-		for( size_t _ = 0; _ < ITERATIONS; _++ ) {
-			value = initial;
-			foldl( value, A, monoid );
-		}
-		auto end_chrono = std::chrono::high_resolution_clock::now();
-		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono ) / ITERATIONS;
-		if( PRINT_TIMERS )
-			std::cout << "foldl (unmasked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
-
-		std::cout << "foldl (unmasked) \"" << test_label << "\": ";
-		if( value == expected )
-			std::cout << "OK" << std::endl;
-		else
-			std::cerr << "Failed" << std::endl
-					  << test_description << std::endl
-					  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
-					  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
-					  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
-
-		rc = rc ? rc : ( value == expected ? RC::SUCCESS : RC::FAILED );
-	}
-
-	if( not skip_masked && rc == RC::SUCCESS && ! SKIP_MASKED ) { // Masked
-		T value = initial;
-		auto start_chrono = std::chrono::high_resolution_clock::now();
-		for( size_t _ = 0; _ < ITERATIONS; _++ ) {
-			value = initial;
-			foldl( value, A, mask, monoid );
-		}
-		auto end_chrono = std::chrono::high_resolution_clock::now();
-		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono ) / ITERATIONS;
-		if( PRINT_TIMERS )
-			std::cout << "foldl (masked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
-
-		std::cout << "foldl (masked) \"" << test_label << "\": ";
-		if( value == expected )
-			std::cout << "OK" << std::endl;
-		else
-			std::cerr << "Failed" << std::endl
-					  << test_description << std::endl
-					  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
-					  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
-					  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
-
-		rc = rc ? rc : ( value == expected ? RC::SUCCESS : RC::FAILED );
-	}
-
-	return rc;
-}
-
-template< typename T, typename V, typename M, class Monoid >
-RC foldr_test( const char * test_label,
-	const char * test_description,
-	const grb::Matrix< V > & A,
-	const grb::Matrix< M > & mask,
-	T initial,
-	T expected,
-	const Monoid & monoid,
-	bool skip_masked = false,
-	bool skip_unmasked = false ) {
-	if( SKIP_FOLDR )
-		return RC::SUCCESS;
-	RC rc = RC::SUCCESS;
-
-	if( not skip_unmasked && rc == RC::SUCCESS && ! SKIP_UNMASKED ) { // Unmasked
-		T value = initial;
-		auto start_chrono = std::chrono::high_resolution_clock::now();
-		for( size_t _ = 0; _ < ITERATIONS; _++ ) {
-			value = initial;
-			foldr( value, A, monoid );
-		}
-		auto end_chrono = std::chrono::high_resolution_clock::now();
-		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono ) / ITERATIONS;
-		if( PRINT_TIMERS )
-			std::cout << "foldr (unmasked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
-
-		std::cout << "foldr (unmasked) \"" << test_label << "\": ";
-		if( value == expected )
-			std::cout << "OK" << std::endl;
-		else
-			std::cerr << "Failed" << std::endl
-					  << test_description << std::endl
-					  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
-					  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
-					  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
-
-		rc = rc ? rc : ( value == expected ? RC::SUCCESS : RC::FAILED );
-	}
-
-	if( not skip_masked && rc == RC::SUCCESS && ! SKIP_MASKED ) { // Masked
-		T value = initial;
-		auto start_chrono = std::chrono::high_resolution_clock::now();
-		for( size_t _ = 0; _ < ITERATIONS; _++ ) {
-			value = initial;
-			foldr( value, A, mask, monoid );
-		}
-		auto end_chrono = std::chrono::high_resolution_clock::now();
-		auto duration = std::chrono::duration_cast< std::chrono::nanoseconds >( end_chrono - start_chrono ) / ITERATIONS;
-		if( PRINT_TIMERS )
-			std::cout << "foldr (masked) \"" << test_label << "\" took " << duration.count() << " ns" << std::endl;
-
-		std::cout << "foldr (masked) \"" << test_label << "\": ";
-		if( value == expected )
-			std::cout << "OK" << std::endl;
-		else
-			std::cerr << "Failed" << std::endl
-					  << test_description << std::endl
-					  << std::string( 3, ' ' ) << "Initial value: " << initial << std::endl
-					  << std::string( 3, ' ' ) << "Expected value: " << expected << std::endl
-					  << std::string( 3, ' ' ) << "Actual value: " << value << std::endl;
-
-		rc = rc ? rc : ( value == expected ? RC::SUCCESS : RC::FAILED );
-	}
-
-	return rc;
-}
-
-template< typename T, typename V, typename M, class Monoid >
-RC foldLR_test( const char * test_label,
-	const char * test_description,
-	const grb::Matrix< V > & A,
-	const grb::Matrix< M > & mask,
-	T initial,
-	T expected,
-	const Monoid & monoid,
-	bool skip_masked = false,
-	bool skip_unmasked = false ) {
-	RC rc = foldl_test( test_label, test_description, A, mask, initial, expected, monoid, skip_masked, skip_unmasked );
-	return rc ? rc : foldr_test( test_label, test_description, A, mask, initial, expected, monoid, skip_masked, skip_unmasked );
-}
-
-template< typename T, typename M >
-struct input {
-	const grb::Matrix< T > & A;
-	const grb::Matrix< M > & mask;
-	
-	// Default constructor for distributed backends
-	input( const grb::Matrix< T > & A = {0,0}, const grb::Matrix< M > & mask = {0,0} ) : A( A ), mask( mask ) {}
-};
-
-template< typename T, typename M >
-void grb_program( const input< T, M > & in, grb::RC & rc ) {
-	const grb::Matrix< T > & I = in.A;
-	const grb::Matrix< M > & mask = in.mask;
-
-	const long n = grb::nnz( I );
-
-	/**    Test case 1:
-	 *  A simple additive reduction with the same types for the nzs and the reduction result.
-	 *  * Initial value is 0
-	 *  * Expected unmasked result: n
-	 *  * Expected masked result: 0
-	 */
-	{
-		rc = foldLR_test(
-			"1", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (NzType)0, (NzType)n, Monoid< operators::add< NzType >, identities::zero >() );
-		if( rc )
-			return;
-	}
-
-	/**     Test case 2:
-	 *  A simple additive reduction with the same types for the nzs and the reduction result.
-	 *  * Initial value is n
-	 *  * Expected result: 2*n
-	 */
-	{
-		rc = foldLR_test(
-			"2", "A simple reduction(+) with the same types for the nzs and the reduction result.", I, mask, (NzType)n, (NzType)( 2 * n ), Monoid< operators::add< NzType >, identities::zero >() );
-		if( rc )
-			return;
-	}
-
-	/**     Test case 3:
-	 *  A simple additive reduction with different types for the nzs and the reduction result (int <- int + NzType).
-	 *  * Initial value is 0
-	 *  * Expected result: n
-	 */
-	{
-		rc = foldl_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int + NzType).", I, mask, (int)0, (int)n,
-			Monoid< operators::add< int, NzType, int >, identities::zero >() );
-		if( rc )
-			return;
-		rc = foldr_test( "3", "A simple reduction(+) with different types for the nzs and the reduction result (int <- NzType + int).", I, mask, (int)0, (int)n,
-			Monoid< operators::add< NzType, int, int >, identities::zero >() );
-		if( rc )
-			return;
-	}
-
-	/**     Test case 4:
-	 *  A simple additive reduction with different types for the nzs and the reduction result (int <- int + NzType).
-	 *  * Initial value is n
-	 *  * Expected result: 2*n
-	 */
-	{
-		rc = foldl_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- int + NzType).", I, mask, (int)n, (int)( 2 * n ),
-			Monoid< operators::add< int, NzType, int >, identities::zero >() );
-		if( rc )
-			return;
-		rc = foldr_test( "4", "A simple reduction(+) with different types for the nzs and the reduction result (int <- NzType + int).", I, mask, (int)n, (int)( 2 * n ),
-			Monoid< operators::add< NzType, int, int >, identities::zero >() );
-		if( rc )
-			return;
-	}
-
-	/**     Test case 5:
-	 * A simple multiplicative reduction with the same types for the nzs and the reduction result.
-	 * * Initial value is 0
-	 * * Expected result: 0
-	 */
-	{
-		rc = foldLR_test(
-			"5", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, mask, (NzType)0, (NzType)0, Monoid< operators::mul< NzType >, identities::one >() );
-		if( rc )
-			return;
-	}
-
-	/**     Test case 6:
-	 * A simple multiplicative reduction with the same types for the nzs and the reduction result.
-	 * * Initial value is 1
-	 * * Expected result: 1
-	 */
-	{
-		rc = foldLR_test(
-			"6", "A simple reduction(*) with the same types for the nzs and the reduction result.", I, mask, (NzType)1, (NzType)1, Monoid< operators::mul< NzType >, identities::one >() );
-		if( rc )
-			return;
-	}
-
-	/**     Test case 7:
-	 * A simple multiplicative reduction with different types for the nzs and the reduction result (size_t <- size_t * NzType).
-	 * * Initial value is 0
-	 * * Expected result: 0
-	 */
-	{
-		rc = foldl_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)0, (size_t)0,
-			Monoid< operators::mul< size_t, NzType, size_t >, identities::one >() );
-		if( rc )
-			return;
-		rc = foldr_test( "7", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)0, (size_t)0,
-			Monoid< operators::mul< NzType, size_t, size_t >, identities::one >() );
-		if( rc )
-			return;
-	}
-
-	/**     Test case 8:
-	 * A simple multiplicative reduction with different types for the nzs and the reduction result (size_t <- size_t * NzType).
-	 * * Initial value is 1
-	 * * Expected result: 1
-	 */
-	{
-		rc = foldl_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)1, (size_t)1,
-			Monoid< operators::mul< size_t, NzType, size_t >, identities::one >() );
-		if( rc )
-			return;
-		rc = foldr_test( "8", "A simple reduction(*) with different types for the nzs and the reduction result (int <- int * NzType).", I, mask, (size_t)1, (size_t)1,
-			Monoid< operators::mul< NzType, size_t, size_t >, identities::one >() );
-		if( rc )
-			return;
-	}
-
-	/**     Test case 9:
-	 * A simple binary equal reduction with different types for the nzs and the reduction result (bool <- bool == NzType).
-	 * * Initial value is true
-	 * * Expected result: true
-	 */
-	{
-		rc = foldl_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == NzType).", I, mask, (bool)true, (bool)true,
-			Monoid< operators::equal< bool, NzType, bool >, identities::logical_true >() );
-		if( rc )
-			return;
-		rc = foldr_test( "9", "A simple reduction(==) with different types for the nzs and the reduction result (bool <- bool == NzType).", I, mask, (bool)true, (bool)true,
-			Monoid< operators::equal< NzType, bool, bool >, identities::logical_true >() );
-		if( rc )
-			return;
-	}
-
-	/**     Test case 10:
-	 * A simple binary logical_or reduction with different types for the nzs and the reduction result (bool <- bool || NzType).
-	 * * Initial value is false
-	 * * Expected result: true
-	 */
-	{
-		rc = foldl_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || NzType).", I, mask, (bool)false, (bool)true,
-			Monoid< operators::logical_or< bool, NzType, bool >, identities::logical_false >() );
-		if( rc )
-			return;
-		rc = foldr_test( "10", "A simple reduction(||) with different types for the nzs and the reduction result (bool <- bool || NzType).", I, mask, (bool)false, (bool)true,
-			Monoid< operators::logical_or< NzType, bool, bool >, identities::logical_false >() );
-		if( rc )
-			return;
-	}
-
-	/**     Test case 11:
-	 * Reduction with an empty mask.
-	 * * Initial value is 4
-	 * * Expected result: 4
-	 */
-	{
-		Matrix< void > empty_mask( grb::nrows( I ), grb::ncols( I ), 0 );
-		rc = foldLR_test( "11", "Reduction with an empty mask.", I, empty_mask, (NzType)4, (NzType)4, Monoid< operators::add< NzType >, identities::zero >(), false, true );
-		if( rc )
-			return;
-	}
-
-	/**     Test case 12:
-	 * Reduction with a dense void mask.
-	 * * Initial value is 0
-	 * * Expected result: n
-	 */
-	{
-		Matrix< void > dense_mask( grb::nrows( I ), grb::ncols( I ), grb::nrows( I ) * grb::ncols( I ) );
-		std::vector< size_t > rows( grb::nrows( I ) * grb::ncols( I ) ), cols( grb::nrows( I ) * grb::ncols( I ) );
-		for( size_t x = 0; x < grb::nrows( I ); x++ ) {
-			std::fill( rows.begin() + x * grb::ncols( I ), rows.begin() + ( x + 1 ) * grb::ncols( I ), x );
-			std::iota( cols.begin() + x * grb::ncols( I ), cols.begin() + ( x + 1 ) * grb::ncols( I ), 0 );
-		}
-		buildMatrixUnique( dense_mask, rows.data(), cols.data(), grb::nrows( I ) * grb::ncols( I ), SEQUENTIAL );
-		rc = foldLR_test( "12", "Reduction with a dense void mask.", I, dense_mask, (NzType)0, (NzType)n, Monoid< operators::add< NzType >, identities::zero >(), false, true );
-		if( rc )
-			return;
-	}
-
-	/**     Test case 13:
-	 * Reduction with a dense int mask.
-	 * * Initial value is 0
-	 * * Expected result: n
-	 */
-	{
-		Matrix< int > dense_mask( grb::nrows( I ), grb::ncols( I ), grb::nrows( I ) * grb::ncols( I ) );
-		std::vector< size_t > rows( grb::nrows( I ) * grb::ncols( I ) ), cols( grb::nrows( I ) * grb::ncols( I ) );
-		for( size_t x = 0; x < grb::nrows( I ); x++ ) {
-			std::fill( rows.begin() + x * grb::ncols( I ), rows.begin() + ( x + 1 ) * grb::ncols( I ), x );
-			std::iota( cols.begin() + x * grb::ncols( I ), cols.begin() + ( x + 1 ) * grb::ncols( I ), 0 );
-		}
-		std::vector< int > vals( grb::nrows( I ) * grb::ncols( I ), 1 );
-		buildMatrixUnique( dense_mask, rows.data(), cols.data(), vals.data(), vals.size(), SEQUENTIAL );
-		rc = foldLR_test( "13", "Reduction with a dense int mask.", I, dense_mask, (NzType)0, (NzType)n, Monoid< operators::add< NzType >, identities::zero >(), false, true );
-		if( rc )
-			return;
-	}
-
-	/**     Test case 14:
-	 * Reduction with a dense int mask, full of zero, except for the first nz.
-	 * * Initial value is 0
-	 * * Expected result: 1
-	 */
-	{
-		Matrix< int > dense_mask( grb::nrows( I ), grb::ncols( I ), grb::nrows( I ) * grb::ncols( I ) );
-		std::vector< size_t > rows( grb::nrows( I ) * grb::ncols( I ) ), cols( grb::nrows( I ) * grb::ncols( I ) );
-		for( size_t x = 0; x < grb::nrows( I ); x++ ) {
-			std::fill( rows.begin() + x * grb::ncols( I ), rows.begin() + ( x + 1 ) * grb::ncols( I ), x );
-			std::iota( cols.begin() + x * grb::ncols( I ), cols.begin() + ( x + 1 ) * grb::ncols( I ), 0 );
-		}
-		std::vector< int > vals( grb::nrows( I ) * grb::ncols( I ), 0 );
-		for( const auto e : I ) {
-			vals[ e.first.first * grb::ncols( I ) + e.first.second ] = 1;
-			break;
-		}
-		buildMatrixUnique( dense_mask, rows.data(), cols.data(), vals.data(), vals.size(), SEQUENTIAL );
-		rc = foldLR_test( "14", "Reduction with a dense int mask, matching only the first nz.", I, dense_mask, (NzType)0, (NzType)1, Monoid< operators::add< NzType >, identities::zero >(), false, true );
-		if( rc )
-			return;
-	}
-
-	/**     Test case 15:
-	 * Reduction with a dense int mask, full of zero, except for the last nz.
-	 * * Initial value is 0
-	 * * Expected result: 1
-	 */
-	{
-		Matrix< int > dense_mask( grb::nrows( I ), grb::ncols( I ), grb::nrows( I ) * grb::ncols( I ) );
-		std::vector< size_t > rows( grb::nrows( I ) * grb::ncols( I ) ), cols( grb::nrows( I ) * grb::ncols( I ) );
-		for( size_t x = 0; x < grb::nrows( I ); x++ ) {
-			std::fill( rows.begin() + x * grb::ncols( I ), rows.begin() + ( x + 1 ) * grb::ncols( I ), x );
-			std::iota( cols.begin() + x * grb::ncols( I ), cols.begin() + ( x + 1 ) * grb::ncols( I ), 0 );
-		}
-		std::vector< int > vals( grb::nrows( I ) * grb::ncols( I ), 0 );
-		size_t previous_idx = 0;
-		for( const auto e : I ) 
-			previous_idx = e.first.first * grb::ncols( I ) + e.first.second;
-		vals[ previous_idx ] = 1;
-		buildMatrixUnique( dense_mask, rows.data(), cols.data(), vals.data(), vals.size(), SEQUENTIAL );
-		rc = foldLR_test( "15", "Reduction with a dense int mask, matching only the last nz.", I, dense_mask, (NzType)0, (NzType)1, Monoid< operators::add< NzType >, identities::zero >(), false, true );
-		if( rc )
-			return;
-	}
-}
-
-int main( int argc, char ** argv ) {
-	// defaults
-	bool printUsage = false;
-	size_t n = 10;
-
-	// error checking
-	if( argc > 2 ) {
-		printUsage = true;
-	}
-	if( argc == 2 ) {
-		n = std::atol( argv[ 1 ] );
-	}
-	if( printUsage ) {
-		std::cerr << "Usage: " << argv[ 0 ] << " [n]\n";
-		std::cerr << "  -n (optional, default is 10): an even integer, the test "
-				  << "size.\n";
-		return 1;
-	}
-
-	std::cout << "This is functional test " << argv[ 0 ] << "\n";
-	grb::Launcher< AUTOMATIC > launcher;
-	grb::RC rc = RC::SUCCESS;
-
-	if( ! rc ) { // Identity square-matrix
-		Matrix< NzType > I( n, n );
-		std::vector< size_t > I_rows( n ), I_cols( n );
-		std::vector< NzType > I_vals( n, 1.f );
-		std::iota( I_rows.begin(), I_rows.end(), 0 );
-		std::iota( I_cols.begin(), I_cols.end(), 0 );
-		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), SEQUENTIAL );
-		Matrix< void > mask( n, n );
-		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), SEQUENTIAL );
-		std::cout << "-- Running test 01: Identity square matrix of size n = " << n << std::endl;
-		input< NzType, void > input(I, mask);
-		if( launcher.exec( &grb_program, input, rc, true ) != SUCCESS ) {
-			std::cerr << "Launching test 01 FAILED\n";
-			return 255;
-		}
-		std::cout << std::endl << std::flush;
-	}
-
-	if( ! rc ) { // Build a square-matrix with n 1s on the first row
-		Matrix< NzType > I( n, n );
-		std::vector< size_t > I_rows( n, 0 ), I_cols( n );
-		std::vector< NzType > I_vals( n, 1.f );
-		std::iota( I_cols.begin(), I_cols.end(), 0 );
-		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), SEQUENTIAL );
-		Matrix< void > mask( n, n );
-		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), SEQUENTIAL );
-		std::cout << "-- Running test 02: Square matrix of size n = " << n << ", with n 1s on the first row" << std::endl;
-		input< NzType, void > input(I, mask);
-		if( launcher.exec( &grb_program, input, rc, true ) != SUCCESS ) {
-			std::cerr << "Launching test 02 FAILED\n";
-			return 255;
-		}
-		std::cout << std::endl << std::flush;
-	}
-
-	if( ! rc ) { // Square-matrix with n 1s on the first column
-		Matrix< NzType > I( n, n );
-		std::vector< size_t > I_rows( n ), I_cols( n, 0 );
-		std::vector< NzType > I_vals( n, 1.f );
-		std::iota( I_rows.begin(), I_rows.end(), 0 );
-		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), PARALLEL );
-		Matrix< void > mask( n, n );
-		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), PARALLEL );
-		std::cout << "-- Running test 03: Square matrix of size n = " << n << ", with n 1s on the first column" << std::endl;
-		input< NzType, void > input(I, mask);
-		if( launcher.exec( &grb_program, input, rc, true ) != SUCCESS ) {
-			std::cerr << "Launching test 03 FAILED\n";
-			return 255;
-		}
-		std::cout << std::endl << std::flush;
-	}
-
-	if( ! rc ) { // Building a square-matrix with n 1s on the first row and column
-		Matrix< NzType > I( n, n );
-		std::vector< size_t > I_rows( 2 * n - 1, 0 ), I_cols( 2 * n - 1, 0 );
-		std::vector< NzType > I_vals( 2 * n - 1, 1.f );
-		std::iota( I_rows.begin() + n, I_rows.end(), 1 );
-		std::iota( I_cols.begin(), I_cols.begin() + n, 0 );
-		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), SEQUENTIAL );
-		Matrix< void > mask( n, n );
-		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), SEQUENTIAL );
-		std::cout << "-- Running test 04: Square matrix of size n = " << n << ", with n 1s on the first row and column" << std::endl;
-		input< NzType, void > input(I, mask);
-		if( launcher.exec( &grb_program, input, rc, true ) != SUCCESS ) {
-			std::cerr << "Launching test 04 FAILED\n";
-			return 255;
-		}
-		std::cout << std::endl << std::flush;
-	}
-
-	if( ! rc ) { // Building a [1 row, n columns] matrix filled with 1s
-		Matrix< NzType > I( 1, n );
-		std::vector< size_t > I_rows( n, 0 ), I_cols( n, 0 );
-		std::vector< NzType > I_vals( n, 1.f );
-		std::iota( I_cols.begin(), I_cols.end(), 0 );
-		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), SEQUENTIAL );
-		Matrix< void > mask( 1, n );
-		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), SEQUENTIAL );
-		std::cout << "-- Running test 05: [1-row, n = " << n << " columns] matrix, filled with 1s" << std::endl;
-		input< NzType, void > input(I, mask);
-		if( launcher.exec( &grb_program, input, rc, true ) != SUCCESS ) {
-			std::cerr << "Launching test 04 FAILED\n";
-			return 255;
-		}
-		std::cout << std::endl << std::flush;
-	}
-
-	if( ! rc ) { // Building a [n rows, 1 column] matrix filled with 1s
-		Matrix< NzType > I( n, 1 );
-		std::vector< size_t > I_rows( n, 0 ), I_cols( n, 0 );
-		std::vector< NzType > I_vals( n, 1.f );
-		std::iota( I_rows.begin(), I_rows.end(), 0 );
-		buildMatrixUnique( I, I_rows.data(), I_cols.data(), I_vals.data(), I_vals.size(), SEQUENTIAL );
-		Matrix< void > mask( n, 1 );
-		buildMatrixUnique( mask, I_rows.data(), I_cols.data(), I_rows.size(), SEQUENTIAL );
-		std::cout << "-- Running test 06: [n = " << n << " rows, 1 column] matrix, filled with 1s" << std::endl;
-		input< NzType, void > input(I, mask);
-		if( launcher.exec( &grb_program, input, rc, true ) != SUCCESS ) {
-			std::cerr << "Launching test 06 FAILED\n";
-			return 255;
-		}
-		std::cout << std::endl << std::flush;
-	}
-
-	if( rc != SUCCESS ) {
-		std::cout << "Test FAILED (" << grb::toString( rc ) << ")" << std::endl;
-		return rc;
-	} else {
-		std::cout << "Test OK" << std::endl;
-		return 0;
-	}
-}
diff --git a/tests/unit/tril.cpp b/tests/unit/tril.cpp
deleted file mode 100644
index c57b1aeab..000000000
--- a/tests/unit/tril.cpp
+++ /dev/null
@@ -1,165 +0,0 @@
-
-/*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <iostream>
-#include <sstream>
-
-#include <graphblas.hpp>
-
-using namespace grb;
-
-template< class Iterator >
-void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) {
-	if( rows > 64 || cols > 64 ) {
-		return;
-	}
-	std::cout << "Matrix \"" << name << "\" (" << rows << "x" << cols << "):" << std::endl << "[" << std::endl;
-	// os.precision( 3 );
-	for( size_t y = 0; y < rows; y++ ) {
-		os << std::string( 3, ' ' );
-		for( size_t x = 0; x < cols; x++ ) {
-			auto nnz_val = std::find_if( begin, end, [ y, x ]( const typename std::iterator_traits< Iterator >::value_type & a ) {
-				return a.first.first == y && a.first.second == x;
-			} );
-			if( nnz_val != end )
-				os << std::fixed << ( *nnz_val ).second;
-			else
-				os << '_';
-			os << " ";
-		}
-		os << std::endl;
-	}
-	os << "]" << std::endl;
-}
-
-template< typename D >
-void printSparseMatrix( const grb::Matrix< D > & mat, const std::string & name = "", std::ostream & os = std::cout ) {
-	grb::wait( mat );
-	printSparseMatrixIterator( grb::nrows( mat ), grb::ncols( mat ), mat.cbegin(), mat.cend(), name, os );
-}
-
-template< typename D, Descriptor descr = descriptors::no_operation >
-D compute_value( size_t i, size_t j ) {
-	return descr & descriptors::transpose_matrix ? i + 2 * j : 2 * i + j;
-}
-
-template< Descriptor descr = descriptors::no_operation, typename D >
-RC check_obtained( const grb::Matrix< D > & L ) {
-	for( const auto & triple : L ) {
-		const size_t & i = triple.first.first;
-		const size_t & j = triple.first.second;
-		const size_t & v = triple.second;
-		if( i < j ) {
-			std::cout << "Unexpected entry at position ( " << i << ", " << j << " ) "
-					  << "-- only expected entries on the lower triangular part\n";
-			return RC::FAILED;
-		}
-		const D expected_value = compute_value< D, descr >( i, j );
-		if( v != expected_value ) {
-			std::cout << "Unexpected value at position ( " << i << ", " << j << " ) "
-					  << "-- expected " << expected_value << ", found " << v << "\n";
-			return RC::FAILED;
-		}
-	}
-	return RC::SUCCESS;
-}
-
-void grb_program( const size_t & n, grb::RC & rc ) {
-	rc = RC::SUCCESS;
-
-	// Matrix initialisation
-	grb::Matrix< int > A( n, n );
-	grb::Matrix< size_t > L_A( n, n );  // L_A is the lower triangular matrix of A
-	grb::Matrix< size_t > L_At( n, n ); // L_At is the lower triangular matrix of A^T
-	size_t * I = new size_t[ n ];
-	size_t * J = new size_t[ n ];
-	double * V = new double[ n ];
-	for( size_t k = 0; k < n; ++k ) {
-		I[ k ] = k % 3 == 0 ? k : k - 1;
-		J[ k ] = std::rand() % n;
-		V[ k ] = compute_value< int >( I[ k ], J[ k ] );
-	}
-	assert( not grb::buildMatrixUnique( A, I, J, V, n, SEQUENTIAL ) );
-
-	{ // Mixed-domain matrix, should be successful
-		printSparseMatrix( A, "A" );
-		rc = grb::tril( L_A, A, Phase::RESIZE );
-		rc = rc ? rc : grb::tril( L_A, A, Phase::EXECUTE );
-		printSparseMatrix( L_A, "L_A" );
-
-		if( rc != SUCCESS ) {
-			std::cerr << "Error on test: mixed-domain matrix" << std::endl;
-			std::cerr << "Error on executing: " << grb::toString( rc ) << std::endl;
-			return;
-		}
-		rc = check_obtained( L_A );
-		if( rc != SUCCESS ) {
-			std::cerr << "Error on test: mixed-domain matrix" << std::endl;
-			std::cerr << "Error on result, incorrect result" << std::endl;
-			return;
-		}
-		std::cout << std::flush << " -- Test passed: mixed-domain matrix" << std::flush << std::endl;
-	}
-	{ // Transpose_matrix descriptor, should be successful
-		printSparseMatrix( A, "A" );
-		rc = grb::tril< descriptors::transpose_matrix >( L_At, A, Phase::RESIZE );
-		rc = rc ? rc : grb::tril< descriptors::transpose_matrix >( L_At, A, Phase::EXECUTE );
-		printSparseMatrix( L_At, "L_At" );
-
-		if( rc != SUCCESS ) {
-			std::cerr << "Error on test: transpose_matrix descriptor" << std::endl;
-			std::cerr << "Error on executing: " << grb::toString( rc ) << std::endl;
-			return;
-		}
-		rc = check_obtained< descriptors::transpose_matrix >( L_At );
-		if( rc != SUCCESS ) {
-			std::cerr << "Error on test: transpose_matrix descriptor" << std::endl;
-			std::cerr << "Error on result, incorrect result" << std::endl;
-			return;
-		}
-		std::cout << std::flush << " -- Test passed: transpose_matrix descriptor" << std::flush << std::endl;
-	}
-}
-
-int main( int argc, char ** argv ) {
-	// defaults
-	size_t n = 1000000;
-
-	// error checking
-	if( argc == 2 ) {
-		n = std::strtoul( argv[ 1 ], nullptr, 10 );
-	}
-	if( argc > 3 ) {
-		std::cerr << "Usage: " << argv[ 0 ] << "[n = " << n << "]\n";
-		return 1;
-	}
-
-	std::cout << "This is functional test " << argv[ 0 ] << "\n";
-	grb::Launcher< AUTOMATIC > launcher;
-	grb::RC out;
-	if( launcher.exec( &grb_program, n, out, false ) != SUCCESS ) {
-		std::cerr << "Launching test FAILED\n";
-		return 255;
-	}
-	if( out != SUCCESS ) {
-		std::cout << "Test FAILED (" << grb::toString( out ) << ")" << std::endl;
-		return out;
-	} else {
-		std::cout << "Test OK" << std::endl;
-		return 0;
-	}
-}
diff --git a/tests/unit/triu.cpp b/tests/unit/triu.cpp
deleted file mode 100644
index 4cf66d97c..000000000
--- a/tests/unit/triu.cpp
+++ /dev/null
@@ -1,165 +0,0 @@
-
-/*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <iostream>
-#include <sstream>
-
-#include <graphblas.hpp>
-
-using namespace grb;
-
-template< class Iterator >
-void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) {
-	if( rows > 64 || cols > 64 ) {
-		return;
-	}
-	std::cout << "Matrix \"" << name << "\" (" << rows << "x" << cols << "):" << std::endl << "[" << std::endl;
-	// os.precision( 3 );
-	for( size_t y = 0; y < rows; y++ ) {
-		os << std::string( 3, ' ' );
-		for( size_t x = 0; x < cols; x++ ) {
-			auto nnz_val = std::find_if( begin, end, [ y, x ]( const typename std::iterator_traits< Iterator >::value_type & a ) {
-				return a.first.first == y && a.first.second == x;
-			} );
-			if( nnz_val != end )
-				os << std::fixed << ( *nnz_val ).second;
-			else
-				os << '_';
-			os << " ";
-		}
-		os << std::endl;
-	}
-	os << "]" << std::endl;
-}
-
-template< typename D >
-void printSparseMatrix( const grb::Matrix< D > & mat, const std::string & name = "", std::ostream & os = std::cout ) {
-	grb::wait( mat );
-	printSparseMatrixIterator( grb::nrows( mat ), grb::ncols( mat ), mat.cbegin(), mat.cend(), name, os );
-}
-
-template< typename D, Descriptor descr = descriptors::no_operation >
-D compute_value( size_t i, size_t j ) {
-	return descr & descriptors::transpose_matrix ? i + 2 * j : 2 * i + j;
-}
-
-template< Descriptor descr = descriptors::no_operation, typename D >
-RC check_obtained( const grb::Matrix< D > & U ) {
-	for( const auto & triple : U ) {
-		const size_t & i = triple.first.first;
-		const size_t & j = triple.first.second;
-		const size_t & v = triple.second;
-		if( i > j ) {
-			std::cout << "Unexpected entry at position ( " << i << ", " << j << " ) "
-					  << "-- only expected entries on the upper triangular part\n";
-			return RC::FAILED;
-		}
-		const D expected_value = compute_value< D, descr >( i, j );
-		if( v != expected_value ) {
-			std::cout << "Unexpected value at position ( " << i << ", " << j << " ) "
-					  << "-- expected " << expected_value << ", found " << v << "\n";
-			return RC::FAILED;
-		}
-	}
-	return RC::SUCCESS;
-}
-
-void grb_program( const size_t & n, grb::RC & rc ) {
-	rc = RC::SUCCESS;
-
-	// Matrix initialisation
-	grb::Matrix< int > A( n, n );
-	grb::Matrix< size_t > U_A( n, n );  // U_A is the upper triangular matrix of A
-	grb::Matrix< size_t > U_At( n, n ); // U_At is the upper triangular matrix of A^T
-	size_t * I = new size_t[ n ];
-	size_t * J = new size_t[ n ];
-	double * V = new double[ n ];
-	for( size_t k = 0; k < n; ++k ) {
-		I[ k ] = k % 3 == 0 ? k : k - 1;
-		J[ k ] = std::rand() % n;
-		V[ k ] = compute_value< int >( I[ k ], J[ k ] );
-	}
-	assert( not grb::buildMatrixUnique( A, I, J, V, n, SEQUENTIAL ) );
-
-	{ // Mixed-domain matrix, should be successful
-		printSparseMatrix( A, "A" );
-		rc = grb::triu( U_A, A, Phase::RESIZE );
-		rc = rc ? rc : grb::triu( U_A, A, Phase::EXECUTE );
-		printSparseMatrix( U_A, "U_A" );
-
-		if( rc != SUCCESS ) {
-			std::cerr << "Error on test: mixed-domain matrix" << std::endl;
-			std::cerr << "Error on executing: " << grb::toString( rc ) << std::endl;
-			return;
-		}
-		rc = check_obtained( U_A );
-		if( rc != SUCCESS ) {
-			std::cerr << "Error on test: mixed-domain matrix" << std::endl;
-			std::cerr << "Error on result, incorrect result" << std::endl;
-			return;
-		}
-		std::cout << std::flush << " -- Test passed: mixed-domain matrix" << std::flush << std::endl;
-	}
-	{ // Transpose_matrix descriptor, should be successful
-		printSparseMatrix( A, "A" );
-		rc = grb::triu< descriptors::transpose_matrix >( U_At, A, Phase::RESIZE );
-		rc = rc ? rc : grb::triu< descriptors::transpose_matrix >( U_At, A, Phase::EXECUTE );
-		printSparseMatrix( U_At, "U_At" );
-
-		if( rc != SUCCESS ) {
-			std::cerr << "Error on test: transpose_matrix descriptor" << std::endl;
-			std::cerr << "Error on executing: " << grb::toString( rc ) << std::endl;
-			return;
-		}
-		rc = check_obtained< descriptors::transpose_matrix >( U_At );
-		if( rc != SUCCESS ) {
-			std::cerr << "Error on test: transpose_matrix descriptor" << std::endl;
-			std::cerr << "Error on result, incorrect result" << std::endl;
-			return;
-		}
-		std::cout << std::flush << " -- Test passed: transpose_matrix descriptor" << std::flush << std::endl;
-	}
-}
-
-int main( int argc, char ** argv ) {
-	// defaults
-	size_t n = 1000000;
-
-	// error checking
-	if( argc == 2 ) {
-		n = std::strtoul( argv[ 1 ], nullptr, 10 );
-	}
-	if( argc > 3 ) {
-		std::cerr << "Usage: " << argv[ 0 ] << "[n = " << n << "]\n";
-		return 1;
-	}
-
-	std::cout << "This is functional test " << argv[ 0 ] << "\n";
-	grb::Launcher< AUTOMATIC > launcher;
-	grb::RC out;
-	if( launcher.exec( &grb_program, n, out, false ) != SUCCESS ) {
-		std::cerr << "Launching test FAILED\n";
-		return 255;
-	}
-	if( out != SUCCESS ) {
-		std::cout << "Test FAILED (" << grb::toString( out ) << ")" << std::endl;
-		return out;
-	} else {
-		std::cout << "Test OK" << std::endl;
-		return 0;
-	}
-}
diff --git a/tests/unit/unittests.sh b/tests/unit/unittests.sh
index 1aa51aad0..0089d2674 100755
--- a/tests/unit/unittests.sh
+++ b/tests/unit/unittests.sh
@@ -266,20 +266,6 @@ for MODE in ${MODES}; do
 				grep 'Test OK' ${TEST_OUT_DIR}/ewiseapply_small_${MODE}_${BACKEND}_${P}_${T} || echo "Test FAILED"
 				echo " "
 
-				if [ "$BACKEND" = "reference" ] || [ "$BACKEND" = "reference_omp" ]|| [ "$BACKEND" = "hyperdags" ]; then
-					echo ">>>      [x]           [ ]       Testing grb::tril"
-					$runner ${TEST_BIN_DIR}/tril_${MODE}_${BACKEND} &> ${TEST_OUT_DIR}/tril_${MODE}_${BACKEND}_${P}_${T}
-					head -1 ${TEST_OUT_DIR}/tril_${MODE}_${BACKEND}_${P}_${T}
-					grep 'Test OK' ${TEST_OUT_DIR}/tril_${MODE}_${BACKEND}_${P}_${T} || echo "Test FAILED"
-					echo " "
-
-					echo ">>>      [x]           [ ]       Testing grb::triu"
-					$runner ${TEST_BIN_DIR}/triu_${MODE}_${BACKEND} &> ${TEST_OUT_DIR}/triu_${MODE}_${BACKEND}_${P}_${T}
-					head -1 ${TEST_OUT_DIR}/triu_${MODE}_${BACKEND}_${P}_${T}
-					grep 'Test OK' ${TEST_OUT_DIR}/triu_${MODE}_${BACKEND}_${P}_${T} || echo "Test FAILED"
-					echo " "
-				fi
-
 				echo ">>>      [x]           [ ]       Testing grb::eWiseApply using (+,0) on vectors"
 				echo "                                 of doubles of size 100."
 				$runner ${TEST_BIN_DIR}/ewiseapply_${MODE}_${BACKEND} 100 &> ${TEST_OUT_DIR}/ewiseapply_${MODE}_${BACKEND}_${P}_${T}
@@ -421,12 +407,6 @@ for MODE in ${MODES}; do
 				grep 'Test OK' ${TEST_OUT_DIR}/buildVector_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED"
 				echo " "
 
-				echo ">>>      [x]           [ ]       Testing grb::Vector( initializer_list ) constructor"
-				$runner ${TEST_BIN_DIR}/vectorFromListConstructor_${MODE}_${BACKEND} &> ${TEST_OUT_DIR}/vectorFromListConstructor_${MODE}_${BACKEND}_${P}_${T}.log
-				head -1 ${TEST_OUT_DIR}/vectorFromListConstructor_${MODE}_${BACKEND}_${P}_${T}.log
-				grep 'Test OK' ${TEST_OUT_DIR}/vectorFromListConstructor_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED"
-				echo " "
-
 				echo ">>>      [x]           [ ]       Testing grb::vectorToMatrixConverter"
 				$runner ${TEST_BIN_DIR}/vectorToMatrix_${MODE}_${BACKEND} &> ${TEST_OUT_DIR}/vectorToMatrix_${MODE}_${BACKEND}_${P}_${T}.log
 				head -1 ${TEST_OUT_DIR}/vectorToMatrix_${MODE}_${BACKEND}_${P}_${T}.log
@@ -469,12 +449,6 @@ for MODE in ${MODES}; do
 				echo "Test OK" ${TEST_OUT_DIR}/matrixSet_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED"
 				echo " "
 
-				echo ">>>      [x]           [ ]       Testing grb::foldl+r (scalar, matrix, [mask], monoid)"
-				$runner ${TEST_BIN_DIR}/fold_matrix_to_scalar_${MODE}_${BACKEND} 2> ${TEST_OUT_DIR}/fold_matrix_to_scalar_${MODE}_${BACKEND}_${P}_${T}.err 1> ${TEST_OUT_DIR}/fold_matrix_to_scalar_${MODE}_${BACKEND}_${P}_${T}.log
-				head -1 ${TEST_OUT_DIR}/fold_matrix_to_scalar_${MODE}_${BACKEND}_${P}_${T}.log
-				echo "Test OK" ${TEST_OUT_DIR}/fold_matrix_to_scalar_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED"
-				echo " "
-
 				echo ">>>      [x]           [ ]       Tests the \`level-0' grb::collectives"
 				echo "Functional test executable: ${TEST_BIN_DIR}/collectives_blas0_${MODE}_${BACKEND}"
 				$runner ${TEST_BIN_DIR}/collectives_blas0_${MODE}_${BACKEND} ${P} &> ${TEST_OUT_DIR}/collectives_blas0_${MODE}_${BACKEND}_${P}_${T}.log
@@ -651,12 +625,6 @@ for MODE in ${MODES}; do
 				grep 'Test OK' ${TEST_OUT_DIR}/mxm_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED"
 				echo " "
 
-				echo ">>>      [x]           [ ]       Testing grb::eWiseLambda on a small matrix"
-				$runner ${TEST_BIN_DIR}/eWiseLambda_${MODE}_${BACKEND} &> ${TEST_OUT_DIR}/eWiseLambda_${MODE}_${BACKEND}_${P}_${T}.log
-				head -1 ${TEST_OUT_DIR}/eWiseLambda_${MODE}_${BACKEND}_${P}_${T}.log
-				grep 'Test OK' ${TEST_OUT_DIR}/eWiseLambda_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED"
-				echo " "
-
 				echo ">>>      [x]           [ ]       Testing grb::outer on a small matrix"
 				$runner ${TEST_BIN_DIR}/outer_${MODE}_${BACKEND} &> ${TEST_OUT_DIR}/outer_${MODE}_${BACKEND}_${P}_${T}.log
 				head -1 ${TEST_OUT_DIR}/outer_${MODE}_${BACKEND}_${P}_${T}.log

From 8cf6dec7cce5e6ab331cdb63db6dfd192185c03b Mon Sep 17 00:00:00 2001
From: Benjamin Lozes <benjamin.lozes@protonmail.com>
Date: Fri, 9 Feb 2024 16:16:54 +0100
Subject: [PATCH 63/63] Revert to a clean state before merging the dependencies

---
 include/CMakeLists.txt               |  18 +--
 include/graphblas.hpp                |   5 -
 include/transition/blas_sparse.h     |  74 +---------
 include/transition/blas_sparse_vec.h |  56 ++-----
 include/transition/spblas.h          | 211 +++++++++++++++++++++++++++
 tests/smoke/CMakeLists.txt           |   8 +-
 tests/unit/CMakeLists.txt            |  20 ---
 7 files changed, 233 insertions(+), 159 deletions(-)
 create mode 100644 include/transition/spblas.h

diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
index 714e6034b..5b48b9e55 100644
--- a/include/CMakeLists.txt
+++ b/include/CMakeLists.txt
@@ -219,30 +219,20 @@ install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/graphblas/interfaces/"
 
 install( TARGETS algorithms EXPORT GraphBLASTargets )
 
-# generate the spblas header with the library prefix
-configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/transition/spblas.h.in
-	${CMAKE_CURRENT_BINARY_DIR}/transition/spblas.h @ONLY
-)
-
 # this target lists the transition path headers
 # these are plain C headers and do not have any dependences
-add_library( transition_headers INTERFACE )
+add_library( transition INTERFACE )
 
 target_include_directories(
-	transition_headers INTERFACE
+	transition INTERFACE
 	$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/transition/>
-	$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/transition/>
 	$<INSTALL_INTERFACE:transition/>
 )
 
-install( FILES ${CMAKE_CURRENT_BINARY_DIR}/transition/spblas.h
-	DESTINATION "${INCLUDE_INSTALL_DIR}/transition"
-)
-
 install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/transition/"
-	DESTINATION "${INCLUDE_INSTALL_DIR}/transition"
+	DESTINATION "${GRB_INCLUDE_INSTALL_DIR}/../transition/"
 	FILES_MATCHING REGEX "${HEADERS_REGEX}"
 )
 
-install( TARGETS transition_headers EXPORT GraphBLASTargets )
+install( TARGETS transition EXPORT GraphBLASTargets )
 
diff --git a/include/graphblas.hpp b/include/graphblas.hpp
index 7a61f9599..a539a5c0d 100644
--- a/include/graphblas.hpp
+++ b/include/graphblas.hpp
@@ -38,11 +38,6 @@
  *  -# generalised sparse linear algebra, \ref GraphBLAS;
  *  -# vertex-centric programming, \ref Pregel.
  *
- * Additionally, to ease integration with existing software, ALP defines
- * so-called \ref TRANS libraries, which presently includes (partial)
- * implementations of the \ref SPARSEBLAS and \ref SPBLAS (de-facto) standards,
- * as well as an interface for numerical \ref TRANS_SOLVERS.
- *
  * Several other programming interfaces are under design at present.
  *
  * For authors who contributed to ALP, please see the NOTICE file.
diff --git a/include/transition/blas_sparse.h b/include/transition/blas_sparse.h
index 83e186f8e..63839723e 100644
--- a/include/transition/blas_sparse.h
+++ b/include/transition/blas_sparse.h
@@ -21,39 +21,6 @@
  * This is the ALP implementation of a subset of the NIST Sparse BLAS standard.
  * While the API is standardised, this header makes some implementation-specific
  * extensions.
- *
- * @author A. N. Yzelman
- * @date 2023
- */
-
-/**
- * \defgroup TRANS Transition path
- *
- * The transition path libraries enable integrating ALP with existing software.
- * It operates by exposing several of its functionalities via established C
- * interfaces and established data formats in order to facilitate the transition
- * of legacy software to ALP. Ideally, users of transition interfaces need only
- * re-compile and link their software; in some cases, trivial modifications
- * might be required to migrate to transition interfaces, e.g., changing the
- * prefix of called functions.
- *
- * The currently exposed interfaces are:
- *  - \ref SPARSEBLAS;
- *  - \ref SPBLAS; and
- *  - \ref TRANS_SOLVERS.
- *
- * All of these transition libraries show-case ALP's ability to quickly wrap
- * around external APIs, thus simplifying integration of ALP-backed code with
- * existing software. We do note, however, that the direct use of the native C++
- * ALP API may lead to higher performance than the use of these transition path
- * interfaces, and that in some cases the legacy interface itself is what makes
- * achieving such higher performance impossible.
- *
- * The current transition path interfaces are at an *experimental prototype
- * phase*; in particular, not all primitives in a given standard API are
- * currently implemented. For \ref SPARSEBLAS in particular, additional support
- * or coverage may freely be requested in GitHub issue #14. For other
- * interfaces, feel free to open new issues or to contact the maintainers.
  */
 
 #ifndef _H_ALP_SPARSEBLAS_NIST
@@ -65,41 +32,6 @@
 extern "C" {
 #endif
 
-/**
- * \defgroup SPARSEBLAS SparseBLAS
- * \ingroup TRANS
- *
- * A SparseBLAS implementation enabled by ALP/GraphBLAS
- *
- * ALP provides a (presently partial) implementation of the Sparse BLAS standard
- * as defined by the BLAS forum and in the following paper:
- *  - Duff, Iain S., Michael A. Heroux, and Roldan Pozo. "An overview of the
- *    sparse basic linear algebra subprograms: The new standard from the BLAS
- *    technical forum." ACM Transactions on Mathematical Software (TOMS) 28(2),
- *    2002, pp. 239-267.
- *
- * We also provide a couple of extensions over this standard, in particular to
- * add support for sparse vectors. Such extensions are prefixed by
- * <tt>EXTBLAS_</tt> and <tt>extblas_</tt>, such as, for example,
- *  - #EXTBLAS_dusv_begin and
- *  - #extblas_sparse_vector.
- * This prefix can be configured differently, please refer to the developer
- * documentation if looking for this option.
- *
- * The functionalities defined by the standard of course retain the prefix
- * defined by the standard: <tt>BLAS_</tt> and <tt>blas_</tt>, such as, e.g.,
- *  - #BLAS_duscr_begin and
- *  - #blas_sparse_matrix.
- *
- * The implementation of this standard is done by mapping back to the equivalent
- * ALP/GraphBLAS primitives. By default, ALP builds both sequential and shared-
- * memory parallel SparseBLAS libraries. It does so simply by compiling the same
- * ALP-based SparseBLAS implementation with a sequential and a shared-memory ALP
- * backend, respectively.
- *
- * @{
- */
-
 /**
  * The possible transposition types.
  *
@@ -261,8 +193,8 @@ int BLAS_dusmm(
 int EXTBLAS_dusmsv(
 	const enum blas_trans_type transa,
 	const double alpha, const blas_sparse_matrix A,
-	const EXTBLAS_TYPE( sparse_vector ) x,
-	EXTBLAS_TYPE( sparse_vector ) y
+	const extblas_sparse_vector x,
+	extblas_sparse_vector y
 );
 
 /**
@@ -397,8 +329,6 @@ int EXTBLAS_dusm_clear( blas_sparse_matrix A );
  */
 int EXTBLAS_free();
 
-/**@}*/ // ends the SparseBLAS doxygen group
-
 #ifdef __cplusplus
 } // end extern "C"
 #endif
diff --git a/include/transition/blas_sparse_vec.h b/include/transition/blas_sparse_vec.h
index efbb8256a..dd44f0ec2 100644
--- a/include/transition/blas_sparse_vec.h
+++ b/include/transition/blas_sparse_vec.h
@@ -21,47 +21,17 @@
  * This is an ALP-specific extension to the NIST Sparse BLAS standard, which
  * the ALP libsparseblas transition path also introduces to the de-facto spblas
  * standard.
- *
- * @author A. N. Yzelman
- * @date 2023
  */
 
 #ifndef _H_ALP_SPARSEBLAS_EXT_VEC
 #define _H_ALP_SPARSEBLAS_EXT_VEC
 
-/**
- * \addtogroup SPARSEBLAS
- * @{
- */
-
-/**@{*/
-/** \internal Helper macros for #EXTBLAS_FUN and #EXTBLAS_TYPE */
-#define __SPBLAS_CONC( _a, _b ) _a ## _b
-#define __SPBLAS_CONCAT( _a, _b ) __SPBLAS_CONC( _a, _b )
-#define SPCONCAT( _a, _b ) __SPBLAS_CONCAT( _a, _b )
-/**@}*/
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-/**@{*/
-/**
- * \internal
- *
- * Allows renaming our non-standard functions with some other prefix.
- *
- * The default prefixes are <tt>EXTBLAS_</tt> for functions and
- * <tt>extblas_</tt> for types.
- *
- * \endinternal
- */
-#define EXTBLAS_FUN( name ) SPCONCAT( EXTBLAS_, name )
-#define EXTBLAS_TYPE( name ) SPCONCAT( extblas_, name )
-/**@}*/
-
 /** A sparse vector. This is an implementation-specific extension. */
-typedef void * EXTBLAS_TYPE( sparse_vector );
+typedef void * extblas_sparse_vector;
 
 /**
  * Creates a handle to a new sparse vector that holds no entries.
@@ -72,7 +42,7 @@ typedef void * EXTBLAS_TYPE( sparse_vector );
  *
  * @returns An #extblas_sparse_vector that is under construction.
  */
-EXTBLAS_TYPE( sparse_vector ) EXTBLAS_FUN( dusv_begin )( const int n );
+extblas_sparse_vector EXTBLAS_dusv_begin( const int n );
 
 /**
  * Inserts a new nonzero entry into a sparse vector that is under construction.
@@ -90,8 +60,8 @@ EXTBLAS_TYPE( sparse_vector ) EXTBLAS_FUN( dusv_begin )( const int n );
  *
  * This is an implementation-specific extension.
  */
-int EXTBLAS_FUN( dusv_insert_entry )(
-	EXTBLAS_TYPE( sparse_vector ) x,
+int EXTBLAS_dusv_insert_entry(
+	extblas_sparse_vector x,
 	const double val,
 	const int index
 );
@@ -108,7 +78,7 @@ int EXTBLAS_FUN( dusv_insert_entry )(
  *
  * This is an implementation-specific extension.
  */
-int EXTBLAS_FUN( dusv_end )( EXTBLAS_TYPE( sparse_vector ) x );
+int EXTBLAS_dusv_end( extblas_sparse_vector x );
 
 /**
  * Destroys the given sparse vector.
@@ -122,7 +92,7 @@ int EXTBLAS_FUN( dusv_end )( EXTBLAS_TYPE( sparse_vector ) x );
  *
  * This is an implementation-specific extension.
  */
-int EXTBLAS_FUN( dusvds )( EXTBLAS_TYPE( sparse_vector ) x );
+int EXTBLAS_dusvds( extblas_sparse_vector x );
 
 /**
  * Retrieves the number of nonzeroes in a given finalised sparse vector.
@@ -137,7 +107,7 @@ int EXTBLAS_FUN( dusvds )( EXTBLAS_TYPE( sparse_vector ) x );
  *
  * This is an implementation-specific extension.
  */
-int EXTBLAS_FUN( dusv_nz )( const EXTBLAS_TYPE( sparse_vector ) x, int * nz );
+int EXTBLAS_dusv_nz( const extblas_sparse_vector x, int * nz );
 
 /**
  * Opens a sparse vector for read-out.
@@ -154,7 +124,7 @@ int EXTBLAS_FUN( dusv_nz )( const EXTBLAS_TYPE( sparse_vector ) x, int * nz );
  *
  * This is an implementation-specific extension.
  */
-int EXTBLAS_FUN( dusv_open )( const EXTBLAS_TYPE( sparse_vector ) x );
+int EXTBLAS_dusv_open( const extblas_sparse_vector x );
 
 /**
  * Retrieves a sparse vector entry.
@@ -184,8 +154,8 @@ int EXTBLAS_FUN( dusv_open )( const EXTBLAS_TYPE( sparse_vector ) x );
  *
  * This is an implementation-specific extension.
  */
-int EXTBLAS_FUN( dusv_get )(
-	const EXTBLAS_TYPE( sparse_vector ) x,
+int EXTBLAS_dusv_get(
+	const extblas_sparse_vector x,
 	double * const val, int * const ind
 );
 
@@ -200,7 +170,7 @@ int EXTBLAS_FUN( dusv_get )(
  *
  * This is an implementation-specific extension.
  */
-int EXTBLAS_FUN( dusv_close )( const EXTBLAS_TYPE( sparse_vector ) x );
+int EXTBLAS_dusv_close( const extblas_sparse_vector x );
 
 /**
  * Removes all entries from a finalised sparse vector.
@@ -213,9 +183,7 @@ int EXTBLAS_FUN( dusv_close )( const EXTBLAS_TYPE( sparse_vector ) x );
  *
  * This is an implementation-specific extension.
  */
-int EXTBLAS_FUN( dusv_clear )( EXTBLAS_TYPE( sparse_vector ) x );
-
-/**@}*/ // end doxygen grouping for SPARSEBLAS
+int EXTBLAS_dusv_clear( extblas_sparse_vector x );
 
 #ifdef __cplusplus
 } // end extern "C"
diff --git a/include/transition/spblas.h b/include/transition/spblas.h
new file mode 100644
index 000000000..4bcc41fc9
--- /dev/null
+++ b/include/transition/spblas.h
@@ -0,0 +1,211 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * This is the ALP implementation of a subset of the de-facto *_spblas.h Sparse
+ * BLAS standard. This implementation uses the spblas_ prefix; e.g.,
+ * #spblas_dcsrgemv.
+ *
+ * All functions defined have <tt>void</tt> return types. This implies two
+ * important factors:
+ *   1. when breaking the contract defined in the API, undefined behaviour will
+ *      occur.
+ *   2. this API hence does not permit the graceful handling of any errors that
+ *      ALP would normally recover gracefully from, such as, but not limited to,
+ *      the detection of dimension mismatches.
+ */
+
+#ifndef _H_ALP_SPBLAS
+#define _H_ALP_SPBLAS
+
+#include "blas_sparse_vec.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Performs sparse matrix--vector multiplication.
+ *
+ * This function computes one of
+ *  - \f$ y \to Ax \f$, or
+ *  - \f$ y \to A^Tx \f$.
+ *
+ * The matrix \f$ A \f$ is \f$ m \times n \f$ and holds \f$ k \f$ nonzeroes,
+ * and is assumed to be stored in Compressed Row Storage (CRS).
+ *
+ * @param[in] transa Either 'N' or 'T' for transposed ('T') or not ('N').
+ * @param[in] m      The row size of \f$ A \f$.
+ * @param[in] a      The nonzero value array of \f$ A \f$ of size \f$ k \f$.
+ * @param[in] ia     The row offset array of \f$ A \f$ of size \f$ m+1 \f$.
+ * @param[in] ja     The column indices of nonzeroes of \f$ A \f$. Must be of
+ *                   size \f$ k \f$.
+ * @param[in] x      The dense input vector \f$ x \f$ of length \f$ n \f$.
+ * @param[out] y     The dense output vector \f$ y \f$ of length \f$ m \f$.
+ *
+ * All memory regions must be pre-allocated and initialised.
+ */
+void spblas_dcsrgemv(
+	const char * transa,
+	const int * m,
+	const double * a, const int * ia, const int * ja,
+	const double * x,
+	double * y
+);
+
+/**
+ * Computes a variant of \f$ C \to \alpha AB+\beta C \f$.
+ *
+ * The matrix \f$ A \f$ is sparse and employs the Compressed Row Storage (CRS).
+ * The matrices \f$ B, C \f$ are dense. \f$ A \f$ has size \f$ m \times k \f$,
+ * \f$ B \f$ is \f$ k \times n \f$ and \f$ C \f$ is \f$ m \times n \f$.
+ *
+ * @param[in] transa    Either 'N' or 'T'.
+ * @param[in] m, n, k   Pointers to integers that equal \f$ m, n, k \f$, resp.
+ * @param[in] alpha     Pointer to the scalar \f$ \alpha \f$.
+ * @param[in] matdescra Has several entries. Going from first to last:
+ *                      Either 'G', 'S', 'H', 'T', 'A', or 'D' (similar to MatrixMarket)
+ *                      Either 'L' or 'U', in the case of 'T' (triangular)
+ *                      Either 'N' or 'U' for the diagonal type
+ *                      Either 'F' or 'C' (one or zero based indexing)
+ * @param[in] val       The values of the nonzeroes in \f$ A \f$.
+ * @param[in] indx      The column index of the nonzeroes in \f$ A \f$.
+ * @param[in] pntrb     The Compressed Row Storage (CRS) row start array.
+ * @param[in] pntre     The array \a pntrb shifted by one.
+ * @param[in] b         Pointer to the values of \f$ B \f$.
+ * @param[in] ldb       Leading dimension of \a b. If in row-major format, this
+ *                      should be \f$ n \f$. If in column-major format, this
+ *                      should be \f$ k \f$.
+ * @param[in] beta      Pointer to the scalar \f$ \beta \f$.
+ * @param[in] c         Pointer to the values of \f$ C \f$.
+ * @param[in] ldc       Leading dimension of \a c. If in row-major format, this
+ *                      should be \f$ n \f$. If in column-major format, this
+ *                      should be \f$ m \f$.
+ */
+void spblas_dcsrmm(
+	const char * transa,
+	const int * m, const int * n, const int * k,
+	const double * alpha,
+	const char * matdescra, const double * val, const int * indx,
+	const int * pntrb, const int * pntre,
+	const double * b, const int * ldb,
+	const double * beta,
+	double * c, const int * ldc
+);
+
+/**
+ * Computes \f$ C \to AB \f$ or \f$ C \to A^TB \f$, where all matrices are
+ * sparse and employ the Compressed Row Storage (CRS).
+ *
+ * The matrix \f$ C \f$ is \f$ m \times n \f$, the matrix \f$ A \f$ is
+ * \f$ m \times k \f$, and the matrix \f$ B \f$ is \f$ k \times n \f$.
+ *
+ * @param[in] trans Either 'N' or 'T', indicating whether A is to be transposed.
+ *                  The Hermitian operator on \a A is currently not supported;
+ *                  if required, please submit a ticket.
+ * @param[in] request A pointer to an integer that reads either 0, 1, or 2.
+ *                    0: the output memory area has been pre-allocated and is
+ *                       guaranteed sufficient for storing the output
+ *                    1: a symbolic phase will be executed that only modifies
+ *                       the row offset array \a ic. This array must have been
+ *                       pre-allocated and of sufficient size (\f$ m+1 \f$).
+ *                    2: assumes 1 has executed prior to this call and that the
+ *                       contents of the row offset arrays have not been
+ *                       modified. It also assumes that the column index and
+ *                       value arrays are (now) of sufficient size to hold the
+ *                       output.
+ * @param[in] sort A pointer to an integer value of 7. All other values are not
+ *                 supported by this interface. If you require it, please submit
+ *                 a ticket.
+ * @param[in] m,n,k Pointers to the integer sizes of \a A, \a B, and \a C.
+ * @param[in] a     The value array of nonzeroes in \a A.
+ * @param[in] ja    The column index array of nonzeroes in \a A.
+ * @param[in] ia    The row offset array of nonzeroes in \a A.
+ * @param[in] b, ib, jb  Similar for the nonzeroes in \a B.
+ * @param[out] c, ic, jc Similar for the nonzeroes in \a C. For these parameters
+ *                       depending on \a request there are various assumptions
+ *                       on capacity and, for \a ic, contents.
+ * @param[in] nzmax A pointer to an integer that holds the capacity of \a c and
+ *                  \a jc.
+ * @param[out] info The integer pointed to will be set to 0 if the call was
+ *                  successful, -1 if the routine only computed the required
+ *                  size of \a c and \a jc (stored in \a ic), and any positive
+ *                  integer when computation has proceeded successfully until
+ *                  (but not including) the returned integer.
+ */
+void spblas_dcsrmultcsr(
+	const char * trans, const int * request, const int * sort,
+	const int * m, const int * n, const int * k,
+	double * a, int * ja, int * ia,
+	double * b, int * jb, int * ib,
+	double * c, int * jc, int * ic,
+	const int * nzmax, int * info
+);
+
+/**
+ * Performs sparse matrix--sparse vector multiplication.
+ *
+ * This extension performs one of
+ *  -# \f$ y \to y + \alpha A x \f$, or
+ *  -# \f$ y \to y + \alpha A^T x \f$.
+ *
+ * Here, \f$ A \f$ is assumed in Compressed Row Storage (CRS), while \f$ x \f$
+ * and \f$ y \f$ are assumed to be using the #extblas_sparse_vector extension.
+ *
+ * This API follows loosely that of #spblas_dcsrmultcsr.
+ *
+ * @param[in] trans Either 'N' or 'T', indicating whether A is to be transposed.
+ *                  The Hermitian operator on \a A is currently not supported;
+ *                  if required, please submit a ticket.
+ * @param[in] request A pointer to an integer that reads either 0 or 1
+ *                    0: the output vector is guaranteed to have sufficient
+ *                       capacity to hold the output of the computation.
+ *                    1: a symbolic phase will be executed that only modifies
+ *                       the capacity of the output vector so that it is
+ *                       guaranteed to be able to hold the output of the
+ *                       requested computation.
+ * @param[in] m, n Pointers to integers equal to \f$ m, n \f$.
+ * @param[in] a  The value array of the nonzeroes in \f$ A \f$.
+ * @param[in] ja The column indices of the nonzeroes in \f$ A \f$.
+ * @param[in] ia The row offset arrays of the nonzeroes in \f$ A \f$.
+ * @param[in]  x The sparse input vector.
+ * @param[out] y The sparse output vector.
+ *
+ * This is an ALP implementation-specific extension.
+ */
+void extspblas_dcsrmultsv(
+	const char * trans, const int * request,
+	const int * m, const int * n,
+	const double * a, const int * ja, const int * ia,
+	const extblas_sparse_vector x,
+	extblas_sparse_vector y
+);
+
+/**
+ * An extension that frees any buffers the ALP/GraphBLAS-generated SparseBLAS
+ * library may have allocated.
+ */
+void extspblas_free();
+
+#ifdef __cplusplus
+} // end extern "C"
+#endif
+
+#endif // end _H_ALP_SPBLAS
+
diff --git a/tests/smoke/CMakeLists.txt b/tests/smoke/CMakeLists.txt
index 3e0cf830d..1bef2efe5 100644
--- a/tests/smoke/CMakeLists.txt
+++ b/tests/smoke/CMakeLists.txt
@@ -180,10 +180,10 @@ add_grb_executables( kcore_decomposition kcore_decomposition.cpp
 	BACKENDS reference reference_omp hyperdags nonblocking bsp1d hybrid
 )
 
-add_grb_executables( triangle_count triangle_count.cpp
-	ADDITIONAL_LINK_LIBRARIES test_utils_headers
-	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
-)
+# add_grb_executables( triangle_count triangle_count.cpp
+# 	ADDITIONAL_LINK_LIBRARIES test_utils_headers
+# 	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
+# )
 
 # targets to list and build the test for this category
 get_property( smoke_tests_list GLOBAL PROPERTY tests_category_smoke )
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index d8f0a74b4..c24026aa3 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -89,14 +89,6 @@ add_grb_executables( ewiseapply ewiseapply.cpp
 	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
 )
 
-add_grb_executables( tril tril.cpp
-	BACKENDS reference reference_omp hyperdags
-)
-
-add_grb_executables( triu triu.cpp
-	BACKENDS reference reference_omp hyperdags
-)
-
 add_grb_executables( eWiseMatrix eWiseMatrix.cpp
 	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
 )
@@ -133,10 +125,6 @@ add_grb_executables( matrixIterator matrixIterator.cpp
 	ADDITIONAL_LINK_LIBRARIES test_utils_headers
 )
 
-add_grb_executables( fold_matrix_to_scalar fold_matrix_to_scalar.cpp
-	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
-)
-
 add_grb_executables( doubleAssign doubleAssign.cpp
 	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
 )
@@ -263,10 +251,6 @@ add_grb_executables( eWiseApplyMatrixReference eWiseApplyMatrixReference.cpp
 	ADDITIONAL_LINK_LIBRARIES test_utils_headers
 )
 
-add_grb_executables( eWiseLambda eWiseLambda.cpp
-	BACKENDS reference reference_omp hyperdags nonblocking
-)
-
 add_grb_executables( outer outer.cpp
 	BACKENDS reference reference_omp hyperdags nonblocking
 )
@@ -312,10 +296,6 @@ add_grb_executables( adapterIterator adapterIterator.cpp
 	BACKENDS reference reference_omp hyperdags nonblocking bsp1d hybrid
 )
 
-add_grb_executables( vectorFromListConstructor vectorFromListConstructor.cpp
-	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
-)
-
 # the below targets test successfully when they compile -- they do not need to
 # be executed successfully as part of the unit test suite.