From 9b26e5686fb06b292c0aac6481544adeb052a43c Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Mon, 4 Jul 2022 15:48:34 +0200
Subject: [PATCH 01/28] fixing iterator advancement in buildVector()

---
 include/graphblas/reference/vector.hpp | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/include/graphblas/reference/vector.hpp b/include/graphblas/reference/vector.hpp
index f0db908b2..e6167a868 100644
--- a/include/graphblas/reference/vector.hpp
+++ b/include/graphblas/reference/vector.hpp
@@ -478,18 +478,19 @@ namespace grb {
 
 			// perform straight copy
 			fwd_iterator it = start;
-			for( size_t i = 0; start != end && i < _coordinates.size(); ++i ) {
+			for( size_t i = 0; it != end && i < _coordinates.size(); ++i ) {
 				// flag coordinate as assigned
 				if( _coordinates.assign( i ) ) {
 					if( descr & descriptors::no_duplicates ) {
 						return ILLEGAL;
 					}
 					// nonzero already existed, so fold into existing one
-					foldl( _raw[ i ], *it++, dup );
+					foldl( _raw[ i ], *it, dup );
 				} else {
 					// new nonzero, so overwrite
-					_raw[ i ] = static_cast< D >( *it++ );
+					_raw[ i ] = static_cast< D >( *it );
 				}
+				++it;
 			}
 
 			// write back final position
@@ -538,7 +539,9 @@ namespace grb {
 			nnz_iterator nnz = nnz_start;
 			ind_iterator ind = ind_start;
 			while( nnz != nnz_end || ind != ind_end ) {
-				const size_t i = static_cast< size_t >( *ind++ );
+				const size_t i = static_cast< size_t >( *ind );
+				++ind;
+
 				// sanity check
 				if( i >= _coordinates.size() ) {
 					return MISMATCH;
@@ -547,10 +550,11 @@ namespace grb {
 					if( descr & descriptors::no_duplicates ) {
 						return ILLEGAL;
 					}
-					foldl( _raw[ i ], *nnz++, dup );
+					foldl( _raw[ i ], *nnz, dup );
 				} else {
-					_raw[ i ] = static_cast< D >( *nnz++ );
+					_raw[ i ] = static_cast< D >( *nnz );
 				}
+				++nnz;
 			}
 
 			// done

From 56cd0f24e7851a0a9ce32c7644f3237f53897482 Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Fri, 23 Sep 2022 11:53:19 +0200
Subject: [PATCH 02/28] multiple fixes to HPCG benchmark: fixing uninitialized
 variable; printing with printf: unique line with multiple threads/processes
 and max precision with floats; renaming option: max_iter -> max-iter; adding
 descriptors for major operations and removing check for the color mask

---
 include/graphblas/algorithms/hpcg/hpcg.hpp          |  4 ++--
 .../graphblas/algorithms/hpcg/multigrid_v_cycle.hpp |  6 +++---
 .../algorithms/hpcg/red_black_gauss_seidel.hpp      |  5 ++---
 tests/smoke/hpcg.cpp                                | 13 ++++++++-----
 4 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/include/graphblas/algorithms/hpcg/hpcg.hpp b/include/graphblas/algorithms/hpcg/hpcg.hpp
index 6caf22a1c..eef0a3376 100644
--- a/include/graphblas/algorithms/hpcg/hpcg.hpp
+++ b/include/graphblas/algorithms/hpcg/hpcg.hpp
@@ -121,7 +121,7 @@ namespace grb {
 			ret = ret ? ret : grb::set( p, 0 );
 
 			ret = ret ? ret : grb::set( p, x );
-			ret = ret ? ret : grb::mxv( Ap, A, x, ring ); // Ap = A * x
+			ret = ret ? ret : grb::mxv< grb::descriptors::dense >( Ap, A, x, ring ); // Ap = A * x
 			assert( ret == SUCCESS );
 
 			ret = ret ? ret : grb::eWiseApply( r, b, Ap, minus ); // r = b - Ap;
@@ -186,7 +186,7 @@ namespace grb {
 #endif
 
 				ret = ret ? ret : grb::set( Ap, 0 );
-				ret = ret ? ret : grb::mxv( Ap, A, p, ring ); // Ap = A * p;
+				ret = ret ? ret : grb::mxv< grb::descriptors::dense >( Ap, A, p, ring ); // Ap = A * p;
 				assert( ret == SUCCESS );
 #ifdef HPCG_PRINT_STEPS
 				DBG_print_norm( Ap, "middle Ap" );
diff --git a/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp b/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
index f40296f91..7541a387f 100644
--- a/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
+++ b/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
@@ -76,7 +76,7 @@ namespace grb {
 				// actual coarsening, from  ncols(*coarsening_data->A) == *coarsening_data->system_size * 8
 				// to *coarsening_data->system_size
 				ret = ret ? ret : grb::set( coarsening_data.r, 0 );
-				ret = ret ? ret : grb::mxv( coarsening_data.r, coarsening_data.coarsening_matrix, coarsening_data.Ax_finer,
+				ret = ret ? ret : grb::mxv< grb::descriptors::dense >( coarsening_data.r, coarsening_data.coarsening_matrix, coarsening_data.Ax_finer,
 									  ring ); // r = coarsening_matrix * Ax_finer
 				return ret;
 			}
@@ -108,7 +108,7 @@ namespace grb {
 				// to nrows(x_fine)
 				ret = ret ? ret : set( coarsening_data.Ax_finer, 0 );
 
-				ret = ret ? ret : grb::mxv< grb::descriptors::transpose_matrix >( coarsening_data.Ax_finer, coarsening_data.coarsening_matrix, coarsening_data.z, ring );
+				ret = ret ? ret : grb::mxv< grb::descriptors::transpose_matrix | grb::descriptors::dense >( coarsening_data.Ax_finer, coarsening_data.coarsening_matrix, coarsening_data.z, ring );
 				assert( ret == SUCCESS );
 
 				ret = ret ? ret : grb::foldl( x_fine, coarsening_data.Ax_finer, ring.getAdditiveMonoid() ); // x_fine += Ax_finer;
@@ -216,7 +216,7 @@ namespace grb {
 #endif
 
 				ret = ret ? ret : grb::set( cd.Ax_finer, 0 );
-				ret = ret ? ret : grb::mxv( cd.Ax_finer, data.A, data.z, ring );
+				ret = ret ? ret : grb::mxv< grb::descriptors::dense >( cd.Ax_finer, data.A, data.z, ring );
 				assert( ret == SUCCESS );
 
 				ret = ret ? ret : compute_coarsening( data.r, cd, ring, minus );
diff --git a/include/graphblas/algorithms/hpcg/red_black_gauss_seidel.hpp b/include/graphblas/algorithms/hpcg/red_black_gauss_seidel.hpp
index 718e5015c..6fdc3c9a3 100644
--- a/include/graphblas/algorithms/hpcg/red_black_gauss_seidel.hpp
+++ b/include/graphblas/algorithms/hpcg/red_black_gauss_seidel.hpp
@@ -29,7 +29,6 @@
 
 #include <graphblas.hpp>
 
-
 namespace grb {
 	namespace algorithms {
 		namespace internal {
@@ -77,11 +76,11 @@ namespace grb {
 					                // nonetheless, it is left not to violate the semantics of RBGS in case also the false values
 					                // had been initialized (in which case the check is fundamental); if only true values were initialized,
 					                // we expect CPU branch prediction to neutralize the branch cost
-									if( color_mask[ i ] ) {
+									// if( color_mask[ i ] ) {
 										IOType d = A_diagonal[ i ];
 										IOType v = r[ i ] - smoother_temp[ i ] + x[ i ] * d;
 										x[ i ] = v / d;
-									}
+									// }
 								},
 								color_mask, x, r, smoother_temp, A_diagonal );
 				assert( ret == SUCCESS );
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index d84c157e0..5b34d9895 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -41,6 +41,8 @@
 // here we define a custom macro and do not use NDEBUG since the latter is not defined for smoke tests
 #ifdef HPCG_PRINT_STEPS
 
+#include <cstdio>
+
 // HPCG_PRINT_STEPS requires defining the following symbols
 
 /**
@@ -176,15 +178,16 @@ template< typename T,
 		class Ring = Semiring< grb::operators::add< T >, grb::operators::mul< T >, grb::identities::zero, grb::identities::one >
 	>
 void print_norm( const grb::Vector< T > & r, const char * head, const Ring & ring ) {
-	T norm;
-	RC ret = grb::dot( norm, r, r, ring ); // residual = r' * r;
+	T norm = 0;
+	RC ret = grb::dot( norm, r, r, ring ); // norm = r' * r;
 	(void)ret;
 	assert( ret == SUCCESS );
-	std::cout << ">>> ";
 	if( head != nullptr ) {
 		std::cout << head << ": ";
+		printf(">>> %s: %lf\n", head, norm );
+	} else {
+		printf(">>> %lf\n", norm );
 	}
-	std::cout << norm << std::endl;
 }
 #endif
 
@@ -377,7 +380,7 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 			" by the minimum system dimension" )
 		.add_optional_argument( "--test-rep", sim_in.test_repetitions, grb::config::BENCHMARKING::inner(), "consecutive test repetitions before benchmarking" )
 		.add_optional_argument( "--init-iter", outer_iterations, grb::config::BENCHMARKING::outer(), "test repetitions with complete initialization" )
-		.add_optional_argument( "--max_iter", sim_in.max_iterations, MAX_ITERATIONS_DEF, "maximum number of HPCG iterations" )
+		.add_optional_argument( "--max-iter", sim_in.max_iterations, MAX_ITERATIONS_DEF, "maximum number of HPCG iterations" )
 		.add_optional_argument( "--max-residual-norm", max_residual_norm, MAX_NORM,
 			"maximum norm for the residual to be acceptable (does NOT limit "
 			"the execution of the algorithm)" )

From d2ad73a5e38b679a61c20e6a9d0921622aede26a Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Wed, 6 Apr 2022 14:55:47 +0200
Subject: [PATCH 03/28] adding geometry generator for N-dimensional systems,
 also with halo; adding old files for testing; little test; adding test for
 generator iterators; fixing test to be successful; memory pre-allocation;
 polishing geometry utils, using them to build HPCG input and adding test

---
 include/graphblas/algorithms/hpcg/hpcg.hpp    |  20 +-
 .../algorithms/hpcg/matrix_building_utils.hpp | 110 ++-
 .../algorithms/hpcg/ndim_matrix_builders.hpp  | 736 +++++++++---------
 .../hpcg/old_matrix_building_utils.hpp        | 173 ++++
 .../hpcg/old_ndim_matrix_builders.hpp         | 548 +++++++++++++
 .../algorithms/hpcg/system_building_utils.hpp |  59 +-
 .../utils/geometry/array_vector_storage.hpp   |  67 ++
 .../utils/geometry/generic_vector_storage.hpp | 117 +++
 .../linearized_halo_ndim_geometry.hpp         | 232 ++++++
 .../linearized_halo_ndim_iterator.hpp         | 377 +++++++++
 .../geometry/linearized_halo_ndim_system.hpp  | 111 +++
 .../geometry/linearized_ndim_iterator.hpp     | 178 +++++
 .../utils/geometry/linearized_ndim_system.hpp | 174 +++++
 .../graphblas/utils/geometry/ndim_system.hpp  |  69 ++
 .../graphblas/utils/geometry/ndim_vector.hpp  | 122 +++
 tests/smoke/hpcg.cpp                          | 306 +++++++-
 16 files changed, 2999 insertions(+), 400 deletions(-)
 create mode 100644 include/graphblas/algorithms/hpcg/old_matrix_building_utils.hpp
 create mode 100644 include/graphblas/algorithms/hpcg/old_ndim_matrix_builders.hpp
 create mode 100644 include/graphblas/utils/geometry/array_vector_storage.hpp
 create mode 100644 include/graphblas/utils/geometry/generic_vector_storage.hpp
 create mode 100644 include/graphblas/utils/geometry/linearized_halo_ndim_geometry.hpp
 create mode 100644 include/graphblas/utils/geometry/linearized_halo_ndim_iterator.hpp
 create mode 100644 include/graphblas/utils/geometry/linearized_halo_ndim_system.hpp
 create mode 100644 include/graphblas/utils/geometry/linearized_ndim_iterator.hpp
 create mode 100644 include/graphblas/utils/geometry/linearized_ndim_system.hpp
 create mode 100644 include/graphblas/utils/geometry/ndim_system.hpp
 create mode 100644 include/graphblas/utils/geometry/ndim_vector.hpp

diff --git a/include/graphblas/algorithms/hpcg/hpcg.hpp b/include/graphblas/algorithms/hpcg/hpcg.hpp
index eef0a3376..492eb038d 100644
--- a/include/graphblas/algorithms/hpcg/hpcg.hpp
+++ b/include/graphblas/algorithms/hpcg/hpcg.hpp
@@ -31,6 +31,8 @@
 #include "hpcg_data.hpp"
 #include "multigrid_v_cycle.hpp"
 
+#include <graphblas/utils/Timer.hpp>
+
 
 namespace grb {
 	namespace algorithms {
@@ -102,6 +104,7 @@ namespace grb {
 			const ResidualType tolerance,
 			size_t &iterations,
 			ResidualType &norm_residual,
+			bool print_iter_stats,
 			const Ring &ring = Ring(),
 			const Minus &minus = Minus()
 		) {
@@ -139,6 +142,8 @@ namespace grb {
 			ResidualType old_r_dot_z { 0.0 }, r_dot_z { 0.0 }, beta { 0.0 };
 			size_t iter { 0 };
 
+			grb::utils::Timer timer;
+
 #ifdef HPCG_PRINT_STEPS
 			DBG_print_norm( p, "start p" );
 			DBG_print_norm( Ap, "start Ap" );
@@ -150,8 +155,17 @@ namespace grb {
 				DBG_println( "========= iteration " << iter << " =========" );
 #endif
 				if( with_preconditioning ) {
-					ret = ret ? ret : internal::multi_grid( data, data.coarser_level, presmoother_steps, postsmoother_steps, ring, minus );
+					if( print_iter_stats ) {
+						timer.reset();
+					}
+					ret = ret ? ret : internal::multi_grid( data, data.coarser_level,
+						presmoother_steps, postsmoother_steps, ring, minus );
 					assert( ret == SUCCESS );
+					if( print_iter_stats ) {
+						double duration = timer.time();
+						std::cout << "iteration, pre-conditioner: " << iter << ","
+							<< duration << std::endl;
+					}
 				} else {
 					ret = ret ? ret : grb::set( z, r ); // z = r;
 					assert( ret == SUCCESS );
@@ -215,6 +229,10 @@ namespace grb {
 
 				norm_residual = std::sqrt( norm_residual );
 
+				if( print_iter_stats ) {
+					std::cout << "iteration, residual: " << iter << "," << norm_residual << std::endl;
+				}
+
 				++iter;
 			} while( iter < max_iterations && norm_residual / norm_residual_initial > tolerance && ret == SUCCESS );
 
diff --git a/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp b/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
index 1facabe49..2dfeabc49 100644
--- a/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
@@ -31,15 +31,65 @@
 #include <numeric>
 #include <stdexcept>
 #include <utility>
+#include <limits.h>
 
 #include <graphblas.hpp>
 
 #include "ndim_matrix_builders.hpp"
 
 
+#define PAR
+
+
+
+#ifndef PAR
+#include <graphblas/algorithms/hpcg/old_ndim_matrix_builders.hpp>
+#endif
+
+
 namespace grb {
 	namespace algorithms {
 
+		template< typename T > void partition_nonzeroes(
+				T num_nonzeroes,
+				T& first_offset,
+				T& last_offset
+		) {
+			const size_t num_procs{ spmd<>::nprocs() };
+			const T per_process{ ( num_nonzeroes + num_procs - 1 ) / num_procs }; // round up
+			first_offset = std::min( per_process * static_cast< T >( spmd<>::pid() ), num_nonzeroes );
+			last_offset = std::min( first_offset + per_process, num_nonzeroes );
+		}
+
+		template< typename IterT > void partition_iteration_range(
+			size_t num_nonzeroes,
+			IterT &begin,
+			IterT &end
+		) {
+			assert( num_nonzeroes == static_cast< size_t >( end - begin ) );
+			size_t first, last;
+			partition_nonzeroes( num_nonzeroes, first, last );
+			if( last < num_nonzeroes ) {
+				end = begin;
+				end += last;
+			}
+			begin += first;
+		}
+
+#ifndef PAR
+		template< typename T > void partition_rows(
+				T rows,
+				T& first_row,
+				T& last_row
+		) {
+			const size_t num_procs{ spmd<>::nprocs() };
+			const T per_process{ ( rows + num_procs - 1 ) / num_procs }; // round up
+			first_row = std::min( per_process * static_cast< T >( spmd<>::pid() ), rows );
+			last_row = std::min( first_row + per_process, rows );
+		}
+#endif
+
+
 		/**
 		 * @brief Builds a \p DIMS -dimensional system matrix for HPCG simulation.
 		 *
@@ -60,16 +110,36 @@ namespace grb {
 		template< std::size_t DIMS, typename T, enum grb::Backend B >
 		grb::RC build_ndims_system_matrix( grb::Matrix< T, B > & M, const std::array< std::size_t, DIMS > & sys_sizes, std::size_t halo_size, T diag_value, T non_diag_value ) {
 			static_assert( DIMS > 0, "DIMS must be > 0" );
-			std::size_t n { std::accumulate( sys_sizes.cbegin(), sys_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
+			size_t n { std::accumulate( sys_sizes.cbegin(), sys_sizes.cend(), 1UL, std::multiplies< size_t >() ) };
 			if( grb::nrows( M ) != n || grb::nrows( M ) != grb::ncols( M ) ) {
 				throw std::invalid_argument( "wrong matrix dimensions: matrix should "
 											"be square"
 											" and in accordance with given system "
 											"sizes" );
 			}
-			grb::algorithms::matrix_generator_iterator< DIMS, T > begin( sys_sizes, 0UL, halo_size, diag_value, non_diag_value );
-			grb::algorithms::matrix_generator_iterator< DIMS, T > end( sys_sizes, n, halo_size, diag_value, non_diag_value );
-			return buildMatrixUnique( M, begin, end, grb::IOMode::SEQUENTIAL );
+#ifdef PAR
+			using coord_t = unsigned;
+			if( n > std::numeric_limits< coord_t >::max() ) {
+				throw std::domain_error( "CoordT cannot store the matrix coordinates" );
+			}
+			std::array< coord_t, DIMS > _sys_sizes;
+			for( size_t i = 0; i < DIMS; i++ ) _sys_sizes[i] = sys_sizes[i];
+			grb::algorithms::hpcg_builder< DIMS, coord_t, T > hpcg_system( _sys_sizes, halo_size );
+			grb::algorithms::matrix_generator_iterator< DIMS, coord_t, T > begin(
+				hpcg_system.make_begin_iterator( diag_value, non_diag_value ) );
+			grb::algorithms::matrix_generator_iterator< DIMS, coord_t, T > end(
+				hpcg_system.make_end_iterator( diag_value, non_diag_value )
+			);
+			partition_iteration_range( hpcg_system.system_size(), begin, end );
+
+			// std::cout << "num nonzeroes " << ( end - begin ) << std::endl;
+#else
+			size_t first_row, last_row;
+			partition_rows( n, first_row, last_row );
+			grb::algorithms::old::matrix_generator_iterator< DIMS, T > begin( sys_sizes, first_row, halo_size, diag_value, non_diag_value );
+			grb::algorithms::old::matrix_generator_iterator< DIMS, T > end( sys_sizes, last_row, halo_size, diag_value, non_diag_value );
+#endif
+			return buildMatrixUnique( M, begin, end, grb::IOMode::PARALLEL );
 		}
 
 		/**
@@ -97,7 +167,7 @@ namespace grb {
 		template< std::size_t DIMS, typename T, enum grb::Backend B >
 		grb::RC build_ndims_coarsener_matrix( grb::Matrix< T, B > & M, const std::array< std::size_t, DIMS > & coarser_sizes, const std::array< std::size_t, DIMS > & finer_sizes ) {
 			static_assert( DIMS > 0, "DIMS must be > 0" );
-			std::size_t const rows { std::accumulate( coarser_sizes.cbegin(), coarser_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
+			size_t const rows { std::accumulate( coarser_sizes.cbegin(), coarser_sizes.cend(), 1UL, std::multiplies< size_t >() ) };
 			for( std::size_t i { 0 }; i < coarser_sizes.size(); i++ ) {
 				std::size_t step = finer_sizes[ i ] / coarser_sizes[ i ];
 				if( step * coarser_sizes[ i ] != finer_sizes[ i ] ) {
@@ -112,10 +182,32 @@ namespace grb {
 											" with rows == <product of coarser sizes> "
 											"and cols == <product of finer sizes>" );
 			}
-
-			grb::algorithms::coarsener_generator_iterator< DIMS, T > begin( coarser_sizes, finer_sizes, 0 );
-			grb::algorithms::coarsener_generator_iterator< DIMS, T > end( coarser_sizes, finer_sizes, rows );
-			return buildMatrixUnique( M, begin, end, grb::IOMode::SEQUENTIAL );
+#ifdef PAR
+			using coord_t = unsigned;
+			if( rows > std::numeric_limits< coord_t >::max() ) {
+				throw std::domain_error( "CoordT cannot store the row coordinates" );
+			}
+			if( cols > std::numeric_limits< coord_t >::max() ) {
+				throw std::domain_error( "CoordT cannot store the column coordinates" );
+			}
+			std::array< coord_t, DIMS > _coarser_sizes, _finer_sizes;
+			for( size_t i = 0; i < DIMS; i++ ) {
+				_coarser_sizes[i] = coarser_sizes[i];
+				_finer_sizes[i] = finer_sizes[i];
+			}
+			grb::algorithms::hpcg_coarsener_builder< DIMS, coord_t, T > coarsener( _coarser_sizes, _finer_sizes );
+			grb::algorithms::coarsener_generator_iterator< DIMS, coord_t, T > begin( coarsener.make_begin_iterator() );
+			grb::algorithms::coarsener_generator_iterator< DIMS, coord_t, T > end(
+				coarsener.make_end_iterator()
+			);
+			partition_iteration_range( coarsener.system_size(), begin, end );
+#else
+			size_t first_row, last_row;
+			partition_rows( rows, first_row, last_row );
+			grb::algorithms::old::coarsener_generator_iterator< DIMS, T > begin( coarser_sizes, finer_sizes, first_row );
+			grb::algorithms::old::coarsener_generator_iterator< DIMS, T > end( coarser_sizes, finer_sizes, last_row );
+#endif
+			return buildMatrixUnique( M, begin, end, grb::IOMode::PARALLEL );
 		}
 
 		/**
diff --git a/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp b/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp
index c00eb65b2..06672d110 100644
--- a/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp
+++ b/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp
@@ -44,200 +44,91 @@
 #include <type_traits>
 #include <utility>
 #include <vector>
+#include <cstddef>
+#include <iterator>
 
+#include <graphblas/utils/geometry/linearized_halo_ndim_system.hpp>
 
-namespace grb {
+#include <graphblas/utils/geometry/linearized_ndim_system.hpp>
+#include <graphblas/utils/geometry/linearized_ndim_iterator.hpp>
+#include <graphblas/utils/geometry/array_vector_storage.hpp>
 
-	namespace algorithms {
 
-		/**
-		 * @brief Base class that iterates on DIMS dimensions starting from the first one.
-		 *
-		 * The coordinates are assumed to generate the row number in a matrix whose number of rows is
-		 * the product of all sizes. This class generates row numbers for physical problems described as
-		 * systems of linear equations in an n-dimensional space.
-		 *
-		 * Example of iterations in a 3D (x, y, z) system of size (4,3,2), with generated row numbers
-		 * reported as '=> ROW':
-		 * - z[0]
-		 * - y[0]
-		 * - x[0] => 0, x[1] => 1, x[2] => 2, x[3] => 3
-		 * - y[1]
-		 * - x[0] => 4, x[1] => 5, x[2] => 6, x[3] => 7
-		 * - y[2]
-		 * - x[0] => 8, x[1] => 9, x[2] => 10, x[3] => 11
-		 * - z[1]
-		 * - y[0]
-		 * - x[0] => 12, x[1] => 13, x[2] => 14, x[3] => 15
-		 * - y[1]
-		 * - x[0] => 16, x[1] => 17, x[2] => 18, x[3] => 19
-		 * - y[2]
-		 * - x[0] => 20, x[1] => 21, x[2] => 22, x[3] => 23
-		 *
-		 * The main goal of this class is to be derived by other classes to generate matrices in an
-		 * STL-iterator-fashion; hence, this class contains all the code for basic coordinate-to-row-column
-		 * conversion in \p DIM dimensions and the basic logic to increment the row number.
-		 *
-		 * @tparam DIMS number os dimensions of the system
-		 */
-		template< std::size_t DIMS >
-		struct row_generator {
 
-			using RowIndexType = std::size_t; ///< numeric type of rows
-			using array_t = std::array< RowIndexType,
-				DIMS >; ///< type for the array storing the coordinates.
+namespace grb {
 
-			const array_t physical_sizes; ///< size of each dimension, starting from the one to be explored first
+	namespace algorithms {
 
-			/**
-			 * @brief Construct a new row generator object
-			 * @param[in] _sizes array of sizes of each dimension; no dimension should be 0, otherwise an exception
-			 *                   is thrown
-			 * @param[in] first_row first row to iterate from; it is allowed to be beyond the matrix size, e.g. to create
-			 *                      an end iterator (no check occurs)
-			 */
-			row_generator( const array_t & _sizes, RowIndexType first_row ) : physical_sizes( _sizes ) {
-				static_assert( DIMS > 0, "DIMS should be higher than 0" );
-				for( const auto i : _sizes ) {
-					if( i == static_cast< RowIndexType >( 0U ) ) {
-						throw std::invalid_argument( "All dimension sizes must "
-													 "be > 0" );
-					}
-				}
-				row_to_coords( first_row );
-			}
+		template<
+			size_t DIMS,
+			typename CoordT,
+			typename T
+		>
+		class hpcg_builder;
+
+		template<
+			size_t DIMS,
+			typename CoordT,
+			typename T
+		>
+		struct matrix_generator_iterator {
+
+			using RowIndexType = CoordT; ///< numeric type of rows
+			using ColumnIndexType = CoordT;
+			using ValueType = T;
+			friend hpcg_builder< DIMS, CoordT, T >;
 
-			row_generator( const row_generator & o ) = default;
+			using linear_system_t = grb::utils::geometry::linearized_halo_ndim_system< RowIndexType, DIMS >;
+			using __iter_t = typename linear_system_t::iterator;
+			using self_t = matrix_generator_iterator< DIMS, CoordT, T >;
 
-			row_generator( row_generator && o ) = default;
+			struct __value {
 
-		protected:
-			// x: row_coords[0], y: row_coords[1], z: row_coords[2], ...
-			array_t row_coords; ///< n-D coordinates from which to compute the row
+				friend self_t;
 
-			/**
-			 * @brief converts a row number into a n-D coordinates according to the sizes in #physical_sizes
-			 *
-			 * In case the input is higher than the nunber of rows, the last coordinate is allowed to
-			 * go beyond its physical size. E.g., if the system has size (4,3,2) and \p rowcol is 24,
-			 * the coordinates are (0,0,3).
-			 *
-			 * @param[in] rowcol row number to convert; it can be any number
-			 */
-			void row_to_coords( RowIndexType rowcol ) {
-				std::size_t s = 1;
-				for( std::size_t i { 0 }; i < row_coords.size() - 1; i++ )
-					s *= physical_sizes[ i ];
-
-				for( typename array_t::size_type i { row_coords.size() - 1 }; i > 0; i-- ) {
-					row_coords[ i ] = rowcol / s;
-					rowcol -= row_coords[ i ] * s;
-					s /= physical_sizes[ i ];
-				}
-				row_coords[ 0 ] = rowcol % physical_sizes[ 0 ];
-			}
+				__value(
+					ValueType diag,
+					ValueType non_diag,
+					RowIndexType i,
+					ColumnIndexType j
+				) noexcept :
+					diagonal_value( diag ),
+					non_diagonal_value( non_diag ),
+					_i( i ),
+					_j( j )
+				{}
 
-			/**
-			 * @brief Pure function converting an array of coordinates into a row number, based on #physical_sizes.
-			 * @param a the #array_t array of coordinates to convert
-			 * @return #RowIndexType the row corresponding to the coordinates in \p a
-			 */
-			RowIndexType coords_to_rowcol( const array_t & a ) const {
-				RowIndexType row { 0 };
-				RowIndexType s { 1 };
-				for( typename array_t::size_type i { 0 }; i < a.size(); i++ ) {
-					row += s * a[ i ];
-					s *= physical_sizes[ i ];
-				}
-				return row;
-			}
+				__value( const __value & ) = default;
 
-			/**
-			 * @brief Increment #row_coords in order to move to the next coordinate (according to the
-			 * n-dimensional iteration order) and update #current_row accordingly.
-			 *
-			 * To be used by derived classes in order to generate the matrix, e.g. via the \c operator()++
-			 * operator prescribed for STL-like iterators.
-			 */
-			void increment_row() {
-				bool rewind;
-				typename array_t::size_type i { 0 };
-				do {
-					typename array_t::value_type & coord = row_coords[ i ];
-					// must rewind dimension if we wrap-around
-					typename array_t::value_type new_coord = ( coord + 1 ) % physical_sizes[ i ];
-					rewind = new_coord < coord;
-					coord = new_coord;
-					++i;
-				} while( rewind && i < row_coords.size() - 1 ); // rewind only the first N-1 coordinates
-
-				// if we still have to rewind, increment the last coordinate, which is unbounded
-				if( rewind ) {
-					row_coords.back()++;
+				__value & operator=( const __value & ) = default;
+
+				inline RowIndexType i() const { return _i; }
+				inline ColumnIndexType j() const { return _j; }
+				inline ValueType v() const {
+					return j() == i() ? diagonal_value : non_diagonal_value;
 				}
-			}
-		};
 
-		// ===============================================================
+			private:
+				ValueType diagonal_value;     ///< value to be emitted when the object has moved to the diagonal
+				ValueType non_diagonal_value; ///< value to emit outside of the diagonal
+				RowIndexType _i;
+				ColumnIndexType _j;
+			};
 
-		/**
-		 * @brief STL-like iterable class to generate the values for a matrix by iterating in an n-dimensional
-		 * space along the coordinates.
-		 *
-		 * For each \f$ X=(x0, x1, ...,xn) \f$ point of the underlying (n+1)-dimensional space,
-		 * this class iterates through the points of the n-dimensional halo of radius \p halo around \f$ X \f$,
-		 * generating the row number corresponding to \f$ X \f$ and the column number corresponding to
-		 * each halo point. At each coordinate \code (row, col) \endcode generated this way, the corresponding matrix value
-		 * being generated depends on whether \code row == col \endcode.
-		 *
-		 * @tparam DIMS number of dimensions of the system
-		 * @tparam HALO halo size, determining the number of points to iterate around and thus the column coordinates
-		 * @tparam T type of matrix values
-		 */
-		template< std::size_t DIMS, typename T = double >
-		struct matrix_generator_iterator : public row_generator< DIMS > {
+			// interface for std::random_access_iterator
+			using iterator_category = std::random_access_iterator_tag;
+			using value_type = __value;
+			using pointer = value_type;
+			using reference = value_type;
+			using difference_type = typename __iter_t::difference_type;
 
-			using RowIndexType = typename row_generator< DIMS >::RowIndexType;
-			using ColumnIndexType = typename row_generator< DIMS >::RowIndexType;
-			using ValueType = T;
-			using array_t = typename row_generator< DIMS >::array_t;
-			using value_type = std::pair< std::pair< RowIndexType, ColumnIndexType >, T >;
-
-			// halo may in future become a DIM-size array to iterate in arbitrary shapes
-			const RowIndexType halo;              ///< number of points per dimension to iterate around
-			const ValueType diagonal_value;     ///< value to be emitted when the object has moved to the diagonal
-			const ValueType non_diagonal_value; ///< value to emit outside of the diagonal
+			matrix_generator_iterator( const self_t & ) = default;
 
-			/**
-			 * @brief Construct a new \c matrix_generator_iterator object, setting the current row as \p row
-			 * and emitting \p diag if the iterator has moved on the diagonal, \p non_diag otherwise.
-			 *
-			 * @param sizes array with the sizes along the dimensions
-			 * @param row current row to initialize the matrix on
-			 * @param _halo halo of points to iterate around; must be > 0
-			 * @param diag value to emit when on the diagonal
-			 * @param non_diag value to emit outside the diagonal
-			 */
-			matrix_generator_iterator( const array_t & sizes, RowIndexType row, RowIndexType _halo, ValueType diag, ValueType non_diag ) :
-				row_generator< DIMS >( sizes, row ), halo( _halo ), diagonal_value( diag ), non_diagonal_value( non_diag ) {
-				if( halo <= 0 ) {
-					throw std::invalid_argument( "halo should be higher than 0" );
-				}
-				for( const auto i : sizes ) {
-					if( i < static_cast< RowIndexType >( 2 * halo + 1 ) ) {
-						throw std::invalid_argument( "Iteration halo goes beyond system sizes" );
-					}
-				}
-				current_values.first.first = row;
-				update_column_max_values();
-				reset_all_columns();
-				current_values.first.second = this->coords_to_rowcol( col_coords );
-				current_values.second = v();
-			}
+			matrix_generator_iterator( self_t && ) = default;
 
-			matrix_generator_iterator( const matrix_generator_iterator & o ) = default;
+			self_t & operator=( const self_t & ) = default;
 
-			matrix_generator_iterator( matrix_generator_iterator && o ) = default;
+			self_t & operator=( self_t && ) = default;
 
 			/**
 			 * @brief Increments the iterator by moving coordinates to the next (row, column) to iterate on.
@@ -248,22 +139,22 @@ namespace grb {
 			 *
 			 * @return matrix_generator_iterator<DIMS, T>& \c this object, with the updated state
 			 */
-			matrix_generator_iterator< DIMS, T > & operator++() {
-				bool must_rewind = increment_column();
-				if( must_rewind ) {
-					this->increment_row();
-					// after changing row, we must find the first non-zero column
-					reset_all_columns();
-					current_values.first.first = this->coords_to_rowcol( this->row_coords );
-					update_column_max_values();
-				}
-				// trigger column update after row update, as a row update
-				// triggers a column update
-				current_values.first.second = this->coords_to_rowcol( col_coords );
-				current_values.second = this->v();
+			self_t & operator++() noexcept {
+				(void) ++_sys_iter;
+				update_coords();
 				return *this;
 			}
 
+			self_t & operator+=( size_t offset ) {
+				_sys_iter += offset;
+				update_coords();
+				return *this;
+			}
+
+			difference_type operator-( const self_t &other ) const {
+				return this->_sys_iter - other._sys_iter;
+			}
+
 			/**
 			 * @brief Operator to compare \c this against \p o  and return whether they differ.
 			 *
@@ -271,11 +162,8 @@ namespace grb {
 			 * @return true of the row or the column is different between \p o and \c this
 			 * @return false if both row and column of \p o and \c this are equal
 			 */
-			bool operator!=( const matrix_generator_iterator< DIMS, T > & o ) const {
-				if( o.i() != this->i() ) {
-					return true;
-				}
-				return o.j() != this->j();
+			bool operator!=( const self_t &o ) const {
+				return this->_sys_iter != o._sys_iter;
 			}
 
 			/**
@@ -285,8 +173,8 @@ namespace grb {
 			 * @return true of the row or the column is different between \p o and \c this
 			 * @return false if both row and column of \p o and \c this are equal
 			 */
-			bool operator==( const matrix_generator_iterator< DIMS, T > & o ) const {
-				return o.i() == this->i() && o.j() == this->j();
+			bool operator==( const self_t &o ) const {
+				return ! operator!=( o );
 			}
 
 			/**
@@ -295,22 +183,26 @@ namespace grb {
 			 * Useful when building the matrix by copying the triple of coordinates and value,
 			 * like for the BSP1D backend.
 			 */
-			const value_type & operator*() const {
-				return current_values;
+			reference operator*() const {
+				return _val;
+			}
+
+			pointer operator->() const {
+				return &_val;
 			}
 
 			/**
 			 * @brief Returns current row.
 			 */
 			inline RowIndexType i() const {
-				return current_values.first.first;
+				return _val.i();
 			}
 
 			/**
 			 * @brief Returns current column.
 			 */
 			inline ColumnIndexType j() const {
-				return current_values.first.second;
+				return _val.j();
 			}
 
 			/**
@@ -320,80 +212,143 @@ namespace grb {
 			 * #i() \code == \endcode \code this-> \endcode #j()), #non_diagonal_value otherwise
 			 */
 			inline ValueType v() const {
-				return j() == i() ? diagonal_value : non_diagonal_value;
+				return _val.v();
 			}
 
 		private:
-			// offsets w.r.t. rows
-			array_t col_coords;        ///< coordinates corresponding to current column
-			array_t column_max_values; ///< maximum values for the column coordinates, to stop column increment
-			//// and reset the column coordinates
-			value_type current_values; ///< triple storing the current value for row, column and matrix element
+			value_type _val;
+			const linear_system_t *_lin_system;
+			__iter_t _sys_iter;
 
 			/**
-			 * @brief Updates the maximum values each column coordinate can reach, according to the row coordinates.
+			 * @brief Construct a new \c matrix_generator_iterator object positioned at the beginning of \p system,
+			 * emitting \p diag if the iterator has moved on the diagonal, \p non_diag otherwise.
 			 *
-			 * To be called after each row coordinates update.
+			 * @param system linearized halo system to iterate on, starting from its beginning;
+			 *               only its address is stored, no copy is made
+			 * @param diag value to emit when on the diagonal
+			 * @param non_diag value to emit outside the diagonal
 			 */
-			void update_column_max_values() {
-				for( std::size_t i { 0 }; i < column_max_values.size(); i++ ) {
-					column_max_values[ i ] = std::min( this->physical_sizes[ i ] - 1, this->row_coords[ i ] + halo );
-				}
+			matrix_generator_iterator(
+				const linear_system_t &system,
+				ValueType diag,
+				ValueType non_diag
+			) noexcept :
+				_val( diag, non_diag, 0, 0 ),
+				_lin_system( &system ),
+				_sys_iter( system.begin() )
+			{
+				update_coords();
 			}
 
-			/**
-			 * @brief Resets the value of column dimension \p dim to the first possible value.
-			 *
-			 * The final value of #col_coords[dim] depends on the current row (#row_coords) and on the \p halo
-			 * and is \f$ max(0, \f$ #row_coords \f$[dim])\f$.
-			 *
-			 * @param dim the dimension to reset
-			 */
-			void reset_column_coords( std::size_t dim ) {
-				// cannot use std::max because row_coords is unsigned and can wrap-around
-				col_coords[ dim ] = this->row_coords[ dim ] <= halo ? 0 : ( this->row_coords[ dim ] - halo );
+			void update_coords() {
+				_val._i = _sys_iter->get_element_linear();
+				_val._j = _sys_iter->get_neighbor_linear();
 			}
+		};
 
-			/**
-			 * @brief resets all values in #col_coords to the initial coordinates,
-			 * iterating from on the current row.
-			 */
-			void reset_all_columns() {
-				for( std::size_t i { 0 }; i < col_coords.size(); i++ ) {
-					reset_column_coords( i );
+
+		template<
+			size_t DIMS,
+			typename CoordT,
+			typename T
+		>
+		class hpcg_builder {
+
+			using system_t = grb::utils::geometry::linearized_halo_ndim_system< CoordT, DIMS >;
+
+			system_t system;
+			// const grb::utils::geometry::linearized_halo_ndim_system< CoordT, DIMS > system;
+			const CoordT halo;
+
+		public:
+
+			using hpcg_sys_iterator = matrix_generator_iterator< DIMS, CoordT, T >;
+
+			hpcg_builder(
+				const std::array< CoordT, DIMS > &sizes,
+				CoordT _halo
+			) :
+				system( sizes, _halo ),
+				halo( _halo )
+			{
+				if( _halo <= 0 ) {
+					throw std::invalid_argument( "halo should be higher than 0" );
+				}
+				for( const auto i : sizes ) {
+					if( i < 2 * _halo + 1 ) {
+						throw std::invalid_argument( "Iteration halo goes beyond system sizes" );
+					}
 				}
 			}
 
-			/**
-			 * @brief Increment the column according to the iteration order, thus resetting the column coordinates
-			 * when the last possible column value for the current row has been reached.
-			 *
-			 * @return true if the column coordinates have been reset, and thus also the row must be incremented
-			 * @return false if the column coordinates
-			 */
-			bool increment_column() {
-				bool rewind;
-				typename array_t::size_type i { 0 };
-				do {
-					typename array_t::value_type & col = col_coords[ i ];
-					// must rewind dimension if the column offset is already at the max value
-					// or if the column coordinates are already at the max value
-					rewind = ( col == column_max_values[ i ] );
-					if( rewind ) {
-						// col = this->row_coords[i] == 0 ? 0 : this->row_coords[i] - (halo);
-						reset_column_coords( i );
-					} else {
-						++col;
-					}
-					++i;
-				} while( rewind && i < col_coords.size() );
+			hpcg_builder( const hpcg_builder< DIMS, CoordT, T> & ) = delete;
+
+			hpcg_builder( hpcg_builder< DIMS, CoordT, T> && ) = delete;
+
+			hpcg_builder< DIMS, CoordT, T> & operator=( const hpcg_builder< DIMS, CoordT, T> & ) = delete;
+
+			hpcg_builder< DIMS, CoordT, T> & operator=( hpcg_builder< DIMS, CoordT, T> && ) = delete;
+
+			size_t system_size() const {
+				return system.halo_system_size();
+			}
+
+			hpcg_sys_iterator make_begin_iterator(
+				T diag,
+				T non_diag
+			) {
+				return hpcg_sys_iterator( system, diag, non_diag );
+			}
 
-				// if we change z, then we also must reset x and y; if only y, we must reset x, and so on
-				return rewind;
+			hpcg_sys_iterator make_end_iterator(
+				T diag,
+				T non_diag
+			) {
+				hpcg_sys_iterator result( system, diag, non_diag );
+				result += system_size() - 1; // do not trigger boundary checks
+				++result;
+				return result;
 			}
+
 		};
 
-		// ===============================================================
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+		template<
+			size_t DIMS,
+			typename CoordT,
+			typename T
+		>
+		class hpcg_coarsener_builder;
+
 
 		/**
 		 * @brief Class to generate the coarsening matrix of an underlying \p DIMS -dimensional system.
@@ -408,56 +363,66 @@ namespace grb {
 		 * @tparam DIMS number of dimensions of the system
 		 * @tparam T type of matrix values
 		 */
-		template< std::size_t DIMS, typename T = double >
-		struct coarsener_generator_iterator : public row_generator< DIMS > {
+		template<
+			size_t DIMS,
+			typename CoordT,
+			typename T
+		>
+		struct coarsener_generator_iterator {
+
+			friend hpcg_coarsener_builder< DIMS, CoordT, T >;
 
-			using RowIndexType = typename row_generator< DIMS >::RowIndexType;
-			using ColumnIndexType = typename row_generator< DIMS >::RowIndexType;
+			using RowIndexType = CoordT; ///< numeric type of rows
+			using ColumnIndexType = CoordT;
 			using ValueType = T;
-			using array_t = typename row_generator< DIMS >::array_t;
-			using value_type = std::pair< std::pair< RowIndexType, ColumnIndexType >, T >;
 
-			// the sizes to project from
-			const array_t finer_sizes; ///< the size of the finer system (columns)
-			array_t steps;             ///< array of steps, i.e. how much each column coordinate (finer system) must be
-			//// incremented when incrementing the row coordinates; is is the ration between
-			//// #finer_sizes and row_generator#physical_sizes
+			using lin_system_t = grb::utils::geometry::linearized_ndim_system< CoordT,
+				grb::utils::geometry::array_vector_storage< CoordT, DIMS > >;
+			using __iter_t = typename lin_system_t::iterator;
+			using self_t = coarsener_generator_iterator< DIMS, CoordT, T >;
+			using array_t = std::array< CoordT, DIMS >;
 
-			/**
-			 * @brief Construct a new \c coarsener_generator_iterator object from the coarser and finer sizes,
-			 * setting its row at \p _current_row and the column at the corresponding value.
-			 *
-			 * Each finer size <b>must be an exact multiple of the corresponding coarser size</b>, otherwise the
-			 * construction will throw an exception.
-			 *
-			 * @param _coarser_sizes sizes of the coarser system (rows)
-			 * @param _finer_sizes sizes of the finer system (columns)
-			 * @param _current_row row (in the coarser system) to set the iterator on
-			 */
-			coarsener_generator_iterator( const array_t & _coarser_sizes, const array_t & _finer_sizes, RowIndexType _current_row ) :
-				row_generator< DIMS >( _coarser_sizes, _current_row ), finer_sizes( _finer_sizes ), steps( { 0 } ) {
-				for( std::size_t i { 0 }; i < DIMS; i++ ) {
-					// finer size MUST be an exact multiple of coarser_size
-					typename array_t::value_type step { _finer_sizes[ i ] / _coarser_sizes[ i ] };
-					if( step == 0 || finer_sizes[ i ] / step != this->physical_sizes[ i ] ) {
-						throw std::invalid_argument( std::string( "finer size "
-																  "of "
-																  "dimension"
-																  " " ) +
-							std::to_string( i ) +
-							std::string( "is not an exact multiple of coarser "
-										 "size" ) );
-					}
-					steps[ i ] = step;
+			struct __value {
+
+				friend self_t;
+
+				__value(
+					RowIndexType i,
+					ColumnIndexType j
+				) noexcept :
+					_i( i ),
+					_j( j )
+				{}
+
+				__value( const __value & ) = default;
+
+				__value & operator=( const __value & ) = default;
+
+				inline RowIndexType i() const { return _i; }
+				inline ColumnIndexType j() const { return _j; }
+				inline ValueType v() const {
+					return static_cast< ValueType >( 1 );
 				}
-				current_values.first.first = _current_row;
-				current_values.first.second = coords_to_finer_col();
-				current_values.second = v();
-			}
 
-			coarsener_generator_iterator( const coarsener_generator_iterator & o ) = default;
+			private:
+				RowIndexType _i;
+				ColumnIndexType _j;
+			};
+
+			// interface for std::random_access_iterator
+			using iterator_category = std::random_access_iterator_tag;
+			using value_type = __value;
+			using pointer = const value_type *; // must be a pointer type: operator->() returns &_val
+			using reference = const value_type&;
+			using difference_type = typename __iter_t::difference_type;
+
+			coarsener_generator_iterator( const self_t & o ) = default;
+
+			coarsener_generator_iterator( self_t && o ) = default;
 
-			coarsener_generator_iterator( coarsener_generator_iterator && o ) = default;
+			self_t & operator=( const self_t & ) = default;
+
+			self_t & operator=( self_t && ) = default;
 
 			/**
 			 * @brief Increments the row and the column according to the respective physical sizes,
@@ -465,29 +430,34 @@ namespace grb {
 			 *
 			 * @return \code *this \endcode, i.e. the same object with the updates row and column
 			 */
-			coarsener_generator_iterator< DIMS, T > & operator++() {
-				this->increment_row();
-				current_values.first.first = this->coords_to_rowcol( this->row_coords );
-				current_values.first.second = coords_to_finer_col();
-				current_values.second = v();
+			self_t & operator++() noexcept {
+				(void) ++_sys_iter;
+				update_coords();
+				return *this;
+			}
+
+			self_t & operator+=( size_t offset ) {
+				_sys_iter += offset;
+				update_coords();
 				return *this;
 			}
 
+			difference_type operator-( const self_t &o ) const {
+				return this->_sys_iter - o._sys_iter;
+			}
+
 			/**
 			 * @brief Returns whether \c this and \p o differ.
 			 */
-			bool operator!=( const coarsener_generator_iterator< DIMS, T > & o ) const {
-				if( this->i() != o.i() ) {
-					return true;
-				}
-				return this->j() != o.j();
+			bool operator!=( const self_t &o ) const {
+				return this->_sys_iter != o._sys_iter;
 			}
 
 			/**
 			 * @brief Returns whether \c this and \p o are equal.
 			 */
-			bool operator==( const coarsener_generator_iterator< DIMS, T > & o ) const {
-				return this->i() == o.i() && this->j() == o.j();
+			bool operator==( const self_t &o ) const {
+				return ! this->operator!=( o );
 			}
 
 			/**
@@ -496,101 +466,151 @@ namespace grb {
 			 * Useful when building the matrix by copying the triple of coordinates and value,
 			 * like for the BSP1D backend.
 			 */
-			const value_type & operator*() const {
-				return current_values;
+			reference operator*() const {
+				return _val;
+			}
+
+			pointer operator->() const {
+				return &_val;
 			}
 
 			/**
 			 * @brief Returns the current row, according to the coarser system.
 			 */
 			inline RowIndexType i() const {
-				return current_values.first.first;
+				return _val.i();
 			}
 
 			/**
 			 * @brief Returns the current column, according to the finer system.
 			 */
 			inline ColumnIndexType j() const {
-				return current_values.first.second;
+				return _val.j();
 			}
 
 			/**
 			 * @brief Returns always 1, as the coarsening keeps the same value.
 			 */
 			inline ValueType v() const {
-				return static_cast< ValueType >( 1 );
+				return _val.v();
 			}
 
 		private:
-			value_type current_values; ///< triple storing the current value for row, column and matrix element
+			const lin_system_t *_lin_sys; ///< system iterated on; declared before _sys_iter, whose initializer reads it
+			const array_t *_steps; ///< array of steps, i.e. how much each column coordinate (finer system) must be
+			//// incremented when incrementing the row coordinates; it is the ratio between
+			//// the finer and the coarser sizes along each dimension
+			__iter_t _sys_iter;
+			value_type _val;
+
+			/**
+			 * @brief Construct a new \c coarsener_generator_iterator object positioned at the
+			 * beginning of \p system, with the column computed from the corresponding row coordinates.
+			 *
+			 * Only the addresses of \p system and \p steps are stored (no copy is made); the constructor
+			 * itself performs no validation of the sizes and is \c noexcept.
+			 *
+			 * @param system linearized system providing the iteration positions (rows)
+			 * @param steps per-dimension multipliers used to compute the column from the row coordinates
+			 *              (see coarse_rows_to_finer_col())
+			 */
+			coarsener_generator_iterator(
+				const lin_system_t &system,
+				const array_t &steps
+			) noexcept :
+				_lin_sys( &system ),
+				_steps( &steps ),
+				_sys_iter( _lin_sys->begin() ),
+				_val(0, 0)
+			{
+				update_coords();
+			}
+
+			void update_coords() noexcept {
+				_val._i = _sys_iter->get_linear_position();
+				_val._j = coarse_rows_to_finer_col();
+			}
 
 			/**
 			 * @brief Returns the row coordinates converted to the finer system, to compute
 			 * the column value.
 			 */
-			ColumnIndexType coords_to_finer_col() const {
-				ColumnIndexType row { 0 };
+			ColumnIndexType coarse_rows_to_finer_col() const noexcept {
+				ColumnIndexType finer { 0 };
 				ColumnIndexType s { 1 };
-				for( typename array_t::size_type i { 0 }; i < this->row_coords.size(); i++ ) {
-					s *= steps[ i ];
-					row += s * this->row_coords[ i ];
-					s *= this->physical_sizes[ i ];
+				for( size_t i { 0 }; i < DIMS; i++ ) {
+					s *= (*_steps)[ i ];
+					finer += s * _sys_iter->get_position()[ i ];
+					s *= _lin_sys->get_sizes()[ i ];
 				}
-				return row;
+				return finer;
 			}
 		};
 
-	} // end namespace algorithms
 
-} // end namespace grb
+		template<
+			size_t DIMS,
+			typename CoordT,
+			typename T
+		>
+		class hpcg_coarsener_builder {
+		public:
 
-namespace std {
+			using array_t = std::array< CoordT, DIMS >;
+			using hpcg_coarsener_iterator = coarsener_generator_iterator< DIMS, CoordT, T >;
 
-	/**
-	 * Specialises the standard STL iterator traits for
-	 * #grb::algorithms::matrix_generator_iterator
-	 */
-	template< size_t DIMS, typename T >
-	class iterator_traits<
-		grb::algorithms::matrix_generator_iterator< DIMS, T >
-	> {
+			hpcg_coarsener_builder(
+				const array_t &_coarser_sizes,
+				const array_t &_finer_sizes
+			) : system( _coarser_sizes.begin(), _coarser_sizes.end() ) {
+				for( size_t i { 0 }; i < DIMS; i++ ) {
+					// finer size MUST be a non-zero exact multiple of coarser size; a remainder test is
+					// needed, since comparing quotients would accept e.g. 7 vs. 3 ( 7 / ( 7 / 3 ) == 3 )
+					if( _coarser_sizes[ i ] == 0 || _finer_sizes[ i ] == 0 || _finer_sizes[ i ] % _coarser_sizes[ i ] != 0 ) {
+						throw std::invalid_argument(
+							std::string( "finer size of dimension " ) + std::to_string( i ) +
+							std::string( " is not an exact multiple of coarser size" )
+						);
+					}
+					steps[ i ] = _finer_sizes[ i ] / _coarser_sizes[ i ];
+				}
+			}
 
-		private:
+			hpcg_coarsener_builder( const hpcg_coarsener_builder< DIMS, CoordT, T> & ) = delete;
 
-			typedef grb::algorithms::matrix_generator_iterator< DIMS, T > SelfType;
+			hpcg_coarsener_builder( hpcg_coarsener_builder< DIMS, CoordT, T> && ) = delete;
 
+			hpcg_coarsener_builder< DIMS, CoordT, T> & operator=( const hpcg_coarsener_builder< DIMS, CoordT, T> & ) = delete;
 
-		public:
+			hpcg_coarsener_builder< DIMS, CoordT, T> & operator=( hpcg_coarsener_builder< DIMS, CoordT, T> && ) = delete;
 
-			typedef typename SelfType::ValueType value_type;
-			typedef const value_type * pointer;
-			typedef const value_type & reference;
-			typedef size_t difference_type;
-			typedef forward_iterator_tag iterator_category;
+			size_t system_size() const {
+				return system.system_size();
+			}
 
-	};
+			hpcg_coarsener_iterator make_begin_iterator() {
+				return hpcg_coarsener_iterator( system, steps );
+			}
 
-	template< size_t DIMS, typename T >
-	class iterator_traits<
-		grb::algorithms::coarsener_generator_iterator< DIMS, T >
-	> {
+			hpcg_coarsener_iterator make_end_iterator() {
+				hpcg_coarsener_iterator result( system, steps );
+				result += system_size() - 1; // do not trigger boundary checks
+				++result;
+				return result;
+			}
 
 		private:
+			const grb::utils::geometry::linearized_ndim_system< CoordT,
+				grb::utils::geometry::array_vector_storage< CoordT, DIMS > > system;
 
-			typedef grb::algorithms::coarsener_generator_iterator< DIMS, T > SelfType;
-
-
-		public:
-
-			typedef typename SelfType::ValueType value_type;
-			typedef const value_type * pointer;
-			typedef const value_type & reference;
-			typedef size_t difference_type;
-			typedef forward_iterator_tag iterator_category;
+			array_t steps; ///< array of steps, i.e. how much each column coordinate (finer system) must be
+			//// incremented when incrementing the row coordinates; it is the ratio between
+			//// the finer and the coarser sizes along each dimension
+		};
 
-	};
 
-} // end namespace std
+	} // namespace algorithms
+} // namespace grb
 
 #endif // _H_GRB_ALGORITHMS_NDIM_MATRIX_BUILDERS
 
diff --git a/include/graphblas/algorithms/hpcg/old_matrix_building_utils.hpp b/include/graphblas/algorithms/hpcg/old_matrix_building_utils.hpp
new file mode 100644
index 000000000..9bb5c7a95
--- /dev/null
+++ b/include/graphblas/algorithms/hpcg/old_matrix_building_utils.hpp
@@ -0,0 +1,173 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file old_matrix_building_utils.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * @brief Utilities to build the matrices for HPCG simulations in an arbitrary number of dimensions.
+ * @date 2021-04-30
+ */
+
+#ifndef _H_GRB_ALGORITHMS_OLD_MATRIX_BUILDING_UTILS
+#define _H_GRB_ALGORITHMS_OLD_MATRIX_BUILDING_UTILS
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <numeric>
+#include <stdexcept>
+#include <utility>
+
+#include <graphblas.hpp>
+
+#include "old_ndim_matrix_builders.hpp"
+
+
+namespace grb {
+	namespace algorithms {
+		namespace old {
+
+
+		/**
+		 * @brief Builds a \p DIMS -dimensional system matrix for HPCG simulation.
+		 *
+		 * This routine initializes \p M to a matrix representing a \p DIMS -dimensions system of sizes
+		 * \p sys_sizes, with an iteration halo of size \p halo_size . The matrix diagonal values are initialized
+		 * to \p diag_value while the other non-zero values are initialized to \p non_diag_value .
+		 *
+		 * @tparam DIMS system dimensions
+		 * @tparam T type of matrix values
+		 * @tparam B matrix GraphBLAS backend
+		 * @param M the matrix to be initialized; it must be already constructed
+		 * @param sys_sizes the sizes of the physical system
+		 * @param halo_size the size of the halo of point to iterate in
+		 * @param diag_value diagonal value
+		 * @param non_diag_value value outside of the diagonal
+		 * @return grb::RC the success value returned when trying to build the matrix
+		 */
+		template< std::size_t DIMS, typename T, enum grb::Backend B >
+		grb::RC build_ndims_system_matrix( grb::Matrix< T, B > & M, const std::array< std::size_t, DIMS > & sys_sizes, std::size_t halo_size, T diag_value, T non_diag_value ) {
+			static_assert( DIMS > 0, "DIMS must be > 0" );
+			// std::accumulate computes in the type of its initial value, hence the explicit std::size_t
+			std::size_t n { std::accumulate( sys_sizes.cbegin(), sys_sizes.cend(), static_cast< std::size_t >( 1 ), std::multiplies< std::size_t >() ) };
+			if( grb::nrows( M ) != n || grb::nrows( M ) != grb::ncols( M ) ) {
+				throw std::invalid_argument( "wrong matrix dimensions: matrix should "
+											"be square and in accordance with given system sizes" );
+			}
+			// unqualified on purpose: lookup starts in this namespace, so a 2-parameter iterator declared here wins
+			matrix_generator_iterator< DIMS, T > begin( sys_sizes, 0UL, halo_size, diag_value, non_diag_value );
+			matrix_generator_iterator< DIMS, T > end( sys_sizes, n, halo_size, diag_value, non_diag_value );
+			return buildMatrixUnique( M, begin, end, grb::IOMode::SEQUENTIAL );
+		}
+
+		/**
+		 * @brief Builds a coarsener matrix for an HPCG simulation.
+		 *
+		 * It initializes \p M as a rectangular matrix, with rows corresponding to the coarser system
+		 * (of dimensions \p coarser_sizes - output) and columns corresponding to the finer system
+		 * (of dimensions \p finer_sizes - input). The resulting coarsening matrix takes in input the finer system
+		 * and coarsens it by keeping one element every \a S , where \a S is the ratio between the finer and
+		 * the coarser dimension (computed for each dimension). In this way each \p DIMS -dimensional finer element
+		 * corresponds to its bounding coarser element.
+		 *
+		 * For the coarsening to be feasible, the sizes of the finer system \b must be a multiple of those of the
+		 * coarser system. If this condition is not met, an exception is thrown.
+		 *
+		 * @tparam DIMS system dimensions
+		 * @tparam T type of matrix values
+		 * @tparam B matrix GraphBLAS backend
+		 * @param M the matrix to be initialized; it must be already constructed with proper dimensions
+		 * @param coarser_sizes sizes of the coarser system
+		 * @param finer_sizes sizes of the finer system; each one \b must be a multiple of the corresponding value
+		 *                    in \p coarser_size , otherwise an exception is thrown
+		 * @return grb::RC the success value returned when trying to build the matrix
+		 */
+		template< std::size_t DIMS, typename T, enum grb::Backend B >
+		grb::RC build_ndims_coarsener_matrix( grb::Matrix< T, B > & M, const std::array< std::size_t, DIMS > & coarser_sizes, const std::array< std::size_t, DIMS > & finer_sizes ) {
+			static_assert( DIMS > 0, "DIMS must be > 0" );
+			std::size_t const rows { std::accumulate( coarser_sizes.cbegin(), coarser_sizes.cend(), static_cast< std::size_t >( 1 ), std::multiplies< std::size_t >() ) };
+			for( std::size_t i { 0 }; i < coarser_sizes.size(); i++ ) {
+				// remainder check; the zero test comes first so that nothing is ever divided by zero
+				if( coarser_sizes[ i ] == 0 || finer_sizes[ i ] % coarser_sizes[ i ] != 0 ) {
+					throw std::invalid_argument( "finer sizes should be a multiple of "
+												"coarser sizes" );
+				}
+			}
+			std::size_t const cols { std::accumulate( finer_sizes.cbegin(), finer_sizes.cend(), static_cast< std::size_t >( 1 ), std::multiplies< std::size_t >() ) };
+			if( grb::nrows( M ) != rows || grb::ncols( M ) != cols ) {
+				throw std::invalid_argument( "wrong matrix dimensions: matrix should "
+											"be rectangular"
+											" with rows == <product of coarser sizes> "
+											"and cols == <product of finer sizes>" );
+			}
+			// unqualified on purpose: lookup starts in this namespace, so a 2-parameter iterator declared here wins
+			coarsener_generator_iterator< DIMS, T > begin( coarser_sizes, finer_sizes, 0 );
+			coarsener_generator_iterator< DIMS, T > end( coarser_sizes, finer_sizes, rows );
+			return buildMatrixUnique( M, begin, end, grb::IOMode::SEQUENTIAL );
+		}
+
+		/**
+		 * @brief Populates \p masks with static color masks generated for a square matrix of size \p matrix_size .
+		 *
+		 * Colors are built in the range [0, \p colors ): the mask for color \a c is stored in \c masks[c] and
+		 * has the value \c true in the positions \f$ [c, c+colors, c+2*colors, ...] \f$ that are smaller than
+		 * \p matrix_size , i.e. \f$ [0, colors, 2*colors, ...] \f$ for color 0, \f$ [1, 1+colors, 1+2*colors, ...] \f$
+		 * for color 1 and so on.
+		 *
+		 * The vectors stored in \p masks are built inside the function and populated
+		 * only with the \c true values, leading to sparse vectors. This saves on storage space and allows
+		 * GraphBLAS routines (like \c eWiseLambda() ) to iterate only on true values.
+		 *
+		 * @tparam B GraphBLAS backend for the vector
+		 * @param masks output vector of color masks; must be empty, otherwise \c std::invalid_argument is thrown
+		 * @param matrix_size size of the system matrix
+		 * @param colors number of color masks to build; must be <= \p matrix_size , otherwise \c std::invalid_argument is thrown
+		 * @return grb::RC the first non-successful code returned by \c grb::setElement() , \c grb::SUCCESS otherwise
+		 */
+		template< enum grb::Backend B >
+		grb::RC build_static_color_masks( std::vector< grb::Vector< bool, B > > & masks, std::size_t matrix_size, std::size_t colors ) {
+			if( ! masks.empty() ) {
+				throw std::invalid_argument( "vector of masks is expected to be "
+											"empty" );
+			}
+			if( matrix_size < colors ) {
+				throw std::invalid_argument( "syztem size is < number of colors: too "
+											"small" );
+			}
+			grb::RC rc { grb::SUCCESS };
+			masks.reserve( colors );
+			for( std::size_t i { 0U }; i < colors; i++ ) {
+				// build in-place, assuming the compiler deduces the right constructor according to B
+				masks.emplace_back( matrix_size );
+				grb::Vector< bool, B > & mask = masks.back(); // same backend B as the container's elements
+				// grb::set(mask, false); // DO NOT initialize false's explicitly, otherwise
+				// RBGS will touch them too and the runtime will increase!
+				for( std::size_t j = i; j < matrix_size; j += colors ) {
+					rc = grb::setElement( mask, true, j );
+					assert( rc == grb::SUCCESS );
+					if( rc != grb::SUCCESS )
+						return rc;
+				}
+			}
+			return rc;
+		}
+
+		} //namespace old
+	} // namespace algorithms
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_OLD_MATRIX_BUILDING_UTILS
diff --git a/include/graphblas/algorithms/hpcg/old_ndim_matrix_builders.hpp b/include/graphblas/algorithms/hpcg/old_ndim_matrix_builders.hpp
new file mode 100644
index 000000000..256995b02
--- /dev/null
+++ b/include/graphblas/algorithms/hpcg/old_ndim_matrix_builders.hpp
@@ -0,0 +1,548 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file old_ndim_matrix_builders.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * @brief Utilities to build matrices for an HPCG simulation in a generic number of dimensions
+ *
+ * In particular, the main matrices are:
+ * - a system matrix, generated from an N-dimensional space of coordinates by iterating along
+ *   each dimension in priority order, where the first dimension has highest priority and the last
+ *   dimension least priority; for each point (row), all its N-dimensional neighbours within
+ *   a given distance are generated for the column
+ * - a coarsening matrix, generated by iterating on a coarser system of N dimensions (row) and projecting
+ *   each point to a corresponding system of finer sizes
+ *
+ * @date 2021-04-30
+ */
+
+#ifndef _H_GRB_ALGORITHMS_OLD_NDIM_MATRIX_BUILDERS
+#define _H_GRB_ALGORITHMS_OLD_NDIM_MATRIX_BUILDERS
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <initializer_list>
+#include <numeric>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+namespace grb {
+	namespace algorithms {
+		namespace old {
+
+		/**
+		 * @brief Base class that iterates on DIMS dimensions starting from the first one.
+		 *
+		 * The coordinates are assumed to generate the row number in a matrix whose number of rows is
+		 * the product of all sizes. This class generates row numbers for physical problems described as
+		 * systems of linear equations in an n-dimensional space.
+		 *
+		 * Example of iterations in a 3D (x, y, z) system of size (4,3,2), with generated row numbers
+		 * reported as '=> ROW':
+		 * - z[0]
+		 * - y[0]
+		 * - x[0] => 0, x[1] => 1, x[2] => 2, x[3] => 3
+		 * - y[1]
+		 * - x[0] => 4, x[1] => 5, x[2] => 6, x[3] => 7
+		 * - y[2]
+		 * - x[0] => 8, x[1] => 9, x[2] => 10, x[3] => 11
+		 * - z[1]
+		 * - y[0]
+		 * - x[0] => 12, x[1] => 13, x[2] => 14, x[3] => 15
+		 * - y[1]
+		 * - x[0] => 16, x[1] => 17, x[2] => 18, x[3] => 19
+		 * - y[2]
+		 * - x[0] => 20, x[1] => 21, x[2] => 22, x[3] => 23
+		 *
+		 * The main goal of this class is to be derived by other classes to generate matrices in an
+		 * STL-iterator-fashion; hence, this class contains all the code for basic coordinate-to-row-column
+		 * conversion in \p DIMS dimensions and the basic logic to increment the row number.
+		 *
+		 * @tparam DIMS number of dimensions of the system
+		 */
+		template< std::size_t DIMS >
+		struct row_generator {
+
+			using row_coordinate_type = std::size_t; ///< numeric type of rows
+			using array_t = std::array< row_coordinate_type,
+				DIMS >; ///< type for the array storing the coordinates.
+
+			const array_t physical_sizes; ///< size of each dimension, starting from the one to be explored first
+
+			/**
+			 * @brief Construct a new row generator object
+			 * @param[in] _sizes array of sizes of each dimension; no dimension should be 0, otherwise an exception
+			 *                   is thrown
+			 * @param[in] first_row first row to iterate from; it is allowed to be beyond the matrix size, e.g. to create
+			 *                      an end iterator (no check occurs)
+			 */
+			row_generator( const array_t & _sizes, row_coordinate_type first_row ) : physical_sizes( _sizes ) {
+				static_assert( DIMS > 0, "DIMS should be higher than 0" );
+				for( const auto i : _sizes ) {
+					if( i == static_cast< row_coordinate_type >( 0U ) ) {
+						throw std::invalid_argument( "All dimension sizes must "
+													 "be > 0" );
+					}
+				}
+				row_to_coords( first_row );
+			}
+
+			row_generator( const row_generator & o ) = default;
+
+			row_generator( row_generator && o ) = default;
+
+		protected:
+			// x: row_coords[0], y: row_coords[1], z: row_coords[2], ...
+			array_t row_coords; ///< n-D coordinates from which to compute the row
+
+			/**
+			 * @brief converts a row number into a n-D coordinates according to the sizes in #physical_sizes
+			 *
+			 * In case the input is higher than the number of rows, the last coordinate is allowed to
+			 * go beyond its physical size. E.g., if the system has size (4,3,2) and \p rowcol is 24,
+			 * the coordinates are (0,0,2).
+			 *
+			 * @param[in] rowcol row number to convert; it can be any number
+			 */
+			void row_to_coords( row_coordinate_type rowcol ) {
+				// s starts as the stride of the last dimension, i.e. the product of all previous sizes
+				std::size_t s = 1;
+				for( std::size_t i { 0 }; i < row_coords.size() - 1; i++ )
+					s *= physical_sizes[ i ];
+				for( typename array_t::size_type i { row_coords.size() - 1 }; i > 0; i-- ) {
+					row_coords[ i ] = rowcol / s;
+					rowcol -= row_coords[ i ] * s;
+					s /= physical_sizes[ i - 1 ]; // s becomes the stride of dimension i - 1
+				}
+				row_coords[ 0 ] = rowcol; // < physical_sizes[ 0 ] if DIMS > 1; unbounded (last) if DIMS == 1
+			}
+
+			/**
+			 * @brief Pure function converting an array of coordinates into a row number, based on #physical_sizes.
+			 * @param a the #array_t array of coordinates to convert
+			 * @return #row_coordinate_type the row corresponding to the coordinates in \p a
+			 */
+			row_coordinate_type coords_to_rowcol( const array_t & a ) const {
+				row_coordinate_type row { 0 };
+				row_coordinate_type s { 1 };
+				for( typename array_t::size_type i { 0 }; i < a.size(); i++ ) {
+					row += s * a[ i ];
+					s *= physical_sizes[ i ];
+				}
+				return row;
+			}
+
+			/**
+			 * @brief Increment #row_coords in order to move to the next coordinate (according to the
+			 * n-dimensional iteration order).
+			 *
+			 * Each of the first DIMS - 1 coordinates wraps around to 0 once it reaches its physical size and
+			 * carries into the following coordinate; the last coordinate is unbounded, so that moving past
+			 * the last row produces coordinates beyond the system, like those of an end iterator.
+			 * The wrap-around is detected by comparing against the physical size, hence also dimensions
+			 * of size 1 carry correctly.
+			 *
+			 * To be used by derived classes in order to generate the matrix, e.g. via the \c operator++()
+			 * operator prescribed for STL-like iterators.
+			 */
+			void increment_row() {
+				// only the first DIMS - 1 coordinates are bounded; with DIMS == 1 this loop is skipped
+				for( typename array_t::size_type i { 0 }; i + 1 < row_coords.size(); i++ ) {
+					if( ++row_coords[ i ] < physical_sizes[ i ] ) {
+						return; // no wrap-around, hence no carry to propagate
+					}
+					row_coords[ i ] = 0;
+				}
+				// all bounded coordinates wrapped around: advance the last one, which is unbounded
+				row_coords.back()++;
+			}
+		};
+
+		// ===============================================================
+
+		/**
+		 * @brief STL-like iterable class to generate the values for a matrix by iterating in an n-dimensional
+		 * space along the coordinates.
+		 *
+		 * For each \f$ X=(x0, x1, ...,xn) \f$ point of the underlying (n+1)-dimensional space,
+		 * this class iterates through the points of the n-dimensional halo of radius \p halo around \f$ X \f$,
+		 * generating the row number corresponding to \f$ X \f$ and the column number corresponding to
+		 * each halo point. At each coordinate \code (row, col) \endcode generated this way, the corresponding matrix value
+		 * being generated depends on whether \code row == col \endcode.
+		 *
+		 * @tparam DIMS number of dimensions of the system
+		 * @tparam T type of matrix values
+		 * @note the halo size is not a template parameter: it is passed to the constructor and stored in #halo
+		 */
+		template< std::size_t DIMS, typename T = double >
+		struct matrix_generator_iterator : public row_generator< DIMS > {
+
+			using row_coordinate_type = typename row_generator< DIMS >::row_coordinate_type;
+			using column_coordinate_type = typename row_generator< DIMS >::row_coordinate_type;
+			using nonzero_value_type = T;
+			using array_t = typename row_generator< DIMS >::array_t;
+			using value_type = std::pair< std::pair< row_coordinate_type, column_coordinate_type >, T >;
+
+			// halo may in future become a DIM-size array to iterate in arbitrary shapes
+			const row_coordinate_type halo;              ///< number of points per dimension to iterate around
+			const nonzero_value_type diagonal_value;     ///< value to be emitted when the object has moved to the diagonal
+			const nonzero_value_type non_diagonal_value; ///< value to emit outside of the diagonal
+
+			/**
+			 * @brief Construct a new \c matrix_generator_iterator object, setting the current row as \p row
+			 * and emitting \p diag if the iterator has moved on the diagonal, \p non_diag otherwise.
+			 *
+			 * @param sizes array with the sizes along the dimensions; each must be >= 2 * \p _halo + 1
+			 * @param row current row to initialize the matrix on
+			 * @param _halo halo of points to iterate around; must be > 0
+			 * @param diag value to emit when on the diagonal
+			 * @param non_diag value to emit outside the diagonal
+			 */
+			matrix_generator_iterator( const array_t & sizes, row_coordinate_type row, row_coordinate_type _halo, nonzero_value_type diag, nonzero_value_type non_diag ) :
+				row_generator< DIMS >( sizes, row ), halo( _halo ), diagonal_value( diag ), non_diagonal_value( non_diag ) {
+				if( halo <= 0 ) {
+					throw std::invalid_argument( "halo should be higher than "
+												 "0" );
+				}
+				for( const auto i : sizes ) {
+					if( i < static_cast< row_coordinate_type >( 2 * halo + 1 ) ) {
+						throw std::invalid_argument( "Iteration halo goes "
+													 "beyond system sizes" );
+					}
+				}
+				current_values.first.first = row;
+				update_column_max_values();
+				reset_all_columns();
+				current_values.first.second = this->coords_to_rowcol( col_coords );
+				current_values.second = v();
+			}
+
+			matrix_generator_iterator( const matrix_generator_iterator & o ) = default;
+
+			matrix_generator_iterator( matrix_generator_iterator && o ) = default;
+
+			/**
+			 * @brief Increments the iterator by moving coordinates to the next (row, column) to iterate on.
+			 *
+			 * This operator internally increments the columns coordinates until wrap-around, when it increments
+			 * the row coordinates and resets the column coordinates to the first possible columns; this column coordinate
+			 * depends on the row coordinates according to the dimensions iteration order and on the parameter \p halo.
+			 *
+			 * @return matrix_generator_iterator<DIMS, T>& \c this object, with the updated state
+			 */
+			matrix_generator_iterator< DIMS, T > & operator++() {
+				bool must_rewind = increment_column();
+				if( must_rewind ) {
+					this->increment_row();
+					// after changing row, we must find the first non-zero column
+					reset_all_columns();
+					current_values.first.first = this->coords_to_rowcol( this->row_coords );
+					update_column_max_values();
+				}
+				// trigger column update after row update, as a row update
+				// triggers a column update
+				current_values.first.second = this->coords_to_rowcol( col_coords );
+				current_values.second = this->v();
+				return *this;
+			}
+
+			/**
+			 * @brief Operator to compare \c this against \p o  and return whether they differ.
+			 *
+			 * @param o object to compare \c this against
+			 * @return true if the row or the column is different between \p o and \c this
+			 * @return false if both row and column of \p o and \c this are equal
+			 */
+			bool operator!=( const matrix_generator_iterator< DIMS, T > & o ) const {
+				if( o.i() != this->i() ) {
+					return true;
+				}
+				return o.j() != this->j();
+			}
+
+			/**
+			 * @brief Operator to compare \c this against \p o  and return whether they are equal.
+			 *
+			 * @param o object to compare \c this against
+			 * @return true if both row and column of \p o and \c this are equal
+			 * @return false if the row or the column is different between \p o and \c this
+			 */
+			bool operator==( const matrix_generator_iterator< DIMS, T > & o ) const {
+				return o.i() == this->i() && o.j() == this->j();
+			}
+
+			/**
+			 * @brief Operator returning the triple to directly access row, column and element values.
+			 *
+			 * Useful when building the matrix by copying the triple of coordinates and value,
+			 * like for the BSP1D backend.
+			 */
+			const value_type & operator*() const {
+				return current_values;
+			}
+
+			/**
+			 * @brief Returns current row.
+			 */
+			inline row_coordinate_type i() const {
+				return current_values.first.first;
+			}
+
+			/**
+			 * @brief Returns current column.
+			 */
+			inline column_coordinate_type j() const {
+				return current_values.first.second;
+			}
+
+			/**
+			 * @brief Returns the current matrix value.
+			 *
+			 * @return nonzero_value_type #diagonal_value if \code row == column \endcode (i.e. if \code this-> \endcode
+			 * #i() \code == \endcode \code this-> \endcode #j()), #non_diagonal_value otherwise
+			 */
+			inline nonzero_value_type v() const {
+				return j() == i() ? diagonal_value : non_diagonal_value;
+			}
+
+		private:
+			// offsets w.r.t. rows
+			array_t col_coords;        ///< coordinates corresponding to current column
+			array_t column_max_values; ///< maximum values for the column coordinates, to stop column increment
+			//// and reset the column coordinates
+			value_type current_values; ///< triple storing the current value for row, column and matrix element
+
+			/**
+			 * @brief Updates the maximum values each column coordinate can reach, according to the row coordinates.
+			 *
+			 * To be called after each row coordinates update.
+			 */
+			void update_column_max_values() {
+				for( std::size_t i { 0 }; i < column_max_values.size(); i++ ) {
+					column_max_values[ i ] = std::min( this->physical_sizes[ i ] - 1, this->row_coords[ i ] + halo );
+				}
+			}
+
+			/**
+			 * @brief Resets the value of column dimension \p dim to the first possible value.
+			 *
+			 * The final value of #col_coords[dim] depends on the current row (#row_coords) and on the #halo
+			 * and is \f$ max(0, \f$ #row_coords \f$[dim] - halo)\f$.
+			 *
+			 * @param dim the dimension to reset
+			 */
+			void reset_column_coords( std::size_t dim ) {
+				// cannot use std::max because row_coords is unsigned and can wrap-around
+				col_coords[ dim ] = this->row_coords[ dim ] <= halo ? 0 : ( this->row_coords[ dim ] - halo );
+			}
+
+			/**
+			 * @brief resets all values in #col_coords to the initial coordinates
+			 * for the current row.
+			 */
+			void reset_all_columns() {
+				for( std::size_t i { 0 }; i < col_coords.size(); i++ ) {
+					reset_column_coords( i );
+				}
+			}
+
+			/**
+			 * @brief Increment the column according to the iteration order, thus resetting the column coordinates
+			 * when the last possible column value for the current row has been reached.
+			 *
+			 * @return true if all the column coordinates have been reset, and thus also the row must be incremented
+			 * @return false if a column coordinate has been advanced and the row stays the same
+			 */
+			bool increment_column() {
+				bool rewind;
+				typename array_t::size_type i { 0 };
+				do {
+					typename array_t::value_type & col = col_coords[ i ];
+					// must rewind dimension if the column offset is already at the max value
+					// or if the column coordinates are already at the max value
+					rewind = ( col == column_max_values[ i ] );
+					if( rewind ) {
+						// col = this->row_coords[i] == 0 ? 0 : this->row_coords[i] - (halo);
+						reset_column_coords( i );
+					} else {
+						++col;
+					}
+					++i;
+				} while( rewind && i < col_coords.size() );
+
+				// if we change z, then we also must reset x and y; if only y, we must reset x, and so on
+				return rewind;
+			}
+		};
+
+		// ===============================================================
+
+		/**
+		 * @brief Class to generate the coarsening matrix of an underlying \p DIMS -dimensional system.
+		 *
+		 * This class coarsens a finer system to a coarser system by projecting each input value (column),
+		 * expressed in finer coordinates, to an output (row) value expressed in coarser coordinates.
+		 * The coarser sizes are assumed to be row_generator#physical_sizes, while the finer sizes are here
+		 * stored inside #finer_sizes.
+		 *
+		 * The corresponding refinement matrix is obtained by transposing the coarsening matrix.
+		 *
+		 * @tparam DIMS number of dimensions of the system
+		 * @tparam T type of matrix values
+		 */
+		template< std::size_t DIMS, typename T = double >
+		struct coarsener_generator_iterator : public row_generator< DIMS > {
+
+			using row_coordinate_type = typename row_generator< DIMS >::row_coordinate_type;
+			using column_coordinate_type = typename row_generator< DIMS >::row_coordinate_type;
+			using nonzero_value_type = T;
+			using array_t = typename row_generator< DIMS >::array_t;
+			using value_type = std::pair< std::pair< row_coordinate_type, column_coordinate_type >, T >;
+
+			// the sizes to project from
+			const array_t finer_sizes; ///< the size of the finer system (columns)
+			array_t steps;             ///< array of steps, i.e. how much each column coordinate (finer system) must be
+			//// incremented when incrementing the row coordinates; it is the ratio between
+			//// #finer_sizes and row_generator#physical_sizes
+
+			/**
+			 * @brief Construct a new \c coarsener_generator_iterator object from the coarser and finer sizes,
+			 * setting its row at \p _current_row and the column at the corresponding value.
+			 *
+			 * Each finer size <b>must be an exact multiple of the corresponding coarser size</b>, otherwise the
+			 * construction will throw an exception.
+			 *
+			 * @param _coarser_sizes sizes of the coarser system (rows)
+			 * @param _finer_sizes sizes of the finer system (columns)
+			 * @param _current_row row (in the coarser system) to set the iterator on
+			 */
+			coarsener_generator_iterator( const array_t & _coarser_sizes, const array_t & _finer_sizes, row_coordinate_type _current_row ) :
+				row_generator< DIMS >( _coarser_sizes, _current_row ), finer_sizes( _finer_sizes ), steps( { 0 } ) {
+				for( std::size_t i { 0 }; i < DIMS; i++ ) {
+					// finer size MUST be an exact multiple of coarser_size
+					typename array_t::value_type step { _finer_sizes[ i ] / _coarser_sizes[ i ] };
+					// the remainder is tested directly: checking the quotient finer / step lets through
+					// sizes like finer = 7, coarser = 3; the base class guarantees coarser sizes are > 0
+					if( step == 0 || _finer_sizes[ i ] % _coarser_sizes[ i ] != 0 ) {
+						throw std::invalid_argument( std::string( "finer size of dimension " ) +
+							std::to_string( i ) +
+							std::string( " is not an exact multiple of coarser size" ) );
+					}
+					steps[ i ] = step;
+				}
+				// the row is stored as given, while the column is computed by projecting the row
+				// coordinates (set by the base class) onto the finer system
+				current_values.first.first = _current_row;
+				current_values.first.second = coords_to_finer_col();
+				current_values.second = v();
+			}
+
+			coarsener_generator_iterator( const coarsener_generator_iterator & o ) = default;
+
+			coarsener_generator_iterator( coarsener_generator_iterator && o ) = default;
+
+			/**
+			 * @brief Increments the row and the column according to the respective physical sizes,
+			 * thus iterating onto the coarsening matrix coordinates.
+			 *
+			 * @return \code *this \endcode, i.e. the same object with the updated row and column
+			 */
+			coarsener_generator_iterator< DIMS, T > & operator++() {
+				this->increment_row();
+				current_values.first.first = this->coords_to_rowcol( this->row_coords );
+				current_values.first.second = coords_to_finer_col();
+				current_values.second = v();
+				return *this;
+			}
+
+			/**
+			 * @brief Returns whether \c this and \p o differ.
+			 */
+			bool operator!=( const coarsener_generator_iterator< DIMS, T > & o ) const {
+				if( this->i() != o.i() ) {
+					return true;
+				}
+				return this->j() != o.j();
+			}
+
+			/**
+			 * @brief Returns whether \c this and \p o are equal.
+			 */
+			bool operator==( const coarsener_generator_iterator< DIMS, T > & o ) const {
+				return this->i() == o.i() && this->j() == o.j();
+			}
+
+			/**
+			 * @brief Operator returning the triple to directly access row, column and element values.
+			 *
+			 * Useful when building the matrix by copying the triple of coordinates and value,
+			 * like for the BSP1D backend.
+			 */
+			const value_type & operator*() const {
+				return current_values;
+			}
+
+			/**
+			 * @brief Returns the current row, according to the coarser system.
+			 */
+			inline row_coordinate_type i() const {
+				return current_values.first.first;
+			}
+
+			/**
+			 * @brief Returns the current column, according to the finer system.
+			 */
+			inline column_coordinate_type j() const {
+				return current_values.first.second;
+			}
+
+			/**
+			 * @brief Returns always 1, as the coarsening keeps the same value.
+			 */
+			inline nonzero_value_type v() const {
+				return static_cast< nonzero_value_type >( 1 );
+			}
+
+		private:
+			value_type current_values; ///< triple storing the current value for row, column and matrix element
+
+			/**
+			 * @brief Returns the row coordinates converted to the finer system, to compute
+			 * the column value.
+			 */
+			column_coordinate_type coords_to_finer_col() const {
+				column_coordinate_type row { 0 };
+				column_coordinate_type s { 1 };
+				for( typename array_t::size_type i { 0 }; i < this->row_coords.size(); i++ ) {
+					s *= steps[ i ];
+					row += s * this->row_coords[ i ];
+					s *= this->physical_sizes[ i ];
+				}
+				return row;
+			}
+		};
+
+		} // namespace old
+	} // namespace algorithms
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_OLD_NDIM_MATRIX_BUILDERS
diff --git a/include/graphblas/algorithms/hpcg/system_building_utils.hpp b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
index 11adf82c1..959d21969 100644
--- a/include/graphblas/algorithms/hpcg/system_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
@@ -31,10 +31,16 @@
 #include <memory>
 
 #include <graphblas.hpp>
+#include <graphblas/utils/Timer.hpp>
 
 #include "hpcg_data.hpp"
 #include "matrix_building_utils.hpp"
 
+#ifndef MASTER_PRINT // a definition supplied by the including code takes precedence
+#define INTERNAL_MASTER_PRINT // records that MASTER_PRINT is defined here, so that it is #undef'd at the end of this header
+#define MASTER_PRINT( pid, txt ) do { if( ( pid ) == 0 ) { std::cout << txt; } } while( false ) // single statement: safe inside if/else
+#endif
+
 
 namespace grb {
 	namespace algorithms {
@@ -92,17 +98,32 @@ namespace grb {
 
 			// initialize the main (=uncoarsened) system matrix
 			grb::RC rc { grb::SUCCESS };
+			const size_t pid { spmd<>::pid() };
+			grb::utils::Timer timer;
+			MASTER_PRINT( pid, "\n-- generating system matrix...\n" << std::endl );
+			grb::spmd<>::barrier();
+			timer.reset();
 			rc = build_ndims_system_matrix< DIMS, T >( data->A, params.physical_sys_sizes, params.halo_size, params.diag_value, params.non_diag_value );
+			MASTER_PRINT( pid, "\n-- generating system matrix... time (ms) " << timer.time() << std::endl );
 
 			if( rc != grb::SUCCESS ) {
-				std::cerr << "Failure to generate the initial system (" << toString( rc ) << ") of size " << n << std::endl;
+				MASTER_PRINT( pid, "Failure to generate the initial system ("
+					<< toString( rc ) << ") of size " << n << "\n" );
 				return rc;
 			}
 
-			// set values of diagonal vector
+			// set values of vectors
+			MASTER_PRINT( pid, "-- populating vectors..." );
+			timer.reset();
 			set( data->A_diagonal, params.diag_value );
+			data->zero_temp_vectors();
+			MASTER_PRINT( pid, " time (ms) " << timer.time() << std::endl );
+
 
+			MASTER_PRINT( pid, "-- generating color masks...\n" << std::endl );
+			timer.reset();
 			build_static_color_masks( data->color_masks, n, params.num_colors );
+			MASTER_PRINT( pid, "\n\n-- generating color masks... time (ms) " << timer.time() << std::endl );
 
 			// initialize coarsening with additional pointers and dimensions copies to iterate and divide
 			grb::algorithms::multi_grid_data< T, T > ** coarser = &data->coarser_level;
@@ -124,20 +145,45 @@ namespace grb {
 				grb::algorithms::multi_grid_data< double, double > * new_coarser { new grb::algorithms::multi_grid_data< double, double >( coarser_size, previous_size ) };
 				// install coarser level immediately to cleanup in case of build error
 				*coarser = new_coarser;
+
+				MASTER_PRINT( pid, "-- level " << coarsening_level << "\n\tgenerating coarsening matrix...\n" );
+				timer.reset();
 				// initialize coarsener matrix, system matrix and diagonal vector for the coarser level
 				rc = build_ndims_coarsener_matrix< DIMS >( new_coarser->coarsening_matrix, coarser_sizes, previous_sizes );
 				if( rc != grb::SUCCESS ) {
-					std::cerr << "Failure to generate coarsening matrix (" << toString( rc ) << ")." << std::endl;
+					MASTER_PRINT( pid, "Failure to generate coarsening matrix (" << toString( rc ) << ").\n" );
 					return rc;
 				}
+				double coarsener_gen_time{ timer.time() };
+
+				MASTER_PRINT( pid, "\tgenerating system matrix...\n" );
+				timer.reset();
 				rc = build_ndims_system_matrix< DIMS, T >( new_coarser->A, coarser_sizes, params.halo_size, params.diag_value, params.non_diag_value );
 				if( rc != grb::SUCCESS ) {
-					std::cerr << "Failure to generate system matrix (" << toString( rc ) << ")for size " << coarser_size << std::endl;
+					MASTER_PRINT( pid, "Failure to generate system matrix (" << toString( rc )
+						<< ") for size " << coarser_size << "\n" );
 					return rc;
 				}
+				double coarse_sys_gen_time{ timer.time() };
+
+				MASTER_PRINT( pid, "\tpopulating vectors...\n" );
+				timer.reset();
 				set( new_coarser->A_diagonal, params.diag_value );
+				new_coarser->zero_temp_vectors();
+				double coarser_vec_gen_time{ timer.time() };
+
 				// build color masks for coarser level (same masks, but with coarser system size)
+				MASTER_PRINT( pid, "\tgenerating color masks..." << std::endl );
+				timer.reset();
 				rc = build_static_color_masks( new_coarser->color_masks, coarser_size, params.num_colors );
+				double coarse_masks_sys_time{ timer.time() };
+				MASTER_PRINT( pid, "-- level " << coarsening_level << "... time (ms) for "
+					"[coarsening matrix,coarse system matrix,coarser vectors,color masks]:"
+					<< coarsening_level << "," << coarsener_gen_time
+					<< "," << coarse_sys_gen_time
+					<< "," << coarser_vec_gen_time
+					<< "," << coarse_masks_sys_time << std::endl;
+				);
 
 				// prepare for new iteration
 				coarser = &new_coarser->coarser_level;
@@ -152,4 +198,9 @@ namespace grb {
 	} // namespace algorithms
 } // namespace grb
 
+#ifdef INTERNAL_MASTER_PRINT
+#undef INTERNAL_MASTER_PRINT
+#undef MASTER_PRINT
+#endif
+
 #endif // _H_GRB_ALGORITHMS_SYSTEM_BUILDING_UTILS
diff --git a/include/graphblas/utils/geometry/array_vector_storage.hpp b/include/graphblas/utils/geometry/array_vector_storage.hpp
new file mode 100644
index 000000000..451364754
--- /dev/null
+++ b/include/graphblas/utils/geometry/array_vector_storage.hpp
@@ -0,0 +1,67 @@
+
+#ifndef _ARRAY_VECTOR_STORAGE_H_
+#define _ARRAY_VECTOR_STORAGE_H_
+
+#include <array>
+#include <stdexcept>
+#include <algorithm>
+
+namespace grb {
+	namespace utils {
+		namespace geometry {
+
+template< typename T, std::size_t DIMS > class array_vector_storage: public std::array< T, DIMS > {
+	// Storage for exactly DIMS elements of type T, built on top of std::array< T, DIMS >.
+public:
+	// types returned by storage(): references to the underlying std::array
+	using vector_storage = std::array< T, DIMS >&;
+	using const_vector_storage = const std::array< T, DIMS >&;
+	// _dimensions must equal DIMS, otherwise std::invalid_argument is thrown; no element is assigned here
+	array_vector_storage( std::size_t _dimensions ) {
+		static_assert( DIMS > 0, "cannot allocate 0-sized array" );
+		if( _dimensions != DIMS ) {
+			throw std::invalid_argument("given dimensions must match the type dimensions");
+		}
+	}
+	// construction always requires the number of dimensions
+	array_vector_storage() = delete;
+
+	// only copy constructor/assignment, since there's no external storage
+	array_vector_storage( const array_vector_storage< T, DIMS >& o ) noexcept {
+		std::copy_n( o.cbegin(), DIMS, this->begin() );
+	}
+
+	/*
+	array_vector_storage( array_vector_storage< T >&& o ) {
+		std::copy_n( o._storage.cbegin(), DIMS, this->_storage.cbegin() );
+	}
+	*/
+	// copies all DIMS elements of original into this object
+	array_vector_storage< T, DIMS >& operator=( const array_vector_storage< T, DIMS > &original ) noexcept {
+		std::copy_n( original.begin(), DIMS, this->begin() );
+		return *this;
+	}
+
+	//array_vector_storage< T, DIMS >& operator=( array_vector_storage< T, DIMS > &&original ) = delete;
+
+	~array_vector_storage() {}
+	// number of stored elements, i.e. the template parameter DIMS
+	constexpr std::size_t dimensions() const {
+		return DIMS;
+	}
+	// access to the underlying std::array (this object itself, seen as its base class)
+	inline vector_storage storage() {
+		return *this;
+	}
+	// read-only access to the underlying std::array
+	inline const_vector_storage storage() const {
+		return *this;
+	}
+
+};
+
+		} // namespace geometry
+	} // namespace utils
+} // namespace grb
+
+#endif // _ARRAY_VECTOR_STORAGE_H_
diff --git a/include/graphblas/utils/geometry/generic_vector_storage.hpp b/include/graphblas/utils/geometry/generic_vector_storage.hpp
new file mode 100644
index 000000000..166dad3b8
--- /dev/null
+++ b/include/graphblas/utils/geometry/generic_vector_storage.hpp
@@ -0,0 +1,117 @@
+
+#ifndef _GENERIC_VECTOR_STORAGE_H_
+#define _GENERIC_VECTOR_STORAGE_H_
+
+#include <algorithm>
+#include <cstddef>
+#include <stdexcept>
+namespace grb {
+	namespace utils {
+		namespace geometry {
+
+template< typename T > class generic_vector_storage {
+	// heap-allocated array of _dimensions elements of type T, owned exclusively by this object
+	std::size_t _dimensions;
+	T* _storage;
+
+	void clean() {
+		delete[] this->_storage; // delete[] on a null pointer is a no-op, hence no check is needed
+	}
+
+public:
+
+	using reference = T&;
+	using const_reference = const T&;
+	using iterator = T*;
+	using const_iterator = const T*;
+	using pointer = T*;
+	using const_pointer = const T*;
+	using vector_storage = T*;
+	using const_vector_storage = T*; // NOTE(review): not const-qualified, so storage() const returns a mutable pointer; confirm intent
+	// allocates __dimensions default-initialized elements; throws std::invalid_argument if __dimensions is 0
+	generic_vector_storage( std::size_t __dimensions ):
+		_dimensions( __dimensions ) {
+		if( __dimensions == 0 ) {
+			throw std::invalid_argument("dimensions cannot be 0");
+		}
+		this->_storage = new T[ __dimensions ];
+	}
+
+	generic_vector_storage() = delete;
+	// deep copy: allocates a new array of the same size and copies all elements of o
+	generic_vector_storage( const generic_vector_storage< T >& o ):
+		_dimensions( o._dimensions ), _storage( new T[ o._dimensions ] ) {
+		std::copy_n( o._storage, o._dimensions, this->_storage );
+	}
+
+	generic_vector_storage( generic_vector_storage< T >&& o ) = delete;
+	// deep copy assignment; the array is re-allocated only if the number of dimensions differs
+	generic_vector_storage< T >& operator=( const generic_vector_storage< T > &original ) {
+		if( original._dimensions != this->_dimensions ) {
+			// allocate BEFORE releasing the old array: if new throws, this object is left
+			// untouched and its destructor does not delete an already-deleted array
+			T * const fresh = new T[ original._dimensions ];
+			this->clean();
+			this->_storage = fresh;
+			this->_dimensions = original._dimensions;
+		}
+		std::copy_n( original._storage, original._dimensions, this->_storage );
+		return *this;
+	}
+
+	generic_vector_storage< T >& operator=( generic_vector_storage< T > &&original ) = delete;
+
+	~generic_vector_storage() {
+		this->clean();
+	}
+	// number of elements in the storage
+	std::size_t dimensions() const {
+		return this->_dimensions;
+	}
+	// iterators are raw pointers into the array: [ begin(), end() ) spans all the elements
+	inline iterator begin() {
+		return this->_storage;
+	}
+
+	inline iterator end() {
+		return this->_storage + this->_dimensions;
+	}
+
+	inline const_iterator begin() const {
+		return this->_storage;
+	}
+
+	inline const_iterator end() const {
+		return this->_storage + this->_dimensions;
+	}
+
+	inline const_iterator cbegin() const {
+		return this->_storage;
+	}
+
+	inline const_iterator cend() const {
+		return this->_storage + this->_dimensions;
+	}
+	// raw pointer to the first element of the array
+	inline vector_storage storage() {
+		return this->_storage;
+	}
+
+	inline const_vector_storage storage() const {
+		return this->_storage;
+	}
+	// unchecked element access: pos must be < dimensions()
+	inline reference operator[]( std::size_t pos ) {
+		return *( this->_storage + pos);
+	}
+
+	inline const_reference operator[]( std::size_t pos ) const {
+		return *( this->_storage + pos );
+	}
+
+};
+
+		} // namespace geometry
+	} // namespace utils
+} // namespace grb
+
+#endif // _GENERIC_VECTOR_STORAGE_H_
diff --git a/include/graphblas/utils/geometry/linearized_halo_ndim_geometry.hpp b/include/graphblas/utils/geometry/linearized_halo_ndim_geometry.hpp
new file mode 100644
index 000000000..4d7fd62ce
--- /dev/null
+++ b/include/graphblas/utils/geometry/linearized_halo_ndim_geometry.hpp
@@ -0,0 +1,232 @@
+
+#ifndef _LINEARIZED_HALO_NDIM_GEOMETRY_H_
+#define _LINEARIZED_HALO_NDIM_GEOMETRY_H_
+
+#include <cstddef>
+#include <vector>
+#include <array>
+#include <cassert>
+#include <stdexcept>
+#include <string>
+
+#include "linearized_ndim_system.hpp"
+#include "array_vector_storage.hpp"
+#include "generic_vector_storage.hpp"
+#include "ndim_vector.hpp"
+
+namespace grb {
+	namespace utils {
+		namespace geometry {
+
+// Computes the neighborhood of the element at system_coordinates: for every dimension it
+// stores into neighbors_start the first coordinate and into neighbors_range the number of
+// coordinates of the interval reaching at most halo positions on each side of the element,
+// cut at the borders of the system, i.e. kept within [0, _system_sizes[dim] - 1].
+template< typename CoordT, std::size_t DIMS > void __compute_neighbors_range(
+	const array_vector_storage< CoordT, DIMS >& _system_sizes,
+	const CoordT halo,
+	const array_vector_storage< CoordT, DIMS >& system_coordinates,
+	array_vector_storage< CoordT, DIMS >& neighbors_start,
+	array_vector_storage< CoordT, DIMS >& neighbors_range ) {
+
+	for( std::size_t dim = 0; dim < DIMS; ++dim ) {
+		const CoordT center = system_coordinates[ dim ];
+		// the lower bound saturates at 0 instead of subtracting past it
+		const CoordT first = ( center > halo ) ? center - halo : 0;
+		// the upper bound saturates at the last valid coordinate of this dimension
+		const CoordT last = std::min( center + halo, _system_sizes[ dim ] - 1 );
+		neighbors_start[ dim ] = first;
+		neighbors_range[ dim ] = last - first + 1;
+	}
+}
+
+
+// Maps the linear index `neighbor` of the halo system to the element it belongs to: the element
+// coordinates are written into result, dimension by dimension from the last to the first, and the
+// value returned is what is left of `neighbor` once the counts skipped in every dimension are
+// subtracted. Throws std::invalid_argument if neighbor is greater than system_size.
+template< typename CoordT, std::size_t DIMS > std::size_t __neighbour_to_system_coords(
+	const std::array< CoordT, DIMS > & sizes,
+	std::size_t system_size,
+	const std::vector< ndim_vector< CoordT, CoordT, generic_vector_storage< CoordT > > > & dimension_neighbors,
+	CoordT halo,
+	CoordT neighbor,
+	array_vector_storage< CoordT, DIMS > & result) {
+
+	if( neighbor > system_size ) { // NOTE(review): neighbor == system_size passes although the message says ">=" - confirm intent
+		throw std::invalid_argument("neighbor number ( " + std::to_string(neighbor)
+			+ " ) >= system size ( " + std::to_string( system_size ) + " )");
+	}
+
+	array_vector_storage< CoordT, DIMS > halo_coords( DIMS ); // coordinates used to look up dimension_neighbors
+#ifdef DBG
+	std::size_t * const halo_coords_end{ halo_coords.data() + DIMS };
+#endif
+	std::fill_n( halo_coords.begin(), DIMS, 0 );
+
+	for( std::size_t _dim{DIMS}; _dim > 0; _dim--) { // dimensions are visited from the last to the first
+
+		const std::size_t dimension{_dim - 1};
+		const std::size_t dimension_size{ sizes[dimension] };
+		const ndim_vector< CoordT, CoordT, generic_vector_storage< CoordT > > & neighbors{ dimension_neighbors[dimension] };
+
+		CoordT * const halo_coords_begin{ halo_coords.data() + dimension }; // lookups start from the slot of this dimension
+
+#ifdef DBG
+		std::cout << "DIMENSION " << dimension << std::endl << "- setup - neighbour " << neighbor << std::endl;
+		std::cout << "\thalo : ";
+		print_sequence( halo_coords_begin, halo_coords_end ) << std::endl;
+#endif
+
+		std::size_t h{0};
+		std::size_t previous_neighs{ 0 };
+		*halo_coords_begin = h;
+		std::size_t halo_max_neighs{ neighbors.at( halo_coords_begin ) };
+		// lower border: try h = 0, 1, ... as long as neighbor lies beyond the counts already skipped
+		while( h < halo && neighbor >= previous_neighs + halo_max_neighs ) {
+			h++;
+			*halo_coords_begin = h;
+			previous_neighs += halo_max_neighs;
+			halo_max_neighs = neighbors.at( halo_coords_begin );
+		}
+#ifdef DBG
+		std::cout << "- initial halo - neighbour " << neighbor << std::endl;
+		std::cout << "\th " << h << std::endl;
+		std::cout << "\thalo : ";
+		print_sequence( halo_coords_begin, halo_coords_end ) << std::endl;
+		std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
+#endif
+
+		// the scan stopped within the lower border: the coordinate of this dimension is h
+		if ( h < halo ){
+			result[dimension] = h;
+			neighbor -= previous_neighs;
+#ifdef DBG
+			std::cout << "end neighbour " << neighbor << std::endl;
+#endif
+			continue;
+		}
+		// saturation occurred: h == halo, and each of the ( dimension_size - 2 * halo ) middle coordinates weighs halo_max_neighs
+		const std::size_t distance_from_halo{ ( neighbor - previous_neighs ) / halo_max_neighs };
+#ifdef DBG
+		std::cout << "- before middle elements - neighbour " << neighbor << std::endl;
+		std::cout << "\tprevious_neighs " << previous_neighs << std::endl;
+		std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
+		std::cout << "\tdistance_from_halo " << distance_from_halo << std::endl;
+		std::cout << "\tdimension_size " << dimension_size << std::endl;
+#endif
+		if ( distance_from_halo < dimension_size - 2 * halo ) {
+			result[dimension] =  distance_from_halo + halo;
+			neighbor -= (previous_neighs + distance_from_halo * halo_max_neighs) ;
+#ifdef DBG
+			std::cout << "end neighbour " << neighbor << std::endl;
+#endif
+			continue;
+		}
+		previous_neighs += ( dimension_size - 2 * halo ) * halo_max_neighs; // skip all the middle coordinates
+#ifdef DBG
+		std::cout << "- after middle elements -neighbour " << neighbor << std::endl;
+		std::cout << "\tprevious_neighs " << previous_neighs << std::endl;
+		std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
+#endif
+
+		h = halo - 1; // upper border: same scan as the lower one, with h decreasing from halo - 1
+		*halo_coords_begin = h;
+		halo_max_neighs = neighbors.at( halo_coords_begin );
+		while( h > 0 && neighbor >= previous_neighs + halo_max_neighs ) {
+			h--;
+			*halo_coords_begin = h;
+			previous_neighs += halo_max_neighs;
+			halo_max_neighs = neighbors.at( halo_coords_begin );
+		}
+		neighbor -= previous_neighs;
+#ifdef DBG
+		std::cout << "- final halo - neighbour " << neighbor << std::endl;
+		std::cout << "\tadding h " << h << " previous_neighs " << previous_neighs << std::endl;
+#endif
+		// ( dimension_size - 1 ) because coordinates are 0-based and neighbor
+		// is "inside" range [ previous_neighs, previous_neighs + halo_max_neighs ]
+		result[dimension] = dimension_size - 1 - h;
+#ifdef DBG
+		std::cout << "end neighbour " << neighbor << std::endl;
+#endif
+	}
+
+	return neighbor;
+}
+
+// Sums the counts stored in prev_neighs along a dimension of local_size elements: for each of
+// the (at most halo) outer layers the count found at depth h is added twice, once per side, then
+// every remaining inner element adds the count found at the depth where the scan stopped.
+template< typename CoordT > std::size_t __accumulate_dimension_neighbours(
+	const ndim_vector< CoordT, CoordT, generic_vector_storage< CoordT > >& prev_neighs,
+	CoordT* coords_buffer, std::size_t halo, std::size_t local_size ) {
+	std::size_t total = 0;
+	std::size_t depth = 0;
+	while( depth < halo && local_size > 1 ) {
+		*coords_buffer = depth; // only the first slot of the buffer is rewritten
+		const std::size_t layer = prev_neighs.at( coords_buffer );
+		total += 2 * layer; // the 2 sides
+		local_size -= 2;
+		++depth;
+	}
+	*coords_buffer = depth;
+	total += local_size * prev_neighs.at( coords_buffer ); // innermost elements
+	return total;
+}
+
+// Stores into every cell of container the product, over the coordinates h of the cell, of ( h + 1 + halo ).
+template< typename CoordT > void __populate_halo_neighbors( std::size_t halo,
+    ndim_vector< CoordT, CoordT, generic_vector_storage< CoordT > >& container ) {
+	using domain_it = typename ndim_vector< CoordT, CoordT, generic_vector_storage< CoordT > >::domain_iterator;
+	domain_it last{ container.domain_end() };
+	for( domain_it cell{ container.domain_begin() }; cell != last; ++cell ) {
+		std::size_t product{1};
+		for( std::size_t h : cell->get_position() ) { product *= ( h + 1 + halo ); }
+		container.at( cell->get_position() ) = product;
+	}
+}
+
+template< typename CoordT, std::size_t DIMS > std::size_t __init_halo_search(
+    typename linearized_ndim_system< CoordT, array_vector_storage< CoordT, DIMS > >::const_vector_reference sizes,
+    std::size_t halo,
+	std::vector< ndim_vector< CoordT, CoordT, generic_vector_storage< CoordT > > >& dimension_limits ) {
+	// Appends DIMS tables to dimension_limits, fills them, and returns the total accumulated over the last table.
+    using nd_vec = ndim_vector< CoordT, CoordT, generic_vector_storage< CoordT > >;
+    using nd_vec_iterator = typename nd_vec::domain_iterator;
+	// table 0 is built from DIMS values equal to halo + 1 (presumably its per-dimension sizes - confirm in ndim_vector)
+	std::vector<std::size_t> halo_sizes( DIMS, halo + 1);
+	dimension_limits.emplace_back(halo_sizes);
+
+	// initialize values
+	__populate_halo_neighbors< CoordT >( halo, dimension_limits[0] );
+	for( std::size_t i{1}; i < DIMS; i++ ) { // table i is built from DIMS - i values equal to halo + 1
+		std::vector<std::size_t> halos( DIMS - i, halo + 1 );
+		dimension_limits.emplace_back(halos);
+	}
+
+    std::array< CoordT, DIMS > prev_coords_buffer; // store at most DIMS values
+    CoordT* const prev_coords{ prev_coords_buffer.data() };
+	CoordT* const second{ prev_coords + 1 }; // store previous coordinates from second position
+	for( std::size_t dimension{1}; dimension < DIMS; dimension++ ) { // each table is derived from the previous one
+		const nd_vec& prev_neighs{dimension_limits[dimension - 1]};
+		nd_vec& current_neighs{dimension_limits[dimension]};
+
+		nd_vec_iterator end{ current_neighs.domain_end() };
+		for( nd_vec_iterator it{ current_neighs.domain_begin() }; it != end; ++it ) {
+			typename nd_vec::const_domain_vector_reference current_halo_coords{ it->get_position() };
+			// the first slot of prev_coords is left free: __accumulate_dimension_neighbours() rewrites it while scanning
+			std::copy( it->get_position().cbegin(), it->get_position().cend(), second );
+			std::size_t local_size{ sizes[dimension - 1] };
+			const std::size_t neighs{ __accumulate_dimension_neighbours(prev_neighs, prev_coords, halo, local_size) };
+			current_neighs.at(current_halo_coords) = neighs;
+		}
+	}
+	return __accumulate_dimension_neighbours( dimension_limits[DIMS - 1], prev_coords, halo, sizes.back() );
+}
+
+		} // namespace geometry
+	} // namespace utils
+} // namespace grb
+
+#endif // _LINEARIZED_HALO_NDIM_GEOMETRY_H_
diff --git a/include/graphblas/utils/geometry/linearized_halo_ndim_iterator.hpp b/include/graphblas/utils/geometry/linearized_halo_ndim_iterator.hpp
new file mode 100644
index 000000000..ede3af52c
--- /dev/null
+++ b/include/graphblas/utils/geometry/linearized_halo_ndim_iterator.hpp
@@ -0,0 +1,377 @@
+
+#ifndef _LINEARIZED_HALO_NDIM_ITERATOR_H_
+#define _LINEARIZED_HALO_NDIM_ITERATOR_H_
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+#include <iterator>
+#include <limits>
+
+#include "linearized_ndim_system.hpp"
+#include "array_vector_storage.hpp"
+#include "linearized_ndim_iterator.hpp"
+
+namespace grb {
+	namespace utils {
+		namespace geometry {
+
+// forward declaration
+template< typename CoordT, std::size_t DIMS > class linearized_halo_ndim_system;
+
+template< typename CoordT, std::size_t DIMS > class linearized_halo_ndim_iterator {
+
+	using system_t = linearized_halo_ndim_system< CoordT, DIMS >;
+	using vector_t = array_vector_storage< CoordT, DIMS >;
+	using vector_iter = linearized_ndim_iterator< CoordT, vector_t >;
+public:
+
+	//using vector_t = typename vector_iter::vector_t;
+	using const_vector_reference = typename vector_iter::const_vector_reference;
+
+
+
+	// Value exposed by the iterator: the current element, one of its neighbors and the linear
+	// position of this (element, neighbor) pair.
+	// The state is private: only the enclosing iterator, declared as friend, updates it, while
+	// users read it through the getters.
+	struct halo_ndim_point {
+	private:
+
+		const system_t* _system; // system used to linearize the coordinates
+		vector_iter _element_iter; // iterator over the coordinates of the elements
+		vector_t _neighbor; // coordinates of the current neighbor
+		CoordT _position; // linear position of the (element, neighbor) pair
+
+	public:
+
+		friend linearized_halo_ndim_iterator< CoordT, DIMS>;
+
+		// copyable, but neither default-constructible nor movable
+		halo_ndim_point() = delete;
+
+		halo_ndim_point( const halo_ndim_point& ) = default;
+
+		halo_ndim_point( halo_ndim_point&& ) = delete;
+
+		halo_ndim_point& operator=( const halo_ndim_point& ) = default;
+
+		// Creates the point for the element at the all-zero coordinates of system, with an
+		// all-zero neighbor and position 0.
+		halo_ndim_point( const system_t& system ) noexcept :
+			_system( &system ), _element_iter( system ), _neighbor( DIMS ), _position( 0 )
+		{
+			std::fill_n( _neighbor.begin(), DIMS, 0 );
+		}
+
+		// Coordinates of the current element.
+		const_vector_reference get_element() const {
+			return _element_iter->get_position();
+		}
+
+		// Linear index of the current element, as computed by the system.
+		std::size_t get_element_linear() const {
+			return _system->ndim_to_linear( _element_iter->get_position() );
+		}
+
+		// Coordinates of the current neighbor.
+		const_vector_reference get_neighbor() const {
+			return _neighbor;
+		}
+
+		// Linear index of the current neighbor, as computed by the system.
+		std::size_t get_neighbor_linear() const {
+			return _system->ndim_to_linear( _neighbor );
+		}
+
+		// Linear position of the (element, neighbor) pair.
+		CoordT get_position() const {
+			return _position;
+		}
+	};
+
+
+
+
+
+
+	using const_point_reference = const struct halo_ndim_point&;
+	using const_point_pointer = const struct halo_ndim_point*;
+
+	// interface for std::random_access_iterator
+	using iterator_category = std::random_access_iterator_tag;
+	using value_type = halo_ndim_point;
+	using pointer = const halo_ndim_point*;
+	using reference = const halo_ndim_point&;
+	using difference_type = signed long;
+
+private:
+
+	halo_ndim_point _point;
+	linearized_ndim_system< CoordT, vector_t > _neighbors_linearizer;
+	vector_iter _neighbor_iter; // iterator in the sub-space of neighbors (0-based)
+	vector_t _neighbors_start;
+	vector_iter _neighbor_end;
+
+	inline void __update_neighbor() {
+		// absolute neighbor coordinate = start of the neighborhood + 0-based offset inside it
+		for( std::size_t dim = 0; dim < DIMS; ++dim ) {
+			_point._neighbor[ dim ] = _neighbors_start[ dim ] + _neighbor_iter->get_position()[ dim ];
+		}
+	}
+
+	/*
+	void __update_neighbor_linear() {
+		(this->_point)._neighbor_linear =
+			this->_system.ndim_to_linear( this->_point._neighbor );
+	}
+	*/
+
+	inline void on_neighbor_iter_update() {
+		// to be called after _neighbor_iter changes: the exposed neighbor coordinates depend on it
+		__update_neighbor();
+	}
+
+	/*
+	void __update_coordinates_linear() {
+		(this->_point)._coordinates_linear =
+			this->_system.ndim_to_linear( *this->_element_iter );
+	}
+	*/
+
+	// Re-derives the neighborhood of the element _point currently refers to:
+	//  - _neighbors_start receives the first coordinate of the neighborhood in each dimension;
+	//  - _neighbors_linearizer is re-targeted to the per-dimension extent of the neighborhood.
+	// _neighbor_iter and _neighbor_end are not touched here: on_element_advance() and
+	// operator+= rebuild them after calling this method.
+	void on_element_update() {
+		vector_t extent( DIMS ); // filled by compute_neighbors_range() below
+
+		// ask the system for start and extent of the neighborhood of the current element
+		_point._system->compute_neighbors_range(
+			_point._element_iter->get_position(),
+			_neighbors_start,
+			extent
+		);
+
+		// re-target _neighbors_linearizer
+		_neighbors_linearizer.retarget( extent );
+	}
+
+	void on_element_advance() {
+		// in order: neighborhood bounds of the new element, neighbor iterators rewound over it,
+		// exposed neighbor coordinates refreshed
+		on_element_update();
+		_neighbor_iter = vector_iter( _neighbors_linearizer );
+		_neighbor_end = vector_iter::make_system_end_iterator( _neighbors_linearizer );
+		on_neighbor_iter_update();
+	}
+
+public:
+
+	linearized_halo_ndim_iterator() = delete;
+
+	// Builds the iterator at position 0: element at the all-zero coordinates, all-zero neighbor.
+	linearized_halo_ndim_iterator( const system_t& system ) noexcept :
+		_point( system ),
+		_neighbors_linearizer( DIMS, system.halo() + 1 ), // presumably DIMS sizes of halo + 1 - confirm against the linearizer
+		_neighbor_iter( _neighbors_linearizer ),
+		_neighbors_start( DIMS ),
+		_neighbor_end( vector_iter::make_system_end_iterator( _neighbors_linearizer ) ) {
+		std::fill_n( _neighbors_start.begin(), DIMS, 0 );
+	}
+
+
+	/*
+	linearized_halo_ndim_iterator( const linearized_halo_ndim_iterator< CoordT, DIMS >& original ) noexcept:
+		_coordinates_linearizer( original._coordinates_linearizer ),
+		_halo( original._halo ),
+		_dimension_limits( original._dimension_limits ),
+		_neighbors_linearizer( original._neighbors_linearizer ),
+		_element_iter( original._element_iter ),
+		_neighbor_iter( original._neighbor_iter ),
+		_neighbor_end( original._neighbor_end ),
+		_neighbors_start( original._neighbors_start ),
+		_point( original._point ) {}
+	*/
+
+	linearized_halo_ndim_iterator( const linearized_halo_ndim_iterator< CoordT, DIMS >& ) = default;
+
+	//linearized_halo_ndim_iterator( linearized_halo_ndim_iterator< CoordT, DIMS >&& original ) = delete;
+
+	/*
+	linearized_halo_ndim_iterator< CoordT, DIMS >& operator=(
+		const linearized_halo_ndim_iterator< CoordT, DIMS >& original ) noexcept {
+		this->_coordinates_linearizer = original._coordinates_linearizer;
+		this->_halo = original._halo;
+		this->_dimension_limits = original._dimension_limits;
+		this->_neighbors_linearizer = original._neighbors_linearizer;
+		this->_element_iter = original._element_iter;
+		this->_coordinates_linear = original._coordinates_linear;
+		this->_neighbor_iter = original._neighbor_iter;
+		this->_neighbor_end = original._neighbor_end;
+		this->_neighbor = original._neighbor;
+		this->_neighbors_start = original._neighbors_start;
+		this->_neighbor_linear = original._neighbor_linear;
+	}
+	*/
+
+	linearized_halo_ndim_iterator< CoordT, DIMS >& operator=( const linearized_halo_ndim_iterator< CoordT, DIMS >& ) = default;
+
+	//linearized_halo_ndim_iterator< CoordT, DIMS >& operator=( linearized_halo_ndim_iterator< CoordT, DIMS >&& ) = delete;
+
+	bool operator!=( const linearized_halo_ndim_iterator< CoordT, DIMS >& other ) const {
+		// two iterators differ exactly when their linear positions differ; element and
+		// neighbor coordinates are not compared
+		return _point._position != other._point._position;
+	}
+
+	const_point_reference operator*() const { // read-only view of the current point
+		return _point;
+	}
+
+	const_point_pointer operator->() const { // pointer flavour of operator*()
+		return &_point;
+	}
+
+	bool has_more_neighbours() const { // false once the neighbor iterator equals its end
+		return _neighbor_iter != _neighbor_end;
+	}
+
+	// Steps to the next neighbor of the current element:
+	//  - the neighbor iterator advances by one;
+	//  - the neighbor coordinates of the exposed point are refreshed;
+	//  - the linear position grows by one.
+	// The element is never changed here, not even when the neighbors are exhausted:
+	// has_more_neighbours() tells whether the neighbor iterator has reached its end.
+	void next_neighbour() {
+		++_neighbor_iter;
+		on_neighbor_iter_update();
+		++_point._position;
+	}
+
+	bool has_more_elements() const { // true until the element's linear index equals the base system size
+		return _point.get_element_linear() != _point._system->base_system_size();
+	}
+
+	// Jumps to the first neighbor of the next element, skipping the neighbors not visited yet.
+	void next_element() {
+		// both values are read before on_element_advance() re-targets the linearizer
+		const std::size_t neighborhood_size = _neighbors_linearizer.system_size();
+		const std::size_t visited = _neighbors_linearizer.ndim_to_linear_offset( _neighbor_iter->get_position() );
+		++_point._element_iter;
+		on_element_advance();
+		// back to the first neighbor of the old element, then past its whole neighborhood
+		_point._position -= visited;
+		_point._position += neighborhood_size;
+	}
+
+	// Advances by one (element, neighbor) pair: the next neighbor of the current element or,
+	// when its neighbors are exhausted, the first neighbor of the next element.
+	linearized_halo_ndim_iterator< CoordT, DIMS >& operator++() noexcept {
+		++_neighbor_iter;
+		if( has_more_neighbours() ) {
+			on_neighbor_iter_update();
+		} else {
+			++_point._element_iter;
+			on_element_advance();
+		}
+		++_point._position;
+		return *this;
+	}
+
+
+
+	// Advances by offset (element, neighbor) pairs, recomputing element and neighbor from the
+	// target linear position instead of stepping one pair at a time.
+	// Throws std::range_error if the target position exceeds the size of the halo system.
+	linearized_halo_ndim_iterator< CoordT, DIMS >& operator+=( std::size_t offset ) {
+		if( offset == 1UL ) {
+			return operator++(); // single step: delegate to the increment
+		}
+		const system_t& system = *_point._system;
+		const std::size_t target = _point._position + offset;
+		if( target > system.halo_system_size() ) {
+			throw std::range_error( "neighbor linear value beyond system" );
+		}
+
+		// element owning the target position, and index of the neighbor inside its neighborhood
+		vector_t coords( DIMS );
+		const std::size_t neighbor_index = system.neighbour_linear_to_element( target, coords );
+		_point._element_iter = vector_iter( system, coords.cbegin() );
+		_point._position = target;
+
+		// rebuild the neighborhood of that element; coords is then reused for the neighbor offset
+		on_element_update();
+		_neighbors_linearizer.linear_to_ndim( neighbor_index, coords );
+		_neighbor_iter = vector_iter( _neighbors_linearizer, coords.cbegin() );
+		_neighbor_end = vector_iter::make_system_end_iterator( _neighbors_linearizer );
+		on_neighbor_iter_update();
+		return *this;
+	}
+
+	// Signed distance (this - other) between the linear positions of the two iterators:
+	// negative when this iterator precedes other.
+	// Throws std::invalid_argument if the distance does not fit into difference_type.
+	difference_type operator-( const linearized_halo_ndim_iterator< CoordT, DIMS >& other ) const {
+		const std::size_t a_pos{ _point.get_position() }, b_pos{ other._point.get_position() };
+		const bool negative{ a_pos < b_pos };
+		const std::size_t distance{ negative ? b_pos - a_pos : a_pos - b_pos };
+
+		if( distance > static_cast< std::size_t >( std::numeric_limits< difference_type >::max() ) ) {
+			throw std::invalid_argument( "iterators are too distant" );
+		}
+
+		// The magnitude is known to fit, so it is converted first and the sign is applied
+		// afterwards: converting the wrapped unsigned difference a_pos - b_pos directly to a
+		// signed type is implementation-defined before C++20.
+		const difference_type magnitude{ static_cast< difference_type >( distance ) };
+		return negative ? -magnitude : magnitude;
+	}
+
+
+
+
+	// implementation depending on logic in operator++
+	// Builds the past-the-end iterator of system, in three steps:
+	//  1. start from a regular iterator over system;
+	//  2. move its element iterator to the end of the element space, then recompute the
+	//     neighborhood and the neighbor iterators for that out-of-space element;
+	//  3. set the linear position to the size of the halo system.
+	// operator!= compares linear positions only, hence every iterator that reaches that
+	// position compares equal to the result, whatever its coordinates are.
+	// NOTE(review): the state built here is meant to mirror what operator++ leaves after the
+	// last pair; keep the two in sync when either changes.
+	static linearized_halo_ndim_iterator< CoordT, DIMS > make_system_end_iterator(
+		const system_t& system
+	) {
+		linearized_halo_ndim_iterator< CoordT, DIMS > past_the_end( system );
+
+		// go to the very first point outside of space
+		past_the_end._point._element_iter = vector_iter::make_system_end_iterator( system );
+
+		// keep the neighborhood state consistent with the element just set
+		past_the_end.on_element_advance();
+
+		// the position is what iterators are compared by
+		past_the_end._point._position = system.halo_system_size();
+
+		return past_the_end;
+	}
+
+};
+
+/*
+template< typename CoordT, std::size_t DIMS > linearized_halo_ndim_iterator< CoordT, DIMS >
+	operator+( const linearized_halo_ndim_iterator< CoordT, DIMS >& original, std::size_t increment ) {
+	linearized_halo_ndim_iterator< CoordT, DIMS > res( original );
+	return ( res += increment );
+}
+*/
+
+
+		} // namespace geometry
+	} // namespace utils
+} // namespace grb
+
+#endif // _LINEARIZED_HALO_NDIM_ITERATOR_H_
diff --git a/include/graphblas/utils/geometry/linearized_halo_ndim_system.hpp b/include/graphblas/utils/geometry/linearized_halo_ndim_system.hpp
new file mode 100644
index 000000000..f915492ac
--- /dev/null
+++ b/include/graphblas/utils/geometry/linearized_halo_ndim_system.hpp
@@ -0,0 +1,111 @@
+
+#ifndef _LINEARIZED_HALO_NDIM_SYSTEM_H_
+#define _LINEARIZED_HALO_NDIM_SYSTEM_H_
+
+#include <cstddef>
+#include <vector>
+#include <array>
+#include <cassert>
+
+#include "array_vector_storage.hpp"
+#include "linearized_ndim_system.hpp"
+#include "linearized_halo_ndim_geometry.hpp"
+#include "linearized_halo_ndim_iterator.hpp"
+
+namespace grb {
+	namespace utils {
+		namespace geometry {
+
+// only with array_vector_storage
+// System of DIMS dimensions whose elements are paired with their neighbors, i.e. the elements
+// within halo positions along every dimension; it linearizes the (element, neighbor) pairs.
+template< typename CoordT, std::size_t DIMS > class linearized_halo_ndim_system:
+	public linearized_ndim_system< CoordT, array_vector_storage< CoordT, DIMS > > {
+public:
+	using iterator = linearized_halo_ndim_iterator< CoordT, DIMS >;
+    using const_vector_reference = typename array_vector_storage< CoordT, DIMS >::const_vector_storage;
+	using self_t = linearized_halo_ndim_system< CoordT, DIMS >;
+	using base_t = linearized_ndim_system< CoordT, array_vector_storage< CoordT, DIMS > >;
+
+	// Builds the system from the per-dimension sizes and the halo.
+	// Throws std::invalid_argument if any size is smaller than 2 * halo + 1.
+    linearized_halo_ndim_system( const_vector_reference sizes, CoordT halo ):
+		base_t( sizes.cbegin(), sizes.cend() ),
+        _halo( halo ) {
+		for( CoordT dim_size : sizes ) {
+			if ( dim_size < 2 * halo + 1 ) {
+				throw std::invalid_argument(
+					std::string( "the halo (" + std::to_string(halo) +
+					std::string( ") goes beyond a system size (" ) +
+					std::to_string( dim_size) + std::string( ")" ) ) );
+			}
+		}
+
+        this->_system_size = __init_halo_search< CoordT, DIMS >(
+				this->get_sizes(),
+				_halo, this->_dimension_limits );
+		assert( this->_dimension_limits.size() == DIMS );
+    }
+
+	// not default-constructible and not movable; copy operations are the defaulted ones
+    linearized_halo_ndim_system() = delete;
+    linearized_halo_ndim_system( const self_t & ) = default;
+    linearized_halo_ndim_system( self_t && ) = delete;
+    ~linearized_halo_ndim_system() noexcept {}
+    self_t & operator=( const self_t & ) = default;
+    self_t & operator=( self_t && ) = delete;
+
+	iterator begin() const { // iterator at linear position 0
+		return iterator( *this );
+	}
+
+	iterator end() const { // past-the-end iterator, at position halo_system_size()
+		return iterator::make_system_end_iterator( *this );
+	}
+
+	std::size_t halo_system_size() const { // size of the halo system, computed by __init_halo_search()
+		return this->_system_size;
+	}
+
+	std::size_t base_system_size() const { // size of the underlying system, as reported by the base class
+		return this->base_t::system_size();
+	}
+
+    std::size_t halo() const { // halo passed at construction
+        return this->_halo;
+    }
+
+	// Computes, for the element at system_coordinates, the first coordinate (neighbors_start)
+	// and the per-dimension extent (neighbors_range) of its neighborhood.
+    void compute_neighbors_range(
+        const array_vector_storage< CoordT, DIMS >& system_coordinates,
+	    array_vector_storage< CoordT, DIMS >& neighbors_start,
+	    array_vector_storage< CoordT, DIMS >& neighbors_range) const noexcept {
+        __compute_neighbors_range( this->get_sizes(),
+            this->_halo,
+            system_coordinates,
+            neighbors_start,
+            neighbors_range
+        );
+    }
+
+	// Maps the linear index neighbor to its element (coordinates stored into result) and returns the
+	// index left over. Not noexcept: __neighbour_to_system_coords() throws std::invalid_argument.
+    std::size_t neighbour_linear_to_element (
+        CoordT neighbor,
+	    array_vector_storage< CoordT, DIMS > & result) const {
+        return __neighbour_to_system_coords( this->get_sizes(),
+        this->_system_size, this->_dimension_limits, this->_halo, neighbor, result );
+    }
+
+private:
+    CoordT _halo; // not const: a const member would make the defaulted copy assignment implicitly deleted
+    std::vector< ndim_vector< CoordT, CoordT, generic_vector_storage< CoordT > > > _dimension_limits;
+    std::size_t _system_size;
+};
+
+		} // namespace geometry
+	} // namespace utils
+} // namespace grb
+
+#endif // _LINEARIZED_HALO_NDIM_SYSTEM_H_
diff --git a/include/graphblas/utils/geometry/linearized_ndim_iterator.hpp b/include/graphblas/utils/geometry/linearized_ndim_iterator.hpp
new file mode 100644
index 000000000..20a6473cc
--- /dev/null
+++ b/include/graphblas/utils/geometry/linearized_ndim_iterator.hpp
@@ -0,0 +1,178 @@
+
+#ifndef _NDIM_ITERATOR_H_
+#define _NDIM_ITERATOR_H_
+
+#include <cstddef>
+#include <algorithm>
+#include <stdexcept>
+#include <type_traits>
+#include <limits>
+
+#include "array_vector_storage.hpp"
+
+
+namespace grb {
+	namespace utils {
+		namespace geometry {
+
+// forward declaration for default
+template< typename T, typename StorageT > class linearized_ndim_system;
+
+template< typename T, typename StorageT > class linearized_ndim_iterator {
+public:
+
+	using storage_t = StorageT;
+	using lin_t = linearized_ndim_system< T, storage_t >;
+	using const_vector_reference = const storage_t&;
+	using self_t = linearized_ndim_iterator< T, StorageT >;
+
+	// Value exposed by the iterator: coordinates bound to the system they belong to.
+	// Only the enclosing iterator, declared as friend, modifies them.
+	struct ndim_point {
+	private:
+
+		const lin_t* system; // pointer because of copy assignment
+		storage_t coords;
+
+	public:
+		friend self_t;
+
+		ndim_point() = delete;
+		ndim_point( const ndim_point& ) = default;
+		ndim_point( ndim_point&& ) = delete;
+		ndim_point& operator=( const ndim_point& ) = default;
+
+		// Creates the point at the all-zero coordinates of _system.
+		ndim_point( const lin_t& _system ) noexcept :
+			system( &_system ), coords( _system.dimensions() )
+		{
+			std::fill_n( coords.begin(), _system.dimensions(), 0 );
+		}
+
+		// Coordinates of the point.
+		inline const_vector_reference get_position() const {
+			return coords;
+		}
+
+		// Linear index of the point, as computed by the system.
+		std::size_t get_linear_position() const {
+			return system->ndim_to_linear( coords );
+		}
+	};
+
+
+	// interface for std::random_access_iterator
+	using iterator_category = std::random_access_iterator_tag;
+	using value_type = ndim_point;
+	using pointer = const value_type*;
+	using reference = const value_type&;
+	using difference_type = signed long;
+
+	// Iterator on the all-zero coordinates of _system.
+	linearized_ndim_iterator( const lin_t &_system ) noexcept : _p( _system ) {}
+
+	// Iterator on the coordinates read from begin: exactly _system.dimensions() values are copied.
+	template< typename IterT > linearized_ndim_iterator( const lin_t &_system, IterT begin ) noexcept :
+		_p( _system )
+	{
+		std::copy_n( begin, _system.dimensions(), _p.coords.begin() );
+	}
+
+	linearized_ndim_iterator() = delete;
+
+	linearized_ndim_iterator( const self_t& original ):
+		_p( original._p ) {}
+
+	self_t& operator=( const self_t& original ) = default;
+
+	//linearized_ndim_iterator( self_t&& original ) = delete;
+
+	//self_t operator=( self_t&& ) = delete;
+
+	~linearized_ndim_iterator() {}
+
+	// Moves to the next point in linear order: the first coordinate runs fastest and wraps to 0
+	// when it reaches its size, carrying into the following one. The last coordinate is never
+	// wrapped, so that stepping past the final point yields the state built by
+	// make_system_end_iterator().
+    self_t & operator++() noexcept {
+		const std::size_t last{ _p.system->dimensions() - 1 };
+		// the first N-1 coordinates are bounded by their sizes
+		for( std::size_t dim{ 0 }; dim < last; ++dim ) {
+			T& coord = _p.coords[ dim ];
+			const T next = coord + 1;
+			if( next < _p.system->get_sizes()[ dim ] ) {
+				coord = next;
+				return *this; // no wrap-around: nothing to carry
+			}
+			coord = 0; // wrap-around: carry into the next dimension
+		}
+
+		// every bounded coordinate wrapped: increment the last one, which is unbounded
+		++_p.coords[ last ];
+		return *this;
+	}
+
+    self_t & operator+=( std::size_t offset ) { // throws std::invalid_argument beyond the system size
+		const std::size_t target = _p.get_linear_position() + offset;
+		if( target > _p.system->system_size() ) {
+			throw std::invalid_argument("increment is too large");
+		}
+		_p.system->linear_to_ndim( target, _p.coords ); // from the linear index back to coordinates
+		return *this;
+	}
+
+	// Signed distance (this - other) in linear positions; throws if it does not fit difference_type.
+	difference_type operator-( const self_t &other ) const {
+		const std::size_t a_pos{ _p.get_linear_position() }, b_pos{ other._p.get_linear_position() };
+		const bool negative{ a_pos < b_pos };
+		const std::size_t distance{ negative ? b_pos - a_pos : a_pos - b_pos };
+		if( distance > static_cast< std::size_t >( std::numeric_limits< difference_type >::max() ) ) {
+			throw std::invalid_argument( "iterators are too distant" );
+		}
+		// sign applied after the conversion: casting a wrapped unsigned difference is implementation-defined (pre C++20)
+		const difference_type magnitude{ static_cast< difference_type >( distance ) };
+		return negative ? -magnitude : magnitude;
+	}
+
+	reference operator*() const { // read-only access to the current point
+		return _p;
+	}
+
+	pointer operator->() const { // pointer flavour of operator*()
+		return &_p;
+	}
+
+	// True if any coordinate differs; throws std::invalid_argument on a different number of dimensions.
+    bool operator!=( const self_t &o ) const {
+		const std::size_t dims{ _p.system->dimensions() };
+		if( dims != o._p.system->dimensions() ) {
+			throw std::invalid_argument("system sizes do not match");
+		}
+		for( std::size_t dim{ 0 }; dim < dims; ++dim ) {
+			if( !( _p.coords[ dim ] == o._p.coords[ dim ] ) ) { return true; }
+		}
+		return false;
+	}
+
+	// Builds the end iterator of _system: every coordinate is 0 except the last one, which equals
+	// the size of the last dimension, i.e. the state operator++ reaches past the final point.
+	static self_t
+		make_system_end_iterator( const lin_t &_system ) {
+		self_t result( _system ); // all the coordinates start at 0
+		const std::size_t last_dim{ _system.dimensions() - 1 };
+		// store last size in last position
+		result._p.coords[ last_dim ] = _system.get_sizes()[ last_dim ];
+		return result;
+	}
+
+private:
+	ndim_point _p;
+
+};
+
+		} // namespace geometry
+	} // namespace utils
+} // namespace grb
+
+#endif // _NDIM_ITERATOR_H_
diff --git a/include/graphblas/utils/geometry/linearized_ndim_system.hpp b/include/graphblas/utils/geometry/linearized_ndim_system.hpp
new file mode 100644
index 000000000..2916208ed
--- /dev/null
+++ b/include/graphblas/utils/geometry/linearized_ndim_system.hpp
@@ -0,0 +1,174 @@
+
+#ifndef _NDIM_SYSTEM_LINEARIZER_H_
+#define _NDIM_SYSTEM_LINEARIZER_H_
+
+#include <cstddef>
+#include <algorithm>
+#include <vector>
+#include <utility>
+#include <stdexcept>
+#include <cassert>
+#include <string>
+
+#include "ndim_system.hpp"
+#include "linearized_ndim_iterator.hpp"
+#include "array_vector_storage.hpp"
+
+
+namespace grb {
+	namespace utils {
+		namespace geometry {
+
+template< typename IterIn, typename IterOut >
+	std::size_t __compute_offsets( IterIn in_begin, IterIn in_end, IterOut out_begin ) {
+	std::size_t stride{ 1 }; // product of all sizes consumed so far
+	for( ; in_begin != in_end; ++out_begin, ++in_begin ) {
+		*out_begin = stride; // each output is the product of the preceding sizes
+		stride = stride * ( *in_begin );
+	}
+	return stride; // product of all sizes in [in_begin, in_end)
+}
+
+// container for system sizes, doing only ndim <--> linear translation
+template< typename T, typename StorageT > class linearized_ndim_system:
+	public ndim_system< T, StorageT > {
+public:
+
+	using base_t = ndim_system< T, StorageT >;
+	using storage_t = StorageT;
+	using self_t = linearized_ndim_system< T, StorageT >;
+
+	using vector_reference = typename base_t::vector_reference;
+	using const_vector_reference = typename base_t::const_vector_reference;
+	using vector_storage = typename storage_t::vector_storage;
+	using const_vector_storage = typename storage_t::const_vector_storage;
+	using iterator = linearized_ndim_iterator< T, storage_t >;
+	// builds the system from the per-dimension sizes in [begin, end)
+	template< typename IterT > linearized_ndim_system( IterT begin, IterT end) noexcept :
+		base_t( begin, end ),
+		offsets( std::distance( begin, end ) ),
+		// offsets is declared before _system_size, so it already exists when filled here
+		_system_size( __compute_offsets( begin, end, this->offsets.begin() ) )
+	{}
+
+	linearized_ndim_system() = delete;
+
+	linearized_ndim_system( const self_t &original ) = default;
+
+	// move construction; the moved-from system is left with a system size of 0
+	linearized_ndim_system( self_t &&original ) noexcept:
+		base_t( std::move(original) ), offsets( std::move( original.offsets ) ),
+		_system_size( original._system_size ) {
+			original._system_size = 0;
+	}
+
+	linearized_ndim_system( const std::vector<std::size_t> & _sizes ) noexcept :
+		linearized_ndim_system( _sizes.cbegin(), _sizes.cend() ) {}
+	// builds a system with _dimensions dimensions, each of size max_value
+	linearized_ndim_system( std::size_t _dimensions, std::size_t max_value ) noexcept :
+		base_t( _dimensions, max_value ),
+		offsets( _dimensions ),
+		_system_size( 1 )
+	{
+		// accumulate in std::size_t rather than in T, as __compute_offsets() does,
+		// so the total is not truncated when T is narrower than std::size_t
+		for( std::size_t i{0}; i < _dimensions; i++ ) {
+			this->offsets[i] = static_cast< T >( this->_system_size );
+			this->_system_size *= max_value;
+		}
+	}
+
+	~linearized_ndim_system() {}
+
+	self_t& operator=( const self_t & ) = default;
+
+	//linearized_ndim_system& operator=( linearized_ndim_system &&original ) = delete;
+	// number of points of the system, i.e. the product of all its sizes
+	inline std::size_t system_size() const {
+		return this->_system_size;
+	}
+	// offsets[i] is the product of the sizes of all dimensions before i
+	inline const_vector_reference get_offsets() const {
+		return this->offsets;
+	}
+	// writes to output the coordinates of position linear; linear == system_size() is accepted
+	void linear_to_ndim(std::size_t linear, vector_reference output ) const {
+		if( linear > this->_system_size ) {
+			throw std::range_error( "linear value beyond system" );
+		}
+		for( std::size_t _i{ this->offsets.dimensions() }; _i > 0; _i-- ) {
+			const std::size_t dim{ _i - 1 };
+			const std::size_t coord{ linear / this->offsets[dim] };
+			output[dim] = coord;
+			linear -= ( coord * this->offsets[dim] );
+		}
+		assert( linear == 0 );
+	}
+	// as ndim_to_linear(), but throws std::invalid_argument on out-of-range coordinates
+	std::size_t ndim_to_linear_check( const_vector_reference ndim_vector) const {
+		return this->ndim_to_linear_check( ndim_vector.storage() );
+	}
+
+	std::size_t ndim_to_linear_check( const_vector_storage ndim_vector ) const {
+		// validate every coordinate first, then delegate the actual translation
+		for( std::size_t i { 0 }; i < this->dimensions(); i++ ) {
+			if( ndim_vector[i] >= this->get_sizes()[i] ) {
+				throw std::invalid_argument( "input vector beyond system sizes" );
+			}
+		}
+		return ndim_to_linear( ndim_vector );
+	}
+	// linear position of the given coordinates; no bounds are checked
+	std::size_t ndim_to_linear( const_vector_reference ndim_vector) const {
+		return this->ndim_to_linear( ndim_vector.storage() );
+	}
+
+	std::size_t ndim_to_linear( const_vector_storage ndim_vector ) const {
+		std::size_t linear { 0 };
+		for( std::size_t i { 0 }; i < this->dimensions(); i++ ) {
+			// widen before multiplying so the product is computed in std::size_t
+			linear += static_cast< std::size_t >( this->offsets[i] ) * ndim_vector[i];
+		}
+		return linear;
+	}
+	// same translation, with the strides recomputed from the sizes rather than read from offsets
+	std::size_t ndim_to_linear_offset( const_vector_storage ndim_vector ) const {
+		std::size_t linear { 0 };
+		std::size_t steps{ 1 };
+		for( std::size_t i { 0 }; i < this->dimensions(); i++ ) {
+			linear += steps * ndim_vector[i];
+			steps *= this->_sizes[i];
+		}
+		return linear;
+	}
+
+	// must be same dimensionality
+	void retarget( const_vector_reference _new_sizes ) {
+		if( _new_sizes.dimensions() != this->_sizes.dimensions() ) {
+			throw std::invalid_argument("new system must have same dimensions as previous: new "
+				+ std::to_string( _new_sizes.dimensions() ) + ", old "
+				+ std::to_string( this->_sizes.dimensions() ) );
+		}
+		this->_sizes = _new_sizes; // copy
+		this->_system_size = __compute_offsets( _new_sizes.begin(), _new_sizes.end(), this->offsets.begin() ) ;
+	}
+
+	iterator begin() const {
+		return iterator( *this );
+	}
+
+	iterator end() const {
+		return iterator::make_system_end_iterator( *this );
+	}
+
+private:
+	storage_t offsets;
+	std::size_t _system_size;
+
+};
+
+
+		} // namespace geometry
+	} // namespace utils
+} // namespace grb
+
+#endif // _NDIM_SYSTEM_LINEARIZER_H_
diff --git a/include/graphblas/utils/geometry/ndim_system.hpp b/include/graphblas/utils/geometry/ndim_system.hpp
new file mode 100644
index 000000000..41434f3c4
--- /dev/null
+++ b/include/graphblas/utils/geometry/ndim_system.hpp
@@ -0,0 +1,69 @@
+
+#ifndef _NDIM_SYSTEM_H_
+#define _NDIM_SYSTEM_H_
+
+#include <cstddef>
+#include <algorithm>
+#include <vector>
+#include <utility>
+
+#include "array_vector_storage.hpp"
+
+
+namespace grb {
+	namespace utils {
+		namespace geometry {
+
+template< typename T, typename StorageT > class ndim_system {
+	// container for the per-dimension sizes of an N-dimensional system
+public:
+	using storage_t = StorageT;
+	using vector_reference = storage_t&;
+	using const_vector_reference = const storage_t&;
+	using self_t = ndim_system< T, StorageT >;
+	// copies the sizes from the range [begin, end), one per dimension
+	template< typename IterT > ndim_system( IterT begin, IterT end) noexcept :
+		_sizes( std::distance( begin, end ) )
+	{
+		std::copy( begin, end, _sizes.begin() );
+	}
+
+	ndim_system() = delete;
+
+	ndim_system( const self_t & ) = default;
+
+	ndim_system( const std::vector<std::size_t> & _sizes ) noexcept :
+		self_t( _sizes.cbegin(), _sizes.cend() ) {}
+	// builds _dimensions dimensions, all of size max_value
+	ndim_system( std::size_t _dimensions, std::size_t max_value ) noexcept :
+		_sizes( _dimensions )
+	{
+		std::fill_n( _sizes.begin(), _dimensions, max_value );
+	}
+
+	ndim_system( self_t &&original ) noexcept: _sizes( std::move( original._sizes ) ) {}
+
+	~ndim_system() {}
+
+	self_t & operator=( const self_t &original ) = default;
+
+	//self_t & operator=( self_t &&original ) = delete;
+	// number of dimensions of the system
+	std::size_t dimensions() const noexcept {
+		return _sizes.dimensions();
+	}
+	// read-only access to the size of each dimension
+	const_vector_reference get_sizes() const noexcept {
+		return _sizes;
+	}
+
+protected:
+
+	storage_t _sizes;
+};
+
+		} // namespace geometry
+	} // namespace utils
+} // namespace grb
+
+#endif
diff --git a/include/graphblas/utils/geometry/ndim_vector.hpp b/include/graphblas/utils/geometry/ndim_vector.hpp
new file mode 100644
index 000000000..9c9ad3b6a
--- /dev/null
+++ b/include/graphblas/utils/geometry/ndim_vector.hpp
@@ -0,0 +1,122 @@
+
+#ifndef _NDIM_VECTOR_H_
+#define _NDIM_VECTOR_H_
+
+#include <utility>
+#include <vector>
+#include <array>
+#include <stdexcept>
+#include <cassert>
+#include <iterator>
+#include <type_traits>
+
+#include "linearized_ndim_system.hpp"
+
+namespace grb {
+	namespace utils {
+		namespace geometry {
+
+template< typename OutT, typename CoordsT, typename StorageT  > class ndim_vector {
+	// heap-allocated array of OutT addressed through N-dimensional coordinates
+public:
+
+	using const_domain_vector_reference =
+		typename linearized_ndim_system< CoordsT, StorageT >::const_vector_reference;
+	using domain_vector_storage = typename StorageT::const_vector_storage;
+	using domain_iterator = typename linearized_ndim_system< CoordsT, StorageT >::iterator;
+
+private:
+
+	const linearized_ndim_system< CoordsT, StorageT > _linearizer;
+	OutT* data;
+
+	inline std::size_t get_coordinate( domain_vector_storage coordinates ) const {
+		return this->_linearizer.ndim_to_linear( coordinates );
+	}
+	// NOTE(review): no ndim_to_linear() overload taking an iterator is visible -- confirm before use
+	inline std::size_t get_coordinate( domain_iterator coordinates ) const {
+		return this->_linearizer.ndim_to_linear( coordinates );
+	}
+	// releases the owned array; also safe on a moved-from object, whose data is nullptr
+	void clean_mem() {
+		// delete[] on nullptr is a no-op, so no guard is needed
+		delete[] this->data;
+		this->data = nullptr;
+	}
+
+public:
+
+	ndim_vector() = delete;
+	// allocates system_size() default-initialised elements for the sizes in [begin, end)
+	template< typename IterT > ndim_vector( IterT begin, IterT end): _linearizer( begin, end ) {
+		static_assert( std::is_default_constructible< OutT >::value,
+			"the stored type is not default constructible" );
+		this->data = new OutT[ _linearizer.system_size() ];
+	}
+
+	ndim_vector( const std::vector<std::size_t> & _sizes ):
+		ndim_vector( _sizes.cbegin(), _sizes.cend() ) {}
+
+	// ndim_vector( const ndim_vector< OutT, CoordsT, StorageT >& original ):
+	// 	_linearizer( original._linearizer ) {
+    //     this->data = new std::size_t[ original.data_size() ];
+	// 	std::copy_n( original.data, original.data_size(), this->data );
+    // }
+	ndim_vector( const ndim_vector< OutT, CoordsT, StorageT >& original ) = delete;
+
+	// takes over the array of original, which is left holding no data
+	ndim_vector( ndim_vector< OutT, CoordsT, StorageT >&& original ) noexcept:
+		_linearizer( std::move( original._linearizer ) ) {
+		this->data = original.data;
+		original.data = nullptr;
+	}
+	// ndim_vector( ndim_vector< OutT, CoordsT, StorageT >&& original ) = delete;
+
+	ndim_vector< OutT, CoordsT, StorageT >& operator=(
+			const ndim_vector< OutT, CoordsT, StorageT > &original ) = delete;
+
+	ndim_vector< OutT, CoordsT, StorageT >& operator=(
+			ndim_vector< OutT, CoordsT, StorageT > &&original ) = delete;
+
+	~ndim_vector() {
+		this->clean_mem();
+	}
+	// number of dimensions of the domain
+	std::size_t dimensions() const {
+		return this->_linearizer.dimensions();
+	}
+	// number of stored elements
+	std::size_t data_size() const {
+		return this->_linearizer.system_size();
+	}
+	// element at the given coordinates; the coordinates are not bounds-checked
+	inline OutT& at( const_domain_vector_reference coordinates ) {
+		return this->data[ this->get_coordinate( coordinates.storage() ) ];
+	}
+
+	inline const OutT& at( const_domain_vector_reference coordinates ) const {
+		return this->data[ this->get_coordinate( coordinates.storage() ) ];
+	}
+
+	inline OutT& at( domain_vector_storage coordinates ) {
+		return this->data[ this->get_coordinate( coordinates ) ];
+	}
+
+	inline const OutT& at( domain_vector_storage coordinates ) const {
+		return this->data[ this->get_coordinate( coordinates ) ];
+	}
+
+	domain_iterator domain_begin() const {
+		return this->_linearizer.begin();
+	}
+
+	domain_iterator domain_end() const {
+		return this->_linearizer.end();
+	}
+};
+
+		} // namespace geometry
+	} // namespace utils
+} // namespace grb
+
+#endif // _NDIM_VECTOR_H_
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index 5b34d9895..8425432de 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -38,6 +38,12 @@
 #include <graphblas/algorithms/hpcg/hpcg.hpp>
 #include <graphblas/algorithms/hpcg/system_building_utils.hpp>
 
+#include <graphblas/algorithms/hpcg/old_ndim_matrix_builders.hpp>
+
+#include <chrono>
+
+// #define TEST_ITER
+
 // here we define a custom macro and do not use NDEBUG since the latter is not defined for smoke tests
 #ifdef HPCG_PRINT_STEPS
 
@@ -111,6 +117,7 @@ struct simulation_input : public system_input {
 	size_t smoother_steps;
 	bool evaluation_run;
 	bool no_preconditioning;
+	bool print_iter_stats;
 };
 
 /**
@@ -197,9 +204,9 @@ void print_norm( const grb::Vector< T > & r, const char * head, const Ring & rin
  */
 void grbProgram( const simulation_input & in, struct output & out ) {
 	// get user process ID
-	assert( spmd<>::pid() < spmd<>::nprocs() );
+	const size_t pid { spmd<>::pid() };
+	assert( pid < spmd<>::nprocs() );
 	grb::utils::Timer timer;
-	timer.reset();
 
 	// assume successful run
 	out.error_code = SUCCESS;
@@ -207,15 +214,24 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 
 	// wrap hpcg_data inside a unique_ptr to forget about cleaning chores
 	std::unique_ptr< hpcg_data< double, double, double > > hpcg_state;
+	if( pid == 0 ) {
+		thcout << "beginning input generation..." << std::endl;
+	}
+	timer.reset();
 	rc = build_3d_system( hpcg_state, in );
+	double input_duration { timer.time() };
 
 	if( rc != SUCCESS ) {
 		std::cerr << "Failure to generate the system (" << toString( rc ) << ")." << std::endl;
 		out.error_code = rc;
 		return;
 	}
+	if( pid == 0 ) {
+		thcout << "input generation time (ms): " << input_duration << std::endl;
+	}
+
 #ifdef HPCG_PRINT_SYSTEM
-	if( spmd<>::pid() == 0 ) {
+	if( pid == 0 ) {
 		print_system( *hpcg_state );
 	}
 #endif
@@ -231,7 +247,7 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	set( x, 0.0 );
 
 #ifdef HPCG_PRINT_SYSTEM
-	if( spmd<>::pid() == 0 ) {
+	if( pid == 0 ) {
 		print_vector( x, 50, "X" );
 		print_vector( b, 50, "B" );
 	}
@@ -242,41 +258,77 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	const bool with_preconditioning = ! in.no_preconditioning;
 	if( in.evaluation_run ) {
 		out.test_repetitions = 0;
+		if( pid == 0 ) {
+			thcout << "beginning evaluation run..." << std::endl;
+		}
 		timer.reset();
-		rc = hpcg( *hpcg_state, with_preconditioning, in.smoother_steps, in.smoother_steps, in.max_iterations, 0.0, out.performed_iterations, out.residual );
+		rc = hpcg( *hpcg_state, with_preconditioning, in.smoother_steps, in.smoother_steps,
+			in.max_iterations, 0.0, out.performed_iterations, out.residual, false );
 		double single_time = timer.time();
 		if( rc == SUCCESS ) {
 			rc = collectives<>::reduce( single_time, 0, operators::max< double >() );
 		}
+		if( rc != SUCCESS ) {
+			thcerr << "error during evaluation run" << std::endl;
+			out.error_code = rc;
+			return;
+		}
 		out.times.useful = single_time;
 		out.test_repetitions = static_cast< size_t >( 1000.0 / single_time ) + 1;
-	} else {
-		// do benchmark
+
+		if( pid == 0 ) {
+			thcout << "Evaluation run" << std::endl;
+		}
+
+		std::cout << "  iterations: " << out.performed_iterations << std::endl
+			<< "  computed residual: " << out.residual << std::endl
+			<< "  time taken (ms): " << out.times.useful << std::endl
+			<< "  deduced inner repetitions for 1s duration: " << out.test_repetitions << std::endl;
+		return;
+	}
+
+	// do a cold run to warm the system up
+	if( pid == 0 ) {
+		thcout << "beginning cold run..." << std::endl;
+	}
+	timer.reset();
+	rc = hpcg( *hpcg_state, with_preconditioning, in.smoother_steps, in.smoother_steps,
+		1, 0.0, out.performed_iterations, out.residual, false );
+	double iter_duration { timer.time() };
+	if( pid == 0 ) {
+		thcout << "cold run duration (ms): " << iter_duration << std::endl;
+	}
+
+
+	// do benchmark
+	for( size_t i = 0; i < in.test_repetitions && rc == SUCCESS; ++i ) {
+		rc = set( x, 0.0 );
+		assert( rc == SUCCESS );
+		if( pid == 0 ) {
+			thcout << "beginning iteration: " << i << std::endl;
+		}
 		timer.reset();
-		for( size_t i = 0; i < in.test_repetitions && rc == SUCCESS; ++i ) {
-			rc = set( x, 0.0 );
-			assert( rc == SUCCESS );
-			rc = hpcg( *hpcg_state, with_preconditioning, in.smoother_steps, in.smoother_steps, in.max_iterations, 0.0, out.performed_iterations, out.residual );
-			out.test_repetitions++;
-			if( rc != SUCCESS ) {
-				break;
-			}
+		rc = hpcg( *hpcg_state, with_preconditioning, in.smoother_steps, in.smoother_steps,
+			in.max_iterations, 0.0, out.performed_iterations, out.residual, in.print_iter_stats );
+		iter_duration = timer.time();
+		out.times.useful += iter_duration;
+		if( pid == 0 ) {
+			thcout << "repetition,duration (ms): " << i << "," << iter_duration << std::endl;
+		}
+		out.test_repetitions++;
+		if( rc != SUCCESS ) {
+			break;
 		}
-		double time_taken { timer.time() };
-		out.times.useful = time_taken / static_cast< double >( out.test_repetitions );
-		// sleep( 1 );
 	}
+	out.times.useful /= static_cast< double >( in.test_repetitions );
 
 	if( spmd<>::pid() == 0 ) {
 		if( rc == SUCCESS ) {
-			if( in.evaluation_run ) {
-				std::cout << "Info: cold HPCG completed within " << out.performed_iterations << " iterations. Last computed residual is " << out.residual << ". Time taken was " << out.times.useful
-						  << " ms. Deduced inner repetitions parameter of " << out.test_repetitions << " to take 1 second or more per inner benchmark." << std::endl;
-			} else {
-				std::cout << "Average time taken for each of " << out.test_repetitions << " HPCG calls (hot start): " << out.times.useful << std::endl;
-			}
+			thcout << "repetitions, average time (ms): " << out.test_repetitions
+				<< ", " << out.times.useful << std::endl;
 		} else {
-			std::cerr << "Failure: call to HPCG did not succeed (" << toString( rc ) << ")." << std::endl;
+			thcerr << "Failure: call to HPCG did not succeed (" << toString( rc )
+				<< ")." << std::endl;
 		}
 	}
 
@@ -285,7 +337,8 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	// set error code
 	out.error_code = rc;
 
-	Semiring< grb::operators::add< double >, grb::operators::mul< double >, grb::identities::zero, grb::identities::one > ring;
+	Semiring< grb::operators::add< double >, grb::operators::mul< double >,
+		grb::identities::zero, grb::identities::one > ring;
 	grb::set( b, 1.0 );
 	out.square_norm_diff = 0.0;
 	grb::eWiseMul( b, -1.0, x, ring );
@@ -303,11 +356,22 @@ void grbProgram( const simulation_input & in, struct output & out ) {
  */
 static void parse_arguments( simulation_input &, size_t &, double &, int, char ** );
 
+#ifdef TEST_ITER
+static void test_iters();
+static void test_iters2();
+#endif
+
 int main( int argc, char ** argv ) {
 	simulation_input sim_in;
 	size_t test_outer_iterations;
 	double max_residual_norm;
 
+#ifdef TEST_ITER
+	test_iters();
+	test_iters2();
+	return 0;
+#endif
+
 	parse_arguments( sim_in, test_outer_iterations, max_residual_norm, argc, argv );
 	thcout << "System size x: " << sim_in.nx << std::endl;
 	thcout << "System size y: " << sim_in.ny << std::endl;
@@ -317,6 +381,7 @@ int main( int argc, char ** argv ) {
 	thcout << "Max iterations: " << sim_in.max_iterations << std::endl;
 	thcout << "Direct launch: " << std::boolalpha << sim_in.evaluation_run << std::noboolalpha << std::endl;
 	thcout << "No conditioning: " << std::boolalpha << sim_in.no_preconditioning << std::noboolalpha << std::endl;
+	thcout << "Print iteration residual: " << std::boolalpha << sim_in.print_iter_stats << std::noboolalpha << std::endl;
 	thcout << "Smoother steps: " << sim_in.smoother_steps << std::endl;
 	thcout << "Test outer iterations: " << test_outer_iterations << std::endl;
 	thcout << "Maximum norm for residual: " << max_residual_norm << std::endl;
@@ -374,7 +439,7 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 	parser.add_optional_argument( "--nx", sim_in.nx, PHYS_SYSTEM_SIZE_DEF, "physical system size along x" )
 		.add_optional_argument( "--ny", sim_in.ny, PHYS_SYSTEM_SIZE_DEF, "physical system size along y" )
 		.add_optional_argument( "--nz", sim_in.nz, PHYS_SYSTEM_SIZE_DEF, "physical system size along z" )
-		.add_optional_argument( "--max_coarse-levels", sim_in.max_coarsening_levels, DEF_COARSENING_LEVELS,
+		.add_optional_argument( "--max-coarse-levels", sim_in.max_coarsening_levels, DEF_COARSENING_LEVELS,
 			"maximum level for coarsening; 0 means no coarsening; note: actual "
 			"level may be limited"
 			" by the minimum system dimension" )
@@ -388,7 +453,9 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 		.add_option( "--evaluation-run", sim_in.evaluation_run, false,
 			"launch single run directly, without benchmarker (ignore "
 			"repetitions)" )
-		.add_option( "--no-preconditioning", sim_in.no_preconditioning, false, "do not apply pre-conditioning via multi-grid V cycle" );
+		.add_option( "--no-preconditioning", sim_in.no_preconditioning, false, "do not apply pre-conditioning via multi-grid V cycle" )
+		.add_option( "--print-iter-stats", sim_in.print_iter_stats, false, "on each iteration, print more statistics" );
+
 
 	parser.parse( argc, argv );
 
@@ -422,3 +489,186 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 	}
 }
 
+
+
+
+struct NZ {
+	size_t i;
+	size_t j;
+	double v;
+	// builds the (i, j, v) triple from its three components
+	NZ( size_t _i, size_t _j, double _v ): i( _i ), j( _j ), v( _v ) {}
+	// two triples differ unless all three components compare equal
+	bool operator!=( const NZ& o ) const {
+		return !( i == o.i && j == o.j && v == o.v );
+	}
+};
+
+#ifdef TEST_ITER
+static void test_iters() {
+	// checks the parallel coarsener iterator against the old, sequentially run generator
+	using clock = std::chrono::steady_clock;
+
+	constexpr size_t DIMS = 3;
+
+	std::array< unsigned, DIMS > finer_sizes{ 1024, 1024, 1024};
+	std::array< unsigned, DIMS > coarser_sizes;
+	for( size_t i = 0; i < finer_sizes.size(); i++ ) {
+		coarser_sizes[ i ] = finer_sizes[ i ] / 2;
+	}
+
+	size_t rows { std::accumulate( coarser_sizes.cbegin(), coarser_sizes.cend(), 1UL, std::multiplies< size_t >() ) };
+
+	std::array< size_t, DIMS > lfiner_sizes{ 1024, 1024, 1024};
+	std::array< size_t, DIMS > lcoarser_sizes{};
+	for( size_t i = 0; i < lfiner_sizes.size(); i++ ) {
+		lcoarser_sizes[ i ] = lfiner_sizes[ i ] / 2;
+	}
+	grb::algorithms::old::coarsener_generator_iterator< DIMS, double > sbegin( lcoarser_sizes, lfiner_sizes, 0 );
+	grb::algorithms::old::coarsener_generator_iterator< DIMS, double > send( lcoarser_sizes, lfiner_sizes, rows );
+
+	using citer = hpcg_coarsener_builder< DIMS, unsigned, double >::hpcg_coarsener_iterator;
+	hpcg_coarsener_builder< DIMS, unsigned, double > coarsener( coarser_sizes, finer_sizes );
+	citer pbegin( coarsener.make_begin_iterator() );
+	const citer pend( coarsener.make_end_iterator() );
+
+	size_t num_elements = pend - pbegin;
+	std::cout << "number of elements: " << num_elements << std::endl;
+
+	std::vector< NZ > svalues;
+	svalues.reserve( num_elements);
+	typename clock::time_point start( clock::now() );
+	for( ; sbegin != send; ++sbegin ) {
+		// printf( "inserting %lu %lu\n", sbegin.i(), sbegin.j() );
+		svalues.emplace_back( sbegin.i(), sbegin.j(), sbegin.v() );
+	}
+	typename clock::time_point finish( clock::now() );
+	std::cout << "sequential generation time (ms): " <<
+		std::chrono::duration< double, std::milli >( finish - start ).count() << std::endl;
+
+	const size_t nthreads = omp_get_max_threads();
+	size_t per_thread_num = ( num_elements + nthreads - 1 ) / nthreads;
+	std::vector< std::vector< NZ > > tvalues( nthreads );
+	for( size_t i = 0; i < nthreads; i++ ) {
+		tvalues[i].reserve( per_thread_num );
+	}
+	start = clock::now();
+	#pragma omp parallel
+	{
+
+		int t = omp_get_thread_num();
+		std::vector< NZ > &tv = tvalues[ t ];
+		// printf( "thread %d, size %lu\n", t, tv.size() );
+		#pragma omp for schedule( static )
+		for( auto it = pbegin; it != pend; ++it ) {
+			tv.emplace_back( it.i(), it.j(), it.v() );
+			// printf( "thread %d: inserting %lu %lu\n", t, it.i(), it.j() );
+		}
+	}
+	finish = clock::now();
+	std::cout << "parallel generation time (ms): " <<
+		std::chrono::duration< double, std::milli >( finish - start ).count() << std::endl;
+
+	std::vector< NZ > pvalues;
+	for( const std::vector< NZ > &tv: tvalues ) {
+		pvalues.insert( pvalues.end(), tv.cbegin(), tv.cend() );
+	}
+
+	if( svalues.size() != pvalues.size() ) {
+		std::cout << "different sizes!" << std::endl;
+		std::exit(-1);
+	}
+
+	bool all_equal = true; // cleared by the first mismatching element
+	for( size_t i = 0; i < svalues.size(); i++ ) {
+		if( svalues[i] != pvalues[i] ) {
+			std::cout << "error at position " << i << std::endl;
+			all_equal = false;
+		}
+	}
+	if( !all_equal ) {
+		std::exit(-1); // all mismatches were reported above: do not claim success
+	}
+	std::cout << "all OK" << std::endl;
+}
+
+static void test_iters2() {
+	// checks the parallel system-matrix iterator against the old, sequentially run generator
+	using clock = std::chrono::steady_clock;
+
+	constexpr size_t DIMS = 3, halo_size = 1;
+	constexpr double diag_value = 26.0, non_diag_value = -1.0;
+
+	std::array< unsigned, DIMS > sys_sizes{ 64, 64, 64};
+	size_t n { std::accumulate( sys_sizes.cbegin(), sys_sizes.cend(), 1UL, std::multiplies< size_t >() ) };
+
+	std::array< size_t, DIMS > large_sys_sizes{ 64, 64, 64};
+	old::matrix_generator_iterator< DIMS, double > sbegin( large_sys_sizes, 0UL, halo_size, diag_value, non_diag_value );
+	old::matrix_generator_iterator< DIMS, double > send( large_sys_sizes, n, halo_size, diag_value, non_diag_value );
+
+	hpcg_builder< DIMS, unsigned, double > hpcg_system( sys_sizes, halo_size );
+	matrix_generator_iterator< DIMS, unsigned, double > pbegin(
+		hpcg_system.make_begin_iterator( diag_value, non_diag_value ) );
+	matrix_generator_iterator< DIMS, unsigned, double > pend(
+		hpcg_system.make_end_iterator( diag_value, non_diag_value )
+	);
+
+	size_t num_elements = pend - pbegin;
+	std::cout << "number of elements: " << num_elements << std::endl;
+
+	std::vector< NZ > svalues;
+	svalues.reserve( num_elements);
+	typename clock::time_point start( clock::now() );
+	for( ; sbegin != send; ++sbegin ) {
+		svalues.emplace_back( sbegin.i(), sbegin.j(), sbegin.v() );
+	}
+	typename clock::time_point finish( clock::now() );
+	std::cout << "sequential generation time (ms): " <<
+		std::chrono::duration< double, std::milli >( finish - start ).count() << std::endl;
+
+	const size_t nthreads = omp_get_max_threads();
+	size_t per_thread_num = ( num_elements + nthreads - 1 ) / nthreads;
+	std::vector< std::vector< NZ > > tvalues( nthreads );
+	for( size_t i = 0; i < nthreads; i++ ) {
+		tvalues[i].reserve( per_thread_num );
+	}
+	start = clock::now();
+	#pragma omp parallel
+	{
+
+		int t = omp_get_thread_num();
+		std::vector< NZ > &tv = tvalues[ t ];
+		// printf( "thread %d, size %lu\n", t, tv.size() );
+		#pragma omp for schedule( static )
+		for( auto it = pbegin; it != pend; ++it ) {
+			tv.emplace_back( it.i(), it.j(), it.v() );
+			// printf( "thread %d: inserting %lu %lu\n", t, it.i(), it.j() );
+		}
+	}
+	finish = clock::now();
+	std::cout << "parallel generation time (ms): " <<
+		std::chrono::duration< double, std::milli >( finish - start ).count() << std::endl;
+
+	std::vector< NZ > pvalues;
+	for( const std::vector< NZ > &tv: tvalues ) {
+		pvalues.insert( pvalues.end(), tv.cbegin(), tv.cend() );
+	}
+
+	if( svalues.size() != pvalues.size() ) {
+		std::cout << "different sizes!" << std::endl;
+		std::exit(-1);
+	}
+
+	bool all_equal = true; // cleared by the first mismatching element
+	for( size_t i = 0; i < svalues.size(); i++ ) {
+		if( svalues[i] != pvalues[i] ) {
+			std::cout << "error at position " << i << std::endl;
+			all_equal = false;
+		}
+	}
+	if( !all_equal ) {
+		std::exit(-1); // all mismatches were reported above: do not claim success
+	}
+	std::cout << "all OK" << std::endl;
+}
+#endif // TEST_ITER

From 01ecebdc64e7af5690928488b3d257d5a3861572 Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Mon, 4 Jul 2022 15:05:56 +0200
Subject: [PATCH 04/28] building masks via iterators

---
 .../algorithms/hpcg/matrix_building_utils.hpp | 104 ++++++++++++++++++
 1 file changed, 104 insertions(+)

diff --git a/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp b/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
index 2dfeabc49..4791d9d2d 100644
--- a/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
@@ -210,6 +210,104 @@ namespace grb {
 			return buildMatrixUnique( M, begin, end, grb::IOMode::PARALLEL );
 		}
 
+		template< typename T >
+		struct color_mask_iter {
+			// iterator yielding the positions _pos, _pos + _num_cols, _pos + 2 * _num_cols, ...
+			using self_t = color_mask_iter< T >;
+			using iterator_category = std::random_access_iterator_tag;
+			using value_type = T;
+			using pointer = const value_type *;
+			using reference = value_type;
+			using difference_type = long;
+
+			color_mask_iter() = delete;
+			color_mask_iter( T _num_cols, T _pos ) noexcept:
+				color_num( _num_cols),
+				position( _pos ) {}
+
+			color_mask_iter( const self_t &o ):
+				color_num( o.color_num ),
+				position( o.position ) {}
+
+			//self_t & operator=( const self_t & ) = default;
+			bool operator!=( const self_t &o ) const {
+				return position != o.position;
+			}
+
+			self_t & operator++() noexcept {
+				position += color_num;
+				return *this;
+			}
+
+			self_t operator++( int ) noexcept {
+				const self_t previous( *this ); // post-increment must return the old state
+				position += color_num;
+				return previous;
+			}
+
+			self_t & operator+=( size_t offset ) noexcept {
+				position += offset * color_num;
+				return *this;
+			}
+
+			difference_type operator-( const self_t &o ) const noexcept {
+				// subtract as signed values: an unsigned T would wrap when *this precedes o
+				return ( static_cast< difference_type >( position )
+					- static_cast< difference_type >( o.position ) ) / static_cast< difference_type >( color_num );
+			}
+
+			pointer operator->() const {
+				return &position;
+			}
+
+			reference operator*() const {
+				return position;
+			}
+
+			static self_t build_end_iterator( T vsize, T _num_cols, T _col ) {
+				T final_pos = ( ( vsize - _col + _num_cols - 1 ) / _num_cols ) * _num_cols + _col;
+				return self_t( _num_cols, final_pos );
+			}
+
+			private:
+			const T color_num;
+			T position;
+		};
+
+		struct true_iter {
+			// endless iterator whose every element is true; operator!= never reports the end
+			static const bool TRUE = true;
+
+			using self_t = true_iter;
+			using iterator_category = std::random_access_iterator_tag;
+			using value_type = bool;
+			using pointer = const bool *;
+			using reference = bool;
+			using difference_type = long;
+
+			true_iter() = default;
+			bool operator!=( const self_t & ) const {
+				return true;
+			}
+
+			self_t & operator++() noexcept {
+				return *this;
+			}
+			self_t & operator++( int ) noexcept {
+				return operator++();
+			}
+
+			pointer operator->() const {
+				// &TRUE would odr-use a static member that has no out-of-class definition
+				static const bool value = true;
+				return &value;
+			}
+
+			reference operator*() const {
+				return true;
+			}
+		};
+
 		/**
 		 * @brief Populates \p masks with static color mask generated for a squared matrix of size \p matrix_size .
 		 *
@@ -246,12 +344,18 @@ namespace grb {
 				grb::Vector< bool > & mask = masks.back();
 				// grb::set(mask, false); // DO NOT initialize false's explicitly, otherwise
 				// RBGS will touch them too and the runtime will increase!
+				/*
 				for( std::size_t j = i; j < matrix_size; j += colors ) {
 					rc = grb::setElement( mask, true, j );
 					assert( rc == grb::SUCCESS );
 					if( rc != grb::SUCCESS )
 						return rc;
 				}
+				*/
+				color_mask_iter< unsigned > begin( colors, i );
+				color_mask_iter< unsigned > end =
+					color_mask_iter< unsigned >::build_end_iterator( matrix_size, colors, i );
+				grb::buildVectorUnique( mask, begin, end, true_iter(), true_iter(), IOMode::SEQUENTIAL );
 			}
 			return rc;
 		}

From 1b0a7402954415ec7957f847a44d156d7d5583bb Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Fri, 18 Nov 2022 14:01:22 +0100
Subject: [PATCH 05/28] factoring out logic to build base system, better
 handling error values during input creation and using N-dimensional system
 generator in multiple places

---
 .../graphblas/algorithms/hpcg/hpcg_data.hpp   |  44 ++++-
 .../algorithms/hpcg/matrix_building_utils.hpp | 120 ++++++-------
 .../algorithms/hpcg/ndim_matrix_builders.hpp  |  10 +-
 .../hpcg/old_ndim_matrix_builders.hpp         |  14 ++
 .../algorithms/hpcg/system_building_utils.hpp | 164 ++++++++++++------
 5 files changed, 224 insertions(+), 128 deletions(-)

diff --git a/include/graphblas/algorithms/hpcg/hpcg_data.hpp b/include/graphblas/algorithms/hpcg/hpcg_data.hpp
index 96b39856d..c53ef99e4 100644
--- a/include/graphblas/algorithms/hpcg/hpcg_data.hpp
+++ b/include/graphblas/algorithms/hpcg/hpcg_data.hpp
@@ -43,7 +43,10 @@ namespace grb {
 		 * @tparam IOType type of values of the vectors for intermediate results
 		 * @tparam NonzeroType type of the values stored inside the system matrix #A
 		 */
-		template< typename IOType, typename NonzeroType >
+		template<
+			typename IOType,
+			typename NonzeroType
+		>
 		struct system_data {
 
 			const std::size_t system_size; ///< size of the system, i.e. side of the #A
@@ -65,14 +68,24 @@ namespace grb {
 			 * of rows and columns of the #A matrix.
 			 */
 			system_data( std::size_t sys_size ) :
-				system_size( sys_size ), A( sys_size, sys_size ), A_diagonal( sys_size ), z( sys_size ), r( sys_size ),
-				// temp(sys_size),
-				smoother_temp( sys_size ) {}
+				system_size( sys_size ),
+				A( sys_size, sys_size ),
+				A_diagonal( sys_size ),
+				z( sys_size ),
+				r( sys_size ),
+				smoother_temp( sys_size ) { }
 
 			// for safety, disable copy semantics
 			system_data( const system_data & o ) = delete;
 
 			system_data & operator=( const system_data & ) = delete;
+
+			grb::RC zero_temp_vectors() {
+				grb::RC rc = grb::set( z, 0 );
+				rc = rc ? rc : grb::set( r, 0 );
+				rc = rc ? rc : grb::set( smoother_temp, 0 );
+				return rc;
+			}
 		};
 
 		/**
@@ -104,7 +117,10 @@ namespace grb {
 		 * As for \ref system_data, internal vectors and matrices are initialized to the proper size,
 		 * but their values are \b not initialized.
 		 */
-		template< typename IOType, typename NonzeroType >
+		template<
+			typename IOType,
+			typename NonzeroType
+		>
 		struct multi_grid_data : public system_data< IOType, NonzeroType > {
 
 			const std::size_t finer_size; ///< ssize of the finer system to coarse from;
@@ -125,7 +141,10 @@ namespace grb {
 			 * @param[in] _finer_size  size of the finer system, i.e. size of external objects \b before coarsening
 			 */
 			multi_grid_data( std::size_t coarser_size, std::size_t _finer_size ) :
-				system_data< IOType, NonzeroType >( coarser_size ), finer_size( _finer_size ), Ax_finer( finer_size ), coarsening_matrix( coarser_size, finer_size ) {
+				system_data< IOType, NonzeroType >( coarser_size ),
+				finer_size( _finer_size ),
+				Ax_finer( finer_size ),
+				coarsening_matrix( coarser_size, finer_size ) {
 				coarser_level = nullptr;
 			}
 
@@ -137,6 +156,12 @@ namespace grb {
 					delete coarser_level;
 				}
 			}
+
+			grb::RC zero_temp_vectors() {
+				grb::RC rc = this->system_data< IOType, NonzeroType >::zero_temp_vectors();
+				rc = rc ? rc : grb::set( Ax_finer, 0 );
+				return rc;
+			}
 		};
 
 		/**
@@ -185,6 +210,13 @@ namespace grb {
 					delete coarser_level;
 				}
 			}
+
+			grb::RC zero_temp_vectors() {
+				grb::RC rc = this->system_data< IOType, NonzeroType >::zero_temp_vectors();
+				rc = rc ? rc : grb::set( u, 0 );
+				rc = rc ? rc : grb::set( p, 0 );
+				return rc;
+			}
 		};
 
 	} // namespace algorithms
diff --git a/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp b/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
index 4791d9d2d..2ccd18fa8 100644
--- a/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
@@ -38,15 +38,6 @@
 #include "ndim_matrix_builders.hpp"
 
 
-#define PAR
-
-
-
-#ifndef PAR
-#include <graphblas/algorithms/hpcg/old_ndim_matrix_builders.hpp>
-#endif
-
-
 namespace grb {
 	namespace algorithms {
 
@@ -76,20 +67,6 @@ namespace grb {
 			begin += first;
 		}
 
-#ifndef PAR
-		template< typename T > void partition_rows(
-				T rows,
-				T& first_row,
-				T& last_row
-		) {
-			const size_t num_procs{ spmd<>::nprocs() };
-			const T per_process{ ( rows + num_procs - 1 ) / num_procs }; // round up
-			first_row = std::min( per_process * static_cast< T >( spmd<>::pid() ), rows );
-			last_row = std::min( first_row + per_process, rows );
-		}
-#endif
-
-
 		/**
 		 * @brief Builds a \p DIMS -dimensional system matrix for HPCG simulation.
 		 *
@@ -107,24 +84,25 @@ namespace grb {
 		 * @param non_diag_value value outside of the diagonal
 		 * @return grb::RC the success value returned when trying to build the matrix
 		 */
-		template< std::size_t DIMS, typename T, enum grb::Backend B >
-		grb::RC build_ndims_system_matrix( grb::Matrix< T, B > & M, const std::array< std::size_t, DIMS > & sys_sizes, std::size_t halo_size, T diag_value, T non_diag_value ) {
-			static_assert( DIMS > 0, "DIMS must be > 0" );
-			size_t n { std::accumulate( sys_sizes.cbegin(), sys_sizes.cend(), 1UL, std::multiplies< size_t >() ) };
-			if( grb::nrows( M ) != n || grb::nrows( M ) != grb::ncols( M ) ) {
-				throw std::invalid_argument( "wrong matrix dimensions: matrix should "
-											"be square"
-											" and in accordance with given system "
-											"sizes" );
-			}
-#ifdef PAR
-			using coord_t = unsigned;
-			if( n > std::numeric_limits< coord_t >::max() ) {
+		template<
+			std::size_t DIMS,
+			typename coord_t,
+			typename T,
+			enum grb::Backend B
+		> grb::RC build_ndims_system_matrix(
+			grb::Matrix< T, B > & M,
+			const grb::algorithms::hpcg_builder< DIMS, coord_t, T > & hpcg_system,
+			T diag_value,
+			T non_diag_value
+		) {
+			if( hpcg_system.system_size() > std::numeric_limits< coord_t >::max() ) {
 				throw std::domain_error( "CoordT cannot store the matrix coordinates" );
 			}
+			/*
 			std::array< coord_t, DIMS > _sys_sizes;
 			for( size_t i = 0; i < DIMS; i++ ) _sys_sizes[i] = sys_sizes[i];
 			grb::algorithms::hpcg_builder< DIMS, coord_t, T > hpcg_system( _sys_sizes, halo_size );
+			*/
 			grb::algorithms::matrix_generator_iterator< DIMS, coord_t, T > begin(
 				hpcg_system.make_begin_iterator( diag_value, non_diag_value ) );
 			grb::algorithms::matrix_generator_iterator< DIMS, coord_t, T > end(
@@ -133,12 +111,6 @@ namespace grb {
 			partition_iteration_range( hpcg_system.system_size(), begin, end );
 
 			// std::cout << "num nonzeroes " << ( end - begin ) << std::endl;
-#else
-			size_t first_row, last_row;
-			partition_rows( n, first_row, last_row );
-			grb::algorithms::old::matrix_generator_iterator< DIMS, T > begin( sys_sizes, first_row, halo_size, diag_value, non_diag_value );
-			grb::algorithms::old::matrix_generator_iterator< DIMS, T > end( sys_sizes, last_row, halo_size, diag_value, non_diag_value );
-#endif
 			return buildMatrixUnique( M, begin, end, grb::IOMode::PARALLEL );
 		}
 
@@ -164,8 +136,15 @@ namespace grb {
 		 *                    in \p coarser_size , otherwise an exception is thrown
 		 * @return grb::RC the success value returned when trying to build the matrix
 		 */
-		template< std::size_t DIMS, typename T, enum grb::Backend B >
-		grb::RC build_ndims_coarsener_matrix( grb::Matrix< T, B > & M, const std::array< std::size_t, DIMS > & coarser_sizes, const std::array< std::size_t, DIMS > & finer_sizes ) {
+		template<
+			std::size_t DIMS,
+			typename T,
+			enum grb::Backend B
+		> grb::RC build_ndims_coarsener_matrix(
+			grb::Matrix< T, B > & M,
+			const std::array< std::size_t, DIMS > & coarser_sizes,
+			const std::array< std::size_t, DIMS > & finer_sizes
+		) {
 			static_assert( DIMS > 0, "DIMS must be > 0" );
 			size_t const rows { std::accumulate( coarser_sizes.cbegin(), coarser_sizes.cend(), 1UL, std::multiplies< size_t >() ) };
 			for( std::size_t i { 0 }; i < coarser_sizes.size(); i++ ) {
@@ -182,7 +161,6 @@ namespace grb {
 											" with rows == <product of coarser sizes> "
 											"and cols == <product of finer sizes>" );
 			}
-#ifdef PAR
 			using coord_t = unsigned;
 			if( rows > std::numeric_limits< coord_t >::max() ) {
 				throw std::domain_error( "CoordT cannot store the row coordinates" );
@@ -201,12 +179,6 @@ namespace grb {
 				coarsener.make_end_iterator()
 			);
 			partition_iteration_range( coarsener.system_size(), begin, end );
-#else
-			size_t first_row, last_row;
-			partition_rows( rows, first_row, last_row );
-			grb::algorithms::old::coarsener_generator_iterator< DIMS, T > begin( coarser_sizes, finer_sizes, first_row );
-			grb::algorithms::old::coarsener_generator_iterator< DIMS, T > end( coarser_sizes, finer_sizes, last_row );
-#endif
 			return buildMatrixUnique( M, begin, end, grb::IOMode::PARALLEL );
 		}
 
@@ -274,40 +246,58 @@ namespace grb {
 			T position;
 		};
 
+		template< typename CoordT >
 		struct true_iter {
 
-			static const bool TRUE = true;
+			static const bool __TRUE = true;
 
-			using self_t = true_iter;
+			using self_t = true_iter< CoordT >;
 			using iterator_category = std::random_access_iterator_tag;
 			using value_type = bool;
 			using pointer = const bool *;
-			using reference = bool;
+			using reference = const bool&;
 			using difference_type = long;
 
-			true_iter() = default;
+			true_iter() = delete;
+
+			true_iter( CoordT first ): index( first ) {}
+
+			true_iter( const self_t & ) = default;
 
-			bool operator!=( const self_t & ) const {
-				return true;
+			self_t & operator=( const self_t & ) = default;
+
+			bool operator!=( const self_t & other ) const {
+				return this->index != other.index;
 			}
 
 			self_t & operator++() noexcept {
+				(void) index++;
 				return *this;
 			}
 
-			self_t & operator++( int ) noexcept {
-				return operator++();
+			self_t & operator+=( size_t increment ) noexcept {
+				index += increment;
+				return *this;
+			}
+
+			difference_type operator-( const self_t & other ) noexcept {
+				return static_cast< difference_type >( this->index - other.index );
 			}
 
 			pointer operator->() const {
-				return &TRUE;
+				return &__TRUE;
 			}
 
 			reference operator*() const {
-				return true;
+				return *(this->operator->());
 			}
+
+		private:
+			CoordT index;
 		};
 
+		template< typename CoordT > const bool true_iter< CoordT >::__TRUE;
+
 		/**
 		 * @brief Populates \p masks with static color mask generated for a squared matrix of size \p matrix_size .
 		 *
@@ -327,7 +317,11 @@ namespace grb {
 		 * @return grb::RC the success value returned when trying to build the vector
 		 */
 		template< enum grb::Backend B >
-		grb::RC build_static_color_masks( std::vector< grb::Vector< bool, B > > & masks, std::size_t matrix_size, std::size_t colors ) {
+		grb::RC build_static_color_masks(
+			std::vector< grb::Vector< bool, B > > & masks,
+			std::size_t matrix_size,
+			std::size_t colors
+		) {
 			if( ! masks.empty() ) {
 				throw std::invalid_argument( "vector of masks is expected to be "
 											"empty" );
@@ -355,7 +349,7 @@ namespace grb {
 				color_mask_iter< unsigned > begin( colors, i );
 				color_mask_iter< unsigned > end =
 					color_mask_iter< unsigned >::build_end_iterator( matrix_size, colors, i );
-				grb::buildVectorUnique( mask, begin, end, true_iter(), true_iter(), IOMode::SEQUENTIAL );
+				grb::buildVectorUnique( mask, begin, end, true_iter< size_t >( 0 ), true_iter< size_t >( matrix_size ), IOMode::SEQUENTIAL );
 			}
 			return rc;
 		}
diff --git a/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp b/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp
index 06672d110..db94c8a29 100644
--- a/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp
+++ b/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp
@@ -291,22 +291,26 @@ namespace grb {
 			hpcg_builder< DIMS, CoordT, T> & operator=( hpcg_builder< DIMS, CoordT, T> && ) = delete;
 
 			size_t system_size() const {
+				return system.base_system_size();
+			}
+
+			size_t num_neighbors() const {
 				return system.halo_system_size();
 			}
 
 			hpcg_sys_iterator make_begin_iterator(
 				T diag,
 				T non_diag
-			) {
+			) const {
 				return hpcg_sys_iterator( system, diag, non_diag );
 			}
 
 			hpcg_sys_iterator make_end_iterator(
 				T diag,
 				T non_diag
-			) {
+			) const {
 				hpcg_sys_iterator result( system, diag, non_diag );
-				result += system_size() - 1; // do not trigger boundary checks
+				result += num_neighbors() - 1; // do not trigger boundary checks
 				++result;
 				return result;
 			}
diff --git a/include/graphblas/algorithms/hpcg/old_ndim_matrix_builders.hpp b/include/graphblas/algorithms/hpcg/old_ndim_matrix_builders.hpp
index 256995b02..9f64e9884 100644
--- a/include/graphblas/algorithms/hpcg/old_ndim_matrix_builders.hpp
+++ b/include/graphblas/algorithms/hpcg/old_ndim_matrix_builders.hpp
@@ -202,6 +202,13 @@ namespace grb {
 			using array_t = typename row_generator< DIMS >::array_t;
 			using value_type = std::pair< std::pair< row_coordinate_type, column_coordinate_type >, T >;
 
+			using RowIndexType = typename row_generator< DIMS >::row_coordinate_type;
+			using ColumnIndexType = typename row_generator< DIMS >::row_coordinate_type;
+			using iterator_category = std::forward_iterator_tag;
+			using pointer = const value_type;
+			using reference = const value_type&;
+			using difference_type = long;
+
 			// halo may in future become a DIM-size array to iterate in arbitrary shapes
 			const row_coordinate_type halo;              ///< number of points per dimension to iterate around
 			const nonzero_value_type diagonal_value;     ///< value to be emitted when the object has moved to the diagonal
@@ -418,6 +425,13 @@ namespace grb {
 			using array_t = typename row_generator< DIMS >::array_t;
 			using value_type = std::pair< std::pair< row_coordinate_type, column_coordinate_type >, T >;
 
+			using RowIndexType = typename row_generator< DIMS >::row_coordinate_type;
+			using ColumnIndexType = typename row_generator< DIMS >::row_coordinate_type;
+			using iterator_category = std::forward_iterator_tag;
+			using pointer = const value_type;
+			using reference = const value_type&;
+			using difference_type = long;
+
 			// the sizes to project from
 			const array_t finer_sizes; ///< the size of the finer system (columns)
 			array_t steps;             ///< array of steps, i.e. how much each column coordinate (finer system) must be
diff --git a/include/graphblas/algorithms/hpcg/system_building_utils.hpp b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
index 959d21969..78759a539 100644
--- a/include/graphblas/algorithms/hpcg/system_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
@@ -29,6 +29,7 @@
 #include <cassert>
 #include <cstddef>
 #include <memory>
+#include <type_traits>
 
 #include <graphblas.hpp>
 #include <graphblas/utils/Timer.hpp>
@@ -66,16 +67,87 @@ namespace grb {
 		 */
 		template< std::size_t DIMS, typename T >
 		struct hpcg_system_params {
-			const std::array< std::size_t, DIMS > & physical_sys_sizes;
-			const std::size_t halo_size;
-			const std::size_t num_colors;
-			const T diag_value;
-			const T non_diag_value;
-			const std::size_t min_phys_size;
-			const std::size_t max_levels;
-			const std::size_t coarsening_step;
+			std::array< std::size_t, DIMS > physical_sys_sizes;
+			std::size_t halo_size;
+			std::size_t num_colors;
+			T diag_value;
+			T non_diag_value;
+			std::size_t min_phys_size;
+			std::size_t max_levels;
+			std::size_t coarsening_step;
 		};
 
+		// SystemData must have a zero_temp_vectors()
+		template< std::size_t DIMS, typename IOType, typename NonzeroType, typename SystemData >
+		grb::RC build_base_system(
+			typename std::enable_if<
+				std::is_base_of< system_data< IOType, NonzeroType >, SystemData >::value,
+			SystemData& >::type system,
+			size_t system_size,
+			const std::array< std::size_t, DIMS > & physical_sys_sizes,
+			size_t halo_size,
+			NonzeroType diag_value,
+			NonzeroType non_diag_value,
+			size_t num_colors,
+			std::array< double, 3 > & times
+		) {
+
+			grb::RC rc { grb::SUCCESS };
+			const size_t pid { spmd<>::pid() };
+			grb::utils::Timer timer;
+			static const char * const log_prefix = "  -- ";
+
+			using coord_t = unsigned;
+			static_assert( DIMS > 0, "DIMS must be > 0" );
+			size_t n { std::accumulate( physical_sys_sizes.cbegin(), physical_sys_sizes.cend(),
+				1UL, std::multiplies< size_t >() ) };
+			if( n > std::numeric_limits< coord_t >::max() ) {
+				throw std::domain_error( "CoordT cannot store the matrix coordinates" );
+			}
+			std::array< coord_t, DIMS > sys_sizes;
+			for( size_t i = 0; i < DIMS; i++ ) sys_sizes[i] = physical_sys_sizes[i];
+			grb::algorithms::hpcg_builder< DIMS, coord_t, NonzeroType > system_generator( sys_sizes, halo_size );
+
+			MASTER_PRINT( pid, log_prefix << "generating system matrix..." );
+			timer.reset();
+			rc = build_ndims_system_matrix< DIMS, coord_t, NonzeroType >(
+				system.A,
+				system_generator,
+				diag_value, non_diag_value
+			);
+			if( rc != grb::SUCCESS ) {
+				return rc;
+			}
+			times[ 0 ] = timer.time();
+			MASTER_PRINT( pid, " time (ms) " << times[ 0 ] << std::endl );
+
+			// set values of vectors
+			MASTER_PRINT( pid, log_prefix << "populating vectors..." );
+			timer.reset();
+			rc = set( system.A_diagonal, diag_value );
+			if( rc != grb::SUCCESS ) {
+				return rc;
+			}
+			rc = system.zero_temp_vectors();
+			if( rc != grb::SUCCESS ) {
+				return rc;
+			}
+			times[ 1 ] = timer.time();
+			MASTER_PRINT( pid, " time (ms) " << times[ 1 ] << std::endl );
+
+
+			MASTER_PRINT( pid, log_prefix << "generating color masks..." );
+			timer.reset();
+			rc = build_static_color_masks( system.color_masks, system_size, num_colors );
+			if( rc != grb::SUCCESS ) {
+				return rc;
+			}
+			times[ 2 ] = timer.time();
+			MASTER_PRINT( pid, " time (ms) " << times[ 2 ] << std::endl );
+
+			return rc;
+		}
+
 		/**
 		 * @brief Generates an entire HPCG problem according to the parameters in \p params , storing it in \p holder .
 		 *
@@ -87,9 +159,13 @@ namespace grb {
 		 * otherwise the first unsuccessful return value
 		 */
 		template< std::size_t DIMS, typename T = double >
-		grb::RC build_hpcg_system( std::unique_ptr< grb::algorithms::hpcg_data< T, T, T > > & holder, hpcg_system_params< DIMS, T > & params ) {
+		grb::RC build_hpcg_system(
+			std::unique_ptr< grb::algorithms::hpcg_data< T, T, T > > & holder,
+			const hpcg_system_params< DIMS, T > & params
+		) {
 			// n is the system matrix size
-			const std::size_t n { std::accumulate( params.physical_sys_sizes.cbegin(), params.physical_sys_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
+			const std::size_t n { std::accumulate( params.physical_sys_sizes.cbegin(),
+				params.physical_sys_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
 
 			grb::algorithms::hpcg_data< T, T, T > * data { new grb::algorithms::hpcg_data< T, T, T >( n ) };
 
@@ -100,30 +176,21 @@ namespace grb {
 			grb::RC rc { grb::SUCCESS };
 			const size_t pid { spmd<>::pid() };
 			grb::utils::Timer timer;
-			MASTER_PRINT( pid, "\n-- generating system matrix...\n" << std::endl );
-			grb::spmd<>::barrier();
-			timer.reset();
-			rc = build_ndims_system_matrix< DIMS, T >( data->A, params.physical_sys_sizes, params.halo_size, params.diag_value, params.non_diag_value );
-			MASTER_PRINT( pid, "\n-- generating system matrix... time (ms) " << timer.time() << std::endl );
 
+			std::array< double, 3 > times;
+			MASTER_PRINT( pid, "\n-- main system" << std::endl );
+			rc = build_base_system< DIMS, T, T, grb::algorithms::hpcg_data< T, T, T > >( *data, n, params.physical_sys_sizes, params.halo_size,
+				params.diag_value, params.non_diag_value, params.num_colors, times );
 			if( rc != grb::SUCCESS ) {
-				MASTER_PRINT( pid, "Failure to generate the initial system ("
-					<< toString( rc ) << ") of size " << n << "\n" );
+				MASTER_PRINT( pid, " error: " << toString( rc ) );
 				return rc;
 			}
-
-			// set values of vectors
-			MASTER_PRINT( pid, "-- populating vectors..." );
-			timer.reset();
-			set( data->A_diagonal, params.diag_value );
-			data->zero_temp_vectors();
-			MASTER_PRINT( pid, " time (ms) " << timer.time() << std::endl );
-
-
-			MASTER_PRINT( pid, "-- generating color masks...\n" << std::endl );
-			timer.reset();
-			build_static_color_masks( data->color_masks, n, params.num_colors );
-			MASTER_PRINT( pid, "\n\n-- generating color masks... time (ms) " << timer.time() << std::endl );
+			MASTER_PRINT( pid, "-- main system generation time (ms) "
+				"[system matrix,vectors,color masks]:"
+				<< times[ 0 ]
+				<< "," << times[ 1 ]
+				<< "," << times[ 2 ] << std::endl;
+			);
 
 			// initialize coarsening with additional pointers and dimensions copies to iterate and divide
 			grb::algorithms::multi_grid_data< T, T > ** coarser = &data->coarser_level;
@@ -142,47 +209,32 @@ namespace grb {
 				std::size_t coarser_size { std::accumulate( coarser_sizes.cbegin(), coarser_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
 				std::size_t previous_size { std::accumulate( previous_sizes.cbegin(), previous_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
 				// build data structures for new level
-				grb::algorithms::multi_grid_data< double, double > * new_coarser { new grb::algorithms::multi_grid_data< double, double >( coarser_size, previous_size ) };
+				grb::algorithms::multi_grid_data< T, T > * new_coarser { new grb::algorithms::multi_grid_data< double, double >( coarser_size, previous_size ) };
 				// install coarser level immediately to cleanup in case of build error
 				*coarser = new_coarser;
 
-				MASTER_PRINT( pid, "-- level " << coarsening_level << "\n\tgenerating coarsening matrix...\n" );
+				MASTER_PRINT( pid, "-- level " << coarsening_level << "\n  -- generating coarsening matrix...\n" );
 				timer.reset();
 				// initialize coarsener matrix, system matrix and diagonal vector for the coarser level
 				rc = build_ndims_coarsener_matrix< DIMS >( new_coarser->coarsening_matrix, coarser_sizes, previous_sizes );
 				if( rc != grb::SUCCESS ) {
-					MASTER_PRINT( pid, "Failure to generate coarsening matrix (" << toString( rc ) << ").\n" );
+					MASTER_PRINT( pid, " error: " << toString( rc ) );
 					return rc;
 				}
 				double coarsener_gen_time{ timer.time() };
 
-				MASTER_PRINT( pid, "\tgenerating system matrix...\n" );
-				timer.reset();
-				rc = build_ndims_system_matrix< DIMS, T >( new_coarser->A, coarser_sizes, params.halo_size, params.diag_value, params.non_diag_value );
+				rc = build_base_system< DIMS, T, T, grb::algorithms::multi_grid_data< T, T > >( *new_coarser, coarser_size, coarser_sizes, params.halo_size,
+					params.diag_value, params.non_diag_value, params.num_colors, times );
 				if( rc != grb::SUCCESS ) {
-					MASTER_PRINT( pid, "Failure to generate system matrix (" << toString( rc )
-						<< ") for size " << coarser_size << "\n" );
+					MASTER_PRINT( pid, " error: " << toString( rc ) );
 					return rc;
 				}
-				double coarse_sys_gen_time{ timer.time() };
-
-				MASTER_PRINT( pid, "\tpopulating vectors...\n" );
-				timer.reset();
-				set( new_coarser->A_diagonal, params.diag_value );
-				new_coarser->zero_temp_vectors();
-				double coarser_vec_gen_time{ timer.time() };
-
-				// build color masks for coarser level (same masks, but with coarser system size)
-				MASTER_PRINT( pid, "\tgenerating color masks..." << std::endl );
-				timer.reset();
-				rc = build_static_color_masks( new_coarser->color_masks, coarser_size, params.num_colors );
-				double coarse_masks_sys_time{ timer.time() };
-				MASTER_PRINT( pid, "-- level " << coarsening_level << "... time (ms) for "
-					"[coarsening matrix,coarse system matrix,coarser vectors,color masks]:"
+				MASTER_PRINT( pid, "-- level generation time (ms) "
+					"[level,coarsening matrix,system matrix,vectors,color masks]:"
 					<< coarsening_level << "," << coarsener_gen_time
-					<< "," << coarse_sys_gen_time
-					<< "," << coarser_vec_gen_time
-					<< "," << coarse_masks_sys_time << std::endl;
+					<< "," << times[ 0 ]
+					<< "," << times[ 1 ]
+					<< "," << times[ 2 ] << std::endl;
 				);
 
 				// prepare for new iteration

From 7525c1811b530711d592a7805d9501641d3d94c2 Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Fri, 18 Nov 2022 11:50:48 +0100
Subject: [PATCH 06/28] adding greedy coloring algorithm for HPCG smoother and
 generating color masks according to the result of the coloring algorithm and
 adding test for coloring algorithm

---
 .../graphblas/algorithms/hpcg/coloring.hpp    | 165 ++++++++++++++++++
 .../algorithms/hpcg/matrix_building_utils.hpp | 132 +++++---------
 .../algorithms/hpcg/ndim_matrix_builders.hpp  |  12 ++
 .../algorithms/hpcg/system_building_utils.hpp |  58 ++++--
 tests/smoke/hpcg.cpp                          |  64 +++++--
 5 files changed, 307 insertions(+), 124 deletions(-)
 create mode 100644 include/graphblas/algorithms/hpcg/coloring.hpp

diff --git a/include/graphblas/algorithms/hpcg/coloring.hpp b/include/graphblas/algorithms/hpcg/coloring.hpp
new file mode 100644
index 000000000..1e6378c59
--- /dev/null
+++ b/include/graphblas/algorithms/hpcg/coloring.hpp
@@ -0,0 +1,165 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#ifndef _H_GRB_ALGORITHMS_HPCG_COLORING
+#define _H_GRB_ALGORITHMS_HPCG_COLORING
+
+#include <vector>
+#include <cstddef>
+
+#include <graphblas/utils/geometry/linearized_halo_ndim_system.hpp>
+
+namespace grb {
+	namespace algorithms {
+
+
+		/**
+		 * Coloring algorithm for matrix generated by a \p DIMS - dimensional system.
+		 *
+		 * This function implements a <b>greedy heuristic</b> to color the rows of a matrix generated by
+		 * a \p DIMS - dimensional generator \p system, so that no two connected elements <em>(i,j)</em>
+		 * in the system (corresponding to a nonzero <em>(i,j)</em> entry in the matrix) have the same color.
+		 * If \p reorder_rows_per_color is false (as per default), the coloring information is stored into
+		 * \p row_colors, while \p color_counters stores the number of rows for each color.
+		 *
+		 * If \p reorder_rows_per_color is true, the function performs the additional step of \b re-ordering
+		 * the rows depending on their color: rows of color \a 0 are moved first, then rows of color \a 1
+		 * are moved to the following positions and so on. In this case, \p row_colors stores the new row number
+		 * while \p color_counters stores at each position \a i the position right after the last row of color \a i.
+		 *
+		 * In both cases, \a color_counters.size() gives the number of found colors.
+		 *
+		 * @tparam DIMS dimensions of the system
+		 * @tparam CoordType type of the coordinates
+		 * @param[in] system generator for a \p DIMS - dimensional system with halo
+		 * @param[out] row_colors if \p reorder_rows_per_color is false, stores the color of each row;
+		 * 	if \p reorder_rows_per_color is true, stores the new position of each row, so that rows
+		 * 	of the same color are grouped together; should be empty on entry (values are inserted at its front)
+		 * @param[out] color_counters if \p reorder_rows_per_color is false, stores the number of rows per color;
+		 * 	if \p reorder_rows_per_color is true, stores at each position \a i the offset in the reordered rows
+		 * 	right after the last row of color \a i; should be empty on entry (values are inserted at its front)
+		 * @param[in] reorder_rows_per_color whether to do the clustering after the coloring
+		 */
+		template<
+			std::size_t DIMS,
+			typename CoordType
+		> void color_matrix_greedy(
+			const grb::utils::geometry::linearized_halo_ndim_system< CoordType, DIMS > &system,
+			std::vector< CoordType > &row_colors,
+			std::vector< CoordType > &color_counters,
+			bool reorder_rows_per_color = false
+		) {
+
+			// Greedy coloring: each row takes the smallest color not used by any of its
+			// lower-numbered neighbors; a new color is created only when all current ones are taken.
+
+			CoordType nrows = system.system_size();
+			row_colors.insert( row_colors.begin(), nrows, nrows ); // value `nrow' means `uninitialized'; initialized colors go from 0 to nrow-1
+			CoordType totalColors = 1;
+			row_colors[0] = 0; // first point gets color 0
+
+			// Finds colors in a greedy (a likely non-optimal) fashion.
+			typename grb::utils::geometry::linearized_halo_ndim_system< CoordType, DIMS >::iterator begin = system.begin();
+			begin.next_element(); // skip first row
+
+			while( begin.has_more_elements() ) {
+				CoordType curRow = begin->get_element_linear();
+
+				if( row_colors[ curRow ] != nrows ) {
+					// if color already assigned to curRow
+					continue;
+				}
+				std::vector< bool > assigned( totalColors, false );
+				CoordType currentlyAssigned = 0;
+
+				while( begin.has_more_neighbours() ) {
+					CoordType curCol = begin->get_neighbor_linear();
+					if( curCol < curRow ) {
+						assert( row_colors[ curCol ] < nrows ); // if curCol < curRow, curCol has already a color assigned
+						std::vector< bool >::reference color_is_assigned = assigned[ row_colors[ curCol ] ];
+						if( !color_is_assigned ) {
+							// count how many colors are already assigned
+							(void) currentlyAssigned++;
+						}
+						// track which colors are assigned
+						color_is_assigned = true;
+					} // else // could take advantage of indices being sorted
+					begin.next_neighbour();
+				}
+
+				if( currentlyAssigned < totalColors ) {
+					// if there is at least one color left to use, look for it
+					for( CoordType j = 0; j < totalColors; ++j ) {
+						if( !assigned[ j ] ) {
+							// if no neighbor with this color, use it for this row
+							row_colors[ curRow ] = j;
+							break;
+						}
+					}
+				} else {
+					assert( row_colors[ curRow ] == nrows );
+					if( row_colors[ curRow ] == nrows ) {
+						row_colors[ curRow ] = totalColors;
+						(void) totalColors++;
+					} else {
+						assert( 0 ); // should never get here
+					}
+				}
+				begin.next_element();
+			}
+
+#ifdef _DEBUG
+			std::cout << "assigned colors: " << totalColors << " [ <row> -> <color>]\n";
+			for( size_t i = 0; i < row_colors.size(); i++ ){
+				std::cout << i << " -> " << row_colors[ i ] << ", ";
+			}
+			std::cout << std::endl;
+#endif
+
+			// count number of vertices per color
+			color_counters.insert( color_counters.begin(), totalColors, 0 );
+			for( CoordType i = 0; i < nrows; ++i ) {
+				(void) color_counters[ row_colors[ i ] ]++;
+			}
+
+			if( !reorder_rows_per_color ) {
+				return;
+			}
+
+			// form in-place prefix scan
+			CoordType old = 0, old0;
+			for( CoordType i = 1; i < totalColors; ++i ) {
+				old0 = color_counters[i];
+				color_counters[i] = color_counters[i-1] + old;
+				old = old0;
+			}
+			color_counters[0] = 0;
+
+			// translate `colors' into a permutation
+			for( CoordType i = 0; i < nrows; ++i ) {
+				row_colors[ i ] = color_counters[ row_colors[ i ] ]++;
+			}
+		}
+
+	} // namespace algorithms
+} // namespace grb
+
+
+
+#endif // _H_GRB_ALGORITHMS_HPCG_COLORING
+
diff --git a/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp b/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
index 2ccd18fa8..45486e99b 100644
--- a/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
@@ -32,6 +32,8 @@
 #include <stdexcept>
 #include <utility>
 #include <limits.h>
+#include <iterator>
+#include <type_traits>
 
 #include <graphblas.hpp>
 
@@ -57,6 +59,9 @@ namespace grb {
 			IterT &begin,
 			IterT &end
 		) {
+			static_assert( std::is_base_of< std::random_access_iterator_tag,
+				typename std::iterator_traits< IterT >::iterator_category >::value,
+				"the given iterator is not a random access one" );
 			assert( num_nonzeroes == static_cast< size_t >( end - begin ) );
 			size_t first, last;
 			partition_nonzeroes( num_nonzeroes, first, last );
@@ -108,9 +113,8 @@ namespace grb {
 			grb::algorithms::matrix_generator_iterator< DIMS, coord_t, T > end(
 				hpcg_system.make_end_iterator( diag_value, non_diag_value )
 			);
-			partition_iteration_range( hpcg_system.system_size(), begin, end );
+			partition_iteration_range( hpcg_system.num_neighbors(), begin, end );
 
-			// std::cout << "num nonzeroes " << ( end - begin ) << std::endl;
 			return buildMatrixUnique( M, begin, end, grb::IOMode::PARALLEL );
 		}
 
@@ -182,70 +186,6 @@ namespace grb {
 			return buildMatrixUnique( M, begin, end, grb::IOMode::PARALLEL );
 		}
 
-		template< typename T >
-		struct color_mask_iter {
-
-			using self_t = color_mask_iter< T >;
-			using iterator_category = std::random_access_iterator_tag;
-			using value_type = T;
-			using pointer = const value_type *;
-			using reference = value_type;
-			using difference_type = long;
-
-			color_mask_iter() = delete;
-
-			color_mask_iter( T _num_cols, T _pos ) noexcept:
-				color_num( _num_cols),
-				position( _pos ) {}
-
-
-			color_mask_iter( const self_t &o ):
-				color_num( o.color_num ),
-				position( o.position ) {}
-
-			//self_t & operator=( const self_t & ) = default;
-
-			bool operator!=( const self_t &o ) const {
-				return position != o.position;
-			}
-
-			self_t & operator++() noexcept {
-				position += color_num;
-				return *this;
-			}
-
-			self_t & operator++( int ) noexcept {
-				return operator++();
-			}
-
-			self_t & operator+=( size_t offset ) noexcept {
-				position += offset * color_num;
-				return *this;
-			}
-
-			difference_type operator-( const self_t &o ) const noexcept {
-				return static_cast< difference_type >( ( position - o.position ) / color_num );
-			}
-
-			pointer operator->() const {
-				return &position;
-			}
-
-			reference operator*() const {
-				// std::cout << "returning " << position << std::endl;
-				return position;
-			}
-
-			static self_t build_end_iterator( T vsize, T _num_cols, T _col ) {
-				T final_pos = ( ( vsize - _col + _num_cols - 1 ) / _num_cols ) * _num_cols + _col;
-				return self_t( _num_cols, final_pos );
-			}
-
-			private:
-			const T color_num;
-			T position;
-		};
-
 		template< typename CoordT >
 		struct true_iter {
 
@@ -318,40 +258,50 @@ namespace grb {
 		 */
 		template< enum grb::Backend B >
 		grb::RC build_static_color_masks(
-			std::vector< grb::Vector< bool, B > > & masks,
 			std::size_t matrix_size,
-			std::size_t colors
+			const std::vector< std::vector< size_t > > &per_color_rows,
+			std::vector< grb::Vector< bool, B > > & masks
 		) {
 			if( ! masks.empty() ) {
-				throw std::invalid_argument( "vector of masks is expected to be "
-											"empty" );
-			}
-			if( matrix_size < colors ) {
-				throw std::invalid_argument( "syztem size is < number of colors: too "
-											"small" );
+				throw std::invalid_argument( "vector of masks is expected to be empty" );
 			}
-			grb::RC rc { grb::SUCCESS };
-			masks.reserve( colors );
-			for( std::size_t i { 0U }; i < colors; i++ ) {
-				// build in-place, assuming the compiler deduces the right constructor according to B
+			for( size_t i = 0; i < per_color_rows.size(); i++ ) {
+				const std::vector< size_t > & rows = per_color_rows[ i ];
+				/*
+				{
+					std::cout << "\ncolor " << i << std::endl;
+					for( size_t row : rows ) {
+						std::cout << row << " ";
+					}
+					std::cout << std::endl;
+				}
+				*/
 				masks.emplace_back( matrix_size );
-				grb::Vector< bool > & mask = masks.back();
-				// grb::set(mask, false); // DO NOT initialize false's explicitly, otherwise
-				// RBGS will touch them too and the runtime will increase!
+				grb::Vector< bool > & output_mask = masks.back();
+				std::vector< size_t >::const_iterator begin = rows.cbegin();
+				std::vector< size_t >::const_iterator end = rows.cend();
+				// partition_iteration_range( rows.size(), begin, end );
+				grb::RC rc = grb::buildVectorUnique( output_mask, begin , end, true_iter< size_t >( 0 ),
+					true_iter< size_t >( std::distance( begin, end ) ), IOMode::SEQUENTIAL );
+				if( rc != SUCCESS ) {
+					std::cerr << "error while creating output mask for color " << i << ": "
+						<< toString( rc ) << std::endl;
+					return rc;
+				}
 				/*
-				for( std::size_t j = i; j < matrix_size; j += colors ) {
-					rc = grb::setElement( mask, true, j );
-					assert( rc == grb::SUCCESS );
-					if( rc != grb::SUCCESS )
-						return rc;
+				{
+					std::cout << "mask color " << i << std::endl;
+					size_t count = 0;
+					for( const auto & v : output_mask ) {
+						std::cout << v.first << " ";
+						count++;
+						if( count > 20 ) break;
+					}
+					std::cout << std::endl;
 				}
 				*/
-				color_mask_iter< unsigned > begin( colors, i );
-				color_mask_iter< unsigned > end =
-					color_mask_iter< unsigned >::build_end_iterator( matrix_size, colors, i );
-				grb::buildVectorUnique( mask, begin, end, true_iter< size_t >( 0 ), true_iter< size_t >( matrix_size ), IOMode::SEQUENTIAL );
 			}
-			return rc;
+			return grb::SUCCESS;
 		}
 
 	} // namespace algorithms
diff --git a/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp b/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp
index db94c8a29..35a15238d 100644
--- a/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp
+++ b/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp
@@ -215,6 +215,10 @@ namespace grb {
 				return _val.v();
 			}
 
+			const __iter_t & it() const {
+				return this->_sys_iter;
+			}
+
 		private:
 			value_type _val;
 			const linear_system_t *_lin_system;
@@ -298,6 +302,10 @@ namespace grb {
 				return system.halo_system_size();
 			}
 
+			const system_t & get_generator() const {
+				return system;
+			}
+
 			hpcg_sys_iterator make_begin_iterator(
 				T diag,
 				T non_diag
@@ -499,6 +507,10 @@ namespace grb {
 				return _val.v();
 			}
 
+			const __iter_t & it() const {
+				return this->_sys_iter;
+			}
+
 		private:
 			//// incremented when incrementing the row coordinates; is is the ration between
 			//// #finer_sizes and row_generator#physical_sizes
diff --git a/include/graphblas/algorithms/hpcg/system_building_utils.hpp b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
index 78759a539..77bef1995 100644
--- a/include/graphblas/algorithms/hpcg/system_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
@@ -37,6 +37,8 @@
 #include "hpcg_data.hpp"
 #include "matrix_building_utils.hpp"
 
+#include "coloring.hpp"
+
 #ifndef MASTER_PRINT
 #define INTERNAL_MASTER_PRINT
 #define MASTER_PRINT( pid, txt ) if( pid == 0 ) { std::cout << txt; }
@@ -69,7 +71,6 @@ namespace grb {
 		struct hpcg_system_params {
 			std::array< std::size_t, DIMS > physical_sys_sizes;
 			std::size_t halo_size;
-			std::size_t num_colors;
 			T diag_value;
 			T non_diag_value;
 			std::size_t min_phys_size;
@@ -77,6 +78,17 @@ namespace grb {
 			std::size_t coarsening_step;
 		};
 
+		template< typename CoordType > void split_rows_by_color(
+			const std::vector< CoordType > & row_colors,
+			size_t num_colors,
+			std::vector< std::vector< CoordType > > & per_color_rows
+		) {
+			per_color_rows.resize( num_colors );
+			for( CoordType i = 0; i < row_colors.size(); i++ ) {
+				per_color_rows[ row_colors[ i ] ].push_back( i );
+			}
+		}
+
 		// SystemData must have a zero_temp_vectors()
 		template< std::size_t DIMS, typename IOType, typename NonzeroType, typename SystemData >
 		grb::RC build_base_system(
@@ -88,8 +100,7 @@ namespace grb {
 			size_t halo_size,
 			NonzeroType diag_value,
 			NonzeroType non_diag_value,
-			size_t num_colors,
-			std::array< double, 3 > & times
+			std::array< double, 4 > & times
 		) {
 
 			grb::RC rc { grb::SUCCESS };
@@ -97,7 +108,7 @@ namespace grb {
 			grb::utils::Timer timer;
 			static const char * const log_prefix = "  -- ";
 
-			using coord_t = unsigned;
+			using coord_t = size_t;
 			static_assert( DIMS > 0, "DIMS must be > 0" );
 			size_t n { std::accumulate( physical_sys_sizes.cbegin(), physical_sys_sizes.cend(),
 				1UL, std::multiplies< size_t >() ) };
@@ -135,15 +146,28 @@ namespace grb {
 			times[ 1 ] = timer.time();
 			MASTER_PRINT( pid, " time (ms) " << times[ 1 ] << std::endl );
 
+			MASTER_PRINT( pid, log_prefix << "running coloring heuristics..." );
+			timer.reset();
+			std::vector< coord_t > colors, color_counters;
+			color_matrix_greedy( system_generator.get_generator(), colors, color_counters );
+			std::vector< std::vector< coord_t > > per_color_rows;
+			split_rows_by_color( colors, color_counters.size(), per_color_rows );
+			if( rc != grb::SUCCESS ) {
+				return rc;
+			}
+			times[ 2 ] = timer.time();
+			MASTER_PRINT( pid, " found " << color_counters.size() << " colors, time (ms) "
+				<< times[ 2 ] << std::endl );
+
 
 			MASTER_PRINT( pid, log_prefix << "generating color masks..." );
 			timer.reset();
-			rc = build_static_color_masks( system.color_masks, system_size, num_colors );
+			rc = build_static_color_masks( system_size, per_color_rows, system.color_masks );
 			if( rc != grb::SUCCESS ) {
 				return rc;
 			}
-			times[ 2 ] = timer.time();
-			MASTER_PRINT( pid, " time (ms) " << times[ 2 ] << std::endl );
+			times[ 3 ] = timer.time();
+			MASTER_PRINT( pid, " time (ms) " << times[ 3 ] << std::endl );
 
 			return rc;
 		}
@@ -177,19 +201,17 @@ namespace grb {
 			const size_t pid { spmd<>::pid() };
 			grb::utils::Timer timer;
 
-			std::array< double, 3 > times;
+			std::array< double, 4 > times;
 			MASTER_PRINT( pid, "\n-- main system" << std::endl );
 			rc = build_base_system< DIMS, T, T, grb::algorithms::hpcg_data< T, T, T > >( *data, n, params.physical_sys_sizes, params.halo_size,
-				params.diag_value, params.non_diag_value, params.num_colors, times );
+				params.diag_value, params.non_diag_value, times );
 			if( rc != grb::SUCCESS ) {
 				MASTER_PRINT( pid, " error: " << toString( rc ) );
 				return rc;
 			}
 			MASTER_PRINT( pid, "-- main system generation time (ms) "
-				"[system matrix,vectors,color masks]:"
-				<< times[ 0 ]
-				<< "," << times[ 1 ]
-				<< "," << times[ 2 ] << std::endl;
+				"(system matrix,vectors,coloring,color masks):" << times[ 0 ] << "," << times[ 1 ]
+				<< "," << times[ 2 ] << "," << times[ 3 ] << std::endl;
 			);
 
 			// initialize coarsening with additional pointers and dimensions copies to iterate and divide
@@ -224,17 +246,15 @@ namespace grb {
 				double coarsener_gen_time{ timer.time() };
 
 				rc = build_base_system< DIMS, T, T, grb::algorithms::multi_grid_data< T, T > >( *new_coarser, coarser_size, coarser_sizes, params.halo_size,
-					params.diag_value, params.non_diag_value, params.num_colors, times );
+					params.diag_value, params.non_diag_value, times );
 				if( rc != grb::SUCCESS ) {
 					MASTER_PRINT( pid, " error: " << toString( rc ) );
 					return rc;
 				}
 				MASTER_PRINT( pid, "-- level generation time (ms) "
-					"[level,coarsening matrix,system matrix,vectors,color masks]:"
-					<< coarsening_level << "," << coarsener_gen_time
-					<< "," << times[ 0 ]
-					<< "," << times[ 1 ]
-					<< "," << times[ 2 ] << std::endl;
+					"(level,coarsening matrix,system matrix,vectors,coloring,color masks):"
+					<< coarsening_level << "," << coarsener_gen_time << "," << times[ 0 ] << "," << times[ 1 ]
+					<< "," << times[ 2 ] << "," << times[ 3 ] << std::endl;
 				);
 
 				// prepare for new iteration
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index 8425432de..93c69d87e 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -35,10 +35,6 @@
 #include <type_traits>
 
 #include <graphblas.hpp>
-#include <graphblas/algorithms/hpcg/hpcg.hpp>
-#include <graphblas/algorithms/hpcg/system_building_utils.hpp>
-
-#include <graphblas/algorithms/hpcg/old_ndim_matrix_builders.hpp>
 
 #include <chrono>
 
@@ -68,6 +64,12 @@ void print_norm( const grb::Vector< T > &r, const char * head, const Ring &ring
 #define DBG_print_norm( vec, head ) print_norm( vec, head )
 #endif
 
+#include <graphblas/algorithms/hpcg/hpcg.hpp>
+#include <graphblas/algorithms/hpcg/system_building_utils.hpp>
+
+#include <graphblas/algorithms/hpcg/old_ndim_matrix_builders.hpp>
+#include <graphblas/algorithms/hpcg/coloring.hpp>
+
 #include <graphblas/utils/Timer.hpp>
 
 #include <utils/argument_parser.hpp>
@@ -160,9 +162,9 @@ T static next_pow_2( T n ) {
  * @return RC grb::SUCCESS if the system initialization within GraphBLAS succeeded
  */
 static RC build_3d_system( std::unique_ptr< hpcg_data< double, double, double > > & holder, const system_input & in ) {
-	const std::array< size_t, 3 > physical_sys_sizes { in.nx, in.ny, in.nz };
 	struct hpcg_system_params< 3, double > params {
-		physical_sys_sizes, HALO_RADIUS, BAND_WIDTH_3D * 2 + 1, SYSTEM_DIAG_VALUE, SYSTEM_NON_DIAG_VALUE, PHYS_SYSTEM_SIZE_MIN, in.max_coarsening_levels, 2
+		{ in.nx, in.ny, in.nz }, HALO_RADIUS, SYSTEM_DIAG_VALUE, SYSTEM_NON_DIAG_VALUE,
+			PHYS_SYSTEM_SIZE_MIN, in.max_coarsening_levels, 2
 	};
 
 	return build_hpcg_system< 3, double >( holder, params );
@@ -361,6 +363,8 @@ static void test_iters();
 static void test_iters2();
 #endif
 
+void test_system_iter();
+
 int main( int argc, char ** argv ) {
 	simulation_input sim_in;
 	size_t test_outer_iterations;
@@ -371,6 +375,8 @@ int main( int argc, char ** argv ) {
 	test_iters2();
 	return 0;
 #endif
+	test_system_iter();
+	// return 0;
 
 	parse_arguments( sim_in, test_outer_iterations, max_residual_norm, argc, argv );
 	thcout << "System size x: " << sim_in.nx << std::endl;
@@ -490,6 +496,34 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 }
 
 
+void test_system_iter() {
+	constexpr size_t DIMS = 2;
+	using row_index_t = size_t;
+	std::array< row_index_t, DIMS > dims;
+	dims.fill( 4 );
+	grb::utils::geometry::linearized_halo_ndim_system< row_index_t, DIMS > system( dims, 1 );
+	grb::utils::geometry::linearized_halo_ndim_system< row_index_t, DIMS >::iterator begin = system.begin();
+
+	while( begin.has_more_elements() ) {
+		std::cout << "row " << begin->get_element_linear() << ": ";
+		while( begin.has_more_neighbours() ) {
+			std::cout << /* "-- " << */ begin->get_neighbor_linear() << " ";
+			begin.next_neighbour();
+		}
+		std::cout << std::endl;
+		begin.next_element();
+	}
+
+	std::vector< size_t > colors, counters;
+	color_matrix_greedy( system, colors, counters );
+
+	std::cout << "final assignment:" << std::endl;
+	for( size_t i = 0; i < colors.size(); i++ ){
+		std::cout << i << " -> " << colors[ i ] << ", ";
+	}
+	std::cout << std::endl;
+}
+
 
 
 struct NZ {
@@ -510,9 +544,10 @@ static void test_iters() {
 	using clock = std::chrono::steady_clock;
 
 	constexpr size_t DIMS = 3;
+	using coord_t = size_t;
 
-	std::array< unsigned, DIMS > finer_sizes{ 1024, 1024, 1024};
-	std::array< unsigned, DIMS > coarser_sizes;
+	std::array< coord_t, DIMS > finer_sizes{ 1024, 1024, 1024};
+	std::array< coord_t, DIMS > coarser_sizes;
 	for( size_t i = 0; i < finer_sizes.size(); i++ ) {
 		coarser_sizes[ i ] = finer_sizes[ i ] / 2;
 	}
@@ -528,8 +563,8 @@ static void test_iters() {
 	grb::algorithms::old::coarsener_generator_iterator< DIMS, double > send( lcoarser_sizes, lfiner_sizes, rows );
 
 
-	using citer = hpcg_coarsener_builder< DIMS, unsigned, double >::hpcg_coarsener_iterator;
-	hpcg_coarsener_builder< DIMS, unsigned, double > coarsener( coarser_sizes, finer_sizes );
+	using citer = hpcg_coarsener_builder< DIMS, coord_t, double >::hpcg_coarsener_iterator;
+	hpcg_coarsener_builder< DIMS, coord_t, double > coarsener( coarser_sizes, finer_sizes );
 	citer pbegin( coarsener.make_begin_iterator() );
 	const citer pend( coarsener.make_end_iterator() );
 
@@ -595,21 +630,22 @@ static void test_iters() {
 static void test_iters2() {
 
 	using clock = std::chrono::steady_clock;
+	using coord_t = size_t;
 
 	constexpr size_t DIMS = 3, halo_size = 1;
 	constexpr double diag_value = 26.0, non_diag_value = -1.0;
 
-	std::array< unsigned, DIMS > sys_sizes{ 64, 64, 64};
+	std::array< coord_t, DIMS > sys_sizes{ 64, 64, 64};
 	size_t n { std::accumulate( sys_sizes.cbegin(), sys_sizes.cend(), 1UL, std::multiplies< size_t >() ) };
 
 	std::array< size_t, DIMS > large_sys_sizes{ 64, 64, 64};
 	old::matrix_generator_iterator< DIMS, double > sbegin( large_sys_sizes, 0UL, halo_size, diag_value, non_diag_value );
 	old::matrix_generator_iterator< DIMS, double > send( large_sys_sizes, n, halo_size, diag_value, non_diag_value );
 
-	hpcg_builder< DIMS, unsigned, double > hpcg_system( sys_sizes, halo_size );
-	matrix_generator_iterator< DIMS, unsigned, double > pbegin(
+	hpcg_builder< DIMS, coord_t, double > hpcg_system( sys_sizes, halo_size );
+	matrix_generator_iterator< DIMS, coord_t, double > pbegin(
 		hpcg_system.make_begin_iterator( diag_value, non_diag_value ) );
-	matrix_generator_iterator< DIMS, unsigned, double > pend(
+	matrix_generator_iterator< DIMS, coord_t, double > pend(
 		hpcg_system.make_end_iterator( diag_value, non_diag_value )
 	);
 

From f93194360c32c9dddbf8c11434cce742c82c47e0 Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Fri, 18 Nov 2022 14:48:05 +0100
Subject: [PATCH 07/28] re-organizing the code with dedicated runners for the
 various algorithmic components of HPCG - splitting facilities to build the
 data for multi-grid, smoother and coarsener - renaming classes in geometry to
 abide by naming conventions - better splitting of HPCG data structures -
 splitting out MG options into a dedicated data structure - splitting out
 options and outputs of HPCG - making kernels of HPCG simulation composable by
 the user - documenting classes for MG - moving IteratorValueAdaptor to the
 iterators folder - moving MG data structures to the files using them

---
 .../graphblas/algorithms/hpcg/coloring.hpp    |  23 +-
 include/graphblas/algorithms/hpcg/hpcg.hpp    | 254 +------
 .../graphblas/algorithms/hpcg/hpcg_data.hpp   | 227 ------
 .../algorithms/hpcg/matrix_building_utils.hpp | 310 --------
 .../algorithms/hpcg/multigrid_v_cycle.hpp     | 252 ------
 .../algorithms/hpcg/ndim_matrix_builders.hpp  | 322 ++------
 .../hpcg/old_matrix_building_utils.hpp        | 173 -----
 .../hpcg/old_ndim_matrix_builders.hpp         | 562 --------------
 .../algorithms/hpcg/system_building_utils.hpp | 443 ++++++-----
 .../algorithms/multigrid/coarsener.hpp        | 197 +++++
 .../multigrid/multigrid_building_utils.hpp    |  56 ++
 .../algorithms/multigrid/multigrid_cg.hpp     | 360 +++++++++
 .../algorithms/multigrid/multigrid_data.hpp   | 105 +++
 .../multigrid/multigrid_v_cycle.hpp           | 237 ++++++
 .../red_black_gauss_seidel.hpp                | 127 +++-
 .../utils/geometry/array_vector_storage.hpp   | 137 ++--
 .../utils/geometry/dynamic_vector_storage.hpp | 154 ++++
 .../utils/geometry/generic_vector_storage.hpp | 117 ---
 .../halo_matrix_generator_iterator.hpp        | 207 +++++
 .../linearized_halo_ndim_geometry.hpp         | 423 ++++++-----
 .../linearized_halo_ndim_iterator.hpp         | 718 +++++++++---------
 .../geometry/linearized_halo_ndim_system.hpp  | 185 ++---
 .../geometry/linearized_ndim_iterator.hpp     | 335 ++++----
 .../utils/geometry/linearized_ndim_system.hpp | 342 +++++----
 .../graphblas/utils/geometry/ndim_system.hpp  | 164 ++--
 .../graphblas/utils/geometry/ndim_vector.hpp  | 228 +++---
 .../utils/iterators/IteratorValueAdaptor.hpp  | 128 ++++
 .../utils/iterators/partition_range.hpp       |  71 ++
 tests/smoke/hpcg.cpp                          | 475 +++++-------
 29 files changed, 3545 insertions(+), 3787 deletions(-)
 delete mode 100644 include/graphblas/algorithms/hpcg/hpcg_data.hpp
 delete mode 100644 include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
 delete mode 100644 include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
 delete mode 100644 include/graphblas/algorithms/hpcg/old_matrix_building_utils.hpp
 delete mode 100644 include/graphblas/algorithms/hpcg/old_ndim_matrix_builders.hpp
 create mode 100644 include/graphblas/algorithms/multigrid/coarsener.hpp
 create mode 100644 include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp
 create mode 100644 include/graphblas/algorithms/multigrid/multigrid_cg.hpp
 create mode 100644 include/graphblas/algorithms/multigrid/multigrid_data.hpp
 create mode 100644 include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
 rename include/graphblas/algorithms/{hpcg => multigrid}/red_black_gauss_seidel.hpp (57%)
 create mode 100644 include/graphblas/utils/geometry/dynamic_vector_storage.hpp
 delete mode 100644 include/graphblas/utils/geometry/generic_vector_storage.hpp
 create mode 100644 include/graphblas/utils/geometry/halo_matrix_generator_iterator.hpp
 create mode 100644 include/graphblas/utils/iterators/IteratorValueAdaptor.hpp
 create mode 100644 include/graphblas/utils/iterators/partition_range.hpp

diff --git a/include/graphblas/algorithms/hpcg/coloring.hpp b/include/graphblas/algorithms/hpcg/coloring.hpp
index 1e6378c59..f9334afb3 100644
--- a/include/graphblas/algorithms/hpcg/coloring.hpp
+++ b/include/graphblas/algorithms/hpcg/coloring.hpp
@@ -1,6 +1,6 @@
 
 /*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,13 +27,12 @@
 namespace grb {
 	namespace algorithms {
 
-
 		/**
 		 * Coloring algorithm for matrix generated by a \p DIMS - dimensional system.
 		 *
 		 * This function implements a < b>greedy heuristics< /b> to color the rows of a matrix generated by
-		 * a \p DIMS - dimensional generator \p system, so that no two connected elements < em>(i,j)< /em>
-		 * in the system (corresponding to a nonzero < em>(i,j)< /em> entry in the matrix) have the same color.
+		 * a \p DIMS - dimensional generator \p system, so that no two connected elements \a i,j
+		 * in the system (corresponding to a nonzero \a (i,j) entry in the matrix) have the same color.
 		 * If \p reorder_rows_per_color is false (as per default), the coloring information is stored into
 		 * \p row_colors, while \p color_counters stores the number of rows for each color.
 		 *
@@ -44,22 +43,27 @@ namespace grb {
 		 *
 		 * In both cases, \a color_counters.size() gives the number of found colors.
 		 *
+		 * This algorithm performs a \a global coloring of the input system, i.e. it must run on the entire system
+		 * \a before any partitioning occurs. Although this is not scalable, it should not be a problem for
+		 * most sizes, as the constants in front of this algorithms are very small. Implementing a distributed
+		 * coloring algorithm is anyway out of the scope of this prototype.
+		 *
 		 * @tparam DIMS dimensions of the system
 		 * @tparam CoordType type of the coordinates
 		 * @param[in] system generator for an \p DIMS - dimesional system with halo
 		 * @param[out] row_colors if \p reorder_rows_per_color is false, stores the color of each row;
 		 * 	if \p reorder_rows_per_color is true, stores the new position of each row, so that rows
-		 * 	of the same color are gourped together; the initial content of the vector is destroyed
+		 * 	of the same color are grouped together; the initial content of the vector is destroyed
 		 * @param[out] color_counters if \p reorder_rows_per_color is false, stores the number of rows per color;
 		 * 	if \p reorder_rows_per_color is true, stores at each position \a i the offset in \p color_counters
 		 * 	where the (clustered) rows of color \a i start from; the initial content of the vector is destroyed
 		 * @param[in] reorder_rows_per_color whether to do the clustering after the coloring
 		 */
 		template<
-			std::size_t DIMS,
+			size_t DIMS,
 			typename CoordType
 		> void color_matrix_greedy(
-			const grb::utils::geometry::linearized_halo_ndim_system< CoordType, DIMS > &system,
+			const grb::utils::geometry::LinearizedHaloNDimSystem< CoordType, DIMS > &system,
 			std::vector< CoordType > &row_colors,
 			std::vector< CoordType > &color_counters,
 			bool reorder_rows_per_color = false
@@ -74,7 +78,7 @@ namespace grb {
 			row_colors[0] = 0; // first point gets color 0
 
 			// Finds colors in a greedy (a likely non-optimal) fashion.
-			typename grb::utils::geometry::linearized_halo_ndim_system< CoordType, DIMS >::iterator begin = system.begin();
+			typename grb::utils::geometry::LinearizedHaloNDimSystem< CoordType, DIMS >::Iterator begin = system.begin();
 			begin.next_element(); // skip first row
 
 			while( begin.has_more_elements() ) {
@@ -159,7 +163,4 @@ namespace grb {
 	} // namespace algorithms
 } // namespace grb
 
-
-
 #endif // _H_GRB_ALGORITHMS_HPCG_COLORING
-
diff --git a/include/graphblas/algorithms/hpcg/hpcg.hpp b/include/graphblas/algorithms/hpcg/hpcg.hpp
index 492eb038d..2d30584fe 100644
--- a/include/graphblas/algorithms/hpcg/hpcg.hpp
+++ b/include/graphblas/algorithms/hpcg/hpcg.hpp
@@ -1,6 +1,6 @@
 
 /*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,229 +15,57 @@
  * limitations under the License.
  */
 
-/**
- * @file hpcg.hpp
- * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief File with the main routine to run a full HPCG simulation, comprising multi-grid runs
- *        with Red-Black Gauss-Seidel smoothing.
- * @date 2021-04-30
- */
-
 #ifndef _H_GRB_ALGORITHMS_HPCG
 #define _H_GRB_ALGORITHMS_HPCG
 
-#include <graphblas.hpp>
-
-#include "hpcg_data.hpp"
-#include "multigrid_v_cycle.hpp"
-
-#include <graphblas/utils/Timer.hpp>
+#include <utility>
 
+#include <graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp>
+#include <graphblas/algorithms/multigrid/coarsener.hpp>
+#include <graphblas/algorithms/multigrid/multigrid_v_cycle.hpp>
+#include <graphblas/algorithms/multigrid/multigrid_cg.hpp>
 
 namespace grb {
 	namespace algorithms {
 
-		/**
-		 * @brief High-Performance Conjugate Gradient algorithm implementation running entirely on GraphBLAS.
-		 *
-		 * Finds the solution x of an \f$ A x = b \f$ algebraic system by running the HPCG algorithm.
-		 * The implementation here closely follows the reference HPCG benchmark used for the HPCG500 rank,
-		 * visible at https://github.com/hpcg-benchmark/hpcg.
-		 * The only difference is the usage of a Red-Black Gauss-Seidel smoother instead of the standard one
-		 * for performance reasons, as the standard Gauss-Seidel algorithm is inherently sequential and not
-		 * expressible in terms of standard linear algebra operations.
-		 * In particular, this implementation (as the standard one) couples a standard CG algorithm with a V-cycle
-		 * multi-grid solver to initially refine the tentative solution. This refinement step depends on the
-		 * availability of coarsening information, which should be stored inside \p data; otherwise,
-		 * the refinement is not performed and only the CG algorithm is run. For more information on inputs
-		 * and on coarsening information, you may consult the \ref hpcg_data class documentation.
-		 *
-		 * This implementation assumes that the vectors and matrices inside \p data are all correctly initialized
-		 * and populated with the proper values; in particular
-		 * - hpcg_data#x with the initial tentative solution (iterative solutions are also stored here)
-		 * - hpcg_data#A with the system matrix
-		 * - hpcg_data#b with the right-hand side vector \f$ b \f$
-		 * - hpcg_data#A_diagonal with the diagonal values of the matrix
-		 * - hpcg_data#color_masks with the color masks for this level
-		 * - hpcg_data#coarser_level with the information for the coarser multi-grid run (if any)
-		 * The other vectors are assumed to be inizialized (via the usual grb::Vector#Vector(size_t) constructor)
-		 * but not necessarily populated with values, as they are internally populated when needed; hence,
-		 * any previous values are overwritten.
-		 *
-		 * Failuers of GraphBLAS operations are handled by immediately stopping the execution and by returning
-		 * the failure code.
-		 *
-		 * @tparam IOType type of result and intermediate vectors used during computation
-		 * @tparam ResidualType type of the residual norm
-		 * @tparam NonzeroType type of matrix values
-		 * @tparam InputType type of values of the right-hand side vector b
-		 * @tparam Ring the ring of algebraic operators zero-values
-		 * @tparam Minus the minus operator for subtractions
-		 *
-		 * @param[in,out] data \ref hpcg_data object storing inputs, outputs and temporary vectors used for the computation,
-		 *                     as long as the information for the recursive multi-grid runs
-		 * @param[in] with_preconditioning whether to use pre-conditioning, i.e. to perform multi-grid runs
-		 * @param[in] presmoother_steps number of pre-smoother steps, for multi-grid runs
-		 * @param[in] postsmoother_steps nomber of post-smoother steps, for multi-grid runs
-		 * @param[in] max_iterations maximum number if iterations the simulation may run for; once reached,
-		 *                           the simulation stops even if the residual norm is above \p tolerance
-		 * @param[in] tolerance the tolerance over the residual norm, i.e. the value of the residual norm to stop
-		 *                      the simulation at
-		 * @param[out] iterations numbers of iterations performed
-		 * @param[out] norm_residual norm of the final residual
-		 * @param[in] ring the ring to perform the operations on
-		 * @param[in] minus the \f$ - \f$ operator for vector subtractions
-		 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-		 *                          unsuccessful operation otherwise
-		 */
-		template< typename IOType,
+		// simply "assemble" types
+		template<
+			typename IOType,
 			typename ResidualType,
 			typename NonzeroType,
 			typename InputType,
-			class Ring = Semiring< grb::operators::add< IOType >, grb::operators::mul< IOType >, grb::identities::zero, grb::identities::one >,
-			class Minus = operators::subtract< IOType > >
-		grb::RC hpcg( hpcg_data< IOType, NonzeroType, InputType > &data,
-			bool with_preconditioning,
-			const size_t presmoother_steps,
-			const size_t postsmoother_steps,
-			const size_t max_iterations,
-			const ResidualType tolerance,
-			size_t &iterations,
-			ResidualType &norm_residual,
-			bool print_iter_stats,
-			const Ring &ring = Ring(),
-			const Minus &minus = Minus()
-		) {
-			ResidualType alpha;
-
-			const grb::Matrix< NonzeroType > &A { data.A };
-			grb::Vector< IOType > &x { data.x };
-			const grb::Vector< InputType > &b { data.b };
-			grb::Vector< IOType > &r { data.r };  // residual vector
-			grb::Vector< IOType > &p { data.p };  // direction vector
-			grb::Vector< IOType > &Ap { data.u }; // temp vector
-			grb::Vector< IOType > &z { data.z };  // pre-conditioned residual vector
-			grb::RC ret { SUCCESS };
-
-			ret = ret ? ret : grb::set( Ap, 0 );
-			ret = ret ? ret : grb::set( r, 0 );
-			ret = ret ? ret : grb::set( p, 0 );
-
-			ret = ret ? ret : grb::set( p, x );
-			ret = ret ? ret : grb::mxv< grb::descriptors::dense >( Ap, A, x, ring ); // Ap = A * x
-			assert( ret == SUCCESS );
-
-			ret = ret ? ret : grb::eWiseApply( r, b, Ap, minus ); // r = b - Ap;
-			assert( ret == SUCCESS );
-
-			norm_residual = ring.template getZero< ResidualType >();
-			ret = ret ? ret : grb::dot( norm_residual, r, r, ring ); // norm_residual = r' * r;
-			assert( ret == SUCCESS );
-
-			// compute sqrt to avoid underflow
-			norm_residual = std::sqrt( norm_residual );
-
-			// initial norm of residual
-			const ResidualType norm_residual_initial { norm_residual };
-			ResidualType old_r_dot_z { 0.0 }, r_dot_z { 0.0 }, beta { 0.0 };
-			size_t iter { 0 };
-
-			grb::utils::Timer timer;
-
-#ifdef HPCG_PRINT_STEPS
-			DBG_print_norm( p, "start p" );
-			DBG_print_norm( Ap, "start Ap" );
-			DBG_print_norm( r, "start r" );
-#endif
-
-			do {
-#ifdef HPCG_PRINT_STEPS
-				DBG_println( "========= iteration " << iter << " =========" );
-#endif
-				if( with_preconditioning ) {
-					if( print_iter_stats ) {
-						timer.reset();
-					}
-					ret = ret ? ret : internal::multi_grid( data, data.coarser_level,
-						presmoother_steps, postsmoother_steps, ring, minus );
-					assert( ret == SUCCESS );
-					if( print_iter_stats ) {
-						double duration = timer.time();
-						std::cout << "iteration, pre-conditioner: " << iter << ","
-							<< duration << std::endl;
-					}
-				} else {
-					ret = ret ? ret : grb::set( z, r ); // z = r;
-					assert( ret == SUCCESS );
-				}
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( z, "initial z" );
-#endif
-
-				ResidualType pAp;
-
-				if( iter == 0 ) {
-					ret = ret ? ret : grb::set( p, z ); //  p = z;
-					assert( ret == SUCCESS );
-
-					ret = ret ? ret : grb::dot( r_dot_z, r, z, ring ); // r_dot_z = r' * z;
-					assert( ret == SUCCESS );
-				} else {
-					old_r_dot_z = r_dot_z;
-
-					r_dot_z = ring.template getZero< ResidualType >();
-					ret = ret ? ret : grb::dot( r_dot_z, r, z, ring ); // r_dot_z = r' * z;
-					assert( ret == SUCCESS );
-
-					beta = r_dot_z / old_r_dot_z;
-					ret = ret ? ret : grb::clear( Ap );                         // Ap  = 0;
-					ret = ret ? ret : grb::eWiseMulAdd( Ap, beta, p, z, ring ); // Ap += beta * p + z;
-					std::swap( Ap, p );                                         // p = Ap;
-					assert( ret == SUCCESS );
-				}
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( p, "middle p" );
-#endif
-
-				ret = ret ? ret : grb::set( Ap, 0 );
-				ret = ret ? ret : grb::mxv< grb::descriptors::dense >( Ap, A, p, ring ); // Ap = A * p;
-				assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( Ap, "middle Ap" );
-#endif
-				pAp = static_cast< ResidualType >( 0.0 );
-				ret = ret ? ret : grb::dot( pAp, Ap, p, ring ); // pAp = p' * Ap
-				assert( ret == SUCCESS );
-
-				alpha = r_dot_z / pAp;
-
-				ret = ret ? ret : grb::eWiseMul( x, alpha, p, ring ); // x += alpha * p;
-				assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( x, "end x" );
-#endif
-
-				ret = ret ? ret : grb::eWiseMul( r, -alpha, Ap, ring ); // r += - alpha * Ap;
-				assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( r, "end r" );
-#endif
-
-				norm_residual = static_cast< ResidualType >( 0.0 );
-				ret = ret ? ret : grb::dot( norm_residual, r, r, ring ); // residual = r' * r;
-				assert( ret == SUCCESS );
-
-				norm_residual = std::sqrt( norm_residual );
-
-				if( print_iter_stats ) {
-					std::cout << "iteration, residual: " << iter << "," << norm_residual << std::endl;
-				}
-
-				++iter;
-			} while( iter < max_iterations && norm_residual / norm_residual_initial > tolerance && ret == SUCCESS );
-
-			iterations = iter;
-			return ret;
+			class Ring,
+			class Minus
+		> using HPCGRunnerType = mg_cg_runner< IOType, NonzeroType, InputType, ResidualType,
+			multigrid_runner< IOType, NonzeroType, InputType,
+				red_black_smoother_runner< IOType, NonzeroType, Ring >,
+				single_point_coarsener< IOType, NonzeroType, Ring, Minus >,
+				Ring, Minus >,
+			Ring, Minus
+		>;
+
+		template<
+			typename IOType,
+			typename ResidualType,
+			typename NonzeroType,
+			typename InputType,
+			class Ring,
+			class Minus
+		> HPCGRunnerType< IOType, ResidualType, NonzeroType, InputType, Ring, Minus >
+			build_hpcg_runner( size_t smoother_steps ) {
+
+			single_point_coarsener< IOType, NonzeroType, Ring, Minus > coarsener;
+			red_black_smoother_runner< IOType, NonzeroType, Ring >
+				smoother{ smoother_steps, smoother_steps, 1UL, {}, Ring() };
+
+			multigrid_runner< IOType, NonzeroType, InputType,
+				red_black_smoother_runner< IOType, NonzeroType, Ring >,
+				single_point_coarsener< IOType, NonzeroType, Ring, Minus >,
+				Ring, Minus
+			> mg_runner( std::move( smoother ), std::move( coarsener ) );
+
+			return HPCGRunnerType< IOType, ResidualType, NonzeroType, InputType, Ring, Minus >(
+				std::move( mg_runner ) );
 		}
 
 	} // namespace algorithms
diff --git a/include/graphblas/algorithms/hpcg/hpcg_data.hpp b/include/graphblas/algorithms/hpcg/hpcg_data.hpp
deleted file mode 100644
index c53ef99e4..000000000
--- a/include/graphblas/algorithms/hpcg/hpcg_data.hpp
+++ /dev/null
@@ -1,227 +0,0 @@
-
-/*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file hpcg_data.hpp
- * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Data structures to store HPCG input/output data.
- * @date 2021-04-30
- */
-
-#ifndef _H_GRB_ALGORITHMS_HPCG_DATA
-#define _H_GRB_ALGORITHMS_HPCG_DATA
-
-#include <vector>
-#include <cstddef>
-
-#include <graphblas.hpp>
-
-
-namespace grb {
-
-	namespace algorithms {
-
-		/**
-		 * @brief basic data container for the HPCG algorithm, storing \b only the
-		 * data in common between the full CG run and the V-cycle multi-grid solver.
-		 * Additional data are stored in inheriting daata structures.
-		 *
-		 * @tparam IOType type of values of the vectors for intermediate results
-		 * @tparam NonzeroType type of the values stored inside the system matrix #A
-		 */
-		template<
-			typename IOType,
-			typename NonzeroType
-		>
-		struct system_data {
-
-			const std::size_t system_size; ///< size of the system, i.e. side of the #A
-
-			grb::Matrix< NonzeroType > A;                   ///< system matrix
-			grb::Vector< IOType > A_diagonal;               ///< vector with the diagonal of #A
-			grb::Vector< IOType > z;                        ///< multi-grid solution
-			grb::Vector< IOType > r;                        ///< residual
-			grb::Vector< IOType > smoother_temp;            ///< for smoother's intermediate results
-			std::vector< grb::Vector< bool > > color_masks; ///< for color masks
-
-			/**
-			 * @brief Constructor building all the stored vectors and matrices.
-			 *
-			 * Stored vectors and matrices are constructed according to \p sys_size but \b not initialized
-			 * to any value internally, as initialization is up to users's code.
-			 *
-			 * @param[in] sys_size the size of the underlying physical system, i.e. the size of vectors and the number
-			 * of rows and columns of the #A matrix.
-			 */
-			system_data( std::size_t sys_size ) :
-				system_size( sys_size ),
-				A( sys_size, sys_size ),
-				A_diagonal( sys_size ),
-				z( sys_size ),
-				r( sys_size ),
-				smoother_temp( sys_size ) { }
-
-			// for safety, disable copy semantics
-			system_data( const system_data & o ) = delete;
-
-			system_data & operator=( const system_data & ) = delete;
-
-			grb::RC zero_temp_vectors() {
-				grb::RC rc = grb::set( z, 0 );
-				rc = rc ? rc : grb::set( r, 0 );
-				rc = rc ? rc : grb::set( smoother_temp, 0 );
-				return rc;
-			}
-		};
-
-		/**
-		 * @brief Data container for all multi-grid inputs and outputs.
-		 *
-		 * @tparam IOType Type of values of the vectors for intermediate results
-		 * @tparam NonzeroType Type of the values stored inside the system matrix \p A
-		 *                     and the coarsening matrix #Ax_finer
-		 *
-		 * This data structure stores information for a full multi-grid V cycle, i.e.
-		 * - input and output vectors for solution, residual and temporary vectors
-		 * - coarsening information, in particular the #coarsening_matrix that
-		 *   coarsens a larger system of size #finer_size to the current system
-		 *   of size #system_size
-		 * - the next level of coarsening, pointed to by #coarser_level, possibly being \c nullptr
-		 *   if no further coarsening is desired; note that this information is automatically
-		 *   destructed on object destruction (if any)
-		 *
-		 * Vectors stored here refer to the \b coarsened system (with the exception of #Ax_finer),
-		 * thus having size #system_size; this also holds for the system matrix #A,
-		 * while #coarsening_matrix has size #system_size \f$ \times \f$ #finer_size.
-		 * Hence, the typical usage of this data structure is to coarsen \b external vectors, e.g. vectors
-		 * coming from another \code multi_grid_data<IOType, NonzeroType> \endcode object whose #system_size equals
-		 * \code this-> \endcode #fines_size, via \code this-> \endcode #coarsening_matrix and store the coarsened
-		 * vectors internally. Mimicing the recursive behavior of standard multi-grid simulations,
-		 * the information for a further coarsening is stored inside #coarser_level, so that the
-		 * hierarchy of coarsened levels is reflected inside this data structure.
-		 *
-		 * As for \ref system_data, internal vectors and matrices are initialized to the proper size,
-		 * but their values are \b not initialized.
-		 */
-		template<
-			typename IOType,
-			typename NonzeroType
-		>
-		struct multi_grid_data : public system_data< IOType, NonzeroType > {
-
-			const std::size_t finer_size; ///< ssize of the finer system to coarse from;
-			///< typically \c finer_size \code == 8 * \endcode #system_size
-
-			grb::Vector< IOType > Ax_finer; ///< finer vector for intermediate computations, of size #finer_size
-
-			grb::Matrix< NonzeroType > coarsening_matrix; ///< matrix of size #system_size \f$ \times \f$ #finer_size
-			///< to coarsen an input vector of size #finer_size into a vector of size #system_size
-
-			struct multi_grid_data< IOType, NonzeroType > * coarser_level; ///< pointer to next coarsening level, for recursive
-			                                                               ///< multi-grid V cycle implementations
-
-			/**
-			 * @brief Construct a new \c multi_grid_data_object by initializing internal data structures and setting
-			 *        #coarser_level to \c nullptr.
-			 * @param[in] coarser_size size of the current system, i.e. size \b after coarsening
-			 * @param[in] _finer_size  size of the finer system, i.e. size of external objects \b before coarsening
-			 */
-			multi_grid_data( std::size_t coarser_size, std::size_t _finer_size ) :
-				system_data< IOType, NonzeroType >( coarser_size ),
-				finer_size( _finer_size ),
-				Ax_finer( finer_size ),
-				coarsening_matrix( coarser_size, finer_size ) {
-				coarser_level = nullptr;
-			}
-
-			/**
-			 * @brief Destroys the \c multi_grid_data_object object by destroying #coarser_level.
-			 */
-			virtual ~multi_grid_data() {
-				if( coarser_level != nullptr ) {
-					delete coarser_level;
-				}
-			}
-
-			grb::RC zero_temp_vectors() {
-				grb::RC rc = this->system_data< IOType, NonzeroType >::zero_temp_vectors();
-				rc = rc ? rc : grb::set( Ax_finer, 0 );
-				return rc;
-			}
-		};
-
-		/**
-		 * @brief Data stucture to store the data for a full HPCG run: system vectors and matrix,
-		 * coarsening information and temporary vectors.
-		 *
-		 * This data structures contains all the needed vectors and matrices to solve a linear system
-		 * \f$ A x = b \f$. As for \ref system_data, internal elements are built and their sizes properly initialized
-		 * to #system_size, but internal values are \b not initialized, as they are left to user's logic.
-		 * Similarly, the coarsening information in #coarser_level is to be initialized by users by properly
-		 * building a \code multi_grid_data<IOType, NonzeroType> \endcode object and storing its pointer into
-		 * #coarser_level; on destruction, #coarser_level will also be properly destroyed without
-		 * user's intervention.
-		 *
-		 * @tparam IOType type of values of the vectors for intermediate results
-		 * @tparam NonzeroType type of the values stored inside the system matrix #A
-		 * @tparam InputType type of the values of the right-hand side vector #b
-		 */
-		template< typename IOType, typename NonzeroType, typename InputType >
-		struct hpcg_data : public system_data< IOType, NonzeroType > {
-
-			grb::Vector< InputType > b; ///< right-side vector of known values
-			grb::Vector< IOType > u;    ///< temporary vectors (typically for CG exploration directions)
-			grb::Vector< IOType > p;    ///< temporary vector (typically for x refinements coming from the multi-grid run)
-			grb::Vector< IOType > x;    // system solution being refined over the iterations: it us up to the user
-			///< to set the initial solution value
-
-			struct multi_grid_data< IOType, NonzeroType > * coarser_level; ///< information about the coarser system, for
-			                                                               ///< the multi-grid run
-
-			/**
-			 * @brief Construct a new \c hpcg_data object by building vectors and matrices and by setting
-			 * #coarser_level to \c nullptr (i.e. no coarser level is assumed).
-			 *
-			 * @param[in] sys_size the size of the simulated system, i.e. of all the internal vectors and matrices
-			 */
-			hpcg_data( std::size_t sys_size ) : system_data< IOType, NonzeroType >( sys_size ), b( sys_size ), u( sys_size ), p( sys_size ), x( sys_size ) {
-				coarser_level = nullptr;
-			}
-
-			/**
-			 * @brief Destroy the \c hpcg_data object by destroying the #coarser_level informartion, if any.
-			 */
-			virtual ~hpcg_data() {
-				if( coarser_level != nullptr ) {
-					delete coarser_level;
-				}
-			}
-
-			grb::RC zero_temp_vectors() {
-				grb::RC rc = this->system_data< IOType, NonzeroType >::zero_temp_vectors();
-				rc = rc ? rc : grb::set( u, 0 );
-				rc = rc ? rc : grb::set( p, 0 );
-				return rc;
-			}
-		};
-
-	} // namespace algorithms
-
-} // namespace grb
-
-#endif // _H_GRB_ALGORITHMS_HPCG_DATA
-
diff --git a/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp b/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
deleted file mode 100644
index 45486e99b..000000000
--- a/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
+++ /dev/null
@@ -1,310 +0,0 @@
-
-/*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file hpcg_matrix_building_utils.hpp
- * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Utilities to build the matrices for HPCG simulations in an arbitrary number of dimensions.
- * @date 2021-04-30
- */
-
-#ifndef _H_GRB_ALGORITHMS_MATRIX_BUILDING_UTILS
-#define _H_GRB_ALGORITHMS_MATRIX_BUILDING_UTILS
-
-#include <algorithm>
-#include <array>
-#include <cassert>
-#include <numeric>
-#include <stdexcept>
-#include <utility>
-#include <limits.h>
-#include <iterator>
-#include <type_traits>
-
-#include <graphblas.hpp>
-
-#include "ndim_matrix_builders.hpp"
-
-
-namespace grb {
-	namespace algorithms {
-
-		template< typename T > void partition_nonzeroes(
-				T num_nonzeroes,
-				T& first_offset,
-				T& last_offset
-		) {
-			const size_t num_procs{ spmd<>::nprocs() };
-			const T per_process{ ( num_nonzeroes + num_procs - 1 ) / num_procs }; // round up
-			first_offset = std::min( per_process * static_cast< T >( spmd<>::pid() ), num_nonzeroes );
-			last_offset = std::min( first_offset + per_process, num_nonzeroes );
-		}
-
-		template< typename IterT > void partition_iteration_range(
-			size_t num_nonzeroes,
-			IterT &begin,
-			IterT &end
-		) {
-			static_assert( std::is_base_of< std::random_access_iterator_tag,
-				typename std::iterator_traits< IterT >::iterator_category >::value,
-				"the given iterator is not a random access one" );
-			assert( num_nonzeroes == static_cast< size_t >( end - begin ) );
-			size_t first, last;
-			partition_nonzeroes( num_nonzeroes, first, last );
-			if( last < num_nonzeroes ) {
-				end = begin;
-				end += last;
-			}
-			begin += first;
-		}
-
-		/**
-		 * @brief Builds a \p DIMS -dimensional system matrix for HPCG simulation.
-		 *
-		 * This routine initializes \p M to a matrix representing a \p DIMS -dimensions system of sizes
-		 * \p sys_sizes, with an iteration halo of size \p halo_size . The matrix diagonal values are initialized
-		 * to \p diag_value while the other non-zero values are initialized to \p non_diag_value .
-		 *
-		 * @tparam DIMS system dimensions
-		 * @tparam T type of matrix values
-		 * @tparam B matrix GraphBLAS backend
-		 * @param M the matrix to be initialized; it must be already constructed
-		 * @param sys_sizes the sizes of the physical system
-		 * @param halo_size the size of the halo of point to iterate in
-		 * @param diag_value diagonal value
-		 * @param non_diag_value value outside of the diagonal
-		 * @return grb::RC the success value returned when trying to build the matrix
-		 */
-		template<
-			std::size_t DIMS,
-			typename coord_t,
-			typename T,
-			enum grb::Backend B
-		> grb::RC build_ndims_system_matrix(
-			grb::Matrix< T, B > & M,
-			const grb::algorithms::hpcg_builder< DIMS, coord_t, T > & hpcg_system,
-			T diag_value,
-			T non_diag_value
-		) {
-			if( hpcg_system.system_size() > std::numeric_limits< coord_t >::max() ) {
-				throw std::domain_error( "CoordT cannot store the matrix coordinates" );
-			}
-			/*
-			std::array< coord_t, DIMS > _sys_sizes;
-			for( size_t i = 0; i < DIMS; i++ ) _sys_sizes[i] = sys_sizes[i];
-			grb::algorithms::hpcg_builder< DIMS, coord_t, T > hpcg_system( _sys_sizes, halo_size );
-			*/
-			grb::algorithms::matrix_generator_iterator< DIMS, coord_t, T > begin(
-				hpcg_system.make_begin_iterator( diag_value, non_diag_value ) );
-			grb::algorithms::matrix_generator_iterator< DIMS, coord_t, T > end(
-				hpcg_system.make_end_iterator( diag_value, non_diag_value )
-			);
-			partition_iteration_range( hpcg_system.num_neighbors(), begin, end );
-
-			return buildMatrixUnique( M, begin, end, grb::IOMode::PARALLEL );
-		}
-
-		/**
-		 * @brief Builds a coarsener matrix for an HPCG simulation.
-		 *
-		 * It initializes \p M as a rectangular matrix, with rows corresponding to the coarser system
-		 * (of dimensions \p coarser_sizes - output) and columns corresponding to the finer system
-		 * (of dimensions \p finer_sizes - input). The resulting coarsening matrix takes in input the finer system
-		 * and coarsens it by keeping one element every \a S , where \a S is the ratio between the finer and
-		 * the coarser dimension (computed for each dimension). In this way each \p DIMS -dimensional finer element
-		 * corresponds to its bounding coarser element.
-		 *
-		 * For the coarsening to be feasible, the sizes of the finer system \b must be a multiple of those of the
-		 * coarser system. If this condition is not met, an exception is thrown.
-		 *
-		 * @tparam DIMS system dimensions
-		 * @tparam T type of matrix values
-		 * @tparam B matrix GraphBLAS backend
-		 * @param M the matrix to be initialized; it must be already constructed with proper dimensions
-		 * @param coarser_sizes sizes of the coarser system
-		 * @param finer_sizes sizes of the finer system; each one \b must be a multiple of the corresponding value
-		 *                    in \p coarser_size , otherwise an exception is thrown
-		 * @return grb::RC the success value returned when trying to build the matrix
-		 */
-		template<
-			std::size_t DIMS,
-			typename T,
-			enum grb::Backend B
-		> grb::RC build_ndims_coarsener_matrix(
-			grb::Matrix< T, B > & M,
-			const std::array< std::size_t, DIMS > & coarser_sizes,
-			const std::array< std::size_t, DIMS > & finer_sizes
-		) {
-			static_assert( DIMS > 0, "DIMS must be > 0" );
-			size_t const rows { std::accumulate( coarser_sizes.cbegin(), coarser_sizes.cend(), 1UL, std::multiplies< size_t >() ) };
-			for( std::size_t i { 0 }; i < coarser_sizes.size(); i++ ) {
-				std::size_t step = finer_sizes[ i ] / coarser_sizes[ i ];
-				if( step * coarser_sizes[ i ] != finer_sizes[ i ] ) {
-					throw std::invalid_argument( "finer sizes should be a multiple of "
-												"coarser sizes" );
-				}
-			}
-			std::size_t const cols { std::accumulate( finer_sizes.cbegin(), finer_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
-			if( grb::nrows( M ) != rows || grb::ncols( M ) != cols ) {
-				throw std::invalid_argument( "wrong matrix dimensions: matrix should "
-											"be rectangular"
-											" with rows == <product of coarser sizes> "
-											"and cols == <product of finer sizes>" );
-			}
-			using coord_t = unsigned;
-			if( rows > std::numeric_limits< coord_t >::max() ) {
-				throw std::domain_error( "CoordT cannot store the row coordinates" );
-			}
-			if( cols > std::numeric_limits< coord_t >::max() ) {
-				throw std::domain_error( "CoordT cannot store the column coordinates" );
-			}
-			std::array< coord_t, DIMS > _coarser_sizes, _finer_sizes;
-			for( size_t i = 0; i < DIMS; i++ ) {
-				_coarser_sizes[i] = coarser_sizes[i];
-				_finer_sizes[i] = finer_sizes[i];
-			}
-			grb::algorithms::hpcg_coarsener_builder< DIMS, coord_t, T > coarsener( _coarser_sizes, _finer_sizes );
-			grb::algorithms::coarsener_generator_iterator< DIMS, coord_t, T > begin( coarsener.make_begin_iterator() );
-			grb::algorithms::coarsener_generator_iterator< DIMS, coord_t, T > end(
-				coarsener.make_end_iterator()
-			);
-			partition_iteration_range( coarsener.system_size(), begin, end );
-			return buildMatrixUnique( M, begin, end, grb::IOMode::PARALLEL );
-		}
-
-		template< typename CoordT >
-		struct true_iter {
-
-			static const bool __TRUE = true;
-
-			using self_t = true_iter< CoordT >;
-			using iterator_category = std::random_access_iterator_tag;
-			using value_type = bool;
-			using pointer = const bool *;
-			using reference = const bool&;
-			using difference_type = long;
-
-			true_iter() = delete;
-
-			true_iter( CoordT first ): index( first ) {}
-
-			true_iter( const self_t & ) = default;
-
-			self_t & operator=( const self_t & ) = default;
-
-			bool operator!=( const self_t & other ) const {
-				return this->index != other.index;
-			}
-
-			self_t & operator++() noexcept {
-				(void) index++;
-				return *this;
-			}
-
-			self_t & operator+=( size_t increment ) noexcept {
-				index += increment;
-				return *this;
-			}
-
-			difference_type operator-( const self_t & other ) noexcept {
-				return static_cast< difference_type >( this->index - other.index );
-			}
-
-			pointer operator->() const {
-				return &__TRUE;
-			}
-
-			reference operator*() const {
-				return *(this->operator->());
-			}
-
-		private:
-			CoordT index;
-		};
-
-		template< typename CoordT > const bool true_iter< CoordT >::__TRUE;
-
-		/**
-		 * @brief Populates \p masks with static color mask generated for a squared matrix of size \p matrix_size .
-		 *
-		 * Colors are built in the range [0, \p colors ), with the mask for color 0 being the array
-		 * of values true in the positions \f$ [0, colors, 2*colors, ..., floor((system_size - 1)/colors) * color] \f$,
-		 * for color 1 in the positions \f$ [1, 1+colors, 1+2*colors, ..., floor((system_size - 2)/colors) * color] \f$,
-		 * etc.; the mask for color 0 is in \c masks[0], for color 1 in \c masks[1] and so on.
-		 *
-		 * The vectors stored in \p masks (assumed empty at the beginning) are built inside the function and populated
-		 * only with the \c true values, leading to sparse vectors. This saves on storage space and allows
-		 * GraphBLAS routines (like \c eWiseLambda() ) to iterate only on true values.
-		 *
-		 * @tparam B GraphBLAS backend for the vector
-		 * @param masks output vector of color masks
-		 * @param matrix_size size of the system matrix
-		 * @param colors numbers of colors masks to build; it must be < \p matrix_size
-		 * @return grb::RC the success value returned when trying to build the vector
-		 */
-		template< enum grb::Backend B >
-		grb::RC build_static_color_masks(
-			std::size_t matrix_size,
-			const std::vector< std::vector< size_t > > &per_color_rows,
-			std::vector< grb::Vector< bool, B > > & masks
-		) {
-			if( ! masks.empty() ) {
-				throw std::invalid_argument( "vector of masks is expected to be empty" );
-			}
-			for( size_t i = 0; i < per_color_rows.size(); i++ ) {
-				const std::vector< size_t > & rows = per_color_rows[ i ];
-				/*
-				{
-					std::cout << "\ncolor " << i << std::endl;
-					for( size_t row : rows ) {
-						std::cout << row << " ";
-					}
-					std::cout << std::endl;
-				}
-				*/
-				masks.emplace_back( matrix_size );
-				grb::Vector< bool > & output_mask = masks.back();
-				std::vector< size_t >::const_iterator begin = rows.cbegin();
-				std::vector< size_t >::const_iterator end = rows.cend();
-				// partition_iteration_range( rows.size(), begin, end );
-				grb::RC rc = grb::buildVectorUnique( output_mask, begin , end, true_iter< size_t >( 0 ),
-					true_iter< size_t >( std::distance( begin, end ) ), IOMode::SEQUENTIAL );
-				if( rc != SUCCESS ) {
-					std::cerr << "error while creating output mask for color " << i << ": "
-						<< toString( rc ) << std::endl;
-					return rc;
-				}
-				/*
-				{
-					std::cout << "mask color " << i << std::endl;
-					size_t count = 0;
-					for( const auto & v : output_mask ) {
-						std::cout << v.first << " ";
-						count++;
-						if( count > 20 ) break;
-					}
-					std::cout << std::endl;
-				}
-				*/
-			}
-			return grb::SUCCESS;
-		}
-
-	} // namespace algorithms
-} // namespace grb
-
-#endif // _H_GRB_ALGORITHMS_MATRIX_BUILDING_UTILS
diff --git a/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp b/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
deleted file mode 100644
index 7541a387f..000000000
--- a/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
+++ /dev/null
@@ -1,252 +0,0 @@
-
-/*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file multigrid_v_cycle.hpp
- * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief This file contains the routines for multi-grid solution refinement, including the main routine
- *        and those for coarsening and refinement of the tentative solution.
- * @date 2021-04-30
- */
-
-#ifndef _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
-#define _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
-
-#include <cassert>
-#include <vector>
-
-#include <graphblas.hpp>
-
-#include "hpcg_data.hpp"
-#include "red_black_gauss_seidel.hpp"
-
-
-namespace grb {
-	namespace algorithms {
-		/**
-		 * @brief Namespace for interfaces that should not be used outside of the algorithm namespace.
-		 */
-		namespace internal {
-
-			/**
-			 * @brief computes the coarser residual vector \p coarsening_data.r by coarsening
-			 *        \p coarsening_data.Ax_finer - \p r_fine via \p coarsening_data.coarsening_matrix.
-			 *
-			 * The coarsening information are stored inside \p coarsening_data.
-			 *
-			 * @tparam IOType type of result and intermediate vectors used during computation
-			 * @tparam NonzeroType type of matrix values
-			 * @tparam Ring the ring of algebraic operators zero-values
-			 * @tparam Minus the minus operator for subtractions
-			 *
-			 * @param[in] r_fine fine residual vector
-			 * @param[in,out] coarsening_data \ref multi_grid_data data structure storing the information for coarsening
-			 * @param[in] ring the ring to perform the operations on
-			 * @param[in] minus the \f$ - \f$ operator for vector subtractions
-			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-			 *                          unsuccessful operation otherwise
-			 */
-			template< typename IOType,
-				typename NonzeroType,
-				class Ring,
-				class Minus >
-			grb::RC compute_coarsening( const grb::Vector< IOType > & r_fine, // fine residual
-				struct multi_grid_data< IOType, NonzeroType > & coarsening_data,
-				const Ring & ring,
-				const Minus & minus ) {
-				RC ret { SUCCESS };
-				ret = ret ? ret : grb::eWiseApply( coarsening_data.Ax_finer, r_fine, coarsening_data.Ax_finer,
-									  minus ); // Ax_finer = r_fine - Ax_finer
-				assert( ret == SUCCESS );
-
-				// actual coarsening, from  ncols(*coarsening_data->A) == *coarsening_data->system_size * 8
-				// to *coarsening_data->system_size
-				ret = ret ? ret : grb::set( coarsening_data.r, 0 );
-				ret = ret ? ret : grb::mxv< grb::descriptors::dense >( coarsening_data.r, coarsening_data.coarsening_matrix, coarsening_data.Ax_finer,
-									  ring ); // r = coarsening_matrix * Ax_finer
-				return ret;
-			}
-
-			/**
-			 * @brief computes the prolongation of the coarser solution \p coarsening_data.z and stores it into
-			 * \p x_fine.
-			 *
-			 * For prolongation, this function uses the matrix \p coarsening_data.coarsening_matrix by transposing it.
-			 *
-			 * @tparam IOType type of result and intermediate vectors used during computation
-			 * @tparam NonzeroType type of matrix values
-			 * @tparam Ring the ring of algebraic operators zero-values
-			 *
-			 * @param[out] x_fine the solution vector to store the prolonged solution into
-			 * @param[in,out] coarsening_data information for coarsening
-			 * @param[in] ring the ring to perform the operations on
-			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-			 * unsuccessful operation otherwise
-			 */
-			template< typename IOType,
-				typename NonzeroType,
-				class Ring >
-			grb::RC compute_prolongation( grb::Vector< IOType > & x_fine, // fine residual
-				struct multi_grid_data< IOType, NonzeroType > & coarsening_data,
-				const Ring & ring ) {
-				RC ret { SUCCESS };
-				// actual refining, from  *coarsening_data->syztem_size == nrows(*coarsening_data->A) / 8
-				// to nrows(x_fine)
-				ret = ret ? ret : set( coarsening_data.Ax_finer, 0 );
-
-				ret = ret ? ret : grb::mxv< grb::descriptors::transpose_matrix | grb::descriptors::dense >( coarsening_data.Ax_finer, coarsening_data.coarsening_matrix, coarsening_data.z, ring );
-				assert( ret == SUCCESS );
-
-				ret = ret ? ret : grb::foldl( x_fine, coarsening_data.Ax_finer, ring.getAdditiveMonoid() ); // x_fine += Ax_finer;
-				assert( ret == SUCCESS );
-				return ret;
-			}
-
-			/**
-			 * @brief Runs \p smoother_steps iteration of the Red-Black Gauss-Seidel smoother, with inputs and outputs stored
-			 * inside \p data.
-			 *
-			 * @tparam IOType type of result and intermediate vectors used during computation
-			 * @tparam NonzeroType type of matrix values
-			 * @tparam Ring the ring of algebraic operators zero-values
-			 *
-			 * @param[in,out] data \ref system_data data structure with relevant inpus and outputs: system matrix, initial solution,
-			 *                     residual, system matrix colors, temporary vectors
-			 * @param[in] smoother_steps how many smoothing steps to run
-			 * @param[in] ring the ring to perform the operations on
-			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-			 *                          unsuccessful operation otherwise
-			 */
-			template< typename IOType, typename NonzeroType, class Ring >
-			grb::RC run_smoother( system_data< IOType, NonzeroType > & data, const std::size_t smoother_steps, const Ring & ring ) {
-				RC ret { SUCCESS };
-
-				for( std::size_t i { 0 }; i < smoother_steps && ret == SUCCESS; i++ ) {
-					ret = ret ? ret : red_black_gauss_seidel( data, ring );
-					assert( ret == SUCCESS );
-				}
-				return ret;
-			}
-
-			/**
-			 * @brief Multi-grid V cycle implementation to refine a given solution.
-			 *
-			 * A full multi-grid run goes through the following steps:
-			 * -# if \p presmoother_steps \f$ > 0 \f$, \p presmoother_steps of the Red-Black Gauss-Seidel smoother are run
-			 *    to improve on the initial solution stored into \p data.z
-			 * -# the coarsening of \f$ r - A*z \f$ is computed to find the coarser residual vector
-			 * -# a multi-grid run is recursively performed on the coarser system
-			 * -# the tentative solution from the coarser multi-grid run is prolonged and added to the current tentative solution
-			 *    into \p data.z
-			 * -# this solution is further smoothed for \p postsmoother_steps steps
-			 *
-			 * If coarsening information is not available, the multi-grid run consists in a single smmothing run.
-			 *
-			 * Failuers of GraphBLAS operations are handled by immediately stopping the execution and by returning
-			 * the failure code.
-			 *
-			 * @tparam IOType type of result and intermediate vectors used during computation
-			 * @tparam NonzeroType type of matrix values
-			 * @tparam Ring the ring of algebraic operators zero-values
-			 * @tparam Minus the minus operator for subtractions
-			 *
-			 * @param[in,out] data \ref multi_grid_data object storing the relevant data for the multi-grid run of the current
-			 *                     clevel
-			 * @param[in,out] coarsening_data pointer to information for the coarsening/refinement operations and for the
-			 *                recursive multi-grid run on the coarsened system; if \c nullptr, no coarsening/refinement occurs
-			 *                and only smoothing occurs on the current solution
-			 * @param[in] presmoother_steps number of pre-smoother steps
-			 * @param[in] postsmoother_steps number of post-smoother steps
-			 * @param[in] ring the ring to perform the operations on
-			 * @param[in] minus the \f$ - \f$ operator for vector subtractions
-			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-			 *                          unsuccessful operation otherwise
-			 */
-			template< typename IOType, typename NonzeroType, class Ring, class Minus >
-			grb::RC multi_grid( system_data< IOType, NonzeroType > & data,
-				struct multi_grid_data< IOType, NonzeroType > * const coarsening_data,
-				const size_t presmoother_steps,
-				const size_t postsmoother_steps,
-				const Ring & ring,
-				const Minus & minus ) {
-				RC ret { SUCCESS };
-#ifdef HPCG_PRINT_STEPS
-				DBG_println( "mg BEGINNING {" );
-#endif
-
-				// clean destination vector
-				ret = ret ? ret : grb::set( data.z, 0 );
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( data.r, "initial r" );
-#endif
-				if( coarsening_data == nullptr ) {
-					// compute one round of Gauss Seidel and return
-					ret = ret ? ret : run_smoother( data, 1, ring );
-					assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-					DBG_print_norm( data.z, "smoothed z" );
-					DBG_println( "} mg END" );
-#endif
-					return ret;
-				}
-
-				struct multi_grid_data< IOType, NonzeroType > & cd {
-					*coarsening_data
-				};
-
-				// pre-smoother
-				ret = ret ? ret : run_smoother( data, presmoother_steps, ring );
-				assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( data.z, "pre-smoothed z" );
-#endif
-
-				ret = ret ? ret : grb::set( cd.Ax_finer, 0 );
-				ret = ret ? ret : grb::mxv< grb::descriptors::dense >( cd.Ax_finer, data.A, data.z, ring );
-				assert( ret == SUCCESS );
-
-				ret = ret ? ret : compute_coarsening( data.r, cd, ring, minus );
-				assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( cd.r, "coarse r" );
-#endif
-
-				ret = ret ? ret : multi_grid( cd, cd.coarser_level, presmoother_steps, postsmoother_steps, ring, minus );
-				assert( ret == SUCCESS );
-
-				ret = ret ? ret : compute_prolongation( data.z, cd, ring );
-				assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( data.z, "prolonged z" );
-#endif
-
-				// post-smoother
-				ret = ret ? ret : run_smoother( data, postsmoother_steps, ring );
-				assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( data.z, "post-smoothed z" );
-				DBG_println( "} mg END" );
-#endif
-
-				return ret;
-			}
-
-		} // namespace internal
-	}     // namespace algorithms
-} // namespace grb
-
-#endif // _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
diff --git a/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp b/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp
index 35a15238d..5958ecb0d 100644
--- a/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp
+++ b/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp
@@ -47,234 +47,52 @@
 #include <cstddef>
 #include <iterator>
 
-#include <graphblas/utils/geometry/linearized_halo_ndim_system.hpp>
+#include <graphblas/utils/geometry/halo_matrix_generator_iterator.hpp>
 
-#include <graphblas/utils/geometry/linearized_ndim_system.hpp>
-#include <graphblas/utils/geometry/linearized_ndim_iterator.hpp>
-#include <graphblas/utils/geometry/array_vector_storage.hpp>
 
 
 
 namespace grb {
-
 	namespace algorithms {
 
 		template<
 			size_t DIMS,
-			typename CoordT,
-			typename T
+			typename CoordType,
+			typename ValueType
 		>
-		class hpcg_builder;
-
-		template<
-			size_t DIMS,
-			typename CoordT,
-			typename T
-		>
-		struct matrix_generator_iterator {
-
-			using RowIndexType = CoordT; ///< numeric type of rows
-			using ColumnIndexType = CoordT;
-			using ValueType = T;
-			friend hpcg_builder< DIMS, CoordT, T >;
-
-			using linear_system_t = grb::utils::geometry::linearized_halo_ndim_system< RowIndexType, DIMS >;
-			using __iter_t = typename linear_system_t::iterator;
-			using self_t = matrix_generator_iterator< DIMS, CoordT, T >;
+		class HPCGBuilder {
+		public:
 
-			struct __value {
+			struct HPCGDiagGenerator {
 
-				friend self_t;
+				ValueType _diag;
+				ValueType _non_diag;
 
-				__value(
+				HPCGDiagGenerator(
 					ValueType diag,
-					ValueType non_diag,
-					RowIndexType i,
-					ColumnIndexType j
-				) noexcept :
-					diagonal_value( diag ),
-					non_diagonal_value( non_diag ),
-					_i( i ),
-					_j( j )
-				{}
+					ValueType non_diag
+				) : _diag( diag ),
+				_non_diag( non_diag ) {}
 
-				__value( const __value & ) = default;
-
-				__value & operator=( const __value & ) = default;
+				HPCGDiagGenerator & operator=( const HPCGDiagGenerator & ) = default;
 
-				inline RowIndexType i() const { return _i; }
-				inline ColumnIndexType j() const { return _j; }
-				inline ValueType v() const {
-					return j() == i() ? diagonal_value : non_diagonal_value;
+				inline ValueType operator()( const CoordType &i, const CoordType &j ) const noexcept {
+					return j == i ? _diag: _non_diag;
 				}
-
-			private:
-				ValueType diagonal_value;     ///< value to be emitted when the object has moved to the diagonal
-				ValueType non_diagonal_value; ///< value to emit outside of the diagonal
-				RowIndexType _i;
-				ColumnIndexType _j;
 			};
 
-			// interface for std::random_access_iterator
-			using iterator_category = std::random_access_iterator_tag;
-			using value_type = __value;
-			using pointer = value_type;
-			using reference = value_type;
-			using difference_type = typename __iter_t::difference_type;
-
-			matrix_generator_iterator( const self_t & ) = default;
-
-			matrix_generator_iterator( self_t && ) = default;
-
-			self_t & operator=( const self_t & ) = default;
-
-			self_t & operator=( self_t && ) = default;
-
-			/**
-			 * @brief Increments the iterator by moving coordinates to the next (row, column) to iterate on.
-			 *
-			 * This operator internally increments the columns coordinates until wrap-around, when it increments
-			 * the row coordinates and resets the column coordinates to the first possible columns; this column coordinate
-			 * depends on the row coordinates according to the dimensions iteration order and on the parameter \p halo.
-			 *
-			 * @return matrix_generator_iterator<DIMS, T>& \c this object, with the updated state
-			 */
-			self_t & operator++() noexcept {
-				(void) ++_sys_iter;
-				update_coords();
-				return *this;
-			}
-
-			self_t & operator+=( size_t offset ) {
-				_sys_iter += offset;
-				update_coords();
-				return *this;
-			}
-
-			difference_type operator-( const self_t &other ) const {
-				return this->_sys_iter - other._sys_iter;
-			}
-
-			/**
-			 * @brief Operator to compare \c this against \p o  and return whether they differ.
-			 *
-			 * @param o object to compare \c this against
-			 * @return true of the row or the column is different between \p o and \c this
-			 * @return false if both row and column of \p o and \c this are equal
-			 */
-			bool operator!=( const self_t &o ) const {
-				return this->_sys_iter != o._sys_iter;
-			}
-
-			/**
-			 * @brief Operator to compare \c this against \p o  and return whether they are equal.
-			 *
-			 * @param o object to compare \c this against
-			 * @return true of the row or the column is different between \p o and \c this
-			 * @return false if both row and column of \p o and \c this are equal
-			 */
-			bool operator==( const self_t &o ) const {
-				return ! operator!=( o );
-			}
-
-			/**
-			 * @brief Operator returning the triple to directly access row, column and element values.
-			 *
-			 * Useful when building the matrix by copying the triple of coordinates and value,
-			 * like for the BSP1D backend.
-			 */
-			reference operator*() const {
-				return _val;
-			}
-
-			pointer operator->() const {
-				return &_val;
-			}
-
-			/**
-			 * @brief Returns current row.
-			 */
-			inline RowIndexType i() const {
-				return _val.i();
-			}
-
-			/**
-			 * @brief Returns current column.
-			 */
-			inline ColumnIndexType j() const {
-				return _val.j();
-			}
-
-			/**
-			 * @brief Returns the current matrix value.
-			 *
-			 * @return ValueType #diagonal_value if \code row == column \endcode (i.e. if \code this-> \endcode
-			 * #i() \code == \endcode \code this-> \endcode #j()), #non_diagonal_value otherwise
-			 */
-			inline ValueType v() const {
-				return _val.v();
-			}
-
-			const __iter_t & it() const {
-				return this->_sys_iter;
-			}
+			using HaloSystemType = grb::utils::geometry::LinearizedHaloNDimSystem< CoordType, DIMS >; // type of the `system` member, built from ( sizes, halo )
+			using Iterator = geometry::HaloMatrixGeneratorIterator< DIMS, CoordType, ValueType, HPCGDiagGenerator >; // NOTE(review): `geometry::` is not fully qualified, unlike the alias above -- confirm it resolves from within grb::algorithms
 
-		private:
-			value_type _val;
-			const linear_system_t *_lin_system;
-			__iter_t _sys_iter;
-
-			/**
-			 * @brief Construct a new \c matrix_generator_iterator object, setting the current row as \p row
-			 * and emitting \p diag if the iterator has moved on the diagonal, \p non_diag otherwise.
-			 *
-			 * @param sizes array with the sizes along the dimensions
-			 * @param _halo halo of points to iterate around; must be > 0
-			 * @param diag value to emit when on the diagonal
-			 * @param non_diag value to emit outside the diagonal
-			 */
-			matrix_generator_iterator(
-				const linear_system_t &system,
+			HPCGBuilder(
+				const std::array< CoordType, DIMS > &sizes,
+				CoordType _halo,
 				ValueType diag,
 				ValueType non_diag
-			) noexcept :
-				_val( diag, non_diag, 0, 0 ),
-				_lin_system( &system ),
-				_sys_iter( system.begin() )
-			{
-				update_coords();
-			}
-
-			void update_coords() {
-				_val._i = _sys_iter->get_element_linear();
-				_val._j = _sys_iter->get_neighbor_linear();
-			}
-		};
-
-
-		template<
-			size_t DIMS,
-			typename CoordT,
-			typename T
-		>
-		class hpcg_builder {
-
-			using system_t = grb::utils::geometry::linearized_halo_ndim_system< CoordT, DIMS >;
-
-			system_t system;
-			// const grb::utils::geometry::linearized_halo_ndim_system< CoordT, DIMS > system;
-			const CoordT halo;
-
-		public:
-
-			using hpcg_sys_iterator = matrix_generator_iterator< DIMS, CoordT, T >;
-
-			hpcg_builder(
-				const std::array< CoordT, DIMS > &sizes,
-				CoordT _halo
 			) :
+				halo( _halo ),
 				system( sizes, _halo ),
-				halo( _halo )
+				_diag_generator( diag, non_diag )
 			{
 				if( _halo <= 0 ) {
 					throw std::invalid_argument( "halo should be higher than 0" );
@@ -286,13 +104,14 @@ namespace grb {
 				}
 			}
 
-			hpcg_builder( const hpcg_builder< DIMS, CoordT, T> & ) = delete;
 
-			hpcg_builder( hpcg_builder< DIMS, CoordT, T> && ) = delete;
+			HPCGBuilder( const HPCGBuilder< DIMS, CoordType, ValueType > & ) = default;
+
+			HPCGBuilder( HPCGBuilder< DIMS, CoordType, ValueType > && ) = default;
 
-			hpcg_builder< DIMS, CoordT, T> & operator=( const hpcg_builder< DIMS, CoordT, T> & ) = delete;
+			HPCGBuilder< DIMS, CoordType, ValueType > & operator=( const HPCGBuilder< DIMS, CoordType, ValueType > & ) = default;
 
-			hpcg_builder< DIMS, CoordT, T> & operator=( hpcg_builder< DIMS, CoordT, T> && ) = delete;
+			HPCGBuilder< DIMS, CoordType, ValueType > & operator=( HPCGBuilder< DIMS, CoordType, ValueType > && ) = default;
 
 			size_t system_size() const {
 				return system.base_system_size();
@@ -302,27 +121,34 @@ namespace grb {
 				return system.halo_system_size();
 			}
 
-			const system_t & get_generator() const {
+			const HaloSystemType & get_generator() const {
 				return system;
 			}
 
-			hpcg_sys_iterator make_begin_iterator(
-				T diag,
-				T non_diag
-			) const {
-				return hpcg_sys_iterator( system, diag, non_diag );
+			Iterator make_begin_iterator() const { // returns an Iterator freshly constructed from the stored system and value generator
+				return Iterator( system, _diag_generator );
 			}
 
-			hpcg_sys_iterator make_end_iterator(
-				T diag,
-				T non_diag
-			) const {
-				hpcg_sys_iterator result( system, diag, non_diag );
+			Iterator make_end_iterator() const { // builds a fresh Iterator, then advances it num_neighbors() positions in total (a jump of num_neighbors() - 1, then one increment)
+				Iterator result( system, _diag_generator );
 				result += num_neighbors() - 1; // do not trigger boundary checks
 				++result;
 				return result;
 			}
 
+			ValueType get_diag_value() const { // the value HPCGDiagGenerator::operator() returns when its two coordinates are equal
+				return _diag_generator._diag;
+			}
+
+			ValueType get_non_diag_value() const { // the value HPCGDiagGenerator::operator() returns when its two coordinates differ
+				return _diag_generator._non_diag;
+			}
+
+
+		private:
+			CoordType halo; // deliberately non-const: a const data member would make this class's "= default" copy/move assignment operators implicitly deleted
+			HaloSystemType system;
+			HPCGDiagGenerator _diag_generator;
 		};
 
 
@@ -356,7 +182,7 @@ namespace grb {
 
 		template<
 			size_t DIMS,
-			typename CoordT,
+			typename CoordType,
 			typename T
 		>
 		class hpcg_coarsener_builder;
@@ -377,26 +203,26 @@ namespace grb {
 		 */
 		template<
 			size_t DIMS,
-			typename CoordT,
+			typename CoordType,
 			typename T
 		>
 		struct coarsener_generator_iterator {
 
-			friend hpcg_coarsener_builder< DIMS, CoordT, T >;
+			friend hpcg_coarsener_builder< DIMS, CoordType, T >;
 
-			using RowIndexType = CoordT; ///< numeric type of rows
-			using ColumnIndexType = CoordT;
+			using RowIndexType = CoordType; ///< numeric type of rows
+			using ColumnIndexType = CoordType;
 			using ValueType = T;
 
-			using lin_system_t = grb::utils::geometry::linearized_ndim_system< CoordT,
-				grb::utils::geometry::array_vector_storage< CoordT, DIMS > >;
-			using __iter_t = typename lin_system_t::iterator;
-			using self_t = coarsener_generator_iterator< DIMS, CoordT, T >;
-			using array_t = std::array< CoordT, DIMS >;
+			using lin_system_t = grb::utils::geometry::LinearizedNDimSystem< CoordType,
+				grb::utils::geometry::ArrayVectorStorage< CoordType, DIMS > >;
+			using __iter_t = typename lin_system_t::Iterator;
+			using SelfType = coarsener_generator_iterator< DIMS, CoordType, T >;
+			using array_t = std::array< CoordType, DIMS >;
 
 			struct __value {
 
-				friend self_t;
+				friend SelfType;
 
 				__value(
 					RowIndexType i,
@@ -428,13 +254,13 @@ namespace grb {
 			using reference = const value_type&;
 			using difference_type = typename __iter_t::difference_type;
 
-			coarsener_generator_iterator( const self_t & o ) = default;
+			coarsener_generator_iterator( const SelfType & o ) = default;
 
-			coarsener_generator_iterator( self_t && o ) = default;
+			coarsener_generator_iterator( SelfType && o ) = default;
 
-			self_t & operator=( const self_t & ) = default;
+			SelfType & operator=( const SelfType & ) = default;
 
-			self_t & operator=( self_t && ) = default;
+			SelfType & operator=( SelfType && ) = default;
 
 			/**
 			 * @brief Increments the row and the column according to the respective physical sizes,
@@ -442,33 +268,33 @@ namespace grb {
 			 *
 			 * @return \code *this \endcode, i.e. the same object with the updates row and column
 			 */
-			self_t & operator++() noexcept {
+			SelfType & operator++() noexcept {
 				(void) ++_sys_iter;
 				update_coords();
 				return *this;
 			}
 
-			self_t & operator+=( size_t offset ) {
+			SelfType & operator+=( size_t offset ) {
 				_sys_iter += offset;
 				update_coords();
 				return *this;
 			}
 
-			difference_type operator-( const self_t &o ) const {
+			difference_type operator-( const SelfType &o ) const {
 				return this->_sys_iter - o._sys_iter;
 			}
 
 			/**
 			 * @brief Returns whether \c this and \p o differ.
 			 */
-			bool operator!=( const self_t &o ) const {
+			bool operator!=( const SelfType &o ) const {
 				return this->_sys_iter != o._sys_iter;
 			}
 
 			/**
 			 * @brief Returns whether \c this and \p o are equal.
 			 */
-			bool operator==( const self_t &o ) const {
+			bool operator==( const SelfType &o ) const {
 				return ! this->operator!=( o );
 			}
 
@@ -566,14 +392,14 @@ namespace grb {
 
 		template<
 			size_t DIMS,
-			typename CoordT,
+			typename CoordType,
 			typename T
 		>
 		class hpcg_coarsener_builder {
 		public:
 
-			using array_t = std::array< CoordT, DIMS >;
-			using hpcg_coarsener_iterator = coarsener_generator_iterator< DIMS, CoordT, T >;
+			using array_t = std::array< CoordType, DIMS >;
+			using hpcg_coarsener_iterator = coarsener_generator_iterator< DIMS, CoordType, T >;
 
 			hpcg_coarsener_builder(
 				const array_t &_coarser_sizes,
@@ -592,13 +418,13 @@ namespace grb {
 				}
 			}
 
-			hpcg_coarsener_builder( const hpcg_coarsener_builder< DIMS, CoordT, T> & ) = delete;
+			hpcg_coarsener_builder( const hpcg_coarsener_builder< DIMS, CoordType, T> & ) = delete;
 
-			hpcg_coarsener_builder( hpcg_coarsener_builder< DIMS, CoordT, T> && ) = delete;
+			hpcg_coarsener_builder( hpcg_coarsener_builder< DIMS, CoordType, T> && ) = delete;
 
-			hpcg_coarsener_builder< DIMS, CoordT, T> & operator=( const hpcg_coarsener_builder< DIMS, CoordT, T> & ) = delete;
+			hpcg_coarsener_builder< DIMS, CoordType, T> & operator=( const hpcg_coarsener_builder< DIMS, CoordType, T> & ) = delete;
 
-			hpcg_coarsener_builder< DIMS, CoordT, T> & operator=( hpcg_coarsener_builder< DIMS, CoordT, T> && ) = delete;
+			hpcg_coarsener_builder< DIMS, CoordType, T> & operator=( hpcg_coarsener_builder< DIMS, CoordType, T> && ) = delete;
 
 			size_t system_size() const {
 				return system.system_size();
@@ -616,8 +442,8 @@ namespace grb {
 			}
 
 		private:
-			const grb::utils::geometry::linearized_ndim_system< CoordT,
-				grb::utils::geometry::array_vector_storage< CoordT, DIMS > > system;
+			const grb::utils::geometry::LinearizedNDimSystem< CoordType,
+				grb::utils::geometry::ArrayVectorStorage< CoordType, DIMS > > system;
 
 			array_t steps; ///< array of steps, i.e. how much each column coordinate (finer system) must be
 			//// incremented when incrementing the row coordinates; is is the ration between
diff --git a/include/graphblas/algorithms/hpcg/old_matrix_building_utils.hpp b/include/graphblas/algorithms/hpcg/old_matrix_building_utils.hpp
deleted file mode 100644
index 9bb5c7a95..000000000
--- a/include/graphblas/algorithms/hpcg/old_matrix_building_utils.hpp
+++ /dev/null
@@ -1,173 +0,0 @@
-
-/*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file hpcg_matrix_building_utils.hpp
- * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Utilities to build the matrices for HPCG simulations in an arbitrary number of dimensions.
- * @date 2021-04-30
- */
-
-#ifndef _H_GRB_ALGORITHMS_OLD_MATRIX_BUILDING_UTILS
-#define _H_GRB_ALGORITHMS_OLD_MATRIX_BUILDING_UTILS
-
-#include <algorithm>
-#include <array>
-#include <cassert>
-#include <numeric>
-#include <stdexcept>
-#include <utility>
-
-#include <graphblas.hpp>
-
-#include "old_ndim_matrix_builders.hpp"
-
-
-namespace grb {
-	namespace algorithms {
-		namespace old {
-
-
-		/**
-		 * @brief Builds a \p DIMS -dimensional system matrix for HPCG simulation.
-		 *
-		 * This routine initializes \p M to a matrix representing a \p DIMS -dimensions system of sizes
-		 * \p sys_sizes, with an iteration halo of size \p halo_size . The matrix diagonal values are initialized
-		 * to \p diag_value while the other non-zero values are initialized to \p non_diag_value .
-		 *
-		 * @tparam DIMS system dimensions
-		 * @tparam T type of matrix values
-		 * @tparam B matrix GraphBLAS backend
-		 * @param M the matrix to be initialized; it must be already constructed
-		 * @param sys_sizes the sizes of the physical system
-		 * @param halo_size the size of the halo of point to iterate in
-		 * @param diag_value diagonal value
-		 * @param non_diag_value value outside of the diagonal
-		 * @return grb::RC the success value returned when trying to build the matrix
-		 */
-		template< std::size_t DIMS, typename T, enum grb::Backend B >
-		grb::RC build_ndims_system_matrix( grb::Matrix< T, B > & M, const std::array< std::size_t, DIMS > & sys_sizes, std::size_t halo_size, T diag_value, T non_diag_value ) {
-			static_assert( DIMS > 0, "DIMS must be > 0" );
-			std::size_t n { std::accumulate( sys_sizes.cbegin(), sys_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
-			if( grb::nrows( M ) != n || grb::nrows( M ) != grb::ncols( M ) ) {
-				throw std::invalid_argument( "wrong matrix dimensions: matrix should "
-											"be square"
-											" and in accordance with given system "
-											"sizes" );
-			}
-			grb::algorithms::matrix_generator_iterator< DIMS, T > begin( sys_sizes, 0UL, halo_size, diag_value, non_diag_value );
-			grb::algorithms::matrix_generator_iterator< DIMS, T > end( sys_sizes, n, halo_size, diag_value, non_diag_value );
-			return buildMatrixUnique( M, begin, end, grb::IOMode::SEQUENTIAL );
-		}
-
-		/**
-		 * @brief Builds a coarsener matrix for an HPCG simulation.
-		 *
-		 * It initializes \p M as a rectangular matrix, with rows corresponding to the coarser system
-		 * (of dimensions \p coarser_sizes - output) and columns corresponding to the finer system
-		 * (of dimensions \p finer_sizes - input). The resulting coarsening matrix takes in input the finer system
-		 * and coarsens it by keeping one element every \a S , where \a S is the ratio between the finer and
-		 * the coarser dimension (computed for each dimension). In this way each \p DIMS -dimensional finer element
-		 * corresponds to its bounding coarser element.
-		 *
-		 * For the coarsening to be feasible, the sizes of the finer system \b must be a multiple of those of the
-		 * coarser system. If this condition is not met, an exception is thrown.
-		 *
-		 * @tparam DIMS system dimensions
-		 * @tparam T type of matrix values
-		 * @tparam B matrix GraphBLAS backend
-		 * @param M the matrix to be initialized; it must be already constructed with proper dimensions
-		 * @param coarser_sizes sizes of the coarser system
-		 * @param finer_sizes sizes of the finer system; each one \b must be a multiple of the corresponding value
-		 *                    in \p coarser_size , otherwise an exception is thrown
-		 * @return grb::RC the success value returned when trying to build the matrix
-		 */
-		template< std::size_t DIMS, typename T, enum grb::Backend B >
-		grb::RC build_ndims_coarsener_matrix( grb::Matrix< T, B > & M, const std::array< std::size_t, DIMS > & coarser_sizes, const std::array< std::size_t, DIMS > & finer_sizes ) {
-			static_assert( DIMS > 0, "DIMS must be > 0" );
-			std::size_t const rows { std::accumulate( coarser_sizes.cbegin(), coarser_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
-			for( std::size_t i { 0 }; i < coarser_sizes.size(); i++ ) {
-				std::size_t step = finer_sizes[ i ] / coarser_sizes[ i ];
-				if( step * coarser_sizes[ i ] != finer_sizes[ i ] ) {
-					throw std::invalid_argument( "finer sizes should be a multiple of "
-												"coarser sizes" );
-				}
-			}
-			std::size_t const cols { std::accumulate( finer_sizes.cbegin(), finer_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
-			if( grb::nrows( M ) != rows || grb::ncols( M ) != cols ) {
-				throw std::invalid_argument( "wrong matrix dimensions: matrix should "
-											"be rectangular"
-											" with rows == <product of coarser sizes> "
-											"and cols == <product of finer sizes>" );
-			}
-
-			grb::algorithms::coarsener_generator_iterator< DIMS, T > begin( coarser_sizes, finer_sizes, 0 );
-			grb::algorithms::coarsener_generator_iterator< DIMS, T > end( coarser_sizes, finer_sizes, rows );
-			return buildMatrixUnique( M, begin, end, grb::IOMode::SEQUENTIAL );
-		}
-
-		/**
-		 * @brief Populates \p masks with static color mask generated for a squared matrix of size \p matrix_size .
-		 *
-		 * Colors are built in the range [0, \p colors ), with the mask for color 0 being the array
-		 * of values true in the positions \f$ [0, colors, 2*colors, ..., floor((system_size - 1)/colors) * color] \f$,
-		 * for color 1 in the positions \f$ [1, 1+colors, 1+2*colors, ..., floor((system_size - 2)/colors) * color] \f$,
-		 * etc.; the mask for color 0 is in \c masks[0], for color 1 in \c masks[1] and so on.
-		 *
-		 * The vectors stored in \p masks (assumed empty at the beginning) are built inside the function and populated
-		 * only with the \c true values, leading to sparse vectors. This saves on storage space and allows
-		 * GraphBLAS routines (like \c eWiseLambda() ) to iterate only on true values.
-		 *
-		 * @tparam B GraphBLAS backend for the vector
-		 * @param masks output vector of color masks
-		 * @param matrix_size size of the system matrix
-		 * @param colors numbers of colors masks to build; it must be < \p matrix_size
-		 * @return grb::RC the success value returned when trying to build the vector
-		 */
-		template< enum grb::Backend B >
-		grb::RC build_static_color_masks( std::vector< grb::Vector< bool, B > > & masks, std::size_t matrix_size, std::size_t colors ) {
-			if( ! masks.empty() ) {
-				throw std::invalid_argument( "vector of masks is expected to be "
-											"empty" );
-			}
-			if( matrix_size < colors ) {
-				throw std::invalid_argument( "syztem size is < number of colors: too "
-											"small" );
-			}
-			grb::RC rc { grb::SUCCESS };
-			masks.reserve( colors );
-			for( std::size_t i { 0U }; i < colors; i++ ) {
-				// build in-place, assuming the compiler deduces the right constructor according to B
-				masks.emplace_back( matrix_size );
-				grb::Vector< bool > & mask = masks.back();
-				// grb::set(mask, false); // DO NOT initialize false's explicitly, otherwise
-				// RBGS will touch them too and the runtime will increase!
-				for( std::size_t j = i; j < matrix_size; j += colors ) {
-					rc = grb::setElement( mask, true, j );
-					assert( rc == grb::SUCCESS );
-					if( rc != grb::SUCCESS )
-						return rc;
-				}
-			}
-			return rc;
-		}
-
-		} //namespace old
-	} // namespace algorithms
-} // namespace grb
-
-#endif // _H_GRB_ALGORITHMS_MATRIX_BUILDING_UTILS
diff --git a/include/graphblas/algorithms/hpcg/old_ndim_matrix_builders.hpp b/include/graphblas/algorithms/hpcg/old_ndim_matrix_builders.hpp
deleted file mode 100644
index 9f64e9884..000000000
--- a/include/graphblas/algorithms/hpcg/old_ndim_matrix_builders.hpp
+++ /dev/null
@@ -1,562 +0,0 @@
-
-/*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file ndim_matrix_builders.hpp
- * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Utilities to build matrices for an HPCG simulation in a generic number of dimensions
- *
- * In particular, the main matrices are:
- * - a system matrix, generated from an N-dimenional space of coordinates by iterating along
- *   each dimension in priority order, where the first dimension has highest priority and the last
- *   dimension least priority; for each point (row), all its N-dimensional neighbours within
- *   a given distance are generated for the column
- * - a coarsening matrix, generated by iterating on a coarser system of N dimensions (row) and projecting
- *   each point to a corresponding system of finer sizes
- *
- * @date 2021-04-30
- */
-
-#ifndef _H_GRB_ALGORITHMS_OLD_NDIM_MATRIX_BUILDERS
-#define _H_GRB_ALGORITHMS_OLD_NDIM_MATRIX_BUILDERS
-
-#include <algorithm>
-#include <array>
-#include <cstddef>
-#include <initializer_list>
-#include <numeric>
-#include <stdexcept>
-#include <string>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-namespace grb {
-	namespace algorithms {
-		namespace old {
-
-		/**
-		 * @brief Base class that iterates on DIMS dimensions starting from the first one.
-		 *
-		 * The coordinates are assumed to generate the row number in a matrix whose number of rows is
-		 * the product of all sizes. This class generates row numbers for physical problems described as
-		 * systems of linear equations in an n-dimensional space.
-		 *
-		 * Example of iterations in a 3D (x, y, z) system of size (4,3,2), with generated row numbers
-		 * reported as '=> ROW':
-		 * - z[0]
-		 * - y[0]
-		 * - x[0] => 0, x[1] => 1, x[2] => 2, x[3] => 3
-		 * - y[1]
-		 * - x[0] => 4, x[1] => 5, x[2] => 6, x[3] => 7
-		 * - y[2]
-		 * - x[0] => 8, x[1] => 9, x[2] => 10, x[3] => 11
-		 * - z[1]
-		 * - y[0]
-		 * - x[0] => 12, x[1] => 13, x[2] => 14, x[3] => 15
-		 * - y[1]
-		 * - x[0] => 16, x[1] => 17, x[2] => 18, x[3] => 19
-		 * - y[2]
-		 * - x[0] => 20, x[1] => 21, x[2] => 22, x[3] => 23
-		 *
-		 * The main goal of this class is to be derived by other classes to generate matrices in an
-		 * STL-iterator-fashion; hence, this class contains all the code for basic coordinate-to-row-column
-		 * conversion in \p DIM dimensions and the basic logic to increment the row number.
-		 *
-		 * @tparam DIMS number os dimensions of the system
-		 */
-		template< std::size_t DIMS >
-		struct row_generator {
-
-			using row_coordinate_type = std::size_t; ///< numeric type of rows
-			using array_t = std::array< row_coordinate_type,
-				DIMS >; ///< type for the array storing the coordinates.
-
-			const array_t physical_sizes; ///< size of each dimension, starting from the one to be explored first
-
-			/**
-			 * @brief Construct a new row generator object
-			 * @param[in] _sizes array of sizes of each dimension; no dimension should be 0, otherwise an exception
-			 *                   is thrown
-			 * @param[in] first_row first row to iterate from; it is allowed to be beyond the matrix size, e.g. to create
-			 *                      an end iterator (no check occurs)
-			 */
-			row_generator( const array_t & _sizes, row_coordinate_type first_row ) : physical_sizes( _sizes ) {
-				static_assert( DIMS > 0, "DIMS should be higher than 0" );
-				for( const auto i : _sizes ) {
-					if( i == static_cast< row_coordinate_type >( 0U ) ) {
-						throw std::invalid_argument( "All dimension sizes must "
-													 "be > 0" );
-					}
-				}
-				row_to_coords( first_row );
-			}
-
-			row_generator( const row_generator & o ) = default;
-
-			row_generator( row_generator && o ) = default;
-
-		protected:
-			// x: row_coords[0], y: row_coords[1], z: row_coords[2], ...
-			array_t row_coords; ///< n-D coordinates from which to compute the row
-
-			/**
-			 * @brief converts a row number into a n-D coordinates according to the sizes in #physical_sizes
-			 *
-			 * In case the input is higher than the nunber of rows, the last coordinate is allowed to
-			 * go beyond its physical size. E.g., if the system has size (4,3,2) and \p rowcol is 24,
-			 * the coordinates are (0,0,3).
-			 *
-			 * @param[in] rowcol row number to convert; it can be any number
-			 */
-			void row_to_coords( row_coordinate_type rowcol ) {
-				std::size_t s = 1;
-				for( std::size_t i { 0 }; i < row_coords.size() - 1; i++ )
-					s *= physical_sizes[ i ];
-
-				for( typename array_t::size_type i { row_coords.size() - 1 }; i > 0; i-- ) {
-					row_coords[ i ] = rowcol / s;
-					rowcol -= row_coords[ i ] * s;
-					s /= physical_sizes[ i ];
-				}
-				row_coords[ 0 ] = rowcol % physical_sizes[ 0 ];
-			}
-
-			/**
-			 * @brief Pure function converting an array of coordinates into a row number, based on #physical_sizes.
-			 * @param a the #array_t array of coordinates to convert
-			 * @return #row_coordinate_type the row corresponding to the coordinates in \p a
-			 */
-			row_coordinate_type coords_to_rowcol( const array_t & a ) const {
-				row_coordinate_type row { 0 };
-				row_coordinate_type s { 1 };
-				for( typename array_t::size_type i { 0 }; i < a.size(); i++ ) {
-					row += s * a[ i ];
-					s *= physical_sizes[ i ];
-				}
-				return row;
-			}
-
-			/**
-			 * @brief Increment #row_coords in order to move to the next coordinate (according to the
-			 * n-dimensional iteration order) and update #current_row accordingly.
-			 *
-			 * To be used by derived classes in order to generate the matrix, e.g. via the \c operator()++
-			 * operator prescribed for STL-like iterators.
-			 */
-			void increment_row() {
-				bool rewind;
-				typename array_t::size_type i { 0 };
-				do {
-					typename array_t::value_type & coord = row_coords[ i ];
-					// must rewind dimension if we wrap-around
-					typename array_t::value_type new_coord = ( coord + 1 ) % physical_sizes[ i ];
-					rewind = new_coord < coord;
-					coord = new_coord;
-					++i;
-				} while( rewind && i < row_coords.size() - 1 ); // rewind only the first N-1 coordinates
-
-				// if we still have to rewind, increment the last coordinate, which is unbounded
-				if( rewind ) {
-					row_coords.back()++;
-				}
-			}
-		};
-
-		// ===============================================================
-
-		/**
-		 * @brief STL-like iterable class to generate the values for a matrix by iterating in an n-dimensional
-		 * space along the coordinates.
-		 *
-		 * For each \f$ X=(x0, x1, ...,xn) \f$ point of the underlying (n+1)-dimensional space,
-		 * this class iterates through the points of the n-dimensional halo of radius \p halo around \f$ X \f$,
-		 * generating the row number corresponding to \f$ X \f$ and the column number corresponding to
-		 * each halo point. At each coordinate \code (row, col) \endcode generated this way, the corresponding matrix value
-		 * being generated depends on whether \code row == col \endcode.
-		 *
-		 * @tparam DIMS number of dimensions of the system
-		 * @tparam HALO halo size, determining the number of points to iterate around and thus the column coordinates
-		 * @tparam T type of matrix values
-		 */
-		template< std::size_t DIMS, typename T = double >
-		struct matrix_generator_iterator : public row_generator< DIMS > {
-
-			using row_coordinate_type = typename row_generator< DIMS >::row_coordinate_type;
-			using column_coordinate_type = typename row_generator< DIMS >::row_coordinate_type;
-			using nonzero_value_type = T;
-			using array_t = typename row_generator< DIMS >::array_t;
-			using value_type = std::pair< std::pair< row_coordinate_type, column_coordinate_type >, T >;
-
-			using RowIndexType = typename row_generator< DIMS >::row_coordinate_type;
-			using ColumnIndexType = typename row_generator< DIMS >::row_coordinate_type;
-			using iterator_category = std::forward_iterator_tag;
-			using pointer = const value_type;
-			using reference = const value_type&;
-			using difference_type = long;
-
-			// halo may in future become a DIM-size array to iterate in arbitrary shapes
-			const row_coordinate_type halo;              ///< number of points per dimension to iterate around
-			const nonzero_value_type diagonal_value;     ///< value to be emitted when the object has moved to the diagonal
-			const nonzero_value_type non_diagonal_value; ///< value to emit outside of the diagonal
-
-			/**
-			 * @brief Construct a new \c matrix_generator_iterator object, setting the current row as \p row
-			 * and emitting \p diag if the iterator has moved on the diagonal, \p non_diag otherwise.
-			 *
-			 * @param sizes array with the sizes along the dimensions
-			 * @param row current row to initialize the matrix on
-			 * @param _halo halo of points to iterate around; must be > 0
-			 * @param diag value to emit when on the diagonal
-			 * @param non_diag value to emit outside the diagonal
-			 */
-			matrix_generator_iterator( const array_t & sizes, row_coordinate_type row, row_coordinate_type _halo, nonzero_value_type diag, nonzero_value_type non_diag ) :
-				row_generator< DIMS >( sizes, row ), halo( _halo ), diagonal_value( diag ), non_diagonal_value( non_diag ) {
-				if( halo <= 0 ) {
-					throw std::invalid_argument( "halo should be higher than "
-												 "0" );
-				}
-				for( const auto i : sizes ) {
-					if( i < static_cast< row_coordinate_type >( 2 * halo + 1 ) ) {
-						throw std::invalid_argument( "Iteration halo goes "
-													 "beyond system sizes" );
-					}
-				}
-				current_values.first.first = row;
-				update_column_max_values();
-				reset_all_columns();
-				current_values.first.second = this->coords_to_rowcol( col_coords );
-				current_values.second = v();
-			}
-
-			matrix_generator_iterator( const matrix_generator_iterator & o ) = default;
-
-			matrix_generator_iterator( matrix_generator_iterator && o ) = default;
-
-			/**
-			 * @brief Increments the iterator by moving coordinates to the next (row, column) to iterate on.
-			 *
-			 * This operator internally increments the columns coordinates until wrap-around, when it increments
-			 * the row coordinates and resets the column coordinates to the first possible columns; this column coordinate
-			 * depends on the row coordinates according to the dimensions iteration order and on the parameter \p halo.
-			 *
-			 * @return matrix_generator_iterator<DIMS, T>& \c this object, with the updated state
-			 */
-			matrix_generator_iterator< DIMS, T > & operator++() {
-				bool must_rewind = increment_column();
-				if( must_rewind ) {
-					this->increment_row();
-					// after changing row, we must find the first non-zero column
-					reset_all_columns();
-					current_values.first.first = this->coords_to_rowcol( this->row_coords );
-					update_column_max_values();
-				}
-				// trigger column update after row update, as a row update
-				// triggers a column update
-				current_values.first.second = this->coords_to_rowcol( col_coords );
-				current_values.second = this->v();
-				return *this;
-			}
-
-			/**
-			 * @brief Operator to compare \c this against \p o  and return whether they differ.
-			 *
-			 * @param o object to compare \c this against
-			 * @return true of the row or the column is different between \p o and \c this
-			 * @return false if both row and column of \p o and \c this are equal
-			 */
-			bool operator!=( const matrix_generator_iterator< DIMS, T > & o ) const {
-				if( o.i() != this->i() ) {
-					return true;
-				}
-				return o.j() != this->j();
-			}
-
-			/**
-			 * @brief Operator to compare \c this against \p o  and return whether they are equal.
-			 *
-			 * @param o object to compare \c this against
-			 * @return true of the row or the column is different between \p o and \c this
-			 * @return false if both row and column of \p o and \c this are equal
-			 */
-			bool operator==( const matrix_generator_iterator< DIMS, T > & o ) const {
-				return o.i() == this->i() && o.j() == this->j();
-			}
-
-			/**
-			 * @brief Operator returning the triple to directly access row, column and element values.
-			 *
-			 * Useful when building the matrix by copying the triple of coordinates and value,
-			 * like for the BSP1D backend.
-			 */
-			const value_type & operator*() const {
-				return current_values;
-			}
-
-			/**
-			 * @brief Returns current row.
-			 */
-			inline row_coordinate_type i() const {
-				return current_values.first.first;
-			}
-
-			/**
-			 * @brief Returns current column.
-			 */
-			inline column_coordinate_type j() const {
-				return current_values.first.second;
-			}
-
-			/**
-			 * @brief Returns the current matrix value.
-			 *
-			 * @return nonzero_value_type #diagonal_value if \code row == column \endcode (i.e. if \code this-> \endcode
-			 * #i() \code == \endcode \code this-> \endcode #j()), #non_diagonal_value otherwise
-			 */
-			inline nonzero_value_type v() const {
-				return j() == i() ? diagonal_value : non_diagonal_value;
-			}
-
-		private:
-			// offsets w.r.t. rows
-			array_t col_coords;        ///< coordinates corresponding to current column
-			array_t column_max_values; ///< maximum values for the column coordinates, to stop column increment
-			//// and reset the column coordinates
-			value_type current_values; ///< triple storing the current value for row, column and matrix element
-
-			/**
-			 * @brief Updates the maximum values each column coordinate can reach, according to the row coordinates.
-			 *
-			 * To be called after each row coordinates update.
-			 */
-			void update_column_max_values() {
-				for( std::size_t i { 0 }; i < column_max_values.size(); i++ ) {
-					column_max_values[ i ] = std::min( this->physical_sizes[ i ] - 1, this->row_coords[ i ] + halo );
-				}
-			}
-
-			/**
-			 * @brief Resets the value of column dimension \p dim to the first possible value.
-			 *
-			 * The final value of #col_coords[dim] depends on the current row (#row_coords) and on the \p halo
-			 * and is \f$ max(0, \f$ #row_coords \f$[dim])\f$.
-			 *
-			 * @param dim the dimension to reset
-			 */
-			void reset_column_coords( std::size_t dim ) {
-				// cannot use std::max because row_coords is unsigned and can wrap-around
-				col_coords[ dim ] = this->row_coords[ dim ] <= halo ? 0 : ( this->row_coords[ dim ] - halo );
-			}
-
-			/**
-			 * @brief resets all values in #col_coords to the initial coordinates,
-			 * iterating from on the current row.
-			 */
-			void reset_all_columns() {
-				for( std::size_t i { 0 }; i < col_coords.size(); i++ ) {
-					reset_column_coords( i );
-				}
-			}
-
-			/**
-			 * @brief Increment the column according to the iteration order, thus resetting the column coordinates
-			 * when the last possible column value for the current row has been reached.
-			 *
-			 * @return true if the column coordinates have been reset, and thus also the row must be incremented
-			 * @return false if the column coordinates
-			 */
-			bool increment_column() {
-				bool rewind;
-				typename array_t::size_type i { 0 };
-				do {
-					typename array_t::value_type & col = col_coords[ i ];
-					// must rewind dimension if the column offset is already at the max value
-					// or if the column coordinates are already at the max value
-					rewind = ( col == column_max_values[ i ] );
-					if( rewind ) {
-						// col = this->row_coords[i] == 0 ? 0 : this->row_coords[i] - (halo);
-						reset_column_coords( i );
-					} else {
-						++col;
-					}
-					++i;
-				} while( rewind && i < col_coords.size() );
-
-				// if we change z, then we also must reset x and y; if only y, we must reset x, and so on
-				return rewind;
-			}
-		};
-
-		// ===============================================================
-
-		/**
-		 * @brief Class to generate the coarsening matrix of an underlying \p DIMS -dimensional system.
-		 *
-		 * This class coarsens a finer system to a coarser system by projecting each input value (column),
-		 * espressed in finer coordinates, to an output (row) value espressed in coarser coordinates.
-		 * The coarser sizes are assumed to be row_generator#physical_sizes, while the finer sizes are here
-		 * stored inside #finer_sizes.
-		 *
-		 * The corresponding refinement matrix is obtained by transposing the coarsening matrix.
-		 *
-		 * @tparam DIMS number of dimensions of the system
-		 * @tparam T type of matrix values
-		 */
-		template< std::size_t DIMS, typename T = double >
-		struct coarsener_generator_iterator : public row_generator< DIMS > {
-
-			using row_coordinate_type = typename row_generator< DIMS >::row_coordinate_type;
-			using column_coordinate_type = typename row_generator< DIMS >::row_coordinate_type;
-			using nonzero_value_type = T;
-			using array_t = typename row_generator< DIMS >::array_t;
-			using value_type = std::pair< std::pair< row_coordinate_type, column_coordinate_type >, T >;
-
-			using RowIndexType = typename row_generator< DIMS >::row_coordinate_type;
-			using ColumnIndexType = typename row_generator< DIMS >::row_coordinate_type;
-			using iterator_category = std::forward_iterator_tag;
-			using pointer = const value_type;
-			using reference = const value_type&;
-			using difference_type = long;
-
-			// the sizes to project from
-			const array_t finer_sizes; ///< the size of the finer system (columns)
-			array_t steps;             ///< array of steps, i.e. how much each column coordinate (finer system) must be
-			//// incremented when incrementing the row coordinates; is is the ration between
-			//// #finer_sizes and row_generator#physical_sizes
-
-			/**
-			 * @brief Construct a new \c coarsener_generator_iterator object from the coarser and finer sizes,
-			 * setting its row at \p _current_row and the column at the corresponding value.
-			 *
-			 * Each finer size <b>must be an exact multiple of the corresponding coarser size</b>, otherwise the
-			 * construction will throw an exception.
-			 *
-			 * @param _coarser_sizes sizes of the coarser system (rows)
-			 * @param _finer_sizes sizes of the finer system (columns)
-			 * @param _current_row row (in the coarser system) to set the iterator on
-			 */
-			coarsener_generator_iterator( const array_t & _coarser_sizes, const array_t & _finer_sizes, row_coordinate_type _current_row ) :
-				row_generator< DIMS >( _coarser_sizes, _current_row ), finer_sizes( _finer_sizes ), steps( { 0 } ) {
-				for( std::size_t i { 0 }; i < DIMS; i++ ) {
-					// finer size MUST be an exact multiple of coarser_size
-					typename array_t::value_type step { _finer_sizes[ i ] / _coarser_sizes[ i ] };
-					if( step == 0 || finer_sizes[ i ] / step != this->physical_sizes[ i ] ) {
-						throw std::invalid_argument( std::string( "finer size "
-																  "of "
-																  "dimension"
-																  " " ) +
-							std::to_string( i ) +
-							std::string( "is not an exact multiple of coarser "
-										 "size" ) );
-					}
-					steps[ i ] = step;
-				}
-				current_values.first.first = _current_row;
-				current_values.first.second = coords_to_finer_col();
-				current_values.second = v();
-			}
-
-			coarsener_generator_iterator( const coarsener_generator_iterator & o ) = default;
-
-			coarsener_generator_iterator( coarsener_generator_iterator && o ) = default;
-
-			/**
-			 * @brief Increments the row and the column according to the respective physical sizes,
-			 * thus iterating onto the coarsening matrix coordinates.
-			 *
-			 * @return \code *this \endcode, i.e. the same object with the updates row and column
-			 */
-			coarsener_generator_iterator< DIMS, T > & operator++() {
-				this->increment_row();
-				current_values.first.first = this->coords_to_rowcol( this->row_coords );
-				current_values.first.second = coords_to_finer_col();
-				current_values.second = v();
-				return *this;
-			}
-
-			/**
-			 * @brief Returns whether \c this and \p o differ.
-			 */
-			bool operator!=( const coarsener_generator_iterator< DIMS, T > & o ) const {
-				if( this->i() != o.i() ) {
-					return true;
-				}
-				return this->j() != o.j();
-			}
-
-			/**
-			 * @brief Returns whether \c this and \p o are equal.
-			 */
-			bool operator==( const coarsener_generator_iterator< DIMS, T > & o ) const {
-				return this->i() == o.i() && this->j() == o.j();
-			}
-
-			/**
-			 * @brief Operator returning the triple to directly access row, column and element values.
-			 *
-			 * Useful when building the matrix by copying the triple of coordinates and value,
-			 * like for the BSP1D backend.
-			 */
-			const value_type & operator*() const {
-				return current_values;
-			}
-
-			/**
-			 * @brief Returns the current row, according to the coarser system.
-			 */
-			inline row_coordinate_type i() const {
-				return current_values.first.first;
-			}
-
-			/**
-			 * @brief Returns the current column, according to the finer system.
-			 */
-			inline column_coordinate_type j() const {
-				return current_values.first.second;
-			}
-
-			/**
-			 * @brief Returns always 1, as the coarsening keeps the same value.
-			 */
-			inline nonzero_value_type v() const {
-				return static_cast< nonzero_value_type >( 1 );
-			}
-
-		private:
-			value_type current_values; ///< triple storing the current value for row, column and matrix element
-
-			/**
-			 * @brief Returns the row coordinates converted to the finer system, to compute
-			 * the column value.
-			 */
-			column_coordinate_type coords_to_finer_col() const {
-				column_coordinate_type row { 0 };
-				column_coordinate_type s { 1 };
-				for( typename array_t::size_type i { 0 }; i < this->row_coords.size(); i++ ) {
-					s *= steps[ i ];
-					row += s * this->row_coords[ i ];
-					s *= this->physical_sizes[ i ];
-				}
-				return row;
-			}
-		};
-
-		} // namespace old
-	} // namespace algorithms
-} // namespace grb
-
-#endif // _H_GRB_ALGORITHMS_NDIM_MATRIX_BUILDERS
diff --git a/include/graphblas/algorithms/hpcg/system_building_utils.hpp b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
index 77bef1995..7a8db963d 100644
--- a/include/graphblas/algorithms/hpcg/system_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
@@ -22,62 +22,28 @@
  * @date 2021-04-30
  */
 
-#ifndef _H_GRB_ALGORITHMS_SYSTEM_BUILDING_UTILS
-#define _H_GRB_ALGORITHMS_SYSTEM_BUILDING_UTILS
+#ifndef _H_GRB_ALGORITHMS_HPCG_SYSTEM_BUILDING_UTILS
+#define _H_GRB_ALGORITHMS_HPCG_SYSTEM_BUILDING_UTILS
 
 #include <array>
 #include <cassert>
 #include <cstddef>
 #include <memory>
 #include <type_traits>
+#include <algorithm>
+#include <cstdlib>
+#include <stdexcept>
 
 #include <graphblas.hpp>
-#include <graphblas/utils/Timer.hpp>
+#include <graphblas/utils/iterators/partition_range.hpp>
 
-#include "hpcg_data.hpp"
-#include "matrix_building_utils.hpp"
+#include "ndim_matrix_builders.hpp"
 
 #include "coloring.hpp"
 
-#ifndef MASTER_PRINT
-#define INTERNAL_MASTER_PRINT
-#define MASTER_PRINT( pid, txt ) if( pid == 0 ) { std::cout << txt; }
-#endif
-
-
 namespace grb {
 	namespace algorithms {
 
-		/**
-		 * @brief Divide each value of \p source by \p step and store the result into \p destination.
-		 *
-		 * @tparam DIMS size of passed arrays
-		 */
-		template< std::size_t DIMS >
-		void divide_array( std::array< std::size_t, DIMS > & destination, const std::array< std::size_t, DIMS > & source, std::size_t step ) {
-			for( std::size_t i { 0 }; i < destination.size(); i++ ) {
-				destination[ i ] = source[ i ] / step;
-			}
-		}
-
-		/**
-		 * @brief Container of the parameter for HPCG simulation generation: physical system characteristics and
-		 * coarsening information.
-		 *
-		 * @tparam DIMS dimensions of the physical system
-		 * @tparam T type of matrix values
-		 */
-		template< std::size_t DIMS, typename T >
-		struct hpcg_system_params {
-			std::array< std::size_t, DIMS > physical_sys_sizes;
-			std::size_t halo_size;
-			T diag_value;
-			T non_diag_value;
-			std::size_t min_phys_size;
-			std::size_t max_levels;
-			std::size_t coarsening_step;
-		};
-
 		template< typename CoordType > void split_rows_by_color(
 			const std::vector< CoordType > & row_colors,
 			size_t num_colors,
@@ -89,190 +55,293 @@ namespace grb {
 			}
 		}
 
-		// SystemData must have a zero_temp_vectors()
-		template< std::size_t DIMS, typename IOType, typename NonzeroType, typename SystemData >
-		grb::RC build_base_system(
-			typename std::enable_if<
-				std::is_base_of< system_data< IOType, NonzeroType >, SystemData >::value,
-			SystemData& >::type system,
-			size_t system_size,
-			const std::array< std::size_t, DIMS > & physical_sys_sizes,
-			size_t halo_size,
-			NonzeroType diag_value,
-			NonzeroType non_diag_value,
-			std::array< double, 4 > & times
+		template <
+			size_t DIMS,
+			typename coord_t,
+			typename NonzeroType,
+			enum grb::Backend B
+		> grb::RC populate_system_matrix(
+			const grb::algorithms::HPCGBuilder< DIMS, coord_t, NonzeroType > &system_generator,
+			grb::Matrix< NonzeroType, B > &M
 		) {
-
-			grb::RC rc { grb::SUCCESS };
 			const size_t pid { spmd<>::pid() };
-			grb::utils::Timer timer;
-			static const char * const log_prefix = "  -- ";
 
-			using coord_t = size_t;
-			static_assert( DIMS > 0, "DIMS must be > 0" );
-			size_t n { std::accumulate( physical_sys_sizes.cbegin(), physical_sys_sizes.cend(),
-				1UL, std::multiplies< size_t >() ) };
-			if( n > std::numeric_limits< coord_t >::max() ) {
-				throw std::domain_error( "CoordT cannot store the matrix coordinates" );
+			if( pid == 0) {
+				std::cout << "- generating system matrix...";
 			}
-			std::array< coord_t, DIMS > sys_sizes;
-			for( size_t i = 0; i < DIMS; i++ ) sys_sizes[i] = physical_sys_sizes[i];
-			grb::algorithms::hpcg_builder< DIMS, coord_t, NonzeroType > system_generator( sys_sizes, halo_size );
-
-			MASTER_PRINT( pid, log_prefix << "generating system matrix..." );
-			timer.reset();
-			rc = build_ndims_system_matrix< DIMS, coord_t, NonzeroType >(
-				system.A,
-				system_generator,
-				diag_value, non_diag_value
+			typename grb::algorithms::HPCGBuilder< DIMS, coord_t, NonzeroType >::Iterator begin(
+				system_generator.make_begin_iterator() );
+			typename grb::algorithms::HPCGBuilder< DIMS, coord_t, NonzeroType >::Iterator end(
+				system_generator.make_end_iterator()
 			);
-			if( rc != grb::SUCCESS ) {
-				return rc;
+			grb::utils::partition_iteration_range_on_procs( system_generator.num_neighbors(), begin, end );
+			return buildMatrixUnique( M, begin, end, grb::IOMode::PARALLEL );
+		}
+
+
+		template<
+			typename coord_t,
+			size_t DIMS,
+			typename IOType,
+			typename NonzeroType
+		> grb::RC populate_coarsener(
+			const grb::algorithms::HPCGBuilder< DIMS, coord_t, NonzeroType > &finer_system_generator,
+			const grb::algorithms::HPCGBuilder< DIMS, coord_t, NonzeroType > &coarser_system_generator,
+			coarsening_data< IOType, NonzeroType > &coarsener
+		) {
+			static_assert( DIMS > 0, "DIMS must be > 0" );
+
+			const std::array< coord_t, DIMS > &finer_sizes = finer_system_generator.get_generator().get_sizes();
+			const std::array< coord_t, DIMS > &coarser_sizes = coarser_system_generator.get_generator().get_sizes();
+			const size_t finer_size = finer_system_generator.system_size();
+			const size_t coarser_size = coarser_system_generator.system_size();
+
+			if( coarser_size >= finer_size ) {
+				throw std::invalid_argument( "wrong sizes");
 			}
-			times[ 0 ] = timer.time();
-			MASTER_PRINT( pid, " time (ms) " << times[ 0 ] << std::endl );
 
-			// set values of vectors
-			MASTER_PRINT( pid, log_prefix << "populating vectors..." );
-			timer.reset();
-			rc = set( system.A_diagonal, diag_value );
-			if( rc != grb::SUCCESS ) {
-				return rc;
+			size_t const rows { coarser_size };
+			size_t const cols { finer_size };
+
+			assert( finer_sizes.size() == coarser_sizes.size() );
+
+			for( size_t i { 0 }; i < coarser_sizes.size(); i++ ) {
+				std::ldiv_t ratio = std::ldiv( finer_sizes[ i ], coarser_sizes[ i ] );
+				if( ratio.quot < 2 || ratio.rem != 0 ) {
+					throw std::invalid_argument( "finer sizes should be a multiple of coarser sizes" );
+				}
+			}
+			grb::Matrix< NonzeroType > &M = coarsener.coarsening_matrix;
+			if( grb::nrows( M ) != rows || grb::ncols( M ) != cols ) {
+				throw std::invalid_argument( "wrong matrix dimensions: matrix should be rectangular"
+											" with rows == <coarser size> and cols == <finer size>" );
+			}
+
+			grb::algorithms::hpcg_coarsener_builder< DIMS, coord_t, NonzeroType > coarsener_builder( coarser_sizes, finer_sizes );
+			grb::algorithms::coarsener_generator_iterator< DIMS, coord_t, NonzeroType > begin( coarsener_builder.make_begin_iterator() );
+			grb::algorithms::coarsener_generator_iterator< DIMS, coord_t, NonzeroType > end( coarsener_builder.make_end_iterator() );
+			grb::utils::partition_iteration_range_on_procs( coarsener_builder.system_size(), begin, end );
+			return buildMatrixUnique( M, begin, end, grb::IOMode::PARALLEL );
+		}
+
+		namespace internal {
+
+			template< typename CoordType > struct true_iter {
+
+				static const bool __TRUE = true;
+
+				using self_t = true_iter< CoordType >;
+				using iterator_category = std::random_access_iterator_tag;
+				using value_type = bool;
+				using pointer = const bool *;
+				using reference = const bool&;
+				using difference_type = long;
+
+				true_iter() = delete;
+
+				true_iter( CoordType first ): index( first ) {}
+
+				true_iter( const self_t & ) = default;
+
+				self_t & operator=( const self_t & ) = default;
+
+				bool operator!=( const self_t & other ) const {
+					return this->index != other.index;
+				}
+
+				self_t & operator++() noexcept {
+					(void) index++;
+					return *this;
+				}
+
+				self_t & operator+=( size_t increment ) noexcept {
+					index += increment;
+					return *this;
+				}
+
+				difference_type operator-( const self_t & other ) noexcept {
+					return static_cast< difference_type >( this->index - other.index );
+				}
+
+				pointer operator->() const {
+					return &__TRUE;
+				}
+
+				reference operator*() const {
+					return *(this->operator->());
+				}
+
+			private:
+				CoordType index;
+			};
+
+			template< typename CoordType > const bool true_iter< CoordType >::__TRUE;
+
+			/**
+			 * @brief Populates \p masks with one sparse boolean mask per color, for a system of size \p matrix_size.
+			 *
+			 * For each entry \c per_color_rows[i] a new vector of size \p matrix_size is appended to \p masks
+			 * (so the mask for color 0 is in \c masks[0], for color 1 in \c masks[1] and so on) and is built
+			 * via \c grb::buildVectorUnique() in sequential I/O mode, using the stored row indices as coordinates
+			 * and the value \c true for each of them.
+			 *
+			 * Only the \c true values are stored, leading to sparse vectors. This saves on storage space and allows
+			 * GraphBLAS routines (like \c eWiseLambda() ) to iterate only on true values. If building a mask fails,
+			 * a message is printed on \c std::cerr and the masks built so far are left inside \p masks.
+			 *
+			 * @tparam B GraphBLAS backend for the vector
+			 * @param[in] matrix_size size of the system matrix, i.e. the size of each mask
+			 * @param[in] per_color_rows for each color, the indices of the rows having that color
+			 * @param[out] masks output vector of color masks; it must be empty (else \c std::invalid_argument is thrown)
+			 * @return grb::SUCCESS if all masks were built, otherwise the first unsuccessful return value
+			 */
+			template< enum grb::Backend B >
+			grb::RC build_static_color_masks(
+				size_t matrix_size,
+				const std::vector< std::vector< size_t > > &per_color_rows,
+				std::vector< grb::Vector< bool, B > > & masks
+			) {
+				if( ! masks.empty() ) {
+					throw std::invalid_argument( "vector of masks is expected to be empty" );
+				}
+				for( size_t i = 0; i < per_color_rows.size(); i++ ) {
+					const std::vector< size_t > & rows = per_color_rows[ i ];
+					/*
+					{
+						std::cout << "\ncolor " << i << std::endl;
+						for( size_t row : rows ) {
+							std::cout << row << " ";
+						}
+						std::cout << std::endl;
+					}
+					*/
+					masks.emplace_back( matrix_size );
+					grb::Vector< bool, B > & output_mask = masks.back(); // same backend B as the elements of masks
+					std::vector< size_t >::const_iterator begin = rows.cbegin();
+					std::vector< size_t >::const_iterator end = rows.cend();
+					// partition_iteration_range( rows.size(), begin, end );
+					grb::RC rc = grb::buildVectorUnique( output_mask, begin, end, true_iter< size_t >( 0 ),
+						true_iter< size_t >( std::distance( begin, end ) ), IOMode::SEQUENTIAL );
+					if( rc != SUCCESS ) {
+						std::cerr << "error while creating output mask for color " << i << ": "
+							<< toString( rc ) << std::endl;
+						return rc;
+					}
+					/*
+					{
+						std::cout << "mask color " << i << std::endl;
+						size_t count = 0;
+						for( const auto & v : output_mask ) {
+							std::cout << v.first << " ";
+							count++;
+							if( count > 20 ) break;
+						}
+						std::cout << std::endl;
+					}
+					*/
+				}
+				return grb::SUCCESS;
 			}
-			rc = system.zero_temp_vectors();
+
+		} // namespace internal
+
+		template<
+			typename coord_t,
+			size_t DIMS,
+			typename T
+		> grb::RC populate_smoothing_data( // fills the diagonal vector and the color masks of smoothing_info
+			const grb::algorithms::HPCGBuilder< DIMS, coord_t, T > &system_generator, // provides diagonal value, coloring input and system size
+			smoother_data< T > &smoothing_info // output: its A_diagonal and color_masks are written
+		) {
+			const size_t pid { spmd<>::pid() }; // messages are printed by process 0 only
+
+			grb::RC rc = set( smoothing_info.A_diagonal, system_generator.get_diag_value() ); // same value in every entry
 			if( rc != grb::SUCCESS ) {
+				if( pid == 0 ) {
+					std::cout << "error: " << __LINE__ << std::endl;
+				}
 				return rc;
 			}
-			times[ 1 ] = timer.time();
-			MASTER_PRINT( pid, " time (ms) " << times[ 1 ] << std::endl );
 
-			MASTER_PRINT( pid, log_prefix << "running coloring heuristics..." );
-			timer.reset();
+			if( pid == 0 ) {
+				std::cout << "- running coloring heuristics...";
+			}
 			std::vector< coord_t > colors, color_counters;
 			color_matrix_greedy( system_generator.get_generator(), colors, color_counters );
 			std::vector< std::vector< coord_t > > per_color_rows;
 			split_rows_by_color( colors, color_counters.size(), per_color_rows );
 			if( rc != grb::SUCCESS ) {
+				if( pid == 0 ) { // NOTE(review): rc is not reassigned by the coloring calls, so this check looks unreachable - confirm
+					std::cout << "error: " << __LINE__ << std::endl;
+				}
 				return rc;
 			}
-			times[ 2 ] = timer.time();
-			MASTER_PRINT( pid, " found " << color_counters.size() << " colors, time (ms) "
-				<< times[ 2 ] << std::endl );
-
-
-			MASTER_PRINT( pid, log_prefix << "generating color masks..." );
-			timer.reset();
-			rc = build_static_color_masks( system_size, per_color_rows, system.color_masks );
-			if( rc != grb::SUCCESS ) {
-				return rc;
+			if( pid == 0 ) {
+				std::cout <<"- found " << color_counters.size() << " colors,"
+					<< " generating color masks...";
 			}
-			times[ 3 ] = timer.time();
-			MASTER_PRINT( pid, " time (ms) " << times[ 3 ] << std::endl );
-
-			return rc;
+			return internal::build_static_color_masks( system_generator.system_size(), // one mask per color found above
+				per_color_rows, smoothing_info.color_masks );
 		}
 
 		/**
-		 * @brief Generates an entire HPCG problem according to the parameters in \p params , storing it in \p holder .
+		 * @brief Container of the parameter for HPCG simulation generation: physical system characteristics and
+		 * coarsening information.
 		 *
-		 * @tparam DIMS dimensions of the system
+		 * @tparam DIMS dimensions of the physical system
 		 * @tparam T type of matrix values
-		 * @param holder std::unique_ptr to store the HPCG problem into
-		 * @param params parameters container to build the HPCG problem
-		 * @return grb::SUCCESS if every GraphBLAS operation (to generate vectors and matrices) succeeded,
-		 * otherwise the first unsuccessful return value
 		 */
-		template< std::size_t DIMS, typename T = double >
-		grb::RC build_hpcg_system(
-			std::unique_ptr< grb::algorithms::hpcg_data< T, T, T > > & holder,
-			const hpcg_system_params< DIMS, T > & params
-		) {
-			// n is the system matrix size
-			const std::size_t n { std::accumulate( params.physical_sys_sizes.cbegin(),
-				params.physical_sys_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
-
-			grb::algorithms::hpcg_data< T, T, T > * data { new grb::algorithms::hpcg_data< T, T, T >( n ) };
-
-			assert( ! holder ); // should be empty
-			holder = std::unique_ptr< grb::algorithms::hpcg_data< T, T, T > >( data );
+		template< size_t DIMS, typename T >
+		struct hpcg_system_params {
+			std::array< size_t, DIMS > physical_sys_sizes; ///< size of the physical system along each dimension
+			size_t halo_size; ///< halo size, forwarded to the system generators
+			T diag_value; ///< forwarded to the system generators (presumably the value on the matrix diagonal)
+			T non_diag_value; ///< forwarded to the system generators (presumably the value outside the diagonal)
+			size_t min_phys_size; ///< minimum size of the smallest dimension for a level to be generated
+			size_t max_levels; ///< maximum number of coarser levels generated after the main system
+			size_t coarsening_step; ///< divisor applied to each dimension to obtain the next coarser level
+		};
 
-			// initialize the main (=uncoarsened) system matrix
-			grb::RC rc { grb::SUCCESS };
-			const size_t pid { spmd<>::pid() };
-			grb::utils::Timer timer;
+		template<
+			size_t DIMS,
+			typename coord_t,
+			typename T
+		> void build_hpcg_multigrid_generators( // appends to mg_generators one system generator per multi-grid level
+			const hpcg_system_params< DIMS, T > &params, // sizes of the main system and coarsening limits
+			std::vector< grb::algorithms::HPCGBuilder< DIMS, coord_t, T > > &mg_generators // output: main system first, then coarser levels
+		) {
+			static_assert( DIMS > 0, "DIMS must be > 0" );
 
-			std::array< double, 4 > times;
-			MASTER_PRINT( pid, "\n-- main system" << std::endl );
-			rc = build_base_system< DIMS, T, T, grb::algorithms::hpcg_data< T, T, T > >( *data, n, params.physical_sys_sizes, params.halo_size,
-				params.diag_value, params.non_diag_value, times );
-			if( rc != grb::SUCCESS ) {
-				MASTER_PRINT( pid, " error: " << toString( rc ) );
-				return rc;
+			size_t const current_size{ std::accumulate( params.physical_sys_sizes.cbegin(), params.physical_sys_sizes.cend(), 1UL,
+				std::multiplies< size_t >() ) };
+			if( current_size > std::numeric_limits< coord_t >::max() ) { // the whole main system must be indexable via coord_t
+				throw std::domain_error( "CoordT cannot store the matrix coordinates" );
+			}
+			size_t min_physical_size { *std::min_element( params.physical_sys_sizes.cbegin(), params.physical_sys_sizes.cend() ) };
+			if( min_physical_size < params.min_phys_size ) { // not even the main system can be generated
+				throw std::domain_error( "the initial system is too small" );
 			}
-			MASTER_PRINT( pid, "-- main system generation time (ms) "
-				"(system matrix,vectors,coloring,color masks):" << times[ 0 ] << "," << times[ 1 ]
-				<< "," << times[ 2 ] << "," << times[ 3 ] << std::endl;
-			);
 
-			// initialize coarsening with additional pointers and dimensions copies to iterate and divide
-			grb::algorithms::multi_grid_data< T, T > ** coarser = &data->coarser_level;
-			assert( *coarser == nullptr );
-			std::array< std::size_t, DIMS > coarser_sizes;
-			std::array< std::size_t, DIMS > previous_sizes( params.physical_sys_sizes );
-			std::size_t min_physical_coarsened_size { *std::min_element( previous_sizes.cbegin(), previous_sizes.cend() ) / params.coarsening_step };
-			// coarsen system sizes into coarser_sizes
-			divide_array( coarser_sizes, previous_sizes, params.coarsening_step );
-			std::size_t coarsening_level = 0UL;
+			std::array< coord_t, DIMS > coord_sizes;
+			// type-translate coordinates
+			std::copy( params.physical_sys_sizes.cbegin(), params.physical_sys_sizes.cend(), coord_sizes.begin() );
 
 			// generate linked list of hierarchical coarseners
-			while( min_physical_coarsened_size >= params.min_phys_size && coarsening_level < params.max_levels ) {
-				assert( *coarser == nullptr );
-				// compute size of finer and coarser matrices
-				std::size_t coarser_size { std::accumulate( coarser_sizes.cbegin(), coarser_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
-				std::size_t previous_size { std::accumulate( previous_sizes.cbegin(), previous_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
-				// build data structures for new level
-				grb::algorithms::multi_grid_data< T, T > * new_coarser { new grb::algorithms::multi_grid_data< double, double >( coarser_size, previous_size ) };
-				// install coarser level immediately to cleanup in case of build error
-				*coarser = new_coarser;
-
-				MASTER_PRINT( pid, "-- level " << coarsening_level << "\n  -- generating coarsening matrix...\n" );
-				timer.reset();
-				// initialize coarsener matrix, system matrix and diagonal vector for the coarser level
-				rc = build_ndims_coarsener_matrix< DIMS >( new_coarser->coarsening_matrix, coarser_sizes, previous_sizes );
-				if( rc != grb::SUCCESS ) {
-					MASTER_PRINT( pid, " error: " << toString( rc ) );
-					return rc;
-				}
-				double coarsener_gen_time{ timer.time() };
+			for( size_t coarsening_level = 0UL; // level 0 is the main system, followed by at most max_levels coarser levels
+				min_physical_size >= params.min_phys_size && coarsening_level <= params.max_levels;
+				coarsening_level++ ) {
 
-				rc = build_base_system< DIMS, T, T, grb::algorithms::multi_grid_data< T, T > >( *new_coarser, coarser_size, coarser_sizes, params.halo_size,
-					params.diag_value, params.non_diag_value, times );
-				if( rc != grb::SUCCESS ) {
-					MASTER_PRINT( pid, " error: " << toString( rc ) );
-					return rc;
-				}
-				MASTER_PRINT( pid, "-- level generation time (ms) "
-					"(level,coarsening matrix,system matrix,vectors,coloring,color masks):"
-					<< coarsening_level << "," << coarsener_gen_time << "," << times[ 0 ] << "," << times[ 1 ]
-					<< "," << times[ 2 ] << "," << times[ 3 ] << std::endl;
-				);
+				// build generator
+				mg_generators.emplace_back( coord_sizes, params.halo_size, params.diag_value, params.non_diag_value );
 
 				// prepare for new iteration
-				coarser = &new_coarser->coarser_level;
-				min_physical_coarsened_size /= params.coarsening_step;
-				previous_sizes = coarser_sizes;
-				divide_array( coarser_sizes, coarser_sizes, params.coarsening_step );
-				coarsening_level++;
+				min_physical_size /= params.coarsening_step; // NOTE(review): no guard against coarsening_step == 0 - confirm callers validate it
+				std::for_each( coord_sizes.begin(), coord_sizes.end(),
+					[ &params ]( coord_t &v ){ v /= params.coarsening_step; }); // same (integer) division on every dimension
 			}
-			return rc;
 		}
 
 	} // namespace algorithms
 } // namespace grb
 
-#ifdef INTERNAL_MASTER_PRINT
-#undef INTERNAL_MASTER_PRINT
-#undef MASTER_PRINT
-#endif
-
-#endif // _H_GRB_ALGORITHMS_SYSTEM_BUILDING_UTILS
+#endif // _H_GRB_ALGORITHMS_HPCG_SYSTEM_BUILDING_UTILS
diff --git a/include/graphblas/algorithms/multigrid/coarsener.hpp b/include/graphblas/algorithms/multigrid/coarsener.hpp
new file mode 100644
index 000000000..47116c22a
--- /dev/null
+++ b/include/graphblas/algorithms/multigrid/coarsener.hpp
@@ -0,0 +1,197 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file coarsener.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * @brief Implementation of the coarsener of HPCG
+ * @date 2022-11-08
+ */
+
+#ifndef _H_GRB_ALGORITHMS_HPCG_COARSENER
+#define _H_GRB_ALGORITHMS_HPCG_COARSENER
+
+#include <vector>
+#include <memory>
+
+#include <graphblas.hpp>
+
+#include "multigrid_data.hpp"
+
+namespace grb {
+	namespace algorithms {
+
+		template<
+			typename IOType,
+			typename NonzeroType
+		>
+		struct coarsening_data { // data to move between a finer system and the next coarser one
+
+			grb::Matrix< NonzeroType > coarsening_matrix; ///< matrix of size coarser_size \f$ \times \f$ finer_size (see constructor),
+			///< to coarsen an input vector of the finer size into a vector of the coarser size
+			grb::Vector< IOType > Ax_finer; ///< vector for intermediate computations, of the finer size
+
+			/**
+			 * @brief Construct a new \c coarsening_data by initializing internal data structures
+			 * @param[in] _finer_size  size of the finer system, i.e. size of external objects \b before coarsening
+			 * @param[in] coarser_size size of the coarser system, i.e. size \b after coarsening
+			 */
+			coarsening_data( size_t _finer_size, size_t coarser_size ) :
+				coarsening_matrix( coarser_size, _finer_size ),
+				Ax_finer( _finer_size ) {}
+
+			grb::RC zero_temp_vectors() { // sets every entry of the temporary vector Ax_finer to 0
+				return grb::set( Ax_finer, 0 );
+			}
+		};
+
+		namespace internal {
+
+			/**
+			 * @brief computes the coarser residual vector \p r_coarse by coarsening
+			 *        \p r_fine - \p coarsening_data.Ax_finer via \p coarsening_data.coarsening_matrix.
+			 *
+			 * The difference overwrites \p coarsening_data.Ax_finer; \p r_coarse is set to 0 before the product.
+			 *
+			 * @tparam IOType type of result and intermediate vectors used during computation
+			 * @tparam NonzeroType type of matrix values
+			 * @tparam Ring the ring of algebraic operators zero-values
+			 * @tparam Minus the minus operator for subtractions
+			 *
+			 * @param[in] r_fine fine residual vector
+			 * @param[out] r_coarse coarse residual vector, storing the result of the coarsening
+			 * @param[in,out] coarsening_data \ref coarsening_data data structure storing the information for coarsening
+			 * @param[in] ring the ring to perform the operations on
+			 * @param[in] minus the \f$ - \f$ operator for vector subtractions
+			 * @return grb::RC::SUCCESS on success, otherwise the error code of the first unsuccessful operation
+			 */
+			template<
+				typename IOType,
+				typename NonzeroType,
+				class Ring,
+				class Minus
+			> grb::RC compute_coarsening(
+				const grb::Vector< IOType > & r_fine, // fine residual
+				grb::Vector< IOType > & r_coarse, // coarse residual (output)
+				coarsening_data< IOType, NonzeroType > & coarsening_data,
+				const Ring & ring,
+				const Minus & minus
+			) {
+				RC ret { SUCCESS };
+				// DBG_print_norm( coarsening_data.Ax_finer, "+++ Ax_finer before" );
+				ret = ret ? ret : grb::eWiseApply( coarsening_data.Ax_finer, r_fine, coarsening_data.Ax_finer,
+									  minus ); // Ax_finer = r_fine - Ax_finer
+				// DBG_print_norm( coarsening_data.Ax_finer, "+++ Ax_finer after" );
+				assert( ret == SUCCESS );
+
+				// actual coarsening: the finer-sized Ax_finer is multiplied by the coarsening matrix
+				// and the product is stored into r_coarse, which is zeroed first
+				ret = ret ? ret : grb::set( r_coarse, 0 );
+				ret = ret ? ret : grb::mxv< grb::descriptors::dense >( r_coarse, coarsening_data.coarsening_matrix,
+					coarsening_data.Ax_finer, ring ); // r_coarse = coarsening_matrix * Ax_finer
+				// DBG_print_norm( r_coarse, "+++ r_coarse" );
+				return ret;
+			}
+
+			/**
+			 * @brief computes the prolongation of the coarser solution \p z_coarse and adds it to
+			 * \p x_fine; \p coarsening_data.Ax_finer is overwritten with the prolonged vector.
+			 *
+			 * For prolongation, this function uses the matrix \p coarsening_data.coarsening_matrix by transposing it.
+			 *
+			 * @tparam IOType type of result and intermediate vectors used during computation
+			 * @tparam NonzeroType type of matrix values
+			 * @tparam Ring the ring of algebraic operators zero-values
+			 *
+			 * @param[in] z_coarse the coarser solution to prolong
+			 * @param[in,out] x_fine the solution vector the prolonged solution is added to
+			 * @param[in,out] coarsening_data information for coarsening
+			 * @param[in] ring the ring to perform the operations on
+			 * @return grb::RC::SUCCESS on success, otherwise the error code of the first unsuccessful operation
+			 */
+			template<
+				typename IOType,
+				typename NonzeroType,
+				class Ring
+			> grb::RC compute_prolongation(
+				const grb::Vector< IOType > & z_coarse,
+				grb::Vector< IOType > & x_fine, // fine solution, updated in place
+				grb::algorithms::coarsening_data< IOType, NonzeroType > & coarsening_data,
+				const Ring & ring
+			) {
+				RC ret { SUCCESS };
+				// actual refining: z_coarse is multiplied by the transposed coarsening matrix
+				// and the product is stored into Ax_finer, which is zeroed first
+				ret = ret ? ret : set( coarsening_data.Ax_finer, 0 );
+
+				ret = ret ? ret : grb::mxv< grb::descriptors::transpose_matrix | grb::descriptors::dense >(
+					coarsening_data.Ax_finer, coarsening_data.coarsening_matrix, z_coarse, ring );
+				assert( ret == SUCCESS );
+
+				ret = ret ? ret : grb::foldl( x_fine, coarsening_data.Ax_finer, ring.getAdditiveMonoid() ); // x_fine += Ax_finer;
+				assert( ret == SUCCESS );
+				return ret;
+			}
+
+		} // namespace internal
+
+		template<
+			typename IOType,
+			typename NonzeroType,
+			class Ring,
+			class Minus
+		> struct single_point_coarsener {
+			// Coarsens residuals and prolongs solutions between consecutive multi-grid levels.
+			static_assert( std::is_default_constructible< Ring >::value,
+				"cannot construct the Ring with default values" );
+			static_assert( std::is_default_constructible< Minus >::value,
+				"cannot construct the Minus operator with default values" );
+
+			using MultiGridInputType = multigrid_data< IOType, NonzeroType >;
+
+			// to be populated by the user: the entry at index L is used between level L (finer) and the next coarser one
+			std::vector< std::unique_ptr< grb::algorithms::coarsening_data< IOType, NonzeroType > > > coarsener_levels;
+			Ring ring;
+			Minus minus;
+
+			// Coarsens the residual of the finer level into coarser.r: first computes Ax_finer = finer.A * finer.z
+			// inside the coarsening data of level finer.level, then coarsens finer.r - Ax_finer;
+			// returns the error code of the first unsuccessful operation, if any.
+			inline grb::RC coarsen_residual(
+				const MultiGridInputType &finer,
+				MultiGridInputType &coarser
+			) {
+				// first compute the residual
+				coarsening_data< IOType, NonzeroType > &coarsener = *coarsener_levels[ finer.level ];
+				grb::RC ret = grb::set( coarsener.Ax_finer, 0 );
+				ret = ret ? ret : grb::mxv< grb::descriptors::dense >( coarsener.Ax_finer, finer.A, finer.z, ring );
+				// a failure above must be reported to the caller rather than coarsening an invalid Ax_finer
+				return ret ? ret : internal::compute_coarsening( finer.r, coarser.r, coarsener, ring, minus );
+			}
+
+			inline grb::RC prolong_solution( // adds to finer.z the prolongation of coarser.z
+				const MultiGridInputType &coarser,
+				MultiGridInputType &finer
+			) {
+				return internal::compute_prolongation( coarser.z, finer.z, *coarsener_levels[ finer.level ], ring );
+			}
+		};
+
+	} // namespace algorithms
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_HPCG_COARSENER
diff --git a/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp b/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp
new file mode 100644
index 000000000..714555426
--- /dev/null
+++ b/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp
@@ -0,0 +1,56 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vector>
+#include <memory>
+#include <cstddef>
+
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_BUILDING_UTILS
+#define _H_GRB_ALGORITHMS_MULTIGRID_BUILDING_UTILS
+
+namespace grb {
+	namespace algorithms {
+
+		template<
+			typename MGInfoType,
+			typename CoarsenerInfoType,
+			typename SmootherInfoType
+		> void allocate_multigrid_data(
+			const std::vector< size_t > &mg_sizes,
+			std::vector< std::unique_ptr< MGInfoType > > &system_levels,
+			std::vector< std::unique_ptr< CoarsenerInfoType > > &coarsener_levels,
+			std::vector< std::unique_ptr< SmootherInfoType > > &smoother_levels
+		) {
+			// Appends one system and one smoother object per entry of mg_sizes (entry 0 being the main system)
+			// and one coarsener object per pair of consecutive sizes; throws std::invalid_argument if mg_sizes is empty.
+			if( mg_sizes.empty() ) {
+				throw std::invalid_argument( "at least one size should be available" );
+			}
+			// each object is owned by a unique_ptr before insertion, so that it cannot leak if emplace_back() throws
+			system_levels.emplace_back( std::unique_ptr< MGInfoType >( new MGInfoType( 0, mg_sizes[ 0 ] ) ) ); // create main system
+			smoother_levels.emplace_back( std::unique_ptr< SmootherInfoType >( new SmootherInfoType( mg_sizes[ 0 ] ) ) ); // create smoother for main
+			for( size_t i = 1; i < mg_sizes.size(); i++ ) {
+				coarsener_levels.emplace_back( std::unique_ptr< CoarsenerInfoType >( new CoarsenerInfoType( mg_sizes[ i - 1 ], mg_sizes[ i ] ) ) );
+				system_levels.emplace_back( std::unique_ptr< MGInfoType >( new MGInfoType( i, mg_sizes[ i ] ) ) );
+				smoother_levels.emplace_back( std::unique_ptr< SmootherInfoType >( new SmootherInfoType( mg_sizes[ i ] ) ) );
+			}
+		}
+
+	} // namespace algorithms
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_BUILDING_UTILS
diff --git a/include/graphblas/algorithms/multigrid/multigrid_cg.hpp b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
new file mode 100644
index 000000000..2ac3c0770
--- /dev/null
+++ b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
@@ -0,0 +1,360 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file multigrid_cg.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * @brief File with the main routine to run a full HPCG simulation, comprising multi-grid runs
+ *        with Red-Black Gauss-Seidel smoothing.
+ * @date 2021-04-30
+ */
+
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_CG
+#define _H_GRB_ALGORITHMS_MULTIGRID_CG
+
+#include <type_traits>
+#include <utility>
+
+#include <graphblas.hpp>
+
+#include "multigrid_data.hpp"
+
+#include <graphblas/utils/Timer.hpp>
+
+
+namespace grb {
+	namespace algorithms {
+
+		/**
+		 * @brief Data structure to store the vectors of a Conjugate Gradient run: right-hand side, solution
+		 * and temporary vectors.
+		 *
+		 * This data structure contains the vectors #b, #u, #p and #x used to solve a linear system
+		 * \f$ A x = b \f$; the system matrix is \b not stored here.
+		 * All vectors are built by the constructor with the same size; the constructor only passes
+		 * this size to each vector and does not assign any value.
+		 * Populating #b and #x is left to the user's logic, while the temporary vectors #u and #p
+		 * can be set to 0 via #zero_temp_vectors().
+		 * No pointer or other resource is managed directly by this structure.
+		 *
+		 * @tparam IOType type of values of the vectors for intermediate results
+		 * @tparam NonzeroType type of the values of the system matrix (not used by any member of this structure)
+		 * @tparam InputType type of the values of the right-hand side vector #b
+		 */
+		template<
+			typename IOType,
+			typename NonzeroType,
+			typename InputType
+		> struct mg_cg_data {
+
+			grb::Vector< InputType > b; ///< right-side vector of known values
+			grb::Vector< IOType > u;    ///< temporary vector (mg_cg() stores here the product of the system matrix by #p)
+			grb::Vector< IOType > p;    ///< temporary vector (mg_cg() uses it as the CG direction vector)
+			grb::Vector< IOType > x;    ///< system solution being refined over the iterations: it is up to the user
+			///< to set the initial solution value
+
+
+			/**
+			 * @brief Construct a new \c mg_cg_data object by building all the internal vectors
+			 * with the same size.
+			 *
+			 * @param[in] sys_size the size of the simulated system, i.e. of all the internal vectors
+			 */
+			mg_cg_data( size_t sys_size ) :
+				b( sys_size ),
+				u( sys_size ),
+				p( sys_size ),
+				x( sys_size ) {}
+
+			grb::RC zero_temp_vectors() { // sets u and p to 0, returning the first error (if any)
+				grb::RC rc = grb::set( u, 0 );
+				rc = rc ? rc : grb::set( p, 0 );
+				return rc;
+			}
+		};
+
+		template <
+			typename IOType,
+			typename ResidualType,
+			class Ring = Semiring< grb::operators::add< IOType >, grb::operators::mul< IOType >, grb::identities::zero, grb::identities::one >,
+			class Minus = operators::subtract< IOType >
+		>
+		struct cg_options { // options controlling a run of mg_cg()
+			bool with_preconditioning; ///< if true, mg_cg() invokes the multi-grid runner at each iteration
+			size_t max_iterations; ///< maximum number of iterations mg_cg() performs
+			ResidualType tolerance; ///< mg_cg() iterates while (residual norm / initial residual norm) is above this value
+			bool print_iter_stats; ///< if true, mg_cg() prints per-iteration statistics on std::cout
+			Ring ring; ///< the ring to perform the operations on
+			Minus minus; ///< the operator for vector subtractions
+		};
+
+
+		template < typename ResidualType > struct cg_out_data { // outputs written by mg_cg()
+			size_t iterations; ///< number of iterations performed
+			ResidualType norm_residual; ///< norm of the residual after the last iteration performed
+		};
+
+		/**
+		 * @brief Conjugate Gradient algorithm with optional multi-grid preconditioning, running entirely on GraphBLAS.
+		 *
+		 * Finds the solution x of an \f$ A x = b \f$ algebraic system by running a preconditioned CG algorithm.
+		 * The implementation here closely follows the reference HPCG benchmark used for the HPCG500 rank,
+		 * visible at https://github.com/hpcg-benchmark/hpcg.
+		 * In particular, this implementation (as the standard one) couples a standard CG algorithm with a
+		 * preconditioner, here the callable \p multigrid_runner: if \p cg_opts.with_preconditioning is \c true,
+		 * \p multigrid_runner is invoked on \p grid_base at the beginning of each iteration; otherwise,
+		 * the residual \p grid_base.r is copied into \p grid_base.z and only the CG algorithm is run.
+		 *
+		 * At least one iteration is always performed. The iterations stop when \p cg_opts.max_iterations
+		 * is reached, when the ratio between the current and the initial norm of the residual is not above
+		 * \p cg_opts.tolerance, or when a GraphBLAS operation is unsuccessful.
+		 *
+		 * This implementation assumes that the following inputs are correctly initialized and populated
+		 * with the proper values:
+		 * - \p data.x with the initial tentative solution (iterative solutions are also stored here)
+		 * - \p grid_base.A with the system matrix
+		 * - \p data.b with the right-hand side vector \f$ b \f$
+		 * - whatever \p multigrid_runner reads from \p grid_base, if preconditioning is enabled
+		 * The vectors \p grid_base.r, \p data.p and \p data.u are assumed to be initialized (via the usual
+		 * grb::Vector#Vector(size_t) constructor) but not necessarily populated with values, as they are
+		 * internally populated when needed; hence, any previous values are overwritten. The same holds for
+		 * \p grid_base.z without preconditioning; with it, \p multigrid_runner is presumably in charge of
+		 * storing the preconditioned residual into \p grid_base.z (to be confirmed against the runner).
+		 *
+		 * Failures of GraphBLAS operations are handled by skipping all the following GraphBLAS operations
+		 * and by returning the failure code.
+		 *
+		 * If \p cg_opts.print_iter_stats is \c true, the time of each preconditioner run and the norm of the
+		 * residual of each iteration are printed on \c std::cout.
+		 *
+		 * @tparam IOType type of result and intermediate vectors used during computation
+		 * @tparam ResidualType type of the residual norm
+		 * @tparam NonzeroType type of matrix values
+		 * @tparam InputType type of values of the right-hand side vector b
+		 * @tparam MultiGridrunnerType type of the preconditioner, callable on \p grid_base and returning a grb::RC
+		 * @tparam Ring the ring of algebraic operators zero-values, as stored inside \p cg_opts
+		 * @tparam Minus the minus operator for subtractions, as stored inside \p cg_opts
+		 *
+		 * @param[in,out] grid_base system matrix (input), residual and preconditioned residual (temporaries)
+		 * @param[in,out] data right-hand side (input), solution (input and output) and temporary vectors
+		 * @param[in] cg_opts options of the run: preconditioning, maximum number of iterations, tolerance
+		 *                    over the ratio of the residual norms, printing of per-iteration statistics
+		 *                    on standard output, ring to perform the operations on and
+		 *                    \f$ - \f$ operator for vector subtractions
+		 * @param[in,out] multigrid_runner the preconditioner, invoked once per iteration if
+		 *                                 \p cg_opts.with_preconditioning is \c true
+		 * @param[out] out_data number of iterations performed and norm of the final residual
+		 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
+		 *                          unsuccessful operation otherwise
+		 */
+		template<
+			typename IOType,
+			typename ResidualType,
+			typename NonzeroType,
+			typename InputType,
+			typename MultiGridrunnerType,
+			class Ring = Semiring< grb::operators::add< IOType >, grb::operators::mul< IOType >, grb::identities::zero, grb::identities::one >,
+			class Minus = operators::subtract< IOType >
+		> grb::RC mg_cg(
+			multigrid_data< IOType, NonzeroType > &grid_base,
+			mg_cg_data< IOType, NonzeroType, InputType > &data,
+			const cg_options< IOType, ResidualType, Ring, Minus > &cg_opts, // Ring and Minus follow the options' type
+			MultiGridrunnerType &multigrid_runner,
+			cg_out_data< ResidualType > &out_data
+		) {
+			ResidualType alpha;
+
+			const grb::Matrix< NonzeroType > &A { grid_base.A };
+			grb::Vector< IOType > &r { grid_base.r };  // residual vector
+			grb::Vector< IOType > &z { grid_base.z };  // pre-conditioned residual vector
+			grb::Vector< IOType > &x { data.x };
+			const grb::Vector< InputType > &b { data.b };
+			grb::Vector< IOType > &p { data.p };  // direction vector
+			grb::Vector< IOType > &Ap { data.u }; // temp vector
+			grb::RC ret { SUCCESS };
+
+			ret = ret ? ret : grb::set( Ap, 0 );
+			ret = ret ? ret : grb::set( r, 0 );
+			ret = ret ? ret : grb::set( p, 0 );
+
+			ret = ret ? ret : grb::set( p, x );
+			ret = ret ? ret : grb::mxv< grb::descriptors::dense >( Ap, A, x, cg_opts.ring ); // Ap = A * x
+			assert( ret == SUCCESS );
+
+			ret = ret ? ret : grb::eWiseApply( r, b, Ap, cg_opts.minus ); // r = b - Ap;
+			assert( ret == SUCCESS );
+
+			ResidualType norm_residual = cg_opts.ring.template getZero< ResidualType >();
+			ret = ret ? ret : grb::dot( norm_residual, r, r, cg_opts.ring ); // norm_residual = r' * r;
+			assert( ret == SUCCESS );
+
+			// compute sqrt to avoid underflow
+			norm_residual = std::sqrt( norm_residual );
+
+			// initial norm of residual
+			out_data.norm_residual = norm_residual;
+			const ResidualType norm_residual_initial { norm_residual };
+			ResidualType old_r_dot_z { 0.0 }, r_dot_z { 0.0 }, beta { 0.0 };
+			size_t iter { 0 };
+
+			grb::utils::Timer timer;
+
+#ifdef HPCG_PRINT_STEPS
+			DBG_print_norm( p, "start p" );
+			DBG_print_norm( Ap, "start Ap" );
+			DBG_print_norm( r, "start r" );
+#endif
+
+			do {
+#ifdef HPCG_PRINT_STEPS
+				DBG_println( "========= iteration " << iter << " =========" );
+#endif
+				if( cg_opts.with_preconditioning ) {
+					if( cg_opts.print_iter_stats ) {
+						timer.reset();
+					}
+					ret = ret ? ret : multigrid_runner( grid_base );
+					assert( ret == SUCCESS );
+					if( cg_opts.print_iter_stats ) {
+						double duration = timer.time();
+						std::cout << "iteration, pre-conditioner: " << iter << ","
+							<< duration << std::endl;
+					}
+				} else {
+					ret = ret ? ret : grb::set( z, r ); // z = r;
+					assert( ret == SUCCESS );
+				}
+#ifdef HPCG_PRINT_STEPS
+				DBG_print_norm( z, "initial z" );
+#endif
+
+				ResidualType pAp;
+
+				if( iter == 0 ) {
+					ret = ret ? ret : grb::set( p, z ); //  p = z;
+					assert( ret == SUCCESS );
+
+					ret = ret ? ret : grb::dot( r_dot_z, r, z, cg_opts.ring ); // r_dot_z = r' * z;
+					assert( ret == SUCCESS );
+				} else {
+					old_r_dot_z = r_dot_z;
+
+					r_dot_z = cg_opts.ring.template getZero< ResidualType >();
+					ret = ret ? ret : grb::dot( r_dot_z, r, z, cg_opts.ring ); // r_dot_z = r' * z;
+					assert( ret == SUCCESS );
+
+					beta = r_dot_z / old_r_dot_z;
+					ret = ret ? ret : grb::clear( Ap );                         // Ap  = 0;
+					ret = ret ? ret : grb::eWiseMulAdd( Ap, beta, p, z, cg_opts.ring ); // Ap += beta * p + z;
+					std::swap( Ap, p );                                         // p = Ap;
+					assert( ret == SUCCESS );
+				}
+#ifdef HPCG_PRINT_STEPS
+				DBG_print_norm( p, "middle p" );
+#endif
+
+				ret = ret ? ret : grb::set( Ap, 0 );
+				ret = ret ? ret : grb::mxv< grb::descriptors::dense >( Ap, A, p, cg_opts.ring ); // Ap = A * p;
+				assert( ret == SUCCESS );
+#ifdef HPCG_PRINT_STEPS
+				DBG_print_norm( Ap, "middle Ap" );
+#endif
+				pAp = cg_opts.ring.template getZero< ResidualType >();
+				ret = ret ? ret : grb::dot( pAp, Ap, p, cg_opts.ring ); // pAp = p' * Ap
+				assert( ret == SUCCESS );
+
+				alpha = r_dot_z / pAp;
+
+				ret = ret ? ret : grb::eWiseMul( x, alpha, p, cg_opts.ring ); // x += alpha * p;
+				assert( ret == SUCCESS );
+#ifdef HPCG_PRINT_STEPS
+				DBG_print_norm( x, "end x" );
+#endif
+
+				ret = ret ? ret : grb::eWiseMul( r, -alpha, Ap, cg_opts.ring ); // r += - alpha * Ap;
+				assert( ret == SUCCESS );
+#ifdef HPCG_PRINT_STEPS
+				DBG_print_norm( r, "end r" );
+#endif
+
+				norm_residual = cg_opts.ring.template getZero< ResidualType >();
+				ret = ret ? ret : grb::dot( norm_residual, r, r, cg_opts.ring ); // residual = r' * r;
+				assert( ret == SUCCESS );
+
+				norm_residual = std::sqrt( norm_residual );
+
+				if( cg_opts.print_iter_stats ) {
+					std::cout << "iteration, residual: " << iter << "," << norm_residual << std::endl;
+				}
+
+				++iter;
+				out_data.iterations = iter;
+				out_data.norm_residual = norm_residual;
+			} while( iter < cg_opts.max_iterations &&
+				norm_residual / norm_residual_initial > cg_opts.tolerance && ret == SUCCESS );
+
+			return ret;
+		}
+
+
+
+
+		template<
+			typename IOType,
+			typename NonzeroType,
+			typename InputType,
+			typename ResidualType,
+			typename MultiGridRunnerType,
+			class Ring,
+			class Minus
+
+		> struct mg_cg_runner { // callable object bundling the CG options and a multi-grid runner for mg_cg()
+
+			using HPCGInputType = mg_cg_data< IOType, NonzeroType, InputType >;
+
+			static_assert( std::is_default_constructible< Ring >::value,
+				"cannot construct the Ring with default values" );
+			static_assert( std::is_default_constructible< Minus >::value,
+				"cannot construct the Minus operator with default values" );
+			// static_assert( std::is_copy_constructible< MultiGridRunnerType >::value,
+			// 	"cannot construct the Multi-Grid runner by copy" );
+			static_assert( std::is_move_constructible< MultiGridRunnerType >::value,
+				"cannot construct the Multi-Grid runner by move" );
+
+			// default options (preconditioning on, 10 iterations, tolerance 0, no printing): override with your own
+			cg_options< IOType, ResidualType, Ring, Minus > cg_opts{ true, 10, 0.0, false, Ring(), Minus() };
+
+			MultiGridRunnerType mg_runner; // the multi-grid runner passed to mg_cg(), owned by this object
+
+			mg_cg_runner(
+				MultiGridRunnerType &&_mg_runner
+			) : mg_runner( std::move( _mg_runner ) ) {} // the runner is moved in
+
+			inline grb::RC operator()( // runs mg_cg() with the stored options and runner, returning its result
+				typename MultiGridRunnerType::MultiGridInputType &grid_base,
+				mg_cg_data< IOType, NonzeroType, InputType > &data,
+				cg_out_data< ResidualType > &out_data
+			) {
+				return mg_cg( grid_base, data, cg_opts, mg_runner, out_data );
+			}
+
+		};
+
+	} // namespace algorithms
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_CG
diff --git a/include/graphblas/algorithms/multigrid/multigrid_data.hpp b/include/graphblas/algorithms/multigrid/multigrid_data.hpp
new file mode 100644
index 000000000..e76063aec
--- /dev/null
+++ b/include/graphblas/algorithms/multigrid/multigrid_data.hpp
@@ -0,0 +1,105 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file multigrid_data.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * @brief Data structures to store multi-grid input/output data.
+ * @date 2021-04-30
+ */
+
+#ifndef _H_GRB_ALGORITHMS_HPCG_DATA
+#define _H_GRB_ALGORITHMS_HPCG_DATA
+
+#include <vector>
+#include <cstddef>
+
+#include <graphblas.hpp>
+
+
+namespace grb {
+
+	namespace algorithms {
+
+		/**
+		 * @brief Data container for the inputs and outputs of a single multi-grid level.
+		 *
+		 * @tparam IOType Type of the values of the vectors #z and #r
+		 * @tparam NonzeroType Type of the values stored inside the system matrix #A
+		 *
+		 * This data structure stores the information of one level of a multi-grid
+		 * hierarchy, i.e.
+		 * - #level, the index of this level; the multi-grid and smoother runners use it
+		 *   to locate this level inside their per-level containers
+		 * - #system_size, the size of the system at this level
+		 * - the system matrix #A, constructed with #system_size rows and columns
+		 * - the multi-grid solution #z and the residual #r, both constructed with
+		 *   size #system_size
+		 *
+		 * No coarsening information and no link to a coarser or finer level are stored
+		 * here: an object of this type describes one level only, and the hierarchy
+		 * is given by the container that stores the levels.
+		 *
+		 * #level and #system_size are \c const, hence they can be set only via the
+		 * constructor. Copy construction and copy assignment are disabled.
+		 *
+		 * Internal vectors and matrices are initialized to the proper size,
+		 * but their values are \b not initialized by the constructor;
+		 * zero_temp_vectors() calls grb::set() with value 0 on #z and #r.
+		 *
+		 * @see multi_grid() for the routine that consumes a sequence of objects of
+		 *      this type
+		 */
+		template<
+			typename IOType,
+			typename NonzeroType
+		> struct multigrid_data {
+
+			const size_t level;       ///< index of this level, used to index per-level containers
+			const size_t system_size; ///< size of the system, i.e. side of the matrix #A
+			grb::Matrix< NonzeroType > A;                   ///< system matrix
+			grb::Vector< IOType > z;                        ///< multi-grid solution
+			grb::Vector< IOType > r;                        ///< residual
+			// Stores _level and sys_size and constructs A, z and r with size sys_size; no values are set.
+			multigrid_data(
+				size_t _level,
+				size_t sys_size
+			) :
+				level( _level ),
+				system_size( sys_size ),
+				A( sys_size, sys_size ),
+				z( sys_size ),
+				r( sys_size ) {}
+
+			// for safety, disable copy semantics
+			multigrid_data( const multigrid_data< IOType, NonzeroType > & o ) = delete;
+
+			multigrid_data<IOType, NonzeroType > & operator=( const multigrid_data< IOType, NonzeroType > & ) = delete;
+			// Calls grb::set( ., 0 ) on z and then on r; r is skipped if z failed. Returns the first error code, if any.
+			grb::RC zero_temp_vectors() {
+				grb::RC rc = grb::set( z, 0 );
+				rc = rc ? rc : grb::set( r, 0 );
+				return rc;
+			}
+		};
+
+	} // namespace algorithms
+
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_HPCG_DATA
+
diff --git a/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp b/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
new file mode 100644
index 000000000..77b785e2d
--- /dev/null
+++ b/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
@@ -0,0 +1,237 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file multigrid_v_cycle.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * @brief This file contains the routines for multi-grid solution refinement, including the main routine
+ *        and those for coarsening and refinement of the tentative solution.
+ * @date 2021-04-30
+ */
+
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
+#define _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
+
+#include <cassert>
+#include <vector>
+#include <type_traits>
+#include <memory>
+#include <utility>
+
+#include <graphblas.hpp>
+
+#include <graphblas/utils/iterators/IteratorValueAdaptor.hpp>
+
+#include "multigrid_data.hpp"
+
+namespace grb {
+	namespace algorithms {
+		/**
+		 * @brief Namespace for interfaces that should not be used outside of the algorithm namespace.
+		 */
+		namespace internal {
+
+
+
+		} // namespace internal
+
+		/**
+		 * @brief Multi-grid V cycle implementation to refine a given solution.
+		 *
+		 * The levels are given by the range [ \p mgiter_begin, \p mgiter_end ): its first element is
+		 * the current (finer) level, the following one, if any, is the coarser level. The steps are:
+		 * -# the solution \c z of the current level is set to 0
+		 * -# if there is no coarser level, \p smoother.nonrecursive_smooth() is run on the current
+		 *    level and the routine returns; otherwise the following steps are run
+		 * -# \p smoother.pre_smooth() is run on the current level
+		 * -# \p coarsener.coarsen_residual() is called on the current and on the coarser level
+		 * -# a multi-grid run is recursively performed starting from the coarser level
+		 * -# \p coarsener.prolong_solution() is called on the coarser and on the current level
+		 * -# \p smoother.post_smooth() is run on the current level
+		 *
+		 * Failures of GraphBLAS operations are handled by skipping the remaining steps and by returning
+		 * the failure code; if assertions are enabled, most failures trigger an assertion instead.
+		 *
+		 * @tparam IOType type of result and intermediate vectors used during computation
+		 * @tparam NonzeroType type of matrix values
+		 * @tparam MGSysIterType iterator dereferencing to (a type derived from) multigrid_data< IOType, NonzeroType >
+		 * @tparam MGSmootherType smoother type, with pre_smooth(), post_smooth() and nonrecursive_smooth()
+		 * @tparam CoarsenerType coarsener type, with coarsen_residual() and prolong_solution()
+		 * @tparam Ring the ring of algebraic operators zero-values
+		 * @tparam Minus the minus operator for subtractions
+		 *
+		 * @param[in] mgiter_begin iterator to the current level; it must differ from \p mgiter_end
+		 * @param[in] mgiter_end iterator marking the end of the levels
+		 * @param[in,out] smoother the smoother to run on each level
+		 * @param[in,out] coarsener the object performing coarsening and prolongation between levels
+		 * @param[in] ring the ring; here it is only forwarded to the recursive call
+		 * @param[in] minus the \f$ - \f$ operator; here it is only forwarded to the recursive call
+		 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
+		 *                          unsuccessful operation otherwise
+		 */
+		template<
+			typename IOType,
+			typename NonzeroType,
+			typename MGSysIterType,
+			typename MGSmootherType,
+			typename CoarsenerType,
+			class Ring,
+			class Minus
+		> grb::RC multi_grid(
+			MGSysIterType mgiter_begin,
+			const MGSysIterType mgiter_end,
+			MGSmootherType &smoother,
+			CoarsenerType &coarsener,
+			const Ring &ring,
+			const Minus &minus
+		) {
+			static_assert( std::is_base_of< multigrid_data< IOType, NonzeroType >,
+				typename std::decay< decltype( *mgiter_begin ) >::type >::value, "the iterator type MGSysIterType"
+				" must reference an object of type multigrid_data< IOType, NonzeroType >" );
+
+			RC ret { SUCCESS };
+			assert( mgiter_begin != mgiter_end );
+			multigrid_data< IOType, NonzeroType > &finer_system = *mgiter_begin;
+			++mgiter_begin;
+
+#ifdef HPCG_PRINT_STEPS
+			DBG_println( "mg BEGINNING {" );
+#endif
+
+
+			// clean destination vector
+			ret = ret ? ret : grb::set( finer_system.z, 0 );
+#ifdef HPCG_PRINT_STEPS
+			DBG_print_norm( finer_system.r, "initial r" );
+#endif
+			if( !( mgiter_begin != mgiter_end ) ) {
+				// no coarser level: only run the non-recursive smoother and return
+				ret = ret ? ret : smoother.nonrecursive_smooth( finer_system );
+				assert( ret == SUCCESS );
+#ifdef HPCG_PRINT_STEPS
+				DBG_print_norm( finer_system.z, "smoothed z" );
+				DBG_println( "} mg END" );
+#endif
+				return ret;
+			}
+			multigrid_data< IOType, NonzeroType > &coarser_system = *mgiter_begin;
+
+			// pre-smoother
+			ret = ret ? ret : smoother.pre_smooth( finer_system );
+			assert( ret == SUCCESS );
+#ifdef HPCG_PRINT_STEPS
+			DBG_print_norm( finer_system.z, "pre-smoothed z" );
+#endif
+
+			ret = ret ? ret : coarsener.coarsen_residual( finer_system, coarser_system );
+			assert( ret == SUCCESS );
+#ifdef HPCG_PRINT_STEPS
+			DBG_print_norm( coarser_system.r, "coarse r" );
+#endif
+
+			ret = ret ? ret : multi_grid< IOType, NonzeroType, MGSysIterType,
+				MGSmootherType, CoarsenerType, Ring, Minus >( mgiter_begin, mgiter_end,
+				smoother, coarsener, ring, minus );
+			assert( ret == SUCCESS );
+
+			ret = ret ? ret : coarsener.prolong_solution( coarser_system, finer_system );
+			assert( ret == SUCCESS );
+#ifdef HPCG_PRINT_STEPS
+			DBG_print_norm( finer_system.z, "prolonged z" );
+#endif
+
+			// post-smoother
+			ret = ret ? ret : smoother.post_smooth( finer_system );
+			assert( ret == SUCCESS );
+#ifdef HPCG_PRINT_STEPS
+			DBG_print_norm( finer_system.z, "post-smoothed z" );
+			DBG_println( "} mg END" );
+#endif
+
+			return ret;
+		}
+
+		template<
+			typename IOType,
+			typename NonzeroType,
+			typename InputType,
+			typename MGSmootherType,
+			typename CoarsenerType,
+			class Ring,
+			class Minus
+		> struct multigrid_runner {
+			// Callable that owns the smoother, the coarsener and the list of levels, and runs multi_grid() on them.
+			static_assert( std::is_default_constructible< Ring >::value,
+				"cannot construct the Ring with default values" );
+			static_assert( std::is_default_constructible< Minus >::value,
+				"cannot construct the Minus operator with default values" );
+			static_assert( std::is_move_constructible< MGSmootherType >::value,
+				"MGSmootherType must be move-constructible");
+			static_assert( std::is_move_constructible< CoarsenerType >::value,
+				"CoarsenerType must be move-constructible");
+
+			using MultiGridInputType = multigrid_data< IOType, NonzeroType >;
+
+			// check that the input type of the smoother and the input type of MG match
+			static_assert( std::is_base_of< typename MGSmootherType::SmootherInputType,
+				MultiGridInputType >::value, "input type of the Smoother kernel must match the input from Multi-Grid" );
+
+			MGSmootherType smoother_runner;
+			CoarsenerType coarsener_runner;
+			std::vector< std::unique_ptr< MultiGridInputType > > system_levels; // indexed by the level field, see operator()
+			Ring ring;
+			Minus minus;
+			// Functor given to IteratorValueAdaptor: returns the object owned by a unique_ptr element.
+			struct Extractor {
+				MultiGridInputType & operator()(
+					typename std::vector< std::unique_ptr< MultiGridInputType > >::reference &ref
+				) {
+					return *ref.get();
+				}
+
+				const MultiGridInputType & operator()(
+					typename std::vector< std::unique_ptr< MultiGridInputType > >::const_reference &ref
+				) const {
+					return *ref.get();
+				}
+			};
+
+			using UniquePtrExtractor = grb::utils::IteratorValueAdaptor<
+				typename std::vector< std::unique_ptr< MultiGridInputType > >::iterator,
+				Extractor
+			>;
+			// Moves the given smoother and coarsener into this object; system_levels is left empty.
+			multigrid_runner(
+				MGSmootherType &&_smoother_runner,
+				CoarsenerType &&_coarsener_runner
+			) : smoother_runner( std::move( _smoother_runner ) ),
+				coarsener_runner( std::move(  _coarsener_runner ) ) {}
+			// Runs multi_grid() on system_levels from position system.level (must be valid) up to the end.
+			inline grb::RC operator()(
+				MultiGridInputType &system
+			) {
+				assert( system.level < system_levels.size() );
+				return multi_grid< IOType, NonzeroType, UniquePtrExtractor, MGSmootherType, CoarsenerType, Ring, Minus >(
+					UniquePtrExtractor( system_levels.begin() + system.level ), UniquePtrExtractor( system_levels.end() ),
+					smoother_runner, coarsener_runner, ring, minus );
+			}
+		};
+
+	} // namespace algorithms
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
diff --git a/include/graphblas/algorithms/hpcg/red_black_gauss_seidel.hpp b/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
similarity index 57%
rename from include/graphblas/algorithms/hpcg/red_black_gauss_seidel.hpp
rename to include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
index 6fdc3c9a3..615b4340b 100644
--- a/include/graphblas/algorithms/hpcg/red_black_gauss_seidel.hpp
+++ b/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
@@ -29,8 +29,31 @@
 
 #include <graphblas.hpp>
 
+#include "multigrid_data.hpp"
+
 namespace grb {
 	namespace algorithms {
+
+		template< typename IOType > struct smoother_data {
+			// Data used by the Red-Black Gauss-Seidel smoother in addition to the multigrid_data of a level.
+			grb::Vector< IOType > A_diagonal;               ///< vector with the diagonal of #A
+			grb::Vector< IOType > smoother_temp;            ///< for smoother's intermediate results
+			std::vector< grb::Vector< bool > > color_masks; ///< color masks, visited forward and then backward by the smoother
+			// Constructs A_diagonal and smoother_temp with size sys_size; color_masks is left empty.
+			smoother_data( size_t sys_size ) :
+				A_diagonal( sys_size ),
+				smoother_temp( sys_size ) { }
+
+			// for safety, disable copy semantics
+			smoother_data( const smoother_data & o ) = delete;
+
+			smoother_data & operator=( const smoother_data & ) = delete;
+			// Calls grb::set( smoother_temp, 0 ) and returns its return code.
+			grb::RC zero_temp_vectors() {
+				return grb::set( smoother_temp, 0 );
+			}
+		};
+
 		namespace internal {
 
 			/**
@@ -50,14 +73,19 @@ namespace grb {
 			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
 			 *                          unsuccessful operation otherwise
 			 */
-			template< typename IOType, typename NonzeroType, class Ring >
-			grb::RC __rbgs_single_step( const grb::Matrix< NonzeroType > & A,
+			template<
+				typename IOType,
+				typename NonzeroType,
+				class Ring
+			> grb::RC rbgs_single_step(
+				const grb::Matrix< NonzeroType > & A,
 				const grb::Vector< IOType > & A_diagonal,
 				const grb::Vector< IOType > & r,
 				grb::Vector< IOType > & x,
 				grb::Vector< IOType > & smoother_temp,
 				const grb::Vector< bool > & color_mask,
-				const Ring & ring ) {
+				const Ring & ring
+			) {
 				RC ret { SUCCESS };
 				ret = ret ? ret : grb::set( smoother_temp, 0 );
 
@@ -105,23 +133,100 @@ namespace grb {
 			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
 			 *                          unsuccessful operation otherwise
 			 */
-			template< typename IOType, typename NonzeroType, class Ring >
-			grb::RC red_black_gauss_seidel( system_data< IOType, NonzeroType > & data, const Ring & ring ) {
+			template<
+				typename IOType,
+				typename NonzeroType,
+				class Ring
+			> grb::RC red_black_gauss_seidel(
+				multigrid_data< IOType, NonzeroType > &data,
+				smoother_data< IOType > &smoothing_info,
+				const Ring & ring
+			) {
 				RC ret { SUCCESS };
 				// forward step
-				std::vector< grb::Vector< bool > >::const_iterator end { data.color_masks.cend() };
-				for( std::vector< grb::Vector< bool > >::const_iterator it { data.color_masks.cbegin() }; it != end && ret == SUCCESS; ++it ) {
-					ret = ret ? ret : __rbgs_single_step( data.A, data.A_diagonal, data.r, data.z, data.smoother_temp, *it, ring );
+				std::vector< grb::Vector< bool > >::const_iterator end { smoothing_info.color_masks.cend() };
+				for( std::vector< grb::Vector< bool > >::const_iterator it {
+					smoothing_info.color_masks.cbegin() }; it != end && ret == SUCCESS; ++it ) {
+					ret = rbgs_single_step( data.A, smoothing_info.A_diagonal, data.r, data.z,
+						smoothing_info.smoother_temp, *it, ring );
 				}
 				// backward step
-				std::vector< grb::Vector< bool > >::const_reverse_iterator rend { data.color_masks.crend() };
-				for( std::vector< grb::Vector< bool > >::const_reverse_iterator rit { data.color_masks.crbegin() }; rit != rend && ret == SUCCESS; ++rit ) {
-					ret = ret ? ret : __rbgs_single_step( data.A, data.A_diagonal, data.r, data.z, data.smoother_temp, *rit, ring );
+				std::vector< grb::Vector< bool > >::const_reverse_iterator rend { smoothing_info.color_masks.crend() };
+				for( std::vector< grb::Vector< bool > >::const_reverse_iterator rit {
+					smoothing_info.color_masks.crbegin() }; rit != rend && ret == SUCCESS; ++rit ) {
+					ret = rbgs_single_step( data.A, smoothing_info.A_diagonal, data.r, data.z,
+						smoothing_info.smoother_temp, *rit, ring );
 				}
 				return ret;
 			}
 
 		} // namespace internal
+
+		template <
+			typename IOType,
+			typename NonzeroType,
+			class Ring
+		> struct red_black_smoother_runner {
+			size_t presmoother_steps ;         ///< number of smoother iterations run by pre_smooth()
+			size_t postsmoother_steps;         ///< number of smoother iterations run by post_smooth()
+			size_t non_recursive_smooth_steps; ///< number of smoother iterations run by nonrecursive_smooth()
+			std::vector< std::unique_ptr< smoother_data< IOType > > > levels; ///< smoother data, indexed by data.level
+			Ring ring;                         ///< ring passed to the smoother kernel
+
+			static_assert( std::is_default_constructible< Ring >::value,
+				"cannot construct the Ring operator with default values" );
+
+			using SmootherInputType = multigrid_data< IOType, NonzeroType >;
+			// Runs presmoother_steps iterations of the smoother on data.
+			inline grb::RC pre_smooth(
+				SmootherInputType& data
+			) {
+				return run_smoother( data, presmoother_steps );
+			}
+			// Runs postsmoother_steps iterations of the smoother on data.
+			inline grb::RC post_smooth(
+				SmootherInputType& data
+			) {
+				return run_smoother( data, postsmoother_steps );
+			}
+			// Runs non_recursive_smooth_steps iterations of the smoother on data.
+			inline grb::RC nonrecursive_smooth(
+				SmootherInputType& data
+			) {
+				return run_smoother( data, non_recursive_smooth_steps );
+			}
+
+			/**
+			 * @brief Runs \p smoother_steps iterations of the Red-Black Gauss-Seidel smoother, with inputs and outputs
+			 * stored inside \p data and inside the smoother data of the level \p data.level.
+			 *
+			 * The smoother data are looked up via \c levels.at( data.level ), which throws \c std::out_of_range
+			 * if #levels has no entry at that position.
+			 * Iterations stop at the first failure; the member #ring is passed to each iteration.
+			 *
+			 * @param[in,out] data \ref multigrid_data data structure with relevant inputs and outputs: system matrix,
+			 *                     initial solution, residual
+			 * @param[in] smoother_steps how many smoothing steps to run
+			 *
+			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
+			 *                          unsuccessful operation otherwise
+			 */
+			grb::RC run_smoother(
+				SmootherInputType &data,
+				const size_t smoother_steps
+			) {
+				RC ret { SUCCESS };
+
+				smoother_data< IOType > &smoothing_info = *( levels.at( data.level ).get() );
+
+				for( size_t i { 0 }; i < smoother_steps && ret == SUCCESS; i++ ) {
+					ret = ret ? ret : internal::red_black_gauss_seidel( data, smoothing_info, ring );
+					assert( ret == SUCCESS );
+				}
+				return ret;
+			}
+		};
+
 	}     // namespace algorithms
 } // namespace grb
 
diff --git a/include/graphblas/utils/geometry/array_vector_storage.hpp b/include/graphblas/utils/geometry/array_vector_storage.hpp
index 451364754..45fbab04e 100644
--- a/include/graphblas/utils/geometry/array_vector_storage.hpp
+++ b/include/graphblas/utils/geometry/array_vector_storage.hpp
@@ -1,67 +1,100 @@
 
-#ifndef _ARRAY_VECTOR_STORAGE_H_
-#define _ARRAY_VECTOR_STORAGE_H_
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file array_vector_storage.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Extension of std::array<> exposing a larger interface and the underlying
+ * 	storage structure.
+ *
+ * @date 2022-10-24
+ */
+
+#ifndef _H_GRB_ALGORITHMS_GEOMETRY_ARRAY_VECTOR_STORAGE
+#define _H_GRB_ALGORITHMS_GEOMETRY_ARRAY_VECTOR_STORAGE
 
 #include <array>
 #include <stdexcept>
 #include <algorithm>
+#include <cstddef>
 
 namespace grb {
 	namespace utils {
 		namespace geometry {
 
-template< typename T, std::size_t DIMS > class array_vector_storage: public std::array< T, DIMS > {
-
-public:
-
-	using vector_storage = std::array< T, DIMS >&;
-	using const_vector_storage = const std::array< T, DIMS >&;
-
-	array_vector_storage( std::size_t _dimensions ) {
-		static_assert( DIMS > 0, "cannot allocate 0-sized array" );
-		if( _dimensions != DIMS ) {
-			throw std::invalid_argument("given dimensions must match the type dimensions");
-		}
-	}
-
-	array_vector_storage() = delete;
-
-	// only copy constructor/assignment, since there's no external storage
-	array_vector_storage( const array_vector_storage< T, DIMS >& o ) noexcept {
-		std::copy_n( o.cbegin(), DIMS, this->begin() );
-	}
-
-	/*
-	array_vector_storage( array_vector_storage< T >&& o ) {
-		std::copy_n( o._storage.cbegin(), DIMS, this->_storage.cbegin() );
-	}
-	*/
-
-	array_vector_storage< T, DIMS >& operator=( const array_vector_storage< T, DIMS > &original ) noexcept {
-		std::copy_n( original.begin(), DIMS, this->begin() );
-		return *this;
-	}
-
-	//array_vector_storage< T, DIMS >& operator=( array_vector_storage< T, DIMS > &&original ) = delete;
-
-	~array_vector_storage() {}
-
-	constexpr std::size_t dimensions() const {
-		return DIMS;
-	}
-
-	inline vector_storage storage() {
-		return *this;
-	}
-
-	inline const_vector_storage storage() const {
-		return *this;
-	}
-
-};
+			/**
+			 * Array with fixed size based on std::array with an interface compliant to what other classes
+			 * in the geometry namespace expect, like storage() and dimensions() methods.
+			 *
+			 * It describes a vector with #dimensions() elements, where #dimensions() always equals \p DIMS.
+			 *
+			 * @tparam DataType the data type of the vector elements
+			 * @tparam DIMS the dimensions of the vector, which must be greater than 0
+			 */
+			template<
+				typename DataType,
+				size_t DIMS
+			> class ArrayVectorStorage: public std::array< DataType, DIMS > {
+
+			public:
+				// types returned by storage(): references to the std::array base of this object
+				using VectorStorageType = std::array< DataType, DIMS >&;
+				using ConstVectorStorageType = const std::array< DataType, DIMS >&;
+				// Throws std::invalid_argument if _dimensions differs from DIMS; does not assign the elements.
+				ArrayVectorStorage( size_t _dimensions ) {
+					static_assert( DIMS > 0, "cannot allocate 0-sized array" );
+					if( _dimensions != DIMS ) {
+						throw std::invalid_argument("given dimensions must match the type dimensions");
+					}
+				}
+
+				ArrayVectorStorage() = delete;
+
+				// only copy constructor/assignment, since there's no external storage
+				ArrayVectorStorage( const ArrayVectorStorage< DataType, DIMS > &o ) noexcept {
+					std::copy_n( o.cbegin(), DIMS, this->begin() );
+				}
+				// NOTE(review): moves are deleted, so moving (e.g. via std::move) fails to compile instead of copying
+				ArrayVectorStorage( ArrayVectorStorage< DataType, DIMS > &&o ) = delete;
+
+				ArrayVectorStorage< DataType, DIMS >& operator=(
+					const ArrayVectorStorage< DataType, DIMS > &original
+				) noexcept {
+					std::copy_n( original.begin(), DIMS, this->begin() );
+					return *this;
+				}
+
+				ArrayVectorStorage< DataType, DIMS >& operator=( ArrayVectorStorage< DataType, DIMS > &&original ) = delete;
+
+				constexpr size_t dimensions() const {
+					return DIMS;
+				}
+				// storage() returns a reference to this object, seen as its std::array base
+				inline VectorStorageType storage() {
+					return *this;
+				}
+
+				inline ConstVectorStorageType storage() const {
+					return *this;
+				}
+			};
 
 		} // namespace geometry
 	} // namespace utils
 } // namespace grb
 
-#endif // _ARRAY_VECTOR_STORAGE_H_
+#endif // _H_GRB_ALGORITHMS_GEOMETRY_ARRAY_VECTOR_STORAGE
diff --git a/include/graphblas/utils/geometry/dynamic_vector_storage.hpp b/include/graphblas/utils/geometry/dynamic_vector_storage.hpp
new file mode 100644
index 000000000..a0def1980
--- /dev/null
+++ b/include/graphblas/utils/geometry/dynamic_vector_storage.hpp
@@ -0,0 +1,154 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_GEOMETRY_DYNAMIC_VECTOR_STORAGE
+#define _H_GRB_ALGORITHMS_GEOMETRY_DYNAMIC_VECTOR_STORAGE
+
+#include <cstddef>
+#include <stdexcept>
+#include <algorithm>
+
+/**
+ * @file dynamic_vector_storage.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Extension of a heap-allocated array exposing the underlying storage and iterators.
+ *
+ * @date 2022-10-24
+ */
+
+namespace grb {
+	namespace utils {
+		namespace geometry {
+
+			/**
+			 * Array with fixed size (i.e. decided at object creation) allocated on the heap with an interface compliant
+			 * to what other classes in the geometry namespace expect, like storage() and dimensions() methods.
+			 *
+			 * It describes a vector with #dimensions() elements; copies are deep, moves are disabled.
+			 *
+			 * @tparam DataType the data type of the vector elements
+			 */
+			template< typename DataType > class DynamicVectorStorage {
+
+				size_t _dimensions; ///< number of elements of #_storage
+				DataType* _storage; ///< owned heap buffer, released by the destructor
+
+				// Releases the owned buffer; delete[] on a null pointer is a no-op, so no check is needed.
+				void clean() {
+					delete[] this->_storage;
+				}
+
+			public:
+
+				// iterator fields
+				using reference = DataType&;
+				using const_reference = const DataType&;
+				using iterator = DataType*;
+				using const_iterator = const DataType*;
+				using pointer = DataType*;
+				using const_pointer = const DataType*;
+
+				using VectorStorageType = DataType*;
+				using ConstVectorStorageType = DataType*; // NOTE(review): not a pointer-to-const, kept for compatibility
+				using SelfType = DynamicVectorStorage< DataType >;
+				// Allocates a buffer of __dimensions elements; throws std::invalid_argument if __dimensions is 0.
+				DynamicVectorStorage( size_t __dimensions ):
+					_dimensions( __dimensions ) {
+					if( __dimensions == 0 ) {
+						throw std::invalid_argument("dimensions cannot be 0");
+					}
+					this->_storage = new DataType[ __dimensions ];
+				}
+
+				DynamicVectorStorage() = delete;
+				// Deep copy: allocates a new buffer of the same size and copies the elements of o into it.
+				DynamicVectorStorage( const SelfType &o ):
+					_dimensions( o._dimensions ),
+					_storage( new DataType[ o._dimensions ] )
+				{
+					std::copy_n( o._storage, o._dimensions, this->_storage );
+				}
+
+				DynamicVectorStorage( SelfType &&o ) = delete;
+
+				// Deep-copy assignment, reallocating only if the sizes differ. The new buffer is allocated before
+				// the old one is released, so that a failing allocation leaves this object unchanged and valid.
+				SelfType& operator=( const SelfType &original ) {
+					if( this == &original ) {
+						return *this;
+					}
+					if( original._dimensions != this->_dimensions ) {
+						DataType * const fresh = new DataType[ original._dimensions ];
+						this->clean();
+						this->_storage = fresh;
+						this->_dimensions = original._dimensions;
+					}
+					std::copy_n( original._storage, original._dimensions, this->_storage );
+					return *this;
+				}
+
+				SelfType& operator=( SelfType &&original ) = delete;
+
+				~DynamicVectorStorage() {
+					this->clean();
+				}
+
+				size_t dimensions() const {
+					return this->_dimensions;
+				}
+
+				inline iterator begin() {
+					return this->_storage;
+				}
+				inline iterator end() {
+					return this->_storage + this->_dimensions;
+				}
+
+				inline const_iterator begin() const {
+					return this->_storage;
+				}
+				inline const_iterator end() const {
+					return this->_storage + this->_dimensions;
+				}
+
+				inline const_iterator cbegin() const {
+					return this->_storage;
+				}
+				inline const_iterator cend() const {
+					return this->_storage + this->_dimensions;
+				}
+
+				inline VectorStorageType storage() {
+					return this->_storage;
+				}
+				inline ConstVectorStorageType storage() const {
+					return this->_storage;
+				}
+
+				inline reference operator[]( size_t pos ) {
+					return *( this->_storage + pos);
+				}
+				inline const_reference operator[]( size_t pos ) const {
+					return *( this->_storage + pos );
+				}
+			};
+
+		} // namespace geometry
+	} // namespace utils
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_GEOMETRY_DYNAMIC_VECTOR_STORAGE
diff --git a/include/graphblas/utils/geometry/generic_vector_storage.hpp b/include/graphblas/utils/geometry/generic_vector_storage.hpp
deleted file mode 100644
index 166dad3b8..000000000
--- a/include/graphblas/utils/geometry/generic_vector_storage.hpp
+++ /dev/null
@@ -1,117 +0,0 @@
-
-#ifndef _GENERIC_VECTOR_STORAGE_H_
-#define _GENERIC_VECTOR_STORAGE_H_
-
-#include <cstddef>
-#include <algorithm>
-
-namespace grb {
-	namespace utils {
-		namespace geometry {
-
-template< typename T > class generic_vector_storage {
-
-	std::size_t _dimensions;
-	T* _storage;
-
-	void clean() {
-		if( this->_storage != nullptr ) {
-			delete[] this->_storage;
-		}
-	}
-
-public:
-
-	using reference = T&;
-	using const_reference = const T&;
-	using iterator = T*;
-	using const_iterator = const T*;
-	using pointer = T*;
-	using const_pointer = const T*;
-	using vector_storage = T*;
-	using const_vector_storage = T*;
-
-	generic_vector_storage( std::size_t __dimensions ):
-		_dimensions( __dimensions ) {
-		if( __dimensions == 0 ) {
-			throw std::invalid_argument("dimensions cannot be 0");
-		}
-		this->_storage = new T[ __dimensions ];
-	}
-
-	generic_vector_storage() = delete;
-
-	generic_vector_storage( const generic_vector_storage< T >& o ):
-		_dimensions( o._dimensions ), _storage( new T[ o._dimensions ] ) {
-		std::copy_n( o._storage, o._dimensions, this->_storage );
-	}
-
-	generic_vector_storage( generic_vector_storage< T >&& o ) = delete;
-
-	generic_vector_storage< T >& operator=( const generic_vector_storage< T > &original ) {
-		if( original._dimensions != this->_dimensions ) {
-			this->clean();
-			this->_storage = new T[ original._dimensions];
-		}
-		this->_dimensions = original._dimensions;
-		std::copy_n( original._storage, original._dimensions, this->_storage );
-		return *this;
-	}
-
-	generic_vector_storage< T >& operator=( generic_vector_storage< T > &&original ) = delete;
-
-	~generic_vector_storage() {
-		this->clean();
-	}
-
-	std::size_t dimensions() const {
-		return this->_dimensions;
-	}
-
-	inline iterator begin() {
-		return this->_storage;
-	}
-
-	inline iterator end() {
-		return this->_storage + this->_dimensions;
-	}
-
-	inline const_iterator begin() const {
-		return this->_storage;
-	}
-
-	inline const_iterator end() const {
-		return this->_storage + this->_dimensions;
-	}
-
-	inline const_iterator cbegin() const {
-		return this->_storage;
-	}
-
-	inline const_iterator cend() const {
-		return this->_storage + this->_dimensions;
-	}
-
-	inline vector_storage storage() {
-		return this->_storage;
-	}
-
-	inline const_vector_storage storage() const {
-		return this->_storage;
-	}
-
-	inline reference operator[]( std::size_t pos ) {
-		return *( this->_storage + pos);
-	}
-
-	inline const_reference operator[]( std::size_t pos ) const {
-		return *( this->_storage + pos );
-	}
-
-};
-
-		} // namespace geometry
-	} // namespace utils
-} // namespace grb
-
-#endif // _GENERIC_VECTOR_STORAGE_H_
diff --git a/include/graphblas/utils/geometry/halo_matrix_generator_iterator.hpp b/include/graphblas/utils/geometry/halo_matrix_generator_iterator.hpp
new file mode 100644
index 000000000..6eb469f21
--- /dev/null
+++ b/include/graphblas/utils/geometry/halo_matrix_generator_iterator.hpp
@@ -0,0 +1,207 @@
+
+#ifndef _H_GRB_ALGORITHMS_GEOMETRY_HALO_MATRIX_GENRATOR_ITERATOR
+#define _H_GRB_ALGORITHMS_GEOMETRY_HALO_MATRIX_GENRATOR_ITERATOR
+
+#include <cstddef>
+
+#include "linearized_halo_ndim_system.hpp"
+#include "linearized_ndim_system.hpp"
+#include "linearized_ndim_iterator.hpp"
+#include "array_vector_storage.hpp"
+
+namespace grb {
+	namespace algorithms {
+		namespace geometry {
+
+			template<
+				size_t DIMS,
+				typename CoordType,
+				typename ValueType,
+				typename ValueCallable
+			>
+			struct HaloMatrixGeneratorIterator {
+
+				static_assert( std::is_copy_constructible< ValueCallable >::value,
+					"ValueCallable must be copy-constructible" );
+
+				using RowIndexType = CoordType; ///< numeric type of rows
+				using ColumnIndexType = CoordType;
+
+				using LinearSystemType = grb::utils::geometry::LinearizedHaloNDimSystem< RowIndexType, DIMS >;
+				using SelfType = HaloMatrixGeneratorIterator< DIMS, CoordType, ValueType, ValueCallable >;
+				using Iterator = typename LinearSystemType::Iterator;
+
+				struct HaloPoint {
+
+					friend SelfType;
+
+					HaloPoint(
+						const ValueCallable &value_producer,
+						RowIndexType i,
+						ColumnIndexType j
+					) noexcept :
+						_value_producer( value_producer ),
+						_i( i ),
+						_j( j )
+					{}
+
+					HaloPoint( const HaloPoint & ) = default;
+
+					HaloPoint & operator=( const HaloPoint & ) = default;
+
+					inline RowIndexType i() const { return _i; }
+					inline ColumnIndexType j() const { return _j; }
+					inline ValueType v() const {
+						return _value_producer( _i, _j);
+					}
+
+				private:
+					// ValueType diagonal_value;     ///< value to be emitted when the object has moved to the diagonal
+					// ValueType non_diagonal_value; ///< value to emit outside of the diagonal
+					ValueCallable _value_producer;
+					RowIndexType _i;
+					ColumnIndexType _j;
+				};
+
+				// interface for std::random_access_iterator
+				using iterator_category = std::random_access_iterator_tag;
+				using value_type = HaloPoint;
+				using pointer = value_type;
+				using reference = value_type;
+				using difference_type = typename Iterator::difference_type;
+
+				/**
+				 * @brief Construct a new \c HaloMatrixGeneratorIterator object, positioned at the beginning
+				 * of \p system; values are produced on demand by calling \p value_producer on (row, column).
+				 *
+				 * @param system linearized halo system to iterate over; only a pointer to it is stored,
+				 *  so it is expected to outlive this iterator
+				 * @param value_producer callable invoked as value_producer( i, j ) to compute the value
+				 *  at row i and column j; a copy of it is stored inside the iterator
+				 */
+				HaloMatrixGeneratorIterator(
+					const LinearSystemType &system,
+					const ValueCallable &value_producer
+				) noexcept :
+					_val( value_producer, 0, 0 ),
+					_lin_system( &system ),
+					_sys_iter( system.begin() )
+				{
+					update_coords();
+				}
+
+				HaloMatrixGeneratorIterator( const SelfType & ) = default;
+
+				// HaloMatrixGeneratorIterator( SelfType && ) = default;
+
+				SelfType & operator=( const SelfType & ) = default;
+
+				// SelfType & operator=( SelfType && ) = default;
+
+				/**
+				 * @brief Increments the iterator by moving coordinates to the next (row, column) to iterate on.
+				 *
+				 * This operator internally increments the columns coordinates until wrap-around, when it increments
+				 * the row coordinates and resets the column coordinates to the first possible columns; this column coordinate
+				 * depends on the row coordinates according to the dimensions iteration order and on the parameter \p halo.
+				 *
+				 * @return HaloMatrixGeneratorIterator<DIMS, T>& \c this object, with the updated state
+				 */
+				SelfType & operator++() noexcept {
+					(void) ++_sys_iter;
+					update_coords();
+					return *this;
+				}
+
+				SelfType & operator+=( size_t offset ) {
+					_sys_iter += offset;
+					update_coords();
+					return *this;
+				}
+
+				difference_type operator-( const SelfType &other ) const {
+					return this->_sys_iter - other._sys_iter;
+				}
+
+				/**
+				 * @brief Operator to compare \c this against \p o  and return whether they differ.
+				 *
+				 * @param o object to compare \c this against
+				 * @return true if the row or the column is different between \p o and \c this
+				 * @return false if both row and column of \p o and \c this are equal
+				 */
+				bool operator!=( const SelfType &o ) const {
+					return this->_sys_iter != o._sys_iter;
+				}
+
+				/**
+				 * @brief Operator to compare \c this against \p o  and return whether they are equal.
+				 *
+				 * @param o object to compare \c this against
+				 * @return true if both row and column of \p o and \c this are equal
+				 * @return false if the row or the column is different between \p o and \c this
+				 */
+				bool operator==( const SelfType &o ) const {
+					return ! operator!=( o );
+				}
+
+				/**
+				 * @brief Operator returning the triple to directly access row, column and element values.
+				 *
+				 * Useful when building the matrix by copying the triple of coordinates and value,
+				 * like for the BSP1D backend.
+				 */
+				reference operator*() const {
+					return _val;
+				}
+
+				pointer operator->() const {
+					return &_val;
+				}
+
+				/**
+				 * @brief Returns current row.
+				 */
+				inline RowIndexType i() const {
+					return _val.i();
+				}
+
+				/**
+				 * @brief Returns current column.
+				 */
+				inline ColumnIndexType j() const {
+					return _val.j();
+				}
+
+				/**
+				 * @brief Returns the current matrix value.
+				 *
+				 * @return ValueType the value computed by the stored \c ValueCallable when invoked on the
+				 * current row and column, i.e. on ( #i(), #j() )
+				 */
+				inline ValueType v() const {
+					return _val.v();
+				}
+
+				const Iterator & it() const {
+					return this->_sys_iter;
+				}
+
+			private:
+				value_type _val;
+				const LinearSystemType *_lin_system;
+				Iterator _sys_iter;
+
+				void update_coords() {
+					_val._i = _sys_iter->get_element_linear();
+					_val._j = _sys_iter->get_neighbor_linear();
+				}
+			};
+
+
+
+		} // namespace geometry
+	} // namespace algorithms
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_GEOMETRY_HALO_MATRIX_GENRATOR_ITERATOR
diff --git a/include/graphblas/utils/geometry/linearized_halo_ndim_geometry.hpp b/include/graphblas/utils/geometry/linearized_halo_ndim_geometry.hpp
index 4d7fd62ce..04928ac09 100644
--- a/include/graphblas/utils/geometry/linearized_halo_ndim_geometry.hpp
+++ b/include/graphblas/utils/geometry/linearized_halo_ndim_geometry.hpp
@@ -1,6 +1,6 @@
 
-#ifndef _LINEARIZED_HALO_NDIM_GEOMETRY_H_
-#define _LINEARIZED_HALO_NDIM_GEOMETRY_H_
+#ifndef _H_GRB_ALGORITHMS_GEOMETRY_LINEARIZED_HALO_NDIM_GEOMETRY
+#define _H_GRB_ALGORITHMS_GEOMETRY_LINEARIZED_HALO_NDIM_GEOMETRY
 
 #include <cstddef>
 #include <vector>
@@ -8,225 +8,226 @@
 #include <cassert>
 #include <stdexcept>
 #include <string>
+#include <cstddef>
 
-#include "linearized_ndim_system.hpp"
 #include "array_vector_storage.hpp"
-#include "generic_vector_storage.hpp"
+#include "dynamic_vector_storage.hpp"
+#include "linearized_ndim_system.hpp"
 #include "ndim_vector.hpp"
 
 namespace grb {
 	namespace utils {
 		namespace geometry {
 
-template< typename CoordT, std::size_t DIMS > void __compute_neighbors_range(
-	const array_vector_storage< CoordT, DIMS >& _system_sizes,
-	const CoordT halo,
-	const array_vector_storage< CoordT, DIMS >& system_coordinates,
-	array_vector_storage< CoordT, DIMS >& neighbors_start,
-	array_vector_storage< CoordT, DIMS >& neighbors_range ) {
-
-	for( CoordT i{0}; i < DIMS/* - 1*/; i++ ) {
-		const CoordT start{ system_coordinates[i] <= halo ? 0 : system_coordinates[i] - halo };
-		const CoordT end{ std::min( system_coordinates[i] + halo, _system_sizes[i] - 1 ) };
-		neighbors_start[i] = start;
-		neighbors_range[i] = end - start + 1;
-	}
-	/*
-	const std::size_t last{ DIMS - 1 };
-	const CoordT start{ system_coordinates[ last ] <= halo ? 0 : system_coordinates[ last ] - halo };
-	const CoordT end{ system_coordinates[ last ] + halo }; // can extend beyond actual DIMS-dimensional space
-	neighbors_start[ last ] = start;
-	neighbors_range[ last ] = end - start + 1;
-	*/
-}
-
-
-
-
-
-
-template< typename CoordT, std::size_t DIMS > std::size_t __neighbour_to_system_coords(
-	const std::array< CoordT, DIMS > & sizes,
-	std::size_t system_size,
-	const std::vector< ndim_vector< CoordT, CoordT, generic_vector_storage< CoordT > > > & dimension_neighbors,
-	CoordT halo,
-	CoordT neighbor,
-	array_vector_storage< CoordT, DIMS > & result) {
-
-	if( neighbor > system_size ) {
-		throw std::invalid_argument("neighbor number ( " + std::to_string(neighbor)
-			+ " ) >= system size ( " + std::to_string( system_size ) + " )");
-	}
-
-	array_vector_storage< CoordT, DIMS > halo_coords( DIMS );
-#ifdef DBG
-	std::size_t * const halo_coords_end{ halo_coords.data() + DIMS };
-#endif
-	std::fill_n( halo_coords.begin(), DIMS, 0 );
-
-	for( std::size_t _dim{DIMS}; _dim > 0; _dim--) {
-
-		const std::size_t dimension{_dim - 1};
-		const std::size_t dimension_size{ sizes[dimension] };
-		const ndim_vector< CoordT, CoordT, generic_vector_storage< CoordT > > & neighbors{ dimension_neighbors[dimension] };
-
-		CoordT * const halo_coords_begin{ halo_coords.data() + dimension };
-
-#ifdef DBG
-		std::cout << "DIMENSION " << dimension << std::endl << "- setup - neighbour " << neighbor << std::endl;
-		std::cout << "\thalo : ";
-		print_sequence( halo_coords_begin, halo_coords_end ) << std::endl;
-#endif
-
-		std::size_t h{0};
-		std::size_t previous_neighs{ 0 };
-		*halo_coords_begin = h;
-		std::size_t halo_max_neighs{ neighbors.at( halo_coords_begin ) };
-		//std::cout << "\tinitial halo_max_neighs " << halo_max_neighs << std::endl;
-		while( h < halo && neighbor >= previous_neighs + halo_max_neighs ) {
-			h++;
-			*halo_coords_begin = h;
-			previous_neighs += halo_max_neighs;
-			halo_max_neighs = neighbors.at( halo_coords_begin );
-		}
-#ifdef DBG
-		std::cout << "- initial halo - neighbour " << neighbor << std::endl;
-		std::cout << "\th " << h << std::endl;
-		std::cout << "\thalo : ";
-		print_sequence( halo_coords_begin, halo_coords_end ) << std::endl;
-		std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
-#endif
-
-
-		if ( h < halo ){
-			result[dimension] = h;
-			neighbor -= previous_neighs;
-#ifdef DBG
-			std::cout << "end neighbour " << neighbor << std::endl;
-#endif
-			continue;
-		}
-		// saturation occurred
-		const std::size_t distance_from_halo{ ( neighbor - previous_neighs ) / halo_max_neighs };
-#ifdef DBG
-		std::cout << "- before middle elements - neighbour " << neighbor << std::endl;
-		std::cout << "\tprevious_neighs " << previous_neighs << std::endl;
-		std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
-		std::cout << "\tdistance_from_halo " << distance_from_halo << std::endl;
-		std::cout << "\tdimension_size " << dimension_size << std::endl;
-#endif
-		if ( distance_from_halo < dimension_size - 2 * halo ) {
-			result[dimension] =  distance_from_halo + halo;
-			neighbor -= (previous_neighs + distance_from_halo * halo_max_neighs) ;
-#ifdef DBG
-			std::cout << "end neighbour " << neighbor << std::endl;
-#endif
-			continue;
-		}
-		previous_neighs += ( dimension_size - 2 * halo ) * halo_max_neighs;
-#ifdef DBG
-		std::cout << "- after middle elements -neighbour " << neighbor << std::endl;
-		std::cout << "\tprevious_neighs " << previous_neighs << std::endl;
-		std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
-#endif
-
-		h = halo - 1;
-		*halo_coords_begin = h;
-		halo_max_neighs = neighbors.at( halo_coords_begin );
-		while( h > 0 && neighbor >= previous_neighs + halo_max_neighs ) {
-			h--;
-			*halo_coords_begin = h;
-			previous_neighs += halo_max_neighs;
-			halo_max_neighs = neighbors.at( halo_coords_begin );
-		}
-		neighbor -= previous_neighs;
-#ifdef DBG
-		std::cout << "- final halo - neighbour " << neighbor << std::endl;
-		std::cout << "\tadding h " << h << " previous_neighs " << previous_neighs << std::endl;
-#endif
-		// ( dimension_size - 1 ) because coordinates are 0-based and neighbor
-		// is "inside" range [ previous_neighs, previous_neighs + halo_max_neighs ]
-		result[dimension] = dimension_size - 1 - h;
-#ifdef DBG
-		std::cout << "end neighbour " << neighbor << std::endl;
-#endif
-	}
-
-	return neighbor;
-}
-
-
-template< typename CoordT > std::size_t __accumulate_dimension_neighbours(
-	const ndim_vector< CoordT, CoordT, generic_vector_storage< CoordT > >& prev_neighs,
-    CoordT* coords_buffer,
-	std::size_t halo,
-    std::size_t local_size ) {
-	std::size_t neighs{0};
-	std::size_t h{0};
-	for( ; h < halo && local_size > 1; h++ ) {
-		*coords_buffer = h;
-
-		const std::size_t local_neighs{ prev_neighs.at( coords_buffer ) };
-		neighs += 2 * local_neighs; // the 2 sides
-		local_size -= 2;
-	}
-	*coords_buffer = h;
-	neighs += local_size * prev_neighs.at( coords_buffer ); // innermost elements
-	return neighs;
-}
-
-template< typename CoordT > void __populate_halo_neighbors( std::size_t halo,
-    ndim_vector< CoordT, CoordT, generic_vector_storage< CoordT > >& container ) {
-
-	using it_type = typename ndim_vector< CoordT, CoordT, generic_vector_storage< CoordT > >::domain_iterator;
-	it_type end{ container.domain_end() };
-	for( it_type it{ container.domain_begin() }; it != end; ++it ) {
-		std::size_t res{1};
-		for( std::size_t h: it->get_position() ) res *= (h + 1 + halo);
-		container.at( it->get_position() ) = res;
-	}
-}
-
-template< typename CoordT, std::size_t DIMS > std::size_t __init_halo_search(
-    typename linearized_ndim_system< CoordT, array_vector_storage< CoordT, DIMS > >::const_vector_reference sizes,
-    std::size_t halo,
-	std::vector< ndim_vector< CoordT, CoordT, generic_vector_storage< CoordT > > >& dimension_limits ) {
-
-    using nd_vec = ndim_vector< CoordT, CoordT, generic_vector_storage< CoordT > >;
-    using nd_vec_iterator = typename nd_vec::domain_iterator;
-
-	std::vector<std::size_t> halo_sizes( DIMS, halo + 1);
-	dimension_limits.emplace_back(halo_sizes);
-
-	// initialize values
-	__populate_halo_neighbors< CoordT >( halo, dimension_limits[0] );
-	for( std::size_t i{1}; i < DIMS; i++ ) {
-		std::vector<std::size_t> halos( DIMS - i, halo + 1 );
-		dimension_limits.emplace_back(halos);
-	}
-
-    std::array< CoordT, DIMS > prev_coords_buffer; // store at most DIMS values
-    CoordT* const prev_coords{ prev_coords_buffer.data() };
-	CoordT* const second{ prev_coords + 1 }; // store previous coordinates from second position
-	for( std::size_t dimension{1}; dimension < DIMS; dimension++ ) {
-		const nd_vec& prev_neighs{dimension_limits[dimension - 1]};
-		nd_vec& current_neighs{dimension_limits[dimension]};
-
-		nd_vec_iterator end{ current_neighs.domain_end() };
-		for( nd_vec_iterator it{ current_neighs.domain_begin() }; it != end; ++it ) {
-			typename nd_vec::const_domain_vector_reference current_halo_coords{ it->get_position() };
-
-			std::copy( it->get_position().cbegin(), it->get_position().cend(), second );
-			std::size_t local_size{ sizes[dimension - 1] };
-			const std::size_t neighs{ __accumulate_dimension_neighbours(prev_neighs, prev_coords, halo, local_size) };
-			current_neighs.at(current_halo_coords) = neighs;
-		}
-	}
-	return __accumulate_dimension_neighbours( dimension_limits[DIMS - 1], prev_coords, halo, sizes.back() );
-}
+			template< typename CoordType, size_t DIMS > void __compute_neighbors_range(
+				const ArrayVectorStorage< CoordType, DIMS >& _system_sizes,
+				const CoordType halo,
+				const ArrayVectorStorage< CoordType, DIMS >& system_coordinates,
+				ArrayVectorStorage< CoordType, DIMS >& neighbors_start,
+				ArrayVectorStorage< CoordType, DIMS >& neighbors_range ) {
+
+				for( CoordType i{0}; i < DIMS/* - 1*/; i++ ) {
+					const CoordType start{ system_coordinates[i] <= halo ? 0 : system_coordinates[i] - halo };
+					const CoordType end{ std::min( system_coordinates[i] + halo, _system_sizes[i] - 1 ) };
+					neighbors_start[i] = start;
+					neighbors_range[i] = end - start + 1;
+				}
+				/*
+				const size_t last{ DIMS - 1 };
+				const CoordT start{ system_coordinates[ last ] <= halo ? 0 : system_coordinates[ last ] - halo };
+				const CoordT end{ system_coordinates[ last ] + halo }; // can extend beyond actual DIMS-dimensional space
+				neighbors_start[ last ] = start;
+				neighbors_range[ last ] = end - start + 1;
+				*/
+			}
+
+
+
+
+
+
+			template< typename CoordType, size_t DIMS > size_t __neighbour_to_system_coords(
+				const std::array< CoordType, DIMS > & sizes,
+				size_t system_size,
+				const std::vector< NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > > > & dimension_neighbors,
+				CoordType halo,
+				CoordType neighbor,
+				ArrayVectorStorage< CoordType, DIMS > & result) {
+
+				if( neighbor > system_size ) {
+					throw std::invalid_argument("neighbor number ( " + std::to_string(neighbor)
+						+ " ) >= system size ( " + std::to_string( system_size ) + " )");
+				}
+
+				ArrayVectorStorage< CoordType, DIMS > halo_coords( DIMS );
+			#ifdef DBG
+				size_t * const halo_coords_end{ halo_coords.data() + DIMS };
+			#endif
+				std::fill_n( halo_coords.begin(), DIMS, 0 );
+
+				for( size_t _dim{DIMS}; _dim > 0; _dim--) {
+
+					const size_t dimension{_dim - 1};
+					const size_t dimension_size{ sizes[dimension] };
+					const NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > > & neighbors{ dimension_neighbors[dimension] };
+
+					CoordType * const halo_coords_begin{ halo_coords.data() + dimension };
+
+			#ifdef DBG
+					std::cout << "DIMENSION " << dimension << std::endl << "- setup - neighbour " << neighbor << std::endl;
+					std::cout << "\thalo : ";
+					print_sequence( halo_coords_begin, halo_coords_end ) << std::endl;
+			#endif
+
+					size_t h{0};
+					size_t previous_neighs{ 0 };
+					*halo_coords_begin = h;
+					size_t halo_max_neighs{ neighbors.at( halo_coords_begin ) };
+					//std::cout << "\tinitial halo_max_neighs " << halo_max_neighs << std::endl;
+					while( h < halo && neighbor >= previous_neighs + halo_max_neighs ) {
+						h++;
+						*halo_coords_begin = h;
+						previous_neighs += halo_max_neighs;
+						halo_max_neighs = neighbors.at( halo_coords_begin );
+					}
+			#ifdef DBG
+					std::cout << "- initial halo - neighbour " << neighbor << std::endl;
+					std::cout << "\th " << h << std::endl;
+					std::cout << "\thalo : ";
+					print_sequence( halo_coords_begin, halo_coords_end ) << std::endl;
+					std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
+			#endif
+
+
+					if ( h < halo ){
+						result[dimension] = h;
+						neighbor -= previous_neighs;
+			#ifdef DBG
+						std::cout << "end neighbour " << neighbor << std::endl;
+			#endif
+						continue;
+					}
+					// saturation occurred
+					const size_t distance_from_halo{ ( neighbor - previous_neighs ) / halo_max_neighs };
+			#ifdef DBG
+					std::cout << "- before middle elements - neighbour " << neighbor << std::endl;
+					std::cout << "\tprevious_neighs " << previous_neighs << std::endl;
+					std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
+					std::cout << "\tdistance_from_halo " << distance_from_halo << std::endl;
+					std::cout << "\tdimension_size " << dimension_size << std::endl;
+			#endif
+					if ( distance_from_halo < dimension_size - 2 * halo ) {
+						result[dimension] =  distance_from_halo + halo;
+						neighbor -= (previous_neighs + distance_from_halo * halo_max_neighs) ;
+			#ifdef DBG
+						std::cout << "end neighbour " << neighbor << std::endl;
+			#endif
+						continue;
+					}
+					previous_neighs += ( dimension_size - 2 * halo ) * halo_max_neighs;
+			#ifdef DBG
+					std::cout << "- after middle elements -neighbour " << neighbor << std::endl;
+					std::cout << "\tprevious_neighs " << previous_neighs << std::endl;
+					std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
+			#endif
+
+					h = halo - 1;
+					*halo_coords_begin = h;
+					halo_max_neighs = neighbors.at( halo_coords_begin );
+					while( h > 0 && neighbor >= previous_neighs + halo_max_neighs ) {
+						h--;
+						*halo_coords_begin = h;
+						previous_neighs += halo_max_neighs;
+						halo_max_neighs = neighbors.at( halo_coords_begin );
+					}
+					neighbor -= previous_neighs;
+			#ifdef DBG
+					std::cout << "- final halo - neighbour " << neighbor << std::endl;
+					std::cout << "\tadding h " << h << " previous_neighs " << previous_neighs << std::endl;
+			#endif
+					// ( dimension_size - 1 ) because coordinates are 0-based and neighbor
+					// is "inside" range [ previous_neighs, previous_neighs + halo_max_neighs ]
+					result[dimension] = dimension_size - 1 - h;
+			#ifdef DBG
+					std::cout << "end neighbour " << neighbor << std::endl;
+			#endif
+				}
+
+				return neighbor;
+			}
+
+
+			template< typename CoordType > size_t __accumulate_dimension_neighbours(
+				const NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > >& prev_neighs,
+				CoordType* coords_buffer,
+				size_t halo,
+				size_t local_size ) {
+				size_t neighs{0};
+				size_t h{0};
+				for( ; h < halo && local_size > 1; h++ ) {
+					*coords_buffer = h;
+
+					const size_t local_neighs{ prev_neighs.at( coords_buffer ) };
+					neighs += 2 * local_neighs; // the 2 sides
+					local_size -= 2;
+				}
+				*coords_buffer = h;
+				neighs += local_size * prev_neighs.at( coords_buffer ); // innermost elements
+				return neighs;
+			}
+
+			template< typename CoordType > void __populate_halo_neighbors( size_t halo,
+				NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > >& container ) {
+
+				using it_type = typename NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > >::DomainIterator;
+				it_type end{ container.domain_end() };
+				for( it_type it{ container.domain_begin() }; it != end; ++it ) {
+					size_t res{1};
+					for( size_t h: it->get_position() ) res *= (h + 1 + halo);
+					container.at( it->get_position() ) = res;
+				}
+			}
+
+			template< typename CoordType, size_t DIMS > size_t __init_halo_search(
+				typename LinearizedNDimSystem< CoordType, ArrayVectorStorage< CoordType, DIMS > >::ConstVectorReference sizes,
+				size_t halo,
+				std::vector< NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > > >& dimension_limits ) {
+
+				using nd_vec = NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > >;
+				using nd_vec_iterator = typename nd_vec::DomainIterator;
+
+				std::vector<size_t> halo_sizes( DIMS, halo + 1);
+				dimension_limits.emplace_back(halo_sizes);
+
+				// initialize values
+				__populate_halo_neighbors< CoordType >( halo, dimension_limits[0] );
+				for( size_t i{1}; i < DIMS; i++ ) {
+					std::vector<size_t> halos( DIMS - i, halo + 1 );
+					dimension_limits.emplace_back(halos);
+				}
+
+				std::array< CoordType, DIMS > prev_coords_buffer; // store at most DIMS values
+				CoordType* const prev_coords{ prev_coords_buffer.data() };
+				CoordType* const second{ prev_coords + 1 }; // store previous coordinates from second position
+				for( size_t dimension{1}; dimension < DIMS; dimension++ ) {
+					const nd_vec& prev_neighs{dimension_limits[dimension - 1]};
+					nd_vec& current_neighs{dimension_limits[dimension]};
+
+					nd_vec_iterator end{ current_neighs.domain_end() };
+					for( nd_vec_iterator it{ current_neighs.domain_begin() }; it != end; ++it ) {
+						typename nd_vec::ConstDomainVectorReference current_halo_coords{ it->get_position() };
+
+						std::copy( it->get_position().cbegin(), it->get_position().cend(), second );
+						size_t local_size{ sizes[dimension - 1] };
+						const size_t neighs{ __accumulate_dimension_neighbours(prev_neighs, prev_coords, halo, local_size) };
+						current_neighs.at(current_halo_coords) = neighs;
+					}
+				}
+				return __accumulate_dimension_neighbours( dimension_limits[DIMS - 1], prev_coords, halo, sizes.back() );
+			}
 
 		} // namespace geometry
 	} // namespace utils
 } // namespace grb
 
-#endif // _LINEARIZED_HALO_NDIM_GEOMETRY_H_
+#endif // _H_GRB_ALGORITHMS_GEOMETRY_LINEARIZED_HALO_NDIM_GEOMETRY
diff --git a/include/graphblas/utils/geometry/linearized_halo_ndim_iterator.hpp b/include/graphblas/utils/geometry/linearized_halo_ndim_iterator.hpp
index ede3af52c..9829fdb46 100644
--- a/include/graphblas/utils/geometry/linearized_halo_ndim_iterator.hpp
+++ b/include/graphblas/utils/geometry/linearized_halo_ndim_iterator.hpp
@@ -1,12 +1,29 @@
 
-#ifndef _LINEARIZED_HALO_NDIM_ITERATOR_H_
-#define _LINEARIZED_HALO_NDIM_ITERATOR_H_
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_GEOMETRY_LINEARIZED_HALO_NDIM_ITERATOR
+#define _H_GRB_ALGORITHMS_GEOMETRY_LINEARIZED_HALO_NDIM_ITERATOR
 
 #include <cstddef>
 #include <vector>
 #include <utility>
 #include <iterator>
 #include <limits>
+#include <cstddef>
 
 #include "linearized_ndim_system.hpp"
 #include "array_vector_storage.hpp"
@@ -16,362 +33,353 @@ namespace grb {
 	namespace utils {
 		namespace geometry {
 
-// forward declaration
-template< typename CoordT, std::size_t DIMS > class linearized_halo_ndim_system;
-
-template< typename CoordT, std::size_t DIMS > class linearized_halo_ndim_iterator {
-
-	using system_t = linearized_halo_ndim_system< CoordT, DIMS >;
-	using vector_t = array_vector_storage< CoordT, DIMS >;
-	using vector_iter = linearized_ndim_iterator< CoordT, vector_t >;
-public:
-
-	//using vector_t = typename vector_iter::vector_t;
-	using const_vector_reference = typename vector_iter::const_vector_reference;
-
-
-
-	struct halo_ndim_point {
-	private:
-
-		// for linearization
-		const system_t* _system;
-
-		// for iteration
-		vector_iter _element_iter; // coordinates iterator
-
-		//vector_t* _element;
-		//std::size_t _coordinates_linear;
-		vector_t _neighbor; //the actual neighbor
-		//std::size_t _neighbor_linear;
-		CoordT _position;
-
-	public:
-
-		friend linearized_halo_ndim_iterator< CoordT, DIMS>;
-
-		halo_ndim_point() = delete;
-
-		halo_ndim_point( const halo_ndim_point& ) = default;
-
-		halo_ndim_point( halo_ndim_point&& ) = delete;
-
-		halo_ndim_point( const system_t& system ) noexcept :
-			_system( &system ),
-			_element_iter( system ),
-			_neighbor( DIMS ),
-			_position( 0 )
-		{
-			std::fill_n( this->_neighbor.begin(), DIMS, 0 );
-		}
-
-		halo_ndim_point& operator=( const halo_ndim_point& ) = default;
-
-		//halo_ndim_point& operator=( halo_ndim_point&& ) = delete;
-
-		const_vector_reference get_element() const {
-			return this->_element_iter->get_position();
-		}
-
-		std::size_t get_element_linear() const {
-			return this->_system->ndim_to_linear( this->_element_iter->get_position() );
-		}
-
-		const_vector_reference get_neighbor() const {
-			return this->_neighbor;
-		}
-
-		std::size_t get_neighbor_linear() const {
-			return this->_system->ndim_to_linear( this->_neighbor );
-		}
-
-		CoordT get_position() const {
-			return this->_position;
-		}
-	};
-
-
-
-
-
-
-	using const_point_reference = const struct halo_ndim_point&;
-	using const_point_pointer = const struct halo_ndim_point*;
-
-	// interface for std::random_access_iterator
-	using iterator_category = std::random_access_iterator_tag;
-	using value_type = halo_ndim_point;
-	using pointer = const halo_ndim_point*;
-	using reference = const halo_ndim_point&;
-	using difference_type = signed long;
-
-private:
-
-	halo_ndim_point _point;
-	linearized_ndim_system< CoordT, vector_t > _neighbors_linearizer;
-	vector_iter _neighbor_iter; // iterator in the sub-space of neighbors (0-based)
-	vector_t _neighbors_start;
-	vector_iter _neighbor_end;
-
-	inline void __update_neighbor() {
-		for( std::size_t i{0}; i < DIMS; i++ ) {
-			//(this->_point)._neighbor[i] = this->_neighbors_start[i] + (*(this->_neighbor_iter))[i];
-			this->_point._neighbor[i] = this->_neighbors_start[i] + this->_neighbor_iter->get_position()[i];
-		}
-	}
-
-	/*
-	void __update_neighbor_linear() {
-		(this->_point)._neighbor_linear =
-			this->_system.ndim_to_linear( this->_point._neighbor );
-	}
-	*/
-
-	inline void on_neighbor_iter_update() {
-		this->__update_neighbor();
-		//this->__update_neighbor_linear();
-	}
-
-	/*
-	void __update_coordinates_linear() {
-		(this->_point)._coordinates_linear =
-			this->_system.ndim_to_linear( *this->_element_iter );
-	}
-	*/
-
-	void on_element_update() {
-		//this->__update_coordinates_linear();
-		// reset everything
-		vector_t neighbors_range( DIMS );
-		this->_point._system->compute_neighbors_range(
-			//*(this->_point._element_iter),
-			this->_point._element_iter->get_position(),
-			this->_neighbors_start,
-			neighbors_range
-		);
-		/*
-		std::cout << "\t=== start ";
-		print( this->_neighbors_start ) << " range ";
-		print( neighbors_range )  << std::endl;
-		*/
-		// re-target _neighbors_linearizer
-		this->_neighbors_linearizer.retarget( neighbors_range );
-	}
-
-	void on_element_advance() {
-		this->on_element_update();
-
-		this->_neighbor_iter = vector_iter( this->_neighbors_linearizer );
-		this->_neighbor_end = vector_iter::make_system_end_iterator( this->_neighbors_linearizer );
-
-		this->on_neighbor_iter_update();
-	}
-
-public:
-
-	linearized_halo_ndim_iterator() = delete;
-
-	linearized_halo_ndim_iterator( const system_t& system ) noexcept :
-		_point( system ),
-		_neighbors_linearizer( DIMS, system.halo() + 1 ),
-		_neighbor_iter( this->_neighbors_linearizer ),
-		_neighbors_start( DIMS ),
-		_neighbor_end( vector_iter::make_system_end_iterator( this->_neighbors_linearizer ) )
-	{
-		std::fill_n( this->_neighbors_start.begin(), DIMS, 0 );
-	}
-
-
-	/*
-	linearized_halo_ndim_iterator( const linearized_halo_ndim_iterator< CoordT, DIMS >& original ) noexcept:
-		_coordinates_linearizer( original._coordinates_linearizer ),
-		_halo( original._halo ),
-		_dimension_limits( original._dimension_limits ),
-		_neighbors_linearizer( original._neighbors_linearizer ),
-		_element_iter( original._element_iter ),
-		_neighbor_iter( original._neighbor_iter ),
-		_neighbor_end( original._neighbor_end ),
-		_neighbors_start( original._neighbors_start ),
-		_point( original._point ) {}
-	*/
-
-	linearized_halo_ndim_iterator( const linearized_halo_ndim_iterator< CoordT, DIMS >& ) = default;
-
-	//linearized_halo_ndim_iterator( linearized_halo_ndim_iterator< CoordT, DIMS >&& original ) = delete;
-
-	/*
-	linearized_halo_ndim_iterator< CoordT, DIMS >& operator=(
-		const linearized_halo_ndim_iterator< CoordT, DIMS >& original ) noexcept {
-		this->_coordinates_linearizer = original._coordinates_linearizer;
-		this->_halo = original._halo;
-		this->_dimension_limits = original._dimension_limits;
-		this->_neighbors_linearizer = original._neighbors_linearizer;
-		this->_element_iter = original._element_iter;
-		this->_coordinates_linear = original._coordinates_linear;
-		this->_neighbor_iter = original._neighbor_iter;
-		this->_neighbor_end = original._neighbor_end;
-		this->_neighbor = original._neighbor;
-		this->_neighbors_start = original._neighbors_start;
-		this->_neighbor_linear = original._neighbor_linear;
-	}
-	*/
-
-	linearized_halo_ndim_iterator< CoordT, DIMS >& operator=( const linearized_halo_ndim_iterator< CoordT, DIMS >& ) = default;
-
-	//linearized_halo_ndim_iterator< CoordT, DIMS >& operator=( linearized_halo_ndim_iterator< CoordT, DIMS >&& ) = delete;
-
-	bool operator!=( const linearized_halo_ndim_iterator< CoordT, DIMS >& other ) const {
-		//return (this->_point)._coordinates_linear != (other._point)._coordinates_linear
-		//	|| (this->_point)._neighbor_linear != (other._point)._neighbor_linear;
-		return this->_point._position != other._point._position; // use linear coordinate
-	}
-
-	const_point_reference operator*() const {
-		return this->_point;
-	}
-
-	const_point_pointer operator->() const {
-		return &(this->_point);
-	}
-
-	bool has_more_neighbours() const {
-		return this->_neighbor_iter != this->_neighbor_end;
-	}
-
-	void next_neighbour() {
-		/*
-		std::cout << "sizes: " << this->_neighbors_linearizer.get_sizes()
-			<< " offset " << this->_neighbor_iter->get_position() << " -> "
-			<< this->_neighbors_linearizer.ndim_to_linear_offset( this->_neighbor_iter->get_position() )
-			<< std::endl;
-		*/
-		++(this->_neighbor_iter);
-		this->on_neighbor_iter_update();
-		this->_point._position++;
-	}
-
-	bool has_more_elements() const {
-		return this->_point.get_element_linear() != (this->_point._system)->base_system_size();
-	}
-
-	void next_element() {
-		std::size_t num_neighbours = this->_neighbors_linearizer.system_size();
-		std::size_t neighbour_position_offset =
-			this->_neighbors_linearizer.ndim_to_linear_offset( this->_neighbor_iter->get_position() );
-		// std::cout << " num_neighbours " << num_neighbours << " offset " << neighbour_position_offset << std::endl;
-		++(this->_point._element_iter);
-		this->on_element_advance();
-		// this->_point._position++;
-		this->_point._position -= neighbour_position_offset;
-		this->_point._position += num_neighbours;
-	}
-
-	linearized_halo_ndim_iterator< CoordT, DIMS >& operator++() noexcept {
-		++(this->_neighbor_iter);
-		if( !has_more_neighbours() ) {
-			++(this->_point._element_iter);
-			//this->_coordinates_linear = this->_coordinates_linearizer.ndim_to_linear( this->_element_iter );
-			this->on_element_advance();
-
-		} else {
-			this->on_neighbor_iter_update();
-		}
-		this->_point._position++;
-		return *this;
-	}
-
-
-
-	linearized_halo_ndim_iterator< CoordT, DIMS >& operator+=( std::size_t offset ) {
-		if( offset == 1UL ) {
-			return this->operator++();
-		}
-		const std::size_t final_position { this->_point._position + offset };
-		if( final_position > this->_point._system->halo_system_size() ) {
-			throw std::range_error( "neighbor linear value beyond system" );
-		}
-		vector_t final_element( DIMS );
-		std::size_t neighbor_index{ (this->_point._system->neighbour_linear_to_element( final_position, final_element )) };
-
-		// std::cout << "\t=== element " << offset << " -- ";
-		// std::cout << final_element[0] << " " << final_element[0] << std::endl;
-
-		this->_point._element_iter = vector_iter( *this->_point._system, final_element.cbegin() );
-		//this->_point._element = &( *this->_element_iter );
-		this->_point._position = final_position;
-
-		this->on_element_update();
-		this->_neighbors_linearizer.linear_to_ndim( neighbor_index, final_element );
-
-		this->_neighbor_iter = vector_iter( this->_neighbors_linearizer, final_element.cbegin() );
-		this->_neighbor_end = vector_iter::make_system_end_iterator( this->_neighbors_linearizer );
-		this->on_neighbor_iter_update();
-
-		return *this;
-	}
-
-	difference_type operator-( const linearized_halo_ndim_iterator< CoordT, DIMS >& other ) const {
-		/*
-		if( _point.get_position() < a_point.get_position() ) {
-			throw std::invalid_argument( "first iterator is in a lower position than second" );
-		}
-		*/
-		std::size_t a_pos{ _point.get_position() }, b_pos{ other._point.get_position() };
-		// std::cout << "diff " << a_pos << " - " << b_pos << std::endl;
-		std::size_t lowest{ std::min( a_pos, b_pos ) }, highest{ std::max( a_pos, b_pos )};
-		using diff_t = typename linearized_halo_ndim_iterator< CoordT, DIMS >::difference_type;
-
-		if( highest - lowest > static_cast< std::size_t >(
-			std::numeric_limits< diff_t >::max() ) ) {
-			throw std::invalid_argument( "iterators are too distant" );
-		}
-
-		return ( static_cast< diff_t >( a_pos - b_pos ) );
-	}
-
-
-
-
-	// implementation depending on logic in operator++
-	static linearized_halo_ndim_iterator< CoordT, DIMS > make_system_end_iterator(
-		const system_t& system
-	) {
-		linearized_halo_ndim_iterator< CoordT, DIMS > result( system );
-
-		/*
-		std::cout << "result 0: element ";
-		print(result->get_element()) << " neighbor ";
-		print(result->get_neighbor())  << std::endl;
-		*/
-
-		// go to the very first point outside of space
-		result._point._element_iter = vector_iter::make_system_end_iterator( system );
-		/*
-		std::cout << "result 1: element ";
-		print(result->get_element()) << " neighbor ";
-		print(result->get_neighbor())  << std::endl;
-		*/
-
-		result.on_element_advance();
-		result._point._position = system.halo_system_size();
-		//std::cout << "got sys size " << system.halo_system_size() << std::endl;
-
-		return result;
-	}
-
-};
-
-/*
-template< typename CoordT, std::size_t DIMS > linearized_halo_ndim_iterator< CoordT, DIMS >
-	operator+( const linearized_halo_ndim_iterator< CoordT, DIMS >& original, std::size_t increment ) {
-	linearized_halo_ndim_iterator< CoordT, DIMS > res( original );
-	return ( res += increment );
-}
-*/
+			// forward declaration: the iterator below keeps a pointer to its system
+			template<
+				typename SizeType,
+				size_t DIMS
+			> class LinearizedHaloNDimSystem;
+
+			template<
+				typename SizeType,
+				size_t DIMS
+			> class LinearizedHaloNDimIterator { // iterates over (element, neighbor) pairs of a LinearizedHaloNDimSystem
+
+				using SystemType = LinearizedHaloNDimSystem< SizeType, DIMS >;
+				using VectorType = ArrayVectorStorage< SizeType, DIMS >;
+				using VectorIteratorType = LinearizedNDimIterator< SizeType, VectorType >;
+
+			public:
+				//using VectorType = typename VectorIteratorType::VectorType;
+				using ConstVectorReference = typename VectorIteratorType::ConstVectorReference;
+				using SelfType = LinearizedHaloNDimIterator< SizeType, DIMS >;
+
+				struct HaloNDimElement { // value exposed by the iterator: current element, current neighbor, linear position
+				private:
+
+					// system used to linearize coordinates (see get_element_linear / get_neighbor_linear)
+					const SystemType* _system;
+
+					// for iteration
+					VectorIteratorType _element_iter; // iterator over the coordinates of the current element
+
+					//VectorType* _element;
+					//size_t _coordinates_linear;
+					VectorType _neighbor; // coordinates of the current neighbor
+					//size_t _neighbor_linear;
+					SizeType _position; // linear position of the (element, neighbor) pair; used for iterator comparison
+
+				public:
+					friend SelfType;
+
+					HaloNDimElement() = delete;
+
+					HaloNDimElement( const HaloNDimElement& ) = default;
+
+					HaloNDimElement( HaloNDimElement&& ) = delete;
+
+					HaloNDimElement( const SystemType& system ) noexcept : // starts at position 0 with zeroed neighbor coordinates
+						_system( &system ),
+						_element_iter( system ),
+						_neighbor( DIMS ),
+						_position( 0 )
+					{
+						std::fill_n( this->_neighbor.begin(), DIMS, 0 );
+					}
+
+					HaloNDimElement& operator=( const HaloNDimElement& ) = default;
+
+					//HaloNDimElement& operator=( HaloNDimElement&& ) = delete;
+
+					ConstVectorReference get_element() const { // N-dimensional coordinates of the current element
+						return this->_element_iter->get_position();
+					}
+
+					size_t get_element_linear() const { // element coordinates linearized through the system
+						return this->_system->ndim_to_linear( this->_element_iter->get_position() );
+					}
+
+					ConstVectorReference get_neighbor() const { // N-dimensional coordinates of the current neighbor
+						return this->_neighbor;
+					}
+
+					size_t get_neighbor_linear() const { // neighbor coordinates linearized through the system
+						return this->_system->ndim_to_linear( this->_neighbor );
+					}
+
+					SizeType get_position() const { // linear position of the (element, neighbor) pair
+						return this->_position;
+					}
+				};
+
+				// iterator traits; the category is declared as random access
+				using iterator_category = std::random_access_iterator_tag;
+				using value_type = HaloNDimElement;
+				using pointer = const HaloNDimElement*;
+				using reference = const HaloNDimElement&;
+				using difference_type = signed long;
+
+			private:
+				HaloNDimElement _point; // the value returned by operator* and operator->
+				LinearizedNDimSystem< SizeType, VectorType > _neighbors_linearizer; // re-targeted to the neighbor range of each element
+				VectorIteratorType _neighbor_iter; // iterator in the sub-space of neighbors (0-based)
+				VectorType _neighbors_start; // start coordinates of the neighbor range of the current element
+				VectorIteratorType _neighbor_end; // past-the-end iterator of the neighbor sub-space
+
+				inline void __update_neighbor() { // neighbor = range start + 0-based offset of _neighbor_iter
+					for( size_t i{0}; i < DIMS; i++ ) {
+						//(this->_point)._neighbor[i] = this->_neighbors_start[i] + (*(this->_neighbor_iter))[i];
+						this->_point._neighbor[i] = this->_neighbors_start[i] + this->_neighbor_iter->get_position()[i];
+					}
+				}
+
+				/*
+				void __update_neighbor_linear() {
+					(this->_point)._neighbor_linear =
+						this->_system.ndim_to_linear( this->_point._neighbor );
+				}
+				*/
+
+				inline void on_neighbor_iter_update() { // to be called after every change of _neighbor_iter
+					this->__update_neighbor();
+					//this->__update_neighbor_linear();
+				}
+
+				/*
+				void __update_coordinates_linear() {
+					(this->_point)._coordinates_linear =
+						this->_system.ndim_to_linear( *this->_element_iter );
+				}
+				*/
+
+				void on_element_update() { // to be called after every change of _point._element_iter
+					//this->__update_coordinates_linear();
+					// recompute start and range of the neighbors of the current element
+					VectorType neighbors_range( DIMS );
+					this->_point._system->compute_neighbors_range(
+						//*(this->_point._element_iter),
+						this->_point._element_iter->get_position(),
+						this->_neighbors_start,
+						neighbors_range
+					);
+					/*
+					std::cout << "\t=== start ";
+					print( this->_neighbors_start ) << " range ";
+					print( neighbors_range )  << std::endl;
+					*/
+					// re-target _neighbors_linearizer to the newly computed range
+					this->_neighbors_linearizer.retarget( neighbors_range );
+				}
+
+				void on_element_advance() { // refresh the neighbor range, then restart the neighbor iteration
+					this->on_element_update();
+
+					this->_neighbor_iter = VectorIteratorType( this->_neighbors_linearizer );
+					this->_neighbor_end = VectorIteratorType::make_system_end_iterator( this->_neighbors_linearizer );
+
+					this->on_neighbor_iter_update();
+				}
+
+			public:
+
+				LinearizedHaloNDimIterator() = delete;
+
+				LinearizedHaloNDimIterator( const SystemType& system ) noexcept : // begin iterator: position 0, zeroed coordinates
+					_point( system ),
+					_neighbors_linearizer( DIMS, system.halo() + 1 ),
+					_neighbor_iter( this->_neighbors_linearizer ),
+					_neighbors_start( DIMS ),
+					_neighbor_end( VectorIteratorType::make_system_end_iterator( this->_neighbors_linearizer ) )
+				{
+					std::fill_n( this->_neighbors_start.begin(), DIMS, 0 );
+				}
+
+
+				/*
+				LinearizedHaloNDimIterator( const LinearizedHaloNDimIterator< SizeType, DIMS >& original ) noexcept:
+					_coordinates_linearizer( original._coordinates_linearizer ),
+					_halo( original._halo ),
+					_dimension_limits( original._dimension_limits ),
+					_neighbors_linearizer( original._neighbors_linearizer ),
+					_element_iter( original._element_iter ),
+					_neighbor_iter( original._neighbor_iter ),
+					_neighbor_end( original._neighbor_end ),
+					_neighbors_start( original._neighbors_start ),
+					_point( original._point ) {}
+				*/
+
+				LinearizedHaloNDimIterator( const SelfType & ) = default;
+
+				//LinearizedHaloNDimIterator( SelfType &&original ) = delete;
+
+				/*
+				LinearizedHaloNDimIterator< SizeType, DIMS >& operator=(
+					const LinearizedHaloNDimIterator< SizeType, DIMS >& original ) noexcept {
+					this->_coordinates_linearizer = original._coordinates_linearizer;
+					this->_halo = original._halo;
+					this->_dimension_limits = original._dimension_limits;
+					this->_neighbors_linearizer = original._neighbors_linearizer;
+					this->_element_iter = original._element_iter;
+					this->_coordinates_linear = original._coordinates_linear;
+					this->_neighbor_iter = original._neighbor_iter;
+					this->_neighbor_end = original._neighbor_end;
+					this->_neighbor = original._neighbor;
+					this->_neighbors_start = original._neighbors_start;
+					this->_neighbor_linear = original._neighbor_linear;
+				}
+				*/
+
+				SelfType & operator=( const SelfType & ) = default;
+
+				//SelfType & operator=( SelfType && ) = delete;
+
+				bool operator!=( const SelfType &other ) const { // only the linear positions are compared
+					//return (this->_point)._coordinates_linear != (other._point)._coordinates_linear
+					//	|| (this->_point)._neighbor_linear != (other._point)._neighbor_linear;
+					return this->_point._position != other._point._position; // use linear coordinate
+				}
+
+				reference operator*() const {
+					return this->_point;
+				}
+
+				pointer operator->() const {
+					return &(this->_point);
+				}
+
+				bool has_more_neighbours() const { // true until the neighbor iterator reaches the end of the current range
+					return this->_neighbor_iter != this->_neighbor_end;
+				}
+
+				void next_neighbour() { // moves to the next neighbor of the same element, without checking the range end
+					/*
+					std::cout << "sizes: " << this->_neighbors_linearizer.get_sizes()
+						<< " offset " << this->_neighbor_iter->get_position() << " -> "
+						<< this->_neighbors_linearizer.ndim_to_linear_offset( this->_neighbor_iter->get_position() )
+						<< std::endl;
+					*/
+					++(this->_neighbor_iter);
+					this->on_neighbor_iter_update();
+					this->_point._position++;
+				}
+
+				bool has_more_elements() const { // false once the element's linear index equals the base system size
+					return this->_point.get_element_linear() != (this->_point._system)->base_system_size();
+				}
+
+				void next_element() { // skips the remaining neighbors and moves to the first neighbor of the next element
+					size_t num_neighbours = this->_neighbors_linearizer.system_size(); // taken before advancing, i.e., for the current element
+					size_t neighbour_position_offset =
+						this->_neighbors_linearizer.ndim_to_linear_offset( this->_neighbor_iter->get_position() );
+					// std::cout << " num_neighbours " << num_neighbours << " offset " << neighbour_position_offset << std::endl;
+					++(this->_point._element_iter);
+					this->on_element_advance();
+					// this->_point._position++;
+					this->_point._position -= neighbour_position_offset; // back to the first neighbor of the previous element
+					this->_point._position += num_neighbours; // forward past all neighbors of the previous element
+				}
+
+				SelfType & operator++() noexcept { // next neighbor, or first neighbor of the next element once the range is exhausted
+					++(this->_neighbor_iter);
+					if( !has_more_neighbours() ) {
+						++(this->_point._element_iter);
+						//this->_coordinates_linear = this->_coordinates_linearizer.ndim_to_linear( this->_element_iter );
+						this->on_element_advance();
+
+					} else {
+						this->on_neighbor_iter_update();
+					}
+					this->_point._position++;
+					return *this;
+				}
+
+				SelfType & operator+=( size_t offset ) { // jumps offset positions forward; throws std::range_error beyond halo_system_size()
+					if( offset == 1UL ) { // single step: reuse the incremental path
+						return this->operator++();
+					}
+					const size_t final_position { this->_point._position + offset };
+					if( final_position > this->_point._system->halo_system_size() ) {
+						throw std::range_error( "neighbor linear value beyond system" );
+					}
+					VectorType final_element( DIMS );
+					size_t neighbor_index{ (this->_point._system->neighbour_linear_to_element( final_position, final_element )) };
+
+					// std::cout << "\t=== element " << offset << " -- ";
+					// std::cout << final_element[0] << " " << final_element[0] << std::endl;
+
+					this->_point._element_iter = VectorIteratorType( *this->_point._system, final_element.cbegin() );
+					//this->_point._element = &( *this->_element_iter );
+					this->_point._position = final_position;
+
+					this->on_element_update(); // refresh the neighbor range for the new element
+					this->_neighbors_linearizer.linear_to_ndim( neighbor_index, final_element ); // final_element is reused for the neighbor offset
+
+					this->_neighbor_iter = VectorIteratorType( this->_neighbors_linearizer, final_element.cbegin() );
+					this->_neighbor_end = VectorIteratorType::make_system_end_iterator( this->_neighbors_linearizer );
+					this->on_neighbor_iter_update();
+
+					return *this;
+				}
+
+				difference_type operator-( const SelfType &other ) const { // signed distance; throws std::invalid_argument if too large for difference_type
+					/*
+					if( _point.get_position() < a_point.get_position() ) {
+						throw std::invalid_argument( "first iterator is in a lower position than second" );
+					}
+					*/
+					size_t a_pos{ _point.get_position() }, b_pos{ other._point.get_position() };
+					// std::cout << "diff " << a_pos << " - " << b_pos << std::endl;
+					size_t lowest{ std::min( a_pos, b_pos ) }, highest{ std::max( a_pos, b_pos )};
+					using diff_t = typename LinearizedHaloNDimIterator< SizeType, DIMS >::difference_type;
+
+					if( highest - lowest > static_cast< size_t >(
+						std::numeric_limits< diff_t >::max() ) ) {
+						throw std::invalid_argument( "iterators are too distant" );
+					}
+
+					return ( static_cast< diff_t >( a_pos - b_pos ) ); // unsigned difference converted to the signed type
+				}
+
+
+
+
+				// builds the past-the-end iterator; the implementation depends on the logic in operator++
+				static SelfType make_system_end_iterator( const SystemType& system ) {
+					SelfType result( system );
+
+					/*
+					std::cout << "result 0: element ";
+					print(result->get_element()) << " neighbor ";
+					print(result->get_neighbor())  << std::endl;
+					*/
+
+					// go to the very first point outside of space
+					result._point._element_iter = VectorIteratorType::make_system_end_iterator( system );
+					/*
+					std::cout << "result 1: element ";
+					print(result->get_element()) << " neighbor ";
+					print(result->get_neighbor())  << std::endl;
+					*/
+
+					result.on_element_advance();
+					result._point._position = system.halo_system_size(); // the end position equals the halo system size
+					//std::cout << "got sys size " << system.halo_system_size() << std::endl;
+
+					return result;
+				}
+
+			};
+
+			/*
+			template< typename SizeType, size_t DIMS > LinearizedHaloNDimIterator< SizeType, DIMS >
+				operator+( const LinearizedHaloNDimIterator< SizeType, DIMS >& original, size_t increment ) {
+				LinearizedHaloNDimIterator< SizeType, DIMS > res( original );
+				return ( res += increment );
+			}
+			*/
 
 
 		} // namespace geometry
 	} // namespace utils
 } // namespace grb
 
-#endif // _LINEARIZED_HALO_NDIM_ITERATOR_H_
+#endif // _H_GRB_ALGORITHMS_GEOMETRY_LINEARIZED_HALO_NDIM_ITERATOR
diff --git a/include/graphblas/utils/geometry/linearized_halo_ndim_system.hpp b/include/graphblas/utils/geometry/linearized_halo_ndim_system.hpp
index f915492ac..af296cc9f 100644
--- a/include/graphblas/utils/geometry/linearized_halo_ndim_system.hpp
+++ b/include/graphblas/utils/geometry/linearized_halo_ndim_system.hpp
@@ -1,11 +1,12 @@
 
-#ifndef _LINEARIZED_HALO_NDIM_SYSTEM_H_
-#define _LINEARIZED_HALO_NDIM_SYSTEM_H_
+#ifndef _H_GRB_ALGORITHMS_GEOMETRY_LINEARIZED_HALO_NDIM_SYSTEM
+#define _H_GRB_ALGORITHMS_GEOMETRY_LINEARIZED_HALO_NDIM_SYSTEM
 
 #include <cstddef>
 #include <vector>
 #include <array>
 #include <cassert>
+#include <cstddef>
 
 #include "array_vector_storage.hpp"
 #include "linearized_ndim_system.hpp"
@@ -16,96 +17,100 @@ namespace grb {
 	namespace utils {
 		namespace geometry {
 
-// only with array_vector_storage
-template< typename CoordT, std::size_t DIMS > class linearized_halo_ndim_system:
-	public linearized_ndim_system< CoordT, array_vector_storage< CoordT, DIMS > > {
-public:
-
-	using iterator = linearized_halo_ndim_iterator< CoordT, DIMS >;
-    using const_vector_reference = typename array_vector_storage< CoordT, DIMS >::const_vector_storage;
-	using self_t = linearized_halo_ndim_system< CoordT, DIMS >;
-	using base_t = linearized_ndim_system< CoordT, array_vector_storage< CoordT, DIMS > >;
-
-    linearized_halo_ndim_system( const_vector_reference sizes, CoordT halo ):
-		base_t( sizes.cbegin(), sizes.cend() ),
-        _halo( halo ) {
-
-		for( CoordT __size : sizes ) {
-			if ( __size < 2 * halo + 1 ) {
-				throw std::invalid_argument(
-					std::string( "the halo (" + std::to_string(halo) +
-					std::string( ") goes beyond a system size (" ) +
-					std::to_string( __size) + std::string( ")" ) ) );
-			}
-		}
-
-        this->_system_size = __init_halo_search< CoordT, DIMS >(
-				this->get_sizes(),
-				_halo, this->_dimension_limits );
-		assert( this->_dimension_limits.size() == DIMS );
-    }
-
-    linearized_halo_ndim_system() = delete;
-
-    linearized_halo_ndim_system( const self_t & ) = default;
-
-    linearized_halo_ndim_system( self_t && ) = delete;
-
-    ~linearized_halo_ndim_system() noexcept {}
-
-    self_t & operator=( const self_t & ) = default;
-
-    self_t & operator=( self_t && ) = delete;
-
-	iterator begin() const {
-		return iterator( *this );
-	}
-
-	iterator end() const {
-		return iterator::make_system_end_iterator( *this );
-	}
-
-	std::size_t halo_system_size() const {
-		return this->_system_size;
-	}
-
-	std::size_t base_system_size() const {
-		return this->base_t::system_size();
-	}
-
-    std::size_t halo() const {
-        return this->_halo;
-    }
-
-    void compute_neighbors_range(
-        const array_vector_storage< CoordT, DIMS >& system_coordinates,
-	    array_vector_storage< CoordT, DIMS >& neighbors_start,
-	    array_vector_storage< CoordT, DIMS >& neighbors_range) const noexcept {
-        __compute_neighbors_range( this->get_sizes(),
-            this->_halo,
-            system_coordinates,
-            neighbors_start,
-            neighbors_range
-        );
-    }
-
-    std::size_t neighbour_linear_to_element (
-        CoordT neighbor,
-	    array_vector_storage< CoordT, DIMS > & result) const noexcept {
-        return __neighbour_to_system_coords( this->get_sizes(),
-        this->_system_size, this->_dimension_limits, this->_halo, neighbor, result );
-    }
-
-private:
-
-    const CoordT _halo;
-    std::vector< ndim_vector< CoordT, CoordT, generic_vector_storage< CoordT > > > _dimension_limits;
-    std::size_t _system_size;
-
-};
+			// LinearizedNDimSystem extended with a halo and neighbor-range queries; only with ArrayVectorStorage
+			template<
+				typename SizeType,
+				size_t DIMS
+			> class LinearizedHaloNDimSystem:
+				public LinearizedNDimSystem< SizeType, ArrayVectorStorage< SizeType, DIMS > > {
+			public:
+
+				using VectorType = ArrayVectorStorage< SizeType, DIMS >;
+				using ConstVectorStorageType = typename VectorType::ConstVectorStorageType;
+				using SelfType = LinearizedHaloNDimSystem< SizeType, DIMS >;
+				using BaseType = LinearizedNDimSystem< SizeType, VectorType >;
+				using Iterator = LinearizedHaloNDimIterator< SizeType, DIMS >;
+
+				LinearizedHaloNDimSystem( ConstVectorStorageType sizes, SizeType halo ): // throws std::invalid_argument if any size < 2 * halo + 1
+					BaseType( sizes.cbegin(), sizes.cend() ),
+					_halo( halo ) {
+
+					for( SizeType __size : sizes ) {
+						if ( __size < 2 * halo + 1 ) {
+							throw std::invalid_argument(
+								std::string( "the halo (" + std::to_string(halo) +
+								std::string( ") goes beyond a system size (" ) +
+								std::to_string( __size) + std::string( ")" ) ) );
+						}
+					}
+
+					this->_system_size = __init_halo_search< SizeType, DIMS >(
+							this->get_sizes(),
+							_halo, this->_dimension_limits );
+					assert( this->_dimension_limits.size() == DIMS ); // one entry per dimension is expected after initialization
+				}
+
+				LinearizedHaloNDimSystem() = delete;
+
+				LinearizedHaloNDimSystem( const SelfType & ) = default;
+
+				LinearizedHaloNDimSystem( SelfType && ) = delete;
+
+				~LinearizedHaloNDimSystem() noexcept {}
+
+				SelfType & operator=( const SelfType & ) = default;
+
+				SelfType & operator=( SelfType && ) = delete;
+
+				Iterator begin() const { // iterator at the first (element, neighbor) position
+					return Iterator( *this );
+				}
+
+				Iterator end() const { // past-the-end iterator of this system
+					return Iterator::make_system_end_iterator( *this );
+				}
+
+				size_t halo_system_size() const { // size computed by __init_halo_search at construction
+					return this->_system_size;
+				}
+
+				size_t base_system_size() const { // system_size() of the base LinearizedNDimSystem
+					return this->BaseType::system_size();
+				}
+
+				size_t halo() const { // halo passed at construction, returned as size_t
+					return this->_halo;
+				}
+
+				void compute_neighbors_range( // forwards to __compute_neighbors_range with the sizes and halo of this system
+					const VectorType &system_coordinates,
+					VectorType &neighbors_start,
+					VectorType &neighbors_range) const noexcept {
+					__compute_neighbors_range( this->get_sizes(),
+						this->_halo,
+						system_coordinates,
+						neighbors_start,
+						neighbors_range
+					);
+				}
+
+				size_t neighbour_linear_to_element ( // forwards to __neighbour_to_system_coords, passing result as output argument
+					SizeType neighbor,
+					VectorType &result) const noexcept {
+					return __neighbour_to_system_coords( this->get_sizes(),
+					this->_system_size, this->_dimension_limits, this->_halo, neighbor, result );
+				}
+
+			private:
+
+				const SizeType _halo; // halo passed at construction
+				std::vector< NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > > > _dimension_limits; // initialized via __init_halo_search
+				size_t _system_size; // value returned by __init_halo_search
+
+			};
 
 		} // namespace geometry
 	} // namespace utils
 } // namespace grb
 
-#endif // _LINEARIZED_HALO_NDIM_SYSTEM_H_
+#endif // _H_GRB_ALGORITHMS_GEOMETRY_LINEARIZED_HALO_NDIM_SYSTEM
diff --git a/include/graphblas/utils/geometry/linearized_ndim_iterator.hpp b/include/graphblas/utils/geometry/linearized_ndim_iterator.hpp
index 20a6473cc..60f424164 100644
--- a/include/graphblas/utils/geometry/linearized_ndim_iterator.hpp
+++ b/include/graphblas/utils/geometry/linearized_ndim_iterator.hpp
@@ -1,178 +1,197 @@
 
-#ifndef _NDIM_ITERATOR_H_
-#define _NDIM_ITERATOR_H_
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_GEOMETRY_NDIM_ITERATOR
+#define _H_GRB_ALGORITHMS_GEOMETRY_NDIM_ITERATOR
 
 #include <cstddef>
 #include <algorithm>
 #include <stdexcept>
 #include <type_traits>
 #include <limits>
+#include <cstddef>
 
 #include "array_vector_storage.hpp"
 
-
 namespace grb {
 	namespace utils {
 		namespace geometry {
 
-// forward declaration for default
-template< typename T, typename StorageT > class linearized_ndim_system;
-
-template< typename T, typename StorageT > class linearized_ndim_iterator {
-public:
-
-	using storage_t = StorageT;
-	using lin_t = linearized_ndim_system< T, storage_t >;
-	using const_vector_reference = const storage_t&;
-	using self_t = linearized_ndim_iterator< T, StorageT >;
-
-	struct ndim_point {
-	private:
-
-		const lin_t* system; // pointer because of copy assignment
-		storage_t coords;
-
-	public:
-
-		friend self_t;
-
-		ndim_point() = delete;
-
-		ndim_point( const ndim_point& ) = default;
+			// forward declaration: the iterator below keeps a pointer to its system
+			template<
+				typename SizeType,
+				typename InternalVectorType
+			> class LinearizedNDimSystem;
 
-		ndim_point( ndim_point&& ) = delete;
-
-		ndim_point( const lin_t& _system ) noexcept :
-			system( &_system ),
-			coords( _system.dimensions() )
-		{
-			std::fill_n( this->coords.begin(), _system.dimensions(), 0 );
-		}
-
-		ndim_point& operator=( const ndim_point& ) = default;
-
-		inline const_vector_reference get_position() const {
-			return coords;
-		}
-
-		std::size_t get_linear_position() const {
-			return system->ndim_to_linear( coords );
-		}
-	};
-
-
-	// interface for std::random_access_iterator
-	using iterator_category = std::random_access_iterator_tag;
-	using value_type = ndim_point;
-	using pointer = const value_type*;
-	using reference = const value_type&;
-	using difference_type = signed long;
-
-	linearized_ndim_iterator( const lin_t &_system ) noexcept :
-		_p( _system )
-	{}
-
-	template< typename IterT > linearized_ndim_iterator( const lin_t &_system, IterT begin ) noexcept :
-		_p( _system )
-	{
-		std::copy_n( begin, _system.dimensions(), this->_p.coords.begin() );
-	}
-
-	linearized_ndim_iterator() = delete;
-
-	linearized_ndim_iterator( const self_t& original ):
-		_p( original._p ) {}
-
-	self_t& operator=( const self_t& original ) = default;
-
-	//linearized_ndim_iterator( self_t&& original ) = delete;
-
-	//self_t operator=( self_t&& ) = delete;
-
-	~linearized_ndim_iterator() {}
-
-    self_t & operator++() noexcept {
-		bool rewind{ true };
-		// rewind only the first N-1 coordinates
-		for( std::size_t i { 0 }; i < this->_p.system->dimensions() - 1 && rewind; i++ ) {
-			T& coord = this->_p.coords[ i ];
-			// must rewind dimension if we wrap-around
-			/*
-			T new_coord = ( coord + 1 ) % this->_p.system->get_sizes()[ i ];
-			rewind = new_coord < coord;
-			coord = new_coord;
-			*/
-			T plus = coord + 1;
-			rewind = plus >= this->_p.system->get_sizes()[ i ];
-			coord = rewind ? 0 : plus;
-		}
-		// if we still have to rewind, increment the last coordinate, which is unbounded
-		if( rewind ) {
-			this->_p.coords[ this->_p.system->dimensions() - 1 ]++;
-		}
-		return *this;
-	}
-
-    self_t & operator+=( std::size_t offset ) {
-		std::size_t linear{ _p.get_linear_position() + offset };
-		if( linear > _p.system->system_size() ) {
-			throw std::invalid_argument("increment is too large");
-		}
-		_p.system->linear_to_ndim( linear, _p.coords );
-		return *this;
-	}
-
-	difference_type operator-( const self_t &other ) const {
-		std::size_t a_pos{ _p.get_linear_position() },
-			b_pos{ other._p.get_linear_position() };
-		std::size_t lowest{ std::min( a_pos, b_pos ) }, highest{ std::max( a_pos, b_pos )};
-
-		if( highest - lowest > static_cast< std::size_t >(
-			std::numeric_limits< difference_type >::max() ) ) {
-			throw std::invalid_argument( "iterators are too distant" );
-		}
-
-		return ( static_cast< difference_type >( a_pos - b_pos ) );
-	}
-
-	reference operator*() const {
-        return this->_p;
-    }
-
-	pointer operator->() const {
-		return &( this->_p );
-	}
-
-    bool operator!=( const self_t &o ) const {
-		const std::size_t dims{ this->_p.system->dimensions() };
-		if( dims != o._p.system->dimensions() ) {
-			throw std::invalid_argument("system sizes do not match");
-		}
-        bool equal{ true };
-		for( std::size_t i{0}; i < dims && equal; i++) {
-			equal &= ( this->_p.coords[i] == o._p.coords[i] );
-		}
-		return !equal;
-    }
-
-	// implementation depending on logic in operator++
-	static self_t
-		make_system_end_iterator( const lin_t &_system ) {
-		// fill with 0s
-		self_t iter( _system );
-		std::size_t last{ iter->system->dimensions() - 1 };
-		// store last size in last position
-		iter._p.coords[ last ] = iter->system->get_sizes()[ last ];
-		return iter;
-	}
-
-private:
-	ndim_point _p;
-
-};
+			template<
+				typename SizeType,
+				typename InternalVectorType
+			> class LinearizedNDimIterator { // iterates over the points of a LinearizedNDimSystem, coordinate 0 changing first
+			public:
+
+				using VectorType = InternalVectorType;
+				using LinNDimSysType = LinearizedNDimSystem< SizeType, VectorType >;
+				using ConstVectorReference = const VectorType&;
+				using SelfType = LinearizedNDimIterator< SizeType, InternalVectorType >;
+
+				struct NDimPoint { // value exposed by the iterator: coordinates plus the system they belong to
+				private:
+
+					const LinNDimSysType* system; // pointer because of copy assignment
+					VectorType coords; // current N-dimensional coordinates
+
+				public:
+
+					friend SelfType;
+
+					NDimPoint() = delete;
+
+					NDimPoint( const NDimPoint& ) = default;
+
+					NDimPoint( NDimPoint&& ) = delete;
+
+					NDimPoint( const LinNDimSysType& _system ) noexcept : // starts with all coordinates set to 0
+						system( &_system ),
+						coords( _system.dimensions() )
+					{
+						std::fill_n( this->coords.begin(), _system.dimensions(), 0 );
+					}
+
+					NDimPoint& operator=( const NDimPoint& ) = default;
+
+					inline ConstVectorReference get_position() const { // N-dimensional coordinates of the point
+						return coords;
+					}
+
+					size_t get_linear_position() const { // coordinates linearized through the system
+						return system->ndim_to_linear( coords );
+					}
+				};
+
+
+				// iterator traits; the category is declared as random access
+				using iterator_category = std::random_access_iterator_tag;
+				using value_type = NDimPoint;
+				using pointer = const value_type*;
+				using reference = const value_type&;
+				using difference_type = signed long;
+
+				LinearizedNDimIterator( const LinNDimSysType &_system ) noexcept : // iterator with all coordinates set to 0
+					_p( _system )
+				{}
+
+				template< typename IterT > LinearizedNDimIterator( const LinNDimSysType &_system, IterT begin ) noexcept : // coordinates copied from begin
+					_p( _system )
+				{
+					std::copy_n( begin, _system.dimensions(), this->_p.coords.begin() );
+				}
+
+				LinearizedNDimIterator() = delete;
+
+				LinearizedNDimIterator( const SelfType &original ):
+					_p( original._p ) {}
+
+				SelfType& operator=( const SelfType &original ) = default;
+
+				// LinearizedNDimIterator( SelfType && ) = delete;
+
+				// SelfType operator=( SelfType && ) = delete;
+
+				~LinearizedNDimIterator() {}
+
+				SelfType & operator++() noexcept { // increments coordinate 0, carrying into higher coordinates on wrap-around
+					bool rewind{ true };
+					// rewind only the first N-1 coordinates
+					for( size_t i { 0 }; i < this->_p.system->dimensions() - 1 && rewind; i++ ) {
+						SizeType& coord = this->_p.coords[ i ];
+						// must rewind dimension if we wrap-around
+						/*
+						SizeType new_coord = ( coord + 1 ) % this->_p.system->get_sizes()[ i ];
+						rewind = new_coord < coord;
+						coord = new_coord;
+						*/
+						SizeType plus = coord + 1;
+						rewind = plus >= this->_p.system->get_sizes()[ i ];
+						coord = rewind ? 0 : plus;
+					}
+					// if we still have to rewind, increment the last coordinate, which is unbounded
+					if( rewind ) {
+						this->_p.coords[ this->_p.system->dimensions() - 1 ]++;
+					}
+					return *this;
+				}
+
+				SelfType & operator+=( size_t offset ) { // jumps offset linear positions forward; throws std::invalid_argument beyond system_size()
+					size_t linear{ _p.get_linear_position() + offset };
+					if( linear > _p.system->system_size() ) {
+						throw std::invalid_argument("increment is too large");
+					}
+					_p.system->linear_to_ndim( linear, _p.coords );
+					return *this;
+				}
+
+				difference_type operator-( const SelfType &other ) const { // signed distance; throws std::invalid_argument if too large for difference_type
+					size_t a_pos{ _p.get_linear_position() },
+						b_pos{ other._p.get_linear_position() };
+					size_t lowest{ std::min( a_pos, b_pos ) }, highest{ std::max( a_pos, b_pos )};
+					if( highest - lowest > static_cast< size_t >(
+						std::numeric_limits< difference_type >::max() ) ) {
+						throw std::invalid_argument( "iterators are too distant" );
+					}
+					return ( static_cast< difference_type >( a_pos - b_pos ) ); // unsigned difference converted to the signed type
+				}
+
+				reference operator*() const {
+					return this->_p;
+				}
+
+				pointer operator->() const {
+					return &( this->_p );
+				}
+
+				bool operator!=( const SelfType &o ) const { // coordinate-wise comparison; throws std::invalid_argument if the dimensions differ
+					const size_t dims{ this->_p.system->dimensions() };
+					if( dims != o._p.system->dimensions() ) {
+						throw std::invalid_argument("system sizes do not match");
+					}
+					bool equal{ true };
+					for( size_t i{0}; i < dims && equal; i++) {
+						equal &= ( this->_p.coords[i] == o._p.coords[i] );
+					}
+					return !equal;
+				}
+
+				// builds the past-the-end iterator; the implementation depends on the logic in operator++
+				static SelfType make_system_end_iterator( const LinNDimSysType &_system ) {
+					// fill with 0s
+					SelfType iter( _system );
+					size_t last{ iter->system->dimensions() - 1 };
+					// store last size in last position
+					iter._p.coords[ last ] = iter->system->get_sizes()[ last ];
+					return iter;
+				}
+
+			private:
+				NDimPoint _p; // current point, returned by operator* and operator->
+
+			};
 
 		} // namespace geometry
 	} // namespace utils
 } // namespace grb
 
-#endif // _NDIM_ITERATOR_H_
+#endif // _H_GRB_ALGORITHMS_GEOMETRY_NDIM_ITERATOR
diff --git a/include/graphblas/utils/geometry/linearized_ndim_system.hpp b/include/graphblas/utils/geometry/linearized_ndim_system.hpp
index 2916208ed..87352aa19 100644
--- a/include/graphblas/utils/geometry/linearized_ndim_system.hpp
+++ b/include/graphblas/utils/geometry/linearized_ndim_system.hpp
@@ -1,6 +1,22 @@
 
-#ifndef _NDIM_SYSTEM_LINEARIZER_H_
-#define _NDIM_SYSTEM_LINEARIZER_H_
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_GEOMETRY_NDIM_SYSTEM_LINEARIZER
+#define _H_GRB_ALGORITHMS_GEOMETRY_NDIM_SYSTEM_LINEARIZER
 
 #include <cstddef>
 #include <algorithm>
@@ -9,166 +25,192 @@
 #include <stdexcept>
 #include <cassert>
 #include <string>
+#include <cstddef>
 
 #include "ndim_system.hpp"
 #include "linearized_ndim_iterator.hpp"
-#include "array_vector_storage.hpp"
+// #include "array_vector_storage.hpp"
 
+/**
+ * @file linearized_ndim_system.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Definition of \p LinearizedNDimSystem.
+ *
+ * @date 2022-10-24
+ */
 
 namespace grb {
 	namespace utils {
 		namespace geometry {
 
-template< typename IterIn, typename IterOut >
-	std::size_t __compute_offsets( IterIn in_begin, IterIn in_end, IterOut out_begin ) {
-	std::size_t prod{1};
-	for( ; in_begin != in_end; ++in_begin, ++out_begin ) {
-		*out_begin = prod;
-		prod *= *in_begin;
-	}
-	return prod;
-}
-
-// container for system sizes, doing only ndim <--> linear translation
-template< typename T, typename StorageT > class linearized_ndim_system:
-	public ndim_system< T, StorageT > {
-public:
-
-	using base_t = ndim_system< T, StorageT >;
-	using storage_t = StorageT;
-	using self_t = linearized_ndim_system< T, StorageT >;
-
-	using vector_reference = typename base_t::vector_reference;
-	using const_vector_reference = typename base_t::const_vector_reference;
-	using vector_storage = typename storage_t::vector_storage;
-	using const_vector_storage = typename storage_t::const_vector_storage;
-	using iterator = linearized_ndim_iterator< T, storage_t >;
-
-	template< typename IterT > linearized_ndim_system( IterT begin, IterT end) noexcept :
-		base_t( begin, end ),
-		offsets( std::distance( begin, end ) )
-	{
-		this->_system_size = __compute_offsets( begin, end, this->offsets.begin() ) ;
-	}
-
-	linearized_ndim_system() = delete;
-
-	linearized_ndim_system( const self_t &original ) = default;
-
-
-	linearized_ndim_system( self_t &&original ) noexcept:
-		base_t( std::move(original) ), offsets( std::move( original.offsets ) ),
-		_system_size( original._system_size ) {
-			original._system_size = 0;
-	}
-
-	linearized_ndim_system( const std::vector<std::size_t> & _sizes ) noexcept :
-		linearized_ndim_system( _sizes.cbegin(), _sizes.cend() ) {}
-
-	linearized_ndim_system( std::size_t _dimensions, std::size_t max_value ) noexcept :
-		base_t( _dimensions, max_value ),
-		offsets( _dimensions ),
-		_system_size( _dimensions )
-	{
-		T v{1};
-		for( std::size_t i{0}; i < _dimensions; i++ ) {
-			this->offsets[i] = v;
-			v *= max_value;
-		}
-		this->_system_size = v;
-	}
-
-	~linearized_ndim_system() {}
-
-	self_t& operator=( const self_t & ) = default;
-
-	//linearized_ndim_system& operator=( linearized_ndim_system &&original ) = delete;
-
-	inline std::size_t system_size() const {
-		return this->_system_size;
-	}
-
-	inline const_vector_reference get_offsets() const {
-		return this->offsets;
-	}
-
-	void linear_to_ndim(std::size_t linear, vector_reference output ) const {
-		if( linear > this->_system_size ) {
-			throw std::range_error( "linear value beyond system" );
-		}
-		for( std::size_t _i{ this->offsets.dimensions() }; _i > 0; _i-- ) {
-			const std::size_t dim{ _i - 1 };
-			const std::size_t coord{ linear / this->offsets[dim] };
-			output[dim] = coord;
-			linear -= ( coord * this->offsets[dim] );
-		}
-		assert( linear == 0 );
-	}
-
-	std::size_t ndim_to_linear_check( const_vector_reference ndim_vector) const {
-		return this->ndim_to_linear_check( ndim_vector.storage() );
-	}
-
-	std::size_t ndim_to_linear_check( const_vector_storage ndim_vector ) const {
-        std::size_t linear { 0 };
-        for( std::size_t i { 0 }; i < this->dimensions(); i++ ) {
-			if( ndim_vector[i] >= this->get_sizes()[i] ) {
-				throw std::invalid_argument( "input vector beyond system sizes" );
-			}
-        }
-        return ndim_to_linear( ndim_vector );
-	}
-
-	std::size_t ndim_to_linear( const_vector_reference ndim_vector) const {
-		return this->ndim_to_linear( ndim_vector.storage() );
-	}
-
-	std::size_t ndim_to_linear( const_vector_storage ndim_vector ) const {
-        std::size_t linear { 0 };
-        for( std::size_t i { 0 }; i < this->dimensions(); i++ ) {
-            linear += this->offsets[i] * ndim_vector[i];
-        }
-        return linear;
-	}
-
-	std::size_t ndim_to_linear_offset( const_vector_storage ndim_vector ) const {
-        std::size_t linear { 0 };
-		std::size_t steps{ 1 };
-        for( std::size_t i { 0 }; i < this->dimensions(); i++ ) {
-            linear += steps * ndim_vector[i];
-			steps *= this->_sizes[i];
-        }
-        return linear;
-	}
-
-	// must be same dimensionality
-	void retarget( const_vector_reference _new_sizes ) {
-		if( _new_sizes.dimensions() != this->_sizes.dimensions() ) {
-			throw std::invalid_argument("new system must have same dimensions as previous: new "
-				+ std::to_string( _new_sizes.dimensions() ) + ", old "
-				+ std::to_string( this->_sizes.dimensions() ) );
-		}
-		this->_sizes = _new_sizes; // copy
-		this->_system_size = __compute_offsets( _new_sizes.begin(), _new_sizes.end(), this->offsets.begin() ) ;
-	}
-
-	iterator begin() const {
-		return iterator( *this );
-	}
-
-	iterator end() const {
-		return iterator::make_system_end_iterator( *this );
-	}
-
-private:
-	storage_t offsets;
-	std::size_t _system_size;
-
-};
+			/**
+			 * Extends a \p NDimSystem by linearizing it, i.e. it provides facilities to map a vector in
+			 * NDimSystem#dimensions() dimensions to a linear value ranging from \a 0 to #system_size()
+			 * and vice versa. Such a linearized representation allows user logic to iterate over the system:
+			 * iterators are indeed available via #begin()/#end().
+			 *
+			 * Further facilities are methods to map users' vectors from linear to NDimSystem#dimensions()-dimensional
+			 * or vice versa and also to "retarget" the system, i.e. to represent a system of same dimensionality
+			 * but different sizes.
+			 *
+			 * @tparam SizeType integral type to store the size of each dimension
+			 * @tparam InternalVectorType internal vector type to store the sizes
+			 */
+			template<
+				typename SizeType,
+				typename InternalVectorType
+			> class LinearizedNDimSystem: public NDimSystem< SizeType, InternalVectorType > {
+
+			public:
+				static_assert( std::is_integral< SizeType >::value, "SizeType must be an integral type");
+
+				using BaseType = NDimSystem< SizeType, InternalVectorType >;
+				using SelfType = LinearizedNDimSystem< SizeType, InternalVectorType >;
+				using VectorType = typename BaseType::VectorType;
+
+				using VectorReference = typename BaseType::VectorReference;
+				using ConstVectorReference = typename BaseType::ConstVectorReference;
+				using VectorStorageType = typename VectorType::VectorStorageType;
+				using ConstVectorStorageType = typename VectorType::ConstVectorStorageType;
+				using Iterator = LinearizedNDimIterator< SizeType, InternalVectorType >;
+
+				template< typename IterT > LinearizedNDimSystem( IterT begin, IterT end) noexcept :
+					BaseType( begin, end ),
+					offsets( std::distance( begin, end ) ) // sized with the number of elements in [begin, end)
+				{
+					this->_system_size = compute_offsets( begin, end, this->offsets.begin() ) ; // fills offsets, returns the product of all sizes
+				}
+
+				LinearizedNDimSystem( const std::vector< size_t > &_sizes ) noexcept :
+					LinearizedNDimSystem( _sizes.cbegin(), _sizes.cend() ) {}
+
+				// Builds a system with _dimensions dimensions, all of size max_value (sizes are set by the base class).
+				LinearizedNDimSystem( size_t _dimensions, size_t max_value ) noexcept :
+					BaseType( _dimensions, max_value ), offsets( _dimensions ), _system_size( _dimensions )
+				{
+					// offsets[d] ends up as max_value^d; the product over all dimensions is the system size
+					SizeType stride{1};
+					for( size_t d{0}; d < _dimensions; ++d ) {
+						this->offsets[d] = stride;
+						stride *= max_value;
+					}
+					this->_system_size = stride; // replaces the placeholder from the initializer list
+				}
+
+				LinearizedNDimSystem() = delete;
+
+				LinearizedNDimSystem( const SelfType &original ) = default;
+
+				// Moves offsets and system size; the base is copied, as NDimSystem deletes its move constructor.
+				LinearizedNDimSystem( SelfType &&original ) noexcept: BaseType( static_cast< const BaseType & >( original ) ),
+					offsets( std::move( original.offsets ) ), _system_size( original._system_size ) {
+						original._system_size = 0;
+				}
+
+				~LinearizedNDimSystem() {}
+
+				SelfType& operator=( const SelfType & ) = default;
+
+				SelfType& operator=( SelfType &&original ) = delete;
+
+				inline size_t system_size() const {
+					return this->_system_size;
+				}
+
+				inline ConstVectorReference get_offsets() const {
+					return this->offsets;
+				}
+
+				// Converts a linear position into its N-dimensional coordinates, written into output.
+				// Throws std::range_error if linear is larger than the system size.
+				void linear_to_ndim( size_t linear, VectorReference output ) const {
+					if( linear > this->_system_size ) { throw std::range_error( "linear value beyond system" ); }
+					size_t dim{ this->offsets.dimensions() }; // walk from the last dimension down to the first
+					while( dim-- > 0 ) {
+						const size_t coord{ linear / this->offsets[dim] };
+						output[dim] = coord;
+						linear -= ( coord * this->offsets[dim] );
+					}
+					assert( linear == 0 ); // every unit of linear must have been attributed to a coordinate
+				}
+
+				size_t ndim_to_linear_check( ConstVectorReference ndim_vector) const {
+					return this->ndim_to_linear_check( ndim_vector.storage() );
+				}
+
+				size_t ndim_to_linear_check( ConstVectorStorageType ndim_vector ) const {
+					// validate every coordinate against the size of its dimension before linearizing
+					for( size_t i { 0 }; i < this->dimensions(); i++ ) {
+						if( ndim_vector[i] >= this->get_sizes()[i] ) {
+							throw std::invalid_argument( "input vector beyond system sizes" );
+						}
+					}
+					return ndim_to_linear( ndim_vector );
+				}
+
+				size_t ndim_to_linear( ConstVectorReference ndim_vector) const {
+					return this->ndim_to_linear( ndim_vector.storage() );
+				}
+
+				size_t ndim_to_linear( ConstVectorStorageType ndim_vector ) const {
+					size_t position{ 0 }; // accumulates offsets[d] * coordinate[d] over all dimensions
+					for( size_t d{ 0 }; d != this->dimensions(); ++d ) {
+						position += this->offsets[d] * ndim_vector[d];
+					}
+					return position;
+				}
+
+				// linearizes like ndim_to_linear, but derives each stride from _sizes instead of reading offsets
+				size_t ndim_to_linear_offset( ConstVectorStorageType ndim_vector ) const {
+					size_t position{ 0 };
+					size_t stride{ 1 };
+					for( size_t d{ 0 }; d != this->dimensions(); ++d ) {
+						position += stride * ndim_vector[d];
+						stride *= this->_sizes[d];
+					}
+					return position;
+				}
+
+				// replaces the system sizes with _new_sizes, which must have the same number of dimensions
+				void retarget( ConstVectorReference _new_sizes ) {
+					if( _new_sizes.dimensions() != this->_sizes.dimensions() ) {
+						throw std::invalid_argument("new system must have same dimensions as previous: new "
+							+ std::to_string( _new_sizes.dimensions() ) + ", old "
+							+ std::to_string( this->_sizes.dimensions() ) );
+					}
+					this->_sizes = _new_sizes; // copy
+					this->_system_size = compute_offsets( _new_sizes.begin(), _new_sizes.end(), this->offsets.begin() ) ; // refresh offsets and total size
+				}
+
+				Iterator begin() const {
+					return Iterator( *this );
+				}
+
+				Iterator end() const {
+					return Iterator::make_system_end_iterator( *this );
+				}
+
+			private:
+
+				VectorType offsets;
+				size_t _system_size;
+
+				// Writes to out_begin the exclusive prefix products of [in_begin, in_end) and returns the total product.
+				template< typename IterIn, typename IterOut >
+				static size_t compute_offsets( IterIn in_begin, IterIn in_end, IterOut out_begin ) {
+					size_t running_product{1};
+					while( in_begin != in_end ) {
+						*out_begin = running_product; // offset of this dimension: product of all previous sizes
+						running_product *= *in_begin;
+						++in_begin, ++out_begin;
+					}
+					return running_product;
+				}
+			};
 
 
 		} // namespace geometry
 	} // namespace utils
 } // namespace grb
 
-#endif // _NDIM_SYSTEM_LINEARIZER_H_
+#endif // _H_GRB_ALGORITHMS_GEOMETRY_NDIM_SYSTEM_LINEARIZER
diff --git a/include/graphblas/utils/geometry/ndim_system.hpp b/include/graphblas/utils/geometry/ndim_system.hpp
index 41434f3c4..f9a97c18d 100644
--- a/include/graphblas/utils/geometry/ndim_system.hpp
+++ b/include/graphblas/utils/geometry/ndim_system.hpp
@@ -1,69 +1,133 @@
 
-#ifndef _NDIM_SYSTEM_H_
-#define _NDIM_SYSTEM_H_
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_GEOMETRY_NDIM_SYSTEM
+#define _H_GRB_ALGORITHMS_GEOMETRY_NDIM_SYSTEM
 
 #include <cstddef>
 #include <algorithm>
 #include <vector>
 #include <utility>
+#include <type_traits>
+#include <cstddef>
 
 #include "array_vector_storage.hpp"
 
+/**
+ * @file ndim_system.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Definition of \p NDimSystem.
+ *
+ * @date 2022-10-24
+ */
 
 namespace grb {
 	namespace utils {
 		namespace geometry {
 
-template< typename T, typename StorageT > class ndim_system {
-
-public:
-	using storage_t = StorageT;
-	using vector_reference = storage_t&;
-	using const_vector_reference = const storage_t&;
-	using self_t = ndim_system< T, StorageT >;
-
-	template< typename IterT > ndim_system( IterT begin, IterT end) noexcept :
-		_sizes( std::distance( begin, end ) )
-	{
-		std::copy( begin, end, this->_sizes.begin() );
-	}
-
-	ndim_system() = delete;
-
-	ndim_system( const self_t & ) = default;
-
-	ndim_system( const std::vector<std::size_t> & _sizes ) noexcept :
-		self_t( _sizes.cbegin(), _sizes.cend() ) {}
-
-	ndim_system( std::size_t _dimensions, std::size_t max_value ) noexcept :
-		_sizes( _dimensions )
-	{
-		std::fill_n( this->_sizes.begin(), _dimensions, max_value );
-	}
-
-	ndim_system( self_t &&original ) noexcept: _sizes( std::move( original._sizes ) ) {}
-
-	~ndim_system() {}
-
-	self_t & operator=( const self_t &original ) = default;
-
-	//self_t & operator=( self_t &&original ) = delete;
-
-	inline std::size_t dimensions() const noexcept {
-		return _sizes.dimensions();
-	}
-
-	inline const_vector_reference get_sizes() const noexcept {
-		return this->_sizes;
-	}
-
-protected:
-
-	storage_t _sizes;
-};
+			/**
+			 * Describes a #dimensions()-dimensional system by storing its size along each dimension.
+			 *
+			 * It is meant to represent a grid of #dimensions() dimensions and size #get_sizes()[d]
+			 * for each dimension \a d in the interval <em>[0, #dimensions())</em>.
+			 *
+			 * @tparam SizeType integral type to store the size of each dimension
+			 * @tparam InternalVectorType internal vector type to store the sizes
+			 */
+			template<
+				typename SizeType,
+				typename InternalVectorType
+			> class NDimSystem {
+
+			public:
+				static_assert( std::is_integral< SizeType >::value, "SizeType must be an integral type");
+
+				using VectorType = InternalVectorType;
+				using VectorReference = VectorType&;
+				using ConstVectorReference = const VectorType&;
+				using SelfType = NDimSystem< SizeType, InternalVectorType >;
+
+				/**
+				 * Construct a new NDimSystem object from an iterable range.
+				 *
+				 * The dimension is computed as \a std::distance(begin,end), i.e.
+				 * \p IterT should be a random-access iterator for performance.
+				 *
+				 * @tparam IterT iterator type
+				 * @param begin range begin
+				 * @param end end of range
+				 */
+				template< typename IterT > NDimSystem( IterT begin, IterT end) noexcept :
+					_sizes( std::distance( begin, end ) )
+				{
+					std::copy( begin, end, this->_sizes.begin() );
+				}
+
+				/**
+				 * Construct a new NDimSystem object from an std::vector<>, taking its values
+				 * as system sizes and its length as number of dimensions.
+				 */
+				NDimSystem( const std::vector< size_t > &_sizes ) noexcept :
+					SelfType( _sizes.cbegin(), _sizes.cend() ) {}
+
+				/**
+				 * Construct a new NDimSystem object of dimensions \p _dimensions
+				 *  and with all sizes initialized to \p max_size
+				 */
+				NDimSystem( size_t _dimensions, size_t max_size ) noexcept :
+					_sizes( _dimensions )
+				{
+					std::fill_n( this->_sizes.begin(), _dimensions, max_size );
+				}
+
+				NDimSystem() = delete;
+
+				NDimSystem( const SelfType & ) = default;
+
+				// NDimSystem( SelfType && ) = default;
+
+				// NDimSystem( SelfType &&original ) noexcept: _sizes( std::move( original._sizes ) ) {}
+				NDimSystem( SelfType && ) = delete;
+
+				~NDimSystem() {}
+
+				SelfType & operator=( const SelfType &original ) = default;
+
+				SelfType & operator=( SelfType &&original ) = delete;
+
+				inline size_t dimensions() const noexcept {
+					return _sizes.dimensions();
+				}
+
+				/**
+				 * Get the sizes of the represented system as an iterable \p InternalVectorType
+				 * 	object.
+				 */
+				inline ConstVectorReference get_sizes() const noexcept {
+					return this->_sizes;
+				}
+
+			protected:
+
+				InternalVectorType _sizes;
+			};
 
 		} // namespace geometry
 	} // namespace utils
 } // namespace grb
 
-#endif
+#endif // _H_GRB_ALGORITHMS_GEOMETRY_NDIM_SYSTEM
diff --git a/include/graphblas/utils/geometry/ndim_vector.hpp b/include/graphblas/utils/geometry/ndim_vector.hpp
index 9c9ad3b6a..eca89137e 100644
--- a/include/graphblas/utils/geometry/ndim_vector.hpp
+++ b/include/graphblas/utils/geometry/ndim_vector.hpp
@@ -1,14 +1,12 @@
 
-#ifndef _NDIM_VECTOR_H_
-#define _NDIM_VECTOR_H_
+#ifndef _H_GRB_ALGORITHMS_GEOMETRY_NDIM_VECTOR
+#define _H_GRB_ALGORITHMS_GEOMETRY_NDIM_VECTOR
 
 #include <utility>
 #include <vector>
-#include <array>
-#include <stdexcept>
-#include <cassert>
-#include <iterator>
 #include <type_traits>
+#include <cstddef>
+#include <algorithm>
 
 #include "linearized_ndim_system.hpp"
 
@@ -16,107 +14,127 @@ namespace grb {
 	namespace utils {
 		namespace geometry {
 
-template< typename OutT, typename CoordsT, typename StorageT  > class ndim_vector {
-
-public:
-
-	using const_domain_vector_reference =
-		typename linearized_ndim_system< CoordsT, StorageT >::const_vector_reference;
-	using domain_vector_storage = typename StorageT::const_vector_storage;
-	using domain_iterator = typename linearized_ndim_system< CoordsT, StorageT >::iterator;
-
-private:
-
-	const linearized_ndim_system< CoordsT, StorageT > _linearizer;
-	OutT* data;
-
-	inline std::size_t get_coordinate( domain_vector_storage coordinates ) const {
-		return this->_linearizer.ndim_to_linear( coordinates );
-	}
-
-	inline std::size_t get_coordinate( domain_iterator coordinates ) const {
-		return this->_linearizer.ndim_to_linear( coordinates );
-	}
-
-    void clean_mem() {
-        if ( this->data == nullptr ) {
-            delete[] this->data;
-        }
-    }
-
-public:
-
-	ndim_vector() = delete;
-
-	template< typename IterT > ndim_vector( IterT begin, IterT end): _linearizer( begin, end ) {
-		static_assert( std::is_default_constructible< OutT >::value,
-			"the stored type is not default constructible" );
-		this->data = new OutT[ _linearizer.system_size() ];
-	}
-
-	ndim_vector( const std::vector<std::size_t> & _sizes ):
-		ndim_vector( _sizes.cbegin(), _sizes.cend() ) {}
-
-	// ndim_vector( const ndim_vector< OutT, CoordsT, StorageT >& original ):
-	// 	_linearizer( original._linearizer ) {
-    //     this->data = new std::size_t[ original.data_size() ];
-	// 	std::copy_n( original.data, original.data_size(), this->data );
-    // }
-	ndim_vector( const ndim_vector< OutT, CoordsT, StorageT >& original ) = delete;
-
-
-	ndim_vector( ndim_vector< OutT, CoordsT, StorageT >&& original ) noexcept:
-		_linearizer( std::move( original._linearizer ) ) {
-        this->data = original.data;
-        original.data = nullptr;
-    }
-	// ndim_vector( ndim_vector< OutT, CoordsT, StorageT >&& original ) = delete;
-
-	ndim_vector< OutT, CoordsT, StorageT >& operator=(
-			const ndim_vector< OutT, CoordsT, StorageT > &original ) = delete;
-
-	ndim_vector< OutT, CoordsT, StorageT >& operator=(
-			ndim_vector< OutT, CoordsT, StorageT > &&original ) = delete;
-
-    ~ndim_vector() {
-        this->clean_mem();
-    }
-
-	std::size_t dimensions() const {
-		return this->_linearizer.dimensions();
-	}
-
-	std::size_t data_size() const {
-		return this->_linearizer.system_size();
-	}
-
-	inline OutT& at( const_domain_vector_reference coordinates ) {
-		return this->data[ this->get_coordinate( coordinates.storage() ) ];
-	}
-
-	inline const OutT& at( const_domain_vector_reference coordinates ) const {
-		return this->data[ this->get_coordinate( coordinates.storage() ) ];
-	}
-
-	inline OutT& at( domain_vector_storage coordinates ) {
-		return this->data[ this->get_coordinate( coordinates ) ];
-	}
-
-	inline const OutT& at( domain_vector_storage coordinates ) const {
-		return this->data[ this->get_coordinate( coordinates ) ];
-	}
-
-	domain_iterator domain_begin() const {
-		return this->_linearizer.begin();
-	}
-
-	domain_iterator domain_end() const {
-		return this->_linearizer.end();
-	}
-};
+			/**
+			 * Maps an N-dimensional vector to an array of data.
+			 *
+			 * The user constructs an object by passing the sizes (as an N-dimensional vector)
+			 * of the iteration space and accesses the stored data via an N-dimensional vector of coordinates.
+			 *
+			 * Example: if the user constructs an \p NDimVector with 3D sizes \a [2,3,4], she can access data
+			 * via a 3D coordinates vector of ranges \a [0-1]x[0-2]x[0-3] (here \a x denoting the cartesian product)
+			 * by using the #at() method.
+			 *
+			 * This facility allows associating a value of type \p DataType to, for example,
+			 * each element of an N-dimensional grid.
+			 *
+			 * @tparam DataType type of data stored in the array
+			 * @tparam SizeType type for the components of the N-dimensional vector:
+			 * 	the maximum number of stored data is thus \f$ std::numeric_limits<SizeType>::max()^N \f$
+			 * @tparam InternalVectorType storage type of the internal N-dimensional vector
+			 */
+			template<
+				typename DataType,
+				typename SizeType,
+				typename InternalVectorType
+			> class NDimVector {
+
+			public:
+				static_assert( std::is_default_constructible< DataType >::value,
+					"the stored type is not default constructible" );
+				static_assert( std::is_integral< SizeType >::value, "SizeType must be integral" );
+
+				using ConstDomainVectorReference =
+					typename LinearizedNDimSystem< SizeType, InternalVectorType >::ConstVectorReference;
+				using ConstDomainVectorStorageType = typename InternalVectorType::ConstVectorStorageType;
+				using DomainIterator = typename LinearizedNDimSystem< SizeType, InternalVectorType >::Iterator;
+				using Selftype = NDimVector< DataType, SizeType, InternalVectorType >;
+
+				NDimVector() = delete;
+
+				template< typename IterT > NDimVector( IterT begin, IterT end) :
+					_linearizer( begin, end )
+				{
+					this->data = new DataType[ _linearizer.system_size() ];
+				}
+
+				NDimVector( const std::vector< size_t > &_sizes ) :
+					NDimVector( _sizes.cbegin(), _sizes.cend() ) {}
+
+				NDimVector( const Selftype& original ):
+					_linearizer( original._linearizer ),
+				    data( new DataType[ original.data_size() ] )
+				{
+					std::copy_n( original.data, original.data_size(), this->data );
+				}
+
+				NDimVector( Selftype&& original ) noexcept:
+					_linearizer( std::move( original._linearizer ) ) // NOTE(review): _linearizer is declared const, so this likely resolves to a copy - confirm
+				{
+					this->data = original.data; // take over the buffer of the source object...
+					original.data = nullptr; // ...and detach it, so that only one object refers to it
+				}
+
+				Selftype& operator=( const Selftype &original ) = delete;
+
+				Selftype& operator=( Selftype &&original ) = delete;
+
+				~NDimVector() {
+					this->clean_mem();
+				}
+
+				size_t dimensions() const {
+					return this->_linearizer.dimensions();
+				}
+
+				size_t data_size() const {
+					return this->_linearizer.system_size();
+				}
+
+				inline DataType& at( ConstDomainVectorReference coordinates ) {
+					return this->data[ this->get_coordinate( coordinates.storage() ) ];
+				}
+
+				inline const DataType& at( ConstDomainVectorReference coordinates ) const {
+					return this->data[ this->get_coordinate( coordinates.storage() ) ];
+				}
+
+				inline DataType& at( ConstDomainVectorStorageType coordinates ) {
+					return this->data[ this->get_coordinate( coordinates ) ];
+				}
+
+				inline const DataType& at( ConstDomainVectorStorageType coordinates ) const {
+					return this->data[ this->get_coordinate( coordinates ) ];
+				}
+
+				DomainIterator domain_begin() const {
+					return this->_linearizer.begin();
+				}
+
+				DomainIterator domain_end() const {
+					return this->_linearizer.end();
+				}
+
+			private:
+				const LinearizedNDimSystem< SizeType, InternalVectorType > _linearizer;
+				DataType* data;
+
+				inline size_t get_coordinate( ConstDomainVectorStorageType coordinates ) const {
+					return this->_linearizer.ndim_to_linear( coordinates );
+				}
+
+				inline size_t get_coordinate( DomainIterator coordinates ) const {
+					return this->_linearizer.ndim_to_linear( coordinates );
+				}
+
+				void clean_mem() {
+					// delete[] on a null pointer is a no-op, hence no guard; the old inverted check never freed
+					delete[] this->data;
+					this->data = nullptr; // avoid leaving a dangling pointer behind
+				}
+			};
 
 		} // namespace geometry
 	} // namespace utils
 } // namespace grb
 
-#endif // _NDIM_VECTOR_H_
+#endif // _H_GRB_ALGORITHMS_GEOMETRY_NDIM_VECTOR
diff --git a/include/graphblas/utils/iterators/IteratorValueAdaptor.hpp b/include/graphblas/utils/iterators/IteratorValueAdaptor.hpp
new file mode 100644
index 000000000..81864cb20
--- /dev/null
+++ b/include/graphblas/utils/iterators/IteratorValueAdaptor.hpp
@@ -0,0 +1,128 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file IteratorValueAdaptor.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * @brief Adaptor to extract a given value out of an iterator.
+ * @date 2022-10-08
+ */
+
+#ifndef H_GRB_UTILS_ITERATOR_VALUE_ADAPTOR
+#define H_GRB_UTILS_ITERATOR_VALUE_ADAPTOR
+
+#include <type_traits>
+#include <iterator>
+#include <utility>
+
+namespace grb {
+	namespace utils {
+
+		/**
+		 * Adaptor for an iterator, to extract the value pointed to by the * operator.
+		 * It wraps an iterator under the same interface, using an object of type \a AdaptorType
+		 * to adapt the returned value.
+		 *
+		 * @tparam InnerIterType type of the underlying iterator
+		 * @tparam AdaptorType type of the adaptor, to be instantiated by default
+		 */
+		template<
+			typename InnerIterType,
+			typename AdaptorType
+		> struct IteratorValueAdaptor {
+
+			static_assert( std::is_default_constructible< AdaptorType >::value, "RefType must be default-constructible" );
+			static_assert( std::is_copy_constructible< AdaptorType >::value, "RefType must be copy-constructible" );
+			static_assert( std::is_copy_assignable< AdaptorType >::value, "RefType must be copy-assignable" );
+
+			typedef decltype( std::declval< AdaptorType >()( *std::declval< InnerIterType >() ) ) reference;
+			typedef typename std::decay< reference >::type value_type;
+			typedef value_type * pointer;
+			typedef const value_type * const_pointer;
+			typedef typename std::iterator_traits< InnerIterType >::iterator_category iterator_category;
+			typedef typename std::iterator_traits< InnerIterType >::difference_type difference_type;
+
+			static constexpr bool is_random_access = std::is_base_of<
+				std::random_access_iterator_tag, iterator_category >::value;
+
+			InnerIterType iter;
+			AdaptorType adaptor;
+
+			using SelfType = IteratorValueAdaptor< InnerIterType, AdaptorType >;
+
+			/**
+			 * Construct a new Iterator Value Adaptor object from an actual iterator.
+			 * The adaptor is built via its default constructor.
+			 *
+			 * @param _iter the underlying iterator, to be copied
+			 */
+			IteratorValueAdaptor(
+				const InnerIterType &_iter
+			) :
+				iter( _iter ),
+				adaptor() {}
+
+			/**
+			 * Construct a new Iterator Value Adaptor object from an actual iterator.
+			 * The adaptor is built via its default constructor.
+			 *
+			 * @param _iter the underlying iterator, to be moved
+			 */
+			IteratorValueAdaptor(
+				InnerIterType &&_iter
+			) :
+				iter( std::move( _iter ) ),
+				adaptor() {}
+
+			IteratorValueAdaptor() = delete;
+
+			IteratorValueAdaptor( const SelfType & ) = default;
+
+			IteratorValueAdaptor( SelfType && ) = default;
+
+			SelfType& operator=( const SelfType & ) = default;
+
+			SelfType& operator=( SelfType && ) = default;
+
+			bool operator!=( const SelfType & o ) const { return o.iter != iter; }
+
+			bool operator==( const SelfType & o ) const { return ! operator!=( o ); }
+
+			reference operator*() { return adaptor( *iter ); }
+
+			const reference operator*() const { return adaptor( *iter ); }
+
+			pointer operator->() { return &adaptor( *iter ); } // address of the adapted value; the adaptor must return an lvalue
+
+			const_pointer operator->() const { return &adaptor( *iter ); } // const counterpart, same requirement
+
+			SelfType& operator++() { ++iter; return *this; }
+
+			template< bool RA = is_random_access > // dependent, hence SFINAE-friendly: random-access iterators only
+			typename std::enable_if< RA, SelfType & >::type operator+=( const size_t offset ) {
+				iter += offset; return *this;
+			}
+
+			// random-access iterators only; the dependent default argument turns the constraint into proper SFINAE
+			template< bool RA = is_random_access >
+			typename std::enable_if< RA, difference_type >::type operator-( const SelfType &other ) const { return iter - other.iter; }
+		};
+
+	} // end namespace utils
+} // end namespace grb
+
+#endif // H_GRB_UTILS_ITERATOR_VALUE_ADAPTOR
diff --git a/include/graphblas/utils/iterators/partition_range.hpp b/include/graphblas/utils/iterators/partition_range.hpp
new file mode 100644
index 000000000..dd5f397c4
--- /dev/null
+++ b/include/graphblas/utils/iterators/partition_range.hpp
@@ -0,0 +1,71 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstddef>
+#include <algorithm>
+#include <cassert>
+
+#ifndef H_GRB_UTILS_PARTITION_RANGE
+#define H_GRB_UTILS_PARTITION_RANGE
+
+namespace grb {
+	namespace utils {
+
+		// Splits [0, num_elements) into num_procs contiguous chunks of (rounded-up) equal length and
+		// stores in [first_offset, last_offset) the chunk of process this_proc, clamped to num_elements.
+		template< typename T > void partition_nonzeroes(
+			size_t num_procs, size_t this_proc, T num_elements, T& first_offset, T& last_offset
+		) {
+			const T chunk_length{ ( num_elements + num_procs - 1 ) / num_procs }; // ceiling division
+			const T unclamped_first{ chunk_length * static_cast< T >( this_proc ) };
+			first_offset = std::min( unclamped_first, num_elements );
+			const T unclamped_last{ first_offset + chunk_length };
+			last_offset = std::min( unclamped_last, num_elements );
+		}
+
+		// Narrows the random-access range [begin, end) of num_nonzeroes elements to the sub-range
+		// assigned to process this_proc out of num_procs, as computed by partition_nonzeroes().
+		template< typename IterT > void partition_iteration_range_on_procs(
+			size_t num_procs, size_t this_proc, size_t num_nonzeroes, IterT &begin, IterT &end
+		) {
+			static_assert( std::is_base_of< std::random_access_iterator_tag,
+				typename std::iterator_traits< IterT >::iterator_category >::value,
+				"the given iterator is not a random access one" );
+			assert( num_nonzeroes == static_cast< size_t >( end - begin ) );
+			size_t first_index, past_last_index;
+			partition_nonzeroes( num_procs, this_proc, num_nonzeroes, first_index, past_last_index );
+			// when the chunk reaches the end of the range, end is already correct and is left untouched
+			if( past_last_index < num_nonzeroes ) {
+				end = begin;
+				end += past_last_index;
+			}
+			// begin is advanced last, since the new end is computed from the original begin
+			begin += first_index;
+		}
+
+		template< typename IterT > void partition_iteration_range_on_procs(
+			size_t num_nonzeroes,
+			IterT &begin,
+			IterT &end
+		) { // forwards to the five-argument overload, passing spmd<>::nprocs() as num_procs and spmd<>::pid() as this_proc
+			return partition_iteration_range_on_procs( spmd<>::nprocs(), spmd<>::pid(), num_nonzeroes, begin, end );
+		}
+
+	} // namespace utils
+} // namespace grb
+
+#endif // H_GRB_UTILS_PARTITION_RANGE
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index 93c69d87e..2b544fb16 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -33,12 +33,13 @@
 #include <iostream>
 #include <memory>
 #include <type_traits>
+#include <algorithm>
 
 #include <graphblas.hpp>
 
-#include <chrono>
-
-// #define TEST_ITER
+//========== TRACE SOLVER STEPS =========
+// to easily trace the steps of the solver, just define this symbol
+// #define HPCG_PRINT_STEPS
 
 // here we define a custom macro and do not use NDEBUG since the latter is not defined for smoke tests
 #ifdef HPCG_PRINT_STEPS
@@ -53,10 +54,7 @@
 #define DBG_println( args ) std::cout << args << std::endl;
 
 // forward declaration for the tracing facility
-template< typename T,
-	class Ring = grb::Semiring< grb::operators::add< T >, grb::operators::mul< T >, grb::identities::zero, grb::identities::one >
->
-void print_norm( const grb::Vector< T > &r, const char * head, const Ring &ring = Ring() );
+template< typename T > void print_norm( const grb::Vector< T > &r, const char * head );
 
 /**
  * @brief prints \p head and the norm of \p r.
@@ -65,11 +63,9 @@ void print_norm( const grb::Vector< T > &r, const char * head, const Ring &ring
 #endif
 
 #include <graphblas/algorithms/hpcg/hpcg.hpp>
+#include <graphblas/algorithms/multigrid/multigrid_building_utils.hpp>
 #include <graphblas/algorithms/hpcg/system_building_utils.hpp>
 
-#include <graphblas/algorithms/hpcg/old_ndim_matrix_builders.hpp>
-#include <graphblas/algorithms/hpcg/coloring.hpp>
-
 #include <graphblas/utils/Timer.hpp>
 
 #include <utils/argument_parser.hpp>
@@ -100,10 +96,10 @@ using namespace algorithms;
 static const char * const TEXT_HIGHLIGHT = "===> ";
 #define thcout ( std::cout << TEXT_HIGHLIGHT )
 #define thcerr ( std::cerr << TEXT_HIGHLIGHT )
-
+#define MASTER_PRINT( pid, txt ) if( pid == 0 ) { std::cout << txt; }
 
 /**
- * @brief Container for system parameters to create the HPCG problem.
+ * Container for system parameters to create the HPCG problem.
  */
 struct system_input {
 	size_t nx, ny, nz;
@@ -111,7 +107,7 @@ struct system_input {
 };
 
 /**
- * @brief Container for the parameters for the HPCG simulation.
+ * Container for the parameters for the HPCG simulation.
  */
 struct simulation_input : public system_input {
 	size_t test_repetitions;
@@ -122,30 +118,32 @@ struct simulation_input : public system_input {
 	bool print_iter_stats;
 };
 
+using IOType = double;
+using NonzeroType = double;
+using InputType = double;
+using ResidualType = double;
+using StdRing = Semiring< grb::operators::add< NonzeroType >, grb::operators::mul< NonzeroType >,
+	grb::identities::zero, grb::identities::one >;
+using StdMinus = operators::subtract< NonzeroType >;
+using coord_t = size_t;
+
 /**
- * @brief Containers for test outputs.
+ * Containers for test outputs.
  */
 struct output {
-	RC error_code;
-	size_t test_repetitions;
-	size_t performed_iterations;
-	double residual;
+	RC error_code = SUCCESS; // return code of the run
+	size_t test_repetitions = 0;
+	size_t performed_iterations = 0; // iterations reported by the last solver call
+	NonzeroType residual = 0.0; // residual norm reported by the last solver call
 	grb::utils::TimerResults times;
-	std::unique_ptr< PinnedVector< double > > pinnedVector;
-	double square_norm_diff;
-
-	output() {
-		error_code = SUCCESS;
-		test_repetitions = 0;
-		performed_iterations = 0;
-		residual = 0.0;
-	}
+	std::unique_ptr< PinnedVector< IOType > > pinnedVector; // pinned copy of the solution vector
+	NonzeroType square_norm_diff = 0.0; // initialized like the other fields, so it is never left indeterminate
 };
 
 /**
- * @brief Returns the closets power of 2 bigger or equal to \p n .
+ * Returns the closest power of 2 greater than or equal to \p n .
  */
-template< typename T = size_t >
+template< typename T >
 T static next_pow_2( T n ) {
 	static_assert( std::is_integral< T >::value, "Integral required." );
 	--n;
@@ -157,49 +155,136 @@ T static next_pow_2( T n ) {
 	return n + 1;
 }
 
+using hpcg_runner_t = HPCGRunnerType< IOType, NonzeroType, InputType, ResidualType,
+	StdRing, StdMinus >;
+using mg_data_t = multigrid_data< IOType, NonzeroType >;
+using coarsening_data_t = coarsening_data< IOType, NonzeroType >;
+using smoothing_data_t = smoother_data< IOType >;
+using hpcg_data_t = mg_cg_data< IOType, NonzeroType, InputType >;
+
 /**
- * @brief Builds and initializes a 3D system for an HPCG simulation according to the given 3D system sizes.
+ * Builds and initializes a 3D system for an HPCG simulation according to the given 3D system sizes.
  * @return RC grb::SUCCESS if the system initialization within GraphBLAS succeeded
  */
-static RC build_3d_system( std::unique_ptr< hpcg_data< double, double, double > > & holder, const system_input & in ) {
-	struct hpcg_system_params< 3, double > params {
+static void build_3d_system(
+	const system_input & in,
+	std::vector< std::unique_ptr< mg_data_t > > &system_levels,
+	std::vector< std::unique_ptr< coarsening_data_t > > &coarsener_levels,
+	std::vector< std::unique_ptr< smoothing_data_t > > &smoother_levels,
+	std::unique_ptr< hpcg_data_t > &holder
+) {
+	constexpr size_t DIMS = 3;
+	using builder_t = grb::algorithms::HPCGBuilder< DIMS, coord_t, NonzeroType >;
+	const size_t pid { spmd<>::pid() };
+	grb::utils::Timer timer;
+
+	hpcg_system_params< 3, NonzeroType > params {
 		{ in.nx, in.ny, in.nz }, HALO_RADIUS, SYSTEM_DIAG_VALUE, SYSTEM_NON_DIAG_VALUE,
 			PHYS_SYSTEM_SIZE_MIN, in.max_coarsening_levels, 2
 	};
 
-	return build_hpcg_system< 3, double >( holder, params );
+	std::vector< builder_t > mg_generators;
+	MASTER_PRINT( pid, "building HPCG generators for " << ( in.max_coarsening_levels + 1 )
+		<< " levels..." );
+	timer.reset();
+	build_hpcg_multigrid_generators( params, mg_generators );
+	double time = timer.time();
+	MASTER_PRINT( pid, " time (ms) " << time << std::endl );
+	MASTER_PRINT( pid, "built HPCG generators for " << mg_generators.size()
+		<< " levels" << std::endl );
+
+	hpcg_data_t *data{ new hpcg_data_t( mg_generators[ 0 ].system_size() ) };
+	holder = std::unique_ptr< hpcg_data_t >( data );
+
+	std::vector< size_t > mg_sizes;
+	// collect the size of every level, main system included
+	std::transform( mg_generators.cbegin(), mg_generators.cend(), std::back_inserter( mg_sizes  ),
+		[] ( const builder_t &b ) { return b.system_size(); } );
+
+	MASTER_PRINT( pid, "allocating data for the MultiGrid simulation...");
+	timer.reset();
+	allocate_multigrid_data( mg_sizes, system_levels, coarsener_levels, smoother_levels );
+	time = timer.time();
+	MASTER_PRINT( pid, " time (ms) " << time << std::endl )
+
+	// zero all vectors
+	MASTER_PRINT( pid, "zeroing all vectors...");
+	timer.reset();
+	data->zero_temp_vectors();
+	std::for_each( system_levels.begin(), system_levels.end(),
+		[]( std::unique_ptr< mg_data_t > &s) { s->zero_temp_vectors(); } );
+	std::for_each( coarsener_levels.begin(), coarsener_levels.end(),
+		[]( std::unique_ptr< coarsening_data_t > &s) { s->zero_temp_vectors(); } );
+	std::for_each( smoother_levels.begin(), smoother_levels.end(),
+		[]( std::unique_ptr< smoothing_data_t > &s) { s->zero_temp_vectors(); } );
+	time = timer.time();
+	MASTER_PRINT( pid, " time (ms) " << time << std::endl )
+
+	assert( mg_generators.size() == system_levels.size() );
+	assert( mg_generators.size() == smoother_levels.size() );
+	assert( mg_generators.size() - 1 == coarsener_levels.size() );
+
+	for( size_t i = 0; i < mg_generators.size(); i++) {
+		MASTER_PRINT( pid, "SYSTEM LEVEL " << i << std::endl );
+		MASTER_PRINT( pid, " populating system matrix: " );
+		timer.reset();
+		populate_system_matrix( mg_generators[ i ], system_levels.at(i)->A );
+		time = timer.time();
+		MASTER_PRINT( pid, " time (ms) " << time << std::endl )
+
+		MASTER_PRINT( pid, " populating smoothing data: " );
+		timer.reset();
+		populate_smoothing_data( mg_generators[ i ], *smoother_levels[ i ] );
+		time = timer.time();
+		MASTER_PRINT( pid, " time (ms) " << time << std::endl )
+
+		if( i > 0 ) {
+			MASTER_PRINT( pid, " populating coarsening data: " );
+			timer.reset();
+			populate_coarsener( mg_generators[ i - 1 ], mg_generators[ i ], *coarsener_levels[ i - 1 ] );
+			time = timer.time();
+			MASTER_PRINT( pid, " time (ms) " << time << std::endl )
+		}
+	}
 }
 
 #ifdef HPCG_PRINT_SYSTEM
-static void print_system( const hpcg_data< double, double, double > & data ) {
-	print_matrix( data.A, 70, "A" );
-	multi_grid_data< double, double > * coarser = data.coarser_level;
-	while( coarser != nullptr ) {
-		print_matrix( coarser->coarsening_matrix, 50, "COARSENING MATRIX" );
-		print_matrix( coarser->A, 50, "COARSER SYSTEM MATRIX" );
-		coarser = coarser->coarser_level;
+static void print_system(
+	const std::vector< std::unique_ptr< mg_data_t > > &system_levels,
+	const std::vector< std::unique_ptr< coarsening_data_t > > &coarsener_levels
+) {
+	print_matrix( system_levels[ 0 ]->A, 70, "A" );
+	for( size_t i = 0; i < coarsener_levels.size(); i++ ) {
+		print_matrix( coarsener_levels[i ] ->coarsening_matrix, 50, "COARSENING MATRIX" );
+		print_matrix( system_levels[ i + 1 ]->A, 50, "COARSER SYSTEM MATRIX" );
 	}
 }
 #endif
 
 #ifdef HPCG_PRINT_STEPS
-template< typename T,
-		class Ring = Semiring< grb::operators::add< T >, grb::operators::mul< T >, grb::identities::zero, grb::identities::one >
-	>
-void print_norm( const grb::Vector< T > & r, const char * head, const Ring & ring ) {
+template<
+	typename T,
+	class Ring
+> void print_norm( const grb::Vector< T > & r, const char * head, const Ring & ring ) {
 	T norm = 0;
 	RC ret = grb::dot( norm, r, r, ring ); // norm = r' * r;
 	(void)ret;
 	assert( ret == SUCCESS );
 	if( head != nullptr ) {
-		std::cout << head << ": ";
 		printf(">>> %s: %lf\n", head, norm );
 	} else {
 		printf(">>> %lf\n", norm );
 	}
 }
+
+template< typename T > void print_norm( const grb::Vector< T > & r, const char * head ) {
+	return print_norm( r, head, StdRing() );
+}
 #endif
 
+
+
+
 /**
  * @brief Main test, building an HPCG problem and running the simulation closely following the
  * parameters in the reference HPCG test.
@@ -208,44 +293,49 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	// get user process ID
 	const size_t pid { spmd<>::pid() };
 	assert( pid < spmd<>::nprocs() );
+	if( pid == 0 ) {
+		thcout << "beginning input generation..." << std::endl;
+	}
 	grb::utils::Timer timer;
 
 	// assume successful run
 	out.error_code = SUCCESS;
-	RC rc { SUCCESS };
 
 	// wrap hpcg_data inside a unique_ptr to forget about cleaning chores
-	std::unique_ptr< hpcg_data< double, double, double > > hpcg_state;
-	if( pid == 0 ) {
-		thcout << "beginning input generation..." << std::endl;
-	}
+	std::unique_ptr< hpcg_data_t > hpcg_state;
+
+	hpcg_runner_t hpcg_runner( build_hpcg_runner< IOType, NonzeroType, InputType, ResidualType,
+		StdRing, StdMinus >( in.smoother_steps ) );
+	auto &mg_runner = hpcg_runner.mg_runner;
+	auto &coarsener = mg_runner.coarsener_runner;
+	auto &smoother = mg_runner.smoother_runner;
+	hpcg_runner.cg_opts.max_iterations = in.max_iterations;
+	hpcg_runner.cg_opts.tolerance = 0.0;
+	hpcg_runner.cg_opts.with_preconditioning = ! in.no_preconditioning;
+
 	timer.reset();
-	rc = build_3d_system( hpcg_state, in );
+	build_3d_system( in, mg_runner.system_levels, coarsener.coarsener_levels, smoother.levels, hpcg_state );
 	double input_duration { timer.time() };
 
-	if( rc != SUCCESS ) {
-		std::cerr << "Failure to generate the system (" << toString( rc ) << ")." << std::endl;
-		out.error_code = rc;
-		return;
-	}
 	if( pid == 0 ) {
 		thcout << "input generation time (ms): " << input_duration << std::endl;
 	}
 
 #ifdef HPCG_PRINT_SYSTEM
 	if( pid == 0 ) {
-		print_system( *hpcg_state );
+		print_system( mg_runner.system_levels, coarsener.coarsener_levels );
 	}
 #endif
 
-	Matrix< double > & A { hpcg_state->A };
-	Vector< double > & x { hpcg_state->x };
-	Vector< double > & b { hpcg_state->b };
+	Matrix< NonzeroType > & A { mg_runner.system_levels[ 0 ]->A };
+	Vector< NonzeroType > & x { hpcg_state->x };
+	Vector< NonzeroType > & b { hpcg_state->b };
 
+	RC rc { SUCCESS };
 	// set vectors as from standard HPCG benchmark
 	set( x, 1.0 );
 	set( b, 0.0 );
-	rc = grb::mxv( b, A, x, grb::Semiring< grb::operators::add< double >, grb::operators::mul< double >, grb::identities::zero, grb::identities::one >() );
+	rc = grb::mxv( b, A, x, StdRing() );
 	set( x, 0.0 );
 
 #ifdef HPCG_PRINT_SYSTEM
@@ -257,15 +347,15 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 
 	out.times.preamble = timer.time();
 
-	const bool with_preconditioning = ! in.no_preconditioning;
+	cg_out_data< NonzeroType > cg_out;
+	mg_data_t &grid_base = *mg_runner.system_levels[ 0 ];
 	if( in.evaluation_run ) {
 		out.test_repetitions = 0;
 		if( pid == 0 ) {
 			thcout << "beginning evaluation run..." << std::endl;
 		}
 		timer.reset();
-		rc = hpcg( *hpcg_state, with_preconditioning, in.smoother_steps, in.smoother_steps,
-			in.max_iterations, 0.0, out.performed_iterations, out.residual, false );
+		rc = hpcg_runner( grid_base, *hpcg_state, cg_out );
 		double single_time = timer.time();
 		if( rc == SUCCESS ) {
 			rc = collectives<>::reduce( single_time, 0, operators::max< double >() );
@@ -277,6 +367,8 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 		}
 		out.times.useful = single_time;
 		out.test_repetitions = static_cast< size_t >( 1000.0 / single_time ) + 1;
+		out.performed_iterations = cg_out.iterations;
+		out.residual = cg_out.norm_residual;
 
 		if( pid == 0 ) {
 			thcout << "Evaluation run" << std::endl;
@@ -293,15 +385,17 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	if( pid == 0 ) {
 		thcout << "beginning cold run..." << std::endl;
 	}
+	hpcg_runner.cg_opts.max_iterations = 1;
 	timer.reset();
-	rc = hpcg( *hpcg_state, with_preconditioning, in.smoother_steps, in.smoother_steps,
-		1, 0.0, out.performed_iterations, out.residual, false );
+	rc = hpcg_runner( grid_base, *hpcg_state, cg_out );
 	double iter_duration { timer.time() };
 	if( pid == 0 ) {
 		thcout << "cold run duration (ms): " << iter_duration << std::endl;
 	}
 
 
+	hpcg_runner.cg_opts.max_iterations = in.max_iterations;
+	hpcg_runner.cg_opts.print_iter_stats = in.print_iter_stats;
 	// do benchmark
 	for( size_t i = 0; i < in.test_repetitions && rc == SUCCESS; ++i ) {
 		rc = set( x, 0.0 );
@@ -310,8 +404,7 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 			thcout << "beginning iteration: " << i << std::endl;
 		}
 		timer.reset();
-		rc = hpcg( *hpcg_state, with_preconditioning, in.smoother_steps, in.smoother_steps,
-			in.max_iterations, 0.0, out.performed_iterations, out.residual, in.print_iter_stats );
+		rc = hpcg_runner( grid_base, *hpcg_state, cg_out );
 		iter_duration = timer.time();
 		out.times.useful += iter_duration;
 		if( pid == 0 ) {
@@ -324,6 +417,9 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	}
 	out.times.useful /= static_cast< double >( in.test_repetitions );
 
+	out.performed_iterations = cg_out.iterations;
+	out.residual = cg_out.norm_residual;
+
 	if( spmd<>::pid() == 0 ) {
 		if( rc == SUCCESS ) {
 			thcout << "repetitions, average time (ms): " << out.test_repetitions
@@ -339,15 +435,13 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	// set error code
 	out.error_code = rc;
 
-	Semiring< grb::operators::add< double >, grb::operators::mul< double >,
-		grb::identities::zero, grb::identities::one > ring;
 	grb::set( b, 1.0 );
 	out.square_norm_diff = 0.0;
-	grb::eWiseMul( b, -1.0, x, ring );
-	grb::dot( out.square_norm_diff, b, b, ring );
+	grb::eWiseMul( b, -1.0, x, StdRing() );
+	grb::dot( out.square_norm_diff, b, b, StdRing() );
 
 	// output
-	out.pinnedVector = std::unique_ptr< PinnedVector< double > >( new PinnedVector< double >( x, SEQUENTIAL ) );
+	out.pinnedVector = std::unique_ptr< PinnedVector< NonzeroType > >( new PinnedVector< NonzeroType >( x, SEQUENTIAL ) );
 	// finish timing
 	const double time_taken { timer.time() };
 	out.times.postamble = time_taken;
@@ -358,26 +452,11 @@ void grbProgram( const simulation_input & in, struct output & out ) {
  */
 static void parse_arguments( simulation_input &, size_t &, double &, int, char ** );
 
-#ifdef TEST_ITER
-static void test_iters();
-static void test_iters2();
-#endif
-
-void test_system_iter();
-
 int main( int argc, char ** argv ) {
 	simulation_input sim_in;
 	size_t test_outer_iterations;
 	double max_residual_norm;
 
-#ifdef TEST_ITER
-	test_iters();
-	test_iters2();
-	return 0;
-#endif
-	test_system_iter();
-	// return 0;
-
 	parse_arguments( sim_in, test_outer_iterations, max_residual_norm, argc, argv );
 	thcout << "System size x: " << sim_in.nx << std::endl;
 	thcout << "System size y: " << sim_in.ny << std::endl;
@@ -414,7 +493,8 @@ int main( int argc, char ** argv ) {
 	grb::Benchmarker< AUTOMATIC > benchmarker;
 	rc = benchmarker.exec( &grbProgram, sim_in, out, 1, test_outer_iterations, true );
 	ASSERT_RC_SUCCESS( rc );
-	thcout << "Benchmark completed successfully and took " << out.performed_iterations << " iterations to converge with residual " << out.residual << std::endl;
+	thcout << "Benchmark completed successfully and took " << out.performed_iterations
+		<< " iterations to converge with residual " << out.residual << std::endl;
 
 	if( ! out.pinnedVector ) {
 		thcerr << "no output vector to inspect" << std::endl;
@@ -457,8 +537,7 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 			"the execution of the algorithm)" )
 		.add_optional_argument( "--smoother-steps", sim_in.smoother_steps, SMOOTHER_STEPS_DEF, "number of pre/post-smoother steps; 0 disables smoothing" )
 		.add_option( "--evaluation-run", sim_in.evaluation_run, false,
-			"launch single run directly, without benchmarker (ignore "
-			"repetitions)" )
+			"launch single run directly, without benchmarker (ignore repetitions)" )
 		.add_option( "--no-preconditioning", sim_in.no_preconditioning, false, "do not apply pre-conditioning via multi-grid V cycle" )
 		.add_option( "--print-iter-stats", sim_in.print_iter_stats, false, "on each iteration, print more statistics" );
 
@@ -494,217 +573,3 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 		sim_in.max_iterations = 1;
 	}
 }
-
-
-void test_system_iter() {
-	constexpr size_t DIMS = 2;
-	using row_index_t = size_t;
-	std::array< row_index_t, DIMS > dims;
-	dims.fill( 4 );
-	grb::utils::geometry::linearized_halo_ndim_system< row_index_t, DIMS > system( dims, 1 );
-	grb::utils::geometry::linearized_halo_ndim_system< row_index_t, DIMS >::iterator begin = system.begin();
-
-	while( begin.has_more_elements() ) {
-		std::cout << "row " << begin->get_element_linear() << ": ";
-		while( begin.has_more_neighbours() ) {
-			std::cout << /* "-- " << */ begin->get_neighbor_linear() << " ";
-			begin.next_neighbour();
-		}
-		std::cout << std::endl;
-		begin.next_element();
-	}
-
-	std::vector< size_t > colors, counters;
-	color_matrix_greedy( system, colors, counters );
-
-	std::cout << "final assignment:" << std::endl;
-	for( size_t i = 0; i < colors.size(); i++ ){
-		std::cout << i << " -> " << colors[ i ] << ", ";
-	}
-	std::cout << std::endl;
-}
-
-
-
-struct NZ {
-	size_t i;
-	size_t j;
-	double v;
-
-	NZ( size_t _i, size_t _j, double _v ): i(_i), j(_j), v(_v) {}
-
-	bool operator!=( const NZ& o ) const {
-		return i != o.i || j != o.j || v != o.v;
-	}
-};
-
-#ifdef TEST_ITER
-static void test_iters() {
-
-	using clock = std::chrono::steady_clock;
-
-	constexpr size_t DIMS = 3;
-	using coord_t = size_t;
-
-	std::array< coord_t, DIMS > finer_sizes{ 1024, 1024, 1024};
-	std::array< coord_t, DIMS > coarser_sizes;
-	for( size_t i = 0; i < finer_sizes.size(); i++ ) {
-		coarser_sizes[ i ] = finer_sizes[ i ] / 2;
-	}
-
-	size_t rows { std::accumulate( coarser_sizes.cbegin(), coarser_sizes.cend(), 1UL, std::multiplies< size_t >() ) };
-
-	std::array< size_t, DIMS > lfiner_sizes{ 1024, 1024, 1024};
-	std::array< size_t, DIMS > lcoarser_sizes{};
-	for( size_t i = 0; i < lfiner_sizes.size(); i++ ) {
-		lcoarser_sizes[ i ] = lfiner_sizes[ i ] / 2;
-	}
-	grb::algorithms::old::coarsener_generator_iterator< DIMS, double > sbegin( lcoarser_sizes, lfiner_sizes, 0 );
-	grb::algorithms::old::coarsener_generator_iterator< DIMS, double > send( lcoarser_sizes, lfiner_sizes, rows );
-
-
-	using citer = hpcg_coarsener_builder< DIMS, coord_t, double >::hpcg_coarsener_iterator;
-	hpcg_coarsener_builder< DIMS, coord_t, double > coarsener( coarser_sizes, finer_sizes );
-	citer pbegin( coarsener.make_begin_iterator() );
-	const citer pend( coarsener.make_end_iterator() );
-
-	size_t num_elements = pend - pbegin;
-	std::cout << "number of elements: " << num_elements << std::endl;
-
-	std::vector< NZ > svalues;
-	svalues.reserve( num_elements);
-	typename clock::time_point start( clock::now() );
-	for( ; sbegin != send; ++sbegin ) {
-		// printf( "inserting %lu %lu\n", sbegin.i(), sbegin.j() );
-		svalues.emplace_back( sbegin.i(), sbegin.j(), sbegin.v() );
-	}
-	typename clock::time_point finish( clock::now() );
-	std::cout << "sequential generation time (ms): " <<
-		std::chrono::duration< double, std::milli >( finish - start ).count() << std::endl;
-
-
-
-
-	const size_t nthreads = omp_get_max_threads();
-	size_t per_thread_num = ( num_elements + nthreads - 1 ) / nthreads;
-	std::vector< std::vector< NZ > > tvalues( nthreads );
-	for( size_t i = 0; i < nthreads; i++ ) {
-		tvalues[i].reserve( per_thread_num );
-	}
-	start = clock::now();
-	#pragma omp parallel
-	{
-
-		int t = omp_get_thread_num();
-		std::vector< NZ > &tv = tvalues[ t ];
-		// printf( "thread %d, size %lu\n", t, tv.size() );
-		#pragma omp for schedule( static )
-		for( auto it = pbegin; it != pend; ++it ) {
-			tv.emplace_back( it.i(), it.j(), it.v() );
-			// printf( "thread %d: inserting %lu %lu\n", t, it.i(), it.j() );
-		}
-	}
-	finish = clock::now();
-	std::cout << "parallel generation time (ms): " <<
-		std::chrono::duration< double, std::milli >( finish - start ).count() << std::endl;
-
-	std::vector< NZ > pvalues;
-	for( const std::vector< NZ > &tv: tvalues ) {
-		pvalues.insert( pvalues.end(), tv.cbegin(), tv.cend() );
-	}
-
-
-	if( svalues.size() != pvalues.size() ) {
-		std::cout << "different sizes!" << std::endl;
-		std::exit(-1);
-	}
-
-	for( size_t i = 0; i < svalues.size(); i++ ) {
-		if( svalues[i] != pvalues[i] ) {
-			std::cout << "error at position " << i << std::endl;
-		}
-	}
-	std::cout << "all OK" << std::endl;
-}
-
-static void test_iters2() {
-
-	using clock = std::chrono::steady_clock;
-	using coord_t = size_t;
-
-	constexpr size_t DIMS = 3, halo_size = 1;
-	constexpr double diag_value = 26.0, non_diag_value = -1.0;
-
-	std::array< coord_t, DIMS > sys_sizes{ 64, 64, 64};
-	size_t n { std::accumulate( sys_sizes.cbegin(), sys_sizes.cend(), 1UL, std::multiplies< size_t >() ) };
-
-	std::array< size_t, DIMS > large_sys_sizes{ 64, 64, 64};
-	old::matrix_generator_iterator< DIMS, double > sbegin( large_sys_sizes, 0UL, halo_size, diag_value, non_diag_value );
-	old::matrix_generator_iterator< DIMS, double > send( large_sys_sizes, n, halo_size, diag_value, non_diag_value );
-
-	hpcg_builder< DIMS, coord_t, double > hpcg_system( sys_sizes, halo_size );
-	matrix_generator_iterator< DIMS, coord_t, double > pbegin(
-		hpcg_system.make_begin_iterator( diag_value, non_diag_value ) );
-	matrix_generator_iterator< DIMS, coord_t, double > pend(
-		hpcg_system.make_end_iterator( diag_value, non_diag_value )
-	);
-
-	size_t num_elements = pend - pbegin;
-	std::cout << "number of elements: " << num_elements << std::endl;
-
-	std::vector< NZ > svalues;
-	svalues.reserve( num_elements);
-	typename clock::time_point start( clock::now() );
-	for( ; sbegin != send; ++sbegin ) {
-		svalues.emplace_back( sbegin.i(), sbegin.j(), sbegin.v() );
-	}
-	typename clock::time_point finish( clock::now() );
-	std::cout << "sequential generation time (ms): " <<
-		std::chrono::duration< double, std::milli >( finish - start ).count() << std::endl;
-
-
-
-
-	const size_t nthreads = omp_get_max_threads();
-	size_t per_thread_num = ( num_elements + nthreads - 1 ) / nthreads;
-	std::vector< std::vector< NZ > > tvalues( nthreads );
-	for( size_t i = 0; i < nthreads; i++ ) {
-		tvalues[i].reserve( per_thread_num );
-	}
-	start = clock::now();
-	#pragma omp parallel
-	{
-
-		int t = omp_get_thread_num();
-		std::vector< NZ > &tv = tvalues[ t ];
-		// printf( "thread %d, size %lu\n", t, tv.size() );
-		#pragma omp for schedule( static )
-		for( auto it = pbegin; it != pend; ++it ) {
-			tv.emplace_back( it.i(), it.j(), it.v() );
-			// printf( "thread %d: inserting %lu %lu\n", t, it.i(), it.j() );
-		}
-	}
-	finish = clock::now();
-	std::cout << "parallel generation time (ms): " <<
-		std::chrono::duration< double, std::milli >( finish - start ).count() << std::endl;
-
-	std::vector< NZ > pvalues;
-	for( const std::vector< NZ > &tv: tvalues ) {
-		pvalues.insert( pvalues.end(), tv.cbegin(), tv.cend() );
-	}
-
-
-	if( svalues.size() != pvalues.size() ) {
-		std::cout << "different sizes!" << std::endl;
-		std::exit(-1);
-	}
-
-	for( size_t i = 0; i < svalues.size(); i++ ) {
-		if( svalues[i] != pvalues[i] ) {
-			std::cout << "error at position " << i << std::endl;
-		}
-	}
-
-	std::cout << "all OK" << std::endl;
-}
-#endif // TEST_ITER

From 8411dabe76167696f6b89f47b9236df12ed7b841 Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Fri, 18 Nov 2022 14:58:08 +0100
Subject: [PATCH 08/28]  reorganizing code in HPCG test to be more concise and
 clearer: - separating iterator for hpcg system generation and for coarsening
 to different files - little cleanups to geometry classes - renaming
 utils/geometry to utils/multigrid to more clearly indicate the intended usage

---
 ...rix_builders.hpp => coarsener_builder.hpp} | 270 +++-----------
 .../graphblas/algorithms/hpcg/coloring.hpp    |   8 +-
 include/graphblas/algorithms/hpcg/hpcg.hpp    |   6 +-
 .../algorithms/hpcg/system_builder.hpp        | 152 ++++++++
 .../algorithms/hpcg/system_building_utils.hpp | 187 +++++-----
 .../multigrid/multigrid_building_utils.hpp    |   2 +-
 .../array_vector_storage.hpp                  |  26 +-
 .../dynamic_vector_storage.hpp                |  11 +-
 .../halo_matrix_generator_iterator.hpp        |  29 +-
 .../linearized_halo_ndim_geometry.hpp         | 113 +++---
 .../linearized_halo_ndim_iterator.hpp         | 133 +------
 .../linearized_halo_ndim_system.hpp           |  25 +-
 .../linearized_ndim_iterator.hpp              |  24 +-
 .../linearized_ndim_system.hpp                |  26 +-
 .../{geometry => multigrid}/ndim_system.hpp   |  19 +-
 .../{geometry => multigrid}/ndim_vector.hpp   |  11 +-
 tests/smoke/hpcg.cpp                          | 349 ++++++++----------
 17 files changed, 593 insertions(+), 798 deletions(-)
 rename include/graphblas/algorithms/hpcg/{ndim_matrix_builders.hpp => coarsener_builder.hpp} (50%)
 create mode 100644 include/graphblas/algorithms/hpcg/system_builder.hpp
 rename include/graphblas/utils/{geometry => multigrid}/array_vector_storage.hpp (78%)
 rename include/graphblas/utils/{geometry => multigrid}/dynamic_vector_storage.hpp (94%)
 rename include/graphblas/utils/{geometry => multigrid}/halo_matrix_generator_iterator.hpp (89%)
 rename include/graphblas/utils/{geometry => multigrid}/linearized_halo_ndim_geometry.hpp (81%)
 rename include/graphblas/utils/{geometry => multigrid}/linearized_halo_ndim_iterator.hpp (63%)
 rename include/graphblas/utils/{geometry => multigrid}/linearized_halo_ndim_system.hpp (82%)
 rename include/graphblas/utils/{geometry => multigrid}/linearized_ndim_iterator.hpp (91%)
 rename include/graphblas/utils/{geometry => multigrid}/linearized_ndim_system.hpp (91%)
 rename include/graphblas/utils/{geometry => multigrid}/ndim_system.hpp (88%)
 rename include/graphblas/utils/{geometry => multigrid}/ndim_vector.hpp (95%)

diff --git a/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp b/include/graphblas/algorithms/hpcg/coarsener_builder.hpp
similarity index 50%
rename from include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp
rename to include/graphblas/algorithms/hpcg/coarsener_builder.hpp
index 5958ecb0d..2ee848039 100644
--- a/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp
+++ b/include/graphblas/algorithms/hpcg/coarsener_builder.hpp
@@ -15,42 +15,17 @@
  * limitations under the License.
  */
 
-/**
- * @file ndim_matrix_builders.hpp
- * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Utilities to build matrices for an HPCG simulation in a generic number of dimensions
- *
- * In particular, the main matrices are:
- * - a system matrix, generated from an N-dimenional space of coordinates by iterating along
- *   each dimension in priority order, where the first dimension has highest priority and the last
- *   dimension least priority; for each point (row), all its N-dimensional neighbours within
- *   a given distance are generated for the column
- * - a coarsening matrix, generated by iterating on a coarser system of N dimensions (row) and projecting
- *   each point to a corresponding system of finer sizes
- *
- * @date 2021-04-30
- */
-
-#ifndef _H_GRB_ALGORITHMS_NDIM_MATRIX_BUILDERS
-#define _H_GRB_ALGORITHMS_NDIM_MATRIX_BUILDERS
+#ifndef _H_GRB_ALGORITHMS_HPCG_COARSENER_BUILDER
+#define _H_GRB_ALGORITHMS_HPCG_COARSENER_BUILDER
 
-#include <algorithm>
-#include <array>
-#include <cstddef>
-#include <initializer_list>
-#include <numeric>
-#include <stdexcept>
-#include <string>
-#include <type_traits>
-#include <utility>
-#include <vector>
 #include <cstddef>
+#include <array>
 #include <iterator>
+#include <stdexcept>
+#include <cmath>
 
-#include <graphblas/utils/geometry/halo_matrix_generator_iterator.hpp>
-
-
-
+#include <graphblas/utils/multigrid/array_vector_storage.hpp>
+#include <graphblas/utils/multigrid/linearized_ndim_system.hpp>
 
 namespace grb {
 	namespace algorithms {
@@ -60,133 +35,7 @@ namespace grb {
 			typename CoordType,
 			typename ValueType
 		>
-		class HPCGBuilder {
-		public:
-
-			struct HPCGDiagGenerator {
-
-				ValueType _diag;
-				ValueType _non_diag;
-
-				HPCGDiagGenerator(
-					ValueType diag,
-					ValueType non_diag
-				) : _diag( diag ),
-				_non_diag( non_diag ) {}
-
-				HPCGDiagGenerator & operator=( const HPCGDiagGenerator & ) = default;
-
-				inline ValueType operator()( const CoordType &i, const CoordType &j ) const noexcept {
-					return j == i ? _diag: _non_diag;
-				}
-			};
-
-			using HaloSystemType = grb::utils::geometry::LinearizedHaloNDimSystem< CoordType, DIMS >;
-			using Iterator = geometry::HaloMatrixGeneratorIterator< DIMS, CoordType, ValueType, HPCGDiagGenerator >;
-
-			HPCGBuilder(
-				const std::array< CoordType, DIMS > &sizes,
-				CoordType _halo,
-				ValueType diag,
-				ValueType non_diag
-			) :
-				halo( _halo ),
-				system( sizes, _halo ),
-				_diag_generator( diag, non_diag )
-			{
-				if( _halo <= 0 ) {
-					throw std::invalid_argument( "halo should be higher than 0" );
-				}
-				for( const auto i : sizes ) {
-					if( i < 2 * _halo + 1 ) {
-						throw std::invalid_argument( "Iteration halo goes beyond system sizes" );
-					}
-				}
-			}
-
-
-			HPCGBuilder( const HPCGBuilder< DIMS, CoordType, ValueType > & ) = default;
-
-			HPCGBuilder( HPCGBuilder< DIMS, CoordType, ValueType > && ) = default;
-
-			HPCGBuilder< DIMS, CoordType, ValueType > & operator=( const HPCGBuilder< DIMS, CoordType, ValueType > & ) = default;
-
-			HPCGBuilder< DIMS, CoordType, ValueType > & operator=( HPCGBuilder< DIMS, CoordType, ValueType > && ) = default;
-
-			size_t system_size() const {
-				return system.base_system_size();
-			}
-
-			size_t num_neighbors() const {
-				return system.halo_system_size();
-			}
-
-			const HaloSystemType & get_generator() const {
-				return system;
-			}
-
-			Iterator make_begin_iterator() const {
-				return Iterator( system, _diag_generator );
-			}
-
-			Iterator make_end_iterator() const {
-				Iterator result( system, _diag_generator );
-				result += num_neighbors() - 1; // do not trigger boundary checks
-				++result;
-				return result;
-			}
-
-			ValueType get_diag_value() const {
-				return _diag_generator._diag;
-			}
-
-			ValueType get_non_diag_value() const {
-				return _diag_generator._non_diag;
-			}
-
-
-		private:
-			const CoordType halo;
-			HaloSystemType system;
-			HPCGDiagGenerator _diag_generator;
-		};
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-		template<
-			size_t DIMS,
-			typename CoordType,
-			typename T
-		>
-		class hpcg_coarsener_builder;
-
+		class HPCGCoarsenerBuilder;
 
 		/**
 		 * @brief Class to generate the coarsening matrix of an underlying \p DIMS -dimensional system.
@@ -204,27 +53,24 @@ namespace grb {
 		template<
 			size_t DIMS,
 			typename CoordType,
-			typename T
-		>
-		struct coarsener_generator_iterator {
+			typename ValueType
+		> struct HPCGCoarsenerGeneratorIterator {
 
-			friend hpcg_coarsener_builder< DIMS, CoordType, T >;
+			friend HPCGCoarsenerBuilder< DIMS, CoordType, ValueType >;
 
 			using RowIndexType = CoordType; ///< numeric type of rows
 			using ColumnIndexType = CoordType;
-			using ValueType = T;
+			using LinearSystemType = grb::utils::multigrid::LinearizedNDimSystem< CoordType,
+				grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > >;
+			using LinearSystemIterType = typename LinearSystemType::Iterator;
+			using SelfType = HPCGCoarsenerGeneratorIterator< DIMS, CoordType, ValueType >;
+			using ArrayType = std::array< CoordType, DIMS >;
 
-			using lin_system_t = grb::utils::geometry::LinearizedNDimSystem< CoordType,
-				grb::utils::geometry::ArrayVectorStorage< CoordType, DIMS > >;
-			using __iter_t = typename lin_system_t::Iterator;
-			using SelfType = coarsener_generator_iterator< DIMS, CoordType, T >;
-			using array_t = std::array< CoordType, DIMS >;
-
-			struct __value {
+			struct _HPCGValueGenerator {
 
 				friend SelfType;
 
-				__value(
+				_HPCGValueGenerator(
 					RowIndexType i,
 					ColumnIndexType j
 				) noexcept :
@@ -232,9 +78,9 @@ namespace grb {
 					_j( j )
 				{}
 
-				__value( const __value & ) = default;
+				_HPCGValueGenerator( const _HPCGValueGenerator & ) = default;
 
-				__value & operator=( const __value & ) = default;
+				_HPCGValueGenerator & operator=( const _HPCGValueGenerator & ) = default;
 
 				inline RowIndexType i() const { return _i; }
 				inline ColumnIndexType j() const { return _j; }
@@ -249,14 +95,14 @@ namespace grb {
 
 			// interface for std::random_access_iterator
 			using iterator_category = std::random_access_iterator_tag;
-			using value_type = __value;
+			using value_type = _HPCGValueGenerator;
 			using pointer = const value_type;
 			using reference = const value_type&;
-			using difference_type = typename __iter_t::difference_type;
+			using difference_type = typename LinearSystemIterType::difference_type;
 
-			coarsener_generator_iterator( const SelfType & o ) = default;
+			HPCGCoarsenerGeneratorIterator( const SelfType &o ) = default;
 
-			coarsener_generator_iterator( SelfType && o ) = default;
+			HPCGCoarsenerGeneratorIterator( SelfType &&o ) = default;
 
 			SelfType & operator=( const SelfType & ) = default;
 
@@ -333,20 +179,16 @@ namespace grb {
 				return _val.v();
 			}
 
-			const __iter_t & it() const {
-				return this->_sys_iter;
-			}
-
 		private:
 			//// incremented when incrementing the row coordinates; is is the ration between
 			//// #finer_sizes and row_generator#physical_sizes
-			const lin_system_t *_lin_sys;
-			const array_t *_steps; ///< array of steps, i.e. how much each column coordinate (finer system) must be
-			__iter_t _sys_iter;
+			const LinearSystemType *_lin_sys;
+			const ArrayType *_steps; ///< array of steps, i.e. how much each column coordinate (finer system) must be
+			LinearSystemIterType _sys_iter;
 			value_type _val;
 
 			/**
-			 * @brief Construct a new \c coarsener_generator_iterator object from the coarser and finer sizes,
+			 * @brief Construct a new \c HPCGCoarsenerGeneratorIterator object from the coarser and finer sizes,
 			 * setting its row at \p _current_row and the column at the corresponding value.
 			 *
 			 * Each finer size <b>must be an exact multiple of the corresponding coarser size</b>, otherwise the
@@ -356,9 +198,9 @@ namespace grb {
 			 * @param _finer_sizes sizes of the finer system (columns)
 			 * @param _current_row row (in the coarser system) to set the iterator on
 			 */
-			coarsener_generator_iterator(
-				const lin_system_t &system,
-				const array_t &steps
+			HPCGCoarsenerGeneratorIterator(
+				const LinearSystemType &system,
+				const ArrayType &steps
 			) noexcept :
 				_lin_sys( &system ),
 				_steps( &steps ),
@@ -389,70 +231,66 @@ namespace grb {
 			}
 		};
 
-
 		template<
 			size_t DIMS,
 			typename CoordType,
-			typename T
-		>
-		class hpcg_coarsener_builder {
+			typename ValueType
+		> class HPCGCoarsenerBuilder {
 		public:
+			using ArrayType = std::array< CoordType, DIMS >;
+			using Iterator = HPCGCoarsenerGeneratorIterator< DIMS, CoordType, ValueType >;
+			using SelfType = HPCGCoarsenerBuilder< DIMS, CoordType, ValueType >;
 
-			using array_t = std::array< CoordType, DIMS >;
-			using hpcg_coarsener_iterator = coarsener_generator_iterator< DIMS, CoordType, T >;
-
-			hpcg_coarsener_builder(
-				const array_t &_coarser_sizes,
-				const array_t &_finer_sizes
+			HPCGCoarsenerBuilder(
+				const ArrayType &_finer_sizes,
+				const ArrayType &_coarser_sizes
 			) : system( _coarser_sizes.begin(), _coarser_sizes.end() ) {
 				for( size_t i { 0 }; i < DIMS; i++ ) {
 					// finer size MUST be an exact multiple of coarser_size
-					size_t step { _finer_sizes[ i ] / _coarser_sizes[ i ] };
-					if( step == 0 || _finer_sizes[ i ] / step != _coarser_sizes[ i ] ) {
+					std::ldiv_t ratio = std::ldiv( _finer_sizes[ i ], _coarser_sizes[ i ] );
+					if( ratio.quot < 2 || ratio.rem != 0 ) {
 						throw std::invalid_argument(
 							std::string( "finer size of dimension " ) + std::to_string( i ) +
 							std::string( "is not an exact multiple of coarser size" )
 						);
 					}
-					steps[ i ] = step;
+					steps[ i ] = ratio.quot;
 				}
 			}
 
-			hpcg_coarsener_builder( const hpcg_coarsener_builder< DIMS, CoordType, T> & ) = delete;
+			HPCGCoarsenerBuilder( const SelfType & ) = delete;
 
-			hpcg_coarsener_builder( hpcg_coarsener_builder< DIMS, CoordType, T> && ) = delete;
+			HPCGCoarsenerBuilder( SelfType && ) = delete;
 
-			hpcg_coarsener_builder< DIMS, CoordType, T> & operator=( const hpcg_coarsener_builder< DIMS, CoordType, T> & ) = delete;
+			SelfType & operator=( const SelfType & ) = delete;
 
-			hpcg_coarsener_builder< DIMS, CoordType, T> & operator=( hpcg_coarsener_builder< DIMS, CoordType, T> && ) = delete;
+			SelfType & operator=( SelfType && ) = delete;
 
 			size_t system_size() const {
 				return system.system_size();
 			}
 
-			hpcg_coarsener_iterator make_begin_iterator() {
-				return hpcg_coarsener_iterator( system, steps );
+			Iterator make_begin_iterator() {
+				return Iterator( system, steps );
 			}
 
-			hpcg_coarsener_iterator make_end_iterator() {
-				hpcg_coarsener_iterator result( system, steps );
+			Iterator make_end_iterator() {
+				Iterator result( system, steps );
 				result += system_size() - 1; // do not trigger boundary checks
 				++result;
 				return result;
 			}
 
 		private:
-			const grb::utils::geometry::LinearizedNDimSystem< CoordType,
-				grb::utils::geometry::ArrayVectorStorage< CoordType, DIMS > > system;
+			const grb::utils::multigrid::LinearizedNDimSystem< CoordType,
+				grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > > system;
 
-			array_t steps; ///< array of steps, i.e. how much each column coordinate (finer system) must be
+			ArrayType steps; ///< array of steps, i.e. how much each column coordinate (finer system) must be
 			//// incremented when incrementing the row coordinates; is is the ration between
 			//// #finer_sizes and row_generator#physical_sizes
 		};
 
-
 	} // namespace algorithms
 } // namespace grb
-
-#endif // _H_GRB_ALGORITHMS_NDIM_MATRIX_BUILDERS
+#endif // _H_GRB_ALGORITHMS_HPCG_COARSENER_BUILDER
 
diff --git a/include/graphblas/algorithms/hpcg/coloring.hpp b/include/graphblas/algorithms/hpcg/coloring.hpp
index f9334afb3..f5793b6ca 100644
--- a/include/graphblas/algorithms/hpcg/coloring.hpp
+++ b/include/graphblas/algorithms/hpcg/coloring.hpp
@@ -22,7 +22,7 @@
 #include <vector>
 #include <cstddef>
 
-#include <graphblas/utils/geometry/linearized_halo_ndim_system.hpp>
+#include <graphblas/utils/multigrid/linearized_halo_ndim_system.hpp>
 
 namespace grb {
 	namespace algorithms {
@@ -62,8 +62,8 @@ namespace grb {
 		template<
 			size_t DIMS,
 			typename CoordType
-		> void color_matrix_greedy(
-			const grb::utils::geometry::LinearizedHaloNDimSystem< CoordType, DIMS > &system,
+		> void hpcg_greedy_color_ndim_system(
+			const grb::utils::multigrid::LinearizedHaloNDimSystem< DIMS, CoordType > &system,
 			std::vector< CoordType > &row_colors,
 			std::vector< CoordType > &color_counters,
 			bool reorder_rows_per_color = false
@@ -78,7 +78,7 @@ namespace grb {
 			row_colors[0] = 0; // first point gets color 0
 
 			// Finds colors in a greedy (a likely non-optimal) fashion.
-			typename grb::utils::geometry::LinearizedHaloNDimSystem< CoordType, DIMS >::Iterator begin = system.begin();
+			typename grb::utils::multigrid::LinearizedHaloNDimSystem< DIMS, CoordType >::Iterator begin = system.begin();
 			begin.next_element(); // skip first row
 
 			while( begin.has_more_elements() ) {
diff --git a/include/graphblas/algorithms/hpcg/hpcg.hpp b/include/graphblas/algorithms/hpcg/hpcg.hpp
index 2d30584fe..9d65aa79f 100644
--- a/include/graphblas/algorithms/hpcg/hpcg.hpp
+++ b/include/graphblas/algorithms/hpcg/hpcg.hpp
@@ -15,8 +15,8 @@
  * limitations under the License.
  */
 
-#ifndef _H_GRB_ALGORITHMS_HPCG
-#define _H_GRB_ALGORITHMS_HPCG
+#ifndef _H_GRB_ALGORITHMS_HPCG_HPCG
+#define _H_GRB_ALGORITHMS_HPCG_HPCG
 
 #include <utility>
 
@@ -71,4 +71,4 @@ namespace grb {
 	} // namespace algorithms
 } // namespace grb
 
-#endif // _H_GRB_ALGORITHMS_HPCG
+#endif // _H_GRB_ALGORITHMS_HPCG_HPCG
diff --git a/include/graphblas/algorithms/hpcg/system_builder.hpp b/include/graphblas/algorithms/hpcg/system_builder.hpp
new file mode 100644
index 000000000..700718e3b
--- /dev/null
+++ b/include/graphblas/algorithms/hpcg/system_builder.hpp
@@ -0,0 +1,152 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file system_builder.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * @brief Utilities to build matrices for an HPCG simulation in a generic number of dimensions
+ *
+ * In particular, the main matrices are:
+ * - a system matrix, generated from an N-dimensional space of coordinates by iterating along
+ *   each dimension in priority order, where the first dimension has highest priority and the last
+ *   dimension least priority; for each point (row), all its N-dimensional neighbours within
+ *   a given distance are generated for the column
+ * - a coarsening matrix, generated by iterating on a coarser system of N dimensions (row) and projecting
+ *   each point to a corresponding system of finer sizes
+ *
+ * @date 2021-04-30
+ */
+
+#ifndef _H_GRB_ALGORITHMS_HPCG_SYSTEM_BUILDER
+#define _H_GRB_ALGORITHMS_HPCG_SYSTEM_BUILDER
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <initializer_list>
+#include <numeric>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+#include <cstddef>
+#include <iterator>
+
+#include <graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp>
+
+namespace grb {
+	namespace algorithms {
+
+		template<
+			size_t DIMS,
+			typename CoordType,
+			typename ValueType
+		> class HPCGSystemBuilder {
+		public:
+			struct HPCGDiagGenerator {
+
+				HPCGDiagGenerator(
+					ValueType diag,
+					ValueType non_diag
+				) noexcept :
+					_diag( diag ),
+					_non_diag( non_diag ) {}
+
+				HPCGDiagGenerator & operator=( const HPCGDiagGenerator & ) = default;
+
+				inline ValueType operator()( const CoordType &i, const CoordType &j ) const noexcept {
+					return j == i ? _diag: _non_diag;
+				}
+
+				ValueType _diag;
+				ValueType _non_diag;
+			};
+
+			using HaloSystemType = grb::utils::multigrid::LinearizedHaloNDimSystem< DIMS, CoordType >;
+			using Iterator = grb::utils::multigrid::HaloMatrixGeneratorIterator< DIMS, CoordType,
+				ValueType, HPCGDiagGenerator >;
+
+			HPCGSystemBuilder(
+				const std::array< CoordType, DIMS > &sizes,
+				CoordType halo,
+				ValueType diag,
+				ValueType non_diag
+			) :
+				_system( sizes, halo ),
+				_diag_generator( diag, non_diag )
+			{
+				if( halo <= 0 ) {
+					throw std::invalid_argument( "halo should be higher than 0" );
+				}
+				for( const auto i : sizes ) {
+					if( i < 2 * halo + 1 ) {
+						throw std::invalid_argument( "Iteration halo goes beyond system sizes" );
+					}
+				}
+			}
+
+			HPCGSystemBuilder( const HPCGSystemBuilder< DIMS, CoordType, ValueType > & ) = default;
+
+			HPCGSystemBuilder( HPCGSystemBuilder< DIMS, CoordType, ValueType > && ) = default;
+
+			HPCGSystemBuilder< DIMS, CoordType, ValueType > & operator=( const HPCGSystemBuilder< DIMS, CoordType, ValueType > & ) = default;
+
+			HPCGSystemBuilder< DIMS, CoordType, ValueType > & operator=( HPCGSystemBuilder< DIMS, CoordType, ValueType > && ) = default;
+
+			size_t system_size() const {
+				return _system.base_system_size();
+			}
+
+			size_t num_neighbors() const {
+				return _system.halo_system_size();
+			}
+
+			const HaloSystemType & get_generator() const {
+				return _system;
+			}
+
+			Iterator make_begin_iterator() const {
+				return Iterator( _system, _diag_generator );
+			}
+
+			Iterator make_end_iterator() const {
+				Iterator result( _system, _diag_generator );
+				result += num_neighbors() - 1; // do not trigger boundary checks
+				++result;
+				return result;
+			}
+
+			ValueType get_diag_value() const {
+				return _diag_generator._diag;
+			}
+
+			ValueType get_non_diag_value() const {
+				return _diag_generator._non_diag;
+			}
+
+
+		private:
+			HaloSystemType _system;
+			HPCGDiagGenerator _diag_generator;
+		};
+
+	} // namespace algorithms
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_HPCG_SYSTEM_BUILDER
+
diff --git a/include/graphblas/algorithms/hpcg/system_building_utils.hpp b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
index 7a8db963d..088bb9fb3 100644
--- a/include/graphblas/algorithms/hpcg/system_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
@@ -37,14 +37,73 @@
 #include <graphblas.hpp>
 #include <graphblas/utils/iterators/partition_range.hpp>
 
-#include "ndim_matrix_builders.hpp"
-
+#include "system_builder.hpp"
+#include "coarsener_builder.hpp"
 #include "coloring.hpp"
 
 namespace grb {
 	namespace algorithms {
 
-		template< typename CoordType > void split_rows_by_color(
+		/**
+		 * @brief Container of the parameters for HPCG simulation generation: physical system characteristics and
+		 * coarsening information.
+		 *
+		 * @tparam DIMS dimensions of the physical system
+		 * @tparam NonzeroType type of matrix values
+		 */
+		template<
+			size_t DIMS,
+			typename NonzeroType
+		> struct hpcg_system_params {
+			std::array< size_t, DIMS > physical_sys_sizes;
+			size_t halo_size;
+			NonzeroType diag_value;
+			NonzeroType non_diag_value;
+			size_t min_phys_size;
+			size_t max_levels;
+			size_t coarsening_step;
+		};
+
+		template<
+			size_t DIMS,
+			typename CoordType,
+			typename NonzeroType
+		> void hpcg_build_multigrid_generators(
+			const hpcg_system_params< DIMS, NonzeroType > &params,
+			std::vector< grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > > &mg_generators
+		) {
+			static_assert( DIMS > 0, "DIMS must be > 0" );
+
+			size_t const current_size{ std::accumulate( params.physical_sys_sizes.cbegin(), params.physical_sys_sizes.cend(), 1UL,
+				std::multiplies< size_t >() ) };
+			if( current_size > std::numeric_limits< CoordType >::max() ) {
+				throw std::domain_error( "CoordT cannot store the matrix coordinates" );
+			}
+			size_t min_physical_size { *std::min_element( params.physical_sys_sizes.cbegin(), params.physical_sys_sizes.cend() ) };
+			if( min_physical_size < params.min_phys_size ) {
+				throw std::domain_error( "the initial system is too small" );
+			}
+
+			std::array< CoordType, DIMS > coord_sizes;
+			// type-translate coordinates
+			std::copy( params.physical_sys_sizes.cbegin(), params.physical_sys_sizes.cend(), coord_sizes.begin() );
+
+			// generate hierarchical coarseners
+			for( size_t coarsening_level = 0UL;
+				min_physical_size >= params.min_phys_size && coarsening_level <= params.max_levels;
+				coarsening_level++ ) {
+
+				// build generator
+				mg_generators.emplace_back( coord_sizes, params.halo_size, params.diag_value, params.non_diag_value );
+
+				// prepare for new iteration
+				min_physical_size /= params.coarsening_step;
+				std::for_each( coord_sizes.begin(), coord_sizes.end(),
+					[ &params ]( CoordType &v ){ v /= params.coarsening_step; });
+			}
+		}
+
+		template< typename CoordType > void hpcg_split_rows_by_color(
 			const std::vector< CoordType > & row_colors,
 			size_t num_colors,
 			std::vector< std::vector< CoordType > > & per_color_rows
@@ -57,11 +116,11 @@ namespace grb {
 
 		template <
 			size_t DIMS,
-			typename coord_t,
+			typename CoordType,
 			typename NonzeroType,
 			enum grb::Backend B
-		> grb::RC populate_system_matrix(
-			const grb::algorithms::HPCGBuilder< DIMS, coord_t, NonzeroType > &system_generator,
+		> grb::RC hpcg_populate_system_matrix(
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &system_generator,
 			grb::Matrix< NonzeroType, B > &M
 		) {
 			const size_t pid { spmd<>::pid() };
@@ -69,30 +128,29 @@ namespace grb {
 			if( pid == 0) {
 				std::cout << "- generating system matrix...";
 			}
-			typename grb::algorithms::HPCGBuilder< DIMS, coord_t, NonzeroType >::Iterator begin(
+			typename grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType >::Iterator begin(
 				system_generator.make_begin_iterator() );
-			typename grb::algorithms::HPCGBuilder< DIMS, coord_t, NonzeroType >::Iterator end(
+			typename grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType >::Iterator end(
 				system_generator.make_end_iterator()
 			);
 			grb::utils::partition_iteration_range_on_procs( system_generator.num_neighbors(), begin, end );
 			return buildMatrixUnique( M, begin, end, grb::IOMode::PARALLEL );
 		}
 
-
 		template<
-			typename coord_t,
 			size_t DIMS,
+			typename CoordType,
 			typename IOType,
 			typename NonzeroType
-		> grb::RC populate_coarsener(
-			const grb::algorithms::HPCGBuilder< DIMS, coord_t, NonzeroType > &finer_system_generator,
-			const grb::algorithms::HPCGBuilder< DIMS, coord_t, NonzeroType > &coarser_system_generator,
+		> grb::RC hpcg_populate_coarsener(
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &finer_system_generator,
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &coarser_system_generator,
 			coarsening_data< IOType, NonzeroType > &coarsener
 		) {
 			static_assert( DIMS > 0, "DIMS must be > 0" );
 
-			const std::array< coord_t, DIMS > &finer_sizes = finer_system_generator.get_generator().get_sizes();
-			const std::array< coord_t, DIMS > &coarser_sizes = coarser_system_generator.get_generator().get_sizes();
+			const std::array< CoordType, DIMS > &finer_sizes = finer_system_generator.get_generator().get_sizes();
+			const std::array< CoordType, DIMS > &coarser_sizes = coarser_system_generator.get_generator().get_sizes();
 			const size_t finer_size = finer_system_generator.system_size();
 			const size_t coarser_size = coarser_system_generator.system_size();
 
@@ -105,21 +163,15 @@ namespace grb {
 
 			assert( finer_sizes.size() == coarser_sizes.size() );
 
-			for( size_t i { 0 }; i < coarser_sizes.size(); i++ ) {
-				std::ldiv_t ratio = std::ldiv( finer_sizes[ i ], coarser_sizes[ i ] );
-				if( ratio.quot < 2 || ratio.rem != 0 ) {
-					throw std::invalid_argument( "finer sizes should be a multiple of coarser sizes" );
-				}
-			}
 			grb::Matrix< NonzeroType > &M = coarsener.coarsening_matrix;
 			if( grb::nrows( M ) != rows || grb::ncols( M ) != cols ) {
 				throw std::invalid_argument( "wrong matrix dimensions: matrix should be rectangular"
 											" with rows == <coarser size> and cols == <finer size>" );
 			}
 
-			grb::algorithms::hpcg_coarsener_builder< DIMS, coord_t, NonzeroType > coarsener_builder( coarser_sizes, finer_sizes );
-			grb::algorithms::coarsener_generator_iterator< DIMS, coord_t, NonzeroType > begin( coarsener_builder.make_begin_iterator() );
-			grb::algorithms::coarsener_generator_iterator< DIMS, coord_t, NonzeroType > end( coarsener_builder.make_end_iterator() );
+			grb::algorithms::HPCGCoarsenerBuilder< DIMS, CoordType, NonzeroType > coarsener_builder( finer_sizes, coarser_sizes );
+			grb::algorithms::HPCGCoarsenerGeneratorIterator< DIMS, CoordType, NonzeroType > begin( coarsener_builder.make_begin_iterator() );
+			grb::algorithms::HPCGCoarsenerGeneratorIterator< DIMS, CoordType, NonzeroType > end( coarsener_builder.make_end_iterator() );
 			grb::utils::partition_iteration_range_on_procs( coarsener_builder.system_size(), begin, end );
 			return buildMatrixUnique( M, begin, end, grb::IOMode::PARALLEL );
 		}
@@ -196,7 +248,7 @@ namespace grb {
 			 * @return grb::RC the success value returned when trying to build the vector
 			 */
 			template< enum grb::Backend B >
-			grb::RC build_static_color_masks(
+			grb::RC hpcg_build_static_color_masks(
 				size_t matrix_size,
 				const std::vector< std::vector< size_t > > &per_color_rows,
 				std::vector< grb::Vector< bool, B > > & masks
@@ -206,7 +258,7 @@ namespace grb {
 				}
 				for( size_t i = 0; i < per_color_rows.size(); i++ ) {
 					const std::vector< size_t > & rows = per_color_rows[ i ];
-					/*
+#ifdef _DEBUG
 					{
 						std::cout << "\ncolor " << i << std::endl;
 						for( size_t row : rows ) {
@@ -214,7 +266,7 @@ namespace grb {
 						}
 						std::cout << std::endl;
 					}
-					*/
+#endif
 					masks.emplace_back( matrix_size );
 					grb::Vector< bool > & output_mask = masks.back();
 					std::vector< size_t >::const_iterator begin = rows.cbegin();
@@ -227,7 +279,7 @@ namespace grb {
 							<< toString( rc ) << std::endl;
 						return rc;
 					}
-					/*
+#ifdef _DEBUG
 					{
 						std::cout << "mask color " << i << std::endl;
 						size_t count = 0;
@@ -238,7 +290,7 @@ namespace grb {
 						}
 						std::cout << std::endl;
 					}
-					*/
+#endif
 				}
 				return grb::SUCCESS;
 			}
@@ -246,12 +298,12 @@ namespace grb {
 		} // namespace internal
 
 		template<
-			typename coord_t,
 			size_t DIMS,
-			typename T
-		> grb::RC populate_smoothing_data(
-			const grb::algorithms::HPCGBuilder< DIMS, coord_t, T > &system_generator,
-			smoother_data< T > &smoothing_info
+			typename CoordType,
+			typename NonzeroType
+		> grb::RC hpcg_populate_smoothing_data(
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &system_generator,
+			smoother_data< NonzeroType > &smoothing_info
 		) {
 			const size_t pid { spmd<>::pid() };
 
@@ -266,10 +318,10 @@ namespace grb {
 			if( pid == 0 ) {
 				std::cout << "- running coloring heuristics...";
 			}
-			std::vector< coord_t > colors, color_counters;
-			color_matrix_greedy( system_generator.get_generator(), colors, color_counters );
-			std::vector< std::vector< coord_t > > per_color_rows;
-			split_rows_by_color( colors, color_counters.size(), per_color_rows );
+			std::vector< CoordType > colors, color_counters;
+			hpcg_greedy_color_ndim_system( system_generator.get_generator(), colors, color_counters );
+			std::vector< std::vector< CoordType > > per_color_rows;
+			hpcg_split_rows_by_color( colors, color_counters.size(), per_color_rows );
 			if( rc != grb::SUCCESS ) {
 				if( pid == 0 ) {
 					std::cout << "error: " << __LINE__ << std::endl;
@@ -280,67 +332,10 @@ namespace grb {
 				std::cout <<"- found " << color_counters.size() << " colors,"
 					<< " generating color masks...";
 			}
-			return internal::build_static_color_masks( system_generator.system_size(),
+			return internal::hpcg_build_static_color_masks( system_generator.system_size(),
 				per_color_rows, smoothing_info.color_masks );
 		}
 
-		/**
-		 * @brief Container of the parameter for HPCG simulation generation: physical system characteristics and
-		 * coarsening information.
-		 *
-		 * @tparam DIMS dimensions of the physical system
-		 * @tparam T type of matrix values
-		 */
-		template< size_t DIMS, typename T >
-		struct hpcg_system_params {
-			std::array< size_t, DIMS > physical_sys_sizes;
-			size_t halo_size;
-			T diag_value;
-			T non_diag_value;
-			size_t min_phys_size;
-			size_t max_levels;
-			size_t coarsening_step;
-		};
-
-		template<
-			size_t DIMS,
-			typename coord_t,
-			typename T
-		> void build_hpcg_multigrid_generators(
-			const hpcg_system_params< DIMS, T > &params,
-			std::vector< grb::algorithms::HPCGBuilder< DIMS, coord_t, T > > &mg_generators
-		) {
-			static_assert( DIMS > 0, "DIMS must be > 0" );
-
-			size_t const current_size{ std::accumulate( params.physical_sys_sizes.cbegin(), params.physical_sys_sizes.cend(), 1UL,
-				std::multiplies< size_t >() ) };
-			if( current_size > std::numeric_limits< coord_t >::max() ) {
-				throw std::domain_error( "CoordT cannot store the matrix coordinates" );
-			}
-			size_t min_physical_size { *std::min_element( params.physical_sys_sizes.cbegin(), params.physical_sys_sizes.cend() ) };
-			if( min_physical_size < params.min_phys_size ) {
-				throw std::domain_error( "the initial system is too small" );
-			}
-
-			std::array< coord_t, DIMS > coord_sizes;
-			// type-translate coordinates
-			std::copy( params.physical_sys_sizes.cbegin(), params.physical_sys_sizes.cend(), coord_sizes.begin() );
-
-			// generate linked list of hierarchical coarseners
-			for( size_t coarsening_level = 0UL;
-				min_physical_size >= params.min_phys_size && coarsening_level <= params.max_levels;
-				coarsening_level++ ) {
-
-				// build generator
-				mg_generators.emplace_back( coord_sizes, params.halo_size, params.diag_value, params.non_diag_value );
-
-				// prepare for new iteration
-				min_physical_size /= params.coarsening_step;
-				std::for_each( coord_sizes.begin(), coord_sizes.end(),
-					[ &params ]( coord_t &v ){ v /= params.coarsening_step; });
-			}
-		}
-
 	} // namespace algorithms
 } // namespace grb
 
diff --git a/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp b/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp
index 714555426..34347582e 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp
@@ -29,7 +29,7 @@ namespace grb {
 			typename MGInfoType,
 			typename CoarsenerInfoType,
 			typename SmootherInfoType
-		> void allocate_multigrid_data(
+		> void multigrid_allocate_data(
 			const std::vector< size_t > &mg_sizes,
 			std::vector< std::unique_ptr< MGInfoType > > &system_levels,
 			std::vector< std::unique_ptr< CoarsenerInfoType > > &coarsener_levels,
diff --git a/include/graphblas/utils/geometry/array_vector_storage.hpp b/include/graphblas/utils/multigrid/array_vector_storage.hpp
similarity index 78%
rename from include/graphblas/utils/geometry/array_vector_storage.hpp
rename to include/graphblas/utils/multigrid/array_vector_storage.hpp
index 45fbab04e..8eb1e4377 100644
--- a/include/graphblas/utils/geometry/array_vector_storage.hpp
+++ b/include/graphblas/utils/multigrid/array_vector_storage.hpp
@@ -24,8 +24,8 @@
  * @date 2022-10-24
  */
 
-#ifndef _H_GRB_ALGORITHMS_GEOMETRY_ARRAY_VECTOR_STORAGE
-#define _H_GRB_ALGORITHMS_GEOMETRY_ARRAY_VECTOR_STORAGE
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_ARRAY_VECTOR_STORAGE
+#define _H_GRB_ALGORITHMS_MULTIGRID_ARRAY_VECTOR_STORAGE
 
 #include <array>
 #include <stdexcept>
@@ -34,7 +34,7 @@
 
 namespace grb {
 	namespace utils {
-		namespace geometry {
+		namespace multigrid {
 
 			/**
 			 * Array with fixed size based on std::array with an interface compliant to what other classes
@@ -46,14 +46,14 @@ namespace grb {
 			 * @tparam DIMS the dimensions of the vector
 			 */
 			template<
-				typename DataType,
-				size_t DIMS
+				size_t DIMS,
+				typename DataType
 			> class ArrayVectorStorage: public std::array< DataType, DIMS > {
-
 			public:
 
 				using VectorStorageType = std::array< DataType, DIMS >&;
 				using ConstVectorStorageType = const std::array< DataType, DIMS >&;
+				using SelfType = ArrayVectorStorage< DIMS, DataType >;
 
 				ArrayVectorStorage( size_t _dimensions ) {
 					static_assert( DIMS > 0, "cannot allocate 0-sized array" );
@@ -65,20 +65,20 @@ namespace grb {
 				ArrayVectorStorage() = delete;
 
 				// only copy constructor/assignment, since there's no external storage
-				ArrayVectorStorage( const ArrayVectorStorage< DataType, DIMS > &o ) noexcept {
+				ArrayVectorStorage( const SelfType &o ) noexcept {
 					std::copy_n( o.cbegin(), DIMS, this->begin() );
 				}
 
-				ArrayVectorStorage( ArrayVectorStorage< DataType, DIMS > &&o ) = delete;
+				ArrayVectorStorage( SelfType &&o ) = delete;
 
-				ArrayVectorStorage< DataType, DIMS >& operator=(
-					const ArrayVectorStorage< DataType, DIMS > &original
+				SelfType& operator=(
+					const SelfType &original
 				) noexcept {
 					std::copy_n( original.begin(), DIMS, this->begin() );
 					return *this;
 				}
 
-				ArrayVectorStorage< DataType, DIMS >& operator=( ArrayVectorStorage< DataType, DIMS > &&original ) = delete;
+				SelfType & operator=( SelfType &&original ) = delete;
 
 				constexpr size_t dimensions() const {
 					return DIMS;
@@ -93,8 +93,8 @@ namespace grb {
 				}
 			};
 
-		} // namespace geometry
+		} // namespace multigrid
 	} // namespace utils
 } // namespace grb
 
-#endif // _H_GRB_ALGORITHMS_GEOMETRY_ARRAY_VECTOR_STORAGE
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_ARRAY_VECTOR_STORAGE
diff --git a/include/graphblas/utils/geometry/dynamic_vector_storage.hpp b/include/graphblas/utils/multigrid/dynamic_vector_storage.hpp
similarity index 94%
rename from include/graphblas/utils/geometry/dynamic_vector_storage.hpp
rename to include/graphblas/utils/multigrid/dynamic_vector_storage.hpp
index a0def1980..9168f175c 100644
--- a/include/graphblas/utils/geometry/dynamic_vector_storage.hpp
+++ b/include/graphblas/utils/multigrid/dynamic_vector_storage.hpp
@@ -15,8 +15,8 @@
  * limitations under the License.
  */
 
-#ifndef _H_GRB_ALGORITHMS_GEOMETRY_DYNAMIC_VECTOR_STORAGE
-#define _H_GRB_ALGORITHMS_GEOMETRY_DYNAMIC_VECTOR_STORAGE
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_DYNAMIC_VECTOR_STORAGE
+#define _H_GRB_ALGORITHMS_MULTIGRID_DYNAMIC_VECTOR_STORAGE
 
 #include <cstddef>
 #include <cstddef>
@@ -32,7 +32,7 @@
 
 namespace grb {
 	namespace utils {
-		namespace geometry {
+		namespace multigrid {
 
 			/**
 			 * Array with fixed size (i.e. decided at object creation) allocated on the heap with an interface compliant
@@ -54,7 +54,6 @@ namespace grb {
 				}
 
 			public:
-
 				// iterator fields
 				using reference = DataType&;
 				using const_reference = const DataType&;
@@ -147,8 +146,8 @@ namespace grb {
 				}
 			};
 
-		} // namespace geometry
+		} // namespace multigrid
 	} // namespace utils
 } // namespace grb
 
-#endif // _H_GRB_ALGORITHMS_GEOMETRY_DYNAMIC_VECTOR_STORAGE
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_DYNAMIC_VECTOR_STORAGE
diff --git a/include/graphblas/utils/geometry/halo_matrix_generator_iterator.hpp b/include/graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp
similarity index 89%
rename from include/graphblas/utils/geometry/halo_matrix_generator_iterator.hpp
rename to include/graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp
index 6eb469f21..2bd82ff35 100644
--- a/include/graphblas/utils/geometry/halo_matrix_generator_iterator.hpp
+++ b/include/graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp
@@ -1,6 +1,6 @@
 
-#ifndef _H_GRB_ALGORITHMS_GEOMETRY_HALO_MATRIX_GENRATOR_ITERATOR
-#define _H_GRB_ALGORITHMS_GEOMETRY_HALO_MATRIX_GENRATOR_ITERATOR
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_HALO_MATRIX_GENRATOR_ITERATOR
+#define _H_GRB_ALGORITHMS_MULTIGRID_HALO_MATRIX_GENRATOR_ITERATOR
 
 #include <cstddef>
 
@@ -10,8 +10,8 @@
 #include "array_vector_storage.hpp"
 
 namespace grb {
-	namespace algorithms {
-		namespace geometry {
+	namespace utils {
+		namespace multigrid {
 
 			template<
 				size_t DIMS,
@@ -26,8 +26,7 @@ namespace grb {
 
 				using RowIndexType = CoordType; ///< numeric type of rows
 				using ColumnIndexType = CoordType;
-
-				using LinearSystemType = grb::utils::geometry::LinearizedHaloNDimSystem< RowIndexType, DIMS >;
+				using LinearSystemType = LinearizedHaloNDimSystem< DIMS, RowIndexType >;
 				using SelfType = HaloMatrixGeneratorIterator< DIMS, CoordType, ValueType, ValueCallable >;
 				using Iterator = typename LinearSystemType::Iterator;
 
@@ -92,12 +91,8 @@ namespace grb {
 
 				HaloMatrixGeneratorIterator( const SelfType & ) = default;
 
-				// HaloMatrixGeneratorIterator( SelfType && ) = default;
-
 				SelfType & operator=( const SelfType & ) = default;
 
-				// SelfType & operator=( SelfType && ) = default;
-
 				/**
 				 * @brief Increments the iterator by moving coordinates to the next (row, column) to iterate on.
 				 *
@@ -160,14 +155,14 @@ namespace grb {
 				}
 
 				/**
-				 * @brief Returns current row.
+				 * @brief Returns the current row.
 				 */
 				inline RowIndexType i() const {
 					return _val.i();
 				}
 
 				/**
-				 * @brief Returns current column.
+				 * @brief Returns the current column.
 				 */
 				inline ColumnIndexType j() const {
 					return _val.j();
@@ -183,10 +178,6 @@ namespace grb {
 					return _val.v();
 				}
 
-				const Iterator & it() const {
-					return this->_sys_iter;
-				}
-
 			private:
 				value_type _val;
 				const LinearSystemType *_lin_system;
@@ -198,10 +189,8 @@ namespace grb {
 				}
 			};
 
-
-
-		} // namespace geometry
+		} // namespace multigrid
 	} // namespace utils
 } // namespace grb
 
-#endif // _H_GRB_ALGORITHMS_GEOMETRY_HALO_MATRIX_GENRATOR_ITERATOR
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_HALO_MATRIX_GENRATOR_ITERATOR
diff --git a/include/graphblas/utils/geometry/linearized_halo_ndim_geometry.hpp b/include/graphblas/utils/multigrid/linearized_halo_ndim_geometry.hpp
similarity index 81%
rename from include/graphblas/utils/geometry/linearized_halo_ndim_geometry.hpp
rename to include/graphblas/utils/multigrid/linearized_halo_ndim_geometry.hpp
index 04928ac09..0e53dd671 100644
--- a/include/graphblas/utils/geometry/linearized_halo_ndim_geometry.hpp
+++ b/include/graphblas/utils/multigrid/linearized_halo_ndim_geometry.hpp
@@ -1,6 +1,6 @@
 
-#ifndef _H_GRB_ALGORITHMS_GEOMETRY_LINEARIZED_HALO_NDIM_GEOMETRY
-#define _H_GRB_ALGORITHMS_GEOMETRY_LINEARIZED_HALO_NDIM_GEOMETRY
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_GEOMETRY
+#define _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_GEOMETRY
 
 #include <cstddef>
 #include <vector>
@@ -9,6 +9,7 @@
 #include <stdexcept>
 #include <string>
 #include <cstddef>
+#include <algorithm>
 
 #include "array_vector_storage.hpp"
 #include "dynamic_vector_storage.hpp"
@@ -17,14 +18,17 @@
 
 namespace grb {
 	namespace utils {
-		namespace geometry {
+		namespace multigrid {
 
-			template< typename CoordType, size_t DIMS > void __compute_neighbors_range(
-				const ArrayVectorStorage< CoordType, DIMS >& _system_sizes,
+			template<
+				size_t DIMS,
+				typename CoordType
+			> void __compute_neighbors_range(
+				const ArrayVectorStorage< DIMS, CoordType > &_system_sizes,
 				const CoordType halo,
-				const ArrayVectorStorage< CoordType, DIMS >& system_coordinates,
-				ArrayVectorStorage< CoordType, DIMS >& neighbors_start,
-				ArrayVectorStorage< CoordType, DIMS >& neighbors_range ) {
+				const ArrayVectorStorage< DIMS, CoordType > &system_coordinates,
+				ArrayVectorStorage< DIMS, CoordType > &neighbors_start,
+				ArrayVectorStorage< DIMS, CoordType > &neighbors_range ) {
 
 				for( CoordType i{0}; i < DIMS/* - 1*/; i++ ) {
 					const CoordType start{ system_coordinates[i] <= halo ? 0 : system_coordinates[i] - halo };
@@ -32,37 +36,28 @@ namespace grb {
 					neighbors_start[i] = start;
 					neighbors_range[i] = end - start + 1;
 				}
-				/*
-				const size_t last{ DIMS - 1 };
-				const CoordT start{ system_coordinates[ last ] <= halo ? 0 : system_coordinates[ last ] - halo };
-				const CoordT end{ system_coordinates[ last ] + halo }; // can extend beyond actual DIMS-dimensional space
-				neighbors_start[ last ] = start;
-				neighbors_range[ last ] = end - start + 1;
-				*/
 			}
 
-
-
-
-
-
-			template< typename CoordType, size_t DIMS > size_t __neighbour_to_system_coords(
-				const std::array< CoordType, DIMS > & sizes,
+			template<
+				size_t DIMS,
+				typename CoordType
+			> size_t __neighbour_to_system_coords(
+				const std::array< CoordType, DIMS > &sizes,
 				size_t system_size,
-				const std::vector< NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > > > & dimension_neighbors,
+				const std::vector< NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > > >
+					&dimension_neighbors,
 				CoordType halo,
 				CoordType neighbor,
-				ArrayVectorStorage< CoordType, DIMS > & result) {
-
+				ArrayVectorStorage< DIMS, CoordType > &result
+			){
 				if( neighbor > system_size ) {
 					throw std::invalid_argument("neighbor number ( " + std::to_string(neighbor)
 						+ " ) >= system size ( " + std::to_string( system_size ) + " )");
 				}
-
-				ArrayVectorStorage< CoordType, DIMS > halo_coords( DIMS );
-			#ifdef DBG
+				ArrayVectorStorage< DIMS, CoordType > halo_coords( DIMS );
+#ifdef _DEBUG
 				size_t * const halo_coords_end{ halo_coords.data() + DIMS };
-			#endif
+#endif
 				std::fill_n( halo_coords.begin(), DIMS, 0 );
 
 				for( size_t _dim{DIMS}; _dim > 0; _dim--) {
@@ -72,13 +67,11 @@ namespace grb {
 					const NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > > & neighbors{ dimension_neighbors[dimension] };
 
 					CoordType * const halo_coords_begin{ halo_coords.data() + dimension };
-
-			#ifdef DBG
+#ifdef _DEBUG
 					std::cout << "DIMENSION " << dimension << std::endl << "- setup - neighbour " << neighbor << std::endl;
 					std::cout << "\thalo : ";
 					print_sequence( halo_coords_begin, halo_coords_end ) << std::endl;
-			#endif
-
+#endif
 					size_t h{0};
 					size_t previous_neighs{ 0 };
 					*halo_coords_begin = h;
@@ -90,47 +83,44 @@ namespace grb {
 						previous_neighs += halo_max_neighs;
 						halo_max_neighs = neighbors.at( halo_coords_begin );
 					}
-			#ifdef DBG
+#ifdef _DEBUG
 					std::cout << "- initial halo - neighbour " << neighbor << std::endl;
 					std::cout << "\th " << h << std::endl;
 					std::cout << "\thalo : ";
 					print_sequence( halo_coords_begin, halo_coords_end ) << std::endl;
 					std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
-			#endif
-
-
+#endif
 					if ( h < halo ){
 						result[dimension] = h;
 						neighbor -= previous_neighs;
-			#ifdef DBG
+#ifdef _DEBUG
 						std::cout << "end neighbour " << neighbor << std::endl;
-			#endif
+#endif
 						continue;
 					}
 					// saturation occurred
 					const size_t distance_from_halo{ ( neighbor - previous_neighs ) / halo_max_neighs };
-			#ifdef DBG
+#ifdef _DEBUG
 					std::cout << "- before middle elements - neighbour " << neighbor << std::endl;
 					std::cout << "\tprevious_neighs " << previous_neighs << std::endl;
 					std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
 					std::cout << "\tdistance_from_halo " << distance_from_halo << std::endl;
 					std::cout << "\tdimension_size " << dimension_size << std::endl;
-			#endif
+#endif
 					if ( distance_from_halo < dimension_size - 2 * halo ) {
 						result[dimension] =  distance_from_halo + halo;
 						neighbor -= (previous_neighs + distance_from_halo * halo_max_neighs) ;
-			#ifdef DBG
+#ifdef _DEBUG
 						std::cout << "end neighbour " << neighbor << std::endl;
-			#endif
+#endif
 						continue;
 					}
 					previous_neighs += ( dimension_size - 2 * halo ) * halo_max_neighs;
-			#ifdef DBG
+#ifdef _DEBUG
 					std::cout << "- after middle elements -neighbour " << neighbor << std::endl;
 					std::cout << "\tprevious_neighs " << previous_neighs << std::endl;
 					std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
-			#endif
-
+#endif
 					h = halo - 1;
 					*halo_coords_begin = h;
 					halo_max_neighs = neighbors.at( halo_coords_begin );
@@ -141,27 +131,27 @@ namespace grb {
 						halo_max_neighs = neighbors.at( halo_coords_begin );
 					}
 					neighbor -= previous_neighs;
-			#ifdef DBG
+#ifdef _DEBUG
 					std::cout << "- final halo - neighbour " << neighbor << std::endl;
 					std::cout << "\tadding h " << h << " previous_neighs " << previous_neighs << std::endl;
-			#endif
+#endif
 					// ( dimension_size - 1 ) because coordinates are 0-based and neighbor
 					// is "inside" range [ previous_neighs, previous_neighs + halo_max_neighs ]
 					result[dimension] = dimension_size - 1 - h;
-			#ifdef DBG
+#ifdef _DEBUG
 					std::cout << "end neighbour " << neighbor << std::endl;
-			#endif
+#endif
 				}
-
 				return neighbor;
 			}
 
 
 			template< typename CoordType > size_t __accumulate_dimension_neighbours(
-				const NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > >& prev_neighs,
+				const NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > > &prev_neighs,
 				CoordType* coords_buffer,
 				size_t halo,
-				size_t local_size ) {
+				size_t local_size
+			) {
 				size_t neighs{0};
 				size_t h{0};
 				for( ; h < halo && local_size > 1; h++ ) {
@@ -188,17 +178,20 @@ namespace grb {
 				}
 			}
 
-			template< typename CoordType, size_t DIMS > size_t __init_halo_search(
-				typename LinearizedNDimSystem< CoordType, ArrayVectorStorage< CoordType, DIMS > >::ConstVectorReference sizes,
+			template<
+				typename CoordType,
+				size_t DIMS
+			> size_t __init_halo_search(
+				typename LinearizedNDimSystem< CoordType, ArrayVectorStorage< DIMS, CoordType > >::ConstVectorReference
+					sizes,
 				size_t halo,
-				std::vector< NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > > >& dimension_limits ) {
-
+				std::vector< NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > > >& dimension_limits
+			) {
 				using nd_vec = NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > >;
 				using nd_vec_iterator = typename nd_vec::DomainIterator;
 
 				std::vector<size_t> halo_sizes( DIMS, halo + 1);
 				dimension_limits.emplace_back(halo_sizes);
-
 				// initialize values
 				__populate_halo_neighbors< CoordType >( halo, dimension_limits[0] );
 				for( size_t i{1}; i < DIMS; i++ ) {
@@ -226,8 +219,8 @@ namespace grb {
 				return __accumulate_dimension_neighbours( dimension_limits[DIMS - 1], prev_coords, halo, sizes.back() );
 			}
 
-		} // namespace geometry
+		} // namespace multigrid
 	} // namespace utils
 } // namespace grb
 
-#endif // _H_GRB_ALGORITHMS_GEOMETRY_LINEARIZED_HALO_NDIM_GEOMETRY
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_GEOMETRY
diff --git a/include/graphblas/utils/geometry/linearized_halo_ndim_iterator.hpp b/include/graphblas/utils/multigrid/linearized_halo_ndim_iterator.hpp
similarity index 63%
rename from include/graphblas/utils/geometry/linearized_halo_ndim_iterator.hpp
rename to include/graphblas/utils/multigrid/linearized_halo_ndim_iterator.hpp
index 9829fdb46..871d62b7c 100644
--- a/include/graphblas/utils/geometry/linearized_halo_ndim_iterator.hpp
+++ b/include/graphblas/utils/multigrid/linearized_halo_ndim_iterator.hpp
@@ -15,8 +15,8 @@
  * limitations under the License.
  */
 
-#ifndef _H_GRB_ALGORITHMS_GEOMETRY_LINEARIZED_HALO_NDIM_ITERATOR
-#define _H_GRB_ALGORITHMS_GEOMETRY_LINEARIZED_HALO_NDIM_ITERATOR
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_ITERATOR
+#define _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_ITERATOR
 
 #include <cstddef>
 #include <vector>
@@ -31,27 +31,26 @@
 
 namespace grb {
 	namespace utils {
-		namespace geometry {
+		namespace multigrid {
 
 			// forward declaration
 			template<
-				typename SizeType,
-				size_t DIMS
+				size_t DIMS,
+				typename SizeType
 			> class LinearizedHaloNDimSystem;
 
 			template<
-				typename SizeType,
-				size_t DIMS
+				size_t DIMS,
+				typename SizeType
 			> class LinearizedHaloNDimIterator {
 
-				using SystemType = LinearizedHaloNDimSystem< SizeType, DIMS >;
-				using VectorType = ArrayVectorStorage< SizeType, DIMS >;
+				using SystemType = LinearizedHaloNDimSystem< DIMS, SizeType >;
+				using VectorType = ArrayVectorStorage< DIMS, SizeType >;
 				using VectorIteratorType = LinearizedNDimIterator< SizeType, VectorType >;
 
 			public:
-				//using VectorType = typename VectorIteratorType::VectorType;
 				using ConstVectorReference = typename VectorIteratorType::ConstVectorReference;
-				using SelfType = LinearizedHaloNDimIterator< SizeType, DIMS >;
+				using SelfType = LinearizedHaloNDimIterator< DIMS, SizeType >;
 
 				struct HaloNDimElement {
 				private:
@@ -62,10 +61,7 @@ namespace grb {
 					// for iteration
 					VectorIteratorType _element_iter; // coordinates iterator
 
-					//VectorType* _element;
-					//size_t _coordinates_linear;
 					VectorType _neighbor; //the current neighbor
-					//size_t _neighbor_linear;
 					SizeType _position;
 
 				public:
@@ -88,8 +84,6 @@ namespace grb {
 
 					HaloNDimElement& operator=( const HaloNDimElement& ) = default;
 
-					//HaloNDimElement& operator=( HaloNDimElement&& ) = delete;
-
 					ConstVectorReference get_element() const {
 						return this->_element_iter->get_position();
 					}
@@ -127,45 +121,22 @@ namespace grb {
 
 				inline void __update_neighbor() {
 					for( size_t i{0}; i < DIMS; i++ ) {
-						//(this->_point)._neighbor[i] = this->_neighbors_start[i] + (*(this->_neighbor_iter))[i];
 						this->_point._neighbor[i] = this->_neighbors_start[i] + this->_neighbor_iter->get_position()[i];
 					}
 				}
 
-				/*
-				void __update_neighbor_linear() {
-					(this->_point)._neighbor_linear =
-						this->_system.ndim_to_linear( this->_point._neighbor );
-				}
-				*/
-
 				inline void on_neighbor_iter_update() {
 					this->__update_neighbor();
-					//this->__update_neighbor_linear();
 				}
 
-				/*
-				void __update_coordinates_linear() {
-					(this->_point)._coordinates_linear =
-						this->_system.ndim_to_linear( *this->_element_iter );
-				}
-				*/
-
 				void on_element_update() {
-					//this->__update_coordinates_linear();
 					// reset everything
 					VectorType neighbors_range( DIMS );
 					this->_point._system->compute_neighbors_range(
-						//*(this->_point._element_iter),
 						this->_point._element_iter->get_position(),
 						this->_neighbors_start,
 						neighbors_range
 					);
-					/*
-					std::cout << "\t=== start ";
-					print( this->_neighbors_start ) << " range ";
-					print( neighbors_range )  << std::endl;
-					*/
 					// re-target _neighbors_linearizer
 					this->_neighbors_linearizer.retarget( neighbors_range );
 				}
@@ -193,48 +164,11 @@ namespace grb {
 					std::fill_n( this->_neighbors_start.begin(), DIMS, 0 );
 				}
 
-
-				/*
-				LinearizedHaloNDimIterator( const LinearizedHaloNDimIterator< SizeType, DIMS >& original ) noexcept:
-					_coordinates_linearizer( original._coordinates_linearizer ),
-					_halo( original._halo ),
-					_dimension_limits( original._dimension_limits ),
-					_neighbors_linearizer( original._neighbors_linearizer ),
-					_element_iter( original._element_iter ),
-					_neighbor_iter( original._neighbor_iter ),
-					_neighbor_end( original._neighbor_end ),
-					_neighbors_start( original._neighbors_start ),
-					_point( original._point ) {}
-				*/
-
 				LinearizedHaloNDimIterator( const SelfType & ) = default;
 
-				//LinearizedHaloNDimIterator( SelfType &&original ) = delete;
-
-				/*
-				LinearizedHaloNDimIterator< SizeType, DIMS >& operator=(
-					const LinearizedHaloNDimIterator< SizeType, DIMS >& original ) noexcept {
-					this->_coordinates_linearizer = original._coordinates_linearizer;
-					this->_halo = original._halo;
-					this->_dimension_limits = original._dimension_limits;
-					this->_neighbors_linearizer = original._neighbors_linearizer;
-					this->_element_iter = original._element_iter;
-					this->_coordinates_linear = original._coordinates_linear;
-					this->_neighbor_iter = original._neighbor_iter;
-					this->_neighbor_end = original._neighbor_end;
-					this->_neighbor = original._neighbor;
-					this->_neighbors_start = original._neighbors_start;
-					this->_neighbor_linear = original._neighbor_linear;
-				}
-				*/
-
 				SelfType & operator=( const SelfType & ) = default;
 
-				//SelfType & operator=( SelfType && ) = delete;
-
 				bool operator!=( const SelfType &other ) const {
-					//return (this->_point)._coordinates_linear != (other._point)._coordinates_linear
-					//	|| (this->_point)._neighbor_linear != (other._point)._neighbor_linear;
 					return this->_point._position != other._point._position; // use linear coordinate
 				}
 
@@ -251,12 +185,6 @@ namespace grb {
 				}
 
 				void next_neighbour() {
-					/*
-					std::cout << "sizes: " << this->_neighbors_linearizer.get_sizes()
-						<< " offset " << this->_neighbor_iter->get_position() << " -> "
-						<< this->_neighbors_linearizer.ndim_to_linear_offset( this->_neighbor_iter->get_position() )
-						<< std::endl;
-					*/
 					++(this->_neighbor_iter);
 					this->on_neighbor_iter_update();
 					this->_point._position++;
@@ -269,11 +197,9 @@ namespace grb {
 				void next_element() {
 					size_t num_neighbours = this->_neighbors_linearizer.system_size();
 					size_t neighbour_position_offset =
-						this->_neighbors_linearizer.ndim_to_linear_offset( this->_neighbor_iter->get_position() );
-					// std::cout << " num_neighbours " << num_neighbours << " offset " << neighbour_position_offset << std::endl;
+						this->_neighbors_linearizer.ndim_to_linear( this->_neighbor_iter->get_position() );
 					++(this->_point._element_iter);
 					this->on_element_advance();
-					// this->_point._position++;
 					this->_point._position -= neighbour_position_offset;
 					this->_point._position += num_neighbours;
 				}
@@ -282,7 +208,6 @@ namespace grb {
 					++(this->_neighbor_iter);
 					if( !has_more_neighbours() ) {
 						++(this->_point._element_iter);
-						//this->_coordinates_linear = this->_coordinates_linearizer.ndim_to_linear( this->_element_iter );
 						this->on_element_advance();
 
 					} else {
@@ -303,11 +228,7 @@ namespace grb {
 					VectorType final_element( DIMS );
 					size_t neighbor_index{ (this->_point._system->neighbour_linear_to_element( final_position, final_element )) };
 
-					// std::cout << "\t=== element " << offset << " -- ";
-					// std::cout << final_element[0] << " " << final_element[0] << std::endl;
-
 					this->_point._element_iter = VectorIteratorType( *this->_point._system, final_element.cbegin() );
-					//this->_point._element = &( *this->_element_iter );
 					this->_point._position = final_position;
 
 					this->on_element_update();
@@ -329,7 +250,7 @@ namespace grb {
 					size_t a_pos{ _point.get_position() }, b_pos{ other._point.get_position() };
 					// std::cout << "diff " << a_pos << " - " << b_pos << std::endl;
 					size_t lowest{ std::min( a_pos, b_pos ) }, highest{ std::max( a_pos, b_pos )};
-					using diff_t = typename LinearizedHaloNDimIterator< SizeType, DIMS >::difference_type;
+					using diff_t = typename LinearizedHaloNDimIterator< DIMS, SizeType >::difference_type;
 
 					if( highest - lowest > static_cast< size_t >(
 						std::numeric_limits< diff_t >::max() ) ) {
@@ -339,47 +260,21 @@ namespace grb {
 					return ( static_cast< diff_t >( a_pos - b_pos ) );
 				}
 
-
-
-
 				// implementation depending on logic in operator++
 				static SelfType make_system_end_iterator( const SystemType& system ) {
 					SelfType result( system );
 
-					/*
-					std::cout << "result 0: element ";
-					print(result->get_element()) << " neighbor ";
-					print(result->get_neighbor())  << std::endl;
-					*/
-
 					// go to the very first point outside of space
 					result._point._element_iter = VectorIteratorType::make_system_end_iterator( system );
-					/*
-					std::cout << "result 1: element ";
-					print(result->get_element()) << " neighbor ";
-					print(result->get_neighbor())  << std::endl;
-					*/
-
 					result.on_element_advance();
 					result._point._position = system.halo_system_size();
-					//std::cout << "got sys size " << system.halo_system_size() << std::endl;
 
 					return result;
 				}
-
 			};
 
-			/*
-			template< typename SizeType, size_t DIMS > LinearizedHaloNDimIterator< SizeType, DIMS >
-				operator+( const LinearizedHaloNDimIterator< SizeType, DIMS >& original, size_t increment ) {
-				LinearizedHaloNDimIterator< SizeType, DIMS > res( original );
-				return ( res += increment );
-			}
-			*/
-
-
-		} // namespace geometry
+		} // namespace multigrid
 	} // namespace utils
 } // namespace grb
 
-#endif // _H_GRB_ALGORITHMS_GEOMETRY_LINEARIZED_HALO_NDIM_ITERATOR
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_ITERATOR
diff --git a/include/graphblas/utils/geometry/linearized_halo_ndim_system.hpp b/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp
similarity index 82%
rename from include/graphblas/utils/geometry/linearized_halo_ndim_system.hpp
rename to include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp
index af296cc9f..cc84de621 100644
--- a/include/graphblas/utils/geometry/linearized_halo_ndim_system.hpp
+++ b/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp
@@ -1,6 +1,6 @@
 
-#ifndef _H_GRB_ALGORITHMS_GEOMETRY_LINEARIZED_HALO_NDIM_SYSTEM
-#define _H_GRB_ALGORITHMS_GEOMETRY_LINEARIZED_HALO_NDIM_SYSTEM
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_SYSTEM
+#define _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_SYSTEM
 
 #include <cstddef>
 #include <vector>
@@ -15,21 +15,20 @@
 
 namespace grb {
 	namespace utils {
-		namespace geometry {
+		namespace multigrid {
 
 			// only with ArrayVectorStorage
 			template<
-				typename SizeType,
-				size_t DIMS
+				size_t DIMS,
+				typename SizeType
 			> class LinearizedHaloNDimSystem:
-				public LinearizedNDimSystem< SizeType, ArrayVectorStorage< SizeType, DIMS > > {
+				public LinearizedNDimSystem< SizeType, ArrayVectorStorage< DIMS, SizeType > > {
 			public:
-
-				using VectorType = ArrayVectorStorage< SizeType, DIMS >;
+				using VectorType = ArrayVectorStorage< DIMS, SizeType >;
 				using ConstVectorStorageType = typename VectorType::ConstVectorStorageType;
-				using SelfType = LinearizedHaloNDimSystem< SizeType, DIMS >;
+				using SelfType = LinearizedHaloNDimSystem< DIMS, SizeType >;
 				using BaseType = LinearizedNDimSystem< SizeType, VectorType >;
-				using Iterator = LinearizedHaloNDimIterator< SizeType, DIMS >;
+				using Iterator = LinearizedHaloNDimIterator< DIMS, SizeType >;
 
 				LinearizedHaloNDimSystem( ConstVectorStorageType sizes, SizeType halo ):
 					BaseType( sizes.cbegin(), sizes.cend() ),
@@ -102,15 +101,13 @@ namespace grb {
 				}
 
 			private:
-
 				const SizeType _halo;
 				std::vector< NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > > > _dimension_limits;
 				size_t _system_size;
-
 			};
 
-		} // namespace geometry
+		} // namespace multigrid
 	} // namespace utils
 } // namespace grb
 
-#endif // _H_GRB_ALGORITHMS_GEOMETRY_LINEARIZED_HALO_NDIM_SYSTEM
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_SYSTEM
diff --git a/include/graphblas/utils/geometry/linearized_ndim_iterator.hpp b/include/graphblas/utils/multigrid/linearized_ndim_iterator.hpp
similarity index 91%
rename from include/graphblas/utils/geometry/linearized_ndim_iterator.hpp
rename to include/graphblas/utils/multigrid/linearized_ndim_iterator.hpp
index 60f424164..f65ec8831 100644
--- a/include/graphblas/utils/geometry/linearized_ndim_iterator.hpp
+++ b/include/graphblas/utils/multigrid/linearized_ndim_iterator.hpp
@@ -15,8 +15,8 @@
  * limitations under the License.
  */
 
-#ifndef _H_GRB_ALGORITHMS_GEOMETRY_NDIM_ITERATOR
-#define _H_GRB_ALGORITHMS_GEOMETRY_NDIM_ITERATOR
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_NDIM_ITERATOR
+#define _H_GRB_ALGORITHMS_MULTIGRID_NDIM_ITERATOR
 
 #include <cstddef>
 #include <algorithm>
@@ -29,7 +29,7 @@
 
 namespace grb {
 	namespace utils {
-		namespace geometry {
+		namespace multigrid {
 
 			// forward declaration for default
 			template<
@@ -42,7 +42,6 @@ namespace grb {
 				typename InternalVectorType
 			> class LinearizedNDimIterator {
 			public:
-
 				using VectorType = InternalVectorType;
 				using LinNDimSysType = LinearizedNDimSystem< SizeType, VectorType >;
 				using ConstVectorReference = const VectorType&;
@@ -50,12 +49,10 @@ namespace grb {
 
 				struct NDimPoint {
 				private:
-
 					const LinNDimSysType* system; // pointer because of copy assignment
 					VectorType coords;
 
 				public:
-
 					friend SelfType;
 
 					NDimPoint() = delete;
@@ -82,7 +79,6 @@ namespace grb {
 					}
 				};
 
-
 				// interface for std::random_access_iterator
 				using iterator_category = std::random_access_iterator_tag;
 				using value_type = NDimPoint;
@@ -107,10 +103,6 @@ namespace grb {
 
 				SelfType& operator=( const SelfType &original ) = default;
 
-				// LinearizedNDimIterator( SelfType && ) = delete;
-
-				// SelfType operator=( SelfType && ) = delete;
-
 				~LinearizedNDimIterator() {}
 
 				SelfType & operator++() noexcept {
@@ -119,11 +111,6 @@ namespace grb {
 					for( size_t i { 0 }; i < this->_p.system->dimensions() - 1 && rewind; i++ ) {
 						SizeType& coord = this->_p.coords[ i ];
 						// must rewind dimension if we wrap-around
-						/*
-						SizeType new_coord = ( coord + 1 ) % this->_p.system->get_sizes()[ i ];
-						rewind = new_coord < coord;
-						coord = new_coord;
-						*/
 						SizeType plus = coord + 1;
 						rewind = plus >= this->_p.system->get_sizes()[ i ];
 						coord = rewind ? 0 : plus;
@@ -187,11 +174,10 @@ namespace grb {
 
 			private:
 				NDimPoint _p;
-
 			};
 
-		} // namespace geometry
+		} // namespace multigrid
 	} // namespace utils
 } // namespace grb
 
-#endif // _H_GRB_ALGORITHMS_GEOMETRY_NDIM_ITERATOR
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_NDIM_ITERATOR
diff --git a/include/graphblas/utils/geometry/linearized_ndim_system.hpp b/include/graphblas/utils/multigrid/linearized_ndim_system.hpp
similarity index 91%
rename from include/graphblas/utils/geometry/linearized_ndim_system.hpp
rename to include/graphblas/utils/multigrid/linearized_ndim_system.hpp
index 87352aa19..3e4c15b14 100644
--- a/include/graphblas/utils/geometry/linearized_ndim_system.hpp
+++ b/include/graphblas/utils/multigrid/linearized_ndim_system.hpp
@@ -15,8 +15,8 @@
  * limitations under the License.
  */
 
-#ifndef _H_GRB_ALGORITHMS_GEOMETRY_NDIM_SYSTEM_LINEARIZER
-#define _H_GRB_ALGORITHMS_GEOMETRY_NDIM_SYSTEM_LINEARIZER
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM_LINEARIZER
+#define _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM_LINEARIZER
 
 #include <cstddef>
 #include <algorithm>
@@ -41,7 +41,7 @@
 
 namespace grb {
 	namespace utils {
-		namespace geometry {
+		namespace multigrid {
 
 			/**
 			 * Extends a \p NDimSystem by linearizing it, i.e. it provides facilities to map a vector in
@@ -60,7 +60,6 @@ namespace grb {
 				typename SizeType,
 				typename InternalVectorType
 			> class LinearizedNDimSystem: public NDimSystem< SizeType, InternalVectorType > {
-
 			public:
 				static_assert( std::is_integral< SizeType >::value, "SizeType must be an integral type");
 
@@ -160,17 +159,6 @@ namespace grb {
 					return linear;
 				}
 
-				// probably same as ndim_to_linear !!!
-				size_t ndim_to_linear_offset( ConstVectorStorageType ndim_vector ) const {
-					size_t linear{ 0 };
-					size_t steps{ 1 };
-					for( size_t i{ 0 }; i < this->dimensions(); i++ ) {
-						linear += steps * ndim_vector[i];
-						steps *= this->_sizes[i];
-					}
-					return linear;
-				}
-
 				// must be same dimensionality
 				void retarget( ConstVectorReference _new_sizes ) {
 					if( _new_sizes.dimensions() != this->_sizes.dimensions() ) {
@@ -191,11 +179,10 @@ namespace grb {
 				}
 
 			private:
-
 				VectorType offsets;
 				size_t _system_size;
 
-				 template<
+				template<
 					typename IterIn,
 					typename IterOut
 				> static size_t compute_offsets( IterIn in_begin, IterIn in_end, IterOut out_begin ) {
@@ -208,9 +195,8 @@ namespace grb {
 				}
 			};
 
-
-		} // namespace geometry
+		} // namespace multigrid
 	} // namespace utils
 } // namespace grb
 
-#endif // _H_GRB_ALGORITHMS_GEOMETRY_NDIM_SYSTEM_LINEARIZER
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM_LINEARIZER
diff --git a/include/graphblas/utils/geometry/ndim_system.hpp b/include/graphblas/utils/multigrid/ndim_system.hpp
similarity index 88%
rename from include/graphblas/utils/geometry/ndim_system.hpp
rename to include/graphblas/utils/multigrid/ndim_system.hpp
index f9a97c18d..9d387ce32 100644
--- a/include/graphblas/utils/geometry/ndim_system.hpp
+++ b/include/graphblas/utils/multigrid/ndim_system.hpp
@@ -15,8 +15,8 @@
  * limitations under the License.
  */
 
-#ifndef _H_GRB_ALGORITHMS_GEOMETRY_NDIM_SYSTEM
-#define _H_GRB_ALGORITHMS_GEOMETRY_NDIM_SYSTEM
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM
+#define _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM
 
 #include <cstddef>
 #include <algorithm>
@@ -37,7 +37,7 @@
 
 namespace grb {
 	namespace utils {
-		namespace geometry {
+		namespace multigrid {
 
 			/**
 			 * Describes a #dimensions()-dimensional system by storing its size along each dimension.
@@ -52,7 +52,6 @@ namespace grb {
 				typename SizeType,
 				typename InternalVectorType
 			> class NDimSystem {
-
 			public:
 				static_assert( std::is_integral< SizeType >::value, "SizeType must be an integral type");
 
@@ -71,7 +70,7 @@ namespace grb {
 				 * @param begin range begin
 				 * @param end end of range
 				 */
-				template< typename IterT > NDimSystem( IterT begin, IterT end) noexcept :
+				template< typename IterType > NDimSystem( IterType begin, IterType end) noexcept :
 					_sizes( std::distance( begin, end ) )
 				{
 					std::copy( begin, end, this->_sizes.begin() );
@@ -98,13 +97,8 @@ namespace grb {
 
 				NDimSystem( const SelfType & ) = default;
 
-				// NDimSystem( SelfType && ) = default;
-
-				// NDimSystem( SelfType &&original ) noexcept: _sizes( std::move( original._sizes ) ) {}
 				NDimSystem( SelfType && ) = delete;
 
-				~NDimSystem() {}
-
 				SelfType & operator=( const SelfType &original ) = default;
 
 				SelfType & operator=( SelfType &&original ) = delete;
@@ -122,12 +116,11 @@ namespace grb {
 				}
 
 			protected:
-
 				InternalVectorType _sizes;
 			};
 
-		} // namespace geometry
+		} // namespace multigrid
 	} // namespace utils
 } // namespace grb
 
-#endif // _H_GRB_ALGORITHMS_GEOMETRY_NDIM_SYSTEM
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM
diff --git a/include/graphblas/utils/geometry/ndim_vector.hpp b/include/graphblas/utils/multigrid/ndim_vector.hpp
similarity index 95%
rename from include/graphblas/utils/geometry/ndim_vector.hpp
rename to include/graphblas/utils/multigrid/ndim_vector.hpp
index eca89137e..26ee084e6 100644
--- a/include/graphblas/utils/geometry/ndim_vector.hpp
+++ b/include/graphblas/utils/multigrid/ndim_vector.hpp
@@ -1,6 +1,6 @@
 
-#ifndef _H_GRB_ALGORITHMS_GEOMETRY_NDIM_VECTOR
-#define _H_GRB_ALGORITHMS_GEOMETRY_NDIM_VECTOR
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_NDIM_VECTOR
+#define _H_GRB_ALGORITHMS_MULTIGRID_NDIM_VECTOR
 
 #include <utility>
 #include <vector>
@@ -12,7 +12,7 @@
 
 namespace grb {
 	namespace utils {
-		namespace geometry {
+		namespace multigrid {
 
 			/**
 			 * Maps an N-dimensional vector to an array of data.
@@ -37,7 +37,6 @@ namespace grb {
 				typename SizeType,
 				typename InternalVectorType
 			> class NDimVector {
-
 			public:
 				static_assert( std::is_default_constructible< DataType >::value,
 					"the stored type is not default constructible" );
@@ -133,8 +132,8 @@ namespace grb {
 				}
 			};
 
-		} // namespace geometry
+		} // namespace multigrid
 	} // namespace utils
 } // namespace grb
 
-#endif // _H_GRB_ALGORITHMS_GEOMETRY_NDIM_VECTOR
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_NDIM_VECTOR
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index 2b544fb16..ebed53096 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -43,7 +43,6 @@
 
 // here we define a custom macro and do not use NDEBUG since the latter is not defined for smoke tests
 #ifdef HPCG_PRINT_STEPS
-
 #include <cstdio>
 
 // HPCG_PRINT_STEPS requires defining the following symbols
@@ -52,10 +51,8 @@
  * @brief simply prints \p args on a dedicated line.
  */
 #define DBG_println( args ) std::cout << args << std::endl;
-
 // forward declaration for the tracing facility
 template< typename T > void print_norm( const grb::Vector< T > &r, const char * head );
-
 /**
  * @brief prints \p head and the norm of \p r.
  */
@@ -86,9 +83,8 @@ constexpr double SYSTEM_DIAG_VALUE { 26.0 };
 constexpr double SYSTEM_NON_DIAG_VALUE { -1.0 };
 constexpr size_t BAND_WIDTH_3D { 13UL };
 constexpr size_t HALO_RADIUS { 1U };
-//============================================
-
 constexpr double MAX_NORM { 4.0e-14 };
+//============================================
 
 using namespace grb;
 using namespace algorithms;
@@ -110,7 +106,7 @@ struct system_input {
  * Container for the parameters for the HPCG simulation.
  */
 struct simulation_input : public system_input {
-	size_t test_repetitions;
+	size_t inner_test_repetitions;
 	size_t max_iterations;
 	size_t smoother_steps;
 	bool evaluation_run;
@@ -128,20 +124,65 @@ using StdMinus = operators::subtract< NonzeroType >;
 using coord_t = size_t;
 
 /**
- * Containers for test outputs.
+ * Container for test outputs.
  */
 struct output {
 	RC error_code = SUCCESS;
-	size_t test_repetitions = 0;
-	size_t performed_iterations = 0;
-	NonzeroType residual = 0.0;
+	size_t inner_test_repetitions = 0;
 	grb::utils::TimerResults times;
 	std::unique_ptr< PinnedVector< IOType > > pinnedVector;
-	NonzeroType square_norm_diff;
+	NonzeroType square_norm_diff = 0.0;
+	cg_out_data< NonzeroType > cg_out = { 0, 0.0 };
 };
 
+using hpcg_runner_t = HPCGRunnerType< IOType, NonzeroType, InputType, ResidualType,
+	StdRing, StdMinus >;
+using mg_data_t = multigrid_data< IOType, NonzeroType >;
+using coarsening_data_t = coarsening_data< IOType, NonzeroType >;
+using smoothing_data_t = smoother_data< IOType >;
+using hpcg_data_t = mg_cg_data< IOType, NonzeroType, InputType >;
+
+#ifdef HPCG_PRINT_SYSTEM
+static void print_system(
+	const std::vector< std::unique_ptr< mg_data_t > > &system_levels,
+	const std::vector< std::unique_ptr< coarsening_data_t > > &coarsener_levels
+) {
+	assert( spmd<>::nprocs() == 1 ); // distributed printing of system not implemented
+	print_matrix( system_levels[ 0 ]->A, 70, "A" );
+	for( size_t i = 0; i < coarsener_levels.size(); i++ ) {
+		print_matrix( coarsener_levels[i ] ->coarsening_matrix, 50, "COARSENING MATRIX" );
+		print_matrix( system_levels[ i + 1 ]->A, 50, "COARSER SYSTEM MATRIX" );
+	}
+}
+#endif
+
+#ifdef HPCG_PRINT_STEPS
+template<
+	typename T,
+	class Ring
+> void print_norm( const grb::Vector< T > & r, const char * head, const Ring & ring ) {
+	T norm = 0;
+	RC ret = grb::dot( norm, r, r, ring ); // norm = r' * r;
+	(void)ret;
+	assert( ret == SUCCESS );
+	if( spmd<>::pid() != 0 ) {
+		return;
+	}
+	if( head != nullptr ) {
+		printf(">>> %s: %lf\n", head, norm );
+	} else {
+		printf(">>> %lf\n", norm );
+	}
+}
+
+template< typename T > void print_norm( const grb::Vector< T > & r, const char * head ) {
+	return print_norm( r, head, StdRing() );
+}
+#endif
+
+
 /**
- * Returns the closets power of 2 bigger or equal to \p n .
+ * Returns the closest power of 2 bigger or equal to \p n .
  */
 template< typename T >
 T static next_pow_2( T n ) {
@@ -155,16 +196,41 @@ T static next_pow_2( T n ) {
 	return n + 1;
 }
 
-using hpcg_runner_t = HPCGRunnerType< IOType, NonzeroType, InputType, ResidualType,
-	StdRing, StdMinus >;
-using mg_data_t = multigrid_data< IOType, NonzeroType >;
-using coarsening_data_t = coarsening_data< IOType, NonzeroType >;
-using smoothing_data_t = smoother_data< IOType >;
-using hpcg_data_t = mg_cg_data< IOType, NonzeroType, InputType >;
+static void allocate_system(
+	const std::vector< size_t > &mg_sizes,
+	std::vector< std::unique_ptr< mg_data_t > > &system_levels,
+	std::vector< std::unique_ptr< coarsening_data_t > > &coarsener_levels,
+	std::vector< std::unique_ptr< smoothing_data_t > > &smoother_levels,
+	std::unique_ptr< hpcg_data_t > &holder
+) {
+	const size_t pid { spmd<>::pid() };
+	grb::utils::Timer timer;
+
+	hpcg_data_t *data{ new hpcg_data_t( mg_sizes[ 0 ] ) };
+	holder = std::unique_ptr< hpcg_data_t >( data );
+	MASTER_PRINT( pid, "allocating data for the MultiGrid simulation...");
+	timer.reset();
+	multigrid_allocate_data( mg_sizes, system_levels, coarsener_levels, smoother_levels );
+	double time = timer.time();
+	MASTER_PRINT( pid, " time (ms) " << time << std::endl )
+
+	// zero all vectors
+	MASTER_PRINT( pid, "zeroing all vectors...");
+	timer.reset();
+	grb::RC rc = data->zero_temp_vectors();
+	ASSERT_RC_SUCCESS( rc );
+	std::for_each( system_levels.begin(), system_levels.end(),
+		[]( std::unique_ptr< mg_data_t > &s) { ASSERT_RC_SUCCESS( s->zero_temp_vectors() ); } );
+	std::for_each( coarsener_levels.begin(), coarsener_levels.end(),
+		[]( std::unique_ptr< coarsening_data_t > &s) { ASSERT_RC_SUCCESS( s->zero_temp_vectors() ); } );
+	std::for_each( smoother_levels.begin(), smoother_levels.end(),
+		[]( std::unique_ptr< smoothing_data_t > &s) { ASSERT_RC_SUCCESS( s->zero_temp_vectors() ); } );
+	time = timer.time();
+	MASTER_PRINT( pid, " time (ms) " << time << std::endl );
+}
 
 /**
  * Builds and initializes a 3D system for an HPCG simulation according to the given 3D system sizes.
- * @return RC grb::SUCCESS if the system initialization within GraphBLAS succeeded
  */
 static void build_3d_system(
 	const system_input & in,
@@ -174,11 +240,11 @@ static void build_3d_system(
 	std::unique_ptr< hpcg_data_t > &holder
 ) {
 	constexpr size_t DIMS = 3;
-	using builder_t = grb::algorithms::HPCGBuilder< DIMS, coord_t, NonzeroType >;
+	using builder_t = grb::algorithms::HPCGSystemBuilder< DIMS, coord_t, NonzeroType >;
 	const size_t pid { spmd<>::pid() };
 	grb::utils::Timer timer;
 
-	hpcg_system_params< 3, NonzeroType > params {
+	hpcg_system_params< DIMS, NonzeroType > params {
 		{ in.nx, in.ny, in.nz }, HALO_RADIUS, SYSTEM_DIAG_VALUE, SYSTEM_NON_DIAG_VALUE,
 			PHYS_SYSTEM_SIZE_MIN, in.max_coarsening_levels, 2
 	};
@@ -187,39 +253,18 @@ static void build_3d_system(
 	MASTER_PRINT( pid, "building HPCG generators for " << ( in.max_coarsening_levels + 1 )
 		<< " levels..." );
 	timer.reset();
-	build_hpcg_multigrid_generators( params, mg_generators );
+	hpcg_build_multigrid_generators( params, mg_generators );
 	double time = timer.time();
 	MASTER_PRINT( pid, " time (ms) " << time << std::endl );
 	MASTER_PRINT( pid, "built HPCG generators for " << mg_generators.size()
 		<< " levels" << std::endl );
 
-	hpcg_data_t *data{ new hpcg_data_t( mg_generators[ 0 ].system_size() ) };
-	holder = std::unique_ptr< hpcg_data_t >( data );
 
 	std::vector< size_t > mg_sizes;
 	// exclude main system
 	std::transform( mg_generators.cbegin(), mg_generators.cend(), std::back_inserter( mg_sizes  ),
 		[] ( const builder_t &b ) { return b.system_size(); } );
-
-	MASTER_PRINT( pid, "allocating data for the MultiGrid simulation...");
-	timer.reset();
-	allocate_multigrid_data( mg_sizes, system_levels, coarsener_levels, smoother_levels );
-	time = timer.time();
-	MASTER_PRINT( pid, " time (ms) " << time << std::endl )
-
-	// zero all vectors
-	MASTER_PRINT( pid, "zeroing all vectors...");
-	timer.reset();
-	data->zero_temp_vectors();
-	std::for_each( system_levels.begin(), system_levels.end(),
-		[]( std::unique_ptr< mg_data_t > &s) { s->zero_temp_vectors(); } );
-	std::for_each( coarsener_levels.begin(), coarsener_levels.end(),
-		[]( std::unique_ptr< coarsening_data_t > &s) { s->zero_temp_vectors(); } );
-	std::for_each( smoother_levels.begin(), smoother_levels.end(),
-		[]( std::unique_ptr< smoothing_data_t > &s) { s->zero_temp_vectors(); } );
-	time = timer.time();
-	MASTER_PRINT( pid, " time (ms) " << time << std::endl )
-
+	allocate_system( mg_sizes, system_levels, coarsener_levels, smoother_levels, holder );
 	assert( mg_generators.size() == system_levels.size() );
 	assert( mg_generators.size() == smoother_levels.size() );
 	assert( mg_generators.size() - 1 == coarsener_levels.size() );
@@ -228,63 +273,29 @@ static void build_3d_system(
 		MASTER_PRINT( pid, "SYSTEM LEVEL " << i << std::endl );
 		MASTER_PRINT( pid, " populating system matrix: " );
 		timer.reset();
-		populate_system_matrix( mg_generators[ i ], system_levels.at(i)->A );
+		grb::RC rc = hpcg_populate_system_matrix( mg_generators[ i ], system_levels.at(i)->A );
 		time = timer.time();
+		ASSERT_RC_SUCCESS( rc );
 		MASTER_PRINT( pid, " time (ms) " << time << std::endl )
 
 		MASTER_PRINT( pid, " populating smoothing data: " );
 		timer.reset();
-		populate_smoothing_data( mg_generators[ i ], *smoother_levels[ i ] );
+		rc = hpcg_populate_smoothing_data( mg_generators[ i ], *smoother_levels[ i ] );
 		time = timer.time();
+		ASSERT_RC_SUCCESS( rc );
 		MASTER_PRINT( pid, " time (ms) " << time << std::endl )
 
 		if( i > 0 ) {
 			MASTER_PRINT( pid, " populating coarsening data: " );
 			timer.reset();
-			populate_coarsener( mg_generators[ i - 1 ], mg_generators[ i ], *coarsener_levels[ i - 1 ] );
+			rc = hpcg_populate_coarsener( mg_generators[ i - 1 ], mg_generators[ i ], *coarsener_levels[ i - 1 ] );
 			time = timer.time();
+			ASSERT_RC_SUCCESS( rc );
 			MASTER_PRINT( pid, " time (ms) " << time << std::endl )
 		}
 	}
 }
 
-#ifdef HPCG_PRINT_SYSTEM
-static void print_system(
-	const std::vector< std::unique_ptr< mg_data_t > > &system_levels,
-	const std::vector< std::unique_ptr< coarsening_data_t > > &coarsener_levels
-) {
-	print_matrix( system_levels[ 0 ]->A, 70, "A" );
-	for( size_t i = 0; i < coarsener_levels.size(); i++ ) {
-		print_matrix( coarsener_levels[i ] ->coarsening_matrix, 50, "COARSENING MATRIX" );
-		print_matrix( system_levels[ i + 1 ]->A, 50, "COARSER SYSTEM MATRIX" );
-	}
-}
-#endif
-
-#ifdef HPCG_PRINT_STEPS
-template<
-	typename T,
-	class Ring
-> void print_norm( const grb::Vector< T > & r, const char * head, const Ring & ring ) {
-	T norm = 0;
-	RC ret = grb::dot( norm, r, r, ring ); // norm = r' * r;
-	(void)ret;
-	assert( ret == SUCCESS );
-	if( head != nullptr ) {
-		printf(">>> %s: %lf\n", head, norm );
-	} else {
-		printf(">>> %lf\n", norm );
-	}
-}
-
-template< typename T > void print_norm( const grb::Vector< T > & r, const char * head ) {
-	return print_norm( r, head, StdRing() );
-}
-#endif
-
-
-
-
 /**
  * @brief Main test, building an HPCG problem and running the simulation closely following the
  * parameters in the reference HPCG test.
@@ -292,14 +303,9 @@ template< typename T > void print_norm( const grb::Vector< T > & r, const char *
 void grbProgram( const simulation_input & in, struct output & out ) {
 	// get user process ID
 	const size_t pid { spmd<>::pid() };
-	assert( pid < spmd<>::nprocs() );
-	if( pid == 0 ) {
-		thcout << "beginning input generation..." << std::endl;
-	}
-	grb::utils::Timer timer;
+	MASTER_PRINT( pid, "beginning input generation..." << std::endl );
 
-	// assume successful run
-	out.error_code = SUCCESS;
+	grb::utils::Timer timer;
 
 	// wrap hpcg_data inside a unique_ptr to forget about cleaning chores
 	std::unique_ptr< hpcg_data_t > hpcg_state;
@@ -316,10 +322,7 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	timer.reset();
 	build_3d_system( in, mg_runner.system_levels, coarsener.coarsener_levels, smoother.levels, hpcg_state );
 	double input_duration { timer.time() };
-
-	if( pid == 0 ) {
-		thcout << "input generation time (ms): " << input_duration << std::endl;
-	}
+	MASTER_PRINT( pid, "input generation time (ms): " << input_duration << std::endl );
 
 #ifdef HPCG_PRINT_SYSTEM
 	if( pid == 0 ) {
@@ -347,88 +350,55 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 
 	out.times.preamble = timer.time();
 
-	cg_out_data< NonzeroType > cg_out;
+	cg_out_data< NonzeroType > &cg_out = out.cg_out;
 	mg_data_t &grid_base = *mg_runner.system_levels[ 0 ];
-	if( in.evaluation_run ) {
-		out.test_repetitions = 0;
-		if( pid == 0 ) {
-			thcout << "beginning evaluation run..." << std::endl;
-		}
-		timer.reset();
-		rc = hpcg_runner( grid_base, *hpcg_state, cg_out );
-		double single_time = timer.time();
-		if( rc == SUCCESS ) {
-			rc = collectives<>::reduce( single_time, 0, operators::max< double >() );
-		}
-		if( rc != SUCCESS ) {
-			thcerr << "error during evaluation run" << std::endl;
-			out.error_code = rc;
-			return;
-		}
-		out.times.useful = single_time;
-		out.test_repetitions = static_cast< size_t >( 1000.0 / single_time ) + 1;
-		out.performed_iterations = cg_out.iterations;
-		out.residual = cg_out.norm_residual;
-
-		if( pid == 0 ) {
-			thcout << "Evaluation run" << std::endl;
-		}
-
-		std::cout << "  iterations: " << out.performed_iterations << std::endl
-			<< "  computed residual: " << out.residual << std::endl
-			<< "  time taken (ms): " << out.times.useful << std::endl
-			<< "  deduced inner repetitions for 1s duration: " << out.test_repetitions << std::endl;
-		return;
-	}
 
 	// do a cold run to warm the system up
-	if( pid == 0 ) {
-		thcout << "beginning cold run..." << std::endl;
-	}
+	MASTER_PRINT( pid, TEXT_HIGHLIGHT << "beginning cold run..." << std::endl );
 	hpcg_runner.cg_opts.max_iterations = 1;
 	timer.reset();
 	rc = hpcg_runner( grid_base, *hpcg_state, cg_out );
 	double iter_duration { timer.time() };
-	if( pid == 0 ) {
-		thcout << "cold run duration (ms): " << iter_duration << std::endl;
-	}
-
+	ASSERT_RC_SUCCESS( rc );
+	MASTER_PRINT( pid, " time (ms): " << iter_duration << std::endl );
 
 	hpcg_runner.cg_opts.max_iterations = in.max_iterations;
 	hpcg_runner.cg_opts.print_iter_stats = in.print_iter_stats;
 	// do benchmark
-	for( size_t i = 0; i < in.test_repetitions && rc == SUCCESS; ++i ) {
+	const size_t inner_test_repetitions = in.evaluation_run ? 1 : in.inner_test_repetitions;
+	if( in.evaluation_run ) {
+		MASTER_PRINT( pid, TEXT_HIGHLIGHT << "beginning evaluation run..." << std::endl );
+	} else {
+		MASTER_PRINT( pid, TEXT_HIGHLIGHT << "beginning test run..." << std::endl );
+	}
+	out.inner_test_repetitions = 0;
+	out.times.useful = 0.0;
+	for( size_t i = 0; i < inner_test_repetitions; ++i ) {
 		rc = set( x, 0.0 );
-		assert( rc == SUCCESS );
-		if( pid == 0 ) {
-			thcout << "beginning iteration: " << i << std::endl;
-		}
+		ASSERT_RC_SUCCESS( rc );
+		MASTER_PRINT( pid, TEXT_HIGHLIGHT << "beginning iteration: " << i << std::endl );
 		timer.reset();
 		rc = hpcg_runner( grid_base, *hpcg_state, cg_out );
-		iter_duration = timer.time();
-		out.times.useful += iter_duration;
-		if( pid == 0 ) {
-			thcout << "repetition,duration (ms): " << i << "," << iter_duration << std::endl;
-		}
-		out.test_repetitions++;
-		if( rc != SUCCESS ) {
-			break;
-		}
+		out.times.useful += timer.time();
+		ASSERT_RC_SUCCESS( rc );
+		MASTER_PRINT( pid, "repetition,duration (ms): " << i << "," << iter_duration << std::endl );
+		out.inner_test_repetitions++;
 	}
-	out.times.useful /= static_cast< double >( in.test_repetitions );
-
-	out.performed_iterations = cg_out.iterations;
-	out.residual = cg_out.norm_residual;
-
-	if( spmd<>::pid() == 0 ) {
-		if( rc == SUCCESS ) {
-			thcout << "repetitions, average time (ms): " << out.test_repetitions
-				<< ", " << out.times.useful << std::endl;
-		} else {
-			thcerr << "Failure: call to HPCG did not succeed (" << toString( rc )
-				<< ")." << std::endl;
-		}
+	if( in.evaluation_run ) {
+		rc = collectives<>::reduce( iter_duration, 0, operators::max< double >() );
+		ASSERT_RC_SUCCESS( rc );
+		out.inner_test_repetitions = static_cast< size_t >( 1000.0 / out.times.useful ) + 1;
+		MASTER_PRINT( pid, "Evaluation run" << std::endl
+			<< "  computed residual: " << cg_out.norm_residual << std::endl
+			<< "  iterations: " << cg_out.iterations << std::endl
+			<< "  time taken (ms): " << out.times.useful << std::endl
+			<< "  deduced inner repetitions for 1s duration: " << out.inner_test_repetitions << std::endl );
+		return;
 	}
+	out.times.useful /= static_cast< double >( in.inner_test_repetitions );
+
+	MASTER_PRINT( pid, TEXT_HIGHLIGHT << "repetitions,average time (ms): " << out.inner_test_repetitions
+				<< ", " << out.times.useful << std::endl );
 
 	// start postamble
 	timer.reset();
@@ -436,15 +406,14 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	out.error_code = rc;
 
 	grb::set( b, 1.0 );
-	out.square_norm_diff = 0.0;
 	grb::eWiseMul( b, -1.0, x, StdRing() );
+	out.square_norm_diff = 0.0;
 	grb::dot( out.square_norm_diff, b, b, StdRing() );
 
 	// output
-	out.pinnedVector = std::unique_ptr< PinnedVector< NonzeroType > >( new PinnedVector< NonzeroType >( x, SEQUENTIAL ) );
+	out.pinnedVector.reset( new PinnedVector< NonzeroType >( x, SEQUENTIAL ) );
 	// finish timing
-	const double time_taken { timer.time() };
-	out.times.postamble = time_taken;
+	out.times.postamble = timer.time();
 }
 
 /**
@@ -455,21 +424,21 @@ static void parse_arguments( simulation_input &, size_t &, double &, int, char *
 int main( int argc, char ** argv ) {
 	simulation_input sim_in;
 	size_t test_outer_iterations;
-	double max_residual_norm;
+	double max_diff_norm;
 
-	parse_arguments( sim_in, test_outer_iterations, max_residual_norm, argc, argv );
+	parse_arguments( sim_in, test_outer_iterations, max_diff_norm, argc, argv );
 	thcout << "System size x: " << sim_in.nx << std::endl;
 	thcout << "System size y: " << sim_in.ny << std::endl;
 	thcout << "System size z: " << sim_in.nz << std::endl;
 	thcout << "System max coarsening levels " << sim_in.max_coarsening_levels << std::endl;
-	thcout << "Test repetitions: " << sim_in.test_repetitions << std::endl;
+	thcout << "Test repetitions: " << sim_in.inner_test_repetitions << std::endl;
 	thcout << "Max iterations: " << sim_in.max_iterations << std::endl;
 	thcout << "Direct launch: " << std::boolalpha << sim_in.evaluation_run << std::noboolalpha << std::endl;
 	thcout << "No conditioning: " << std::boolalpha << sim_in.no_preconditioning << std::noboolalpha << std::endl;
 	thcout << "Print iteration residual: " << std::boolalpha << sim_in.print_iter_stats << std::noboolalpha << std::endl;
 	thcout << "Smoother steps: " << sim_in.smoother_steps << std::endl;
 	thcout << "Test outer iterations: " << test_outer_iterations << std::endl;
-	thcout << "Maximum norm for residual: " << max_residual_norm << std::endl;
+	thcout << "Maximum norm for residual: " << max_diff_norm << std::endl;
 
 	// the output struct
 	struct output out;
@@ -482,7 +451,7 @@ int main( int argc, char ** argv ) {
 		grb::Launcher< AUTOMATIC > launcher;
 		rc = launcher.exec( &grbProgram, sim_in, out, true );
 		if( rc == SUCCESS ) {
-			sim_in.test_repetitions = out.test_repetitions;
+			sim_in.inner_test_repetitions = out.inner_test_repetitions;
 		} else {
 			thcout << "launcher.exec returns with non-SUCCESS error code " << grb::toString( rc ) << std::endl;
 			std::exit( -1 );
@@ -493,8 +462,8 @@ int main( int argc, char ** argv ) {
 	grb::Benchmarker< AUTOMATIC > benchmarker;
 	rc = benchmarker.exec( &grbProgram, sim_in, out, 1, test_outer_iterations, true );
 	ASSERT_RC_SUCCESS( rc );
-	thcout << "Benchmark completed successfully and took " << out.performed_iterations
-		<< " iterations to converge with residual " << out.residual << std::endl;
+	thcout << "Benchmark completed successfully and took " << out.cg_out.iterations
+		<< " iterations to converge with residual " << out.cg_out.norm_residual << std::endl;
 
 	if( ! out.pinnedVector ) {
 		thcerr << "no output vector to inspect" << std::endl;
@@ -510,37 +479,41 @@ int main( int argc, char ** argv ) {
 
 	ASSERT_RC_SUCCESS( out.error_code );
 
-	double residual_norm { sqrt( out.square_norm_diff ) };
-	thcout << "Residual norm: " << residual_norm << std::endl;
+	double diff_norm { sqrt( out.square_norm_diff ) };
+	thcout << "Norm of difference vector |<exact solution> - <actual solution>|: " << diff_norm << std::endl;
 
-	ASSERT_LT( residual_norm, max_residual_norm );
+	ASSERT_LT( diff_norm, max_diff_norm );
 
 	thcout << "Test OK" << std::endl;
 	return 0;
 }
 
-static void parse_arguments( simulation_input & sim_in, size_t & outer_iterations, double & max_residual_norm, int argc, char ** argv ) {
+static void parse_arguments( simulation_input & sim_in, size_t & outer_iterations, double & max_diff_norm, int argc, char ** argv ) {
 
 	argument_parser parser;
 	parser.add_optional_argument( "--nx", sim_in.nx, PHYS_SYSTEM_SIZE_DEF, "physical system size along x" )
 		.add_optional_argument( "--ny", sim_in.ny, PHYS_SYSTEM_SIZE_DEF, "physical system size along y" )
 		.add_optional_argument( "--nz", sim_in.nz, PHYS_SYSTEM_SIZE_DEF, "physical system size along z" )
 		.add_optional_argument( "--max-coarse-levels", sim_in.max_coarsening_levels, DEF_COARSENING_LEVELS,
-			"maximum level for coarsening; 0 means no coarsening; note: actual "
-			"level may be limited"
+			"maximum level for coarsening; 0 means no coarsening; note: actual level may be limited"
 			" by the minimum system dimension" )
-		.add_optional_argument( "--test-rep", sim_in.test_repetitions, grb::config::BENCHMARKING::inner(), "consecutive test repetitions before benchmarking" )
-		.add_optional_argument( "--init-iter", outer_iterations, grb::config::BENCHMARKING::outer(), "test repetitions with complete initialization" )
-		.add_optional_argument( "--max-iter", sim_in.max_iterations, MAX_ITERATIONS_DEF, "maximum number of HPCG iterations" )
-		.add_optional_argument( "--max-residual-norm", max_residual_norm, MAX_NORM,
-			"maximum norm for the residual to be acceptable (does NOT limit "
+		.add_optional_argument( "--test-rep", sim_in.inner_test_repetitions, grb::config::BENCHMARKING::inner(),
+			"consecutive test repetitions before benchmarking" )
+		.add_optional_argument( "--outer-iterations", outer_iterations, 1,
+			"test repetitions with complete initialization" )
+		.add_optional_argument( "--max-cg-iterations", sim_in.max_iterations, MAX_ITERATIONS_DEF,
+			"maximum number of CG iterations" )
+		.add_optional_argument( "--max-difference-norm", max_diff_norm, MAX_NORM,
+			"maximum acceptable norm |<exact solution> - <actual solution>| (does NOT limit "
 			"the execution of the algorithm)" )
-		.add_optional_argument( "--smoother-steps", sim_in.smoother_steps, SMOOTHER_STEPS_DEF, "number of pre/post-smoother steps; 0 disables smoothing" )
+		.add_optional_argument( "--smoother-steps", sim_in.smoother_steps, SMOOTHER_STEPS_DEF,
+			"number of pre/post-smoother steps; 0 disables smoothing" )
 		.add_option( "--evaluation-run", sim_in.evaluation_run, false,
 			"launch single run directly, without benchmarker (ignore repetitions)" )
-		.add_option( "--no-preconditioning", sim_in.no_preconditioning, false, "do not apply pre-conditioning via multi-grid V cycle" )
-		.add_option( "--print-iter-stats", sim_in.print_iter_stats, false, "on each iteration, print more statistics" );
-
+		.add_option( "--no-preconditioning", sim_in.no_preconditioning, false,
+			"do not apply pre-conditioning via multi-grid V cycle" )
+		.add_option( "--print-iter-stats", sim_in.print_iter_stats, false,
+			"on each iteration, print more statistics" );
 
 	parser.parse( argc, argv );
 
@@ -564,7 +537,7 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 		std::cout << "Setting max coarsening level to " << MAX_COARSENING_LEVELS << " instead of " << sim_in.max_coarsening_levels << std::endl;
 		sim_in.max_coarsening_levels = MAX_COARSENING_LEVELS;
 	}
-	if( sim_in.test_repetitions == 0 ) {
+	if( sim_in.inner_test_repetitions == 0 ) {
 		std::cerr << "ERROR no test runs selected: set \"--test-rep >0\"" << std::endl;
 		std::exit( -1 );
 	}

From d0a9322bd7bc0cc2652aec72abcd680b22e64f61 Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Mon, 5 Dec 2022 17:48:53 +0100
Subject: [PATCH 09/28] fixing default number of coarsening levels

---
 tests/smoke/hpcg.cpp | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index ebed53096..988e181f0 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -70,20 +70,20 @@ template< typename T > void print_norm( const grb::Vector< T > &r, const char *
 #include <utils/print_vec_mat.hpp>
 
 //========== MAIN PROBLEM PARAMETERS =========
-// values modifiable via cmd line args: default set as in reference HPCG
-constexpr size_t PHYS_SYSTEM_SIZE_DEF{ 16UL };
-constexpr size_t PHYS_SYSTEM_SIZE_MIN{ 4UL };
-constexpr size_t DEF_COARSENING_LEVELS{ 1U };
-constexpr size_t MAX_COARSENING_LEVELS{ 4U };
-constexpr size_t MAX_ITERATIONS_DEF{ 56UL };
-constexpr size_t SMOOTHER_STEPS_DEF{ 1 };
-
-// internal values
-constexpr double SYSTEM_DIAG_VALUE { 26.0 };
-constexpr double SYSTEM_NON_DIAG_VALUE { -1.0 };
-constexpr size_t BAND_WIDTH_3D { 13UL };
-constexpr size_t HALO_RADIUS { 1U };
-constexpr double MAX_NORM { 4.0e-14 };
+// default simulation parameters, set as in reference HPCG
+// users can input different ones via the cmd line
+constexpr size_t PHYS_SYSTEM_SIZE_DEF = 16UL;
+constexpr size_t PHYS_SYSTEM_SIZE_MIN = 2UL;
+constexpr size_t MAX_COARSENING_LEVELS = 3U;
+constexpr size_t MAX_ITERATIONS_DEF = 56UL;
+constexpr size_t SMOOTHER_STEPS_DEF = 1;
+
+// internal values defining the simulated physical system
+constexpr double SYSTEM_DIAG_VALUE = 26.0;
+constexpr double SYSTEM_NON_DIAG_VALUE = -1.0;
+constexpr size_t BAND_WIDTH_3D = 13UL;
+constexpr size_t HALO_RADIUS = 1U;
+constexpr double MAX_NORM = 4.0e-14;
 //============================================
 
 using namespace grb;
@@ -494,7 +494,7 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 	parser.add_optional_argument( "--nx", sim_in.nx, PHYS_SYSTEM_SIZE_DEF, "physical system size along x" )
 		.add_optional_argument( "--ny", sim_in.ny, PHYS_SYSTEM_SIZE_DEF, "physical system size along y" )
 		.add_optional_argument( "--nz", sim_in.nz, PHYS_SYSTEM_SIZE_DEF, "physical system size along z" )
-		.add_optional_argument( "--max-coarse-levels", sim_in.max_coarsening_levels, DEF_COARSENING_LEVELS,
+		.add_optional_argument( "--max-coarse-levels", sim_in.max_coarsening_levels, MAX_COARSENING_LEVELS,
 			"maximum level for coarsening; 0 means no coarsening; note: actual level may be limited"
 			" by the minimum system dimension" )
 		.add_optional_argument( "--test-rep", sim_in.inner_test_repetitions, grb::config::BENCHMARKING::inner(),

From 38cfbf8cde61f05d640afd4257322005fd2c50c1 Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Mon, 23 Jan 2023 16:39:44 +0100
Subject: [PATCH 10/28] accepting non-power-of-2 system sizes

---
 tests/smoke/hpcg.cpp | 73 +++++++++++++++++++++++---------------------
 1 file changed, 39 insertions(+), 34 deletions(-)

diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index 988e181f0..8d524ef92 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -182,21 +182,14 @@ template< typename T > void print_norm( const grb::Vector< T > & r, const char *
 
 
 /**
- * Returns the closest power of 2 bigger or equal to \p n .
+ * Allocates the data structure input to the various simulation steps (CG, multi-grid, coarsening, smoothing)
+ * for each level of the multi-grid. The input is the vector of system sizes \p mg_sizes, with sizes in
+ * monotonically \b decreasing order (finest system first).
+ *
+ * This routine is algorithm-agnostic, as long as the constructors of the data types meet the requirements
+ * explained in \ref multigrid_allocate_data().
  */
-template< typename T >
-T static next_pow_2( T n ) {
-	static_assert( std::is_integral< T >::value, "Integral required." );
-	--n;
-	n |= ( n >> 1 );
-	for( unsigned i = 1; i <= sizeof( T ) * 4; i *= 2 ) {
-		const unsigned shift = static_cast< T >( 1U ) << i;
-		n |= ( n >> shift );
-	}
-	return n + 1;
-}
-
-static void allocate_system(
+static void allocate_system_structures(
 	const std::vector< size_t > &mg_sizes,
 	std::vector< std::unique_ptr< mg_data_t > > &system_levels,
 	std::vector< std::unique_ptr< coarsening_data_t > > &coarsener_levels,
@@ -264,7 +257,7 @@ static void build_3d_system(
 	// exclude main system
 	std::transform( mg_generators.cbegin(), mg_generators.cend(), std::back_inserter( mg_sizes  ),
 		[] ( const builder_t &b ) { return b.system_size(); } );
-	allocate_system( mg_sizes, system_levels, coarsener_levels, smoother_levels, holder );
+	allocate_system_structures( mg_sizes, system_levels, coarsener_levels, smoother_levels, holder );
 	assert( mg_generators.size() == system_levels.size() );
 	assert( mg_generators.size() == smoother_levels.size() );
 	assert( mg_generators.size() - 1 == coarsener_levels.size() );
@@ -488,7 +481,13 @@ int main( int argc, char ** argv ) {
 	return 0;
 }
 
-static void parse_arguments( simulation_input & sim_in, size_t & outer_iterations, double & max_diff_norm, int argc, char ** argv ) {
+static void parse_arguments(
+	simulation_input & sim_in,
+	size_t & outer_iterations,
+	double & max_diff_norm,
+	int argc,
+	char ** argv
+) {
 
 	argument_parser parser;
 	parser.add_optional_argument( "--nx", sim_in.nx, PHYS_SYSTEM_SIZE_DEF, "physical system size along x" )
@@ -517,22 +516,6 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 
 	parser.parse( argc, argv );
 
-	// check for valid values
-	size_t ssize { std::max( next_pow_2( sim_in.nx ), PHYS_SYSTEM_SIZE_MIN ) };
-	if( ssize != sim_in.nx ) {
-		std::cout << "Setting system size x to " << ssize << " instead of " << sim_in.nx << std::endl;
-		sim_in.nx = ssize;
-	}
-	ssize = std::max( next_pow_2( sim_in.ny ), PHYS_SYSTEM_SIZE_MIN );
-	if( ssize != sim_in.ny ) {
-		std::cout << "Setting system size y to " << ssize << " instead of " << sim_in.ny << std::endl;
-		sim_in.ny = ssize;
-	}
-	ssize = std::max( next_pow_2( sim_in.nz ), PHYS_SYSTEM_SIZE_MIN );
-	if( ssize != sim_in.nz ) {
-		std::cout << "Setting system size z to " << ssize << " instead of " << sim_in.nz << std::endl;
-		sim_in.nz = ssize;
-	}
 	if( sim_in.max_coarsening_levels > MAX_COARSENING_LEVELS ) {
 		std::cout << "Setting max coarsening level to " << MAX_COARSENING_LEVELS << " instead of " << sim_in.max_coarsening_levels << std::endl;
 		sim_in.max_coarsening_levels = MAX_COARSENING_LEVELS;
@@ -542,7 +525,29 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 		std::exit( -1 );
 	}
 	if( sim_in.max_iterations == 0 ) {
-		std::cout << "Setting number of iterations to 1" << std::endl;
-		sim_in.max_iterations = 1;
+		std::cerr << "ERROR no CG iterations selected: set \"--max-cg-iterations > 0\"" << std::endl;
+		std::exit( -1 );
+	}
+
+	const size_t max_system_divider = 1 << sim_in.max_coarsening_levels;
+	std::cout << "max_system_divider " << max_system_divider << std::endl;
+	for( size_t s : { sim_in.nx, sim_in.ny, sim_in.nz } ) {
+		std::cout << "trying " << s << std::endl;
+		std::lldiv_t div_res = std::div( static_cast< long long >( s ), static_cast< long long >( max_system_divider ) );
+		if ( div_res.rem != 0) {
+			std::cerr << "ERROR: system size " << s << " cannot be coarsened "
+				<< sim_in.max_coarsening_levels << " times because it is not exactly divisible" << std::endl;
+			std::exit( -1 );
+		}
+		std::cout << "div_res.quot " << div_res.quot << std::endl;
+		if ( div_res.quot < static_cast< long long >( PHYS_SYSTEM_SIZE_MIN ) ) {
+			std::cerr << "ERROR: system size " << s << " cannot be coarsened "
+				<< sim_in.max_coarsening_levels << " times because it is too small" << std::endl;
+			std::exit( -1 );
+		}
+		if ( div_res.quot % 2 != 0 ) {
+			std::cerr << "ERROR: the coarsest size " << div_res.rem << " is not a multiple of 2" << std::endl;
+			std::exit( -1 );
+		}
 	}
 }

From b971c8998451f9892c8ea1a350b6c39c6d0919d5 Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Mon, 23 Jan 2023 17:03:49 +0100
Subject: [PATCH 11/28] error if too many coarsening levels

---
 tests/smoke/hpcg.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index 8d524ef92..3e318b0cb 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -517,8 +517,9 @@ static void parse_arguments(
 	parser.parse( argc, argv );
 
 	if( sim_in.max_coarsening_levels > MAX_COARSENING_LEVELS ) {
-		std::cout << "Setting max coarsening level to " << MAX_COARSENING_LEVELS << " instead of " << sim_in.max_coarsening_levels << std::endl;
-		sim_in.max_coarsening_levels = MAX_COARSENING_LEVELS;
+		std::cerr << "ERROR: max coarsening level is " << sim_in.max_coarsening_levels <<
+			"; at most " << MAX_COARSENING_LEVELS << " is allowed" << std::endl;
+		std::exit( -1 );
 	}
 	if( sim_in.inner_test_repetitions == 0 ) {
 		std::cerr << "ERROR no test runs selected: set \"--test-rep >0\"" << std::endl;
@@ -530,16 +531,13 @@ static void parse_arguments(
 	}
 
 	const size_t max_system_divider = 1 << sim_in.max_coarsening_levels;
-	std::cout << "max_system_divider " << max_system_divider << std::endl;
 	for( size_t s : { sim_in.nx, sim_in.ny, sim_in.nz } ) {
-		std::cout << "trying " << s << std::endl;
 		std::lldiv_t div_res = std::div( static_cast< long long >( s ), static_cast< long long >( max_system_divider ) );
 		if ( div_res.rem != 0) {
 			std::cerr << "ERROR: system size " << s << " cannot be coarsened "
 				<< sim_in.max_coarsening_levels << " times because it is not exactly divisible" << std::endl;
 			std::exit( -1 );
 		}
-		std::cout << "div_res.quot " << div_res.quot << std::endl;
 		if ( div_res.quot < static_cast< long long >( PHYS_SYSTEM_SIZE_MIN ) ) {
 			std::cerr << "ERROR: system size " << s << " cannot be coarsened "
 				<< sim_in.max_coarsening_levels << " times because it is too small" << std::endl;

From b2aa0d50788e88cfa55e7c4fbe9f286aaabc86bd Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Fri, 18 Nov 2022 14:53:06 +0100
Subject: [PATCH 12/28] more documentation in utilities and code cleanups:
 simpler logic in benchmark and renaming of MG and HPCG data structures
 documenting facilities for multigrid and HPCG algorithms and multigrid utils

---
 .../{coloring.hpp => greedy_coloring.hpp}     |  20 +-
 include/graphblas/algorithms/hpcg/hpcg.hpp    |  40 +-
 ...builder.hpp => single_point_coarsener.hpp} | 145 +++---
 .../algorithms/hpcg/system_builder.hpp        |  50 +-
 .../algorithms/hpcg/system_building_utils.hpp | 180 +++++--
 .../multigrid/multigrid_building_utils.hpp    |  41 ++
 .../algorithms/multigrid/multigrid_cg.hpp     | 248 ++++-----
 .../algorithms/multigrid/multigrid_data.hpp   |  68 +--
 .../multigrid/multigrid_v_cycle.hpp           | 135 ++---
 .../multigrid/red_black_gauss_seidel.hpp      | 164 +++---
 ...rsener.hpp => single_matrix_coarsener.hpp} |  86 ++--
 .../utils/iterators/IteratorValueAdaptor.hpp  |  61 ++-
 .../utils/iterators/partition_range.hpp       |  73 ++-
 include/graphblas/utils/iterators/utils.hpp   |  24 +
 .../utils/multigrid/array_vector_storage.hpp  |  28 +-
 .../multigrid/dynamic_vector_storage.hpp      |  14 +-
 .../halo_matrix_generator_iterator.hpp        |  69 ++-
 .../linearized_halo_ndim_geometry.hpp         | 226 --------
 .../linearized_halo_ndim_iterator.hpp         | 257 +++++++---
 .../multigrid/linearized_halo_ndim_system.hpp | 482 +++++++++++++++++-
 .../multigrid/linearized_ndim_iterator.hpp    |  96 +++-
 .../multigrid/linearized_ndim_system.hpp      | 145 ++++--
 .../graphblas/utils/multigrid/ndim_system.hpp |  20 +-
 .../graphblas/utils/multigrid/ndim_vector.hpp |  64 ++-
 tests/smoke/hpcg.cpp                          | 258 +++++-----
 tests/utils/matrix_generators.hpp             |  38 +-
 26 files changed, 1958 insertions(+), 1074 deletions(-)
 rename include/graphblas/algorithms/hpcg/{coloring.hpp => greedy_coloring.hpp} (92%)
 rename include/graphblas/algorithms/hpcg/{coarsener_builder.hpp => single_point_coarsener.hpp} (57%)
 rename include/graphblas/algorithms/multigrid/{coarsener.hpp => single_matrix_coarsener.hpp} (70%)
 delete mode 100644 include/graphblas/utils/multigrid/linearized_halo_ndim_geometry.hpp

diff --git a/include/graphblas/algorithms/hpcg/coloring.hpp b/include/graphblas/algorithms/hpcg/greedy_coloring.hpp
similarity index 92%
rename from include/graphblas/algorithms/hpcg/coloring.hpp
rename to include/graphblas/algorithms/hpcg/greedy_coloring.hpp
index f5793b6ca..5b6f80b2c 100644
--- a/include/graphblas/algorithms/hpcg/coloring.hpp
+++ b/include/graphblas/algorithms/hpcg/greedy_coloring.hpp
@@ -15,9 +15,14 @@
  * limitations under the License.
  */
 
+/**
+ * @file greedy_coloring.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Utilities to partition the elements of a mesh via a simple, greedy coloring algorithm.
+ */
 
-#ifndef _H_GRB_ALGORITHMS_HPCG_COLORING
-#define _H_GRB_ALGORITHMS_HPCG_COLORING
+#ifndef _H_GRB_ALGORITHMS_HPCG_GREEDY_COLORING
+#define _H_GRB_ALGORITHMS_HPCG_GREEDY_COLORING
 
 #include <vector>
 #include <cstddef>
@@ -69,11 +74,8 @@ namespace grb {
 			bool reorder_rows_per_color = false
 		) {
 
-			// This function can be used to completely transform any part of the data structures.
-			// Right now it does nothing, so compiling with a check for unused variables results in complaints
-
 			CoordType nrows = system.system_size();
-			row_colors.insert( row_colors.begin(), nrows, nrows ); // value `nrow' means `uninitialized'; initialized colors go from 0 to nrow-1
+			row_colors.insert( row_colors.begin(), nrows, nrows ); // value `nrows' means `uninitialized'; initialized colors go from 0 to nrows-1
 			CoordType totalColors = 1;
 			row_colors[0] = 0; // first point gets color 0
 
@@ -81,6 +83,7 @@ namespace grb {
 			typename grb::utils::multigrid::LinearizedHaloNDimSystem< DIMS, CoordType >::Iterator begin = system.begin();
 			begin.next_element(); // skip first row
 
+			std::vector< bool > assigned( totalColors );
 			while( begin.has_more_elements() ) {
 				CoordType curRow = begin->get_element_linear();
 
@@ -88,7 +91,7 @@ namespace grb {
 					// if color already assigned to curRow
 					continue;
 				}
-				std::vector< bool > assigned( totalColors, false );
+				assigned.assign( totalColors, false );
 				CoordType currentlyAssigned = 0;
 
 				while( begin.has_more_neighbours() ) {
@@ -108,6 +111,7 @@ namespace grb {
 
 				if( currentlyAssigned < totalColors ) {
 					// if there is at least one color left to use, look for it
+					// smallest possible
 					for( CoordType j = 0; j < totalColors; ++j ) {
 						if( !assigned[ j ] ) {
 							// if no neighbor with this color, use it for this row
@@ -163,4 +167,4 @@ namespace grb {
 	} // namespace algorithms
 } // namespace grb
 
-#endif // _H_GRB_ALGORITHMS_HPCG_COLORING
+#endif // _H_GRB_ALGORITHMS_HPCG_GREEDY_COLORING
diff --git a/include/graphblas/algorithms/hpcg/hpcg.hpp b/include/graphblas/algorithms/hpcg/hpcg.hpp
index 9d65aa79f..c4598323a 100644
--- a/include/graphblas/algorithms/hpcg/hpcg.hpp
+++ b/include/graphblas/algorithms/hpcg/hpcg.hpp
@@ -15,13 +15,25 @@
  * limitations under the License.
  */
 
+/**
+ * @dir include/graphblas/algorithms/hpcg
+ * This folder contains the code specific to the HPCG benchmark implementation: generation of the physical system,
+ * generation of the single point coarsener and coloring algorithm.
+ */
+
+/**
+ * @file hpcg.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Utility to build a full HPCG runner, bringing together all needed data structures.
+ */
+
 #ifndef _H_GRB_ALGORITHMS_HPCG_HPCG
 #define _H_GRB_ALGORITHMS_HPCG_HPCG
 
 #include <utility>
 
 #include <graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp>
-#include <graphblas/algorithms/multigrid/coarsener.hpp>
+#include <graphblas/algorithms/multigrid/single_matrix_coarsener.hpp>
 #include <graphblas/algorithms/multigrid/multigrid_v_cycle.hpp>
 #include <graphblas/algorithms/multigrid/multigrid_cg.hpp>
 
@@ -36,14 +48,20 @@ namespace grb {
 			typename InputType,
 			class Ring,
 			class Minus
-		> using HPCGRunnerType = mg_cg_runner< IOType, NonzeroType, InputType, ResidualType,
-			multigrid_runner< IOType, NonzeroType, InputType,
-				red_black_smoother_runner< IOType, NonzeroType, Ring >,
-				single_point_coarsener< IOType, NonzeroType, Ring, Minus >,
+		> using HPCGRunnerType = MultiGridCGRunner< IOType, NonzeroType, InputType, ResidualType,
+			MultiGridRunner< IOType, NonzeroType,
+				RedBlackGSSmootherRunner< IOType, NonzeroType, Ring >,
+				SingleMatrixCoarsener< IOType, NonzeroType, Ring, Minus >,
 				Ring, Minus >,
 			Ring, Minus
 		>;
 
+		/**
+		 * Builds a full HPCG runner object by "assembling" all needed information,
+		 * with default type for smoother, coarsener and multi-grid runner.
+		 *
+		 * @param[in] smoother_steps how many times the smoother should run (both pre- and post-smoothing)
+		 */
 		template<
 			typename IOType,
 			typename ResidualType,
@@ -54,13 +72,13 @@ namespace grb {
 		> HPCGRunnerType< IOType, ResidualType, NonzeroType, InputType, Ring, Minus >
 			build_hpcg_runner( size_t smoother_steps ) {
 
-			single_point_coarsener< IOType, NonzeroType, Ring, Minus > coarsener;
-			red_black_smoother_runner< IOType, NonzeroType, Ring >
-				smoother{ smoother_steps, smoother_steps, 1UL, {}, Ring() };
+			SingleMatrixCoarsener< IOType, NonzeroType, Ring, Minus > coarsener;
+			RedBlackGSSmootherRunner< IOType, NonzeroType, Ring >
+				smoother( { smoother_steps, smoother_steps, 1UL, {}, Ring() } );
 
-			multigrid_runner< IOType, NonzeroType, InputType,
-				red_black_smoother_runner< IOType, NonzeroType, Ring >,
-				single_point_coarsener< IOType, NonzeroType, Ring, Minus >,
+			MultiGridRunner< IOType, NonzeroType,
+				RedBlackGSSmootherRunner< IOType, NonzeroType, Ring >,
+				SingleMatrixCoarsener< IOType, NonzeroType, Ring, Minus >,
 				Ring, Minus
 			> mg_runner( std::move( smoother ), std::move( coarsener ) );
 
diff --git a/include/graphblas/algorithms/hpcg/coarsener_builder.hpp b/include/graphblas/algorithms/hpcg/single_point_coarsener.hpp
similarity index 57%
rename from include/graphblas/algorithms/hpcg/coarsener_builder.hpp
rename to include/graphblas/algorithms/hpcg/single_point_coarsener.hpp
index 2ee848039..a3826c9c0 100644
--- a/include/graphblas/algorithms/hpcg/coarsener_builder.hpp
+++ b/include/graphblas/algorithms/hpcg/single_point_coarsener.hpp
@@ -1,6 +1,6 @@
 
 /*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,14 @@
  * limitations under the License.
  */
 
-#ifndef _H_GRB_ALGORITHMS_HPCG_COARSENER_BUILDER
-#define _H_GRB_ALGORITHMS_HPCG_COARSENER_BUILDER
+/**
+ * @file single_point_coarsener.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Utilities to build the coarsening matrix for an HPCG simulation.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_HPCG_SINGLE_POINT_COARSENER
+#define _H_GRB_ALGORITHMS_HPCG_SINGLE_POINT_COARSENER
 
 #include <cstddef>
 #include <array>
@@ -30,40 +36,44 @@
 namespace grb {
 	namespace algorithms {
 
+		// forward declaration
 		template<
 			size_t DIMS,
 			typename CoordType,
 			typename ValueType
-		>
-		class HPCGCoarsenerBuilder;
+		> class SinglePointCoarsenerBuilder;
 
 		/**
-		 * @brief Class to generate the coarsening matrix of an underlying \p DIMS -dimensional system.
+		 * Iterator class to generate the coarsening matrix for an HPCG simulation.
 		 *
-		 * This class coarsens a finer system to a coarser system by projecting each input value (column),
-		 * espressed in finer coordinates, to an output (row) value espressed in coarser coordinates.
-		 * The coarser sizes are assumed to be row_generator#physical_sizes, while the finer sizes are here
-		 * stored inside #finer_sizes.
+		 * The coarsening matrix samples a single value from the finer space for every element
+		 * of the coarser space; this value is the first one (i.e. the one with smallest coordinates)
+		 * in the finer sub-space corresponding to each coarser element.
 		 *
-		 * The corresponding refinement matrix is obtained by transposing the coarsening matrix.
+		 * This coarsening method is simple but can lead to unstable results, especially with certain combinations
+		 * of smoothers and partitioning methods.
 		 *
-		 * @tparam DIMS number of dimensions of the system
-		 * @tparam T type of matrix values
+		 * This iterator is random-access.
+		 *
+		 * @tparam DIMS number of dimensions
+		 * @tparam CoordType type storing the coordinates and the sizes
+		 * @tparam ValueType type of the nonzero: it must be able to represent 1 (the value to sample
+		 *  the finer value)
 		 */
 		template<
 			size_t DIMS,
 			typename CoordType,
 			typename ValueType
-		> struct HPCGCoarsenerGeneratorIterator {
+		> struct SinglePointCoarsenerIterator {
 
-			friend HPCGCoarsenerBuilder< DIMS, CoordType, ValueType >;
+			friend SinglePointCoarsenerBuilder< DIMS, CoordType, ValueType >;
 
 			using RowIndexType = CoordType; ///< numeric type of rows
 			using ColumnIndexType = CoordType;
 			using LinearSystemType = grb::utils::multigrid::LinearizedNDimSystem< CoordType,
 				grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > >;
 			using LinearSystemIterType = typename LinearSystemType::Iterator;
-			using SelfType = HPCGCoarsenerGeneratorIterator< DIMS, CoordType, ValueType >;
+			using SelfType = SinglePointCoarsenerIterator< DIMS, CoordType, ValueType >;
 			using ArrayType = std::array< CoordType, DIMS >;
 
 			struct _HPCGValueGenerator {
@@ -100,19 +110,16 @@ namespace grb {
 			using reference = const value_type&;
 			using difference_type = typename LinearSystemIterType::difference_type;
 
-			HPCGCoarsenerGeneratorIterator( const SelfType &o ) = default;
+			SinglePointCoarsenerIterator( const SelfType &o ) = default;
 
-			HPCGCoarsenerGeneratorIterator( SelfType &&o ) = default;
+			SinglePointCoarsenerIterator( SelfType &&o ) = default;
 
 			SelfType & operator=( const SelfType & ) = default;
 
 			SelfType & operator=( SelfType && ) = default;
 
 			/**
-			 * @brief Increments the row and the column according to the respective physical sizes,
-			 * thus iterating onto the coarsening matrix coordinates.
-			 *
-			 * @return \code *this \endcode, i.e. the same object with the updates row and column
+			 * Advances \c this by 1 in constant time.
 			 */
 			SelfType & operator++() noexcept {
 				(void) ++_sys_iter;
@@ -120,36 +127,36 @@ namespace grb {
 				return *this;
 			}
 
+			/**
+			 * Advances \c this by \p offset in constant time.
+			 */
 			SelfType & operator+=( size_t offset ) {
 				_sys_iter += offset;
 				update_coords();
 				return *this;
 			}
 
+			/**
+			 * Computes the difference between \c this and \p o as integer.
+			 */
 			difference_type operator-( const SelfType &o ) const {
 				return this->_sys_iter - o._sys_iter;
 			}
 
 			/**
-			 * @brief Returns whether \c this and \p o differ.
+			 * Returns whether \c this and \p o differ.
 			 */
 			bool operator!=( const SelfType &o ) const {
 				return this->_sys_iter != o._sys_iter;
 			}
 
 			/**
-			 * @brief Returns whether \c this and \p o are equal.
+			 * Returns whether \c this and \p o are equal.
 			 */
 			bool operator==( const SelfType &o ) const {
 				return ! this->operator!=( o );
 			}
 
-			/**
-			 * @brief Operator returning the triple to directly access row, column and element values.
-			 *
-			 * Useful when building the matrix by copying the triple of coordinates and value,
-			 * like for the BSP1D backend.
-			 */
 			reference operator*() const {
 				return _val;
 			}
@@ -159,53 +166,48 @@ namespace grb {
 			}
 
 			/**
-			 * @brief Returns the current row, according to the coarser system.
+			 * Returns the current row, within the coarser system.
 			 */
 			inline RowIndexType i() const {
 				return _val.i();
 			}
 
 			/**
-			 * @brief Returns the current column, according to the finer system.
+			 * Returns the current column, within the finer system.
 			 */
 			inline ColumnIndexType j() const {
 				return _val.j();
 			}
 
 			/**
-			 * @brief Returns always 1, as the coarsening keeps the same value.
+			 * Returns always 1, as the coarsening keeps the same value.
 			 */
 			inline ValueType v() const {
 				return _val.v();
 			}
 
 		private:
-			//// incremented when incrementing the row coordinates; is is the ration between
-			//// #finer_sizes and row_generator#physical_sizes
 			const LinearSystemType *_lin_sys;
-			const ArrayType *_steps; ///< array of steps, i.e. how much each column coordinate (finer system) must be
+			const ArrayType *_steps;
 			LinearSystemIterType _sys_iter;
 			value_type _val;
 
 			/**
-			 * @brief Construct a new \c HPCGCoarsenerGeneratorIterator object from the coarser and finer sizes,
-			 * setting its row at \p _current_row and the column at the corresponding value.
+			 * Construct a new SinglePointCoarsenerIterator object starting from the LinearizedNDimSystem
+			 * object \p system describing the \b coarser system and the \b ratios \p steps between each finer and
+			 * the corresponding coarser dimension.
 			 *
-			 * Each finer size <b>must be an exact multiple of the corresponding coarser size</b>, otherwise the
-			 * construction will throw an exception.
-			 *
-			 * @param _coarser_sizes sizes of the coarser system (rows)
-			 * @param _finer_sizes sizes of the finer system (columns)
-			 * @param _current_row row (in the coarser system) to set the iterator on
+			 * @param system LinearizedNDimSystem object describing the coarser system
+			 * @param steps ratios per dimension between finer and coarser system
 			 */
-			HPCGCoarsenerGeneratorIterator(
+			SinglePointCoarsenerIterator(
 				const LinearSystemType &system,
 				const ArrayType &steps
 			) noexcept :
 				_lin_sys( &system ),
 				_steps( &steps ),
 				_sys_iter( _lin_sys->begin() ),
-				_val(0, 0)
+				_val( 0, 0 )
 			{
 				update_coords();
 			}
@@ -216,13 +218,13 @@ namespace grb {
 			}
 
 			/**
-			 * @brief Returns the row coordinates converted to the finer system, to compute
+			 * Returns the row coordinates converted to the finer system, to compute
 			 * the column value.
 			 */
 			ColumnIndexType coarse_rows_to_finer_col() const noexcept {
-				ColumnIndexType finer { 0 };
-				ColumnIndexType s { 1 };
-				for( size_t i { 0 }; i < DIMS; i++ ) {
+				ColumnIndexType finer = 0;
+				ColumnIndexType s = 1;
+				for( size_t i = 0; i < DIMS; i++ ) {
 					s *= (*_steps)[ i ];
 					finer += s * _sys_iter->get_position()[ i ];
 					s *= _lin_sys->get_sizes()[ i ];
@@ -231,21 +233,36 @@ namespace grb {
 			}
 		};
 
+		/**
+		 * Builder object to create iterators that generate a coarsening matrix.
+		 *
+		 * It is a facility to generate beginning and end iterators and abstract the logic away from users.
+		 *
+		 * @tparam DIMS number of dimensions
+		 * @tparam CoordType type storing the coordinates and the sizes
+		 * @tparam ValueType type of the nonzero: it must be able to represent 1 (the value to sample
+		 *  the finer value)
+		 */
 		template<
 			size_t DIMS,
 			typename CoordType,
 			typename ValueType
-		> class HPCGCoarsenerBuilder {
+		> class SinglePointCoarsenerBuilder {
 		public:
 			using ArrayType = std::array< CoordType, DIMS >;
-			using Iterator = HPCGCoarsenerGeneratorIterator< DIMS, CoordType, ValueType >;
-			using SelfType = HPCGCoarsenerBuilder< DIMS, CoordType, ValueType >;
+			using Iterator = SinglePointCoarsenerIterator< DIMS, CoordType, ValueType >;
+			using SelfType = SinglePointCoarsenerBuilder< DIMS, CoordType, ValueType >;
 
-			HPCGCoarsenerBuilder(
+			/**
+			 * Construct a new SinglePointCoarsenerBuilder object from the sizes of finer system
+			 * and those of the coarser system; finer sizes must be an exact multiple of coarser sizes,
+			 * otherwise an exception is raised.
+			 */
+			SinglePointCoarsenerBuilder(
 				const ArrayType &_finer_sizes,
 				const ArrayType &_coarser_sizes
 			) : system( _coarser_sizes.begin(), _coarser_sizes.end() ) {
-				for( size_t i { 0 }; i < DIMS; i++ ) {
+				for( size_t i = 0; i < DIMS; i++ ) {
 					// finer size MUST be an exact multiple of coarser_size
 					std::ldiv_t ratio = std::ldiv( _finer_sizes[ i ], _coarser_sizes[ i ] );
 					if( ratio.quot < 2 || ratio.rem != 0 ) {
@@ -258,26 +275,34 @@ namespace grb {
 				}
 			}
 
-			HPCGCoarsenerBuilder( const SelfType & ) = delete;
+			SinglePointCoarsenerBuilder( const SelfType & ) = delete;
 
-			HPCGCoarsenerBuilder( SelfType && ) = delete;
+			SinglePointCoarsenerBuilder( SelfType && ) = delete;
 
 			SelfType & operator=( const SelfType & ) = delete;
 
 			SelfType & operator=( SelfType && ) = delete;
 
+			/**
+			 * Returns the size of the coarser system, i.e. its number of elements.
+			 */
 			size_t system_size() const {
 				return system.system_size();
 			}
 
+			/**
+			 * Produces a beginning iterator to generate the coarsening matrix.
+			 */
 			Iterator make_begin_iterator() {
 				return Iterator( system, steps );
 			}
 
+			/**
+			 * Produces an end iterator to stop the generation of the coarsening matrix.
+			 */
 			Iterator make_end_iterator() {
 				Iterator result( system, steps );
-				result += system_size() - 1; // do not trigger boundary checks
-				++result;
+				result += system_size(); // do not trigger boundary checks
 				return result;
 			}
 
@@ -292,5 +317,5 @@ namespace grb {
 
 	} // namespace algorithms
 } // namespace grb
-#endif // _H_GRB_ALGORITHMS_HPCG_COARSENER_BUILDER
+#endif // _H_GRB_ALGORITHMS_HPCG_SINGLE_POINT_COARSENER
 
diff --git a/include/graphblas/algorithms/hpcg/system_builder.hpp b/include/graphblas/algorithms/hpcg/system_builder.hpp
index 700718e3b..48a2e640d 100644
--- a/include/graphblas/algorithms/hpcg/system_builder.hpp
+++ b/include/graphblas/algorithms/hpcg/system_builder.hpp
@@ -1,6 +1,6 @@
 
 /*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,17 +18,7 @@
 /**
  * @file system_builders.hpp
  * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Utilities to build matrices for an HPCG simulation in a generic number of dimensions
- *
- * In particular, the main matrices are:
- * - a system matrix, generated from an N-dimenional space of coordinates by iterating along
- *   each dimension in priority order, where the first dimension has highest priority and the last
- *   dimension least priority; for each point (row), all its N-dimensional neighbours within
- *   a given distance are generated for the column
- * - a coarsening matrix, generated by iterating on a coarser system of N dimensions (row) and projecting
- *   each point to a corresponding system of finer sizes
- *
- * @date 2021-04-30
+ * Utilities to build the system matrix for an HPCG simulation in a generic number of dimensions.
  */
 
 #ifndef _H_GRB_ALGORITHMS_HPCG_SYSTEM_BUILDER
@@ -52,6 +42,14 @@
 namespace grb {
 	namespace algorithms {
 
+		/**
+		 * Builder class to build the iterators that generate an HPCG system matrix, describing a
+		 * \p DIMS -dimensional simulation mesh for Fourier-like heat propagation.
+		 *
+		 * @tparam DIMS dimensions of the mesh
+		 * @tparam CoordType type storing the coordinates and sizes of the matrix
+		 * @tparam ValueType nonzero type
+		 */
 		template<
 			size_t DIMS,
 			typename CoordType,
@@ -81,6 +79,14 @@ namespace grb {
 			using Iterator = grb::utils::multigrid::HaloMatrixGeneratorIterator< DIMS, CoordType,
 				ValueType, HPCGDiagGenerator >;
 
+			/**
+			 * Construct a new HPCGSystemBuilder object from the data of the physical system.
+			 *
+			 * @param sizes sizes along each dimension
+			 * @param halo halo size
+			 * @param diag value along the diagonal, for self-interactions
+			 * @param non_diag value outside the diagonal, for element-element interaction
+			 */
 			HPCGSystemBuilder(
 				const std::array< CoordType, DIMS > &sizes,
 				CoordType halo,
@@ -108,26 +114,41 @@ namespace grb {
 
 			HPCGSystemBuilder< DIMS, CoordType, ValueType > & operator=( HPCGSystemBuilder< DIMS, CoordType, ValueType > && ) = default;
 
+			/**
+			 * Number of elements of the mesh.
+			 */
 			size_t system_size() const {
 				return _system.base_system_size();
 			}
 
+			/**
+			 * Total number of neighbors for all elements of the mesh.
+			 */
 			size_t num_neighbors() const {
 				return _system.halo_system_size();
 			}
 
+			/**
+			 * Get the generator object, i.e. the HaloSystemType object that describes the geometry
+			 * of the simulation mesh.
+			 */
 			const HaloSystemType & get_generator() const {
 				return _system;
 			}
 
+			/**
+			 * Builds the beginning iterator to generate the system matrix.
+			 */
 			Iterator make_begin_iterator() const {
 				return Iterator( _system, _diag_generator );
 			}
 
+			/**
+			 * Builds the end iterator to generate the system matrix.
+			 */
 			Iterator make_end_iterator() const {
 				Iterator result( _system, _diag_generator );
-				result += num_neighbors() - 1; // do not trigger boundary checks
-				++result;
+				result += num_neighbors();
 				return result;
 			}
 
@@ -139,7 +160,6 @@ namespace grb {
 				return _diag_generator._non_diag;
 			}
 
-
 		private:
 			HaloSystemType _system;
 			HPCGDiagGenerator _diag_generator;
diff --git a/include/graphblas/algorithms/hpcg/system_building_utils.hpp b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
index 088bb9fb3..c0b522521 100644
--- a/include/graphblas/algorithms/hpcg/system_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
@@ -1,6 +1,6 @@
 
 /*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,10 +16,9 @@
  */
 
 /**
- * @file hpcg_system_building_utils.hpp
+ * @file system_building_utils.hpp
  * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Utilities to build an antire system for HPCG simulations in an arbitrary number of dimensions.
- * @date 2021-04-30
+ * Utilities to build an entire system for HPCG simulations in an arbitrary number of dimensions.
  */
 
 #ifndef _H_GRB_ALGORITHMS_HPCG_SYSTEM_BUILDING_UTILS
@@ -29,23 +28,24 @@
 #include <cassert>
 #include <cstddef>
 #include <memory>
-#include <type_traits>
 #include <algorithm>
 #include <cstdlib>
 #include <stdexcept>
+#include <cmath>
+#include <string>
 
 #include <graphblas.hpp>
 #include <graphblas/utils/iterators/partition_range.hpp>
 
 #include "system_builder.hpp"
-#include "coarsener_builder.hpp"
-#include "coloring.hpp"
+#include "single_point_coarsener.hpp"
+#include "greedy_coloring.hpp"
 
 namespace grb {
 	namespace algorithms {
 
 		/**
-		 * @brief Container of the parameter for HPCG simulation generation: physical system characteristics and
+		 * Container of the parameter for HPCG simulation generation: physical system characteristics and
 		 * coarsening information.
 		 *
 		 * @tparam DIMS dimensions of the physical system
@@ -54,7 +54,7 @@ namespace grb {
 		template<
 			size_t DIMS,
 			typename NonzeroType
-		> struct hpcg_system_params {
+		> struct HPCGSystemParams {
 			std::array< size_t, DIMS > physical_sys_sizes;
 			size_t halo_size;
 			NonzeroType diag_value;
@@ -64,29 +64,43 @@ namespace grb {
 			size_t coarsening_step;
 		};
 
+		/**
+		 * Builds all required system generators for an entire multi-grid simulation; each generator
+		 * corresponds to a level of the HPCG system multi-grid, with increasingly coarser sizes, and can
+		 * generate the system matrix of that level. All pieces of information required to build
+		 * the levels are stored in \p params.
+		 *
+		 * @tparam DIMS number of dimensions
+		 * @tparam CoordType type storing the coordinates and the sizes
+		 * @tparam NonzeroType type of the nonzero
+		 * @param[in] params structure with the parameters to build an entire HPCG simulation
+		 * @param[out] mg_generators std::vector of HPCGSystemBuilder, one per layer of the multi-grid
+		 */
 		template<
 			size_t DIMS,
 			typename CoordType,
 			typename NonzeroType
 		> void hpcg_build_multigrid_generators(
-			const hpcg_system_params< DIMS, NonzeroType > &params,
+			const HPCGSystemParams< DIMS, NonzeroType > &params,
 			std::vector< grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > > &mg_generators
 		) {
 			static_assert( DIMS > 0, "DIMS must be > 0" );
 
-			size_t const current_size{ std::accumulate( params.physical_sys_sizes.cbegin(), params.physical_sys_sizes.cend(), 1UL,
-				std::multiplies< size_t >() ) };
+			size_t const current_size = std::accumulate( params.physical_sys_sizes.cbegin(),
+				params.physical_sys_sizes.cend(), 1UL, std::multiplies< size_t >() );
 			if( current_size > std::numeric_limits< CoordType >::max() ) {
-				throw std::domain_error( "CoordT cannot store the matrix coordinates" );
+				throw std::domain_error( "CoordType cannot store the matrix coordinates" );
 			}
-			size_t min_physical_size { *std::min_element( params.physical_sys_sizes.cbegin(), params.physical_sys_sizes.cend() ) };
+			size_t min_physical_size = *std::min_element( params.physical_sys_sizes.cbegin(),
+				params.physical_sys_sizes.cend() );
 			if( min_physical_size < params.min_phys_size ) {
 				throw std::domain_error( "the initial system is too small" );
 			}
 
 			std::array< CoordType, DIMS > coord_sizes;
 			// type-translate coordinates
-			std::copy( params.physical_sys_sizes.cbegin(), params.physical_sys_sizes.cend(), coord_sizes.begin() );
+			std::copy( params.physical_sys_sizes.cbegin(), params.physical_sys_sizes.cend(),
+				coord_sizes.begin() );
 
 			// generate hierarchical coarseners
 			for( size_t coarsening_level = 0UL;
@@ -94,36 +108,44 @@ namespace grb {
 				coarsening_level++ ) {
 
 				// build generator
-				mg_generators.emplace_back( coord_sizes, params.halo_size, params.diag_value, params.non_diag_value );
+				mg_generators.emplace_back( coord_sizes, params.halo_size,
+					params.diag_value, params.non_diag_value );
 
 				// prepare for new iteration
 				min_physical_size /= params.coarsening_step;
 				std::for_each( coord_sizes.begin(), coord_sizes.end(),
-					[ &params ]( CoordType &v ){ v /= params.coarsening_step; });
-			}
-		}
-
-		template< typename CoordType > void hpcg_split_rows_by_color(
-			const std::vector< CoordType > & row_colors,
-			size_t num_colors,
-			std::vector< std::vector< CoordType > > & per_color_rows
-		) {
-			per_color_rows.resize( num_colors );
-			for( CoordType i = 0; i < row_colors.size(); i++ ) {
-				per_color_rows[ row_colors[ i ] ].push_back( i );
+					[ &params ]( CoordType &v ) {
+						std::ldiv_t ratio = std::ldiv( v, params.coarsening_step );
+						if( ratio.rem != 0 ) {
+							throw std::invalid_argument(
+								std::string( "system size " ) + std::to_string( v ) +
+								std::string( " is not divisible by " ) +
+								std::to_string( params.coarsening_step )
+							);
+						}
+						v = ratio.quot;
+					});
 			}
 		}
 
+		/**
+		 * Populates the system matrix \p M out of the builder \p system_generator.
+		 *
+		 * The matrix \p M must have been previously allocated and initialized with the proper sizes,
+		 * as this procedure only populates it with the nonzeroes generated by \p system_generator.
+		 *
+		 * This function takes care of the parallelism by employing random-access iterators and by
+		 * \b parallelizing the generation across multiple processes in case of distributed execution.
+		 */
 		template <
 			size_t DIMS,
 			typename CoordType,
-			typename NonzeroType,
-			enum grb::Backend B
+			typename NonzeroType
 		> grb::RC hpcg_populate_system_matrix(
 			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &system_generator,
-			grb::Matrix< NonzeroType, B > &M
+			grb::Matrix< NonzeroType > &M
 		) {
-			const size_t pid { spmd<>::pid() };
+			const size_t pid = spmd<>::pid();
 
 			if( pid == 0) {
 				std::cout << "- generating system matrix...";
@@ -133,10 +155,20 @@ namespace grb {
 			typename grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType >::Iterator end(
 				system_generator.make_end_iterator()
 			);
-			grb::utils::partition_iteration_range_on_procs( system_generator.num_neighbors(), begin, end );
+			grb::utils::partition_iteration_range_on_procs( spmd<>::nprocs(), spmd<>::pid(),
+				system_generator.num_neighbors(), begin, end );
 			return buildMatrixUnique( M, begin, end, grb::IOMode::PARALLEL );
 		}
 
+		/**
+		 * Populates the coarsening data \p coarsener (in particular the coarsening matrix) from the
+		 * builder of the finer system \p finer_system_generator and that of the coarser system
+		 * \p coarser_system_generator.
+		 *
+		 * This function takes care of parallelizing the generation by using a random-access iterator
+		 * to generate the coarsening matrix and by distributing the generation across nodes
+		 * of a distributed system (if any).
+		 */
 		template<
 			size_t DIMS,
 			typename CoordType,
@@ -145,7 +177,7 @@ namespace grb {
 		> grb::RC hpcg_populate_coarsener(
 			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &finer_system_generator,
 			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &coarser_system_generator,
-			coarsening_data< IOType, NonzeroType > &coarsener
+			CoarseningData< IOType, NonzeroType > &coarsener
 		) {
 			static_assert( DIMS > 0, "DIMS must be > 0" );
 
@@ -158,8 +190,8 @@ namespace grb {
 				throw std::invalid_argument( "wrong sizes");
 			}
 
-			size_t const rows { coarser_size };
-			size_t const cols { finer_size };
+			size_t const rows = coarser_size;
+			size_t const cols = finer_size;
 
 			assert( finer_sizes.size() == coarser_sizes.size() );
 
@@ -169,18 +201,48 @@ namespace grb {
 											" with rows == <coarser size> and cols == <finer size>" );
 			}
 
-			grb::algorithms::HPCGCoarsenerBuilder< DIMS, CoordType, NonzeroType > coarsener_builder( finer_sizes, coarser_sizes );
-			grb::algorithms::HPCGCoarsenerGeneratorIterator< DIMS, CoordType, NonzeroType > begin( coarsener_builder.make_begin_iterator() );
-			grb::algorithms::HPCGCoarsenerGeneratorIterator< DIMS, CoordType, NonzeroType > end( coarsener_builder.make_end_iterator() );
-			grb::utils::partition_iteration_range_on_procs( coarsener_builder.system_size(), begin, end );
+			using gen_t = typename grb::algorithms::SinglePointCoarsenerBuilder< DIMS, CoordType, NonzeroType >;
+			gen_t coarsener_builder( finer_sizes, coarser_sizes );
+			typename gen_t::Iterator begin( coarsener_builder.make_begin_iterator() ),
+				end( coarsener_builder.make_end_iterator() );
+			grb::utils::partition_iteration_range_on_procs( spmd<>::nprocs(), spmd<>::pid(),
+				coarsener_builder.system_size(), begin, end );
 			return buildMatrixUnique( M, begin, end, grb::IOMode::PARALLEL );
 		}
 
 		namespace internal {
 
+			/**
+			 * Store row values based on their color into separate vectors.
+			 *
+			 * @param[in] row_colors for each row (corresponding to a vector position) its color
+			 * @param[in] num_colors number of colors, i.e. max across all values in \p row_colors + 1
+			 * @param[out] per_color_rows for each position \a i it stores an std::vector with all rows
+			 *  of color \a i inside \p row_colors
+			 */
+			template< typename CoordType > void hpcg_split_rows_by_color(
+				const std::vector< CoordType > & row_colors,
+				size_t num_colors,
+				std::vector< std::vector< CoordType > > & per_color_rows
+			) {
+				per_color_rows.resize( num_colors );
+				for( CoordType i = 0; i < row_colors.size(); i++ ) {
+					per_color_rows[ row_colors[ i ] ].push_back( i );
+				}
+			}
+
+			/**
+			 * Utility class implementing a random-access iterator that always returns a
+			 * \c true value.
+			 *
+			 * It is used in the following to build mask vectors via buildVectorUnique(), where
+			 * all the non-zero positions are \c true.
+			 *
+			 * @tparam CoordType type of the internal coordinate
+			 */
 			template< typename CoordType > struct true_iter {
 
-				static const bool __TRUE = true;
+				// static const bool __TRUE;
 
 				using self_t = true_iter< CoordType >;
 				using iterator_category = std::random_access_iterator_tag;
@@ -225,12 +287,11 @@ namespace grb {
 
 			private:
 				CoordType index;
+				const bool __TRUE = true; // for its address to be passed outside
 			};
 
-			template< typename CoordType > const bool true_iter< CoordType >::__TRUE;
-
 			/**
-			 * @brief Populates \p masks with static color mask generated for a squared matrix of size \p matrix_size .
+			 * Populates \p masks with static color mask generated for a squared matrix of size \p matrix_size .
 			 *
 			 * Colors are built in the range [0, \p colors ), with the mask for color 0 being the array
 			 * of values true in the positions \f$ [0, colors, 2*colors, ..., floor((system_size - 1)/colors) * color] \f$,
@@ -241,17 +302,15 @@ namespace grb {
 			 * only with the \c true values, leading to sparse vectors. This saves on storage space and allows
 			 * GraphBLAS routines (like \c eWiseLambda() ) to iterate only on true values.
 			 *
-			 * @tparam B GraphBLAS backend for the vector
 			 * @param masks output vector of color masks
 			 * @param matrix_size size of the system matrix
 			 * @param colors numbers of colors masks to build; it must be < \p matrix_size
 			 * @return grb::RC the success value returned when trying to build the vector
 			 */
-			template< enum grb::Backend B >
 			grb::RC hpcg_build_static_color_masks(
 				size_t matrix_size,
 				const std::vector< std::vector< size_t > > &per_color_rows,
-				std::vector< grb::Vector< bool, B > > & masks
+				std::vector< grb::Vector< bool> > &masks
 			) {
 				if( ! masks.empty() ) {
 					throw std::invalid_argument( "vector of masks is expected to be empty" );
@@ -273,7 +332,7 @@ namespace grb {
 					std::vector< size_t >::const_iterator end = rows.cend();
 					// partition_iteration_range( rows.size(), begin, end );
 					grb::RC rc = grb::buildVectorUnique( output_mask, begin , end, true_iter< size_t >( 0 ),
-						true_iter< size_t >( std::distance( begin, end ) ), IOMode::SEQUENTIAL );
+						true_iter< size_t >( rows.size() ), IOMode::SEQUENTIAL );
 					if( rc != SUCCESS ) {
 						std::cerr << "error while creating output mask for color " << i << ": "
 							<< toString( rc ) << std::endl;
@@ -297,15 +356,32 @@ namespace grb {
 
 		} // namespace internal
 
+		/**
+		 * Populates the smoothing information \p smoothing_info for a Red-Black Gauss-Seidel smoother
+		 * to be used for an HPCG simulation. The information about the mesh to smooth are passed
+		 * via \p system_generator.
+		 *
+		 * Steps for the smoother generation:
+		 *
+		 * 1. the mesh elements (the system matrix rows) are colored via a greedy algorithm, so that
+		 *  no two neighboring elements have the same color; this phase colors the \b entire system
+		 *  and cannot be parallelized, even in a distributed system, since the current coloring algorithm
+		 *  is \b not distributed
+		 * 2. rows are split according to their color
+		 * 3. for each color \a c the color mask with the corresponding rows is generated:
+		 *  a dedicated sparse grb::Vector<bool> signals the rows of color \a c (by marking them as \c true
+		 *  ); such a vector allows updating all rows of color \a c in \b parallel when used as a mask
+		 *  to an mxv() operation (as done during smoothing)
+		 */
 		template<
 			size_t DIMS,
 			typename CoordType,
 			typename NonzeroType
 		> grb::RC hpcg_populate_smoothing_data(
 			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &system_generator,
-			smoother_data< NonzeroType > &smoothing_info
+			SmootherData< NonzeroType > &smoothing_info
 		) {
-			const size_t pid { spmd<>::pid() };
+			const size_t pid = spmd<>::pid();
 
 			grb::RC rc = set( smoothing_info.A_diagonal, system_generator.get_diag_value() );
 			if( rc != grb::SUCCESS ) {
@@ -321,7 +397,9 @@ namespace grb {
 			std::vector< CoordType > colors, color_counters;
 			hpcg_greedy_color_ndim_system( system_generator.get_generator(), colors, color_counters );
 			std::vector< std::vector< CoordType > > per_color_rows;
-			hpcg_split_rows_by_color( colors, color_counters.size(), per_color_rows );
+			internal::hpcg_split_rows_by_color( colors, color_counters.size(), per_color_rows );
+			colors.clear();
+			colors.shrink_to_fit();
 			if( rc != grb::SUCCESS ) {
 				if( pid == 0 ) {
 					std::cout << "error: " << __LINE__ << std::endl;
diff --git a/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp b/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp
index 34347582e..75d23a7cc 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp
@@ -15,6 +15,12 @@
  * limitations under the License.
  */
 
+/**
+ * @file multigrid_building_utils.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Utilities to allocate data for an entire multi-grid simulation.
+ */
+
 #include <vector>
 #include <memory>
 #include <cstddef>
@@ -25,6 +31,38 @@
 namespace grb {
 	namespace algorithms {
 
+		/**
+		 * Allocates all the levels for an entire multi-grid simulation for the multi-grid v-cycle,
+		 * the coarsener and the smoother. This routine just allocates and initializes the data structures,
+		 * but does \b not populate them, which depends on the specific algorithms.
+		 *
+		 * Thanks to the templating, this routine is meant to be independent from the specific algorithm
+		 * chosen for the simulation, but simply implements the logic to move from one level (finer)
+		 * to the next one (coarser). To be used with any data structure, the constructor of each
+		 * structure must meet a certain interface, as explained in the following.
+		 *
+		 * Note: structures are allocated on the heap and managed via an std::unique_ptr for efficiency
+		 * and convenience: since they may store large data amounts, moving them via their move (copy)
+		 * constructor (as required for the growth of an std::vector) may be costly, and forces the user
+		 * to implement the move constructor for each type (which may be annoying).
+		 * Furthermore, avoiding movement (copy) entirely protects against possible bugs
+		 * in move (copy)-constructor logic (not uncommon in prototypes).
+		 *
+		 * @tparam MGInfoType type holding the information to run the chosen multi-grid algorithm:
+		 * 	its constructor must take in input the coarsening level (0 to \p mg_sizes.size() )
+		 *  and the size of the system matrix for that level
+		 * @tparam CoarsenerInfoType type holding the information for the coarsener;
+		 *  its constructor must take in input the size of the finer system matrix and that of
+		 *  the coarser system matrix (in this order)
+		 * @tparam SmootherInfoType type holding the information for the smoother;
+		 *  its constructor must take in input the size of the system matrix for that level
+		 *
+		 * @param mg_sizes sizes of the system matrix for each level of the multi-grid
+		 * @param system_levels system data (system matrix, residual, solution, ...) for each level
+		 * @param coarsener_levels at position \a i of this vector, data to coarsen from level \a i
+		 *  (system size \p mg_sizes [i] ) to level \a i+1 (system size \p mg_sizes [i+1] )
+		 * @param smoother_levels smoother data for each level
+		 */
 		template<
 			typename MGInfoType,
 			typename CoarsenerInfoType,
@@ -43,6 +81,9 @@ namespace grb {
 			smoother_levels.emplace_back( new SmootherInfoType( finer_size ) ); // create smoother for main
 			for( size_t i = 1; i < mg_sizes.size(); i++ ) {
 				size_t coarser_size = mg_sizes[ i ];
+				if( coarser_size >= finer_size ) {
+					throw std::invalid_argument( "system sizes not monotonically decreasing" );
+				}
 				coarsener_levels.emplace_back( new CoarsenerInfoType( finer_size, coarser_size ) );
 				system_levels.emplace_back( new MGInfoType( i, coarser_size ) );
 				smoother_levels.emplace_back( new SmootherInfoType( coarser_size ) );
diff --git a/include/graphblas/algorithms/multigrid/multigrid_cg.hpp b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
index 2ac3c0770..735f87d81 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
@@ -1,6 +1,6 @@
 
 /*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,11 +16,17 @@
  */
 
 /**
- * @file hpcg.hpp
+ * @dir include/graphblas/algorithms/multigrid
+ * This folder contains the implementation of the algorithms for a basic multi-grid V-cycle solver:
+ * Conjugate Gradient with multi-grid, a basic V-cycle multi-grid implementation, a single-matrix coarsener/
+ * prolonger, an implementation of a Red-Black Gauss-Seidel smoother. These algorithms can be composed
+ * via their specific runners, as in the example HPCG benchmark.
+ */
+
+/**
+ * @file multigrid_cg.hpp
  * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief File with the main routine to run a full HPCG simulation, comprising multi-grid runs
- *        with Red-Black Gauss-Seidel smoothing.
- * @date 2021-04-30
+ * Algorithm and runner for a Conjugate Gradient solver augmented with a multi-grid solver.
  */
 
 #ifndef _H_GRB_ALGORITHMS_MULTIGRID_CG
@@ -30,26 +36,20 @@
 #include <utility>
 
 #include <graphblas.hpp>
+#include <graphblas/utils/Timer.hpp>
 
 #include "multigrid_data.hpp"
 
-#include <graphblas/utils/Timer.hpp>
-
 
 namespace grb {
 	namespace algorithms {
 
 		/**
-		 * @brief Data stucture to store the data for a full HPCG run: system vectors and matrix,
-		 * coarsening information and temporary vectors.
+		 * Data structure to store the vectors specific to the Conjugate Gradient algorithm,
+		 * including inputs, outputs and temporary vectors.
 		 *
-		 * This data structures contains all the needed vectors and matrices to solve a linear system
-		 * \f$ A x = b \f$. As for \ref system_data, internal elements are built and their sizes properly initialized
-		 * to #system_size, but internal values are \b not initialized, as they are left to user's logic.
-		 * Similarly, the coarsening information in #coarser_level is to be initialized by users by properly
-		 * building a \code multigrid_data<IOType, NonzeroType> \endcode object and storing its pointer into
-		 * #coarser_level; on destruction, #coarser_level will also be properly destroyed without
-		 * user's intervention.
+		 * Input and output vectors use the same naming scheme as for the corresponding mathematics,
+		 * where the equation to solve is conventionally written as \f$ A x = b \f$.
 		 *
 		 * @tparam IOType type of values of the vectors for intermediate results
 		 * @tparam NonzeroType type of the values stored inside the system matrix #A
@@ -59,107 +59,83 @@ namespace grb {
 			typename IOType,
 			typename NonzeroType,
 			typename InputType
-		> struct mg_cg_data {
+		> struct MultiGridCGData {
 
-			grb::Vector< InputType > b; ///< right-side vector of known values
+			grb::Vector< InputType > b; ///< Right-side vector of known values.
 			grb::Vector< IOType > u;    ///< temporary vectors (typically for CG exploration directions)
 			grb::Vector< IOType > p;    ///< temporary vector (typically for x refinements coming from the multi-grid run)
-			grb::Vector< IOType > x;    // system solution being refined over the iterations: it us up to the user
-			///< to set the initial solution value
+			grb::Vector< IOType > x;    ///< system solution being refined over the iterations: it is up to the user
+			///< to set the initial solution value to something meaningful
 
 
 			/**
-			 * @brief Construct a new \c hpcg_data object by building vectors and matrices and by setting
-			 * #coarser_level to \c nullptr (i.e. no coarser level is assumed).
-			 *
-			 * @param[in] sys_size the size of the simulated system, i.e. of all the internal vectors and matrices
+			 * Construct a new \c MultiGridCGData object by building its vectors with size \p sys_size.
 			 */
-			mg_cg_data( size_t sys_size ) :
+			MultiGridCGData( size_t sys_size ) :
 				b( sys_size ),
 				u( sys_size ),
 				p( sys_size ),
 				x( sys_size ) {}
 
-			grb::RC zero_temp_vectors() {
-				grb::RC rc = grb::set( u, 0 );
-				rc = rc ? rc : grb::set( p, 0 );
+			grb::RC init_vectors( IOType zero ) {
+				grb::RC rc = grb::set( u, zero );
+				rc = rc ? rc : grb::set( p, zero );
 				return rc;
 			}
 		};
 
+		/**
+		 * Container for various options and algebraic abstractions to be passed to a CG simulation with multi-grid.
+		 */
 		template <
 			typename IOType,
 			typename ResidualType,
-			class Ring = Semiring< grb::operators::add< IOType >, grb::operators::mul< IOType >, grb::identities::zero, grb::identities::one >,
-			class Minus = operators::subtract< IOType >
-		>
-		struct cg_options {
-			bool with_preconditioning;
-			size_t max_iterations;
-			ResidualType tolerance;
-			bool print_iter_stats;
-			Ring ring;
-			Minus minus;
+			class Ring,
+			class Minus
+		> struct CGOptions {
+			bool with_preconditioning; ///<  whether preconditioning is enabled
+			size_t max_iterations; ///< max number of allowed iterations for CG: after that, the solver is halted
+									///< and the result achieved so far returned
+			ResidualType tolerance; ///< ratio between initial residual and current residual that halts the solver
+										///< if reached, for the solution is to be considered "good enough"
+			bool print_iter_stats; ///< whether to print information on the multi-grid and the residual on each iteration
+			Ring ring; ///< algebraic ring to be used
+			Minus minus; ///< minus operator to be used
 		};
 
-
-		template < typename ResidualType > struct cg_out_data {
-			size_t iterations;
-			ResidualType norm_residual;
+		/**
+		 * Structure for the output information of a CG run.
+		 */
+		template < typename ResidualType > struct CGOutInfo {
+			size_t iterations; ///< number of iterations performed
+			ResidualType norm_residual; ///< norm of the final residual
 		};
 
 		/**
-		 * @brief High-Performance Conjugate Gradient algorithm implementation running entirely on GraphBLAS.
+		 * Conjugate Gradient algorithm implementation augmented by a Multi-Grid solver,
+		 * inspired by the High Performance Conjugate Gradient benchmark.
 		 *
-		 * Finds the solution x of an \f$ A x = b \f$ algebraic system by running the HPCG algorithm.
-		 * The implementation here closely follows the reference HPCG benchmark used for the HPCG500 rank,
-		 * visible at https://github.com/hpcg-benchmark/hpcg.
-		 * The only difference is the usage of a Red-Black Gauss-Seidel smoother instead of the standard one
-		 * for performance reasons, as the standard Gauss-Seidel algorithm is inherently sequential and not
-		 * expressible in terms of standard linear algebra operations.
-		 * In particular, this implementation (as the standard one) couples a standard CG algorithm with a V-cycle
-		 * multi-grid solver to initially refine the tentative solution. This refinement step depends on the
-		 * availability of coarsening information, which should be stored inside \p data; otherwise,
-		 * the refinement is not performed and only the CG algorithm is run. For more information on inputs
-		 * and on coarsening information, you may consult the \ref hpcg_data class documentation.
+		 * This CG solver calls the MG solver at the beginning of each iteration to improve
+		 * the initial solution via the residual (thanks to the smoother) and then proceeds with
+		 * the standard CG iteration.
 		 *
-		 * This implementation assumes that the vectors and matrices inside \p data are all correctly initialized
-		 * and populated with the proper values; in particular
-		 * - hpcg_data#x with the initial tentative solution (iterative solutions are also stored here)
-		 * - hpcg_data#A with the system matrix
-		 * - hpcg_data#b with the right-hand side vector \f$ b \f$
-		 * - hpcg_data#A_diagonal with the diagonal values of the matrix
-		 * - hpcg_data#color_masks with the color masks for this level
-		 * - hpcg_data#coarser_level with the information for the coarser multi-grid run (if any)
-		 * The other vectors are assumed to be inizialized (via the usual grb::Vector#Vector(size_t) constructor)
-		 * but not necessarily populated with values, as they are internally populated when needed; hence,
-		 * any previous values are overwritten.
-		 *
-		 * Failuers of GraphBLAS operations are handled by immediately stopping the execution and by returning
+		 * Failures of GraphBLAS operations are handled by immediately stopping the execution and by returning
 		 * the failure code.
 		 *
 		 * @tparam IOType type of result and intermediate vectors used during computation
 		 * @tparam ResidualType type of the residual norm
 		 * @tparam NonzeroType type of matrix values
 		 * @tparam InputType type of values of the right-hand side vector b
-		 * @tparam Ring the ring of algebraic operators zero-values
-		 * @tparam Minus the minus operator for subtractions
+		 * @tparam MultiGridrunnerType type for the multi-grid runner object
+		 * @tparam Ring algebraic ring type
+		 * @tparam Minus minus operator
 		 *
-		 * @param[in,out] data \ref hpcg_data object storing inputs, outputs and temporary vectors used for the computation,
-		 *                     as long as the information for the recursive multi-grid runs
-		 * @param[in] with_preconditioning whether to use pre-conditioning, i.e. to perform multi-grid runs
-		 * @param[in] presmoother_steps number of pre-smoother steps, for multi-grid runs
-		 * @param[in] postsmoother_steps nomber of post-smoother steps, for multi-grid runs
-		 * @param[in] max_iterations maximum number if iterations the simulation may run for; once reached,
-		 *                           the simulation stops even if the residual norm is above \p tolerance
-		 * @param[in] tolerance the tolerance over the residual norm, i.e. the value of the residual norm to stop
-		 *                      the simulation at
-		 * @param[out] iterations numbers of iterations performed
-		 * @param[out] norm_residual norm of the final residual
-		 * @param[in] ring the ring to perform the operations on
-		 * @param[in] minus the \f$ - \f$ operator for vector subtractions
-		 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-		 *                          unsuccessful operation otherwise
+		 * @param cg_data data for the CG solver only
+		 * @param cg_opts options for the CG solver
+		 * @param grid_base base (i.e., finer) level of the multi-grid, with the information of the physical system
+		 * @param multigrid_runner runner object (functor) to call the multi-grid solver
+		 * @param out_info solver output information
+		 * @return grb::RC SUCCESS in case of successful run
 		 */
 		template<
 			typename IOType,
@@ -169,27 +145,28 @@ namespace grb {
 			typename MultiGridrunnerType,
 			class Ring = Semiring< grb::operators::add< IOType >, grb::operators::mul< IOType >, grb::identities::zero, grb::identities::one >,
 			class Minus = operators::subtract< IOType >
-		> grb::RC mg_cg(
-			multigrid_data< IOType, NonzeroType > &grid_base,
-			mg_cg_data< IOType, NonzeroType, InputType > &data,
-			const cg_options< IOType, ResidualType > &cg_opts,
+		> grb::RC multigrid_conjugate_gradient(
+			MultiGridCGData< IOType, NonzeroType, InputType > &cg_data,
+			const CGOptions< IOType, ResidualType, Ring, Minus > &cg_opts,
+			MultiGridData< IOType, NonzeroType > &grid_base,
 			MultiGridrunnerType &multigrid_runner,
-			cg_out_data< ResidualType > &out_data
+			CGOutInfo< ResidualType > &out_info
 		) {
 			ResidualType alpha;
 
-			const grb::Matrix< NonzeroType > &A { grid_base.A };
-			grb::Vector< IOType > &r { grid_base.r };  // residual vector
-			grb::Vector< IOType > &z { grid_base.z };  // pre-conditioned residual vector
-			grb::Vector< IOType > &x { data.x };
-			const grb::Vector< InputType > &b { data.b };
-			grb::Vector< IOType > &p { data.p };  // direction vector
-			grb::Vector< IOType > &Ap { data.u }; // temp vector
-			grb::RC ret { SUCCESS };
+			const grb::Matrix< NonzeroType > &A = grid_base.A;
+			grb::Vector< IOType > &r = grid_base.r;  // residual vector
+			grb::Vector< IOType > &z = grid_base.z;  // pre-conditioned residual vector
+			grb::Vector< IOType > &x = cg_data.x;
+			const grb::Vector< InputType > &b = cg_data.b;
+			grb::Vector< IOType > &p = cg_data.p;  // direction vector
+			grb::Vector< IOType > &Ap = cg_data.u; // temp vector
+			grb::RC ret = SUCCESS;
 
-			ret = ret ? ret : grb::set( Ap, 0 );
-			ret = ret ? ret : grb::set( r, 0 );
-			ret = ret ? ret : grb::set( p, 0 );
+			const IOType io_zero = cg_opts.ring.template getZero< IOType >();
+			ret = ret ? ret : grb::set( Ap, io_zero );
+			ret = ret ? ret : grb::set( r, io_zero );
+			ret = ret ? ret : grb::set( p, io_zero );
 
 			ret = ret ? ret : grb::set( p, x );
 			ret = ret ? ret : grb::mxv< grb::descriptors::dense >( Ap, A, x, cg_opts.ring ); // Ap = A * x
@@ -198,7 +175,8 @@ namespace grb {
 			ret = ret ? ret : grb::eWiseApply( r, b, Ap, cg_opts.minus ); // r = b - Ap;
 			assert( ret == SUCCESS );
 
-			ResidualType norm_residual = cg_opts.ring.template getZero< ResidualType >();
+			const ResidualType residual_zero = cg_opts.ring.template getZero< ResidualType >();
+			ResidualType norm_residual = residual_zero;
 			ret = ret ? ret : grb::dot( norm_residual, r, r, cg_opts.ring ); // norm_residual = r' * r;
 			assert( ret == SUCCESS );
 
@@ -206,10 +184,10 @@ namespace grb {
 			norm_residual = std::sqrt( norm_residual );
 
 			// initial norm of residual
-			out_data.norm_residual = norm_residual;
-			const ResidualType norm_residual_initial { norm_residual };
-			ResidualType old_r_dot_z { 0.0 }, r_dot_z { 0.0 }, beta { 0.0 };
-			size_t iter { 0 };
+			out_info.norm_residual = norm_residual;
+			const ResidualType norm_residual_initial = norm_residual;
+			ResidualType old_r_dot_z = residual_zero, r_dot_z = residual_zero, beta = residual_zero;
+			size_t iter = 0;
 
 			grb::utils::Timer timer;
 
@@ -258,7 +236,7 @@ namespace grb {
 					assert( ret == SUCCESS );
 
 					beta = r_dot_z / old_r_dot_z;
-					ret = ret ? ret : grb::clear( Ap );                         // Ap  = 0;
+					ret = ret ? ret : grb::set( Ap, io_zero );                         // Ap  = 0;
 					ret = ret ? ret : grb::eWiseMulAdd( Ap, beta, p, z, cg_opts.ring ); // Ap += beta * p + z;
 					std::swap( Ap, p );                                         // p = Ap;
 					assert( ret == SUCCESS );
@@ -267,7 +245,7 @@ namespace grb {
 				DBG_print_norm( p, "middle p" );
 #endif
 
-				ret = ret ? ret : grb::set( Ap, 0 );
+				ret = ret ? ret : grb::set( Ap, io_zero );
 				ret = ret ? ret : grb::mxv< grb::descriptors::dense >( Ap, A, p, cg_opts.ring ); // Ap = A * p;
 				assert( ret == SUCCESS );
 #ifdef HPCG_PRINT_STEPS
@@ -302,17 +280,31 @@ namespace grb {
 				}
 
 				++iter;
-				out_data.iterations = iter;
-				out_data.norm_residual = norm_residual;
+				out_info.iterations = iter;
+				out_info.norm_residual = norm_residual;
 			} while( iter < cg_opts.max_iterations &&
 				norm_residual / norm_residual_initial > cg_opts.tolerance && ret == SUCCESS );
 
 			return ret;
 		}
 
-
-
-
+		/**
+		 * Runner object encapsulating all information to run a Conjugate Gradient solver
+		 * with multi-grid.
+		 *
+		 * The multi-grid runner must be constructed separately (depending on the chosen algorithm)
+		 * and move-transferred during construction of this runner.
+		 * The \p MultiGridRunnerType must implement a functional interface whose input (from CG)
+		 * is the structure with the system information for one level of the grid.
+		 *
+		 * @tparam IOType type of result and intermediate vectors used during computation
+		 * @tparam NonzeroType type of matrix values
+		 * @tparam InputType type of values of the right-hand side vector b
+		 * @tparam ResidualType type of the residual norm
+		 * @tparam MultiGridRunnerType type for the multi-grid runner object
+		 * @tparam Ring algebraic ring type
+		 * @tparam Minus minus operator
+		 */
 		template<
 			typename IOType,
 			typename NonzeroType,
@@ -321,35 +313,47 @@ namespace grb {
 			typename MultiGridRunnerType,
 			class Ring,
 			class Minus
+		> struct MultiGridCGRunner {
 
-		> struct mg_cg_runner {
-
-			using HPCGInputType = mg_cg_data< IOType, NonzeroType, InputType >;
+			using HPCGInputType = MultiGridCGData< IOType, NonzeroType, InputType >;
 
 			static_assert( std::is_default_constructible< Ring >::value,
 				"cannot construct the Ring with default values" );
 			static_assert( std::is_default_constructible< Minus >::value,
 				"cannot construct the Minus operator with default values" );
-			// static_assert( std::is_copy_constructible< MultiGridRunnerType >::value,
-			// 	"cannot construct the Multi-Grid runner by copy" );
 			static_assert( std::is_move_constructible< MultiGridRunnerType >::value,
 				"cannot construct the Multi-Grid runner by move" );
 
 			// default value: override with your own
-			cg_options< IOType, ResidualType, Ring, Minus > cg_opts{ true, 10, 0.0, false, Ring(), Minus() };
+			CGOptions< IOType, ResidualType, Ring, Minus > cg_opts = { true, 10,
+				Ring(). template getZero< ResidualType >(), false, Ring(), Minus() };
 
 			MultiGridRunnerType mg_runner;
 
-			mg_cg_runner(
+			/**
+			 * Construct a new MultiGridCGRunner object by moving the required MG runner.
+			 *
+			 * Moving the state of the MG is safer in that it avoids use-after-free issues,
+			 * as the state of the MG runner is managed automatically with this object.
+			 */
+			MultiGridCGRunner(
 				MultiGridRunnerType &&_mg_runner
 			) : mg_runner( std::move( _mg_runner ) ) {}
 
+			/**
+			 * Functional operator to invoke a full CG-MG computation.
+			 *
+			 * @param grid_base base level of the grid
+			 * @param cg_data data for CG
+			 * @param out_info output information from CG
+			 * @return grb::RC indicating the success or the error occurred
+			 */
 			inline grb::RC operator()(
 				typename MultiGridRunnerType::MultiGridInputType &grid_base,
-				mg_cg_data< IOType, NonzeroType, InputType > &data,
-				cg_out_data< ResidualType > &out_data
+				MultiGridCGData< IOType, NonzeroType, InputType > &cg_data,
+				CGOutInfo< ResidualType > &out_info
 			) {
-				return mg_cg( grid_base, data, cg_opts, mg_runner, out_data );
+				return multigrid_conjugate_gradient( cg_data, cg_opts, grid_base, mg_runner, out_info );
 			}
 
 		};
diff --git a/include/graphblas/algorithms/multigrid/multigrid_data.hpp b/include/graphblas/algorithms/multigrid/multigrid_data.hpp
index e76063aec..6462e4019 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_data.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_data.hpp
@@ -1,6 +1,6 @@
 
 /*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,10 +16,9 @@
  */
 
 /**
- * @file hpcg_data.hpp
+ * @file multigrid_data.hpp
  * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Data structures to store HPCG input/output data.
- * @date 2021-04-30
+ * Data structure definition to store the information of a single multi-grid level.
  */
 
 #ifndef _H_GRB_ALGORITHMS_HPCG_DATA
@@ -36,46 +35,33 @@ namespace grb {
 	namespace algorithms {
 
 		/**
-		 * @brief Data container for all multi-grid inputs and outputs.
+		 * This data structure stores information for a \b single multi-grid level. This information
+		 * depends exclusively on the size of the underlying physical system.
+
+		 *
+		 * Internal ALP/GraphBLAS containers are initialized to the proper size,
+		 * but their values are \b not initialized as this depends on the specific algorithm chosen
+		 * for the multi-grid solver. Populating them is user's task.
 		 *
 		 * @tparam IOType Type of values of the vectors for intermediate results
 		 * @tparam NonzeroType Type of the values stored inside the system matrix \p A
 		 *                     and the coarsening matrix #Ax_finer
-		 *
-		 * This data structure stores information for a full multi-grid V cycle, i.e.
-		 * - input and output vectors for solution, residual and temporary vectors
-		 * - coarsening information, in particular the #coarsening_matrix that
-		 *   coarsens a larger system of size #finer_size to the current system
-		 *   of size #system_size
-		 * - the next level of coarsening, pointed to by #coarser_level, possibly being \c nullptr
-		 *   if no further coarsening is desired; note that this information is automatically
-		 *   destructed on object destruction (if any)
-		 *
-		 * Vectors stored here refer to the \b coarsened system (with the exception of #Ax_finer),
-		 * thus having size #system_size; this also holds for the system matrix #A,
-		 * while #coarsening_matrix has size #system_size \f$ \times \f$ #finer_size.
-		 * Hence, the typical usage of this data structure is to coarsen \b external vectors, e.g. vectors
-		 * coming from another \code multigrid_data<IOType, NonzeroType> \endcode object whose #system_size equals
-		 * \code this-> \endcode #fines_size, via \code this-> \endcode #coarsening_matrix and store the coarsened
-		 * vectors internally. Mimicing the recursive behavior of standard multi-grid simulations,
-		 * the information for a further coarsening is stored inside #coarser_level, so that the
-		 * hierarchy of coarsened levels is reflected inside this data structure.
-		 *
-		 * As for \ref system_data, internal vectors and matrices are initialized to the proper size,
-		 * but their values are \b not initialized.
 		 */
 		template<
 			typename IOType,
 			typename NonzeroType
-		> struct multigrid_data {
-
-			const size_t level;
-			const size_t system_size; ///< size of the system, i.e. side of the #A
-			grb::Matrix< NonzeroType > A;                   ///< system matrix
-			grb::Vector< IOType > z;                        ///< multi-grid solution
-			grb::Vector< IOType > r;                        ///< residual
-
-			multigrid_data(
+		> struct MultiGridData {
+
+			const size_t level; ///< level of the grid (0 for the finest physical system)
+			const size_t system_size; ///< size of the system, i.e. side of the #A system matrix
+			grb::Matrix< NonzeroType > A; ///< system matrix
+			grb::Vector< IOType > z; ///< multi-grid solution
+			grb::Vector< IOType > r; ///< residual
+
+			/**
+			 * Construct a new multigrid data object from level information and system size.
+			 */
+			MultiGridData(
 				size_t _level,
 				size_t sys_size
 			) :
@@ -86,13 +72,13 @@ namespace grb {
 				r( sys_size ) {}
 
 			// for safety, disable copy semantics
-			multigrid_data( const multigrid_data< IOType, NonzeroType > & o ) = delete;
+			MultiGridData( const MultiGridData< IOType, NonzeroType > & o ) = delete;
 
-			multigrid_data<IOType, NonzeroType > & operator=( const multigrid_data< IOType, NonzeroType > & ) = delete;
+			MultiGridData<IOType, NonzeroType > & operator=( const MultiGridData< IOType, NonzeroType > & ) = delete;
 
-			grb::RC zero_temp_vectors() {
-				grb::RC rc = grb::set( z, 0 );
-				rc = rc ? rc : grb::set( r, 0 );
+			grb::RC init_vectors( IOType zero ) {
+				grb::RC rc = grb::set( z, zero );
+				rc = rc ? rc : grb::set( r, zero );
 				return rc;
 			}
 		};
diff --git a/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp b/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
index 77b785e2d..963da74d5 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
@@ -1,6 +1,6 @@
 
 /*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,9 +18,8 @@
 /**
  * @file multigrid_v_cycle.hpp
  * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief This file contains the routines for multi-grid solution refinement, including the main routine
+ * This file contains the routines for multi-grid solution refinement, including the main routine
  *        and those for coarsening and refinement of the tentative solution.
- * @date 2021-04-30
  */
 
 #ifndef _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
@@ -33,57 +32,51 @@
 #include <utility>
 
 #include <graphblas.hpp>
-
 #include <graphblas/utils/iterators/IteratorValueAdaptor.hpp>
 
 #include "multigrid_data.hpp"
 
 namespace grb {
 	namespace algorithms {
-		/**
-		 * @brief Namespace for interfaces that should not be used outside of the algorithm namespace.
-		 */
-		namespace internal {
-
-
-
-		} // namespace internal
 
 		/**
-		 * @brief Multi-grid V cycle implementation to refine a given solution.
+		 * Multi-grid V cycle implementation to refine a given solution.
 		 *
 		 * A full multi-grid run goes through the following steps:
-		 * -# if \p presmoother_steps \f$ > 0 \f$, \p presmoother_steps of the Red-Black Gauss-Seidel smoother are run
-		 *    to improve on the initial solution stored into \p data.z
-		 * -# the coarsening of \f$ r - A*z \f$ is computed to find the coarser residual vector
-		 * -# a multi-grid run is recursively performed on the coarser system
-		 * -# the tentative solution from the coarser multi-grid run is prolonged and added to the current tentative solution
-		 *    into \p data.z
-		 * -# this solution is further smoothed for \p postsmoother_steps steps
 		 *
-		 * If coarsening information is not available, the multi-grid run consists in a single smmothing run.
+		 * 1. calls the pre-smoother to improve on the initial solution stored into \p mgiter_begin->z
+		 * 2. coarsens the residual vector
+		 * 3. recursively solves the coarser system
+		 * 4. prolongs the coarser solution into the \p mgiter_begin->z
+		 * 5. further smooths the solution with a post-smoother call
+		 *
+		 * The algorithm moves across grid levels via the STL-like iterators \p mgiter_begin
+		 * and \p mgiter_end and accesses the grid data via the former (using the operator \c * ): when
+		 * \p mgiter_begin \c == \p mgiter_end , a smoothing round is invoked and the recursion halted.
 		 *
-		 * Failuers of GraphBLAS operations are handled by immediately stopping the execution and by returning
-		 * the failure code.
+		 * Failures of GraphBLAS operations are handled by immediately stopping the execution
+		 * and returning the failure code.
 		 *
 		 * @tparam IOType type of result and intermediate vectors used during computation
 		 * @tparam NonzeroType type of matrix values
+		 * @tparam MGSysIterType type of the iterator across grid levels
+		 * @tparam MGSmootherType type of the smoother runner, with prescribed methods for the various
+		 *  smoothing steps
+		 * @tparam CoarsenerType type of the coarsener runner, with prescribed methods for coarsening
+		 *  and prolongation
 		 * @tparam Ring the ring of algebraic operators zero-values
 		 * @tparam Minus the minus operator for subtractions
 		 *
-		 * @param[in,out] data \ref multigrid_data object storing the relevant data for the multi-grid run of the current
-		 *                     clevel
-		 * @param[in,out] coarsening_data pointer to information for the coarsening/refinement operations and for the
-		 *                recursive multi-grid run on the coarsened system; if \c nullptr, no coarsening/refinement occurs
-		 *                and only smoothing occurs on the current solution
-		 * @param[in] presmoother_steps number of pre-smoother steps
-		 * @param[in] postsmoother_steps number of post-smoother steps
-		 * @param[in] ring the ring to perform the operations on
-		 * @param[in] minus the \f$ - \f$ operator for vector subtractions
-		 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-		 *                          unsuccessful operation otherwise
+		 * @param mgiter_begin iterator pointing to the current level of the multi-grid
+		 * @param mgiter_end end iterator, indicating the end of the recursion
+		 * @param smoother callable object to invoke the smoothing steps
+		 * @param coarsener callable object to coarsen and prolong (between current and coarser grid levels)
+		 * @param ring the ring to perform the operations on
+		 * @param minus the \f$ - \f$ operator for vector subtractions
+		 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
+		 *  unsuccessful operation otherwise
 		 */
-		template<
+		template <
 			typename IOType,
 			typename NonzeroType,
 			typename MGSysIterType,
@@ -99,22 +92,21 @@ namespace grb {
 			const Ring &ring,
 			const Minus &minus
 		) {
-			static_assert( std::is_base_of< multigrid_data< IOType, NonzeroType >,
+			static_assert( std::is_base_of< MultiGridData< IOType, NonzeroType >,
 				typename std::decay< decltype( *mgiter_begin ) >::type >::value, "the iterator type MGSysIterType"
-				" must reference an object of type multigrid_data< IOType, NonzeroType >" );
+				" must reference an object of type MultiGridData< IOType, NonzeroType >" );
 
-			RC ret { SUCCESS };
+			RC ret = SUCCESS;
 			assert( mgiter_begin != mgiter_end );
-			multigrid_data< IOType, NonzeroType > &finer_system = *mgiter_begin;
+			MultiGridData< IOType, NonzeroType > &finer_system = *mgiter_begin;
 			++mgiter_begin;
 
 #ifdef HPCG_PRINT_STEPS
 			DBG_println( "mg BEGINNING {" );
 #endif
 
-
 			// clean destination vector
-			ret = ret ? ret : grb::set( finer_system.z, 0 );
+			ret = ret ? ret : grb::set( finer_system.z, ring. template getZero< IOType >() );
 #ifdef HPCG_PRINT_STEPS
 			DBG_print_norm( finer_system.r, "initial r" );
 #endif
@@ -128,7 +120,7 @@ namespace grb {
 #endif
 				return ret;
 			}
-			multigrid_data< IOType, NonzeroType > &coarser_system = *mgiter_begin;
+			MultiGridData< IOType, NonzeroType > &coarser_system = *mgiter_begin;
 
 			// pre-smoother
 			ret = ret ? ret : smoother.pre_smooth( finer_system );
@@ -165,15 +157,31 @@ namespace grb {
 			return ret;
 		}
 
+		/**
+		 * Callable object to invoke the V-cycle multi-grid algorithm, which also requires
+		 * a smoother and a coarsener object.
+		 *
+		 * It is built by transferring into it the state of both the smoother and the coarsener,
+		 * in order to avoid use-after-free issues.
+		 *
+		 * @tparam IOType type of result and intermediate vectors used during computation
+		 * @tparam NonzeroType type of matrix values
+		 * @tparam MGSysIterType type of the iterator across grid levels
+		 * @tparam MGSmootherType type of the smoother runner, with prescribed methods for the various
+		 *  smoothing steps
+		 * @tparam CoarsenerType type of the coarsener runner, with prescribed methods for coarsening
+		 *  and prolongation
+		 * @tparam Ring the ring of algebraic operators and zero values
+		 * @tparam Minus the minus operator for subtractions
+		 */
 		template<
 			typename IOType,
 			typename NonzeroType,
-			typename InputType,
 			typename MGSmootherType,
 			typename CoarsenerType,
 			class Ring,
 			class Minus
-		> struct multigrid_runner {
+		> struct MultiGridRunner {
 
 			static_assert( std::is_default_constructible< Ring >::value,
 				"cannot construct the Ring with default values" );
@@ -184,19 +192,20 @@ namespace grb {
 			static_assert( std::is_move_constructible< CoarsenerType >::value,
 				"CoarsenerType must be move-constructible");
 
-			using MultiGridInputType = multigrid_data< IOType, NonzeroType >;
+			using MultiGridInputType = MultiGridData< IOType, NonzeroType >;
 
 			// check the interface between HPCG and MG match
 			static_assert( std::is_base_of< typename MGSmootherType::SmootherInputType,
 				MultiGridInputType >::value, "input type of the Smoother kernel must match the input from Multi-Grid" );
 
-			MGSmootherType smoother_runner;
-			CoarsenerType coarsener_runner;
-			std::vector< std::unique_ptr< MultiGridInputType > > system_levels;
-			Ring ring;
-			Minus minus;
+			MGSmootherType smoother_runner; ///< object to run the smoother
+			CoarsenerType coarsener_runner; ///< object to run the coarsener
+			std::vector< std::unique_ptr< MultiGridInputType > > system_levels; ///< levels of the grid (finest first)
+			Ring ring; ///< algebraic ring
+			Minus minus; ///< minus operator
 
-			struct Extractor {
+			// operator to extract the reference out of an std::unique_ptr object
+			struct __extractor {
 				MultiGridInputType & operator()(
 					typename std::vector< std::unique_ptr< MultiGridInputType > >::reference &ref
 				) {
@@ -210,23 +219,29 @@ namespace grb {
 				}
 			};
 
-			using UniquePtrExtractor = grb::utils::IteratorValueAdaptor<
+			using __unique_ptr_extractor = grb::utils::IteratorValueAdaptor<
 				typename std::vector< std::unique_ptr< MultiGridInputType > >::iterator,
-				Extractor
+				__extractor
 			>;
 
-
-			multigrid_runner(
+			/**
+			 * Construct a new MultiGridRunner object by moving in the state of the pre-built
+			 * smoother and coarsener.
+			 */
+			MultiGridRunner(
 				MGSmootherType &&_smoother_runner,
 				CoarsenerType &&_coarsener_runner
 			) : smoother_runner( std::move( _smoother_runner ) ),
 				coarsener_runner( std::move(  _coarsener_runner ) ) {}
 
-			inline grb::RC operator()(
-				MultiGridInputType &system
-			) {
-				return multi_grid< IOType, NonzeroType, UniquePtrExtractor, MGSmootherType, CoarsenerType, Ring, Minus >(
-					UniquePtrExtractor( system_levels.begin() += system.level ), UniquePtrExtractor( system_levels.end() ),
+			/**
+			 * Operator to invoke a full multi-grid run starting from the given level.
+			 */
+			inline grb::RC operator()( MultiGridInputType &system ) {
+				return multi_grid< IOType, NonzeroType, __unique_ptr_extractor,
+					MGSmootherType, CoarsenerType, Ring, Minus >(
+					__unique_ptr_extractor( system_levels.begin() += system.level ),
+					__unique_ptr_extractor( system_levels.end() ),
 					smoother_runner, coarsener_runner, ring, minus );
 			}
 		};
diff --git a/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp b/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
index 615b4340b..97d0c80e4 100644
--- a/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
+++ b/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
@@ -1,6 +1,6 @@
 
 /*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,8 +18,7 @@
 /**
  * @file red_black_gauss_seidel.hpp
  * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Contains the routines to perform a forward-backward pass of a Red-Black Gauss-Seidel smoother.
- * @date 2021-04-30
+ * Contains the routines to perform a forward-backward pass of a Red-Black Gauss-Seidel smoother.
  */
 
 #ifndef _H_GRB_ALGORITHMS_RED_BLACK_GAUSS_SEIDEL
@@ -34,30 +33,36 @@
 namespace grb {
 	namespace algorithms {
 
-		template< typename IOType > struct smoother_data {
+		/**
+		 * Data structures to run the RBGS smoother on a single level of the multi-grid.
+		 */
+		template< typename IOType > struct SmootherData {
 
-			grb::Vector< IOType > A_diagonal;               ///< vector with the diagonal of #A
-			grb::Vector< IOType > smoother_temp;            ///< for smoother's intermediate results
+			grb::Vector< IOType > A_diagonal; ///< vector with the diagonal of #A
+			grb::Vector< IOType > smoother_temp; ///< for smoother's intermediate results
 			std::vector< grb::Vector< bool > > color_masks; ///< for color masks
 
-			smoother_data( size_t sys_size ) :
+			/**
+			 * Construct a new SmootherData object from the level size.
+			 */
+			SmootherData( size_t sys_size ) :
 				A_diagonal( sys_size ),
-				smoother_temp( sys_size ) { }
+				smoother_temp( sys_size ) {}
 
 			// for safety, disable copy semantics
-			smoother_data( const smoother_data & o ) = delete;
+			SmootherData( const SmootherData & o ) = delete;
 
-			smoother_data & operator=( const smoother_data & ) = delete;
+			SmootherData & operator=( const SmootherData & ) = delete;
 
-			grb::RC zero_temp_vectors() {
-				return grb::set( smoother_temp, 0 );
+			grb::RC init_vectors( IOType zero ) {
+				return grb::set( smoother_temp, zero );
 			}
 		};
 
 		namespace internal {
 
 			/**
-			 * @brief Runs a single step of Red-Black Gauss-Seidel for a specific color.
+			 * Runs a single step of Red-Black Gauss-Seidel for a specific color.
 			 *
 			 * @tparam IOType type of result and intermediate vectors used during computation
 			 * @tparam NonzeroType type of matrix values
@@ -71,7 +76,7 @@ namespace grb {
 			 * @param[in] color_mask the mask of colors to filter the rows to smooth
 			 * @param[in] ring the ring to perform the operations on
 			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-			 *                          unsuccessful operation otherwise
+			 *  unsuccessful operation otherwise
 			 */
 			template<
 				typename IOType,
@@ -86,8 +91,8 @@ namespace grb {
 				const grb::Vector< bool > & color_mask,
 				const Ring & ring
 			) {
-				RC ret { SUCCESS };
-				ret = ret ? ret : grb::set( smoother_temp, 0 );
+				RC ret = SUCCESS;
+				ret = ret ? ret : grb::set( smoother_temp, ring. template getZero< IOType >() );
 
 				// acc_temp[mask] = A[mask] * x[mask]
 				ret = ret ? ret : grb::mxv< grb::descriptors::safe_overlap >( smoother_temp, color_mask, A, x, ring );
@@ -98,37 +103,38 @@ namespace grb {
 				// x[mask] = r[mask] - smoother_temp[mask] + x[mask] .* diagonal[mask]
 				// x[mask] = x[maks] ./ diagonal[mask]
 				ret = ret ? ret :
-                            grb::eWiseLambda(
-								[ &x, &r, &smoother_temp, &color_mask, &A_diagonal ]( const size_t i ) {
-									// if the mask was properly initialized, the check on the mask value is unnecessary;
-					                // nonetheless, it is left not to violate the semantics of RBGS in case also the false values
-					                // had been initialized (in which case the check is fundamental); if only true values were initialized,
-					                // we expect CPU branch prediction to neutralize the branch cost
-									// if( color_mask[ i ] ) {
-										IOType d = A_diagonal[ i ];
-										IOType v = r[ i ] - smoother_temp[ i ] + x[ i ] * d;
-										x[ i ] = v / d;
-									// }
-								},
-								color_mask, x, r, smoother_temp, A_diagonal );
+					grb::eWiseLambda(
+						[ &x, &r, &smoother_temp, &color_mask, &A_diagonal ]( const size_t i ) {
+							// if the mask was properly initialized, the check on the mask value is unnecessary;
+							// nonetheless, it is left not to violate the semantics of RBGS in case also the false values
+							// had been initialized (in which case the check is fundamental); if only true values were initialized,
+							// we expect CPU branch prediction to neutralize the branch cost
+							// if( color_mask[ i ] ) {
+								IOType d = A_diagonal[ i ];
+								IOType v = r[ i ] - smoother_temp[ i ] + x[ i ] * d;
+								x[ i ] = v / d;
+							// }
+						},
+						color_mask, x, r, smoother_temp, A_diagonal );
 				assert( ret == SUCCESS );
 				return ret;
 			}
 
 			/**
-			 * @brief Runs a single forward and backward pass of Red-Black Gauss-Seidel smoothing on the system stored in \p data.
+			 * Runs a single forward and backward pass of Red-Black Gauss-Seidel smoothing
+			 * on the system stored in \p data.
 			 *
-			 * This routine performs a forward and a backward step of Red-Black Gauss-Seidel for each color stored in \p data.color_masks.
-			 * Color stored inside this container <b>are assumed to be mutually exclusive and to cover all rows of the solution vector<\b>,
-			 * and no check is performed to ensure these assumptions hold. Hence, it is up to user logic to generate and pass correct
-			 * coloring information. Otherwise, \b no guarantees hold on the result.
+			 * This routine performs a forward and a backward step of Red-Black Gauss-Seidel for each color
+			 * stored in \p data.color_masks. Colors stored inside this container
+			 * <b>are assumed to be mutually exclusive and to cover all rows of the solution vector</b>,
+			 * and no check is performed to ensure these assumptions hold. Hence, it is up to user logic
+			 * to pass correct coloring information. Otherwise, \b no guarantees hold on the result.
 			 *
 			 * @tparam IOType type of result and intermediate vectors used during computation
 			 * @tparam NonzeroType type of matrix values
 			 * @tparam Ring the ring of algebraic operators zero-values
 			 *
-			 * @param data \ref system_data data structure with relevant inpus and outputs: system matrix, initial solution,
-			 *             residual, system matrix colors, temporary vectors
+			 * @param[in,out] data structure with the data of a single grid level
 			 * @param[in] ring the ring to perform the operations on
 			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
 			 *                          unsuccessful operation otherwise
@@ -138,22 +144,22 @@ namespace grb {
 				typename NonzeroType,
 				class Ring
 			> grb::RC red_black_gauss_seidel(
-				multigrid_data< IOType, NonzeroType > &data,
-				smoother_data< IOType > &smoothing_info,
+				MultiGridData< IOType, NonzeroType > &data,
+				SmootherData< IOType > &smoothing_info,
 				const Ring & ring
 			) {
-				RC ret { SUCCESS };
+				RC ret = SUCCESS;
 				// forward step
-				std::vector< grb::Vector< bool > >::const_iterator end { smoothing_info.color_masks.cend() };
-				for( std::vector< grb::Vector< bool > >::const_iterator it {
-					smoothing_info.color_masks.cbegin() }; it != end && ret == SUCCESS; ++it ) {
+				using cit_t = typename std::vector< grb::Vector< bool > >::const_iterator;
+				cit_t end = smoothing_info.color_masks.cend();
+				for( cit_t it = smoothing_info.color_masks.cbegin(); it != end && ret == SUCCESS; ++it ) {
 					ret = rbgs_single_step( data.A, smoothing_info.A_diagonal, data.r, data.z,
 						smoothing_info.smoother_temp, *it, ring );
 				}
 				// backward step
-				std::vector< grb::Vector< bool > >::const_reverse_iterator rend { smoothing_info.color_masks.crend() };
-				for( std::vector< grb::Vector< bool > >::const_reverse_iterator rit {
-					smoothing_info.color_masks.crbegin() }; rit != rend && ret == SUCCESS; ++rit ) {
+				using crit_t = typename std::vector< grb::Vector< bool > >::const_reverse_iterator;
+				crit_t rend = smoothing_info.color_masks.crend();
+				for( crit_t rit = smoothing_info.color_masks.crbegin(); rit != rend && ret == SUCCESS; ++rit ) {
 					ret = rbgs_single_step( data.A, smoothing_info.A_diagonal, data.r, data.z,
 						smoothing_info.smoother_temp, *rit, ring );
 				}
@@ -162,64 +168,62 @@ namespace grb {
 
 		} // namespace internal
 
+		/**
+		 * Runner object for the RBGS smoother, with multiple methods for each type of smoothing step:
+		 * pre-, post- and non-recursive, as invoked during a full run of a multi-grid V-cycle.
+		 *
+		 * It stores the information to smooth each level of the grid, to be initialized separately.
+		 *
+		 * @tparam IOType type of result and intermediate vectors used during computation
+		 * @tparam NonzeroType type of matrix values
+		 * @tparam Ring the ring of algebraic operators
+		 */
 		template <
 			typename IOType,
 			typename NonzeroType,
 			class Ring
-		> struct red_black_smoother_runner {
-			size_t presmoother_steps ;
-			size_t postsmoother_steps;
-			size_t non_recursive_smooth_steps;
-			std::vector< std::unique_ptr< smoother_data< IOType > > > levels;
-			Ring ring;
+		> struct RedBlackGSSmootherRunner {
+
+			size_t presmoother_steps; ///< number of pre-smoother steps
+			size_t postsmoother_steps;  ///< number of post-smoother steps
+			size_t non_recursive_smooth_steps;  ///< number of smoother steps for the last grid level
+			std::vector< std::unique_ptr< SmootherData< IOType > > > levels;  ///< for each grid level,
+				///< the smoothing data (finest first)
+			Ring ring;  ///< the algebraic ring
 
 			static_assert( std::is_default_constructible< Ring >::value,
 				"cannot construct the Ring operator with default values" );
 
-			using SmootherInputType = multigrid_data< IOType, NonzeroType >;
+			using SmootherInputType = MultiGridData< IOType, NonzeroType >;
 
-			inline grb::RC pre_smooth(
-				SmootherInputType& data
-			) {
-				return run_smoother( data, presmoother_steps );
+			inline grb::RC pre_smooth( SmootherInputType& data ) {
+				return __run_smoother( data, presmoother_steps );
 			}
 
-			inline grb::RC post_smooth(
-				SmootherInputType& data
-			) {
-				return run_smoother( data, postsmoother_steps );
+			inline grb::RC post_smooth( SmootherInputType& data ) {
+				return __run_smoother( data, postsmoother_steps );
 			}
 
-			inline grb::RC nonrecursive_smooth(
-				SmootherInputType& data
-			) {
-				return run_smoother( data, non_recursive_smooth_steps );
+			inline grb::RC nonrecursive_smooth( SmootherInputType& data ) {
+				return __run_smoother( data, non_recursive_smooth_steps );
 			}
 
 			/**
-			 * @brief Runs \p smoother_steps iteration of the Red-Black Gauss-Seidel smoother, with inputs and outputs stored
-			 * inside \p data.
+			 * Runs \p smoother_steps iteration of the Red-Black Gauss-Seidel smoother,
+			 * with inputs and outputs stored inside \p data.
 			 *
-			 * @tparam IOType type of result and intermediate vectors used during computation
-			 * @tparam NonzeroType type of matrix values
-			 * @tparam Ring the ring of algebraic operators zero-values
-			 *
-			 * @param[in,out] data \ref system_data data structure with relevant inpus and outputs: system matrix, initial solution,
-			 *                     residual, system matrix colors, temporary vectors
-			 * @param[in] smoother_steps how many smoothing steps to run
-			 * @param[in] ring the ring to perform the operations on
-			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-			 *                          unsuccessful operation otherwise
+			 * This is an internal method called by all user-facing methods, because this specific
+			 * smoother performs all smoothing steps the same way.
 			 */
-			grb::RC run_smoother(
+			grb::RC __run_smoother(
 				SmootherInputType &data,
 				const size_t smoother_steps
 			) {
-				RC ret { SUCCESS };
+				RC ret = SUCCESS;
 
-				smoother_data< IOType > &smoothing_info = *( levels.at( data.level ).get() );
+				SmootherData< IOType > &smoothing_info = *( levels.at( data.level ).get() );
 
-				for( size_t i { 0 }; i < smoother_steps && ret == SUCCESS; i++ ) {
+				for( size_t i = 0; i < smoother_steps && ret == SUCCESS; i++ ) {
 					ret = ret ? ret : internal::red_black_gauss_seidel( data, smoothing_info, ring );
 					assert( ret == SUCCESS );
 				}
diff --git a/include/graphblas/algorithms/multigrid/coarsener.hpp b/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
similarity index 70%
rename from include/graphblas/algorithms/multigrid/coarsener.hpp
rename to include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
index 47116c22a..e1ef7db73 100644
--- a/include/graphblas/algorithms/multigrid/coarsener.hpp
+++ b/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
@@ -16,14 +16,13 @@
  */
 
 /**
- * @file hpcg_data.hpp
+ * @file single_matrix_coarsener.hpp
  * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Implementation of the coarsener of HPCG
- * @date 2022-11-08
+ * Implementation of a coarsener using the same matrix for both coarsening and prolongation.
  */
 
-#ifndef _H_GRB_ALGORITHMS_HPCG_COARSENER
-#define _H_GRB_ALGORITHMS_HPCG_COARSENER
+#ifndef _H_GRB_ALGORITHMS_HPCG_SINGLE_MATRIX_COARSENER
+#define _H_GRB_ALGORITHMS_HPCG_SINGLE_MATRIX_COARSENER
 
 #include <vector>
 #include <memory>
@@ -35,37 +34,40 @@
 namespace grb {
 	namespace algorithms {
 
+		/**
+		 * Structure storing the data for the coarsener
+		 */
 		template<
 			typename IOType,
 			typename NonzeroType
-		>
-		struct coarsening_data {
+		> struct CoarseningData {
 
 			grb::Matrix< NonzeroType > coarsening_matrix; ///< matrix of size #system_size \f$ \times \f$ #finer_size
 			///< to coarsen an input vector of size #finer_size into a vector of size #system_size
 			grb::Vector< IOType > Ax_finer; ///< finer vector for intermediate computations, of size #finer_size
 
 			/**
-			 * @brief Construct a new \c coarsening_data by initializing internal data structures
-			 * @param[in] coarser_size size of the current system, i.e. size \b after coarsening
+			 * Construct a new CoarseningData object by initializing internal data structures.
+			 *
 			 * @param[in] _finer_size  size of the finer system, i.e. size of external objects \b before coarsening
+			 * @param[in] coarser_size size of the current system, i.e. size \b after coarsening
 			 */
-			coarsening_data( size_t _finer_size, size_t coarser_size ) :
+			CoarseningData( size_t _finer_size, size_t coarser_size ) :
 				coarsening_matrix( coarser_size, _finer_size ),
 				Ax_finer( _finer_size ) {}
 
-			grb::RC zero_temp_vectors() {
-				return grb::set( Ax_finer, 0 );
+			grb::RC init_vectors( IOType zero ) {
+				return grb::set( Ax_finer, zero );
 			}
 		};
 
 		namespace internal {
 
 			/**
-			 * @brief computes the coarser residual vector \p coarsening_data.r by coarsening
+			 * Computes the coarser residual vector \p r_coarse by coarsening
 			 *        \p coarsening_data.Ax_finer - \p r_fine via \p coarsening_data.coarsening_matrix.
 			 *
-			 * The coarsening information are stored inside \p coarsening_data.
+			 * The coarsening information is stored inside \p coarsening_data.
 			 *
 			 * @tparam IOType type of result and intermediate vectors used during computation
 			 * @tparam NonzeroType type of matrix values
@@ -73,7 +75,7 @@ namespace grb {
 			 * @tparam Minus the minus operator for subtractions
 			 *
 			 * @param[in] r_fine fine residual vector
-			 * @param[in,out] coarsening_data \ref multigrid_data data structure storing the information for coarsening
+			 * @param[in,out] coarsening_data \ref CoarseningData data structure storing the information for coarsening
 			 * @param[in] ring the ring to perform the operations on
 			 * @param[in] minus the \f$ - \f$ operator for vector subtractions
 			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
@@ -87,28 +89,27 @@ namespace grb {
 			> grb::RC compute_coarsening(
 				const grb::Vector< IOType > & r_fine, // fine residual
 				grb::Vector< IOType > & r_coarse, // fine residual
-				coarsening_data< IOType, NonzeroType > & coarsening_data,
+				CoarseningData< IOType, NonzeroType > & coarsening_data,
 				const Ring & ring,
 				const Minus & minus
 			) {
-				RC ret { SUCCESS };
+				RC ret = SUCCESS;
 				// DBG_print_norm( coarsening_data.Ax_finer, "+++ Ax_finer prima" );
-				ret = ret ? ret : grb::eWiseApply( coarsening_data.Ax_finer, r_fine, coarsening_data.Ax_finer,
-									  minus ); // Ax_finer = r_fine - Ax_finer
+				ret = ret ? ret : grb::eWiseApply( coarsening_data.Ax_finer, r_fine,
+					coarsening_data.Ax_finer, minus ); // Ax_finer = r_fine - Ax_finer
 				// DBG_print_norm( coarsening_data.Ax_finer, "+++ Ax_finer dopo" );
 				assert( ret == SUCCESS );
 
 				// actual coarsening, from  ncols(*coarsening_data->A) == *coarsening_data->system_size * 8
 				// to *coarsening_data->system_size
-				ret = ret ? ret : grb::set( r_coarse, 0 );
+				ret = ret ? ret : grb::set( r_coarse, ring.template getZero< IOType >() );
 				ret = ret ? ret : grb::mxv< grb::descriptors::dense >( r_coarse, coarsening_data.coarsening_matrix,
 					coarsening_data.Ax_finer, ring ); // r = coarsening_matrix * Ax_finer
-				// DBG_print_norm( r_coarse, "+++ r_coarse" );
 				return ret;
 			}
 
 			/**
-			 * @brief computes the prolongation of the coarser solution \p coarsening_data.z and stores it into
+			 * Computes the prolongation of the coarser solution \p z_coarse and stores it into
 			 * \p x_fine.
 			 *
 			 * For prolongation, this function uses the matrix \p coarsening_data.coarsening_matrix by transposing it.
@@ -130,10 +131,10 @@ namespace grb {
 			> grb::RC compute_prolongation(
 				const grb::Vector< IOType > & z_coarse,
 				grb::Vector< IOType > & x_fine, // fine residual
-				grb::algorithms::coarsening_data< IOType, NonzeroType > & coarsening_data,
+				grb::algorithms::CoarseningData< IOType, NonzeroType > & coarsening_data,
 				const Ring & ring
 			) {
-				RC ret { SUCCESS };
+				RC ret = SUCCESS;
 				// actual refining, from  *coarsening_data->syztem_size == nrows(*coarsening_data->A) / 8
 				// to nrows(x_fine)
 				ret = ret ? ret : set( coarsening_data.Ax_finer, 0 );
@@ -149,40 +150,55 @@ namespace grb {
 
 		} // namespace internal
 
+		/**
+		 * Runner structure, holding the data to coarsen the levels of a multi-grid simulation.
+		 *
+		 * This coarsener just uses the same matrix to perform the coarsening (via an mxv())
+		 * and the prolongation, using it transposed.
+		 */
 		template<
 			typename IOType,
 			typename NonzeroType,
 			class Ring,
 			class Minus
-		> struct single_point_coarsener {
+		> struct SingleMatrixCoarsener {
 
 			static_assert( std::is_default_constructible< Ring >::value,
 				"cannot construct the Ring with default values" );
 			static_assert( std::is_default_constructible< Minus >::value,
 				"cannot construct the Minus operator with default values" );
 
-			using MultiGridInputType = multigrid_data< IOType, NonzeroType >;
+			using MultiGridInputType = MultiGridData< IOType, NonzeroType >;
 
-			// default value: override with your own
-			std::vector< std::unique_ptr< grb::algorithms::coarsening_data< IOType, NonzeroType > > > coarsener_levels;
+			/**
+			 * Data to coarsen each level, from finer to coarser.
+			 */
+			std::vector< std::unique_ptr< grb::algorithms::CoarseningData< IOType,
+				NonzeroType > > > coarsener_levels;
 			Ring ring;
 			Minus minus;
 
-
-			// single_point_coarsener() = default;
-
+			/**
+			 * Method required by MultiGridRunner before the recursive call, to coarsen
+			 * the residual vector of \p finer (the finer system) into the residual of
+			 * \p coarser (the coarser system).
+			 */
 			inline grb::RC coarsen_residual(
 				const MultiGridInputType &finer,
 				MultiGridInputType &coarser
 			) {
 				// first compute the residual
-				coarsening_data< IOType, NonzeroType > &coarsener = *coarsener_levels[ finer.level ];
-				grb::RC ret = grb::set( coarsener.Ax_finer, 0 );
+				CoarseningData< IOType, NonzeroType > &coarsener = *coarsener_levels[ finer.level ];
+				grb::RC ret = grb::set( coarsener.Ax_finer, ring. template getZero< IOType >() );
 				ret = ret ? ret : grb::mxv< grb::descriptors::dense >( coarsener.Ax_finer, finer.A, finer.z, ring );
-				// DBG_print_norm( coarsener.Ax_finer, "temp Axf" );
+
 				return internal::compute_coarsening( finer.r, coarser.r, coarsener, ring, minus );
 			}
 
+			/**
+			 * Method required by MultiGridRunner after the recursive call, to "prolong" the coarser solution
+			 * into the finer solution.
+			 */
 			inline grb::RC prolong_solution(
 				const MultiGridInputType &coarser,
 				MultiGridInputType &finer
@@ -194,4 +210,4 @@ namespace grb {
 	} // namespace algorithms
 } // namespace grb
 
-#endif // _H_GRB_ALGORITHMS_HPCG_COARSENER
+#endif // _H_GRB_ALGORITHMS_HPCG_SINGLE_MATRIX_COARSENER
diff --git a/include/graphblas/utils/iterators/IteratorValueAdaptor.hpp b/include/graphblas/utils/iterators/IteratorValueAdaptor.hpp
index 81864cb20..bca870af8 100644
--- a/include/graphblas/utils/iterators/IteratorValueAdaptor.hpp
+++ b/include/graphblas/utils/iterators/IteratorValueAdaptor.hpp
@@ -15,11 +15,17 @@
  * limitations under the License.
  */
 
+/**
+ * @dir include/graphblas/utils/iterators
+ * Various utilities to work with STL-like iterators and ALP/GraphBLAS iterators:
+ * adaptors, partitioning facilities, traits and functions to check compile-time
+ * and runtime properties.
+ */
+
 /**
  * @file IteratorValueAdaptor.hpp
  * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Adaptor to extract a given value out of an iterator.
- * @date 2022-10-08
+ * Definition of an adaptor to extract a given value out of an iterator.
  */
 
 #ifndef H_GRB_UTILS_ITERATOR_VALUE_ADAPTOR
@@ -45,9 +51,10 @@ namespace grb {
 			typename AdaptorType
 		> struct IteratorValueAdaptor {
 
-			static_assert( std::is_default_constructible< AdaptorType >::value, "RefType must be default-constructible" );
-			static_assert( std::is_copy_constructible< AdaptorType >::value, "RefType must be copy-constructible" );
-			static_assert( std::is_copy_assignable< AdaptorType >::value, "RefType must be copy-assignable" );
+			static_assert( std::is_copy_constructible< AdaptorType >::value,
+				"AdaptorType must be copy-constructible" );
+			static_assert( std::is_copy_assignable< AdaptorType >::value,
+				"AdaptorType must be copy-assignable" );
 
 			typedef decltype( std::declval< AdaptorType >()( *std::declval< InnerIterType >() ) ) reference;
 			typedef typename std::decay< reference >::type value_type;
@@ -65,31 +72,51 @@ namespace grb {
 			using SelfType = IteratorValueAdaptor< InnerIterType, AdaptorType >;
 
 			/**
-			 * Construct a new Iterator Value Adaptor object fro an actual iterator.
+			 * Construct a new IteratorValueAdaptor object from an actual iterator.
 			 * The adaptor is built via its default constructor.
-			 *
-			 * @param _iter the underlying iterator, to be copied
+			 */
+			IteratorValueAdaptor( typename std::enable_if< std::is_default_constructible< AdaptorType >::value,
+				const InnerIterType & >::type _iter ) :
+				iter( _iter ),
+				adaptor() {}
+
+			/**
+			 * Construct a new IteratorValueAdaptor object from an iterator and an existing adaptor object.
 			 */
 			IteratorValueAdaptor(
-				const InnerIterType &_iter
+				const InnerIterType &_iter,
+				const AdaptorType &_adaptor
 			) :
 				iter( _iter ),
-				adaptor() {}
+				adaptor( _adaptor ) {}
 
 			/**
-			 * Construct a new Iterator Value Adaptor object fro an actual iterator.
+			 * Construct a new Iterator Value Adaptor object from an actual iterator.
 			 * The adaptor is built via its default constructor.
 			 *
 			 * @param _iter the underlying iterator, to be moved
 			 */
 			IteratorValueAdaptor(
-				InnerIterType &&_iter
+				typename std::enable_if< std::is_default_constructible< AdaptorType >::value,
+				InnerIterType && >::type _iter
 			) :
 				iter( std::move( _iter ) ),
 				adaptor() {}
 
+			/**
+			 * Construct a new IteratorValueAdaptor object from an actual iterator
+			 * and an existing adaptor object by moving their state.
+			 */
+			IteratorValueAdaptor(
+				InnerIterType &&_iter,
+				AdaptorType &&_adaptor
+			) :
+				iter( std::move( _iter ) ),
+				adaptor( std::move( _adaptor ) ) {}
+
 			IteratorValueAdaptor() = delete;
 
+			// since it is an iterator, we MUST have copy and move semantics
 			IteratorValueAdaptor( const SelfType & ) = default;
 
 			IteratorValueAdaptor( SelfType && ) = default;
@@ -112,12 +139,18 @@ namespace grb {
 
 			SelfType& operator++() { ++iter; return *this; }
 
-			SelfType & operator+=( typename std::enable_if< is_random_access, const size_t >::type offset ) {
+			SelfType & operator+=(
+				typename std::enable_if< is_random_access,
+				const size_t >::type offset
+			) {
 				iter += offset;
 				return *this;
 			}
 
-			difference_type operator-( typename std::enable_if< is_random_access, const SelfType & >::type other ) {
+			difference_type operator-(
+				typename std::enable_if< is_random_access,
+				const SelfType & >::type other
+			) {
 				return iter - other.iter;
 			}
 		};
diff --git a/include/graphblas/utils/iterators/partition_range.hpp b/include/graphblas/utils/iterators/partition_range.hpp
index dd5f397c4..60d228b3a 100644
--- a/include/graphblas/utils/iterators/partition_range.hpp
+++ b/include/graphblas/utils/iterators/partition_range.hpp
@@ -15,6 +15,12 @@
  * limitations under the License.
  */
 
+/**
+ * @file partition_range.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Definition of utilities to partition iterators across processes.
+ */
+
 #include <cstddef>
 #include <algorithm>
 #include <cassert>
@@ -25,44 +31,73 @@
 namespace grb {
 	namespace utils {
 
-		template< typename T > void partition_nonzeroes(
+		/**
+		 * Partitions the size of a collection across processes and computes the offsets
+		 * delimiting the local partition, i.e. the half-open range [ \p first_offset, \p local_size ).
+		 *
+		 * @tparam T size type
+		 * @param[in] num_procs total number of processes
+		 * @param[in] this_proc ID of current process
+		 * @param[in] num_elements total number of elements in the collection
+		 * @param[out] first_offset offset to the first element of the local partition
+		 * @param[out] local_size offset one past the last element of the local partition (an end offset, not an element count)
+		 */
+		template< typename T > void partition_collection_size(
 				size_t num_procs,
 				size_t this_proc,
 				T num_elements,
 				T& first_offset,
-				T& last_offset
+				T& local_size
 		) {
-			const T per_process{ ( num_elements + num_procs - 1 ) / num_procs }; // round up
+			const T per_process = ( num_elements + num_procs - 1 ) / num_procs; // round up
 			first_offset = std::min( per_process * static_cast< T >( this_proc ), num_elements );
-			last_offset = std::min( first_offset + per_process, num_elements );
+			local_size = std::min( first_offset + per_process, num_elements );
 		}
 
+		/**
+		 * Partitions an iteration range across processes according to the given information.
+		 *
+		 * With \p num_procs processes and \p this_proc < \p num_procs and a collection of \p num_elements
+		 * elements across all processes, it partitions the collection evenly among processes and sets
+		 * \p begin and \p end so that they iterate over the local partition designated by \p this_proc.
+		 *
+		 * It works also for a single-process scenario.
+		 *
+		 * Note: the number of processes and the ID of the current process is expected in input
+		 * not to introduce dependencies on separate code paths.
+		 *
+		 * @tparam IterT iterator type
+		 * @param[in] num_procs number of processes
+		 * @param[in] this_proc Id of current process
+		 * @param[in] num_elements number of elements of the collection; it can be computed as
+		 *  \code std::distance( begin, end ) \endcode
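+		 * @param[in,out] begin on input, beginning of the whole collection; on output, beginning of the local partition
+		 * @param[in,out] end on input, end of the whole collection; on output, end of the local partition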
+		 */
 		template< typename IterT > void partition_iteration_range_on_procs(
 			size_t num_procs,
 			size_t this_proc,
-			size_t num_nonzeroes,
+			size_t num_elements,
 			IterT &begin,
 			IterT &end
 		) {
 			static_assert( std::is_base_of< std::random_access_iterator_tag,
 				typename std::iterator_traits< IterT >::iterator_category >::value,
 				"the given iterator is not a random access one" );
-			assert( num_nonzeroes == static_cast< size_t >( end - begin ) );
-			size_t first, last;
-			partition_nonzeroes( num_procs, this_proc, num_nonzeroes, first, last );
-			if( last < num_nonzeroes ) {
+			assert( this_proc < num_procs );
+			assert( num_elements == static_cast< size_t >( end - begin ) );
+			if( num_procs == 1 ) {
+				return;
+			}
+			size_t first, num_local_elements;
+			partition_collection_size( num_procs, this_proc, num_elements, first, num_local_elements );
+			if( num_local_elements < num_elements ) {
 				end = begin;
-				end += last;
+				end += num_local_elements;
+			}
+			if( first > 0 ) {
+				begin += first;
 			}
-			begin += first;
-		}
-
-		template< typename IterT > void partition_iteration_range_on_procs(
-			size_t num_nonzeroes,
-			IterT &begin,
-			IterT &end
-		) {
-			return partition_iteration_range_on_procs( spmd<>::nprocs(), spmd<>::pid(), num_nonzeroes, begin, end );
 		}
 
 	} // namespace utils
diff --git a/include/graphblas/utils/iterators/utils.hpp b/include/graphblas/utils/iterators/utils.hpp
index b56899c83..0b635578d 100644
--- a/include/graphblas/utils/iterators/utils.hpp
+++ b/include/graphblas/utils/iterators/utils.hpp
@@ -25,6 +25,8 @@
 #define _H_GRB_ITERATOR_UTILS
 
 #include <cstddef>
+#include <algorithm>
+#include <type_traits>
 
 #include <graphblas/rc.hpp>
 #include <graphblas/type_traits.hpp>
@@ -78,6 +80,28 @@ namespace grb {
 			return SUCCESS;
 		}
 
+		/**
+		 * Computes the difference \p a - \p b and returns it as the given signed
+		 * type \p DiffType.
+		 *
+		 * Throws std::range_error if the magnitude of the difference exceeds the maximum of \p DiffType.
+		 */
+		template<
+			typename DiffType,
+			typename SizeType
+		> DiffType compute_signed_distance(
+			const SizeType a,
+			const SizeType b
+		) {
+			static_assert( std::is_signed< DiffType >::value, "DiffType should be signed" );
+			const SizeType diff = std::max( a, b ) - std::min( a, b );
+			if( diff > static_cast< SizeType >( std::numeric_limits< DiffType >::max() ) ) {
+				throw std::range_error( "cannot represent difference" );
+			}
+			DiffType result = static_cast< DiffType >( diff );
+			return a >= b ? result : -result ;
+		}
+
 	} // end namespace utils
 
 } // end namespace grb
diff --git a/include/graphblas/utils/multigrid/array_vector_storage.hpp b/include/graphblas/utils/multigrid/array_vector_storage.hpp
index 8eb1e4377..a40850f77 100644
--- a/include/graphblas/utils/multigrid/array_vector_storage.hpp
+++ b/include/graphblas/utils/multigrid/array_vector_storage.hpp
@@ -19,9 +19,7 @@
  * @file array_vector_storage.cpp
  * @author Alberto Scolari (alberto.scolari@huawei.com)
  * Extension of std::array<> exposing a larger interface and the underlying
- * 	storage structure.
- *
- * @date 2022-10-24
+ * storage structure.
  */
 
 #ifndef _H_GRB_ALGORITHMS_MULTIGRID_ARRAY_VECTOR_STORAGE
@@ -38,12 +36,12 @@ namespace grb {
 
 			/**
 			 * Array with fixed size based on std::array with an interface compliant to what other classes
-			 * in the geometry namespace expect, like storage() and dimensions() methods.
+			 * in the geometry namespace expect, like #storage() and #dimensions() methods.
 			 *
 			 * It describes a vector of dimensions #dimensions().
 			 *
-			 * @tparam DataType the data type of the vector elements
 			 * @tparam DIMS the dimensions of the vector
+			 * @tparam DataType the data type of the vector elements
 			 */
 			template<
 				size_t DIMS,
@@ -55,6 +53,12 @@ namespace grb {
 				using ConstVectorStorageType = const std::array< DataType, DIMS >&;
 				using SelfType = ArrayVectorStorage< DIMS, DataType >;
 
+				/**
+				 * Construct a new Array Vector Storage object of given dimensions;
+				 * internal values are \b not initialized.
+				 *
+				 * \p _dimensions must be equal to \p DIMS, or an exception is thrown.
+				 */
 				ArrayVectorStorage( size_t _dimensions ) {
 					static_assert( DIMS > 0, "cannot allocate 0-sized array" );
 					if( _dimensions != DIMS ) {
@@ -71,23 +75,31 @@ namespace grb {
 
 				ArrayVectorStorage( SelfType &&o ) = delete;
 
-				SelfType& operator=(
-					const SelfType &original
-				) noexcept {
+				SelfType& operator=( const SelfType &original ) noexcept {
 					std::copy_n( original.begin(), DIMS, this->begin() );
 					return *this;
 				}
 
 				SelfType & operator=( SelfType &&original ) = delete;
 
+				/**
+				 * Returns the geometrical dimensions of this vector, i.e. of the
+				 * geometrical space it refers to.
+				 */
 				constexpr size_t dimensions() const {
 					return DIMS;
 				}
 
+				/**
+				 * Returns a reference to the underlying storage object.
+				 */
 				inline VectorStorageType storage() {
 					return *this;
 				}
 
+				/**
+				 * Returns a const reference to the underlying storage object.
+				 */
 				inline ConstVectorStorageType storage() const {
 					return *this;
 				}
diff --git a/include/graphblas/utils/multigrid/dynamic_vector_storage.hpp b/include/graphblas/utils/multigrid/dynamic_vector_storage.hpp
index 9168f175c..9e5b7f92e 100644
--- a/include/graphblas/utils/multigrid/dynamic_vector_storage.hpp
+++ b/include/graphblas/utils/multigrid/dynamic_vector_storage.hpp
@@ -15,6 +15,12 @@
  * limitations under the License.
  */
 
+/**
+ * @file dynamic_vector_storage.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Extension of a heap-allocated array exposing the underlying storage and iterators.
+ */
+
 #ifndef _H_GRB_ALGORITHMS_MULTIGRID_DYNAMIC_VECTOR_STORAGE
 #define _H_GRB_ALGORITHMS_MULTIGRID_DYNAMIC_VECTOR_STORAGE
 
@@ -22,14 +28,6 @@
 #include <cstddef>
 #include <algorithm>
 
-/**
- * @file dynamic_vector_storage.cpp
- * @author Alberto Scolari (alberto.scolari@huawei.com)
- * Extension of a heap-allocated array exposing the underlying storage and iterators.
- *
- * @date 2022-10-24
- */
-
 namespace grb {
 	namespace utils {
 		namespace multigrid {
diff --git a/include/graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp b/include/graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp
index 2bd82ff35..2404cdf00 100644
--- a/include/graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp
+++ b/include/graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp
@@ -1,18 +1,67 @@
 
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @dir include/graphblas/utils/multigrid
+ * This folder contains various utilities to describe an N-dimensional mesh (possibly with halo)
+ * and iterate through its elements and through the neighbors of each element, possibly generating
+ * a matrix out of this information.
+ *
+ * These facilities are used to generate system matrices and various inputs for multi-grid simulations.
+ */
+
+/**
+ * @file halo_matrix_generator_iterator.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Definition of HaloMatrixGeneratorIterator.
+ */
+
 #ifndef _H_GRB_ALGORITHMS_MULTIGRID_HALO_MATRIX_GENRATOR_ITERATOR
 #define _H_GRB_ALGORITHMS_MULTIGRID_HALO_MATRIX_GENRATOR_ITERATOR
 
 #include <cstddef>
 
+#include "array_vector_storage.hpp"
 #include "linearized_halo_ndim_system.hpp"
 #include "linearized_ndim_system.hpp"
 #include "linearized_ndim_iterator.hpp"
-#include "array_vector_storage.hpp"
 
 namespace grb {
 	namespace utils {
 		namespace multigrid {
 
+			/**
+			 * Iterator type to generate a matrix on top of the couples <element>-<neighbor> of an
+			 * \p DIMS -dimensional mesh.
+			 *
+			 * This iterator is random-access and meets the interface of an ALP/GraphBLAS
+			 * input iterator, i.e. an object of this type \a it has methods \a i(), \a j() and
+			 * \a v() to describe a nonzero triplet (row index, column index and value, respectively).
+			 *
+			 * This data structure is based on the LinearizedHaloNDimIterator class, essentially wrapping the
+			 * underlying element index as \a i() and the neighbor index as \a j(); the value \a v()
+			 * is user-customizable via a functor of type \p ValueCallable, which emits the nonzero
+			 * of type \p ValueType based on the passed values of \a i() and \a j().
+			 *
+			 * @tparam DIMS number of dimensions
+			 * @tparam CoordType type storing the coordinate and the system sizes along each dimension
+			 * @tparam ValueType type of nonzeroes
+			 * @tparam ValueCallable callable object producing the nonzero value based on \a i() and \a j()
+			 */
 			template<
 				size_t DIMS,
 				typename CoordType,
@@ -55,8 +104,6 @@ namespace grb {
 					}
 
 				private:
-					// ValueType diagonal_value;     ///< value to be emitted when the object has moved to the diagonal
-					// ValueType non_diagonal_value; ///< value to emit outside of the diagonal
 					ValueCallable _value_producer;
 					RowIndexType _i;
 					ColumnIndexType _j;
@@ -70,7 +117,7 @@ namespace grb {
 				using difference_type = typename Iterator::difference_type;
 
 				/**
-				 * @brief Construct a new \c HaloMatrixGeneratorIterator object, setting the current row as \p row
+				 * Construct a new \c HaloMatrixGeneratorIterator object, setting the current row as \p row
 				 * and emitting \p diag if the iterator has moved on the diagonal, \p non_diag otherwise.
 				 *
 				 * @param sizes array with the sizes along the dimensions
@@ -94,7 +141,7 @@ namespace grb {
 				SelfType & operator=( const SelfType & ) = default;
 
 				/**
-				 * @brief Increments the iterator by moving coordinates to the next (row, column) to iterate on.
+				 * Increments the iterator by moving coordinates to the next (row, column) to iterate on.
 				 *
 				 * This operator internally increments the columns coordinates until wrap-around, when it increments
 				 * the row coordinates and resets the column coordinates to the first possible columns; this column coordinate
@@ -119,7 +166,7 @@ namespace grb {
 				}
 
 				/**
-				 * @brief Operator to compare \c this against \p o  and return whether they differ.
+				 * Operator to compare \c this against \p o  and return whether they differ.
 				 *
 				 * @param o object to compare \c this against
 				 * @return true of the row or the column is different between \p o and \c this
@@ -130,7 +177,7 @@ namespace grb {
 				}
 
 				/**
-				 * @brief Operator to compare \c this against \p o  and return whether they are equal.
+				 * Operator to compare \c this against \p o  and return whether they are equal.
 				 *
 				 * @param o object to compare \c this against
 				 * @return true of the row or the column is different between \p o and \c this
@@ -141,7 +188,7 @@ namespace grb {
 				}
 
 				/**
-				 * @brief Operator returning the triple to directly access row, column and element values.
+				 * Operator returning the triple to directly access row, column and element values.
 				 *
 				 * Useful when building the matrix by copying the triple of coordinates and value,
 				 * like for the BSP1D backend.
@@ -155,21 +202,21 @@ namespace grb {
 				}
 
 				/**
-				 * @brief Returns the current row.
+				 * Returns the current row.
 				 */
 				inline RowIndexType i() const {
 					return _val.i();
 				}
 
 				/**
-				 * @brief Returns the current column.
+				 * Returns the current column.
 				 */
 				inline ColumnIndexType j() const {
 					return _val.j();
 				}
 
 				/**
-				 * @brief Returns the current matrix value.
+				 * Returns the current matrix value.
 				 *
 				 * @return ValueType #diagonal_value if \code row == column \endcode (i.e. if \code this-> \endcode
 				 * #i() \code == \endcode \code this-> \endcode #j()), #non_diagonal_value otherwise
diff --git a/include/graphblas/utils/multigrid/linearized_halo_ndim_geometry.hpp b/include/graphblas/utils/multigrid/linearized_halo_ndim_geometry.hpp
deleted file mode 100644
index 0e53dd671..000000000
--- a/include/graphblas/utils/multigrid/linearized_halo_ndim_geometry.hpp
+++ /dev/null
@@ -1,226 +0,0 @@
-
-#ifndef _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_GEOMETRY
-#define _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_GEOMETRY
-
-#include <cstddef>
-#include <vector>
-#include <array>
-#include <cassert>
-#include <stdexcept>
-#include <string>
-#include <cstddef>
-#include <algorithm>
-
-#include "array_vector_storage.hpp"
-#include "dynamic_vector_storage.hpp"
-#include "linearized_ndim_system.hpp"
-#include "ndim_vector.hpp"
-
-namespace grb {
-	namespace utils {
-		namespace multigrid {
-
-			template<
-				size_t DIMS,
-				typename CoordType
-			> void __compute_neighbors_range(
-				const ArrayVectorStorage< DIMS, CoordType > &_system_sizes,
-				const CoordType halo,
-				const ArrayVectorStorage< DIMS, CoordType > &system_coordinates,
-				ArrayVectorStorage< DIMS, CoordType > &neighbors_start,
-				ArrayVectorStorage< DIMS, CoordType > &neighbors_range ) {
-
-				for( CoordType i{0}; i < DIMS/* - 1*/; i++ ) {
-					const CoordType start{ system_coordinates[i] <= halo ? 0 : system_coordinates[i] - halo };
-					const CoordType end{ std::min( system_coordinates[i] + halo, _system_sizes[i] - 1 ) };
-					neighbors_start[i] = start;
-					neighbors_range[i] = end - start + 1;
-				}
-			}
-
-			template<
-				size_t DIMS,
-				typename CoordType
-			> size_t __neighbour_to_system_coords(
-				const std::array< CoordType, DIMS > &sizes,
-				size_t system_size,
-				const std::vector< NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > > >
-					&dimension_neighbors,
-				CoordType halo,
-				CoordType neighbor,
-				ArrayVectorStorage< DIMS, CoordType > &result
-			){
-				if( neighbor > system_size ) {
-					throw std::invalid_argument("neighbor number ( " + std::to_string(neighbor)
-						+ " ) >= system size ( " + std::to_string( system_size ) + " )");
-				}
-				ArrayVectorStorage< DIMS, CoordType > halo_coords( DIMS );
-#ifdef _DEBUG
-				size_t * const halo_coords_end{ halo_coords.data() + DIMS };
-#endif
-				std::fill_n( halo_coords.begin(), DIMS, 0 );
-
-				for( size_t _dim{DIMS}; _dim > 0; _dim--) {
-
-					const size_t dimension{_dim - 1};
-					const size_t dimension_size{ sizes[dimension] };
-					const NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > > & neighbors{ dimension_neighbors[dimension] };
-
-					CoordType * const halo_coords_begin{ halo_coords.data() + dimension };
-#ifdef _DEBUG
-					std::cout << "DIMENSION " << dimension << std::endl << "- setup - neighbour " << neighbor << std::endl;
-					std::cout << "\thalo : ";
-					print_sequence( halo_coords_begin, halo_coords_end ) << std::endl;
-#endif
-					size_t h{0};
-					size_t previous_neighs{ 0 };
-					*halo_coords_begin = h;
-					size_t halo_max_neighs{ neighbors.at( halo_coords_begin ) };
-					//std::cout << "\tinitial halo_max_neighs " << halo_max_neighs << std::endl;
-					while( h < halo && neighbor >= previous_neighs + halo_max_neighs ) {
-						h++;
-						*halo_coords_begin = h;
-						previous_neighs += halo_max_neighs;
-						halo_max_neighs = neighbors.at( halo_coords_begin );
-					}
-#ifdef _DEBUG
-					std::cout << "- initial halo - neighbour " << neighbor << std::endl;
-					std::cout << "\th " << h << std::endl;
-					std::cout << "\thalo : ";
-					print_sequence( halo_coords_begin, halo_coords_end ) << std::endl;
-					std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
-#endif
-					if ( h < halo ){
-						result[dimension] = h;
-						neighbor -= previous_neighs;
-#ifdef _DEBUG
-						std::cout << "end neighbour " << neighbor << std::endl;
-#endif
-						continue;
-					}
-					// saturation occurred
-					const size_t distance_from_halo{ ( neighbor - previous_neighs ) / halo_max_neighs };
-#ifdef _DEBUG
-					std::cout << "- before middle elements - neighbour " << neighbor << std::endl;
-					std::cout << "\tprevious_neighs " << previous_neighs << std::endl;
-					std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
-					std::cout << "\tdistance_from_halo " << distance_from_halo << std::endl;
-					std::cout << "\tdimension_size " << dimension_size << std::endl;
-#endif
-					if ( distance_from_halo < dimension_size - 2 * halo ) {
-						result[dimension] =  distance_from_halo + halo;
-						neighbor -= (previous_neighs + distance_from_halo * halo_max_neighs) ;
-#ifdef _DEBUG
-						std::cout << "end neighbour " << neighbor << std::endl;
-#endif
-						continue;
-					}
-					previous_neighs += ( dimension_size - 2 * halo ) * halo_max_neighs;
-#ifdef _DEBUG
-					std::cout << "- after middle elements -neighbour " << neighbor << std::endl;
-					std::cout << "\tprevious_neighs " << previous_neighs << std::endl;
-					std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
-#endif
-					h = halo - 1;
-					*halo_coords_begin = h;
-					halo_max_neighs = neighbors.at( halo_coords_begin );
-					while( h > 0 && neighbor >= previous_neighs + halo_max_neighs ) {
-						h--;
-						*halo_coords_begin = h;
-						previous_neighs += halo_max_neighs;
-						halo_max_neighs = neighbors.at( halo_coords_begin );
-					}
-					neighbor -= previous_neighs;
-#ifdef _DEBUG
-					std::cout << "- final halo - neighbour " << neighbor << std::endl;
-					std::cout << "\tadding h " << h << " previous_neighs " << previous_neighs << std::endl;
-#endif
-					// ( dimension_size - 1 ) because coordinates are 0-based and neighbor
-					// is "inside" range [ previous_neighs, previous_neighs + halo_max_neighs ]
-					result[dimension] = dimension_size - 1 - h;
-#ifdef _DEBUG
-					std::cout << "end neighbour " << neighbor << std::endl;
-#endif
-				}
-				return neighbor;
-			}
-
-
-			template< typename CoordType > size_t __accumulate_dimension_neighbours(
-				const NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > > &prev_neighs,
-				CoordType* coords_buffer,
-				size_t halo,
-				size_t local_size
-			) {
-				size_t neighs{0};
-				size_t h{0};
-				for( ; h < halo && local_size > 1; h++ ) {
-					*coords_buffer = h;
-
-					const size_t local_neighs{ prev_neighs.at( coords_buffer ) };
-					neighs += 2 * local_neighs; // the 2 sides
-					local_size -= 2;
-				}
-				*coords_buffer = h;
-				neighs += local_size * prev_neighs.at( coords_buffer ); // innermost elements
-				return neighs;
-			}
-
-			template< typename CoordType > void __populate_halo_neighbors( size_t halo,
-				NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > >& container ) {
-
-				using it_type = typename NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > >::DomainIterator;
-				it_type end{ container.domain_end() };
-				for( it_type it{ container.domain_begin() }; it != end; ++it ) {
-					size_t res{1};
-					for( size_t h: it->get_position() ) res *= (h + 1 + halo);
-					container.at( it->get_position() ) = res;
-				}
-			}
-
-			template<
-				typename CoordType,
-				size_t DIMS
-			> size_t __init_halo_search(
-				typename LinearizedNDimSystem< CoordType, ArrayVectorStorage< DIMS, CoordType > >::ConstVectorReference
-					sizes,
-				size_t halo,
-				std::vector< NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > > >& dimension_limits
-			) {
-				using nd_vec = NDimVector< CoordType, CoordType, DynamicVectorStorage< CoordType > >;
-				using nd_vec_iterator = typename nd_vec::DomainIterator;
-
-				std::vector<size_t> halo_sizes( DIMS, halo + 1);
-				dimension_limits.emplace_back(halo_sizes);
-				// initialize values
-				__populate_halo_neighbors< CoordType >( halo, dimension_limits[0] );
-				for( size_t i{1}; i < DIMS; i++ ) {
-					std::vector<size_t> halos( DIMS - i, halo + 1 );
-					dimension_limits.emplace_back(halos);
-				}
-
-				std::array< CoordType, DIMS > prev_coords_buffer; // store at most DIMS values
-				CoordType* const prev_coords{ prev_coords_buffer.data() };
-				CoordType* const second{ prev_coords + 1 }; // store previous coordinates from second position
-				for( size_t dimension{1}; dimension < DIMS; dimension++ ) {
-					const nd_vec& prev_neighs{dimension_limits[dimension - 1]};
-					nd_vec& current_neighs{dimension_limits[dimension]};
-
-					nd_vec_iterator end{ current_neighs.domain_end() };
-					for( nd_vec_iterator it{ current_neighs.domain_begin() }; it != end; ++it ) {
-						typename nd_vec::ConstDomainVectorReference current_halo_coords{ it->get_position() };
-
-						std::copy( it->get_position().cbegin(), it->get_position().cend(), second );
-						size_t local_size{ sizes[dimension - 1] };
-						const size_t neighs{ __accumulate_dimension_neighbours(prev_neighs, prev_coords, halo, local_size) };
-						current_neighs.at(current_halo_coords) = neighs;
-					}
-				}
-				return __accumulate_dimension_neighbours( dimension_limits[DIMS - 1], prev_coords, halo, sizes.back() );
-			}
-
-		} // namespace multigrid
-	} // namespace utils
-} // namespace grb
-
-#endif // _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_GEOMETRY
diff --git a/include/graphblas/utils/multigrid/linearized_halo_ndim_iterator.hpp b/include/graphblas/utils/multigrid/linearized_halo_ndim_iterator.hpp
index 871d62b7c..62e4dcd4a 100644
--- a/include/graphblas/utils/multigrid/linearized_halo_ndim_iterator.hpp
+++ b/include/graphblas/utils/multigrid/linearized_halo_ndim_iterator.hpp
@@ -15,16 +15,23 @@
  * limitations under the License.
  */
 
+/**
+ * @file linearized_halo_ndim_iterator.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Definition of LinearizedHaloNDimIterator.
+ */
+
 #ifndef _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_ITERATOR
 #define _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_ITERATOR
 
 #include <cstddef>
 #include <vector>
-#include <utility>
 #include <iterator>
 #include <limits>
 #include <cstddef>
 
+#include <graphblas/utils/iterators/utils.hpp>
+
 #include "linearized_ndim_system.hpp"
 #include "array_vector_storage.hpp"
 #include "linearized_ndim_iterator.hpp"
@@ -39,6 +46,60 @@ namespace grb {
 				typename SizeType
 			> class LinearizedHaloNDimSystem;
 
+			/**
+			 * Class to iterate over the \b neighbors of a system with halo: by advancing the iterator,
+			 * the user can traverse all neighbors of all elements one-by-one, in order, for example, to
+			 * emit all possible element-neighbor couples.
+			 *
+			 * Example: for a 2-dimensional 3 x 3 system with halo 1, with elements numbered as in
+			 *
+			 * 0 1 2
+			 * 3 4 5
+			 * 6 7 8
+			 *
+			 * the emitted couples <element-neighbor> are:
+			 *
+			 * 0-0, 0-1, 0-3, 0-4; 1-0, 1-1, 1-2, 1-3, 1-4, 1-5; 2-1, 2-2, 2-4, 2-5;
+			 * 3-0, 3-1, 3-3, 3-4; 4-0, 4-1, 4-2, 4-3, 4-4, 4-5, 4-6, 4-7, 4-8; and so on.
+			 *
+			 * It implements two interfaces for iteration. The first is a standard STL-like
+			 * interface meeting the random-access requirements, with operators \a ++, \a *, \a ->,
+			 * \a +=, \a -, \a ==; these facilities iterate over \b all neighbors of the underlying system,
+			 * automatically updating the corresponding element the neighbor is associated to.
+			 * The second interface is a custom (Java-like) one that allows to iterate separately over elements
+			 * and their neighbors: the user can query whether more elements exist, move to the next element,
+			 * iterate over the neighbors of the current element, query whether more neighbors exist for the
+			 * current element.
+			 *
+			 * The state of this structure essentially contains:
+			 *
+			 * 1. a const-pointer to a LinearizedHaloNDimSystem<DIMS,SizeType> object, storing the geometry
+			 * information of the N-dimensional system.
+			 * 2. the iterator to the current element (which in turn provides the element's vector
+			 *  and linear coordinates)
+			 * 3. the vector coordinate of the current neighbor
+			 * 4. the linear coordinate of the current neighbor
+			 * 5. information about the current element's neighbors space:
+			 *   1. the N-dimensional sub-space of neighbors w.r.t. the current element: this
+			 *    LinearizedNDimSystem object stores the sizes of the neighbors' sub-space
+			 *    centered around the current element (at most <em>2 * halo + 1</em> per dimension, if the current
+			 *    element is an inner one); hence, it computes coordinates and provides iterators that are
+			 *    \b relative to the current element
+			 *   2. vector coordinates of the first neighbor of the current element, in the main system
+			 *    (i.e. \b not relative); this allows computing any neighbor as the sum of this vector
+			 *    plus its relative coordinates in the neighbors' sub-space
+			 *   3. iterator to the current neighbor, built out of the relative sub-space, to actually iterate
+			 *    over the current element's neighbors
+			 *   4. iterator to the last neighbor of the current element, to stop the iteration over neighbors
+			 *    and advance to the next element.
+			 *
+			 * The above-mentioned methods to advance the iterator \c this (over neighbors or elements)
+			 * take care of updating these structures properly, keeping the state \b always coherent.
+			 *
+			 * @tparam DIMS number of dimensions of the system
+			 * @tparam SizeType type of coordinates and of sizes (must be large enough to describe the size
+			 * of the system along each direction)
+			 */
 			template<
 				size_t DIMS,
 				typename SizeType
@@ -52,6 +113,11 @@ namespace grb {
 				using ConstVectorReference = typename VectorIteratorType::ConstVectorReference;
 				using SelfType = LinearizedHaloNDimIterator< DIMS, SizeType >;
 
+				/**
+				 * Structure holding the information about a neighbor in a system: its linear
+				 * and vector coordinates and the element it is neighbor of (in the form of both
+				 * linear and vector coordinates).
+				 */
 				struct HaloNDimElement {
 				private:
 
@@ -84,22 +150,37 @@ namespace grb {
 
 					HaloNDimElement& operator=( const HaloNDimElement& ) = default;
 
+					/**
+					 * Get the element as vector coordinates.
+					 */
 					ConstVectorReference get_element() const {
 						return this->_element_iter->get_position();
 					}
 
+					/**
+					 * Get the element as linear coordinates.
+					 */
 					size_t get_element_linear() const {
 						return this->_system->ndim_to_linear( this->_element_iter->get_position() );
 					}
 
+					/**
+					 * Get the neighbor as vector coordinates.
+					 */
 					ConstVectorReference get_neighbor() const {
 						return this->_neighbor;
 					}
 
+					/**
+					 * Get the neighbor as linear coordinates.
+					 */
 					size_t get_neighbor_linear() const {
 						return this->_system->ndim_to_linear( this->_neighbor );
 					}
 
+					/**
+					 * Get the (unique) neighbor number in the system.
+					 */
 					SizeType get_position() const {
 						return this->_position;
 					}
@@ -112,54 +193,22 @@ namespace grb {
 				using reference = const HaloNDimElement&;
 				using difference_type = signed long;
 
-			private:
-				HaloNDimElement _point;
-				LinearizedNDimSystem< SizeType, VectorType > _neighbors_linearizer;
-				VectorIteratorType _neighbor_iter; // iterator in the sub-space of neighbors (0-based)
-				VectorType _neighbors_start;
-				VectorIteratorType _neighbor_end;
-
-				inline void __update_neighbor() {
-					for( size_t i{0}; i < DIMS; i++ ) {
-						this->_point._neighbor[i] = this->_neighbors_start[i] + this->_neighbor_iter->get_position()[i];
-					}
-				}
-
-				inline void on_neighbor_iter_update() {
-					this->__update_neighbor();
-				}
-
-				void on_element_update() {
-					// reset everything
-					VectorType neighbors_range( DIMS );
-					this->_point._system->compute_neighbors_range(
-						this->_point._element_iter->get_position(),
-						this->_neighbors_start,
-						neighbors_range
-					);
-					// re-target _neighbors_linearizer
-					this->_neighbors_linearizer.retarget( neighbors_range );
-				}
-
-				void on_element_advance() {
-					this->on_element_update();
-
-					this->_neighbor_iter = VectorIteratorType( this->_neighbors_linearizer );
-					this->_neighbor_end = VectorIteratorType::make_system_end_iterator( this->_neighbors_linearizer );
-
-					this->on_neighbor_iter_update();
-				}
-
-			public:
-
 				LinearizedHaloNDimIterator() = delete;
 
+				/**
+				 * Construct a new LinearizedHaloNDimIterator object from the underlying system
+				 * \p system (whose geometry information is used to iterate). The constructed object
+				 * points to the first neighbor of the first element, i.e. the one with vector coordinates
+				 * \a [0,0,...,0].
+				 *
+				 * If \p system is not valid anymore, then \c this is not valid either.
+				 */
 				LinearizedHaloNDimIterator( const SystemType& system ) noexcept :
 					_point( system ),
-					_neighbors_linearizer( DIMS, system.halo() + 1 ),
-					_neighbor_iter( this->_neighbors_linearizer ),
+					_neighbors_subspace( DIMS, system.halo() + 1 ),
 					_neighbors_start( DIMS ),
-					_neighbor_end( VectorIteratorType::make_system_end_iterator( this->_neighbors_linearizer ) )
+					_neighbor_iter( this->_neighbors_subspace ),
+					_neighbor_end( VectorIteratorType::make_system_end_iterator( this->_neighbors_subspace ) )
 				{
 					std::fill_n( this->_neighbors_start.begin(), DIMS, 0 );
 				}
@@ -180,30 +229,54 @@ namespace grb {
 					return &(this->_point);
 				}
 
+				/**
+				 * Tells whether the current element has more neighbors available (on which the user
+				 * has not iterated yet).
+				 */
 				bool has_more_neighbours() const {
 					return this->_neighbor_iter != this->_neighbor_end;
 				}
 
+				/**
+				 * Moves \c this to point to the next neighbor (if any, exception otherwise).
+				 *
+				 * Does \b not advance the element, which should be done manually via #next_element().
+				 */
 				void next_neighbour() {
+					if( !has_more_neighbours() ) {
+						throw std::out_of_range("the current element has no more neighbors");
+					}
 					++(this->_neighbor_iter);
 					this->on_neighbor_iter_update();
 					this->_point._position++;
 				}
 
+				/**
+				 * Tells whether the system has more elements.
+				 */
 				bool has_more_elements() const {
 					return this->_point.get_element_linear() != (this->_point._system)->base_system_size();
 				}
 
+				/**
+				 * Moves \c this to point to the next element, setting the neighbor as the first one.
+				 */
 				void next_element() {
-					size_t num_neighbours = this->_neighbors_linearizer.system_size();
+					if( !has_more_elements() ) {
+						throw std::out_of_range("the system has no more elements");
+					}
+					size_t num_neighbours = this->_neighbors_subspace.system_size();
 					size_t neighbour_position_offset =
-						this->_neighbors_linearizer.ndim_to_linear( this->_neighbor_iter->get_position() );
+						this->_neighbors_subspace.ndim_to_linear( this->_neighbor_iter->get_position() );
 					++(this->_point._element_iter);
 					this->on_element_advance();
 					this->_point._position -= neighbour_position_offset;
 					this->_point._position += num_neighbours;
 				}
 
+				/**
+				 * Moves \c this to point to the next neighbor, also advancing the element if needed.
+				 */
 				SelfType & operator++() noexcept {
 					++(this->_neighbor_iter);
 					if( !has_more_neighbours() ) {
@@ -217,60 +290,104 @@ namespace grb {
 					return *this;
 				}
 
+				/**
+				 * Moves \c this ahead by \p offset neighbors, also advancing the element if necessary.
+				 */
 				SelfType & operator+=( size_t offset ) {
 					if( offset == 1UL ) {
 						return this->operator++();
 					}
-					const size_t final_position { this->_point._position + offset };
+					const size_t final_position = this->_point._position + offset;
 					if( final_position > this->_point._system->halo_system_size() ) {
 						throw std::range_error( "neighbor linear value beyond system" );
 					}
 					VectorType final_element( DIMS );
-					size_t neighbor_index{ (this->_point._system->neighbour_linear_to_element( final_position, final_element )) };
+					size_t neighbor_index = (this->_point._system->neighbour_linear_to_element( final_position, final_element ));
 
 					this->_point._element_iter = VectorIteratorType( *this->_point._system, final_element.cbegin() );
 					this->_point._position = final_position;
 
 					this->on_element_update();
-					this->_neighbors_linearizer.linear_to_ndim( neighbor_index, final_element );
+					this->_neighbors_subspace.linear_to_ndim( neighbor_index, final_element );
 
-					this->_neighbor_iter = VectorIteratorType( this->_neighbors_linearizer, final_element.cbegin() );
-					this->_neighbor_end = VectorIteratorType::make_system_end_iterator( this->_neighbors_linearizer );
+					this->_neighbor_iter = VectorIteratorType( this->_neighbors_subspace, final_element.cbegin() );
+					this->_neighbor_end = VectorIteratorType::make_system_end_iterator( this->_neighbors_subspace );
 					this->on_neighbor_iter_update();
 
 					return *this;
 				}
 
+				/**
+				 * Returns the difference between \c this and \p other in the linear space of neighbors,
+				 * i.e. how many times \p other must be advanced in order to point to the same neighbor as \c this.
+				 *
+				 * It throws if the result cannot be stored as a difference_type variable.
+				 */
 				difference_type operator-( const SelfType &other ) const {
-					/*
-					if( _point.get_position() < a_point.get_position() ) {
-						throw std::invalid_argument( "first iterator is in a lower position than second" );
-					}
-					*/
-					size_t a_pos{ _point.get_position() }, b_pos{ other._point.get_position() };
-					// std::cout << "diff " << a_pos << " - " << b_pos << std::endl;
-					size_t lowest{ std::min( a_pos, b_pos ) }, highest{ std::max( a_pos, b_pos )};
-					using diff_t = typename LinearizedHaloNDimIterator< DIMS, SizeType >::difference_type;
-
-					if( highest - lowest > static_cast< size_t >(
-						std::numeric_limits< diff_t >::max() ) ) {
-						throw std::invalid_argument( "iterators are too distant" );
-					}
-
-					return ( static_cast< diff_t >( a_pos - b_pos ) );
+					return grb::utils::compute_signed_distance< difference_type, SizeType >(
+						_point.get_position(), other._point.get_position() );
 				}
 
-				// implementation depending on logic in operator++
+				/**
+				 * Utility to build an iterator to the end of the system \p system.
+				 *
+				 * The implementation depends on the logic of operator++.
+				 */
 				static SelfType make_system_end_iterator( const SystemType& system ) {
 					SelfType result( system );
-
 					// go to the very first point outside of space
 					result._point._element_iter = VectorIteratorType::make_system_end_iterator( system );
 					result.on_element_advance();
 					result._point._position = system.halo_system_size();
-
 					return result;
 				}
+
+			private:
+				HaloNDimElement _point;
+				LinearizedNDimSystem< SizeType, VectorType > _neighbors_subspace;
+				VectorType _neighbors_start;
+				VectorIteratorType _neighbor_iter; // iterator in the sub-space of neighbors (0-based)
+				VectorIteratorType _neighbor_end;
+
+				/**
+				 * To be called when the iterator pointing to the neighbor is updated in order to update
+				 * the actual neighbor's coordinates.
+				 */
+				inline void on_neighbor_iter_update() {
+					for( size_t i = 0; i < DIMS; i++ ) {
+						this->_point._neighbor[i] = this->_neighbors_start[i]
+							+ this->_neighbor_iter->get_position()[i];
+					}
+				}
+
+				/**
+				 * To be called after the iterator pointing to the element is updated in order to
+				 * reset the information about the neighbor.
+				 */
+				void on_element_update() {
+					// reset everything
+					VectorType neighbors_range( DIMS );
+					this->_point._system->compute_neighbors_range(
+						this->_point._element_iter->get_position(),
+						this->_neighbors_start,
+						neighbors_range
+					);
+					// re-target _neighbors_subspace
+					this->_neighbors_subspace.retarget( neighbors_range );
+				}
+
+				/**
+				 * To be called after the iterator pointing to the element is updated in order to update
+				 * all information about the neighbor, like iterator, surrounding halo and coordinates.
+				 */
+				void on_element_advance() {
+					this->on_element_update();
+
+					this->_neighbor_iter = VectorIteratorType( this->_neighbors_subspace );
+					this->_neighbor_end = VectorIteratorType::make_system_end_iterator( this->_neighbors_subspace );
+
+					this->on_neighbor_iter_update();
+				}
 			};
 
 		} // namespace multigrid
diff --git a/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp b/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp
index cc84de621..d448fd426 100644
--- a/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp
+++ b/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp
@@ -1,4 +1,26 @@
 
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file linearized_halo_ndim_system.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Definition of LinearizedHaloNDimSystem.
+ */
+
 #ifndef _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_SYSTEM
 #define _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_SYSTEM
 
@@ -7,17 +29,58 @@
 #include <array>
 #include <cassert>
 #include <cstddef>
+#ifdef _DEBUG
+#include <iostream>
+#endif
 
 #include "array_vector_storage.hpp"
+#include "dynamic_vector_storage.hpp"
+#include "ndim_vector.hpp"
 #include "linearized_ndim_system.hpp"
-#include "linearized_halo_ndim_geometry.hpp"
 #include "linearized_halo_ndim_iterator.hpp"
 
 namespace grb {
 	namespace utils {
 		namespace multigrid {
 
-			// only with ArrayVectorStorage
+			/**
+			 * Structure to represent an N-dimensional space (or \a system) of given sizes and to
+			 * iterate on both the \a elements of the N-dimensional system and the N-dimensional
+			 * \a neighbors of each element within a given \p halo. This facility takes into account
+			 * the various cases where the element is at the corner, edge or face of the N-dimensional
+			 * system, to which different neighbors correspond. Both elements and their neighbors are
+			 * vectors in the N-dimensional system and as such described via both N-dimensional coordinates
+			 * and a linear coordinate.
+			 *
+			 * This structure returns the number of elements of the underlying N-dimensional system
+			 * (the \a base system) via #base_system_size() and the total sum of neighbors of all
+			 * system elements via #halo_system_size().
+			 *
+			 * The peculiar feature of this structure is the method #neighbour_linear_to_element(), to translate
+			 * a neighbor index (i.e. a value from \a 0 to #halo_system_size(), uniquely identifying an element
+			 * as neighbor of an element) to the N-dimensional coordinates of the corresponding elements in a time
+			 * that is constant with respect to the input value (it depends on \p DIMS and the halo size).
+			 * This facility allows the iterators of a LinearizedHaloNDimSystem to be random-access: when advancing
+			 * an iterator by an \a offset via the \a += method, the logic:
+			 *
+			 * - increments the index of the current neighbor (stored inside the iterator) by \a offset, thus
+			 *  computing the index of the destination neighbor (constant time)
+			 * - translates the index of the destination neighbor to its base element's coordinates via
+			 *  #neighbour_linear_to_element() (constant time)
+			 *
+			 * The same method also returns the index of the destination neighbor within the sub-space of the base
+			 * element's neighbors: hence, the logic can compute in constant time the destination base element
+			 * and its destination neighbor. The constant time of this translation is achieved by pre-computing
+			 * the number of neighbors for each element along each dimension: for example, inner elements in
+			 * a 3D mesh with halo 1 have 27 neighbors. Thus, it suffices in principle to divide the neighbor
+			 * index by 27 to compute the base element of a neighbor. Care must be taken for elements at the
+			 * sides of each dimension: for example, a corner element on a face has 8 neighbors, while a corner
+			 * element in an internal slab (a 2D "plane" in a 3D mesh) has 12 neighbors. The pre-computed
+			 * information and the logic also account for this.
+			 *
+			 * @tparam DIMS number of dimensions of the system
+			 * @tparam SizeType type storing the system sizes and offsets
+			 */
 			template<
 				size_t DIMS,
 				typename SizeType
@@ -30,10 +93,20 @@ namespace grb {
 				using BaseType = LinearizedNDimSystem< SizeType, VectorType >;
 				using Iterator = LinearizedHaloNDimIterator< DIMS, SizeType >;
 
-				LinearizedHaloNDimSystem( ConstVectorStorageType sizes, SizeType halo ):
+				/**
+				 * Construct a new LinearizedHaloNDimSystem object with given sizes and halo.
+				 *
+				 * The size of \p sizes must be exactly \p DIMS. Each size must be such that there is at least
+				 * an element in the system with full halo neighbors, i.e. for each size \a s
+				 * <em>s >= 2 * halo + 1</em> (otherwise an exception is thrown).
+				 */
+				LinearizedHaloNDimSystem(
+					ConstVectorStorageType sizes,
+					SizeType halo
+				) :
 					BaseType( sizes.cbegin(), sizes.cend() ),
-					_halo( halo ) {
-
+					_halo( halo )
+				{
 					for( SizeType __size : sizes ) {
 						if ( __size < 2 * halo + 1 ) {
 							throw std::invalid_argument(
@@ -43,9 +116,8 @@ namespace grb {
 						}
 					}
 
-					this->_system_size = __init_halo_search< SizeType, DIMS >(
-							this->get_sizes(),
-							_halo, this->_dimension_limits );
+					this->_system_size = init_neigh_to_base_search( this->get_sizes(),
+						_halo, this->_dimension_limits );
 					assert( this->_dimension_limits.size() == DIMS );
 				}
 
@@ -61,49 +133,427 @@ namespace grb {
 
 				SelfType & operator=( SelfType && ) = delete;
 
+				/**
+				 * Builds an iterator from the beginning of the system, i.e. from vector \a [0,0,...,0].
+				 * The iterator iterates on each neighbor and allows iterating on each element and on
+				 * its neighbors.
+				 */
 				Iterator begin() const {
 					return Iterator( *this );
 				}
 
+				/**
+				 * Build an iterator marking the end of the system; it should not be accessed.
+				 */
 				Iterator end() const {
 					return Iterator::make_system_end_iterator( *this );
 				}
 
+				/**
+				 * Returns the size of the entire system, i.e. the number of neighbors of all elements.
+				 */
 				size_t halo_system_size() const {
 					return this->_system_size;
 				}
 
+				/**
+				 * Returns the size of the base system, i.e. number of elements (not considering neighbors).
+				 */
 				size_t base_system_size() const {
 					return this->BaseType::system_size();
 				}
 
+				/**
+				 * Returns the halo size.
+				 */
 				size_t halo() const {
 					return this->_halo;
 				}
 
+				/**
+				 * Computes the first neighbor and the size of the N-dimensional range of neighbors
+				 * around the given element's coordinates for the system \c this.
+				 *
+				 * @param[in] element_coordinates coordinates of the element to iterate around
+				 * @param[out] neighbors_start first neighbor around \p element_coordinates to iterate from
+				 * @param[out] neighbors_range size of the neighbors' range along each dimension;
+				 * if \p element_coordinates is an inner point, all values equal 2 * #halo() + 1, they are
+				 * smaller otherwise (on corner, edge, or face).
+				 */
 				void compute_neighbors_range(
-					const VectorType &system_coordinates,
+					const VectorType &element_coordinates,
 					VectorType &neighbors_start,
-					VectorType &neighbors_range) const noexcept {
-					__compute_neighbors_range( this->get_sizes(),
+					VectorType &neighbors_range
+				) const noexcept {
+					compute_first_neigh_and_range( this->get_sizes(),
 						this->_halo,
-						system_coordinates,
+						element_coordinates,
 						neighbors_start,
 						neighbors_range
 					);
 				}
 
+				/**
+				 * Maps the linear index \p neighbor_linear of a neighbor to the vector \p base_element_vector
+				 * of the corresponding element \p neighbor_linear is neighbor of, and returns the neighbor's
+				 * number within the sub-space of \p base_element_vector 's neighbors.
+				 *
+				 * @param[in] neighbor_linear linear coordinate of input neighbor
+				 * @param[out] base_element_vector vector of coordinates that identify which element
+				 *  \p neighbor_linear is neighbor of
+				 * @return size_t the neighbor number w.r.t. to the corresponding element: if \a e is the system
+				 * element \p neighbor_linear is neighbor of and \a e has \a n neighbors, then the return value
+				 * \a 0<=i<n is the index of \p neighbor_linear among \a e's neighbors, computed w.r.t. the
+				 * iteration order.
+				 */
 				size_t neighbour_linear_to_element (
-					SizeType neighbor,
-					VectorType &result) const noexcept {
-					return __neighbour_to_system_coords( this->get_sizes(),
-					this->_system_size, this->_dimension_limits, this->_halo, neighbor, result );
+					SizeType neighbor_linear,
+					VectorType &base_element_vector
+				) const noexcept {
+					return map_neigh_to_base_and_index( this->get_sizes(), this->_system_size,
+						this->_dimension_limits, this->_halo, neighbor_linear, base_element_vector );
 				}
 
 			private:
 				const SizeType _halo;
 				std::vector< NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > > > _dimension_limits;
 				size_t _system_size;
+
+				/**
+				 * Computes the total number of neighbors along a certain dimension and configuration by accumulating
+				 * the neighbors along the smaller dimensions.
+				 *
+				 * The logic uses this buffer to iterate over the configurations of
+				 * the previous dimension. Example: to compute in 3D the neighbors of an inner row of a face
+				 * (configuration <em>[0,1,0]</em>, dimension 1 - y), the logic needs the neighbors of
+				 * an edge element and of an element internal to a face of the mesh, corresponding to
+				 * the configurations <em>[0,1,0]</em> and <em>[1,1,0]</em>, respectively. Hence, the caller
+				 * must initialize a buffer with the values <em>[X,1,0]</em> (\a X meaning don't care) and pass
+				 * as \p coords_buffer the pointer to the first position (the \a X ), where this function
+				 * will write all possible values <em>[0, \p halo ]</em> to access the number of neighbors
+				 * of the configurations of the previous dimension via \p prev_neighs and accumulate them.
+				 *
+				 * @param[in] prev_neighs neighbors in the configurations of the previous dimension
+				 * @param[in,out] coords_buffer pointer to the first position of the configuration buffer
+				 *  for this dimension
+				 * @param[in] halo halo size
+				 * @param[in] local_size size (i.e., number of elements) along the current dimension,
+				 *  including the edges
+				 * @return size_t the total number of neighbors for this configuration and this dimension
+				 */
+				static size_t accumulate_dimension_neighbours(
+					const NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > > &prev_neighs,
+					SizeType* coords_buffer,
+					size_t halo,
+					size_t local_size
+				) {
+					size_t neighs =0; // running total for this configuration
+					size_t h = 0; // distance from the side along this dimension
+					for( ; h < halo && local_size > 1; h++ ) { // stops early if fewer than 2 elements are left
+						*coords_buffer = h; // select the previous-dimension configuration at distance h
+
+						const size_t local_neighs = prev_neighs.at( coords_buffer );
+						neighs += 2 * local_neighs; // the 2 sides
+						local_size -= 2; // the two elements at distance h (one per side) are now counted
+					}
+					*coords_buffer = h; // h == halo here if the loop above was not cut short
+					neighs += local_size * prev_neighs.at( coords_buffer ); // innermost elements
+					return neighs;
+				}
+
+				/**
+				 * Computes the number of neighbors for each configuration along dimension 0:
+				 * corner, edge, face, inner element.
+				 *
+				 * Example: in a 3D system with <em>\p halo = 1</em>, the configurations along dimension 0 are 8:
+				 * 1. z axis - face:
+				 *   1. y axis - top row: corner element (8 neighbors), edge element (12 neighbors)
+				 *   2  y axis - inner row: edge element (12 neighbors), face inner element (18 neighbors)
+				 * 2. z axis - inner slab:
+				 *   1. y axis - top row: edge element (12 neighbors), face inner element (18 neighbors)
+				 *   2  y axis - inner row: face inner element (18 neighbors), inner element (27 neighbors)
+				 *
+				 * @param[in] halo halo size
+				 * @param[out] config_neighbors the storage object for each configuration
+				 */
+				static void compute_dim0_neighbors(
+					size_t halo,
+					NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > >& config_neighbors
+				) {
+					using it_type = typename NDimVector< SizeType, SizeType,
+						DynamicVectorStorage< SizeType > >::DomainIterator;
+					it_type end = config_neighbors.domain_end();
+					for( it_type it = config_neighbors.domain_begin(); it != end; ++it ) { // visit every configuration
+						size_t res = 1; // product of the neighbor extents along each coordinate
+						for( size_t h: it->get_position() ) res *= (h + 1 + halo); // h towards the side, the element, halo inwards
+						config_neighbors.at( it->get_position() ) = res; // store the count for this configuration
+					}
+				}
+
+				/**
+				 * Initializes the search space of neighbors for the <neighbor linear> -> <base vector> translation.
+				 *
+				 * This function populates an std::vector<> with the number of neighbors for each dimension
+				 * and each configuration (corner, edge, face, inner).
+				 * Along each dimension \a d, it stores an \a n -dimensional vector
+				 * NDimVector<SizeType,SizeType,DynamicVectorStorage< SizeType>> (<em>n = 2 ^ d</em>) with all
+				 * possible numbers of neighbors along that dimension, depending on the position of the element
+				 * (corner, edge, face, inner volume); for example, for 3 dimensions:
+				 *  - dimension 2 (z axis) moves along "slabs" of a 3D systems, where the total number of neighbors
+				 *   depends on whether the slab is a face of the mesh or an internal slab (2 possible configurations:
+				 *   face slabs or inner slabs)
+				 *  - dimension 1 (y axis) moves along "rows" within each slab, whose total number of neighbors
+				 *	  depends on whether the row is at the extreme sides (top or bottom of the face) or inside;
+				 *   in turn, each type of slab has different geometry (face slabs comprise mesh corners, edges and faces,
+				 *   while inner slabs comprise edges, faces and inner elements), thus resulting in 2*2 different
+				 *   configurations of dimension-1 total neighbors
+				 *  - dimension 0 (x axis) moves along "column" elements within each row, where the first (or last)
+				 *   column has a different number of neighbors than the inner ones; here again are two configuration
+				 *   for each dimension-1 configuration, leading to a total of 8 dimension-0 configurations
+				 * Within each dimension \a d, each configuration (as per the above explanation) can be identified
+				 * via a vector of <em>N - d</em> coordinates; to limit the data storage, every dimension stores the
+				 * total number of neighbors only at the first side and inside, since the second side  is identical
+				 * to the first one: for example, along the z axis the first and last slab (those on the two extremes)
+				 * have the same size, and one only is stored. Therefore, with <em>halo = 1</em> a vector identifying
+				 * a configuration is composed only of 0s and 1s. For example, the vector <em>[0,1,0]</em> identifies:
+				 * - rightmost 0 (z axis): first (or last) slab, i.e. face slab
+				 * - (middle) 1 (y axis): inner row
+				 * - leftmost 0 (x axis): first (or last) element, i.e. on the edge of the mesh
+				 * In a 3D space with <em>halo = 1</em>, this element has 12 neighbors (it is on the edge of a face).
+				 *
+				 * @param[in] sizes sizes of the N-dimensional system
+				 * @param[in] halo halo size
+				 * @param[out] dimension_limits the std::vector<> with the neighbors information for each dimension
+				 *  and each configuration
+				 * @return size_t the number of neighbors of the entire system
+				 */
+				static size_t init_neigh_to_base_search(
+					typename LinearizedNDimSystem< SizeType,
+						ArrayVectorStorage< DIMS, SizeType > >::ConstVectorReference
+						sizes,
+					size_t halo,
+					std::vector< NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > > >& dimension_limits
+				) {
+					using nd_vec = NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > >;
+					using nd_vec_iterator = typename nd_vec::DomainIterator;
+
+					std::vector<size_t> halo_sizes( DIMS, halo + 1); // dimension 0: DIMS coordinates, each in [0, halo]
+					dimension_limits.emplace_back(halo_sizes);
+					// initialize the dimension-0 values for every configuration
+					compute_dim0_neighbors( halo, dimension_limits[0] );
+					for( size_t i = 1; i < DIMS; i++ ) {
+						std::vector<size_t> halos( DIMS - i, halo + 1 ); // dimension i: DIMS - i coordinates, each in [0, halo]
+						dimension_limits.emplace_back(halos);
+					}
+
+					std::array< SizeType, DIMS > prev_coords_buffer; // store at most DIMS values
+					SizeType* const prev_coords = prev_coords_buffer.data();
+					SizeType* const second = prev_coords + 1; // store previous coordinates from second position
+					for( size_t dimension = 1; dimension < DIMS; dimension++ ) {
+						const nd_vec& prev_neighs{dimension_limits[dimension - 1]};
+						nd_vec& current_neighs{dimension_limits[dimension]};
+
+						nd_vec_iterator end = current_neighs.domain_end();
+						for( nd_vec_iterator it = current_neighs.domain_begin(); it != end; ++it ) {
+							typename nd_vec::ConstDomainVectorReference current_halo_coords = it->get_position();
+
+							std::copy( it->get_position().cbegin(), it->get_position().cend(), second ); // first slot is left for the callee to fill
+							size_t local_size = sizes[dimension - 1]; // number of elements along the dimension being accumulated
+							const size_t neighs = accumulate_dimension_neighbours(prev_neighs, prev_coords, halo, local_size);
+							current_neighs.at(current_halo_coords) = neighs;
+						}
+					}
+					return accumulate_dimension_neighbours( dimension_limits[DIMS - 1], prev_coords, halo, sizes.back() ); // accumulate along the last dimension
+				}
+
+				/**
+				 * For the given system (with sizes \p _system_sizes), the given halo size \p halo,
+				 * the given element's coordinates \p element_coordinates, computes the coordinates
+				 * of the first neighbor of \p element_coordinates into \p neighbors_start (within the main system)
+				 * and the range of neighbors of \p element_coordinates, i.e. the sub-space of neighbors of
+				 * \p element_coordinates; hence, \p neighbors_range stores at most <em>2 *</em> \p halo
+				 * <em> + 1</em> per coordinate.
+				 *
+				 * @param[in] _system_sizes sizes of the N-dimensional system
+				 * @param[in] halo halo size
+				 * @param[in] element_coordinates coordinates of the considered element
+				 * @param[out] neighbors_start stores the (absolute) coordinates of the first neighbor
+				 *  of \p element_coordinates
+				 * @param[out] neighbors_range stores the range of neighbors around \p element_coordinates
+				 */
+				static void compute_first_neigh_and_range(
+					const ArrayVectorStorage< DIMS, SizeType > &_system_sizes,
+					const SizeType halo,
+					const ArrayVectorStorage< DIMS, SizeType > &element_coordinates,
+					ArrayVectorStorage< DIMS, SizeType > &neighbors_start,
+					ArrayVectorStorage< DIMS, SizeType > &neighbors_range
+				) {
+					for( SizeType i = 0; i < DIMS; i++ ) { // every dimension is processed independently
+						const SizeType start = element_coordinates[i] <= halo ? 0 : element_coordinates[i] - halo; // clamp at the lower side (no subtraction below 0)
+						const SizeType end = std::min( element_coordinates[i] + halo, _system_sizes[i] - 1 ); // clamp at the last valid coordinate
+						neighbors_start[i] = start;
+						neighbors_range[i] = end - start + 1; // size of the inclusive range [start, end]
+					}
+				}
+
+#ifdef _DEBUG
+				template< typename IterType > static std::ostream & print_sequence( IterType begin, IterType end ) {
+					for( ; begin != end; ++begin ) { // debug helper: prints each value of [begin, end) to std::cout
+						std::cout << *begin << ' ';
+					}
+					return std::cout; // returned so that the caller can chain further output
+				}
+#endif
+
+				/**
+				 * Maps a neighbor's linear coordinate \p neighbor_linear to the element \p element_vector it is
+				 * neighbor of and also returns the neighbor index of \p neighbor_linear within the sub-space
+				 * of \p element_vector's neighbors.
+				 *
+				 * @param[in] sizes main system sizes along all dimensions
+				 * @param[in] system_size total size of the neighbors system, i.e. the total number of neighbors
+				 * @param[in] neighbors_per_dimension along each dimension \a d, it stores an \a n -dimensional vector
+				 *  NDimVector<SizeType,SizeType,DynamicVectorStorage< SizeType>> (<em>n = 2 ^ d</em>) with all
+				 *  possible numbers of neighbors along that dimension, depending on the position of the element
+				 *  (corner, edge, face, inner volume)
+				 * @param[in] halo halo size
+				 * @param[in] neighbor_linear linear coordinate of the neighbor
+				 * @param[out] element_vector coordinates vector representing the element \p neighbor_linear is
+				 *  neighbor of
+				 * @return size_t the index of the neighbor within the element's neighbors
+				 */
+				static size_t map_neigh_to_base_and_index(
+					const std::array< SizeType, DIMS > &sizes,
+					size_t system_size,
+					const std::vector< NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > > >
+						&neighbors_per_dimension,
+					SizeType halo,
+					SizeType neighbor_linear,
+					ArrayVectorStorage< DIMS, SizeType > &element_vector
+				){
+					if( neighbor_linear >= system_size ) { // valid linear indices are [0, system_size)
+						throw std::invalid_argument( "neighbor number ( " + std::to_string( neighbor_linear )
+							+ " ) >= system size ( " + std::to_string( system_size ) + " )");
+					}
+					ArrayVectorStorage< DIMS, SizeType > configuration( DIMS );
+#ifdef _DEBUG
+					SizeType * const halo_coords_end = configuration.data() + DIMS;
+#endif
+					std::fill_n( configuration.begin(), DIMS, 0 );
+
+					for( size_t _dim = DIMS; _dim > 0; _dim--) {
+
+						// each iteration looks for the base element along a dimension via the number of neighbors
+						// each element has: once previous_neighs reaches neighbor_linear, the corresponding
+						// base element is found; if the control reaches the end, this means it must explore
+						// the following dimension to find the base element: this is why dimensions are explored
+						// starting from the highest, because moving along a higher dimension means "skipping"
+						// more neighbors; then the search "zooms in" to a smaller dimension to find the base element
+
+						// start from highest dimension
+						const size_t dimension = _dim - 1;
+						// how many elements along this dimension
+						const size_t dimension_size = sizes[dimension];
+						// configurations of neighbors along this dimension
+						// (e.g., corner, edge; or edge, inner element)
+						const NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > >
+							& neighbors = neighbors_per_dimension[dimension];
+
+						// coordinate to modify to identify each configuration
+						SizeType * const halo_coords_begin = configuration.data() + dimension;
+#ifdef _DEBUG
+						std::cout << "DIMENSION " << dimension << std::endl
+							<< "- setup - neighbour " << neighbor_linear << std::endl
+							<< "\thalo : ";
+						print_sequence( halo_coords_begin, halo_coords_end ) << std::endl;
+#endif
+						size_t h = 0; // configuration type along this dimension
+						size_t previous_neighs = 0;
+						*halo_coords_begin = h;
+						// account for neighbors in the first elements along the dimension, within halo distance:
+						// these elements have a number of neighbors that depends on the distance h
+						// and on the configuration
+						size_t halo_max_neighs = neighbors.at( halo_coords_begin );
+						while( h < halo && neighbor_linear >= previous_neighs + halo_max_neighs ) {
+							h++;
+							*halo_coords_begin = h;
+							previous_neighs += halo_max_neighs;
+							halo_max_neighs = neighbors.at( halo_coords_begin );
+						}
+#ifdef _DEBUG
+						std::cout << "- initial halo - neighbour " << neighbor_linear << std::endl
+							<< "\th " << h << std::endl
+							<< "\thalo : ";
+						print_sequence( halo_coords_begin, halo_coords_end ) << std::endl;
+						std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
+#endif
+						if ( h < halo ){
+							// we have already counted enough neighbors: neighbor_linear is thus a neighbor
+							// of one of the first (< halo) elements along this dimension: go to next dimension
+							element_vector[dimension] = h;
+							neighbor_linear -= previous_neighs;
+#ifdef _DEBUG
+							std::cout << "end neighbour " << neighbor_linear << std::endl;
+#endif
+							continue;
+						}
+						// saturation occurred: the base element is beyond the halo: go on with the search
+
+						// inner elements have the same number of neighbors halo_max_neighs: compute
+						// the base element via division
+						const size_t distance_from_halo = ( neighbor_linear - previous_neighs ) / halo_max_neighs;
+#ifdef _DEBUG
+						std::cout << "- before middle elements - neighbour " << neighbor_linear << std::endl
+							<< "\tprevious_neighs " << previous_neighs << std::endl
+							<< "\thalo_max_neighs " << halo_max_neighs << std::endl
+							<< "\tdistance_from_halo " << distance_from_halo << std::endl
+							<< "\tdimension_size " << dimension_size << std::endl;
+#endif
+						if ( distance_from_halo < dimension_size - 2 * halo ) {
+							// the base element is one of the internal elements along this dimension:
+							// hence return its distance from the halo + the halo itself (= distance from
+							// beginning of the space)
+							element_vector[dimension] = distance_from_halo + halo;
+							neighbor_linear -= (previous_neighs + distance_from_halo * halo_max_neighs) ;
+#ifdef _DEBUG
+							std::cout << "end neighbour " << neighbor_linear << std::endl;
+#endif
+							continue;
+						}
+						// base element is even beyond inner elements, it might be among the elements at the end,
+						// which also have different numbers of neighbors (specular to initial elements)
+						previous_neighs += ( dimension_size - 2 * halo ) * halo_max_neighs;
+#ifdef _DEBUG
+						std::cout << "- after middle elements -neighbour " << neighbor_linear << std::endl;
+						std::cout << "\tprevious_neighs " << previous_neighs << std::endl;
+						std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
+#endif
+						// look for the base element at the end of the dimension: specular search to beginning,
+						// just with h decreasing
+						h = halo - 1;
+						*halo_coords_begin = h;
+						halo_max_neighs = neighbors.at( halo_coords_begin );
+						while( h > 0 && neighbor_linear >= previous_neighs + halo_max_neighs ) {
+							h--;
+							*halo_coords_begin = h;
+							previous_neighs += halo_max_neighs;
+							halo_max_neighs = neighbors.at( halo_coords_begin );
+						}
+						neighbor_linear -= previous_neighs;
+#ifdef _DEBUG
+						std::cout << "- final halo - neighbour " << neighbor_linear << std::endl;
+						std::cout << "\tadding h " << h << " previous_neighs " << previous_neighs << std::endl;
+#endif
+						// ( dimension_size - 1 ) because coordinates are 0-based and neighbor
+						// is "inside" range [ previous_neighs, previous_neighs + halo_max_neighs ]
+						element_vector[dimension] = dimension_size - 1 - h;
+#ifdef _DEBUG
+						std::cout << "end neighbour " << neighbor_linear << std::endl;
+#endif
+					}
+					return neighbor_linear;
+				}
+
 			};
 
 		} // namespace multigrid
diff --git a/include/graphblas/utils/multigrid/linearized_ndim_iterator.hpp b/include/graphblas/utils/multigrid/linearized_ndim_iterator.hpp
index f65ec8831..199d08926 100644
--- a/include/graphblas/utils/multigrid/linearized_ndim_iterator.hpp
+++ b/include/graphblas/utils/multigrid/linearized_ndim_iterator.hpp
@@ -15,6 +15,12 @@
  * limitations under the License.
  */
 
+/**
+ * @file linearized_ndim_iterator.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Definition of LinearizedNDimIterator.
+ */
+
 #ifndef _H_GRB_ALGORITHMS_MULTIGRID_NDIM_ITERATOR
 #define _H_GRB_ALGORITHMS_MULTIGRID_NDIM_ITERATOR
 
@@ -25,6 +31,8 @@
 #include <limits>
 #include <cstddef>
 
+#include <graphblas/utils/iterators/utils.hpp>
+
 #include "array_vector_storage.hpp"
 
 namespace grb {
@@ -37,6 +45,17 @@ namespace grb {
 				typename InternalVectorType
 			> class LinearizedNDimSystem;
 
+			/**
+			 * Iterator object coupled to a LinearizedNDimSystem: each object points to a vector
+			 * in the creating LinearizedNDimSystem#dimensions()-dimensions space, to which also a
+			 * linear position is associated; both the vector and the linear position can be retrieved
+			 * via the \a -> method.
+			 *
+			 * It meets the requirements of a random access iterator.
+			 *
+			 * @tparam SizeType integral type to store the size of each dimension
+			 * @tparam InternalStorageType internal vector type to store the sizes
+			 */
 			template<
 				typename SizeType,
 				typename InternalVectorType
@@ -47,6 +66,11 @@ namespace grb {
 				using ConstVectorReference = const VectorType&;
 				using SelfType = LinearizedNDimIterator< SizeType, InternalVectorType >;
 
+				/**
+				 * Structure describing a couple vector/linear coordinate: the vector
+				 * can be obtained via #get_position() while the linear coordinate via
+				 * #get_linear_position().
+				 */
 				struct NDimPoint {
 				private:
 					const LinNDimSysType* system; // pointer because of copy assignment
@@ -86,11 +110,30 @@ namespace grb {
 				using reference = const value_type&;
 				using difference_type = signed long;
 
+				/**
+				 * Construct a new LinearizedNDimIterator object from the original LinNDimSysType
+				 * object, storing the information about system dimensionality and sizes. The referenced
+				 * vector is the first one in the system, i.e. with all coordinates being \a 0.
+				 *
+				 * If \p _system is not a valid object anymore, all iterators created from it are also
+				 * not valid.
+				 */
 				LinearizedNDimIterator( const LinNDimSysType &_system ) noexcept :
 					_p( _system )
 				{}
 
-				template< typename IterT > LinearizedNDimIterator( const LinNDimSysType &_system, IterT begin ) noexcept :
+				/**
+				 * Construct a new LinearizedNDimIterator object from the original LinNDimSysType
+				 * object, storing the information about system dimensionality and sizes. The referenced
+				 * vector is initialized with the coordinates referenced via the iterator \p begin,
+				 * which should have at least \p _system.dimensions() valid successors.
+				 *
+				 * If \p _system is not a valid object anymore, all iterators created from it are also
+				 * not valid.
+				 */
+				template< typename IterT > LinearizedNDimIterator(
+					const LinNDimSysType &_system, IterT begin
+				) noexcept :
 					_p( _system )
 				{
 					std::copy_n( begin, _system.dimensions(), this->_p.coords.begin() );
@@ -105,10 +148,14 @@ namespace grb {
 
 				~LinearizedNDimIterator() {}
 
+				/**
+				 * Moves to the next vector in the multi-dimensional space, corresponding to
+				 * advancing the linear coordinate by 1.
+				 */
 				SelfType & operator++() noexcept {
-					bool rewind{ true };
+					bool rewind = true;
 					// rewind only the first N-1 coordinates
-					for( size_t i { 0 }; i < this->_p.system->dimensions() - 1 && rewind; i++ ) {
+					for( size_t i = 0; i < this->_p.system->dimensions() - 1 && rewind; i++ ) {
 						SizeType& coord = this->_p.coords[ i ];
 						// must rewind dimension if we wrap-around
 						SizeType plus = coord + 1;
@@ -122,24 +169,35 @@ namespace grb {
 					return *this;
 				}
 
+				/**
+				 * Moves \p offset vectors ahead in the multi-dimensional space, corresponding to
+				 * advancing the linear coordinate by \p offset.
+				 *
+				 * If the destination vector is outside of the system (i.e. the corresponding
+				 * linear coordinate is beyond the underlying LinearizedNDimSystem#system_size()),
+				 * an exception is thrown.
+				 */
 				SelfType & operator+=( size_t offset ) {
-					size_t linear{ _p.get_linear_position() + offset };
+					size_t linear = _p.get_linear_position() + offset;
 					if( linear > _p.system->system_size() ) {
 						throw std::invalid_argument("increment is too large");
 					}
+					if( offset == 1 ) {
+						return operator++();
+					}
 					_p.system->linear_to_ndim( linear, _p.coords );
 					return *this;
 				}
 
+				/**
+				 * Returns the difference between \p other and \c this in the linear space.
+				 *
+				 * It throws if the result cannot be stored as a difference_type variable.
+				 */
 				difference_type operator-( const SelfType &other ) const {
-					size_t a_pos{ _p.get_linear_position() },
-						b_pos{ other._p.get_linear_position() };
-					size_t lowest{ std::min( a_pos, b_pos ) }, highest{ std::max( a_pos, b_pos )};
-					if( highest - lowest > static_cast< size_t >(
-						std::numeric_limits< difference_type >::max() ) ) {
-						throw std::invalid_argument( "iterators are too distant" );
-					}
-					return ( static_cast< difference_type >( a_pos - b_pos ) );
+					return grb::utils::compute_signed_distance< difference_type, SizeType >(
+						_p.get_linear_position(), other._p.get_linear_position() );
+
 				}
 
 				reference operator*() const {
@@ -151,22 +209,26 @@ namespace grb {
 				}
 
 				bool operator!=( const SelfType &o ) const {
-					const size_t dims{ this->_p.system->dimensions() };
+					const size_t dims = this->_p.system->dimensions();
 					if( dims != o._p.system->dimensions() ) {
 						throw std::invalid_argument("system sizes do not match");
 					}
-					bool equal{ true };
-					for( size_t i{0}; i < dims && equal; i++) {
+					bool equal = true;
+					for( size_t i =0; i < dims && equal; i++) {
 						equal &= ( this->_p.coords[i] == o._p.coords[i] );
 					}
 					return !equal;
 				}
 
-				// implementation depending on logic in operator++
+				/**
+				 * Facility to build an end iterator.
+				 *
+				 * Its implementation depends on the logic in operator++.
+				 */
 				static SelfType make_system_end_iterator( const LinNDimSysType &_system ) {
 					// fill with 0s
 					SelfType iter( _system );
-					size_t last{ iter->system->dimensions() - 1 };
+					size_t last = iter->system->dimensions() - 1;
 					// store last size in last position
 					iter._p.coords[ last ] = iter->system->get_sizes()[ last ];
 					return iter;
diff --git a/include/graphblas/utils/multigrid/linearized_ndim_system.hpp b/include/graphblas/utils/multigrid/linearized_ndim_system.hpp
index 3e4c15b14..7b3c94341 100644
--- a/include/graphblas/utils/multigrid/linearized_ndim_system.hpp
+++ b/include/graphblas/utils/multigrid/linearized_ndim_system.hpp
@@ -15,6 +15,12 @@
  * limitations under the License.
  */
 
+/**
+ * @file linearized_ndim_system.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Definition of \p LinearizedNDimSystem.
+ */
+
 #ifndef _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM_LINEARIZER
 #define _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM_LINEARIZER
 
@@ -29,29 +35,24 @@
 
 #include "ndim_system.hpp"
 #include "linearized_ndim_iterator.hpp"
-// #include "array_vector_storage.hpp"
-
-/**
- * @file linearized_ndim_system.cpp
- * @author Alberto Scolari (alberto.scolari@huawei.com)
- * Definition of \p LinearizedNDimSystem.
- *
- * @date 2022-10-24
- */
 
 namespace grb {
 	namespace utils {
 		namespace multigrid {
 
 			/**
-			 * Extends a \p NDimSystem by linearizing it, i.e. it provides facilities to map a vector in
-			 * NDimSystem#dimensions() dimensions to a linear value ranging from \a 0 to #system_size()
+			 * Extends an NDimSystem by linearizing it, i.e. it provides facilities to map a vector in
+			 * NDimSystem#dimensions() dimensions to a linear value ranging from \a 0 to #system_size() (excluded)
 			 * and vice versa. Such a linearized representation allows user logic to iterate over the system:
-			 * iterators are indeed available via #begin()/#end().
+			 * iterators are indeed available via #begin()/#end(). Consecutive system elements along dimension 0
+			 * are mapped to consecutive linear values, while elements consecutive along dimension 1
+			 * are mapped at offset #get_offsets()[1] = #get_sizes()[0], elements along dimension 2
+			 * are mapped at offset #get_offsets()[2] = #get_sizes()[0] * #get_sizes()[1], and so on.
 			 *
 			 * Further facilities are methods to map users' vectors from linear to NDimSystem#dimensions()-dimensional
 			 * or vice versa and also to "retaget" the system, i.e. to represent a system of same dimensionality
-			 * but different sizes.
+			 * but different sizes; this last feature is a mere performance optimization aimed at
+			 * reusing existing objects instead of deleting them and allocating new memory.
 			 *
 			 * @tparam SizeType integral type to store the size of each dimension
 			 * @tparam InternalStorageType internal vector type to store the sizes
@@ -66,32 +67,44 @@ namespace grb {
 				using BaseType = NDimSystem< SizeType, InternalVectorType >;
 				using SelfType = LinearizedNDimSystem< SizeType, InternalVectorType >;
 				using VectorType = typename BaseType::VectorType;
-
 				using VectorReference = typename BaseType::VectorReference;
 				using ConstVectorReference = typename BaseType::ConstVectorReference;
 				using VectorStorageType = typename VectorType::VectorStorageType;
 				using ConstVectorStorageType = typename VectorType::ConstVectorStorageType;
 				using Iterator = LinearizedNDimIterator< SizeType, InternalVectorType >;
 
+				/**
+				 * Construct a new LinearizedNDimSystem object from an iterable range,
+				 * where each iterator's position stores the size along each dimension; example:
+				 * *begin is the size along dimension 0, *(++begin) is the size along dimension 1 ...
+				 */
 				template< typename IterT > LinearizedNDimSystem( IterT begin, IterT end) noexcept :
 					BaseType( begin, end ),
-					offsets( std::distance( begin, end ) )
+					_offsets( std::distance( begin, end ) )
 				{
-					this->_system_size = compute_offsets( begin, end, this->offsets.begin() ) ;
+					this->_system_size = compute_range_product( begin, end, this->_offsets.begin() ) ;
 				}
 
+				/**
+				 * Construct a new LinearizedNDimSystem object with dimensions \p _sizes.size()
+				 * and sizes stored in \p _sizes.
+				 */
 				LinearizedNDimSystem( const std::vector< size_t > &_sizes ) noexcept :
 					LinearizedNDimSystem( _sizes.cbegin(), _sizes.cend() ) {}
 
-				LinearizedNDimSystem( size_t _dimensions, size_t max_value ) noexcept :
-					BaseType( _dimensions, max_value ),
-					offsets( _dimensions ),
+				/**
+				 * Construct a new LinearizedNDimSystem object with \p _dimensions dimensions
+				 * and sizes all equal to \p _size.
+				 */
+				LinearizedNDimSystem( size_t _dimensions, size_t _size ) noexcept :
+					BaseType( _dimensions, _size ),
+					_offsets( _dimensions ),
 					_system_size( _dimensions )
 				{
-					SizeType v{1};
-					for( size_t i{0}; i < _dimensions; i++ ) {
-						this->offsets[i] = v;
-						v *= max_value;
+					SizeType v = 1;
+					for( size_t i =0; i < _dimensions; i++ ) {
+						this->_offsets[i] = v;
+						v *= _size;
 					}
 					this->_system_size = v;
 				}
@@ -101,7 +114,7 @@ namespace grb {
 				LinearizedNDimSystem( const SelfType &original ) = default;
 
 				LinearizedNDimSystem( SelfType &&original ) noexcept:
-					BaseType( std::move(original) ), offsets( std::move( original.offsets ) ),
+					BaseType( std::move(original) ), _offsets( std::move( original._offsets ) ),
 					_system_size( original._system_size ) {
 						original._system_size = 0;
 				}
@@ -112,34 +125,59 @@ namespace grb {
 
 				SelfType& operator=( SelfType &&original ) = delete;
 
+				/**
+				 * Computes the size of the system, i.e. its number of elements;
+				 * this corresponds to the product of the sizes along all dimensions.
+				 */
 				inline size_t system_size() const {
 					return this->_system_size;
 				}
 
+				/**
+				 * Get the offsets of the system, i.e. for each dimension, the number of linear
+				 * elements a unit step along that dimension corresponds to.
+				 */
 				inline ConstVectorReference get_offsets() const {
-					return this->offsets;
+					return this->_offsets;
 				}
 
+				/**
+				 * Computes the #dimensions()-dimensions vector the linear value in input corresponds to.
+				 *
+				 * @param[in] linear linear index
+				 * @param[out] output output vector \p linear corresponds to
+				 */
 				void linear_to_ndim( size_t linear, VectorReference output ) const {
 					if( linear > this->_system_size ) {
 						throw std::range_error( "linear value beyond system" );
 					}
-					for( size_t _i{ this->offsets.dimensions() }; _i > 0; _i-- ) {
-						const size_t dim{ _i - 1 };
-						const size_t coord{ linear / this->offsets[dim] };
+					for( size_t _i = this->_offsets.dimensions(); _i > 0; _i-- ) {
+						const size_t dim = _i - 1;
+						const size_t coord = linear / this->_offsets[dim];
 						output[dim] = coord;
-						linear -= ( coord * this->offsets[dim] );
+						linear -= ( coord * this->_offsets[dim] );
 					}
 					assert( linear == 0 );
 				}
 
+				/**
+				 * Computes the linear value the input vector corresponds to; this method takes in input
+				 * a const reference to \p InternalVectorType and checks whether each value in the input
+				 * vector \p ndim_vector is within the system sizes (otherwise it throws).
+				 */
 				size_t ndim_to_linear_check( ConstVectorReference ndim_vector) const {
 					return this->ndim_to_linear_check( ndim_vector.storage() );
 				}
 
+				/**
+				 * Computes the linear value the input vector corresponds to; this method takes in input
+				 * a const reference to the underlying storage of \p InternalVectorType and checks
+				 * whether each value in the input vector \p ndim_vector is within the system sizes
+				 * (otherwise it throws).
+				 */
 				size_t ndim_to_linear_check( ConstVectorStorageType ndim_vector ) const {
-					size_t linear { 0 };
-					for( size_t i { 0 }; i < this->dimensions(); i++ ) {
+					size_t linear = 0;
+					for( size_t i = 0; i < this->dimensions(); i++ ) {
 						if( ndim_vector[i] >= this->get_sizes()[i] ) {
 							throw std::invalid_argument( "input vector beyond system sizes" );
 						}
@@ -147,19 +185,34 @@ namespace grb {
 					return ndim_to_linear( ndim_vector );
 				}
 
+				/**
+				 * Computes the linear value the input vector corresponds to; this method takes in input
+				 * a const reference to \p InternalVectorType but does not check whether each value in the input
+				 * vector \p ndim_vector is within the system sizes.
+				 */
 				size_t ndim_to_linear( ConstVectorReference ndim_vector) const {
 					return this->ndim_to_linear( ndim_vector.storage() );
 				}
 
+				/**
+				 * Computes the linear value the input vector corresponds to; this method takes in input
+				 * a const reference to the underlying storage of \p InternalVectorType but does not check
+				 * whether each value in the input vector \p ndim_vector is within the system sizes.
+				 */
 				size_t ndim_to_linear( ConstVectorStorageType ndim_vector ) const {
-					size_t linear { 0 };
-					for( size_t i { 0 }; i < this->dimensions(); i++ ) {
-						linear += this->offsets[i] * ndim_vector[i];
+					size_t linear = 0;
+					for( size_t i = 0; i < this->dimensions(); i++ ) {
+						linear += this->_offsets[i] * ndim_vector[i];
 					}
 					return linear;
 				}
 
 				// must be same dimensionality
+				/**
+				 * Retargets the current object to describe a system with the same number of dimensions
+				 * and sizes \p _new_sizes. If the number of dimensions of \p _new_sizes does not match
+				 * #dimensions(), an exception is thrown.
+				 */
 				void retarget( ConstVectorReference _new_sizes ) {
 					if( _new_sizes.dimensions() != this->_sizes.dimensions() ) {
 						throw std::invalid_argument("new system must have same dimensions as previous: new "
@@ -167,26 +220,42 @@ namespace grb {
 							+ std::to_string( this->_sizes.dimensions() ) );
 					}
 					this->_sizes = _new_sizes; // copy
-					this->_system_size = compute_offsets( _new_sizes.begin(), _new_sizes.end(), this->offsets.begin() ) ;
+					this->_system_size = compute_range_product( _new_sizes.begin(), _new_sizes.end(), this->_offsets.begin() ) ;
 				}
 
+				/**
+				 * Returns a beginning iterator to the #dimensions()-dimensional system \c this describes.
+				 * The provided iterator references a system point, described both via its #dimensions()-dimensional
+				 * coordinates and via a linear value from \a 0 to #system_size() (excluded).
+				 */
 				Iterator begin() const {
 					return Iterator( *this );
 				}
 
+				/**
+				 * Return an iterator to the end of the system; this iterator should not be
+				 * dereferenced nor incremented.
+				 */
 				Iterator end() const {
 					return Iterator::make_system_end_iterator( *this );
 				}
 
 			private:
-				VectorType offsets;
+				VectorType _offsets;
 				size_t _system_size;
 
+				/**
+				 * Incrementally computes the product of the input iterator's range, storing each value
+				 * into the position pointed to by the output iterator; the accumulation starts from 1
+				 * (which is also the first output value), and the last accumulated value is returned directly
+				 * (and not stored). This assumes that the output container can store at least as many values
+				 * as in the input range.
+				 */
 				template<
 					typename IterIn,
 					typename IterOut
-				> static size_t compute_offsets( IterIn in_begin, IterIn in_end, IterOut out_begin ) {
-					size_t prod{1};
+				> static size_t compute_range_product( IterIn in_begin, IterIn in_end, IterOut out_begin ) {
+					size_t prod = 1;
 					for( ; in_begin != in_end; ++in_begin, ++out_begin ) {
 						*out_begin = prod;
 						prod *= *in_begin;
diff --git a/include/graphblas/utils/multigrid/ndim_system.hpp b/include/graphblas/utils/multigrid/ndim_system.hpp
index 9d387ce32..f184a7042 100644
--- a/include/graphblas/utils/multigrid/ndim_system.hpp
+++ b/include/graphblas/utils/multigrid/ndim_system.hpp
@@ -15,26 +15,21 @@
  * limitations under the License.
  */
 
+/**
+ * @file ndim_system.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Definition of NDimSystem.
+ */
+
 #ifndef _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM
 #define _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM
 
 #include <cstddef>
 #include <algorithm>
 #include <vector>
-#include <utility>
 #include <type_traits>
 #include <cstddef>
 
-#include "array_vector_storage.hpp"
-
-/**
- * @file ndim_system.cpp
- * @author Alberto Scolari (alberto.scolari@huawei.com)
- * Definition of \p NDimSystem.
- *
- * @date 2022-10-24
- */
-
 namespace grb {
 	namespace utils {
 		namespace multigrid {
@@ -61,7 +56,8 @@ namespace grb {
 				using SelfType = NDimSystem< SizeType, InternalVectorType >;
 
 				/**
-				 * Construct a new NDimSystem object from an iterable range.
+				 * Construct a new NDimSystem object from an iterable range, where each referenced value
+				 * is a size of the system.
 				 *
 				 * The dimension is computed as \a std::distance(begin,end), i.e.
 				 * \p IterT should be a random-access iterator for performance.
diff --git a/include/graphblas/utils/multigrid/ndim_vector.hpp b/include/graphblas/utils/multigrid/ndim_vector.hpp
index 26ee084e6..7992f23f6 100644
--- a/include/graphblas/utils/multigrid/ndim_vector.hpp
+++ b/include/graphblas/utils/multigrid/ndim_vector.hpp
@@ -1,4 +1,26 @@
 
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file ndim_vector.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Definition of NDimVector.
+ */
+
 #ifndef _H_GRB_ALGORITHMS_MULTIGRID_NDIM_VECTOR
 #define _H_GRB_ALGORITHMS_MULTIGRID_NDIM_VECTOR
 
@@ -20,7 +42,7 @@ namespace grb {
 			 * The user constructs an object by passing the sizes (as an N-dimensional vector)
 			 * of the iteration space and accesses the stored data via an N-dimensional vector of coordinates.
 			 *
-			 * Example: if the user constructs an \p NDimVector with 3D sizes \a [2,3,4], she can access data
+			 * Example: if the user constructs an NDimVector with 3D sizes \a [2,3,4], she can access data
 			 * via a 3D coordinates vector of ranges \a [0-1]x[0-2]x[0-3] (here \a x denoting the cartesian product)
 			 * by using the #at() method.
 			 *
@@ -50,12 +72,22 @@ namespace grb {
 
 				NDimVector() = delete;
 
+				/**
+				 * Construct a new NDimVector object with sizes read from the iteration range
+				 * and number of dimensions equal to the range distance; the data values are
+				 * \b not initialized.
+				 */
 				template< typename IterT > NDimVector( IterT begin, IterT end) :
 					_linearizer( begin, end )
 				{
 					this->data = new DataType[ _linearizer.system_size() ];
 				}
 
+				/**
+				 * Construct a new NDimVector object with sizes read from the \p _sizes
+				 * and number of dimensions equal to \p _sizes.size(); the data values are
+				 * \b not initialized.
+				 */
 				NDimVector( const std::vector< size_t > &_sizes ) :
 					NDimVector( _sizes.cbegin(), _sizes.cend() ) {}
 
@@ -81,34 +113,64 @@ namespace grb {
 					this->clean_mem();
 				}
 
+				/**
+				 * Number of dimensions of the underlying geometrical space.
+				 */
 				size_t dimensions() const {
 					return this->_linearizer.dimensions();
 				}
 
+				/**
+				 * Size of the underlying geometrical space, i.e. number of stored data elements.
+				 */
 				size_t data_size() const {
 					return this->_linearizer.system_size();
 				}
 
+				/**
+				 * Access the data element at N-dimension coordinate given by the iterable
+				 * \p coordinates.
+				 */
 				inline DataType& at( ConstDomainVectorReference coordinates ) {
 					return this->data[ this->get_coordinate( coordinates.storage() ) ];
 				}
 
+				/**
+				 * Const-access the data element at N-dimension coordinate given by the iterable
+				 * \p coordinates.
+				 */
 				inline const DataType& at( ConstDomainVectorReference coordinates ) const {
 					return this->data[ this->get_coordinate( coordinates.storage() ) ];
 				}
 
+				/**
+				 * Access the data element at N-dimension coordinate given by the vector
+				 * storage object \p coordinates.
+				 */
 				inline DataType& at( ConstDomainVectorStorageType coordinates ) {
 					return this->data[ this->get_coordinate( coordinates ) ];
 				}
 
+				/**
+				 * Const-access the data element at N-dimension coordinate given by the vector
+				 * storage object \p coordinates.
+				 */
 				inline const DataType& at( ConstDomainVectorStorageType coordinates ) const {
 					return this->data[ this->get_coordinate( coordinates ) ];
 				}
 
+				/**
+				 * Returns an iterator to the beginning of the N-dimensional underlying space,
+				 * i.e. a vector \a [0,0,0,...,0].
+				 */
 				DomainIterator domain_begin() const {
 					return this->_linearizer.begin();
 				}
 
+				/**
+				 * Returns an iterator to the end of the N-dimensional underlying space.
+				 * This iterator should not be dereferenced nor incremented.
+				 */
 				DomainIterator domain_end() const {
 					return this->_linearizer.end();
 				}
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index 3e318b0cb..c95cfba85 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -1,6 +1,6 @@
 
 /*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,12 +18,10 @@
 /**
  * @file hpcg_test.cpp
  * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Test for HPCG simulations on N-dimensional physical problems.
+ * Test for HPCG simulations on N-dimensional physical problems.
  *
  * This test strictly follows the parameter and the formulation of the reference HPCG
  * benchmark impementation in https://github.com/hpcg-benchmark/hpcg.
- *
- * @date 2021-04-30
  */
 
 #include <array>
@@ -41,23 +39,20 @@
 // to easily trace the steps of the solver, just define this symbol
 // #define HPCG_PRINT_STEPS
 
-// here we define a custom macro and do not use NDEBUG since the latter is not defined for smoke tests
+// here we define a custom macro, which enables tracing only for HPCG code
 #ifdef HPCG_PRINT_STEPS
 #include <cstdio>
 
 // HPCG_PRINT_STEPS requires defining the following symbols
 
-/**
- * @brief simply prints \p args on a dedicated line.
- */
+// prints args on a dedicated line
 #define DBG_println( args ) std::cout << args << std::endl;
 // forward declaration for the tracing facility
 template< typename T > void print_norm( const grb::Vector< T > &r, const char * head );
-/**
- * @brief prints \p head and the norm of \p r.
- */
+// prints head and the norm of r
 #define DBG_print_norm( vec, head ) print_norm( vec, head )
 #endif
+//============================================
 
 #include <graphblas/algorithms/hpcg/hpcg.hpp>
 #include <graphblas/algorithms/multigrid/multigrid_building_utils.hpp>
@@ -73,8 +68,8 @@ template< typename T > void print_norm( const grb::Vector< T > &r, const char *
 // default simulation parameters, set as in reference HPCG
 // users can input different ones via the cmd line
 constexpr size_t PHYS_SYSTEM_SIZE_DEF = 16UL;
-constexpr size_t PHYS_SYSTEM_SIZE_MIN = 2UL;
-constexpr size_t MAX_COARSENING_LEVELS = 3U;
+constexpr size_t PHYS_SYSTEM_SIZE_MIN = 4UL;
+constexpr size_t MAX_COARSENING_LEVELS = 3UL;
 constexpr size_t MAX_ITERATIONS_DEF = 56UL;
 constexpr size_t SMOOTHER_STEPS_DEF = 1;
 
@@ -94,18 +89,37 @@ static const char * const TEXT_HIGHLIGHT = "===> ";
 #define thcerr ( std::cerr << TEXT_HIGHLIGHT )
 #define MASTER_PRINT( pid, txt ) if( pid == 0 ) { std::cout << txt; }
 
-/**
- * Container for system parameters to create the HPCG problem.
- */
-struct system_input {
-	size_t nx, ny, nz;
-	size_t max_coarsening_levels;
-};
+// default types
+using IOType = double;
+using NonzeroType = double;
+using InputType = double;
+using ResidualType = double;
+using StdRing = Semiring< grb::operators::add< NonzeroType >, grb::operators::mul< NonzeroType >,
+	grb::identities::zero, grb::identities::one >;
+using StdMinus = operators::subtract< NonzeroType >;
+using coord_t = size_t;
+
+// assembled types for simulation runners and input/output structures
+using hpcg_runner_t = HPCGRunnerType< IOType, NonzeroType, InputType, ResidualType,
+	StdRing, StdMinus >;
+using mg_data_t = MultiGridData< IOType, NonzeroType >;
+using coarsening_data_t = CoarseningData< IOType, NonzeroType >;
+using smoothing_data_t = SmootherData< IOType >;
+using hpcg_data_t = MultiGridCGData< IOType, NonzeroType, InputType >;
+
+static const IOType io_zero = StdRing(). template getZero< IOType >();
+static const NonzeroType nz_zero = StdRing(). template getZero< NonzeroType >();
+static const InputType input_zero = StdRing(). template getZero< InputType >();
+static const ResidualType residual_zero = StdRing(). template getZero< ResidualType >();
 
 /**
  * Container for the parameters for the HPCG simulation.
  */
-struct simulation_input : public system_input {
+struct simulation_input {
+	// physical parameters for the multi-grid
+	size_t nx, ny, nz;
+	size_t max_coarsening_levels;
+	// solver options
 	size_t inner_test_repetitions;
 	size_t max_iterations;
 	size_t smoother_steps;
@@ -114,15 +128,6 @@ struct simulation_input : public system_input {
 	bool print_iter_stats;
 };
 
-using IOType = double;
-using NonzeroType = double;
-using InputType = double;
-using ResidualType = double;
-using StdRing = Semiring< grb::operators::add< NonzeroType >, grb::operators::mul< NonzeroType >,
-	grb::identities::zero, grb::identities::one >;
-using StdMinus = operators::subtract< NonzeroType >;
-using coord_t = size_t;
-
 /**
  * Container for test outputs.
  */
@@ -131,17 +136,10 @@ struct output {
 	size_t inner_test_repetitions = 0;
 	grb::utils::TimerResults times;
 	std::unique_ptr< PinnedVector< IOType > > pinnedVector;
-	NonzeroType square_norm_diff = 0.0;
-	cg_out_data< NonzeroType > cg_out = { 0, 0.0 };
+	NonzeroType square_norm_diff = nz_zero;
+	CGOutInfo< NonzeroType > cg_out = { 0, nz_zero };
 };
 
-using hpcg_runner_t = HPCGRunnerType< IOType, NonzeroType, InputType, ResidualType,
-	StdRing, StdMinus >;
-using mg_data_t = multigrid_data< IOType, NonzeroType >;
-using coarsening_data_t = coarsening_data< IOType, NonzeroType >;
-using smoothing_data_t = smoother_data< IOType >;
-using hpcg_data_t = mg_cg_data< IOType, NonzeroType, InputType >;
-
 #ifdef HPCG_PRINT_SYSTEM
 static void print_system(
 	const std::vector< std::unique_ptr< mg_data_t > > &system_levels,
@@ -156,18 +154,21 @@ static void print_system(
 }
 #endif
 
+//========== ROUTINES TO TRACE SOLVER STEPS =========
 #ifdef HPCG_PRINT_STEPS
 template<
 	typename T,
 	class Ring
 > void print_norm( const grb::Vector< T > & r, const char * head, const Ring & ring ) {
-	T norm = 0;
+	T norm = ring. template getZero< T >();
 	RC ret = grb::dot( norm, r, r, ring ); // norm = r' * r;
 	(void)ret;
 	assert( ret == SUCCESS );
 	if( spmd<>::pid() != 0 ) {
 		return;
 	}
+	// printf makes it more likely to get single lines in output with multiple processes
+	// additionally, it doesn't approximate double values
 	if( head != nullptr ) {
 		printf(">>> %s: %lf\n", head, norm );
 	} else {
@@ -179,7 +180,27 @@ template< typename T > void print_norm( const grb::Vector< T > & r, const char *
 	return print_norm( r, head, StdRing() );
 }
 #endif
+//============================================
+
 
+/**
+ * Computes the smallest power of 2 that is greater than or equal to \p n.
+ *
+ * The highest set bit of <tt>n - 1</tt> is propagated to all lower bit positions
+ * via shifts of doubling width (1, 2, 4, ...), then 1 is added.
+ *
+ * @tparam T integral type of the input/output value
+ * @param[in] n input value; for unsigned \p T, 0 maps to 0 (the decrement wraps around)
+ */
+template< typename T > T static next_pow_2( T n ) {
+	static_assert( std::is_integral< T >::value, "Integral required." );
+	--n;
+	// every shift is strictly smaller than the bit width of T (a wider one is undefined)
+	for( unsigned shift = 1; shift < sizeof( T ) * 8; shift *= 2 ) {
+		n |= ( n >> shift );
+	}
+	return n + 1;
+}
 
 /**
  * Allocates the data structure input to the various simulation steps (CG, multi-grid, coarsening, smoothing)
@@ -194,13 +215,13 @@ static void allocate_system_structures(
 	std::vector< std::unique_ptr< mg_data_t > > &system_levels,
 	std::vector< std::unique_ptr< coarsening_data_t > > &coarsener_levels,
 	std::vector< std::unique_ptr< smoothing_data_t > > &smoother_levels,
-	std::unique_ptr< hpcg_data_t > &holder
+	std::unique_ptr< hpcg_data_t > &cg_system_data
 ) {
-	const size_t pid { spmd<>::pid() };
+	const size_t pid = spmd<>::pid() ;
 	grb::utils::Timer timer;
 
-	hpcg_data_t *data{ new hpcg_data_t( mg_sizes[ 0 ] ) };
-	holder = std::unique_ptr< hpcg_data_t >( data );
+	hpcg_data_t *data = new hpcg_data_t( mg_sizes[ 0 ] );
+	cg_system_data = std::unique_ptr< hpcg_data_t >( data );
 	MASTER_PRINT( pid, "allocating data for the MultiGrid simulation...");
 	timer.reset();
 	multigrid_allocate_data( mg_sizes, system_levels, coarsener_levels, smoother_levels );
@@ -210,34 +231,35 @@ static void allocate_system_structures(
 	// zero all vectors
 	MASTER_PRINT( pid, "zeroing all vectors...");
 	timer.reset();
-	grb::RC rc = data->zero_temp_vectors();
+	grb::RC rc = data->init_vectors( io_zero );
 	ASSERT_RC_SUCCESS( rc );
 	std::for_each( system_levels.begin(), system_levels.end(),
-		[]( std::unique_ptr< mg_data_t > &s) { ASSERT_RC_SUCCESS( s->zero_temp_vectors() ); } );
+		[]( std::unique_ptr< mg_data_t > &s) { ASSERT_RC_SUCCESS( s->init_vectors( io_zero ) ); } );
 	std::for_each( coarsener_levels.begin(), coarsener_levels.end(),
-		[]( std::unique_ptr< coarsening_data_t > &s) { ASSERT_RC_SUCCESS( s->zero_temp_vectors() ); } );
+		[]( std::unique_ptr< coarsening_data_t > &s) { ASSERT_RC_SUCCESS( s->init_vectors( io_zero ) ); } );
 	std::for_each( smoother_levels.begin(), smoother_levels.end(),
-		[]( std::unique_ptr< smoothing_data_t > &s) { ASSERT_RC_SUCCESS( s->zero_temp_vectors() ); } );
+		[]( std::unique_ptr< smoothing_data_t > &s) { ASSERT_RC_SUCCESS( s->init_vectors( io_zero ) ); } );
 	time = timer.time();
 	MASTER_PRINT( pid, " time (ms) " << time << std::endl );
 }
 
 /**
  * Builds and initializes a 3D system for an HPCG simulation according to the given 3D system sizes.
+ * It allocates the data structures and populates them according to the algorithms chosen for HPCG.
  */
 static void build_3d_system(
-	const system_input & in,
+	const simulation_input & in,
 	std::vector< std::unique_ptr< mg_data_t > > &system_levels,
 	std::vector< std::unique_ptr< coarsening_data_t > > &coarsener_levels,
 	std::vector< std::unique_ptr< smoothing_data_t > > &smoother_levels,
-	std::unique_ptr< hpcg_data_t > &holder
+	std::unique_ptr< hpcg_data_t > &cg_system_data
 ) {
 	constexpr size_t DIMS = 3;
 	using builder_t = grb::algorithms::HPCGSystemBuilder< DIMS, coord_t, NonzeroType >;
-	const size_t pid { spmd<>::pid() };
+	const size_t pid = spmd<>::pid();
 	grb::utils::Timer timer;
 
-	hpcg_system_params< DIMS, NonzeroType > params {
+	HPCGSystemParams< DIMS, NonzeroType > params = {
 		{ in.nx, in.ny, in.nz }, HALO_RADIUS, SYSTEM_DIAG_VALUE, SYSTEM_NON_DIAG_VALUE,
 			PHYS_SYSTEM_SIZE_MIN, in.max_coarsening_levels, 2
 	};
@@ -246,22 +268,25 @@ static void build_3d_system(
 	MASTER_PRINT( pid, "building HPCG generators for " << ( in.max_coarsening_levels + 1 )
 		<< " levels..." );
 	timer.reset();
+	// construct the builder_t generator for each grid level, which depends on the system physics
 	hpcg_build_multigrid_generators( params, mg_generators );
 	double time = timer.time();
 	MASTER_PRINT( pid, " time (ms) " << time << std::endl );
 	MASTER_PRINT( pid, "built HPCG generators for " << mg_generators.size()
 		<< " levels" << std::endl );
 
-
+	// extract the size for each level
 	std::vector< size_t > mg_sizes;
-	// exclude main system
 	std::transform( mg_generators.cbegin(), mg_generators.cend(), std::back_inserter( mg_sizes  ),
 		[] ( const builder_t &b ) { return b.system_size(); } );
-	allocate_system_structures( mg_sizes, system_levels, coarsener_levels, smoother_levels, holder );
+	// given the sizes, allocate the data structures for all the inputs of the algorithms
+	allocate_system_structures( mg_sizes, system_levels, coarsener_levels, smoother_levels, cg_system_data );
 	assert( mg_generators.size() == system_levels.size() );
 	assert( mg_generators.size() == smoother_levels.size() );
-	assert( mg_generators.size() - 1 == coarsener_levels.size() );
+	assert( mg_generators.size() - 1 == coarsener_levels.size() ); // coarsener acts between two levels
 
+	// for each grid level, populate the data structures according to the specific algorithm
+	// and track the time for diagnostics purposes
 	for( size_t i = 0; i < mg_generators.size(); i++) {
 		MASTER_PRINT( pid, "SYSTEM LEVEL " << i << std::endl );
 		MASTER_PRINT( pid, " populating system matrix: " );
@@ -290,31 +315,31 @@ static void build_3d_system(
 }
 
 /**
- * @brief Main test, building an HPCG problem and running the simulation closely following the
+ * Main test, building an HPCG problem and running the simulation closely following the
  * parameters in the reference HPCG test.
  */
 void grbProgram( const simulation_input & in, struct output & out ) {
 	// get user process ID
-	const size_t pid { spmd<>::pid() };
-	MASTER_PRINT( pid, "beginning input generation..." << std::endl );
-
+	const size_t pid = spmd<>::pid();
 	grb::utils::Timer timer;
+	MASTER_PRINT( pid, "beginning input generation..." << std::endl );
 
 	// wrap hpcg_data inside a unique_ptr to forget about cleaning chores
 	std::unique_ptr< hpcg_data_t > hpcg_state;
 
+	// define the main HPCG runner and initialize the options of its components
 	hpcg_runner_t hpcg_runner( build_hpcg_runner< IOType, NonzeroType, InputType, ResidualType,
 		StdRing, StdMinus >( in.smoother_steps ) );
 	auto &mg_runner = hpcg_runner.mg_runner;
 	auto &coarsener = mg_runner.coarsener_runner;
 	auto &smoother = mg_runner.smoother_runner;
-	hpcg_runner.cg_opts.max_iterations = in.max_iterations;
-	hpcg_runner.cg_opts.tolerance = 0.0;
+	hpcg_runner.cg_opts.tolerance = residual_zero;
 	hpcg_runner.cg_opts.with_preconditioning = ! in.no_preconditioning;
 
 	timer.reset();
+	// build the entire multi-grid system
 	build_3d_system( in, mg_runner.system_levels, coarsener.coarsener_levels, smoother.levels, hpcg_state );
-	double input_duration { timer.time() };
+	double input_duration = timer.time();
 	MASTER_PRINT( pid, "input generation time (ms): " << input_duration << std::endl );
 
 #ifdef HPCG_PRINT_SYSTEM
@@ -323,16 +348,16 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	}
 #endif
 
-	Matrix< NonzeroType > & A { mg_runner.system_levels[ 0 ]->A };
-	Vector< NonzeroType > & x { hpcg_state->x };
-	Vector< NonzeroType > & b { hpcg_state->b };
+	Matrix< NonzeroType > &A = mg_runner.system_levels[ 0 ]->A;
+	Vector< IOType > &x = hpcg_state->x;
+	Vector< NonzeroType > &b = hpcg_state->b;
 
-	RC rc { SUCCESS };
+	RC rc = SUCCESS;
 	// set vectors as from standard HPCG benchmark
 	set( x, 1.0 );
-	set( b, 0.0 );
+	set( b, nz_zero );
 	rc = grb::mxv( b, A, x, StdRing() );
-	set( x, 0.0 );
+	set( x, io_zero );
 
 #ifdef HPCG_PRINT_SYSTEM
 	if( pid == 0 ) {
@@ -343,49 +368,39 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 
 	out.times.preamble = timer.time();
 
-	cg_out_data< NonzeroType > &cg_out = out.cg_out;
 	mg_data_t &grid_base = *mg_runner.system_levels[ 0 ];
 
 	// do a cold run to warm the system up
 	MASTER_PRINT( pid, TEXT_HIGHLIGHT << "beginning cold run..." << std::endl );
 	hpcg_runner.cg_opts.max_iterations = 1;
 	timer.reset();
-	rc = hpcg_runner( grid_base, *hpcg_state, cg_out );
-	double iter_duration { timer.time() };
+	rc = hpcg_runner( grid_base, *hpcg_state, out.cg_out );
+	double iter_duration = timer.time();
 	ASSERT_RC_SUCCESS( rc );
 	MASTER_PRINT( pid, " time (ms): " << iter_duration << std::endl );
 
+	// restore CG options to user-given values
 	hpcg_runner.cg_opts.max_iterations = in.max_iterations;
 	hpcg_runner.cg_opts.print_iter_stats = in.print_iter_stats;
-	// do benchmark
-	const size_t inner_test_repetitions = in.evaluation_run ? 1 : in.inner_test_repetitions;
-	if( in.evaluation_run ) {
-		MASTER_PRINT( pid, TEXT_HIGHLIGHT << "beginning evaluation run..." << std::endl );
-	} else {
-		MASTER_PRINT( pid, TEXT_HIGHLIGHT << "beginning test run..." << std::endl );
-	}
+	MASTER_PRINT( pid, TEXT_HIGHLIGHT << "beginning solver..." << std::endl );
 	out.inner_test_repetitions = 0;
 	out.times.useful = 0.0;
-	for( size_t i = 0; i < inner_test_repetitions; ++i ) {
-		rc = set( x, 0.0 );
+	// do benchmark
+	for( size_t i = 0; i < in.inner_test_repetitions; ++i ) {
+		rc = set( x, io_zero );
 		ASSERT_RC_SUCCESS( rc );
 		MASTER_PRINT( pid, TEXT_HIGHLIGHT << "beginning iteration: " << i << std::endl );
 		timer.reset();
-		rc = hpcg_runner( grid_base, *hpcg_state, cg_out );
-		out.times.useful += timer.time();
+		rc = hpcg_runner( grid_base, *hpcg_state, out.cg_out );
+		iter_duration = timer.time();
+		out.times.useful += iter_duration;
 		ASSERT_RC_SUCCESS( rc );
 		MASTER_PRINT( pid, "repetition,duration (ms): " << i << "," << iter_duration << std::endl );
 		out.inner_test_repetitions++;
 	}
 	if( in.evaluation_run ) {
-		rc = collectives<>::reduce( iter_duration, 0, operators::max< double >() );
-		ASSERT_RC_SUCCESS( rc );
-		out.inner_test_repetitions = static_cast< size_t >( 1000.0 / out.times.useful ) + 1;
-		MASTER_PRINT( pid, "Evaluation run" << std::endl
-			<< "  computed residual: " << cg_out.norm_residual << std::endl
-			<< "  iterations: " << cg_out.iterations << std::endl
-			<< "  time taken (ms): " << out.times.useful << std::endl
-			<< "  deduced inner repetitions for 1s duration: " << out.inner_test_repetitions << std::endl );
+		// get maximum execution time among processes
+		rc = collectives<>::reduce( out.times.useful, 0, operators::max< double >() );
 		return;
 	}
 	out.times.useful /= static_cast< double >( in.inner_test_repetitions );
@@ -400,7 +415,7 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 
 	grb::set( b, 1.0 );
 	grb::eWiseMul( b, -1.0, x, StdRing() );
-	out.square_norm_diff = 0.0;
+	out.square_norm_diff = nz_zero;
 	grb::dot( out.square_norm_diff, b, b, StdRing() );
 
 	// output
@@ -410,7 +425,7 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 }
 
 /**
- * @brief Parser the command-line arguments to extract the simulation information and checks they are valid.
+ * Parses the command-line arguments to extract the simulation information and checks that they are valid.
  */
 static void parse_arguments( simulation_input &, size_t &, double &, int, char ** );
 
@@ -437,44 +452,47 @@ int main( int argc, char ** argv ) {
 	struct output out;
 
 	// set standard exit code
-	grb::RC rc { SUCCESS };
+	grb::RC rc = SUCCESS;
 
 	// launch estimator (if requested)
 	if( sim_in.evaluation_run ) {
 		grb::Launcher< AUTOMATIC > launcher;
+		// run just one inner iteration for evaluation purposes
+		sim_in.inner_test_repetitions = 1;
+		thcout << "beginning evaluation run..." << std::endl;
 		rc = launcher.exec( &grbProgram, sim_in, out, true );
-		if( rc == SUCCESS ) {
-			sim_in.inner_test_repetitions = out.inner_test_repetitions;
-		} else {
-			thcout << "launcher.exec returns with non-SUCCESS error code " << grb::toString( rc ) << std::endl;
-			std::exit( -1 );
-		}
+		ASSERT_RC_SUCCESS( rc );
+		ASSERT_EQ( out.inner_test_repetitions, 1 );
+		// compute number of inner repetitions to achieve at least 1s duration
+		sim_in.inner_test_repetitions = static_cast< size_t >( 1000.0 / out.times.useful ) + 1;
+		thcout << "Evaluation run" << std::endl
+			<< "  computed residual: " << out.cg_out.norm_residual << std::endl
+			<< "  iterations: " << out.cg_out.iterations << std::endl
+			<< "  time taken (ms): " << out.times.useful << std::endl
+			<< "  deduced inner repetitions for 1s duration: " << sim_in.inner_test_repetitions << std::endl;
 	}
 
 	// launch full benchmark
 	grb::Benchmarker< AUTOMATIC > benchmarker;
+	thcout << "beginning test run..." << std::endl;
 	rc = benchmarker.exec( &grbProgram, sim_in, out, 1, test_outer_iterations, true );
 	ASSERT_RC_SUCCESS( rc );
-	thcout << "Benchmark completed successfully and took " << out.cg_out.iterations
-		<< " iterations to converge with residual " << out.cg_out.norm_residual << std::endl;
-
-	if( ! out.pinnedVector ) {
-		thcerr << "no output vector to inspect" << std::endl;
-	} else {
-		const PinnedVector< double > &solution { *out.pinnedVector };
-		thcout << "Size of x is " << solution.size() << std::endl;
-		if( solution.size() > 0 ) {
-			print_vector( solution, 30, "SOLUTION" );
-		} else {
-			thcerr << "ERROR: solution contains no values" << std::endl;
-		}
-	}
-
 	ASSERT_RC_SUCCESS( out.error_code );
-
-	double diff_norm { sqrt( out.square_norm_diff ) };
+	thcout << "completed successfully!" << std::endl
+		<< "  final residual: " << out.cg_out.norm_residual << std::endl
+		<< "  solver iterations: " << out.cg_out.iterations << std::endl
+		<< "  total time (ms): " << out.times.useful << std::endl;
+
+	// check result vector, stored inside a pinned vector
+	ASSERT_TRUE( out.pinnedVector );
+	const PinnedVector< double > &solution = *out.pinnedVector;
+	thcout << "Size of x is " << solution.size() << std::endl;
+	ASSERT_GT( solution.size(), 0 );
+	print_vector( solution, 30, "SOLUTION" );
+
+	// check norm of solution w.r.t. expected solution (i.e. vector of all 1)
+	double diff_norm = sqrt( out.square_norm_diff );
 	thcout << "Norm of difference vector |<exact solution> - <actual solution>|: " << diff_norm << std::endl;
-
 	ASSERT_LT( diff_norm, max_diff_norm );
 
 	thcout << "Test OK" << std::endl;
@@ -496,7 +514,7 @@ static void parse_arguments(
 		.add_optional_argument( "--max-coarse-levels", sim_in.max_coarsening_levels, MAX_COARSENING_LEVELS,
 			"maximum level for coarsening; 0 means no coarsening; note: actual level may be limited"
 			" by the minimum system dimension" )
-		.add_optional_argument( "--test-rep", sim_in.inner_test_repetitions, grb::config::BENCHMARKING::inner(),
+		.add_optional_argument( "--inner-iterations", sim_in.inner_test_repetitions, 1,
 			"consecutive test repetitions before benchmarking" )
 		.add_optional_argument( "--outer-iterations", outer_iterations, 1,
 			"test repetitions with complete initialization" )
@@ -522,7 +540,7 @@ static void parse_arguments(
 		std::exit( -1 );
 	}
 	if( sim_in.inner_test_repetitions == 0 ) {
-		std::cerr << "ERROR no test runs selected: set \"--test-rep >0\"" << std::endl;
+		std::cerr << "ERROR no test runs selected: set \"--inner-iterations\" > 0" << std::endl;
 		std::exit( -1 );
 	}
 	if( sim_in.max_iterations == 0 ) {
diff --git a/tests/utils/matrix_generators.hpp b/tests/utils/matrix_generators.hpp
index be45890c6..65fe789be 100644
--- a/tests/utils/matrix_generators.hpp
+++ b/tests/utils/matrix_generators.hpp
@@ -35,6 +35,7 @@
 #include <iterator>
 #include <algorithm>
 
+#include <graphblas/utils/iterators/utils.hpp>
 
 namespace grb {
 
@@ -114,28 +115,6 @@ namespace grb {
 
 		namespace internal {
 
-			/**
-			 * Computes the difference between \a a and \a b and returns it as the given
-			 * type \a DiffT.
-			 *
-			 * Raises an exception if \a DiffT cannot store the difference.
-			 */
-			template<
-				typename SizeT,
-				typename DiffT
-			>
-			DiffT compute_distance(
-				const SizeT a,
-				const SizeT b
-			) {
-				const SizeT diff = std::max( a, b ) - std::min( a, b );
-				if( diff > static_cast< SizeT >( std::numeric_limits< DiffT >::max() ) ) {
-					throw std::range_error( "cannot represent difference" );
-				}
-				DiffT result = static_cast< DiffT >( diff );
-				return a >= b ? result : -result ;
-			}
-
 			/**
 			 * Stores the coordinate for a generator of diagonal matrices.
 			 */
@@ -240,9 +219,8 @@ namespace grb {
 				typename SelfType::difference_type operator-(
 					const SelfType &other
 				) const {
-					return internal::compute_distance<
-						size_t, typename SelfType::difference_type
-					>( this->_v.coord, other._v.coord );
+					return compute_signed_distance< typename SelfType::difference_type,
+						size_t >( this->_v.coord, other._v.coord );
 				}
 
 				typename SelfType::pointer operator->() { return &_v; }
@@ -461,9 +439,8 @@ namespace grb {
 					const size_t this_position = coords_to_linear( _v.size, _v.row, _v.col );
 					const size_t other_position =
 						coords_to_linear( other._v.size, other._v.row, other._v.col );
-					return internal::compute_distance<
-						size_t, typename SelfType::difference_type
-					>( this_position, other_position );
+					return compute_signed_distance< typename SelfType::difference_type,
+						size_t >( this_position, other_position );
 				}
 
 				typename SelfType::pointer operator->() { return &_v; }
@@ -584,9 +561,8 @@ namespace grb {
 				typename SelfType::difference_type operator-(
 					const SelfType &other
 				) const {
-					return internal::compute_distance<
-						size_t, typename SelfType::difference_type
-					>( this->_v.offset, other._v.offset );
+					return compute_signed_distance< typename SelfType::difference_type,
+						size_t >( this->_v.offset, other._v.offset );
 				}
 
 				typename SelfType::pointer operator->() { return &_v; }

From 11931e18d506bff853aa44d178b08386afadab64 Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Tue, 29 Nov 2022 15:49:03 +0100
Subject: [PATCH 13/28] removing limit to smallest MG system

---
 include/graphblas/algorithms/hpcg/system_builder.hpp     | 2 +-
 .../utils/multigrid/linearized_halo_ndim_system.hpp      | 2 +-
 tests/smoke/hpcg.cpp                                     | 9 +++++++--
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/include/graphblas/algorithms/hpcg/system_builder.hpp b/include/graphblas/algorithms/hpcg/system_builder.hpp
index 48a2e640d..e19ba208d 100644
--- a/include/graphblas/algorithms/hpcg/system_builder.hpp
+++ b/include/graphblas/algorithms/hpcg/system_builder.hpp
@@ -100,7 +100,7 @@ namespace grb {
 					throw std::invalid_argument( "halo should be higher than 0" );
 				}
 				for( const auto i : sizes ) {
-					if( i < 2 * halo + 1 ) {
+					if( i < halo + 1 ) {
 						throw std::invalid_argument( "Iteration halo goes beyond system sizes" );
 					}
 				}
diff --git a/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp b/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp
index d448fd426..400fdd3ab 100644
--- a/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp
+++ b/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp
@@ -108,7 +108,7 @@ namespace grb {
 					_halo( halo )
 				{
 					for( SizeType __size : sizes ) {
-						if ( __size < 2 * halo + 1 ) {
+						if ( __size < halo + 1 ) {
 							throw std::invalid_argument(
 								std::string( "the halo (" + std::to_string(halo) +
 								std::string( ") goes beyond a system size (" ) +
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index c95cfba85..696c177fc 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -24,7 +24,6 @@
  * benchmark impementation in https://github.com/hpcg-benchmark/hpcg.
  */
 
-#include <array>
 #include <cassert>
 #include <cmath>
 #include <cstdlib>
@@ -68,7 +67,7 @@ template< typename T > void print_norm( const grb::Vector< T > &r, const char *
 // default simulation parameters, set as in reference HPCG
 // users can input different ones via the cmd line
 constexpr size_t PHYS_SYSTEM_SIZE_DEF = 16UL;
-constexpr size_t PHYS_SYSTEM_SIZE_MIN = 4UL;
+constexpr size_t PHYS_SYSTEM_SIZE_MIN = 2UL;
 constexpr size_t MAX_COARSENING_LEVELS = 3UL;
 constexpr size_t MAX_ITERATIONS_DEF = 56UL;
 constexpr size_t SMOOTHER_STEPS_DEF = 1;
@@ -289,6 +288,12 @@ static void build_3d_system(
 	// and track the time for diagnostics purposes
 	for( size_t i = 0; i < mg_generators.size(); i++) {
 		MASTER_PRINT( pid, "SYSTEM LEVEL " << i << std::endl );
+		auto& sizes = mg_generators[ i ].get_generator().get_sizes();
+		MASTER_PRINT( pid, " sizes: " );
+		for( size_t s = 0; s < DIMS - 1; s++ ) {
+			MASTER_PRINT( pid,sizes[ s ] << " x " );
+		}
+		MASTER_PRINT( pid, sizes[ DIMS - 1 ] << std::endl );
 		MASTER_PRINT( pid, " populating system matrix: " );
 		timer.reset();
 		grb::RC rc = hpcg_populate_system_matrix( mg_generators[ i ], system_levels.at(i)->A );

From 012f3e82cd2e668a418ea070d2faf7b5a96151f7 Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Wed, 23 Nov 2022 16:48:01 +0100
Subject: [PATCH 14/28] adding average coarsener and invoking it from the
 benchmark (on user's choice)

---
 .../algorithms/hpcg/average_coarsener.hpp     | 349 ++++++++++++++++++
 .../algorithms/hpcg/system_building_utils.hpp |  53 ++-
 tests/smoke/hpcg.cpp                          |  13 +-
 3 files changed, 409 insertions(+), 6 deletions(-)
 create mode 100644 include/graphblas/algorithms/hpcg/average_coarsener.hpp

diff --git a/include/graphblas/algorithms/hpcg/average_coarsener.hpp b/include/graphblas/algorithms/hpcg/average_coarsener.hpp
new file mode 100644
index 000000000..6af5e5ff7
--- /dev/null
+++ b/include/graphblas/algorithms/hpcg/average_coarsener.hpp
@@ -0,0 +1,349 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file average_coarsener.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Utilities to build the coarsening matrix for an HPCG simulation.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_AVERAGE_COARSENER
+#define _H_GRB_ALGORITHMS_AVERAGE_COARSENER
+
+#include <cstddef>
+#include <array>
+#include <iterator>
+#include <stdexcept>
+#include <cmath>
+#include <numeric>
+
+#include <graphblas/utils/multigrid/array_vector_storage.hpp>
+#include <graphblas/utils/multigrid/linearized_ndim_system.hpp>
+
+namespace grb {
+	namespace algorithms {
+
+		// forward declaration
+		template<
+			size_t DIMS,
+			typename CoordType,
+			typename ValueType
+		> class AverageCoarsenerBuilder;
+
+		/**
+		 * Iterator class to generate the coarsening matrix that averages over the elements of the finer
+		 * domain corresponding to the element of the coarser domain.
+		 *
+		 * The coarsening matrix averages \b all elements that are coarsened into one.
+		 *
+		 * This coarsening method requires some computation but should be relatively robust to noise
+		 * or to partitioning strategies to parallelize the smoother (usually run before coarsening).
+		 *
+		 * This iterator is random-access.
+		 *
+		 * @tparam DIMS number of dimensions
+		 * @tparam CoordType type storing the coordinates and the sizes
+		 * @tparam ValueType type of the nonzero: it must be able to represent 1 /
+		 * 	<number of finer elements per coarser elements>
+		 */
+		template<
+			size_t DIMS,
+			typename CoordType,
+			typename ValueType
+		> struct AverageGeneratorIterator {
+
+			friend AverageCoarsenerBuilder< DIMS, CoordType, ValueType >;
+
+			using RowIndexType = CoordType; ///< numeric type of rows
+			using ColumnIndexType = CoordType;
+			using LinearSystemType = grb::utils::multigrid::LinearizedNDimSystem< CoordType,
+				grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > >;
+			using LinearSystemIterType = typename LinearSystemType::Iterator;
+			using SelfType = AverageGeneratorIterator< DIMS, CoordType, ValueType >;
+			using ArrayType = std::array< CoordType, DIMS >;
+
+			struct _ValueGenerator {
+
+				friend SelfType;
+
+				_ValueGenerator(
+					RowIndexType i,
+					ColumnIndexType j,
+					ValueType value
+				) noexcept :
+					_i( i ),
+					_j( j ),
+					_value( value )
+				{}
+
+				_ValueGenerator( const _ValueGenerator & ) = default;
+
+				_ValueGenerator & operator=( const _ValueGenerator & ) = default;
+
+				inline RowIndexType i() const { return _i; }
+				inline ColumnIndexType j() const { return _j; }
+				inline ValueType v() const { return _value; }
+
+			private:
+				RowIndexType _i;
+				ColumnIndexType _j;
+				ValueType _value;
+			};
+
+			// interface for std::random_access_iterator
+			using iterator_category = std::random_access_iterator_tag;
+			using value_type = _ValueGenerator;
+			using pointer = const value_type;
+			using reference = const value_type&;
+			using difference_type = typename LinearSystemIterType::difference_type;
+
+			AverageGeneratorIterator( const SelfType &o ) = default;
+
+			AverageGeneratorIterator( SelfType &&o ) = default;
+
+			SelfType & operator=( const SelfType & ) = default;
+
+			SelfType & operator=( SelfType && ) = default;
+
+			/**
+			 * Advances \c this by 1 in constant time.
+			 */
+			SelfType & operator++() noexcept {
+				(void) ++_subspace_iter;
+				size_t subspace_position = _subspace_iter->get_linear_position();
+				// std::cout << "subspace_position " << subspace_position << std::endl;
+				if( subspace_position == _num_neighbors ) {
+					(void) ++_sys_iter;
+					_subspace_iter = _finer_subspace->begin();
+				}
+				update_coords();
+				return *this;
+			}
+
+			/**
+			 * Advances \c this by \p offset in constant time.
+			 */
+			SelfType & operator+=( size_t offset ) {
+				CoordType sub_offset = _subspace_iter->get_linear_position() + offset;
+				std::ldiv_t res = std::ldiv( sub_offset, _num_neighbors );
+				_sys_iter += res.quot;
+				_subspace_iter = _finer_subspace->begin();
+				_subspace_iter += res.rem;
+				update_coords();
+				return *this;
+			}
+
+			/**
+			 * Computes the difference between \c this and \p o as integer.
+			 */
+			difference_type operator-( const SelfType &o ) const {
+				return this->_sys_iter - o._sys_iter;
+			}
+
+			/**
+			 * Returns whether \c this and \p o differ.
+			 */
+			bool operator!=( const SelfType &o ) const {
+				return this->_sys_iter != o._sys_iter;
+			}
+
+			/**
+			 * Returns whether \c this and \p o are equal.
+			 */
+			bool operator==( const SelfType &o ) const {
+				return ! this->operator!=( o );
+			}
+
+			reference operator*() const {
+				return _val;
+			}
+
+			pointer operator->() const {
+				return &_val;
+			}
+
+			/**
+			 * Returns the current row, within the coarser system.
+			 */
+			inline RowIndexType i() const {
+				return _val.i();
+			}
+
+			/**
+			 * Returns the current column, within the finer system.
+			 */
+			inline ColumnIndexType j() const {
+				return _val.j();
+			}
+
+			/**
+			 * Returns the averaging weight, i.e. 1 / <number of finer elements per coarser element>.
+			 */
+			inline ValueType v() const {
+				return _val.v();
+			}
+
+		private:
+			const LinearSystemType *_lin_sys;
+			const LinearSystemType *_finer_subspace;
+			const ArrayType *_steps;
+			CoordType _num_neighbors;
+			LinearSystemIterType _sys_iter;
+			LinearSystemIterType _subspace_iter;
+			value_type _val;
+
+			/**
+			 * Construct a new AverageGeneratorIterator object starting from the LinearizedNDimSystem
+			 * object \p system describing the \b coarser system and the \b ratios \p steps between each finer and
+			 * the corresponding coarser dimension.
+			 *
+			 * @param system LinearizedNDimSystem object describing the coarser system
+			 * @param finer_subspace LinearizedNDimSystem object describing the subspace of each element
+			 *  in the finer system
+			 * @param steps ratios per dimension between finer and coarser system
+			 */
+			AverageGeneratorIterator(
+				const LinearSystemType &system,
+				const LinearSystemType &finer_subspace,
+				const ArrayType &steps
+			) noexcept :
+				_lin_sys( &system ),
+				_finer_subspace( &finer_subspace ),
+				_steps( &steps ),
+				_num_neighbors( std::accumulate( steps.cbegin(), steps.cend(), 1UL, std::multiplies< CoordType >() ) ),
+				_sys_iter( system.begin() ),
+				_subspace_iter( finer_subspace.begin() ),
+				_val( 0, 0, static_cast< ValueType >( 1 ) / static_cast< ValueType >( _num_neighbors ) )
+			{
+				update_coords();
+			}
+
+			void update_coords() noexcept {
+				_val._i = _sys_iter->get_linear_position();
+				_val._j = coarse_rows_to_finer_col();
+			}
+
+			/**
+			 * Returns the row coordinates converted to the finer system, to compute
+			 * the column value.
+			 */
+			ColumnIndexType coarse_rows_to_finer_col() const noexcept {
+				ColumnIndexType finer = 0;
+				ColumnIndexType s = 1;
+				for( size_t i = 0; i < DIMS; i++ ) {
+					finer += s * _subspace_iter->get_position()[ i ];
+					s *= (*_steps)[ i ];
+					finer += s * _sys_iter->get_position()[ i ];
+					s *= _lin_sys->get_sizes()[ i ];
+				}
+				return finer;
+			}
+		};
+
+		/**
+		 * Builder object to create iterators that generate an averaging-coarsening matrix.
+		 *
+		 * It is a facility to generate beginning and end iterators and abstract the logic away from users.
+		 *
+		 * @tparam DIMS number of dimensions
+		 * @tparam CoordType type storing the coordinates and the sizes
+		 * @tparam ValueType type of the nonzero: it must be able to represent 1 /
+		 *  <number of finer elements per coarser element> (the averaging weight)
+		 */
+		template<
+			size_t DIMS,
+			typename CoordType,
+			typename ValueType
+		> class AverageCoarsenerBuilder {
+		public:
+			using ArrayType = std::array< CoordType, DIMS >;
+			using Iterator = AverageGeneratorIterator< DIMS, CoordType, ValueType >;
+			using SelfType = AverageCoarsenerBuilder< DIMS, CoordType, ValueType >;
+
+			/**
+			 * Construct a new AverageCoarsenerBuilder object from the sizes of finer system
+			 * and those of the coarser system; finer sizes must be an exact multiple of coarser sizes,
+			 * otherwise an exception is raised.
+			 */
+			AverageCoarsenerBuilder(
+				const ArrayType &_finer_sizes,
+				const ArrayType &_coarser_sizes
+			) :
+				system( _coarser_sizes.begin(), _coarser_sizes.end() ),
+				_finer_subspace( _coarser_sizes.cbegin(), _coarser_sizes.cend() ),
+				steps( DIMS )
+			{
+				for( size_t i = 0; i < DIMS; i++ ) {
+					// finer size MUST be an exact multiple of coarser_size
+					std::ldiv_t ratio = std::ldiv( _finer_sizes[ i ], _coarser_sizes[ i ] );
+					if( ratio.quot < 2 || ratio.rem != 0 ) {
+						throw std::invalid_argument(
+							std::string( "finer size of dimension " ) + std::to_string( i ) +
+							std::string( "is not an exact multiple of coarser size" )
+						);
+					}
+					steps[ i ] = ratio.quot;
+				}
+				_finer_subspace.retarget( steps );
+			}
+
+			AverageCoarsenerBuilder( const SelfType & ) = delete;
+
+			AverageCoarsenerBuilder( SelfType && ) = delete;
+
+			SelfType & operator=( const SelfType & ) = delete;
+
+			SelfType & operator=( SelfType && ) = delete;
+
+			/**
+			 * Returns the size of the coarser system, i.e. its number of elements.
+			 */
+			size_t system_size() const {
+				return system.system_size();
+			}
+
+			/**
+			 * Produces a beginning iterator to generate the coarsening matrix.
+			 */
+			Iterator make_begin_iterator() {
+				return Iterator( system, _finer_subspace, steps );
+			}
+
+			/**
+			 * Produces an end iterator to stop the generation of the coarsening matrix.
+			 */
+			Iterator make_end_iterator() {
+				Iterator result( system, _finer_subspace, steps );
+				result += ( system_size() * _finer_subspace.system_size() ); // do not trigger boundary checks
+				// ++result;
+				return result;
+			}
+
+		private:
+			const grb::utils::multigrid::LinearizedNDimSystem< CoordType,
+				grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > > system;
+			grb::utils::multigrid::LinearizedNDimSystem< CoordType,
+				grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > > _finer_subspace;
+
+			grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > steps; ///< array of steps, i.e. how much each column coordinate (finer system) must be
+			//// incremented when incrementing the row coordinates; it is the ratio between
+			//// #finer_sizes and row_generator#physical_sizes
+		};
+
+	} // namespace algorithms
+} // namespace grb
+#endif // _H_GRB_ALGORITHMS_AVERAGE_COARSENER
diff --git a/include/graphblas/algorithms/hpcg/system_building_utils.hpp b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
index c0b522521..9503f77ff 100644
--- a/include/graphblas/algorithms/hpcg/system_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
@@ -39,6 +39,7 @@
 
 #include "system_builder.hpp"
 #include "single_point_coarsener.hpp"
+#include "average_coarsener.hpp"
 #include "greedy_coloring.hpp"
 
 namespace grb {
@@ -168,13 +169,22 @@ namespace grb {
 		 * This function takes care of parallelizing the generation by using a random-access iterator
 		 * to generate the coarsening matrix and by distributing the generation across nodes
 		 * of a distributed system (if any).
+		 * @tparam IterBuilderType type of the matrix builder, either SinglePointCoarsenerBuilder
+		 *  or AverageCoarsenerBuilder
+		 * @tparam DIMS number of dimensions
+		 * @tparam CoordType type storing the coordinates and the sizes
+		 * @tparam NonzeroType type of the nonzero
+		 * @param finer_system_generator object generating the finer system
+		 * @param coarser_system_generator object generating the coarser system
+		 * @param coarsener structure with the matrix to populate
 		 */
 		template<
+			typename IterBuilderType,
 			size_t DIMS,
 			typename CoordType,
 			typename IOType,
 			typename NonzeroType
-		> grb::RC hpcg_populate_coarsener(
+		> grb::RC hpcg_populate_coarsener_any_builder(
 			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &finer_system_generator,
 			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &coarser_system_generator,
 			CoarseningData< IOType, NonzeroType > &coarsener
@@ -201,15 +211,50 @@ namespace grb {
 											" with rows == <coarser size> and cols == <finer size>" );
 			}
 
-			using gen_t = typename grb::algorithms::SinglePointCoarsenerBuilder< DIMS, CoordType, NonzeroType >;
-			gen_t coarsener_builder( finer_sizes, coarser_sizes );
-			typename gen_t::Iterator begin( coarsener_builder.make_begin_iterator() ),
+			IterBuilderType coarsener_builder( finer_sizes, coarser_sizes );
+			typename IterBuilderType::Iterator begin( coarsener_builder.make_begin_iterator() ),
 				end( coarsener_builder.make_end_iterator() );
 			grb::utils::partition_iteration_range_on_procs( spmd<>::nprocs(), spmd<>::pid(),
 				coarsener_builder.system_size(), begin, end );
 			return buildMatrixUnique( M, begin, end, grb::IOMode::PARALLEL );
 		}
 
+		/**
+		 * Populates a coarsener that samples one element every \a 2^DIMS .
+		 */
+		template<
+			size_t DIMS,
+			typename CoordType,
+			typename IOType,
+			typename NonzeroType
+		> grb::RC hpcg_populate_coarsener(
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &finer_system_generator,
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &coarser_system_generator,
+			CoarseningData< IOType, NonzeroType > &coarsener
+		) {
+			return hpcg_populate_coarsener_any_builder<
+				grb::algorithms::SinglePointCoarsenerBuilder< DIMS, CoordType, NonzeroType > >
+				( finer_system_generator, coarser_system_generator, coarsener );
+		}
+
+		/**
+		 * Populates a coarsener that averages over \a 2^DIMS elements.
+		 */
+		template<
+			size_t DIMS,
+			typename CoordType,
+			typename IOType,
+			typename NonzeroType
+		> grb::RC hpcg_populate_coarsener_avg(
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &finer_system_generator,
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &coarser_system_generator,
+			CoarseningData< IOType, NonzeroType > &coarsener
+		) {
+			return hpcg_populate_coarsener_any_builder<
+				grb::algorithms::AverageCoarsenerBuilder< DIMS, CoordType, NonzeroType > >
+				( finer_system_generator, coarser_system_generator, coarsener );
+		}
+
 		namespace internal {
 
 			/**
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index 696c177fc..7edd9a1d4 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -119,6 +119,7 @@ struct simulation_input {
 	size_t nx, ny, nz;
 	size_t max_coarsening_levels;
 	// solver options
+	bool use_average_coarsener;
 	size_t inner_test_repetitions;
 	size_t max_iterations;
 	size_t smoother_steps;
@@ -311,7 +312,11 @@ static void build_3d_system(
 		if( i > 0 ) {
 			MASTER_PRINT( pid, " populating coarsening data: " );
 			timer.reset();
-			rc = hpcg_populate_coarsener( mg_generators[ i - 1 ], mg_generators[ i ], *coarsener_levels[ i - 1 ] );
+			if( !in.use_average_coarsener ) {
+				rc = hpcg_populate_coarsener( mg_generators[ i - 1 ], mg_generators[ i ], *coarsener_levels[ i - 1 ] );
+			} else {
+				rc = hpcg_populate_coarsener_avg( mg_generators[ i - 1 ], mg_generators[ i ], *coarsener_levels[ i - 1 ] );
+			}
 			time = timer.time();
 			ASSERT_RC_SUCCESS( rc );
 			MASTER_PRINT( pid, " time (ms) " << time << std::endl )
@@ -443,6 +448,7 @@ int main( int argc, char ** argv ) {
 	thcout << "System size x: " << sim_in.nx << std::endl;
 	thcout << "System size y: " << sim_in.ny << std::endl;
 	thcout << "System size z: " << sim_in.nz << std::endl;
+	thcout << "Coarsener: " << (sim_in.use_average_coarsener ? "average" : "single point sampler" ) << std::endl;
 	thcout << "System max coarsening levels " << sim_in.max_coarsening_levels << std::endl;
 	thcout << "Test repetitions: " << sim_in.inner_test_repetitions << std::endl;
 	thcout << "Max iterations: " << sim_in.max_iterations << std::endl;
@@ -453,6 +459,7 @@ int main( int argc, char ** argv ) {
 	thcout << "Test outer iterations: " << test_outer_iterations << std::endl;
 	thcout << "Maximum norm for residual: " << max_diff_norm << std::endl;
 
+
 	// the output struct
 	struct output out;
 
@@ -535,7 +542,9 @@ static void parse_arguments(
 		.add_option( "--no-preconditioning", sim_in.no_preconditioning, false,
 			"do not apply pre-conditioning via multi-grid V cycle" )
 		.add_option( "--print-iter-stats", sim_in.print_iter_stats, false,
-			"on each iteration, print more statistics" );
+			"on each iteration, print more statistics" )
+		.add_option( "--use-average-coarsener", sim_in.use_average_coarsener, false,
+			"coarsen by averaging instead of by sampling a single point (slower, but more accurate)" );
 
 	parser.parse( argc, argv );
 

From 2f7c223f0ef48f392f75d02a941c5dd74eec36db Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Thu, 24 Nov 2022 15:46:36 +0100
Subject: [PATCH 15/28] replacing eWiseMulAdd() with eWiseMul() + eWiseApply()

---
 .../algorithms/multigrid/multigrid_cg.hpp     | 64 ++++++++++---------
 1 file changed, 34 insertions(+), 30 deletions(-)

diff --git a/include/graphblas/algorithms/multigrid/multigrid_cg.hpp b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
index 735f87d81..2738864ef 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
@@ -152,13 +152,11 @@ namespace grb {
 			MultiGridrunnerType &multigrid_runner,
 			CGOutInfo< ResidualType > &out_info
 		) {
-			ResidualType alpha;
-
-			const grb::Matrix< NonzeroType > &A = grid_base.A;
+			const grb::Matrix< NonzeroType > &A = grid_base.A; // system matrix
 			grb::Vector< IOType > &r = grid_base.r;  // residual vector
 			grb::Vector< IOType > &z = grid_base.z;  // pre-conditioned residual vector
-			grb::Vector< IOType > &x = cg_data.x;
-			const grb::Vector< InputType > &b = cg_data.b;
+			grb::Vector< IOType > &x = cg_data.x; // initial (and final) solution
+			const grb::Vector< InputType > &b = cg_data.b; // right-side value
 			grb::Vector< IOType > &p = cg_data.p;  // direction vector
 			grb::Vector< IOType > &Ap = cg_data.u; // temp vector
 			grb::RC ret = SUCCESS;
@@ -169,15 +167,17 @@ namespace grb {
 			ret = ret ? ret : grb::set( p, io_zero );
 
 			ret = ret ? ret : grb::set( p, x );
-			ret = ret ? ret : grb::mxv< grb::descriptors::dense >( Ap, A, x, cg_opts.ring ); // Ap = A * x
+			// Ap = A * x
+			ret = ret ? ret : grb::mxv< grb::descriptors::dense >( Ap, A, x, cg_opts.ring );
 			assert( ret == SUCCESS );
-
-			ret = ret ? ret : grb::eWiseApply( r, b, Ap, cg_opts.minus ); // r = b - Ap;
+			// r = b - Ap
+			ret = ret ? ret : grb::eWiseApply( r, b, Ap, cg_opts.minus );
 			assert( ret == SUCCESS );
 
 			const ResidualType residual_zero = cg_opts.ring.template getZero< ResidualType >();
 			ResidualType norm_residual = residual_zero;
-			ret = ret ? ret : grb::dot( norm_residual, r, r, cg_opts.ring ); // norm_residual = r' * r;
+			// norm_residual = r' * r
+			ret = ret ? ret : grb::dot( norm_residual, r, r, cg_opts.ring );
 			assert( ret == SUCCESS );
 
 			// compute sqrt to avoid underflow
@@ -196,7 +196,6 @@ namespace grb {
 			DBG_print_norm( Ap, "start Ap" );
 			DBG_print_norm( r, "start r" );
 #endif
-
 			do {
 #ifdef HPCG_PRINT_STEPS
 				DBG_println( "========= iteration " << iter << " =========" );
@@ -219,58 +218,63 @@ namespace grb {
 #ifdef HPCG_PRINT_STEPS
 				DBG_print_norm( z, "initial z" );
 #endif
-
-				ResidualType pAp;
-
 				if( iter == 0 ) {
 					ret = ret ? ret : grb::set( p, z ); //  p = z;
 					assert( ret == SUCCESS );
-
 					ret = ret ? ret : grb::dot( r_dot_z, r, z, cg_opts.ring ); // r_dot_z = r' * z;
 					assert( ret == SUCCESS );
 				} else {
 					old_r_dot_z = r_dot_z;
-
+					// r_dot_z = r' * z
 					r_dot_z = cg_opts.ring.template getZero< ResidualType >();
-					ret = ret ? ret : grb::dot( r_dot_z, r, z, cg_opts.ring ); // r_dot_z = r' * z;
+					ret = ret ? ret : grb::dot( r_dot_z, r, z, cg_opts.ring );
 					assert( ret == SUCCESS );
 
 					beta = r_dot_z / old_r_dot_z;
-					ret = ret ? ret : grb::set( Ap, io_zero );                         // Ap  = 0;
-					ret = ret ? ret : grb::eWiseMulAdd( Ap, beta, p, z, cg_opts.ring ); // Ap += beta * p + z;
-					std::swap( Ap, p );                                         // p = Ap;
+					// Ap  = 0
+					ret = ret ? ret : grb::set( Ap, io_zero );
+					assert( ret == SUCCESS );
+					// Ap += beta * p
+					ret = ret ? ret : grb::eWiseMul( Ap, beta, p, cg_opts.ring );
+					assert( ret == SUCCESS );
+					// Ap = Ap + z
+					ret = ret ? ret : grb::eWiseApply( Ap, Ap, z, cg_opts.ring.getAdditiveOperator() );
+					assert( ret == SUCCESS );
+					// p = Ap
+					std::swap( Ap, p );
 					assert( ret == SUCCESS );
 				}
 #ifdef HPCG_PRINT_STEPS
 				DBG_print_norm( p, "middle p" );
 #endif
-
+				// Ap = A * p
 				ret = ret ? ret : grb::set( Ap, io_zero );
-				ret = ret ? ret : grb::mxv< grb::descriptors::dense >( Ap, A, p, cg_opts.ring ); // Ap = A * p;
+				ret = ret ? ret : grb::mxv< grb::descriptors::dense >( Ap, A, p, cg_opts.ring );
 				assert( ret == SUCCESS );
 #ifdef HPCG_PRINT_STEPS
 				DBG_print_norm( Ap, "middle Ap" );
 #endif
-				pAp = cg_opts.ring.template getZero< ResidualType >();
-				ret = ret ? ret : grb::dot( pAp, Ap, p, cg_opts.ring ); // pAp = p' * Ap
+				// pAp = p' * Ap
+				ResidualType pAp = cg_opts.ring.template getZero< ResidualType >();
+				ret = ret ? ret : grb::dot( pAp, Ap, p, cg_opts.ring );
 				assert( ret == SUCCESS );
 
-				alpha = r_dot_z / pAp;
-
-				ret = ret ? ret : grb::eWiseMul( x, alpha, p, cg_opts.ring ); // x += alpha * p;
+				ResidualType alpha = r_dot_z / pAp;
+				// x += alpha * p
+				ret = ret ? ret : grb::eWiseMul( x, alpha, p, cg_opts.ring );
 				assert( ret == SUCCESS );
 #ifdef HPCG_PRINT_STEPS
 				DBG_print_norm( x, "end x" );
 #endif
-
-				ret = ret ? ret : grb::eWiseMul( r, -alpha, Ap, cg_opts.ring ); // r += - alpha * Ap;
+				// r += - alpha * Ap
+				ret = ret ? ret : grb::eWiseMul( r, -alpha, Ap, cg_opts.ring );
 				assert( ret == SUCCESS );
 #ifdef HPCG_PRINT_STEPS
 				DBG_print_norm( r, "end r" );
 #endif
-
+				// residual = r' * r
 				norm_residual = cg_opts.ring.template getZero< ResidualType >();
-				ret = ret ? ret : grb::dot( norm_residual, r, r, cg_opts.ring ); // residual = r' * r;
+				ret = ret ? ret : grb::dot( norm_residual, r, r, cg_opts.ring );
 				assert( ret == SUCCESS );
 
 				norm_residual = std::sqrt( norm_residual );

From 216b99612f67cebdf4b621aaec2519475f4296e2 Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Thu, 24 Nov 2022 16:07:49 +0100
Subject: [PATCH 16/28] logging per-iteration MG time and residual separately

---
 .../algorithms/multigrid/multigrid_cg.hpp     | 21 ++++++-------------
 .../multigrid/multigrid_v_cycle.hpp           | 13 +++++++++++-
 tests/smoke/hpcg.cpp                          |  3 ++-
 3 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/include/graphblas/algorithms/multigrid/multigrid_cg.hpp b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
index 2738864ef..4c4e0b0cf 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
@@ -36,7 +36,6 @@
 #include <utility>
 
 #include <graphblas.hpp>
-#include <graphblas/utils/Timer.hpp>
 
 #include "multigrid_data.hpp"
 
@@ -67,7 +66,6 @@ namespace grb {
 			grb::Vector< IOType > x;    ///< system solution being refined over the iterations: it us up to the user
 			///< to set the initial solution value to something meaningful
 
-
 			/**
 			 * Construct a new \c MultiGridCGData object by building its vectors with size \p sys_size.
 			 */
@@ -98,7 +96,7 @@ namespace grb {
 									///< and the result achieved so far returned
 			ResidualType tolerance; ///< ratio between initial residual and current residual that halts the solver
 										///< if reached, for the solution is to be considered "good enough"
-			bool print_iter_stats; ///< whether to print information on the multi-grid and the residual on each iteration
+			bool print_iter_residual; ///< whether to print the iteration number and the residual on each iteration
 			Ring ring; ///< algebraic ring to be used
 			Minus minus; ///< minus operator to be used
 		};
@@ -189,8 +187,6 @@ namespace grb {
 			ResidualType old_r_dot_z = residual_zero, r_dot_z = residual_zero, beta = residual_zero;
 			size_t iter = 0;
 
-			grb::utils::Timer timer;
-
 #ifdef HPCG_PRINT_STEPS
 			DBG_print_norm( p, "start p" );
 			DBG_print_norm( Ap, "start Ap" );
@@ -200,17 +196,12 @@ namespace grb {
 #ifdef HPCG_PRINT_STEPS
 				DBG_println( "========= iteration " << iter << " =========" );
 #endif
+				if( cg_opts.print_iter_residual ) {
+					std::cout << "iteration " << iter;
+				}
 				if( cg_opts.with_preconditioning ) {
-					if( cg_opts.print_iter_stats ) {
-						timer.reset();
-					}
 					ret = ret ? ret : multigrid_runner( grid_base );
 					assert( ret == SUCCESS );
-					if( cg_opts.print_iter_stats ) {
-						double duration = timer.time();
-						std::cout << "iteration, pre-conditioner: " << iter << ","
-							<< duration << std::endl;
-					}
 				} else {
 					ret = ret ? ret : grb::set( z, r ); // z = r;
 					assert( ret == SUCCESS );
@@ -279,8 +270,8 @@ namespace grb {
 
 				norm_residual = std::sqrt( norm_residual );
 
-				if( cg_opts.print_iter_stats ) {
-					std::cout << "iteration, residual: " << iter << "," << norm_residual << std::endl;
+				if( cg_opts.print_iter_residual ) {
+					std::cout << " residual " << norm_residual << std::endl;
 				}
 
 				++iter;
diff --git a/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp b/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
index 963da74d5..f6dbfbd03 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
@@ -33,6 +33,7 @@
 
 #include <graphblas.hpp>
 #include <graphblas/utils/iterators/IteratorValueAdaptor.hpp>
+#include <graphblas/utils/Timer.hpp>
 
 #include "multigrid_data.hpp"
 
@@ -201,6 +202,8 @@ namespace grb {
 			MGSmootherType smoother_runner; ///< object to run the smoother
 			CoarsenerType coarsener_runner; ///< object to run the coarsener
 			std::vector< std::unique_ptr< MultiGridInputType > > system_levels; ///< levels of the grid (finest first)
+			bool print_duration = false; ///< whether to print the duration of a full multi-grid call
+			grb::utils::Timer timer;
 			Ring ring; ///< algebraic ring
 			Minus minus; ///< minus operator
 
@@ -238,11 +241,19 @@ namespace grb {
 			 * Operator to invoke a full multi-grid run starting from the given level.
 			 */
 			inline grb::RC operator()( MultiGridInputType &system ) {
-				return multi_grid< IOType, NonzeroType, __unique_ptr_extractor,
+				if( print_duration ) {
+					timer.reset();
+				}
+				grb::RC ret = multi_grid< IOType, NonzeroType, __unique_ptr_extractor,
 					MGSmootherType, CoarsenerType, Ring, Minus >(
 					__unique_ptr_extractor( system_levels.begin() += system.level ),
 					__unique_ptr_extractor( system_levels.end() ),
 					smoother_runner, coarsener_runner, ring, minus );
+				if( print_duration ) {
+					double duration = timer.time();
+					std::cout << " pre-conditioner (ms) "<< duration;
+				}
+				return ret;
 			}
 		};
 
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index 7edd9a1d4..bba6e9c2f 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -391,7 +391,8 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 
 	// restore CG options to user-given values
 	hpcg_runner.cg_opts.max_iterations = in.max_iterations;
-	hpcg_runner.cg_opts.print_iter_stats = in.print_iter_stats;
+	hpcg_runner.cg_opts.print_iter_residual = in.print_iter_stats;
+	mg_runner.print_duration = in.print_iter_stats;
 	MASTER_PRINT( pid, TEXT_HIGHLIGHT << "beginning solver..." << std::endl );
 	out.inner_test_repetitions = 0;
 	out.times.useful = 0.0;

From ffa7ba321adce69ec691032ce743e17a561a48c7 Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Fri, 25 Nov 2022 14:55:18 +0100
Subject: [PATCH 17/28] allowing colors to start from highest first during
 greedy assignment

---
 .../algorithms/hpcg/greedy_coloring.hpp       | 32 +++++++++++++++----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/include/graphblas/algorithms/hpcg/greedy_coloring.hpp b/include/graphblas/algorithms/hpcg/greedy_coloring.hpp
index 5b6f80b2c..5519a6504 100644
--- a/include/graphblas/algorithms/hpcg/greedy_coloring.hpp
+++ b/include/graphblas/algorithms/hpcg/greedy_coloring.hpp
@@ -53,8 +53,14 @@ namespace grb {
 		 * most sizes, as the constants in front of this algorithms are very small. Implementing a distributed
 		 * coloring algorithm is anyway out of the scope of this prototype.
 		 *
+		 * Colors are by default assigned in a greedy way from the lowest one up, making this coloring scheme very
+		 * regular: close elements tend to have similar colors. This can be changed with \p lowest_color_first
+		 * \c = \c false , which assigns colors from the highest one. This may avoid "destructive interference"
+		 * with following coarsening schemes.
+		 *
 		 * @tparam DIMS dimensions of the system
 		 * @tparam CoordType type of the coordinates
+		 * @tparam lowest_color_first start greedy assignment of colors from lowest first
 		 * @param[in] system generator for an \p DIMS - dimesional system with halo
 		 * @param[out] row_colors if \p reorder_rows_per_color is false, stores the color of each row;
 		 * 	if \p reorder_rows_per_color is true, stores the new position of each row, so that rows
@@ -66,7 +72,8 @@ namespace grb {
 		 */
 		template<
 			size_t DIMS,
-			typename CoordType
+			typename CoordType,
+			bool lowest_color_first = true
 		> void hpcg_greedy_color_ndim_system(
 			const grb::utils::multigrid::LinearizedHaloNDimSystem< DIMS, CoordType > &system,
 			std::vector< CoordType > &row_colors,
@@ -112,11 +119,24 @@ namespace grb {
 				if( currentlyAssigned < totalColors ) {
 					// if there is at least one color left to use, look for it
 					// smallest possible
-					for( CoordType j = 0; j < totalColors; ++j ) {
-						if( !assigned[ j ] ) {
-							// if no neighbor with this color, use it for this row
-							row_colors[ curRow ] = j;
-							break;
+					if( lowest_color_first ) {
+						// here, assign colors greedily starting from the lowest available one
+						for( CoordType j = 0; j < totalColors; ++j ) {
+							if( !assigned[ j ] ) {
+								// if no neighbor with this color, use it for this row
+								row_colors[ curRow ] = j;
+								break;
+							}
+						}
+					} else {
+						// here, assign colors greedily starting from the highest available one
+						for( CoordType j = totalColors; j > 0; --j ) {
+							CoordType color = j - 1;
+							if( !assigned[ color ] ) {
+								// if no neighbor with this color, use it for this row
+								row_colors[ curRow ] = color;
+								break;
+							}
 						}
 					}
 				} else {

From 9c3b2bae5f0caa3022cb9309e61b4e9bdd5bfb73 Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Mon, 28 Nov 2022 17:16:03 +0100
Subject: [PATCH 18/28] adding descriptor template parameter all over to MG
 kernels; adding dense descriptor to HPCG builder; moving zero'ing of vector
 in RBGS smoother to main caller

---
 include/graphblas/algorithms/hpcg/hpcg.hpp    | 28 ++++++------
 .../algorithms/multigrid/multigrid_cg.hpp     | 43 ++++++++++--------
 .../multigrid/multigrid_v_cycle.hpp           | 21 +++++----
 .../multigrid/red_black_gauss_seidel.hpp      | 44 ++++++++++++-------
 .../multigrid/single_matrix_coarsener.hpp     | 29 ++++++------
 tests/smoke/hpcg.cpp                          |  5 ++-
 6 files changed, 100 insertions(+), 70 deletions(-)

diff --git a/include/graphblas/algorithms/hpcg/hpcg.hpp b/include/graphblas/algorithms/hpcg/hpcg.hpp
index c4598323a..b4884f4e1 100644
--- a/include/graphblas/algorithms/hpcg/hpcg.hpp
+++ b/include/graphblas/algorithms/hpcg/hpcg.hpp
@@ -42,6 +42,7 @@ namespace grb {
 
 		// simply "assemble" types
 		template<
+			Descriptor descr,
 			typename IOType,
 			typename ResidualType,
 			typename NonzeroType,
@@ -49,11 +50,11 @@ namespace grb {
 			class Ring,
 			class Minus
 		> using HPCGRunnerType = MultiGridCGRunner< IOType, NonzeroType, InputType, ResidualType,
-			MultiGridRunner< IOType, NonzeroType,
-				RedBlackGSSmootherRunner< IOType, NonzeroType, Ring >,
-				SingleMatrixCoarsener< IOType, NonzeroType, Ring, Minus >,
-				Ring, Minus >,
-			Ring, Minus
+			MultiGridRunner<
+				RedBlackGSSmootherRunner< IOType, NonzeroType, Ring, descr >,
+				SingleMatrixCoarsener< IOType, NonzeroType, Ring, Minus, descr >,
+				IOType, NonzeroType, Ring, Minus, descr
+			>, Ring, Minus, descr
 		>;
 
 		/**
@@ -63,26 +64,27 @@ namespace grb {
 		 * @param[in] smoother_steps how many times the smoother should run (both pre- and post-smoothing)
 		 */
 		template<
+			Descriptor descr,
 			typename IOType,
 			typename ResidualType,
 			typename NonzeroType,
 			typename InputType,
 			class Ring,
 			class Minus
-		> HPCGRunnerType< IOType, ResidualType, NonzeroType, InputType, Ring, Minus >
+		> HPCGRunnerType< descr, IOType, ResidualType, NonzeroType, InputType, Ring, Minus >
 			build_hpcg_runner( size_t smoother_steps ) {
 
-			SingleMatrixCoarsener< IOType, NonzeroType, Ring, Minus > coarsener;
-			RedBlackGSSmootherRunner< IOType, NonzeroType, Ring >
+			SingleMatrixCoarsener< IOType, NonzeroType, Ring, Minus, descr > coarsener;
+			RedBlackGSSmootherRunner< IOType, NonzeroType, Ring, descr >
 				smoother( { smoother_steps, smoother_steps, 1UL, {}, Ring() } );
 
-			MultiGridRunner< IOType, NonzeroType,
-				RedBlackGSSmootherRunner< IOType, NonzeroType, Ring >,
-				SingleMatrixCoarsener< IOType, NonzeroType, Ring, Minus >,
-				Ring, Minus
+			MultiGridRunner<
+				RedBlackGSSmootherRunner< IOType, NonzeroType, Ring, descr >,
+				SingleMatrixCoarsener< IOType, NonzeroType, Ring, Minus, descr >,
+				IOType, NonzeroType, Ring, Minus, descr
 			> mg_runner( std::move( smoother ), std::move( coarsener ) );
 
-			return HPCGRunnerType< IOType, ResidualType, NonzeroType, InputType, Ring, Minus >(
+			return HPCGRunnerType< descr, IOType, ResidualType, NonzeroType, InputType, Ring, Minus >(
 				std::move( mg_runner ) );
 		}
 
diff --git a/include/graphblas/algorithms/multigrid/multigrid_cg.hpp b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
index 4c4e0b0cf..c517c8cc4 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
@@ -120,6 +120,7 @@ namespace grb {
 		 * Failures of GraphBLAS operations are handled by immediately stopping the execution and by returning
 		 * the failure code.
 		 *
+		 * @tparam descr descriptor for static information
 		 * @tparam IOType type of result and intermediate vectors used during computation
 		 * @tparam ResidualType type of the residual norm
 		 * @tparam NonzeroType type of matrix values
@@ -136,6 +137,7 @@ namespace grb {
 		 * @return grb::RC SUCCESS in case of succesful run
 		 */
 		template<
+			Descriptor descr,
 			typename IOType,
 			typename ResidualType,
 			typename NonzeroType,
@@ -166,16 +168,16 @@ namespace grb {
 
 			ret = ret ? ret : grb::set( p, x );
 			// Ap = A * x
-			ret = ret ? ret : grb::mxv< grb::descriptors::dense >( Ap, A, x, cg_opts.ring );
+			ret = ret ? ret : grb::mxv< descr >( Ap, A, x, cg_opts.ring );
 			assert( ret == SUCCESS );
 			// r = b - Ap
-			ret = ret ? ret : grb::eWiseApply( r, b, Ap, cg_opts.minus );
+			ret = ret ? ret : grb::eWiseApply< descr >( r, b, Ap, cg_opts.minus );
 			assert( ret == SUCCESS );
 
 			const ResidualType residual_zero = cg_opts.ring.template getZero< ResidualType >();
 			ResidualType norm_residual = residual_zero;
 			// norm_residual = r' * r
-			ret = ret ? ret : grb::dot( norm_residual, r, r, cg_opts.ring );
+			ret = ret ? ret : grb::dot< descr >( norm_residual, r, r, cg_opts.ring );
 			assert( ret == SUCCESS );
 
 			// compute sqrt to avoid underflow
@@ -203,33 +205,36 @@ namespace grb {
 					ret = ret ? ret : multigrid_runner( grid_base );
 					assert( ret == SUCCESS );
 				} else {
-					ret = ret ? ret : grb::set( z, r ); // z = r;
+					// z = r
+					ret = ret ? ret : grb::set( z, r );
 					assert( ret == SUCCESS );
 				}
 #ifdef HPCG_PRINT_STEPS
 				DBG_print_norm( z, "initial z" );
 #endif
 				if( iter == 0 ) {
-					ret = ret ? ret : grb::set( p, z ); //  p = z;
+					//  p = z
+					ret = ret ? ret : grb::set< descr >( p, z );
 					assert( ret == SUCCESS );
-					ret = ret ? ret : grb::dot( r_dot_z, r, z, cg_opts.ring ); // r_dot_z = r' * z;
+					// r_dot_z = r' * z
+					ret = ret ? ret : grb::dot< descr >( r_dot_z, r, z, cg_opts.ring );
 					assert( ret == SUCCESS );
 				} else {
 					old_r_dot_z = r_dot_z;
 					// r_dot_z = r' * z
 					r_dot_z = cg_opts.ring.template getZero< ResidualType >();
-					ret = ret ? ret : grb::dot( r_dot_z, r, z, cg_opts.ring );
+					ret = ret ? ret : grb::dot< descr >( r_dot_z, r, z, cg_opts.ring );
 					assert( ret == SUCCESS );
 
 					beta = r_dot_z / old_r_dot_z;
 					// Ap  = 0
-					ret = ret ? ret : grb::set( Ap, io_zero );
+					ret = ret ? ret : grb::set< descr >( Ap, io_zero );
 					assert( ret == SUCCESS );
 					// Ap += beta * p
-					ret = ret ? ret : grb::eWiseMul( Ap, beta, p, cg_opts.ring );
+					ret = ret ? ret : grb::eWiseMul< descr >( Ap, beta, p, cg_opts.ring );
 					assert( ret == SUCCESS );
 					// Ap = Ap + z
-					ret = ret ? ret : grb::eWiseApply( Ap, Ap, z, cg_opts.ring.getAdditiveOperator() );
+					ret = ret ? ret : grb::eWiseApply< descr >( Ap, Ap, z, cg_opts.ring.getAdditiveOperator() );
 					assert( ret == SUCCESS );
 					// p = Ap
 					std::swap( Ap, p );
@@ -239,33 +244,33 @@ namespace grb {
 				DBG_print_norm( p, "middle p" );
 #endif
 				// Ap = A * p
-				ret = ret ? ret : grb::set( Ap, io_zero );
-				ret = ret ? ret : grb::mxv< grb::descriptors::dense >( Ap, A, p, cg_opts.ring );
+				ret = ret ? ret : grb::set< descr >( Ap, io_zero );
+				ret = ret ? ret : grb::mxv< descr >( Ap, A, p, cg_opts.ring );
 				assert( ret == SUCCESS );
 #ifdef HPCG_PRINT_STEPS
 				DBG_print_norm( Ap, "middle Ap" );
 #endif
 				// pAp = p' * Ap
 				ResidualType pAp = cg_opts.ring.template getZero< ResidualType >();
-				ret = ret ? ret : grb::dot( pAp, Ap, p, cg_opts.ring );
+				ret = ret ? ret : grb::dot< descr >( pAp, Ap, p, cg_opts.ring );
 				assert( ret == SUCCESS );
 
 				ResidualType alpha = r_dot_z / pAp;
 				// x += alpha * p
-				ret = ret ? ret : grb::eWiseMul( x, alpha, p, cg_opts.ring );
+				ret = ret ? ret : grb::eWiseMul< descr >( x, alpha, p, cg_opts.ring );
 				assert( ret == SUCCESS );
 #ifdef HPCG_PRINT_STEPS
 				DBG_print_norm( x, "end x" );
 #endif
 				// r += - alpha * Ap
-				ret = ret ? ret : grb::eWiseMul( r, -alpha, Ap, cg_opts.ring );
+				ret = ret ? ret : grb::eWiseMul< descr >( r, -alpha, Ap, cg_opts.ring );
 				assert( ret == SUCCESS );
 #ifdef HPCG_PRINT_STEPS
 				DBG_print_norm( r, "end r" );
 #endif
 				// residual = r' * r
 				norm_residual = cg_opts.ring.template getZero< ResidualType >();
-				ret = ret ? ret : grb::dot( norm_residual, r, r, cg_opts.ring );
+				ret = ret ? ret : grb::dot< descr >( norm_residual, r, r, cg_opts.ring );
 				assert( ret == SUCCESS );
 
 				norm_residual = std::sqrt( norm_residual );
@@ -299,6 +304,7 @@ namespace grb {
 		 * @tparam MultiGridrunnerType type for the multi-grid runner object
 		 * @tparam Ring algebraic ring type
 		 * @tparam Minus minus operator
+		 * @tparam descr descriptors with statically-known data for computation and containers
 		 */
 		template<
 			typename IOType,
@@ -307,7 +313,8 @@ namespace grb {
 			typename ResidualType,
 			typename MultiGridRunnerType,
 			class Ring,
-			class Minus
+			class Minus,
+			Descriptor descr = descriptors::no_operation
 		> struct MultiGridCGRunner {
 
 			using HPCGInputType = MultiGridCGData< IOType, NonzeroType, InputType >;
@@ -348,7 +355,7 @@ namespace grb {
 				MultiGridCGData< IOType, NonzeroType, InputType > &cg_data,
 				CGOutInfo< ResidualType > &out_info
 			) {
-				return multigrid_conjugate_gradient( cg_data, cg_opts, grid_base, mg_runner, out_info );
+				return multigrid_conjugate_gradient< descr >( cg_data, cg_opts, grid_base, mg_runner, out_info );
 			}
 
 		};
diff --git a/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp b/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
index f6dbfbd03..177027f3e 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
@@ -58,6 +58,7 @@ namespace grb {
 		 * Failuers of GraphBLAS operations are handled by immediately stopping the execution
 		 * and returning the failure code.
 		 *
+		 * @tparam descr descriptor for static information
 		 * @tparam IOType type of result and intermediate vectors used during computation
 		 * @tparam NonzeroType type of matrix values
 		 * @tparam MGSysIterType type of the iterator across grid levels
@@ -78,6 +79,7 @@ namespace grb {
 		 *  unsuccessful operation otherwise
 		 */
 		template <
+			Descriptor descr,
 			typename IOType,
 			typename NonzeroType,
 			typename MGSysIterType,
@@ -107,7 +109,7 @@ namespace grb {
 #endif
 
 			// clean destination vector
-			ret = ret ? ret : grb::set( finer_system.z, ring. template getZero< IOType >() );
+			ret = ret ? ret : grb::set< descr >( finer_system.z, ring. template getZero< IOType >() );
 #ifdef HPCG_PRINT_STEPS
 			DBG_print_norm( finer_system.r, "initial r" );
 #endif
@@ -136,7 +138,7 @@ namespace grb {
 			DBG_print_norm( coarser_system.r, "coarse r" );
 #endif
 
-			ret = ret ? ret : multi_grid< IOType, NonzeroType, MGSysIterType,
+			ret = ret ? ret : multi_grid< descr, IOType, NonzeroType, MGSysIterType,
 				MGSmootherType, CoarsenerType, Ring, Minus >( mgiter_begin, mgiter_end,
 				smoother, coarsener, ring, minus );
 			assert( ret == SUCCESS );
@@ -165,23 +167,24 @@ namespace grb {
 		 * It is built by transferring into it the state of both the smoother and the coarsener,
 		 * in order to avoid use-after-free issues.
 		 *
-		 * @tparam IOType type of result and intermediate vectors used during computation
-		 * @tparam NonzeroType type of matrix values
-		 * @tparam MGSysIterType type of the iterator across grid levels
 		 * @tparam MGSmootherType type of the smoother runner, with prescribed methods for the various
 		 *  smoothing steps
 		 * @tparam CoarsenerType type of the coarsener runner, with prescribed methods for coarsening
 		 *  and prolongation
+		 * @tparam IOType type of result and intermediate vectors used during computation
+		 * @tparam NonzeroType type of matrix values
 		 * @tparam Ring the ring of algebraic operators and zero values
 		 * @tparam Minus the minus operator for subtractions
+		 * @tparam descr descriptors with statically-known data for computation and containers
 		 */
 		template<
-			typename IOType,
-			typename NonzeroType,
 			typename MGSmootherType,
 			typename CoarsenerType,
+			typename IOType,
+			typename NonzeroType,
 			class Ring,
-			class Minus
+			class Minus,
+			Descriptor descr = descriptors::no_operation
 		> struct MultiGridRunner {
 
 			static_assert( std::is_default_constructible< Ring >::value,
@@ -244,7 +247,7 @@ namespace grb {
 				if( print_duration ) {
 					timer.reset();
 				}
-				grb::RC ret = multi_grid< IOType, NonzeroType, __unique_ptr_extractor,
+				grb::RC ret = multi_grid< descr, IOType, NonzeroType, __unique_ptr_extractor,
 					MGSmootherType, CoarsenerType, Ring, Minus >(
 					__unique_ptr_extractor( system_levels.begin() += system.level ),
 					__unique_ptr_extractor( system_levels.end() ),
diff --git a/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp b/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
index 97d0c80e4..3193b46fe 100644
--- a/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
+++ b/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
@@ -64,6 +64,7 @@ namespace grb {
 			/**
 			 * Runs a single step of Red-Black Gauss-Seidel for a specific color.
 			 *
+			 * @tparam descr descriptor for static information
 			 * @tparam IOType type of result and intermediate vectors used during computation
 			 * @tparam NonzeroType type of matrix values
 			 * @tparam Ring the ring of algebraic operators zero-values
@@ -79,6 +80,7 @@ namespace grb {
 			 *  unsuccessful operation otherwise
 			 */
 			template<
+				Descriptor descr,
 				typename IOType,
 				typename NonzeroType,
 				class Ring
@@ -92,10 +94,12 @@ namespace grb {
 				const Ring & ring
 			) {
 				RC ret = SUCCESS;
-				ret = ret ? ret : grb::set( smoother_temp, ring. template getZero< IOType >() );
 
-				// acc_temp[mask] = A[mask] * x[mask]
-				ret = ret ? ret : grb::mxv< grb::descriptors::safe_overlap >( smoother_temp, color_mask, A, x, ring );
+				// smoother_temp[color_mask] = A[color_mask] * x[color_mask]
+				// use the structural descriptors, assuming ONLY the values of the current color are set
+				// note that if this assumption does not hold, also the following eWiseLambda() is wrong
+				ret = ret ? ret : grb::mxv< grb::descriptors::safe_overlap | grb::descriptors::structural >(
+					smoother_temp, color_mask, A, x, ring );
 				assert( ret == SUCCESS );
 
 				// TODO internal issue #201
@@ -106,13 +110,10 @@ namespace grb {
 					grb::eWiseLambda(
 						[ &x, &r, &smoother_temp, &color_mask, &A_diagonal ]( const size_t i ) {
 							// if the mask was properly initialized, the check on the mask value is unnecessary;
-							// nonetheless, it is left not to violate the semantics of RBGS in case also the false values
-							// had been initialized (in which case the check is fundamental); if only true values were initialized,
-							// we expect CPU branch prediction to neutralize the branch cost
 							// if( color_mask[ i ] ) {
-								IOType d = A_diagonal[ i ];
-								IOType v = r[ i ] - smoother_temp[ i ] + x[ i ] * d;
-								x[ i ] = v / d;
+							IOType d = A_diagonal[ i ];
+							IOType v = r[ i ] - smoother_temp[ i ] + x[ i ] * d;
+							x[ i ] = v / d;
 							// }
 						},
 						color_mask, x, r, smoother_temp, A_diagonal );
@@ -130,6 +131,7 @@ namespace grb {
 			 * and no check is performed to ensure these assumptions hold. Hence, it is up to user logic
 			 * to pass correct coloring information. Otherwise, \b no guarantees hold on the result.
 			 *
+			 * @tparam descr descriptor for static information
 			 * @tparam IOType type of result and intermediate vectors used during computation
 			 * @tparam NonzeroType type of matrix values
 			 * @tparam Ring the ring of algebraic operators zero-values
@@ -140,6 +142,7 @@ namespace grb {
 			 *                          unsuccessful operation otherwise
 			 */
 			template<
+				Descriptor descr,
 				typename IOType,
 				typename NonzeroType,
 				class Ring
@@ -149,19 +152,27 @@ namespace grb {
 				const Ring & ring
 			) {
 				RC ret = SUCCESS;
+				// zero the temp output just once, assuming proper masking avoids
+				// interference among different colors
+				ret = ret ? ret : grb::set< descr >( smoothing_info.smoother_temp,
+					ring. template getZero< IOType >() );
+
 				// forward step
 				using cit_t = typename std::vector< grb::Vector< bool > >::const_iterator;
 				cit_t end = smoothing_info.color_masks.cend();
 				for( cit_t it = smoothing_info.color_masks.cbegin(); it != end && ret == SUCCESS; ++it ) {
-					ret = rbgs_single_step( data.A, smoothing_info.A_diagonal, data.r, data.z,
-						smoothing_info.smoother_temp, *it, ring );
+					ret = rbgs_single_step< descr >( data.A, smoothing_info.A_diagonal, data.r,
+						data.z, smoothing_info.smoother_temp, *it, ring );
 				}
+				ret = ret ? ret : grb::set< descr >( smoothing_info.smoother_temp,
+					ring. template getZero< IOType >() );
+
 				// backward step
 				using crit_t = typename std::vector< grb::Vector< bool > >::const_reverse_iterator;
 				crit_t rend = smoothing_info.color_masks.crend();
 				for( crit_t rit = smoothing_info.color_masks.crbegin(); rit != rend && ret == SUCCESS; ++rit ) {
-					ret = rbgs_single_step( data.A, smoothing_info.A_diagonal, data.r, data.z,
-						smoothing_info.smoother_temp, *rit, ring );
+					ret = rbgs_single_step< descr >( data.A, smoothing_info.A_diagonal, data.r,
+						data.z, smoothing_info.smoother_temp, *rit, ring );
 				}
 				return ret;
 			}
@@ -177,11 +188,13 @@ namespace grb {
 		 * @tparam IOType type of result and intermediate vectors used during computation
 		 * @tparam NonzeroType type of matrix values
 		 * @tparam Ring the ring of algebraic operators
+		 * @tparam descr descriptors with statically-known data for computation and containers
 		 */
 		template <
 			typename IOType,
 			typename NonzeroType,
-			class Ring
+			class Ring,
+			Descriptor descr = descriptors::no_operation
 		> struct RedBlackGSSmootherRunner {
 
 			size_t presmoother_steps; ///< number of pre-smoother steps
@@ -224,7 +237,8 @@ namespace grb {
 				SmootherData< IOType > &smoothing_info = *( levels.at( data.level ).get() );
 
 				for( size_t i = 0; i < smoother_steps && ret == SUCCESS; i++ ) {
-					ret = ret ? ret : internal::red_black_gauss_seidel( data, smoothing_info, ring );
+					ret = ret ? ret : internal::red_black_gauss_seidel< descr >(
+						data, smoothing_info, ring );
 					assert( ret == SUCCESS );
 				}
 				return ret;
diff --git a/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp b/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
index e1ef7db73..f2b008e6f 100644
--- a/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
+++ b/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
@@ -69,6 +69,7 @@ namespace grb {
 			 *
 			 * The coarsening information are stored inside \p CoarseningData.
 			 *
+			 * @tparam descr descriptor for static information
 			 * @tparam IOType type of result and intermediate vectors used during computation
 			 * @tparam NonzeroType type of matrix values
 			 * @tparam Ring the ring of algebraic operators zero-values
@@ -82,6 +83,7 @@ namespace grb {
 			 *                          unsuccessful operation otherwise
 			 */
 			template<
+				Descriptor descr,
 				typename IOType,
 				typename NonzeroType,
 				class Ring,
@@ -94,16 +96,14 @@ namespace grb {
 				const Minus & minus
 			) {
 				RC ret = SUCCESS;
-				// DBG_print_norm( coarsening_data.Ax_finer, "+++ Ax_finer prima" );
-				ret = ret ? ret : grb::eWiseApply( coarsening_data.Ax_finer, r_fine,
+				ret = ret ? ret : grb::eWiseApply< descr >( coarsening_data.Ax_finer, r_fine,
 					coarsening_data.Ax_finer, minus ); // Ax_finer = r_fine - Ax_finer
-				// DBG_print_norm( coarsening_data.Ax_finer, "+++ Ax_finer dopo" );
 				assert( ret == SUCCESS );
 
 				// actual coarsening, from  ncols(*coarsening_data->A) == *coarsening_data->system_size * 8
 				// to *coarsening_data->system_size
-				ret = ret ? ret : grb::set( r_coarse, ring.template getZero< IOType >() );
-				ret = ret ? ret : grb::mxv< grb::descriptors::dense >( r_coarse, coarsening_data.coarsening_matrix,
+				ret = ret ? ret : grb::set< descr >( r_coarse, ring.template getZero< IOType >() );
+				ret = ret ? ret : grb::mxv< descr >( r_coarse, coarsening_data.coarsening_matrix,
 					coarsening_data.Ax_finer, ring ); // r = coarsening_matrix * Ax_finer
 				return ret;
 			}
@@ -114,6 +114,7 @@ namespace grb {
 			 *
 			 * For prolongation, this function uses the matrix \p coarsening_data.coarsening_matrix by transposing it.
 			 *
+			 * @tparam descr descriptor for static information
 			 * @tparam IOType type of result and intermediate vectors used during computation
 			 * @tparam NonzeroType type of matrix values
 			 * @tparam Ring the ring of algebraic operators zero-values
@@ -125,6 +126,7 @@ namespace grb {
 			 * unsuccessful operation otherwise
 			 */
 			template<
+				Descriptor descr,
 				typename IOType,
 				typename NonzeroType,
 				class Ring
@@ -137,13 +139,13 @@ namespace grb {
 				RC ret = SUCCESS;
 				// actual refining, from  *coarsening_data->syztem_size == nrows(*coarsening_data->A) / 8
 				// to nrows(x_fine)
-				ret = ret ? ret : set( coarsening_data.Ax_finer, 0 );
+				ret = ret ? ret : grb::set< descr >( coarsening_data.Ax_finer, ring.template getZero< IOType >() );
 
-				ret = ret ? ret : grb::mxv< grb::descriptors::transpose_matrix | grb::descriptors::dense >(
+				ret = ret ? ret : grb::mxv< descr | grb::descriptors::transpose_matrix >(
 					coarsening_data.Ax_finer, coarsening_data.coarsening_matrix, z_coarse, ring );
 				assert( ret == SUCCESS );
 
-				ret = ret ? ret : grb::foldl( x_fine, coarsening_data.Ax_finer, ring.getAdditiveMonoid() ); // x_fine += Ax_finer;
+				ret = ret ? ret : grb::foldl< descr >( x_fine, coarsening_data.Ax_finer, ring.getAdditiveMonoid() ); // x_fine += Ax_finer;
 				assert( ret == SUCCESS );
 				return ret;
 			}
@@ -160,7 +162,8 @@ namespace grb {
 			typename IOType,
 			typename NonzeroType,
 			class Ring,
-			class Minus
+			class Minus,
+			Descriptor descr = descriptors::no_operation
 		> struct SingleMatrixCoarsener {
 
 			static_assert( std::is_default_constructible< Ring >::value,
@@ -189,10 +192,10 @@ namespace grb {
 			) {
 				// first compute the residual
 				CoarseningData< IOType, NonzeroType > &coarsener = *coarsener_levels[ finer.level ];
-				grb::RC ret = grb::set( coarsener.Ax_finer, ring. template getZero< IOType >() );
-				ret = ret ? ret : grb::mxv< grb::descriptors::dense >( coarsener.Ax_finer, finer.A, finer.z, ring );
+				grb::RC ret = grb::set< descr >( coarsener.Ax_finer, ring. template getZero< IOType >() );
+				ret = ret ? ret : grb::mxv< descr >( coarsener.Ax_finer, finer.A, finer.z, ring );
 
-				return internal::compute_coarsening( finer.r, coarser.r, coarsener, ring, minus );
+				return internal::compute_coarsening< descr >( finer.r, coarser.r, coarsener, ring, minus );
 			}
 
 			/**
@@ -203,7 +206,7 @@ namespace grb {
 				const MultiGridInputType &coarser,
 				MultiGridInputType &finer
 			) {
-				return internal::compute_prolongation( coarser.z, finer.z, *coarsener_levels[ finer.level ], ring );
+				return internal::compute_prolongation< descr >( coarser.z, finer.z, *coarsener_levels[ finer.level ], ring );
 			}
 		};
 
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index bba6e9c2f..f7cd05787 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -97,9 +97,10 @@ using StdRing = Semiring< grb::operators::add< NonzeroType >, grb::operators::mu
 	grb::identities::zero, grb::identities::one >;
 using StdMinus = operators::subtract< NonzeroType >;
 using coord_t = size_t;
+constexpr Descriptor hpcg_desc = descriptors::dense;
 
 // assembled types for simulation runners and input/output structures
-using hpcg_runner_t = HPCGRunnerType< IOType, NonzeroType, InputType, ResidualType,
+using hpcg_runner_t = HPCGRunnerType< hpcg_desc, IOType, NonzeroType, InputType, ResidualType,
 	StdRing, StdMinus >;
 using mg_data_t = MultiGridData< IOType, NonzeroType >;
 using coarsening_data_t = CoarseningData< IOType, NonzeroType >;
@@ -338,7 +339,7 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	std::unique_ptr< hpcg_data_t > hpcg_state;
 
 	// define the main HPCG runner and initialize the options of its components
-	hpcg_runner_t hpcg_runner( build_hpcg_runner< IOType, NonzeroType, InputType, ResidualType,
+	hpcg_runner_t hpcg_runner( build_hpcg_runner< hpcg_desc, IOType, NonzeroType, InputType, ResidualType,
 		StdRing, StdMinus >( in.smoother_steps ) );
 	auto &mg_runner = hpcg_runner.mg_runner;
 	auto &coarsener = mg_runner.coarsener_runner;

From 571fdd5391880fe522146eecb5b8753b9a0c2067 Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Wed, 22 Feb 2023 21:02:36 +0100
Subject: [PATCH 19/28] adding telemetry functionalities: * token to
 enable/disable all functionalities at compile time/run time * stopwatch to
 measure elapsed time * output stream to selectively log information
 (depending on token) * CSV writer object to emit tracing info in a convenient
 format

---
 .../utils/iterators/IteratorValueAdaptor.hpp  |   8 +-
 .../graphblas/utils/telemetry/CSVWriter.hpp   | 293 ++++++++++++++++++
 .../utils/telemetry/OutputStream.hpp          | 152 +++++++++
 .../graphblas/utils/telemetry/Stopwatch.hpp   | 143 +++++++++
 .../graphblas/utils/telemetry/Telemetry.hpp   |  32 ++
 .../utils/telemetry/TelemetryBase.hpp         |  98 ++++++
 .../utils/telemetry/TelemetryToken.hpp        | 145 +++++++++
 .../graphblas/utils/telemetry/Timeable.hpp    | 101 ++++++
 8 files changed, 968 insertions(+), 4 deletions(-)
 create mode 100644 include/graphblas/utils/telemetry/CSVWriter.hpp
 create mode 100644 include/graphblas/utils/telemetry/OutputStream.hpp
 create mode 100644 include/graphblas/utils/telemetry/Stopwatch.hpp
 create mode 100644 include/graphblas/utils/telemetry/Telemetry.hpp
 create mode 100644 include/graphblas/utils/telemetry/TelemetryBase.hpp
 create mode 100644 include/graphblas/utils/telemetry/TelemetryToken.hpp
 create mode 100644 include/graphblas/utils/telemetry/Timeable.hpp

diff --git a/include/graphblas/utils/iterators/IteratorValueAdaptor.hpp b/include/graphblas/utils/iterators/IteratorValueAdaptor.hpp
index bca870af8..2c0383325 100644
--- a/include/graphblas/utils/iterators/IteratorValueAdaptor.hpp
+++ b/include/graphblas/utils/iterators/IteratorValueAdaptor.hpp
@@ -56,8 +56,8 @@ namespace grb {
 			static_assert( std::is_copy_assignable< AdaptorType >::value,
 				"AdaptorType must be copy-assignable" );
 
-			typedef decltype( std::declval< AdaptorType >()( *std::declval< InnerIterType >() ) ) reference;
-			typedef typename std::decay< reference >::type value_type;
+			typedef typename std::decay< decltype( *std::declval< AdaptorType >()( *std::declval< InnerIterType >() ) ) >::type value_type;
+			typedef value_type & reference;
 			typedef value_type * pointer;
 			typedef const value_type * const_pointer;
 			typedef typename std::iterator_traits< InnerIterType >::iterator_category iterator_category;
@@ -129,9 +129,9 @@ namespace grb {
 
 			bool operator==( const SelfType & o ) const { return ! operator!=( o ); }
 
-			reference operator*() { return adaptor( *iter ); }
+			reference operator*() { return *adaptor( *iter ); }
 
-			const reference operator*() const { return adaptor( *iter ); }
+			const reference operator*() const { return *adaptor( *iter ); }
 
 			pointer operator->() { return adaptor( *iter ); }
 
diff --git a/include/graphblas/utils/telemetry/CSVWriter.hpp b/include/graphblas/utils/telemetry/CSVWriter.hpp
new file mode 100644
index 000000000..969b73be8
--- /dev/null
+++ b/include/graphblas/utils/telemetry/CSVWriter.hpp
@@ -0,0 +1,293 @@
+
+/*
+ *   Copyright 2023 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author Alberto Scolari
+ * @date 14th February, 2023
+ */
+
+#ifndef _H_GRB_UTILS_TELEMETRY_CSV_WRITER
+#define _H_GRB_UTILS_TELEMETRY_CSV_WRITER
+
+#include <type_traits>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <initializer_list>
+#include <ostream>
+#include <stdexcept>
+#include <fstream>
+#include <utility>
+
+#include "TelemetryBase.hpp"
+
+namespace grb {
+	namespace utils {
+		namespace telemetry {
+
+			static constexpr char STD_CSV_SEP = ',';
+
+			template<
+				typename TelTokenType,
+				bool enabled,
+				class T1,
+				class ...Ts
+			> class CSVWriter : public TelemetryBase< TelTokenType, enabled > {
+			public:
+				template< class U, class ...Us > struct is_csv_printable {
+					static constexpr bool value = std::is_arithmetic< U >::value;
+				};
+
+				template< class U1, class U2, class ...Us > struct is_csv_printable< U1, U2, Us...>  {
+					static constexpr bool value = is_csv_printable< U1 >::value && is_csv_printable< U2, Us... >::value;
+				};
+
+				static_assert( is_csv_printable< T1, Ts... >::value, "not all types are printable" );
+
+				using self_t = CSVWriter< TelTokenType, enabled, T1, Ts... >;
+
+				using base_t = TelemetryBase< TelTokenType, enabled >;
+
+				CSVWriter() = delete;
+
+				CSVWriter(
+					const TelTokenType & tt,
+					std::initializer_list< const char * > _headers,
+					char _separator,
+					size_t size
+				) :
+					base_t( tt )
+				 {
+					( void ) tt;
+					( void ) _headers;
+					( void ) _separator;
+					( void ) size;
+				}
+
+				CSVWriter( const TelTokenType & tt, std::initializer_list< const char * > _headers ) :
+					CSVWriter( tt, _headers, STD_CSV_SEP, 10 )
+				{}
+
+				CSVWriter( const self_t & ) = delete;
+
+				CSVWriter( self_t && ) = delete;
+
+				self_t & operator=( const self_t & ) = delete;
+
+				self_t & operator=( self_t && ) = delete;
+
+				template< class... UTypes > void add_line( UTypes&&... ) {
+				}
+
+				void clear() {}
+
+				std::ostream & write_last_line_to_stream( std::ostream & stream ) const {
+					return stream;
+				}
+
+				// print nothing
+				char last_line() const { return '\0'; }
+
+				std::ostream & write_to_stream( std::ostream & stream ) const {
+					return stream;
+				}
+
+				void write_to_file( const char * name ) const {
+					( void ) name;
+				}
+			};
+
+
+			template<
+				typename TelTokenType,
+				class T1,
+				class ...Ts
+			> class CSVWriter< TelTokenType, true, T1, Ts... >  : public TelemetryBase< TelTokenType, true > {
+			public:
+				template< class U, class ...Us > struct is_csv_printable {
+					static constexpr bool value = std::is_arithmetic< U >::value;
+				};
+
+				template< class U1, class U2, class ...Us > struct is_csv_printable< U1, U2, Us...>  {
+					static constexpr bool value = is_csv_printable< U1 >::value && is_csv_printable< U2, Us... >::value;
+				};
+
+				static_assert( is_csv_printable< T1, Ts... >::value, "not all types are printable" );
+
+				using self_t = CSVWriter< TelTokenType, true, T1, Ts... >;
+
+				using base_t = TelemetryBase< TelTokenType, true >;
+
+				class CSVLastTuple {
+				public:
+					CSVLastTuple( const self_t & _csv ) : csv( _csv ) {}
+
+					CSVLastTuple( const CSVLastTuple & clt ) : csv( clt.csv ) {}
+
+					inline friend std::ostream & operator<<(
+						std::ostream & stream,
+						const CSVLastTuple & t
+					) {
+						return t.csv.write_last_line_to_stream( stream );
+					}
+
+				private:
+					const self_t & csv;
+				};
+
+				CSVWriter() = delete;
+
+				CSVWriter(
+					const TelTokenType & tt,
+					std::initializer_list< const char * > _headers,
+					char _separator,
+					size_t size
+				) :
+					base_t( tt ),
+					separator( _separator )
+				{
+					if( _headers.size() != NUM_FIELDS ) {
+						throw std::runtime_error( "wrong number of headers, it must match the unmber of line elements" );
+					}
+					// emplace anyway, so that the object is always in a consistent state and can be
+					// activated/deactivated at runtime
+					for( const auto & h : _headers ) {
+						headers.emplace_back( h );
+					}
+					if ( !tt.is_active() ) {
+						return;
+					}
+					lines.reserve( size );
+					// zero to force physical allocation
+					//std::memset( reinterpret_cast< void * >( lines.data() ), 0, lines.size() * sizeof( tuple_t ) );
+				}
+
+				CSVWriter( const TelTokenType & tt, std::initializer_list< const char * > _headers ) :
+					CSVWriter( tt, _headers, STD_CSV_SEP, 10 )
+				{}
+
+				CSVWriter( const self_t & ) = delete;
+
+				CSVWriter( self_t && ) = delete;
+
+				self_t & operator=( const self_t & ) = delete;
+
+				self_t & operator=( self_t && ) = delete;
+
+				template< class... UTypes > void add_line( UTypes&&... vs ) {
+					if ( this->is_active() ) {
+						lines.emplace_back( std::forward<UTypes>( vs )...  );
+					}
+				}
+
+				void clear() {
+					lines.clear();
+				}
+
+				std::ostream & write_last_line_to_stream( std::ostream & stream ) const {
+					if ( lines.size() > 0 && this->is_active() ) {
+						write_line( stream, lines.back() );
+					}
+					return stream;
+				}
+
+				CSVLastTuple last_line() const { // returns a streamable proxy bound to this writer's most recent line
+					if ( this->is_active() && lines.empty() ) { // add_line() stores nothing while inactive: empty is an error only when active
+						throw std::runtime_error( "no measures" );
+					}
+					return CSVLastTuple( *this ); // streaming the proxy prints nothing when the writer is inactive or empty
+				}
+
+				std::ostream & write_to_stream( std::ostream & stream ) const {
+					if ( !this->is_active() ) {
+						return stream;
+					}
+					write_header( stream );
+					stream << NEW_LINE;
+					for( const tuple_t & line : lines ) {
+						write_line( stream, line );
+						stream << NEW_LINE;
+					}
+					return stream;
+				}
+
+				void write_to_file( const char * name ) const {
+					if ( !this->is_active() ) {
+						return;
+					}
+					std::ofstream file( name );
+					if( !file.is_open() ) {
+						throw std::runtime_error( "cannot open file" );
+					}
+					write_to_stream( file );
+					file.close();
+				}
+
+			private:
+				static constexpr char NEW_LINE = '\n';
+
+				static constexpr size_t NUM_FIELDS = sizeof...( Ts ) + 1;
+
+				using tuple_t = std::tuple< T1, Ts... >;
+
+				std::vector< std::string > headers;
+				const char separator;
+				std::vector< tuple_t > lines;
+
+				std::ostream & write_header( std::ostream & stream ) const {
+					stream << headers[ 0 ];
+					for( size_t i = 1; i < headers.size(); i++ ) {
+						stream << separator << headers[ i ];
+					}
+					return stream;
+				}
+
+				void write_line( std::ostream & stream, const tuple_t & line ) const {
+					write_val< 0 >( stream, line );
+				}
+
+				// recursive case
+				template< size_t OFFS > inline void write_val(
+					std::ostream & stream,
+					typename std::enable_if< OFFS < NUM_FIELDS - 1, const tuple_t &>::type _tup
+				) const {
+					stream << std::get< OFFS >( _tup ) << separator;
+					write_val< OFFS + 1 >( stream, _tup ); // tail recursion
+				}
+
+				// base case
+				template< size_t OFFS > inline void write_val(
+					std::ostream & stream,
+					typename std::enable_if< OFFS == NUM_FIELDS - 1, const tuple_t &>::type _tup
+				) const {
+					(void) separator;
+					stream << std::get< OFFS >( _tup );
+				}
+
+			};
+
+			template<
+				class T1,
+				class ...Ts
+			> using StaticCSVWriter = CSVWriter< TelemetryTokenAlwaysOn, true, T1, Ts... >;
+
+		}
+	}
+}
+
+
+#endif // _H_GRB_UTILS_TELEMETRY_CSV_WRITER
diff --git a/include/graphblas/utils/telemetry/OutputStream.hpp b/include/graphblas/utils/telemetry/OutputStream.hpp
new file mode 100644
index 000000000..35622b11a
--- /dev/null
+++ b/include/graphblas/utils/telemetry/OutputStream.hpp
@@ -0,0 +1,152 @@
+
+/*
+ *   Copyright 2023 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author Alberto Scolari
+ * @date 14th February, 2023
+ */
+
+#ifndef _H_GRB_UTILS_TELEMETRY_OUTPUT_STREAM
+#define _H_GRB_UTILS_TELEMETRY_OUTPUT_STREAM
+
+#include <ostream>
+#include <type_traits>
+#include <utility>
+#include <functional>
+
+#include "TelemetryBase.hpp"
+
+namespace grb {
+	namespace utils {
+		namespace telemetry {
+
+			template< typename T > struct is_ostream_input {
+				// trait: value is true iff "std::ostream& << T" is a valid expression of type std::ostream&
+				template< typename U > static constexpr bool is_input(
+					typename std::enable_if< std::is_same<
+						// SFINAE probe: this overload exists only if the expression std::cout << obj is valid, where obj is of type U
+						decltype( std::declval< std::ostream& >() << std::declval< U >() ),
+						std::ostream& >::value, decltype( nullptr ) >::type // not bare nullptr_t: ::nullptr_t is only guaranteed by <stddef.h>
+				) {
+					return true;
+				}
+				// fallback, selected only when the overload above is removed by SFINAE (the ellipsis ranks last)
+				template< typename U > static constexpr bool is_input( ... ) {
+					return false;
+				}
+
+				static constexpr bool value = is_input< T >( nullptr );
+			};
+
+			class OutputStreamLazy { // tag base: OutputStream::operator<< invokes functors derived from this type only while active
+				constexpr char operator()() const { return '\0'; } // private (class default access); returns the NUL character
+			};
+
+			template<
+				typename TelTokenType,
+				bool enabled = TelTokenType::enabled
+			> class OutputStream : public TelemetryBase< TelTokenType, enabled > {
+			public:
+				using self_t = OutputStream< TelTokenType, enabled >;
+
+				OutputStream() = default;
+
+				OutputStream( const TelTokenType & _tt, std::ostream & _out ) :
+					TelemetryBase< TelTokenType, enabled >( _tt )
+				{
+					( void ) _out;
+				}
+
+				OutputStream( const self_t & _out ) = default;
+
+				OutputStream & operator=( const self_t & _out ) = delete;
+
+				template< typename T > inline typename std::enable_if<
+					is_ostream_input< T >::value,
+				self_t & >::type operator<<( T&& v ) {
+					( void ) v;
+					return *this;
+				}
+
+				inline self_t & operator<<( std::ostream& (*func)( std::ostream& ) ) {
+					( void ) func;
+					return *this;
+				}
+
+				template< class F > inline typename std::enable_if<
+					is_ostream_input< decltype( std::declval< F >()() ) >::value
+					&& std::is_base_of< OutputStreamLazy, F >::value,
+				self_t & >::type operator<<( F&& fun ) {
+					( void ) fun;
+					return *this;
+				}
+			};
+
+			template< typename TelTokenType > class OutputStream< TelTokenType, true > :
+				public TelemetryBase< TelTokenType, true > {
+			public:
+				using self_t = OutputStream< TelTokenType, true >;
+
+				using base_t = TelemetryBase< TelTokenType, true >;
+
+				OutputStream( const TelTokenType & _tt, std::ostream & _out ) :
+					TelemetryBase< TelTokenType, true >( _tt ),
+					out( _out )
+				{}
+
+				OutputStream( const self_t & _outs ) = default;
+
+				OutputStream & operator=( const self_t & _out ) = delete;
+
+				template< typename T > inline typename std::enable_if<
+					is_ostream_input< T >::value,
+				self_t & >::type operator<<( T&& v ) {
+					if ( this->is_active() ) {
+						out << std::forward< T >( v );
+					}
+					return *this;
+				}
+
+				inline self_t & operator<<( std::ostream& (*func)( std::ostream& ) ) {
+					if ( this->is_active() ) {
+						out << func;
+					}
+					return *this;
+				}
+
+				template< class F > inline typename std::enable_if<
+					is_ostream_input< decltype( std::declval< F >()() ) >::value
+					&& std::is_base_of< OutputStreamLazy, F >::value,
+				self_t & >::type operator<<( F&& fun ) {
+					if ( this->is_active() ) {
+						out << fun();
+					}
+					return *this;
+				}
+
+			private:
+				std::ostream & out;
+			};
+
+			using OutputStreamOff = OutputStream< TelemetryTokenAlwaysOff, false >;
+
+			using OutputStreamOn = OutputStream< TelemetryTokenAlwaysOn, true >;
+		}
+	}
+}
+
+#endif // _H_GRB_UTILS_TELEMETRY_OUTPUT_STREAM
diff --git a/include/graphblas/utils/telemetry/Stopwatch.hpp b/include/graphblas/utils/telemetry/Stopwatch.hpp
new file mode 100644
index 000000000..2cc900b61
--- /dev/null
+++ b/include/graphblas/utils/telemetry/Stopwatch.hpp
@@ -0,0 +1,143 @@
+
+/*
+ *   Copyright 2023 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author Alberto Scolari
+ * @date 14th February, 2023
+ */
+
+#ifndef _H_GRB_UTILS_TELEMETRY_STOPWATCH
+#define _H_GRB_UTILS_TELEMETRY_STOPWATCH
+
+#include <chrono>
+
+#include "TelemetryBase.hpp"
+
+namespace grb {
+	namespace utils {
+		namespace telemetry {
+
+			using duration_nano_t = size_t;
+
+			using duration_float_t = double;
+
+			class StopwatchBase { // stateless collection of conversions from nanosecond counts to coarser floating-point units
+			public:
+				static inline duration_float_t nano2Micro( duration_nano_t nano ) { // nanoseconds -> microseconds
+					const duration_float_t as_float = static_cast< duration_float_t >( nano );
+					return as_float / 1000UL;
+				}
+				static inline duration_float_t nano2Milli( duration_nano_t nano ) { // nanoseconds -> milliseconds
+					const duration_float_t as_float = static_cast< duration_float_t >( nano );
+					return as_float / 1000000UL;
+				}
+				static inline duration_float_t nano2Sec( duration_nano_t nano ) { // nanoseconds -> seconds
+					const duration_float_t as_float = static_cast< duration_float_t >( nano );
+					return as_float / 1000000000UL;
+				}
+			};
+
+			template<
+				typename TelTokenType,
+				bool enabled = TelTokenType::enabled
+			> class Stopwatch:
+				public StopwatchBase, public TelemetryBase< TelTokenType, enabled > { // primary template, used when enabled is false: measures nothing
+			public:
+				Stopwatch( const TelTokenType & tt ) :
+					StopwatchBase(),
+					TelemetryBase< TelTokenType, enabled >( tt )
+					{}
+
+				Stopwatch( const Stopwatch & ) = default;
+
+				constexpr inline void start() {} // no-op
+
+				constexpr inline duration_nano_t stop() { // always returns 0
+					return static_cast< duration_nano_t >( 0 );
+				}
+
+				constexpr inline duration_nano_t reset() { // always returns 0
+					return static_cast< duration_nano_t >( 0 );
+				}
+
+				constexpr inline duration_nano_t getElapsedNano() const { // always returns 0
+					return static_cast< duration_nano_t >( 0 );
+				}
+			};
+
+
+			template<
+				typename TelTokenType
+			> class Stopwatch< TelTokenType, true >:
+				public StopwatchBase, public TelemetryBase< TelTokenType, true > {
+				// enabled stopwatch: sums the time between start()/stop() pairs issued while the token is active
+				typedef std::chrono::steady_clock clock_t; // monotonic; high_resolution_clock may alias the adjustable system_clock
+
+				typedef std::chrono::nanoseconds duration_t;
+
+				typedef std::chrono::steady_clock::time_point time_point_t;
+				// time accumulated by stop() calls since construction or the latest effective reset()
+				duration_t elapsedTime;
+				// instant recorded by the latest start() issued while active
+				time_point_t beginning;
+
+			public:
+				Stopwatch( const TelTokenType & tt ) :
+					StopwatchBase(),
+					TelemetryBase< TelTokenType, true >( tt ),
+					elapsedTime( duration_t::zero() )
+					{}
+
+				Stopwatch( const Stopwatch & s ) = default;
+				// records the current instant as the beginning of a measurement; no-op when inactive
+				inline void start() {
+					if ( this->is_active() ) {
+						beginning = clock_t::now();
+					}
+				}
+				// returns the nanoseconds elapsed since the latest start() and adds them to the total; returns 0 when inactive
+				inline duration_nano_t stop() {
+					duration_nano_t count = 0;
+					if ( this->is_active() ) {
+						const time_point_t end = clock_t::now();
+						const duration_t d = std::chrono::duration_cast< duration_t >( end - beginning );
+						count = static_cast< duration_nano_t >( d.count() );
+						elapsedTime += d;
+					}
+					return count;
+				}
+				// returns the accumulated nanoseconds and zeroes the total; when inactive returns 0 and keeps the total
+				inline duration_nano_t reset() {
+					duration_t r = duration_t::zero();
+					if ( this->is_active() ) {
+						r = elapsedTime;
+						elapsedTime = duration_t::zero();
+					}
+					return static_cast< duration_nano_t >( r.count() );
+				}
+				// returns the accumulated nanoseconds, regardless of the token state
+				inline duration_nano_t getElapsedNano() const {
+					return static_cast< duration_nano_t >( elapsedTime.count() );
+				}
+			};
+
+			using StaticStopwatch = Stopwatch< TelemetryTokenAlwaysOn, true >;
+		}
+	}
+}
+
+#endif // _H_GRB_UTILS_TELEMETRY_STOPWATCH
diff --git a/include/graphblas/utils/telemetry/Telemetry.hpp b/include/graphblas/utils/telemetry/Telemetry.hpp
new file mode 100644
index 000000000..f8369d1d1
--- /dev/null
+++ b/include/graphblas/utils/telemetry/Telemetry.hpp
@@ -0,0 +1,32 @@
+
+/*
+ *   Copyright 2023 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author Alberto Scolari
+ * @date 14th February, 2023
+ */
+
+#ifndef _H_GRB_UTILS_TELEMETRY_TELEMETRY
+#define _H_GRB_UTILS_TELEMETRY_TELEMETRY
+
+#include "TelemetryToken.hpp"
+#include "Stopwatch.hpp"
+#include "Timeable.hpp"
+#include "CSVWriter.hpp"
+#include "OutputStream.hpp"
+
+#endif // _H_GRB_UTILS_TELEMETRY_TELEMETRY
diff --git a/include/graphblas/utils/telemetry/TelemetryBase.hpp b/include/graphblas/utils/telemetry/TelemetryBase.hpp
new file mode 100644
index 000000000..969f93213
--- /dev/null
+++ b/include/graphblas/utils/telemetry/TelemetryBase.hpp
@@ -0,0 +1,98 @@
+
+/*
+ *   Copyright 2023 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author Alberto Scolari
+ * @date 14th February, 2023
+ */
+
+#ifndef _H_GRB_UTILS_TELEMETRY_TELEMETRY_BASE
+#define _H_GRB_UTILS_TELEMETRY_TELEMETRY_BASE
+
+#include "TelemetryToken.hpp"
+
+namespace grb {
+	namespace utils {
+		namespace telemetry {
+
+			template<
+				typename TelTokenType,
+				bool enabled = TelTokenType::enabled
+			> class TelemetryBase {
+			public:
+				static_assert( is_telemetry_token< TelTokenType >::value,
+					"type TelTokenType does not implement Telemetry Token interface" );
+
+				using self_t = TelemetryBase< TelTokenType, enabled >;
+
+				TelemetryBase() = default;
+
+				TelemetryBase( const TelTokenType & tt ) {
+					( void ) tt;
+				}
+
+				TelemetryBase( const self_t & ) = default;
+
+				self_t & operator=( const self_t & ) = delete;
+
+				constexpr bool is_active() const { return false; }
+			};
+
+
+			template<
+				typename TelTokenType
+			> class TelemetryBase< TelTokenType, true > { // enabled variant: activity is decided at run time by the token
+
+				const TelTokenType & telemetry_token; // stored by reference: the token must outlive this object
+
+			public:
+				static_assert( is_telemetry_token< TelTokenType >::value,
+					"type TelTokenType does not implement Telemetry Token interface" );
+
+				using self_t = TelemetryBase< TelTokenType, true >;
+
+				TelemetryBase( const TelTokenType & tt ): telemetry_token( tt ) {}
+
+				TelemetryBase( const self_t & tb ) : telemetry_token( tb.telemetry_token ) {} // the copy refers to the same token
+
+				self_t & operator=( const self_t & ) = delete;
+
+				bool is_active() const { return telemetry_token.is_active(); } // forwards to the referenced token
+			};
+
+			// always active base, especially for prototyping scenarios
+			template<> class TelemetryBase< TelemetryTokenAlwaysOn, true > { // full specialization: stores no token reference
+			public:
+				static_assert( is_telemetry_token< TelemetryTokenAlwaysOn >::value,
+					"type TelTokenType does not implement Telemetry Token interface" );
+
+				using self_t = TelemetryBase< TelemetryTokenAlwaysOn, true >;
+
+				TelemetryBase( const TelemetryTokenAlwaysOn & tt ) { (void) tt; } // the token is accepted and then ignored
+
+				TelemetryBase( const self_t & tb ) = default;
+
+				self_t & operator=( const self_t & ) = delete;
+
+				constexpr bool is_active() const { return true; } // constant: no token is consulted
+			};
+
+		}
+	}
+}
+
+#endif // _H_GRB_UTILS_TELEMETRY_TELEMETRY_BASE
diff --git a/include/graphblas/utils/telemetry/TelemetryToken.hpp b/include/graphblas/utils/telemetry/TelemetryToken.hpp
new file mode 100644
index 000000000..dabac3c2e
--- /dev/null
+++ b/include/graphblas/utils/telemetry/TelemetryToken.hpp
@@ -0,0 +1,145 @@
+
+/*
+ *   Copyright 2023 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author Alberto Scolari
+ * @date 14th February, 2023
+ */
+
+#ifndef _H_GRB_UTILS_TELEMETRY_TELEMETRY_TOKEN
+#define _H_GRB_UTILS_TELEMETRY_TELEMETRY_TOKEN
+
+#include <type_traits>
+#include <utility>
+
+namespace grb {
+	namespace utils {
+		namespace telemetry {
+
+			template< typename T > constexpr bool is_token_enabled() { return false; }
+
+			// OFF
+			template< bool en > class TelemetryTokenBase {
+			public:
+				using self_t = TelemetryTokenBase< en >;
+
+				TelemetryTokenBase( bool _enabled ) {
+					(void) _enabled;
+				}
+
+				TelemetryTokenBase() = delete;
+
+				TelemetryTokenBase( const self_t & ) = delete;
+
+				TelemetryTokenBase& operator=( const self_t & ) = delete;
+
+				constexpr bool is_active() const { return false; }
+
+				static constexpr bool enabled = false;
+			};
+
+			using TelemetryTokenAlwaysOff = TelemetryTokenBase< false >;
+
+			template<> class TelemetryTokenBase< true > {
+			public:
+				using self_t = TelemetryTokenBase< true >;
+
+				TelemetryTokenBase( bool _active ) : active( _active ) {}
+
+				TelemetryTokenBase() = delete;
+
+				TelemetryTokenBase( const self_t & ) = delete;
+
+				TelemetryTokenBase& operator=( const self_t & ) = delete;
+
+				bool is_active() const { return this->active; }
+
+				static constexpr bool enabled = true;
+
+			protected:
+				const bool active;
+			};
+
+			// always active token, especially for prototyping scenarios
+			class TelemetryTokenAlwaysOn {
+			public:
+				TelemetryTokenAlwaysOn( bool _enabled ) {
+					(void) _enabled;
+				}
+
+				TelemetryTokenAlwaysOn() = delete;
+
+				TelemetryTokenAlwaysOn( const TelemetryTokenAlwaysOn & ) = delete;
+
+				TelemetryTokenAlwaysOn& operator=( const TelemetryTokenAlwaysOn & ) = delete;
+
+				constexpr bool is_active() const { return true; }
+
+				static constexpr bool enabled = true;
+			};
+
+
+			template< typename T > struct is_telemetry_token {
+			private:
+				template< typename U > static constexpr bool has_enabled_field(
+					typename std::enable_if<
+						std::is_same< typename std::decay< decltype( U::enabled ) >::type, bool >::value,
+							bool * >::type
+					) {
+						return true;
+					}
+
+				template< typename U > static constexpr bool has_enabled_field( ... ) { return false; }
+
+				template< typename U > static constexpr bool has_is_active_method(
+					typename std::enable_if<
+						std::is_same< typename std::decay< decltype( std::declval< U >().is_active() ) >::type, bool >::value,
+						bool * >::type
+				) {
+					return true;
+				}
+
+				template< typename U > static constexpr bool has_is_active_method( ... ) { return false; }
+
+			public:
+				static constexpr bool value = has_enabled_field< T >( nullptr ) && has_is_active_method< T >( nullptr );
+			};
+		}
+
+	}
+}
+
+#define __TELEMETRY_TOKEN_ENABLER_NAME( name ) __ ## name ## Enabler
+#define __TELEMETRY_TOKEN_NAME( name ) name
+
+#define DECLARE_TELEMETRY_TOKEN( name ) 																			\
+	class __TELEMETRY_TOKEN_ENABLER_NAME( name ) {};																\
+	template< typename T > class __TELEMETRY_TOKEN_NAME( name ) :													\
+		public grb::utils::telemetry::TelemetryTokenBase< grb::utils::telemetry::is_token_enabled< T >() > {		\
+	public:																											\
+		using base_t = grb::utils::telemetry::TelemetryTokenBase< grb::utils::telemetry::is_token_enabled< T >() >;	\
+		__TELEMETRY_TOKEN_NAME( name )( bool _enabled ) : base_t( _enabled ) {}										\
+	};
+
+
+#define ACTIVATE_TOKEN( name ) namespace grb { namespace utils { namespace telemetry {					\
+	template<> constexpr bool is_token_enabled< __TELEMETRY_TOKEN_ENABLER_NAME( name ) >() { return true; } \
+} } }
+
+#define TELEMETRY_TOKEN_TYPE( name ) __TELEMETRY_TOKEN_NAME( name )< __TELEMETRY_TOKEN_ENABLER_NAME( name ) >
+
+#endif // _H_GRB_UTILS_TELEMETRY_TELEMETRY_TOKEN
diff --git a/include/graphblas/utils/telemetry/Timeable.hpp b/include/graphblas/utils/telemetry/Timeable.hpp
new file mode 100644
index 000000000..02dd85b9e
--- /dev/null
+++ b/include/graphblas/utils/telemetry/Timeable.hpp
@@ -0,0 +1,101 @@
+
+/*
+ *   Copyright 2023 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author Alberto Scolari
+ * @date 14th February, 2023
+ */
+
+#ifndef _H_GRB_UTILS_TIMEABLE
+#define _H_GRB_UTILS_TIMEABLE
+
+#include "Stopwatch.hpp"
+
+namespace grb {
+	namespace utils {
+		namespace telemetry {
+
+			template<
+				typename TelTokenType,
+				bool enabled = TelTokenType::enabled
+			> class Timeable { // primary template: no-op timing interface, used whenever enabled is not true (true has its own specialization)
+			public:
+				using self_t = Timeable< TelTokenType, enabled >;
+
+				Timeable( const TelTokenType & tt ) { // the token is accepted only to mirror the enabled variant's constructor
+					(void) tt;
+				}
+
+				Timeable( const self_t & ) = default;
+
+				Timeable& operator=( const self_t & ) = delete; // copy-constructible, but not copy-assignable
+
+				constexpr inline duration_nano_t getElapsedNano() const { // always 0: nothing is measured here
+					return static_cast< duration_nano_t >( 0 );
+				}
+
+				constexpr inline duration_nano_t reset() { // always 0
+					return static_cast< duration_nano_t >( 0 );
+				}
+
+			protected: // start()/stop() are reachable only from derived classes
+				inline void start() {} // intentionally empty
+
+				constexpr inline duration_nano_t stop() { // always 0
+					return static_cast< duration_nano_t >( 0 );
+				}
+
+			};
+
+			template< typename TelTokenType > class Timeable< TelTokenType, true > { // enabled variant: every call is forwarded to an internal Stopwatch
+			public:
+				using self_t = Timeable< TelTokenType, true >;
+
+				Timeable( const TelTokenType & tt ) : swatch( tt ) {} // the token is handed to the internal stopwatch
+
+				Timeable( const self_t & ) = default;
+
+				Timeable& operator=( const self_t & ) = delete; // copy-constructible, but not copy-assignable
+
+				inline duration_nano_t getElapsedNano() const { // returns whatever the stopwatch reports as elapsed
+					return swatch.getElapsedNano();
+				}
+
+				inline duration_nano_t reset() { // resets the stopwatch and returns the value its reset() yields
+					return swatch.reset();
+				}
+
+			protected: // start()/stop() are reachable only from derived classes
+				inline void start() {
+					swatch.start();
+				}
+
+				inline duration_nano_t stop() { // stops the stopwatch and returns the value its stop() yields
+					return swatch.stop();
+				}
+
+			private:
+				Stopwatch< TelTokenType > swatch; // does the actual time keeping; presumably declared in Stopwatch.hpp
+			};
+
+			using StaticTimeable = Timeable< TelemetryTokenAlwaysOn, true >; // explicitly selects the Stopwatch-backed specialization
+
+		}
+	}
+}
+
+#endif // _H_GRB_UTILS_TIMEABLE

From 7833b319752498c7d197815cc88c8dad345fe33c Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Thu, 23 Feb 2023 15:46:51 +0100
Subject: [PATCH 20/28] restructuring Multigrid and HPCG to clean code and add
 flexible logging

fixing MG telemetry - missing reset
---
 include/graphblas/algorithms/hpcg/hpcg.hpp    |  94 ----
 .../algorithms/hpcg/system_builder.hpp        |   6 +
 .../algorithms/hpcg/system_building_utils.hpp |  37 +-
 .../multigrid/multigrid_building_utils.hpp    |  18 +-
 .../algorithms/multigrid/multigrid_cg.hpp     | 405 +++++++++---------
 .../algorithms/multigrid/multigrid_data.hpp   |  14 +-
 .../multigrid/multigrid_v_cycle.hpp           | 292 ++++++-------
 .../multigrid/red_black_gauss_seidel.hpp      | 223 +++++-----
 .../multigrid/single_matrix_coarsener.hpp     | 166 +++----
 tests/smoke/hpcg.cpp                          | 354 +++++++++------
 10 files changed, 761 insertions(+), 848 deletions(-)
 delete mode 100644 include/graphblas/algorithms/hpcg/hpcg.hpp

diff --git a/include/graphblas/algorithms/hpcg/hpcg.hpp b/include/graphblas/algorithms/hpcg/hpcg.hpp
deleted file mode 100644
index b4884f4e1..000000000
--- a/include/graphblas/algorithms/hpcg/hpcg.hpp
+++ /dev/null
@@ -1,94 +0,0 @@
-
-/*
- *   Copyright 2022 Huawei Technologies Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @dir include/graphblas/algorithms/hpcg
- * This folder contains the code specific to the HPCG benchmark implementation: generation of the physical system,
- * generation of the single point coarsener and coloring algorithm.
- */
-
-/**
- * @file hpcg.hpp
- * @author Alberto Scolari (alberto.scolari@huawei.com)
- * Utility to build a full HPCG runner, bringing together all needed data structures.
- */
-
-#ifndef _H_GRB_ALGORITHMS_HPCG_HPCG
-#define _H_GRB_ALGORITHMS_HPCG_HPCG
-
-#include <utility>
-
-#include <graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp>
-#include <graphblas/algorithms/multigrid/single_matrix_coarsener.hpp>
-#include <graphblas/algorithms/multigrid/multigrid_v_cycle.hpp>
-#include <graphblas/algorithms/multigrid/multigrid_cg.hpp>
-
-namespace grb {
-	namespace algorithms {
-
-		// simply "assemble" types
-		template<
-			Descriptor descr,
-			typename IOType,
-			typename ResidualType,
-			typename NonzeroType,
-			typename InputType,
-			class Ring,
-			class Minus
-		> using HPCGRunnerType = MultiGridCGRunner< IOType, NonzeroType, InputType, ResidualType,
-			MultiGridRunner<
-				RedBlackGSSmootherRunner< IOType, NonzeroType, Ring, descr >,
-				SingleMatrixCoarsener< IOType, NonzeroType, Ring, Minus, descr >,
-				IOType, NonzeroType, Ring, Minus, descr
-			>, Ring, Minus, descr
-		>;
-
-		/**
-		 * Builds a full HPCG runner object by "assemblying" all needed information,
-		 * with default type for smoother, coarsener and multi-grid runner.
-		 *
-		 * @param[in] smoother_steps how many times the smoother should run (both pre- and post-smoothing)
-		 */
-		template<
-			Descriptor descr,
-			typename IOType,
-			typename ResidualType,
-			typename NonzeroType,
-			typename InputType,
-			class Ring,
-			class Minus
-		> HPCGRunnerType< descr, IOType, ResidualType, NonzeroType, InputType, Ring, Minus >
-			build_hpcg_runner( size_t smoother_steps ) {
-
-			SingleMatrixCoarsener< IOType, NonzeroType, Ring, Minus, descr > coarsener;
-			RedBlackGSSmootherRunner< IOType, NonzeroType, Ring, descr >
-				smoother( { smoother_steps, smoother_steps, 1UL, {}, Ring() } );
-
-			MultiGridRunner<
-				RedBlackGSSmootherRunner< IOType, NonzeroType, Ring, descr >,
-				SingleMatrixCoarsener< IOType, NonzeroType, Ring, Minus, descr >,
-				IOType, NonzeroType, Ring, Minus, descr
-			> mg_runner( std::move( smoother ), std::move( coarsener ) );
-
-			return HPCGRunnerType< descr, IOType, ResidualType, NonzeroType, InputType, Ring, Minus >(
-				std::move( mg_runner ) );
-		}
-
-	} // namespace algorithms
-} // namespace grb
-
-#endif // _H_GRB_ALGORITHMS_HPCG_HPCG
diff --git a/include/graphblas/algorithms/hpcg/system_builder.hpp b/include/graphblas/algorithms/hpcg/system_builder.hpp
index e19ba208d..94d1565f2 100644
--- a/include/graphblas/algorithms/hpcg/system_builder.hpp
+++ b/include/graphblas/algorithms/hpcg/system_builder.hpp
@@ -15,6 +15,12 @@
  * limitations under the License.
  */
 
+/**
+ * @dir include/graphblas/algorithms/hpcg
+ * This folder contains the code specific to the HPCG benchmark implementation: generation of the physical system,
+ * generation of the single point coarsener and coloring algorithm.
+ */
+
 /**
  * @file system_builders.hpp
  * @author Alberto Scolari (alberto.scolari@huawei.com)
diff --git a/include/graphblas/algorithms/hpcg/system_building_utils.hpp b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
index 9503f77ff..b86564def 100644
--- a/include/graphblas/algorithms/hpcg/system_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
@@ -141,16 +141,15 @@ namespace grb {
 		template <
 			size_t DIMS,
 			typename CoordType,
-			typename NonzeroType
+			typename NonzeroType,
+			typename Logger
 		> grb::RC hpcg_populate_system_matrix(
 			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &system_generator,
-			grb::Matrix< NonzeroType > &M
+			grb::Matrix< NonzeroType > &M,
+			Logger & logger
 		) {
-			const size_t pid = spmd<>::pid();
 
-			if( pid == 0) {
-				std::cout << "- generating system matrix...";
-			}
+			logger << "- generating system matrix...";
 			typename grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType >::Iterator begin(
 				system_generator.make_begin_iterator() );
 			typename grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType >::Iterator end(
@@ -421,24 +420,20 @@ namespace grb {
 		template<
 			size_t DIMS,
 			typename CoordType,
-			typename NonzeroType
+			typename NonzeroType,
+			typename Logger
 		> grb::RC hpcg_populate_smoothing_data(
 			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &system_generator,
-			SmootherData< NonzeroType > &smoothing_info
+			SmootherData< NonzeroType > &smoothing_info,
+			Logger & logger
 		) {
-			const size_t pid = spmd<>::pid();
-
 			grb::RC rc = set( smoothing_info.A_diagonal, system_generator.get_diag_value() );
 			if( rc != grb::SUCCESS ) {
-				if( pid == 0 ) {
-					std::cout << "error: " << __LINE__ << std::endl;
-				}
+				logger << "error: " << __LINE__ << std::endl;
 				return rc;
 			}
 
-			if( pid == 0 ) {
-				std::cout << "- running coloring heuristics...";
-			}
+			logger << "- running coloring heuristics...";
 			std::vector< CoordType > colors, color_counters;
 			hpcg_greedy_color_ndim_system( system_generator.get_generator(), colors, color_counters );
 			std::vector< std::vector< CoordType > > per_color_rows;
@@ -446,15 +441,11 @@ namespace grb {
 			colors.clear();
 			colors.shrink_to_fit();
 			if( rc != grb::SUCCESS ) {
-				if( pid == 0 ) {
-					std::cout << "error: " << __LINE__ << std::endl;
-				}
+				logger << "error: " << __LINE__ << std::endl;
 				return rc;
 			}
-			if( pid == 0 ) {
-				std::cout <<"- found " << color_counters.size() << " colors,"
-					<< " generating color masks...";
-			}
+			logger <<"- found " << color_counters.size() << " colors,"
+				<< " generating color masks...";
 			return internal::hpcg_build_static_color_masks( system_generator.system_size(),
 				per_color_rows, smoothing_info.color_masks );
 		}
diff --git a/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp b/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp
index 75d23a7cc..ad09f4c9f 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp
@@ -21,13 +21,13 @@
  * Utilities to allocate data for an entire multi-grid simulation.
  */
 
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_BUILDING_UTILS
+#define _H_GRB_ALGORITHMS_MULTIGRID_BUILDING_UTILS
+
 #include <vector>
 #include <memory>
 #include <cstddef>
 
-#ifndef _H_GRB_ALGORITHMS_MULTIGRID_BUILDING_UTILS
-#define _H_GRB_ALGORITHMS_MULTIGRID_BUILDING_UTILS
-
 namespace grb {
 	namespace algorithms {
 
@@ -66,18 +66,20 @@ namespace grb {
 		template<
 			typename MGInfoType,
 			typename CoarsenerInfoType,
-			typename SmootherInfoType
+			typename SmootherInfoType,
+			typename TelTokenType
 		> void multigrid_allocate_data(
-			const std::vector< size_t > &mg_sizes,
 			std::vector< std::unique_ptr< MGInfoType > > &system_levels,
 			std::vector< std::unique_ptr< CoarsenerInfoType > > &coarsener_levels,
-			std::vector< std::unique_ptr< SmootherInfoType > > &smoother_levels
+			std::vector< std::unique_ptr< SmootherInfoType > > &smoother_levels,
+			const std::vector< size_t > &mg_sizes,
+			const TelTokenType & tt
 		) {
 			if( mg_sizes.size() == 0 ) {
 				throw std::invalid_argument( "at least one size should be available" );
 			}
 			size_t finer_size = mg_sizes[ 0 ];
-			system_levels.emplace_back( new MGInfoType( 0, finer_size ) ); // create main system
+			system_levels.emplace_back( new MGInfoType( tt, 0, finer_size ) ); // create main system
 			smoother_levels.emplace_back( new SmootherInfoType( finer_size ) ); // create smoother for main
 			for( size_t i = 1; i < mg_sizes.size(); i++ ) {
 				size_t coarser_size = mg_sizes[ i ];
@@ -85,7 +87,7 @@ namespace grb {
 					throw std::invalid_argument( "system sizes not monotonically decreasing" );
 				}
 				coarsener_levels.emplace_back( new CoarsenerInfoType( finer_size, coarser_size ) );
-				system_levels.emplace_back( new MGInfoType( i, coarser_size ) );
+				system_levels.emplace_back( new MGInfoType( tt, i, coarser_size ) );
 				smoother_levels.emplace_back( new SmootherInfoType( coarser_size ) );
 				finer_size = coarser_size;
 			}
diff --git a/include/graphblas/algorithms/multigrid/multigrid_cg.hpp b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
index c517c8cc4..2bb936a1c 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
@@ -36,6 +36,8 @@
 #include <utility>
 
 #include <graphblas.hpp>
+#include <graphblas/utils/telemetry/Timeable.hpp>
+#include <graphblas/utils/telemetry/OutputStream.hpp>
 
 #include "multigrid_data.hpp"
 
@@ -82,25 +84,6 @@ namespace grb {
 			}
 		};
 
-		/**
-		 * Container for various options and algebraic abstractions to be passed to a CG simulation with multi-grid.
-		 */
-		template <
-			typename IOType,
-			typename ResidualType,
-			class Ring,
-			class Minus
-		> struct CGOptions {
-			bool with_preconditioning; ///<  whether preconditioning is enabled
-			size_t max_iterations; ///< max number of allowed iterations for CG: after that, the solver is halted
-									///< and the result achieved so far returned
-			ResidualType tolerance; ///< ratio between initial residual and current residual that halts the solver
-										///< if reached, for the solution is to be considered "good enough"
-			bool print_iter_residual; ///< whether to print information on the multi-grid and the residual on each iteration
-			Ring ring; ///< algebraic ring to be used
-			Minus minus; ///< minus operator to be used
-		};
-
 		/**
 		 * Structure for the output information of a CG run.
 		 */
@@ -109,185 +92,6 @@ namespace grb {
 			ResidualType norm_residual; ///< norm of the final residual
 		};
 
-		/**
-		 * Conjugate Gradient algorithm implementation augmented by a Multi-Grid solver,
-		 * inspired to the High Performance Conjugate Gradient benchmark.
-		 *
-		 * This CG solver calls the MG solver at the beginning of each iteration to improve
-		 * the initial solution via the residual (thanks to the smoother) and then proceeds with
-		 * the standard CG iteration.
-		 *
-		 * Failures of GraphBLAS operations are handled by immediately stopping the execution and by returning
-		 * the failure code.
-		 *
-		 * @tparam descr descriptor for static information
-		 * @tparam IOType type of result and intermediate vectors used during computation
-		 * @tparam ResidualType type of the residual norm
-		 * @tparam NonzeroType type of matrix values
-		 * @tparam InputType type of values of the right-hand side vector b
-		 * @tparam MultiGridrunnerType type for the multi-grid runner object
-		 * @tparam Ring algebraic ring type
-		 * @tparam Minus minus operator
-		 *
-		 * @param cg_data data for the CG solver only
-		 * @param cg_opts options for the CG solver
-		 * @param grid_base base (i.e., finer) level of the multi-grid, with the information of the physical system
-		 * @param MultiGridRunner runner object (functor) to call the multi-grid solver
-		 * @param out_info solver output information
-		 * @return grb::RC SUCCESS in case of succesful run
-		 */
-		template<
-			Descriptor descr,
-			typename IOType,
-			typename ResidualType,
-			typename NonzeroType,
-			typename InputType,
-			typename MultiGridrunnerType,
-			class Ring = Semiring< grb::operators::add< IOType >, grb::operators::mul< IOType >, grb::identities::zero, grb::identities::one >,
-			class Minus = operators::subtract< IOType >
-		> grb::RC multigrid_conjugate_gradient(
-			MultiGridCGData< IOType, NonzeroType, InputType > &cg_data,
-			const CGOptions< IOType, ResidualType, Ring, Minus > &cg_opts,
-			MultiGridData< IOType, NonzeroType > &grid_base,
-			MultiGridrunnerType &multigrid_runner,
-			CGOutInfo< ResidualType > &out_info
-		) {
-			const grb::Matrix< NonzeroType > &A = grid_base.A; // system matrix
-			grb::Vector< IOType > &r = grid_base.r;  // residual vector
-			grb::Vector< IOType > &z = grid_base.z;  // pre-conditioned residual vector
-			grb::Vector< IOType > &x = cg_data.x; // initial (and final) solution
-			const grb::Vector< InputType > &b = cg_data.b; // right-side value
-			grb::Vector< IOType > &p = cg_data.p;  // direction vector
-			grb::Vector< IOType > &Ap = cg_data.u; // temp vector
-			grb::RC ret = SUCCESS;
-
-			const IOType io_zero = cg_opts.ring.template getZero< IOType >();
-			ret = ret ? ret : grb::set( Ap, io_zero );
-			ret = ret ? ret : grb::set( r, io_zero );
-			ret = ret ? ret : grb::set( p, io_zero );
-
-			ret = ret ? ret : grb::set( p, x );
-			// Ap = A * x
-			ret = ret ? ret : grb::mxv< descr >( Ap, A, x, cg_opts.ring );
-			assert( ret == SUCCESS );
-			// r = b - Ap
-			ret = ret ? ret : grb::eWiseApply< descr >( r, b, Ap, cg_opts.minus );
-			assert( ret == SUCCESS );
-
-			const ResidualType residual_zero = cg_opts.ring.template getZero< ResidualType >();
-			ResidualType norm_residual = residual_zero;
-			// norm_residual = r' * r
-			ret = ret ? ret : grb::dot< descr >( norm_residual, r, r, cg_opts.ring );
-			assert( ret == SUCCESS );
-
-			// compute sqrt to avoid underflow
-			norm_residual = std::sqrt( norm_residual );
-
-			// initial norm of residual
-			out_info.norm_residual = norm_residual;
-			const ResidualType norm_residual_initial = norm_residual;
-			ResidualType old_r_dot_z = residual_zero, r_dot_z = residual_zero, beta = residual_zero;
-			size_t iter = 0;
-
-#ifdef HPCG_PRINT_STEPS
-			DBG_print_norm( p, "start p" );
-			DBG_print_norm( Ap, "start Ap" );
-			DBG_print_norm( r, "start r" );
-#endif
-			do {
-#ifdef HPCG_PRINT_STEPS
-				DBG_println( "========= iteration " << iter << " =========" );
-#endif
-				if( cg_opts.print_iter_residual ) {
-					std::cout << "iteration " << iter;
-				}
-				if( cg_opts.with_preconditioning ) {
-					ret = ret ? ret : multigrid_runner( grid_base );
-					assert( ret == SUCCESS );
-				} else {
-					// z = r
-					ret = ret ? ret : grb::set( z, r );
-					assert( ret == SUCCESS );
-				}
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( z, "initial z" );
-#endif
-				if( iter == 0 ) {
-					//  p = z
-					ret = ret ? ret : grb::set< descr >( p, z );
-					assert( ret == SUCCESS );
-					// r_dot_z = r' * z
-					ret = ret ? ret : grb::dot< descr >( r_dot_z, r, z, cg_opts.ring );
-					assert( ret == SUCCESS );
-				} else {
-					old_r_dot_z = r_dot_z;
-					// r_dot_z = r' * z
-					r_dot_z = cg_opts.ring.template getZero< ResidualType >();
-					ret = ret ? ret : grb::dot< descr >( r_dot_z, r, z, cg_opts.ring );
-					assert( ret == SUCCESS );
-
-					beta = r_dot_z / old_r_dot_z;
-					// Ap  = 0
-					ret = ret ? ret : grb::set< descr >( Ap, io_zero );
-					assert( ret == SUCCESS );
-					// Ap += beta * p
-					ret = ret ? ret : grb::eWiseMul< descr >( Ap, beta, p, cg_opts.ring );
-					assert( ret == SUCCESS );
-					// Ap = Ap + z
-					ret = ret ? ret : grb::eWiseApply< descr >( Ap, Ap, z, cg_opts.ring.getAdditiveOperator() );
-					assert( ret == SUCCESS );
-					// p = Ap
-					std::swap( Ap, p );
-					assert( ret == SUCCESS );
-				}
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( p, "middle p" );
-#endif
-				// Ap = A * p
-				ret = ret ? ret : grb::set< descr >( Ap, io_zero );
-				ret = ret ? ret : grb::mxv< descr >( Ap, A, p, cg_opts.ring );
-				assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( Ap, "middle Ap" );
-#endif
-				// pAp = p' * Ap
-				ResidualType pAp = cg_opts.ring.template getZero< ResidualType >();
-				ret = ret ? ret : grb::dot< descr >( pAp, Ap, p, cg_opts.ring );
-				assert( ret == SUCCESS );
-
-				ResidualType alpha = r_dot_z / pAp;
-				// x += alpha * p
-				ret = ret ? ret : grb::eWiseMul< descr >( x, alpha, p, cg_opts.ring );
-				assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( x, "end x" );
-#endif
-				// r += - alpha * Ap
-				ret = ret ? ret : grb::eWiseMul< descr >( r, -alpha, Ap, cg_opts.ring );
-				assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( r, "end r" );
-#endif
-				// residual = r' * r
-				norm_residual = cg_opts.ring.template getZero< ResidualType >();
-				ret = ret ? ret : grb::dot< descr >( norm_residual, r, r, cg_opts.ring );
-				assert( ret == SUCCESS );
-
-				norm_residual = std::sqrt( norm_residual );
-
-				if( cg_opts.print_iter_residual ) {
-					std::cout << " residual " << norm_residual << std::endl;
-				}
-
-				++iter;
-				out_info.iterations = iter;
-				out_info.norm_residual = norm_residual;
-			} while( iter < cg_opts.max_iterations &&
-				norm_residual / norm_residual_initial > cg_opts.tolerance && ret == SUCCESS );
-
-			return ret;
-		}
-
 		/**
 		 * Runner object incapsulating all information to run a Conjugate Gradient solver
 		 * with multi-grid.
@@ -307,17 +111,21 @@ namespace grb {
 		 * @tparam descr descriptors with statically-known data for computation and containers
 		 */
 		template<
-			typename IOType,
-			typename NonzeroType,
-			typename InputType,
-			typename ResidualType,
+			typename MGCGTypes,
 			typename MultiGridRunnerType,
-			class Ring,
-			class Minus,
-			Descriptor descr = descriptors::no_operation
-		> struct MultiGridCGRunner {
-
+			typename TelTokenType,
+			Descriptor descr = descriptors::no_operation,
+			typename DbgOutputStreamType = grb::utils::telemetry::OutputStreamOff
+		> struct MultiGridCGRunner : public grb::utils::telemetry::Timeable< TelTokenType > {
+
+			using IOType = typename MGCGTypes::IOType;
+			using NonzeroType = typename MGCGTypes::NonzeroType;
+			using InputType = typename MGCGTypes::InputType;
+			using ResidualType = typename MGCGTypes::ResidualType;
+			using Ring = typename MGCGTypes::Ring;
+			using Minus = typename MGCGTypes::Minus;
 			using HPCGInputType = MultiGridCGData< IOType, NonzeroType, InputType >;
+			using MGRunnerType = MultiGridRunnerType;
 
 			static_assert( std::is_default_constructible< Ring >::value,
 				"cannot construct the Ring with default values" );
@@ -326,11 +134,16 @@ namespace grb {
 			static_assert( std::is_move_constructible< MultiGridRunnerType >::value,
 				"cannot construct the Multi-Grid runner by move" );
 
-			// default value: override with your own
-			CGOptions< IOType, ResidualType, Ring, Minus > cg_opts = { true, 10,
-				Ring(). template getZero< ResidualType >(), false, Ring(), Minus() };
+			Ring ring; ///< algebraic ring to be used
+			Minus minus; ///< minus operator to be used
+			bool with_preconditioning = true; ///<  whether preconditioning is enabled
+			size_t max_iterations = 10; ///< max number of allowed iterations for CG: after that, the solver is halted
+									///< and the result achieved so far returned
+			ResidualType tolerance = ring. template getZero< ResidualType >(); ///< ratio between initial residual and current residual that halts the solver
+										///< if reached, for the solution is to be considered "good enough"
 
-			MultiGridRunnerType mg_runner;
+			MultiGridRunnerType &mg_runner;
+			DbgOutputStreamType dbg_logger;
 
 			/**
 			 * Construct a new MultiGridCGRunner object by moving the required MG runner.
@@ -339,8 +152,25 @@ namespace grb {
 			 * as the state of the MG runner is managed automatically with this object.
 			 */
 			MultiGridCGRunner(
-				MultiGridRunnerType &&_mg_runner
-			) : mg_runner( std::move( _mg_runner ) ) {}
+				const TelTokenType & tt,
+				MultiGridRunnerType &_mg_runner
+			) :
+				grb::utils::telemetry::Timeable< TelTokenType >( tt ), // the token configures the Timeable base
+				mg_runner( _mg_runner ), // kept by reference: the caller's runner must outlive this object
+				dbg_logger() // this overload relies on a default-constructed debug stream
+			{
+				static_assert( std::is_default_constructible< DbgOutputStreamType >::value, "cannot default-construct the debug output stream" ); // message is mandatory before C++17
+			}
+
+			MultiGridCGRunner( // overload for callers that supply their own debug output stream
+				const TelTokenType & tt,
+				MultiGridRunnerType & _mg_runner,
+				DbgOutputStreamType & _dbg_logger
+			) :
+				grb::utils::telemetry::Timeable< TelTokenType >( tt ), // the token configures the Timeable base
+				mg_runner( _mg_runner ), // kept by reference: the caller's runner must outlive this object
+				dbg_logger( _dbg_logger ) // the stream is copied into this object, not referenced
+			{}
 
 			/**
 			 * Functional operator to invoke a full CG-MG computation.
@@ -355,7 +185,154 @@ namespace grb {
 				MultiGridCGData< IOType, NonzeroType, InputType > &cg_data,
 				CGOutInfo< ResidualType > &out_info
 			) {
-				return multigrid_conjugate_gradient< descr >( cg_data, cg_opts, grid_base, mg_runner, out_info );
+				this->start();
+				grb::RC ret = multigrid_conjugate_gradient( cg_data, grid_base, out_info );
+				this->stop();
+				return ret;
+			}
+
+			/**
+			 * Conjugate Gradient algorithm implementation augmented by a Multi-Grid solver,
+			 * inspired by the High Performance Conjugate Gradient benchmark.
+			 *
+			 * This CG solver calls the MG solver at the beginning of each iteration to improve
+			 * the initial solution via the residual (thanks to the smoother) and then proceeds with
+			 * the standard CG iteration.
+			 *
+			 * Failures of GraphBLAS operations are handled by immediately stopping the execution and by returning
+			 * the failure code.
+			 *
+			 *
+			 * @param cg_data data for the CG solver only
+			 * @param grid_base base (i.e., finer) level of the multi-grid, with the information of the physical system
+			 * @param out_info solver output information
+			 * @return grb::RC SUCCESS in case of a successful run
+			 */
+			grb::RC multigrid_conjugate_gradient(
+				HPCGInputType &cg_data,
+				typename MultiGridRunnerType::MultiGridInputType &grid_base,
+				CGOutInfo< ResidualType > &out_info
+			) {
+				const grb::Matrix< NonzeroType > &A = grid_base.A; // system matrix
+				grb::Vector< IOType > &r = grid_base.r;  // residual vector
+				grb::Vector< IOType > &z = grid_base.z;  // pre-conditioned residual vector
+				grb::Vector< IOType > &x = cg_data.x; // initial (and final) solution
+				const grb::Vector< InputType > &b = cg_data.b; // right-side value
+				grb::Vector< IOType > &p = cg_data.p;  // direction vector
+				grb::Vector< IOType > &Ap = cg_data.u; // temp vector
+				grb::RC ret = SUCCESS; // "ret ? ret : op" below skips op once ret is an error (assumes SUCCESS converts to false)
+
+				const IOType io_zero = ring.template getZero< IOType >();
+				ret = ret ? ret : grb::set( Ap, io_zero );
+				ret = ret ? ret : grb::set( r, io_zero );
+				ret = ret ? ret : grb::set( p, io_zero );
+
+				ret = ret ? ret : grb::set( p, x );
+				// Ap = A * x
+				ret = ret ? ret : grb::mxv< descr >( Ap, A, x, ring );
+				assert( ret == SUCCESS );
+				// r = b - Ap
+				ret = ret ? ret : grb::eWiseApply< descr >( r, b, Ap, minus );
+				assert( ret == SUCCESS );
+
+				const ResidualType residual_zero = ring.template getZero< ResidualType >();
+				ResidualType norm_residual = residual_zero;
+				// norm_residual = r' * r
+				ret = ret ? ret : grb::dot< descr >( norm_residual, r, r, ring );
+				assert( ret == SUCCESS );
+
+				// compute sqrt to avoid underflow
+				norm_residual = std::sqrt( norm_residual );
+
+				// initial norm of residual
+				out_info.norm_residual = norm_residual;
+				const ResidualType norm_residual_initial = norm_residual;
+				ResidualType old_r_dot_z = residual_zero, r_dot_z = residual_zero, beta = residual_zero;
+				size_t iter = 0;
+
+				dbg_logger << ">>> start p: " << p << std::endl;
+				dbg_logger << ">>> start Ap: " << Ap << std::endl;
+				dbg_logger << ">>> start r: " << r << std::endl;
+
+				do { // NOTE(review): the body runs at least once, even if the initial residual norm is already zero
+					dbg_logger << "========= iteration " << iter << " =========" << std::endl;
+
+					if( with_preconditioning ) {
+						ret = ret ? ret : mg_runner( grid_base ); // presumably refreshes grid_base.z from grid_base.r - confirm in the MG runner
+						assert( ret == SUCCESS );
+					} else {
+						// z = r
+						ret = ret ? ret : grb::set( z, r );
+						assert( ret == SUCCESS );
+					}
+					dbg_logger << ">>> initial z: " << z << std::endl;
+
+					if( iter == 0 ) {
+						//  p = z
+						ret = ret ? ret : grb::set< descr >( p, z );
+						assert( ret == SUCCESS );
+						// r_dot_z = r' * z
+						ret = ret ? ret : grb::dot< descr >( r_dot_z, r, z, ring );
+						assert( ret == SUCCESS );
+					} else {
+						old_r_dot_z = r_dot_z;
+						// r_dot_z = r' * z
+						r_dot_z = ring.template getZero< ResidualType >();
+						ret = ret ? ret : grb::dot< descr >( r_dot_z, r, z, ring );
+						assert( ret == SUCCESS );
+
+						beta = r_dot_z / old_r_dot_z; // NOTE(review): no guard against old_r_dot_z == 0
+						// Ap  = 0
+						ret = ret ? ret : grb::set< descr >( Ap, io_zero );
+						assert( ret == SUCCESS );
+						// Ap += beta * p
+						ret = ret ? ret : grb::eWiseMul< descr >( Ap, beta, p, ring );
+						assert( ret == SUCCESS );
+						// Ap = Ap + z
+						ret = ret ? ret : grb::eWiseApply< descr >( Ap, Ap, z, ring.getAdditiveOperator() );
+						assert( ret == SUCCESS );
+						// p = Ap
+						std::swap( Ap, p ); // Ap and p are references, so this exchanges the vectors cg_data.u and cg_data.p themselves
+						assert( ret == SUCCESS );
+					}
+					dbg_logger << ">>> middle p: " << p << std::endl;
+
+					// Ap = A * p
+					ret = ret ? ret : grb::set< descr >( Ap, io_zero );
+					ret = ret ? ret : grb::mxv< descr >( Ap, A, p, ring );
+					assert( ret == SUCCESS );
+					dbg_logger << ">>> middle Ap: " << Ap << std::endl;
+
+					// pAp = p' * Ap
+					ResidualType pAp = ring.template getZero< ResidualType >();
+					ret = ret ? ret : grb::dot< descr >( pAp, Ap, p, ring );
+					assert( ret == SUCCESS );
+
+					ResidualType alpha = r_dot_z / pAp; // NOTE(review): no guard against pAp == 0
+					// x += alpha * p
+					ret = ret ? ret : grb::eWiseMul< descr >( x, alpha, p, ring );
+					assert( ret == SUCCESS );
+					dbg_logger << ">>> end x: " << x << std::endl;
+
+					// r += - alpha * Ap
+					ret = ret ? ret : grb::eWiseMul< descr >( r, -alpha, Ap, ring );
+					assert( ret == SUCCESS );
+					dbg_logger << ">>> end r: " << r << std::endl;
+
+					// residual = r' * r
+					norm_residual = ring.template getZero< ResidualType >();
+					ret = ret ? ret : grb::dot< descr >( norm_residual, r, r, ring );
+					assert( ret == SUCCESS );
+
+					norm_residual = std::sqrt( norm_residual );
+
+					++iter;
+					out_info.iterations = iter;
+					out_info.norm_residual = norm_residual;
+				} while( iter < max_iterations && // stop on iteration cap, on reaching the relative tolerance, or on the first error
+					norm_residual / norm_residual_initial > tolerance && ret == SUCCESS ); // NOTE(review): divides by the initial norm; a zero initial residual is not handled
+
+				return ret;
+			}
 
 		};
diff --git a/include/graphblas/algorithms/multigrid/multigrid_data.hpp b/include/graphblas/algorithms/multigrid/multigrid_data.hpp
index 6462e4019..ed580da3d 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_data.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_data.hpp
@@ -28,6 +28,7 @@
 #include <cstddef>
 
 #include <graphblas.hpp>
+#include <graphblas/utils/telemetry/Stopwatch.hpp>
 
 
 namespace grb {
@@ -49,9 +50,12 @@ namespace grb {
 		 */
 		template<
 			typename IOType,
-			typename NonzeroType
+			typename NonzeroType,
+			typename TelTokenType
 		> struct MultiGridData {
 
+			grb::utils::telemetry::Stopwatch< TelTokenType > mg_stopwatch;
+			grb::utils::telemetry::Stopwatch< TelTokenType > sm_stopwatch;
 			const size_t level; ///< level of the grid (0 for the finest physical system)
 			const size_t system_size; ///< size of the system, i.e. side of the #A system matrix
 			grb::Matrix< NonzeroType > A; ///< system matrix
@@ -62,9 +66,12 @@ namespace grb {
 			 * Construct a new multigrid data object from level information and system size.
 			 */
 			MultiGridData(
+				const TelTokenType & _tt,
 				size_t _level,
 				size_t sys_size
 			) :
+				mg_stopwatch( _tt ),
+				sm_stopwatch( _tt ),
 				level( _level ),
 				system_size( sys_size ),
 				A( sys_size, sys_size ),
@@ -72,9 +79,10 @@ namespace grb {
 				r( sys_size ) {}
 
 			// for safety, disable copy semantics
-			MultiGridData( const MultiGridData< IOType, NonzeroType > & o ) = delete;
+			MultiGridData( const MultiGridData< IOType, NonzeroType, TelTokenType > & o ) = delete;
 
-			MultiGridData<IOType, NonzeroType > & operator=( const MultiGridData< IOType, NonzeroType > & ) = delete;
+			MultiGridData<IOType, NonzeroType, TelTokenType > & operator=(
+				const MultiGridData< IOType, NonzeroType, TelTokenType > & ) = delete;
 
 			grb::RC init_vectors( IOType zero ) {
 				grb::RC rc = grb::set( z, zero );
diff --git a/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp b/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
index 177027f3e..6ab53b469 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
@@ -33,133 +33,13 @@
 
 #include <graphblas.hpp>
 #include <graphblas/utils/iterators/IteratorValueAdaptor.hpp>
-#include <graphblas/utils/Timer.hpp>
+#include <graphblas/utils/telemetry/OutputStream.hpp>
 
 #include "multigrid_data.hpp"
 
 namespace grb {
 	namespace algorithms {
 
-		/**
-		 * Multi-grid V cycle implementation to refine a given solution.
-		 *
-		 * A full multi-grid run goes through the following steps:
-		 *
-		 * 1. calls the pre-smoother to improve on the initial solution stored into \p mgiter_begin->z
-		 * 2. coarsens the residual vector
-		 * 3. recursively solves the coarser system
-		 * 4. prolongs the coarser solution into the \p mgiter_begin->z
-		 * 5. further smooths the solution wih a post-smoother call
-		 *
-		 * The algorithm moves across grid levels via the STL-like iterators \p mgiter_begin
-		 * and \p mgiter_end and accesses the grid data via the former (using the operator \c * ): when
-		 * \p mgiter_begin \c == \p mgiter_end , a smoothing round is invoked and the recursion halted.
-		 *
-		 * Failuers of GraphBLAS operations are handled by immediately stopping the execution
-		 * and returning the failure code.
-		 *
-		 * @tparam descr descriptor for static information
-		 * @tparam IOType type of result and intermediate vectors used during computation
-		 * @tparam NonzeroType type of matrix values
-		 * @tparam MGSysIterType type of the iterator across grid levels
-		 * @tparam MGSmootherType type of the smoother runner, with prescribed methods for the various
-		 *  smoothing steps
-		 * @tparam CoarsenerType type of the coarsener runner, with prescribed methods for coarsening
-		 *  and prolongation
-		 * @tparam Ring the ring of algebraic operators zero-values
-		 * @tparam Minus the minus operator for subtractions
-		 *
-		 * @param mgiter_begin iterator pointing to the current level of the multi-grid
-		 * @param mgiter_end end iterator, indicating the end of the recursion
-		 * @param smoother callable object to invoke the smoothing steps
-		 * @param coarsener callable object to coarsen and prolong (between current and coarser grid levels)
-		 * @param ring the ring to perform the operations on
-		 * @param minus the \f$ - \f$ operator for vector subtractions
-		 * @return grb::RC if the algorithm could correctly terminate, the error code of the first
-		 *  unsuccessful operation otherwise
-		 */
-		template <
-			Descriptor descr,
-			typename IOType,
-			typename NonzeroType,
-			typename MGSysIterType,
-			typename MGSmootherType,
-			typename CoarsenerType,
-			class Ring,
-			class Minus
-		> grb::RC multi_grid(
-			MGSysIterType mgiter_begin,
-			const MGSysIterType mgiter_end,
-			MGSmootherType &smoother,
-			CoarsenerType &coarsener,
-			const Ring &ring,
-			const Minus &minus
-		) {
-			static_assert( std::is_base_of< MultiGridData< IOType, NonzeroType >,
-				typename std::decay< decltype( *mgiter_begin ) >::type >::value, "the iterator type MGSysIterType"
-				" must reference an object of type MultiGridData< IOType, NonzeroType >" );
-
-			RC ret = SUCCESS;
-			assert( mgiter_begin != mgiter_end );
-			MultiGridData< IOType, NonzeroType > &finer_system = *mgiter_begin;
-			++mgiter_begin;
-
-#ifdef HPCG_PRINT_STEPS
-			DBG_println( "mg BEGINNING {" );
-#endif
-
-			// clean destination vector
-			ret = ret ? ret : grb::set< descr >( finer_system.z, ring. template getZero< IOType >() );
-#ifdef HPCG_PRINT_STEPS
-			DBG_print_norm( finer_system.r, "initial r" );
-#endif
-			if( !( mgiter_begin != mgiter_end ) ) {
-				// compute one round of Gauss Seidel and return
-				ret = ret ? ret : smoother.nonrecursive_smooth( finer_system );
-				assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( finer_system.z, "smoothed z" );
-				DBG_println( "} mg END" );
-#endif
-				return ret;
-			}
-			MultiGridData< IOType, NonzeroType > &coarser_system = *mgiter_begin;
-
-			// pre-smoother
-			ret = ret ? ret : smoother.pre_smooth( finer_system );
-			assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-			DBG_print_norm( finer_system.z, "pre-smoothed z" );
-#endif
-
-			ret = ret ? ret : coarsener.coarsen_residual( finer_system, coarser_system );
-			assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-			DBG_print_norm( coarser_system.r, "coarse r" );
-#endif
-
-			ret = ret ? ret : multi_grid< descr, IOType, NonzeroType, MGSysIterType,
-				MGSmootherType, CoarsenerType, Ring, Minus >( mgiter_begin, mgiter_end,
-				smoother, coarsener, ring, minus );
-			assert( ret == SUCCESS );
-
-			ret = ret ? ret : coarsener.prolong_solution( coarser_system, finer_system );
-			assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-			DBG_print_norm( finer_system.z, "prolonged z" );
-#endif
-
-			// post-smoother
-			ret = ret ? ret : smoother.post_smooth( finer_system );
-			assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-			DBG_print_norm( finer_system.z, "post-smoothed z" );
-			DBG_println( "} mg END" );
-#endif
-
-			return ret;
-		}
-
 		/**
 		 * Callable object to invoke the V-cycle multi-grid algorithm, which also requires
 		 * a smoother and a coarsener object.
@@ -178,50 +58,52 @@ namespace grb {
 		 * @tparam descr descriptors with statically-known data for computation and containers
 		 */
 		template<
+			typename MGTypes,
 			typename MGSmootherType,
 			typename CoarsenerType,
-			typename IOType,
-			typename NonzeroType,
-			class Ring,
-			class Minus,
-			Descriptor descr = descriptors::no_operation
+			typename TelTokenType,
+			Descriptor descr = descriptors::no_operation,
+			typename DbgOutputStreamType = grb::utils::telemetry::OutputStreamOff
 		> struct MultiGridRunner {
 
+			using self_t = MultiGridRunner< MGTypes, MGSmootherType, CoarsenerType, TelTokenType, descr, DbgOutputStreamType >;
+			using IOType = typename MGTypes::IOType;
+			using NonzeroType = typename MGTypes::NonzeroType;
+			using Ring = typename MGTypes::Ring;
+			using Minus = typename MGTypes::Minus;
+			using MultiGridInputType = MultiGridData< IOType, NonzeroType, TelTokenType >;
+			using SmootherRunnerType = MGSmootherType;
+			using CoarsenerRunnerType = CoarsenerType;
+
 			static_assert( std::is_default_constructible< Ring >::value,
 				"cannot construct the Ring with default values" );
 			static_assert( std::is_default_constructible< Minus >::value,
 				"cannot construct the Minus operator with default values" );
-			static_assert( std::is_move_constructible< MGSmootherType >::value,
-				"MGSmootherType must be move-constructible");
-			static_assert( std::is_move_constructible< CoarsenerType >::value,
-				"CoarsenerType must be move-constructible");
-
-			using MultiGridInputType = MultiGridData< IOType, NonzeroType >;
 
 			// check the interface between HPCG and MG match
 			static_assert( std::is_base_of< typename MGSmootherType::SmootherInputType,
 				MultiGridInputType >::value, "input type of the Smoother kernel must match the input from Multi-Grid" );
 
-			MGSmootherType smoother_runner; ///< object to run the smoother
-			CoarsenerType coarsener_runner; ///< object to run the coarsener
+			MGSmootherType & smoother_runner; ///< object to run the smoother
+			CoarsenerType & coarsener_runner; ///< object to run the coarsener
+			DbgOutputStreamType dbg_logger;
+
 			std::vector< std::unique_ptr< MultiGridInputType > > system_levels; ///< levels of the grid (finest first)
-			bool print_duration = false; ///< whether to print the duration of a full multi-grid call
-			grb::utils::Timer timer;
 			Ring ring; ///< algebraic ring
 			Minus minus; ///< minus operator
 
 			// operator to extract the reference out of an std::unique_ptr object
 			struct __extractor {
-				MultiGridInputType & operator()(
+				MultiGridInputType* operator()(
 					typename std::vector< std::unique_ptr< MultiGridInputType > >::reference &ref
 				) {
-					return *ref.get();
+					return ref.get();
 				}
 
-				const MultiGridInputType & operator()(
+				const MultiGridInputType* operator()(
 					typename std::vector< std::unique_ptr< MultiGridInputType > >::const_reference &ref
 				) const {
-					return *ref.get();
+					return ref.get();
 				}
 			};
 
@@ -235,29 +117,129 @@ namespace grb {
 			 * smoother and coarsener.
 			 */
 			MultiGridRunner(
-				MGSmootherType &&_smoother_runner,
-				CoarsenerType &&_coarsener_runner
-			) : smoother_runner( std::move( _smoother_runner ) ),
-				coarsener_runner( std::move(  _coarsener_runner ) ) {}
+				MGSmootherType &_smoother_runner,
+				CoarsenerType &_coarsener_runner
+			) : smoother_runner( _smoother_runner ),
+				coarsener_runner(  _coarsener_runner )
+			{
+				static_assert( std::is_default_constructible< DbgOutputStreamType >::value, "DbgOutputStreamType must be default-constructible" );
+			}
+
+			MultiGridRunner(
+				MGSmootherType &_smoother_runner,
+				CoarsenerType &_coarsener_runner,
+				DbgOutputStreamType & _dbg_logger
+			) : smoother_runner( _smoother_runner ),
+				coarsener_runner(  _coarsener_runner ),
+				dbg_logger( _dbg_logger )
+			{}
 
 			/**
 			 * Operator to invoke a full multi-grid run starting from the given level.
 			 */
 			inline grb::RC operator()( MultiGridInputType &system ) {
-				if( print_duration ) {
-					timer.reset();
-				}
-				grb::RC ret = multi_grid< descr, IOType, NonzeroType, __unique_ptr_extractor,
-					MGSmootherType, CoarsenerType, Ring, Minus >(
-					__unique_ptr_extractor( system_levels.begin() += system.level ),
-					__unique_ptr_extractor( system_levels.end() ),
-					smoother_runner, coarsener_runner, ring, minus );
-				if( print_duration ) {
-					double duration = timer.time();
-					std::cout << " pre-conditioner (ms) "<< duration;
+				return this->operator()( __unique_ptr_extractor( system_levels.begin() += system.level ),
+					__unique_ptr_extractor( system_levels.end() ) );
+			}
+
+			inline grb::RC operator()(
+				__unique_ptr_extractor begin,
+				const __unique_ptr_extractor end
+			) {
+				begin->mg_stopwatch.start();
+				grb::RC ret = multi_grid( begin, end );
+				begin->mg_stopwatch.stop();
+				return ret;
+			}
+
+			/**
+			 * Multi-grid V cycle implementation to refine a given solution.
+			 *
+			 * A full multi-grid run goes through the following steps:
+			 *
+			 * 1. calls the pre-smoother to improve on the initial solution stored into \p mgiter_begin->z
+			 * 2. coarsens the residual vector
+			 * 3. recursively solves the coarser system
+			 * 4. prolongs the coarser solution into the \p mgiter_begin->z
+			 * 5. further smooths the solution with a post-smoother call
+			 *
+			 * The algorithm moves across grid levels via the STL-like iterators \p mgiter_begin
+			 * and \p mgiter_end and accesses the grid data via the former (using the operator \c * ): when
+			 * \p mgiter_begin \c == \p mgiter_end , a smoothing round is invoked and the recursion halted.
+			 *
+			 * Failures of GraphBLAS operations are handled by immediately stopping the execution
+			 * and returning the failure code.
+			 *
+			 * This method has no template parameters of its own and uses these definitions of the class:
+			 * - \p descr descriptor for static information
+			 * - \p IOType type of result and intermediate vectors used during computation
+			 * - \p NonzeroType type of matrix values
+			 * - \p __unique_ptr_extractor type of the iterator across grid levels
+			 * - \p MGSmootherType type of the smoother runner, with prescribed methods for the various
+			 *   smoothing steps
+			 * - \p CoarsenerType type of the coarsener runner, with prescribed methods for coarsening
+			 *   and prolongation
+			 * - \p Ring the ring of algebraic operators zero-values
+			 *
+			 * @param mgiter_begin iterator pointing to the current level of the multi-grid
+			 * @param mgiter_end end iterator, indicating the end of the recursion
+			 *
+			 * The smoother and the coarsener are not parameters: this method invokes the members
+			 * #smoother_runner and #coarsener_runner, and takes the zero value from the member #ring.
+			 *
+			 * @return grb::RC if the algorithm could correctly terminate, the error code of the first
+			 *  unsuccessful operation otherwise
+			 */
+			grb::RC multi_grid(
+				__unique_ptr_extractor mgiter_begin,
+				const __unique_ptr_extractor mgiter_end
+			) {
+				RC ret = SUCCESS;
+				assert( mgiter_begin != mgiter_end );
+				MultiGridInputType &finer_system = *mgiter_begin;
+				++mgiter_begin;
+
+				dbg_logger << "mg BEGINNING {" << std::endl;
+
+				// clean destination vector
+				ret = ret ? ret : grb::set< descr >( finer_system.z, ring. template getZero< IOType >() );
+				dbg_logger << ">>> initial r: " << finer_system.r << std::endl;
+
+				if( !( mgiter_begin != mgiter_end ) ) {
+					// compute one round of Gauss Seidel and return
+					ret = ret ? ret : smoother_runner.nonrecursive_smooth( finer_system );
+					assert( ret == SUCCESS );
+					dbg_logger << ">>> smoothed z: " << finer_system.z << std::endl;
+					dbg_logger << "} mg END" << std::endl;
+					return ret;
 				}
+				MultiGridInputType &coarser_system = *mgiter_begin;
+
+				// pre-smoother
+				ret = ret ? ret : smoother_runner.pre_smooth( finer_system );
+				assert( ret == SUCCESS );
+				dbg_logger << ">>> pre-smoothed z: " << finer_system.z << std::endl;
+
+				ret = ret ? ret : coarsener_runner.coarsen_residual( finer_system, coarser_system );
+				assert( ret == SUCCESS );
+				dbg_logger << ">>> coarse r: " << coarser_system.r << std::endl;
+
+				ret = ret ? ret : this->operator()( mgiter_begin, mgiter_end );
+				assert( ret == SUCCESS );
+
+				ret = ret ? ret : coarsener_runner.prolong_solution( coarser_system, finer_system );
+				assert( ret == SUCCESS );
+				dbg_logger << ">>> prolonged z: " << finer_system.z << std::endl;
+
+				// post-smoother
+				ret = ret ? ret : smoother_runner.post_smooth( finer_system );
+				assert( ret == SUCCESS );
+				dbg_logger << ">>> post-smoothed z: " << finer_system.z << std::endl;
+				dbg_logger << "} mg END" << std::endl;
+
 				return ret;
 			}
+
 		};
 
 	} // namespace algorithms
diff --git a/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp b/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
index 3193b46fe..02d0c5dd4 100644
--- a/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
+++ b/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
@@ -59,64 +59,125 @@ namespace grb {
 			}
 		};
 
-		namespace internal {
 
+		/**
+		 * Runner object for the RBGS smoother, with multiple methods for each type of smoothing step:
+		 * pre-, post- and non-recursive, as invoked during a full run of a multi-grid V-cycle.
+		 *
+		 * It stores the information to smooth each level of the grid, to be initialized separately.
+		 *
+		 * @tparam SmootherTypes structure with the type definitions \p IOType, \p NonzeroType and \p Ring,
+		 *  i.e., the types of vector values, of matrix values and of the algebraic ring, respectively
+		 * @tparam TelTokenType type of the telemetry token, forwarded to MultiGridData
+		 * @tparam descr descriptors with statically-known data for computation and containers
+		 */
+		template <
+			class SmootherTypes,
+			typename TelTokenType,
+			Descriptor descr = descriptors::no_operation
+		> struct RedBlackGSSmootherRunner {
+
+			using IOType = typename SmootherTypes::IOType;
+			using NonzeroType = typename SmootherTypes::NonzeroType;
+			using Ring = typename SmootherTypes::Ring;
+			using SmootherInputType = MultiGridData< IOType, NonzeroType, TelTokenType >;
+			using SmootherDataType = SmootherData< IOType >;
+
+			size_t presmoother_steps = 1UL; ///< number of pre-smoother steps
+			size_t postsmoother_steps = 1UL;  ///< number of post-smoother steps
+			size_t non_recursive_smooth_steps = 1UL;  ///< number of smoother steps for the last grid level
+			std::vector< std::unique_ptr< SmootherDataType > > levels;  ///< for each grid level,
+				///< the smoothing data (finest first)
+			Ring ring;  ///< the algebraic ring
+
+			static_assert( std::is_default_constructible< Ring >::value,
+				"cannot construct the Ring operator with default values" );
+
+
+
+			inline grb::RC pre_smooth( SmootherInputType& data ) {
+				return run_smoother( data, presmoother_steps );
+			}
+
+			inline grb::RC post_smooth( SmootherInputType& data ) {
+				return run_smoother( data, postsmoother_steps );
+			}
+
+			inline grb::RC nonrecursive_smooth( SmootherInputType& data ) {
+				return run_smoother( data, non_recursive_smooth_steps );
+			}
+
+		protected:
 			/**
-			 * Runs a single step of Red-Black Gauss-Seidel for a specific color.
+			 * Runs \p smoother_steps iteration of the Red-Black Gauss-Seidel smoother,
+			 * with inputs and outputs stored inside \p data.
 			 *
-			 * @tparam descr descriptor for static information
-			 * @tparam IOType type of result and intermediate vectors used during computation
-			 * @tparam NonzeroType type of matrix values
-			 * @tparam Ring the ring of algebraic operators zero-values
+			 * This is an internal method called by all user-facing methods, because this specific
+			 * smoother performs all smoothing steps the same way.
+			 */
+			grb::RC run_smoother(
+				SmootherInputType &data,
+				const size_t smoother_steps
+			) {
+				RC ret = SUCCESS;
+
+				SmootherDataType &smoothing_info = *( levels.at( data.level ).get() );
+
+				data.sm_stopwatch.start();
+				for( size_t i = 0; i < smoother_steps && ret == SUCCESS; i++ ) {
+					ret = ret ? ret : red_black_gauss_seidel( data, smoothing_info );
+					assert( ret == SUCCESS );
+				}
+				data.sm_stopwatch.stop();
+				return ret;
+			}
+
+			/**
+			 * Runs a single step of Red-Black Gauss-Seidel for a specific color.
 			 *
 			 * @param[in] A the system matrix
 			 * @param[in] A_diagonal a vector storing the diagonal elements of \p A
 			 * @param[in] r the residual
-			 * @param[in,out] x the initial solution to start from, and where the smoothed solution is stored to
+			 * @param[in,out] z the initial solution to start from, and where the smoothed solution is stored to
 			 * @param[out] smoother_temp a vector for temporary values
 			 * @param[in] color_mask the mask of colors to filter the rows to smooth
-			 * @param[in] ring the ring to perform the operations on
 			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
 			 *  unsuccessful operation otherwise
 			 */
-			template<
-				Descriptor descr,
-				typename IOType,
-				typename NonzeroType,
-				class Ring
-			> grb::RC rbgs_single_step(
-				const grb::Matrix< NonzeroType > & A,
-				const grb::Vector< IOType > & A_diagonal,
-				const grb::Vector< IOType > & r,
-				grb::Vector< IOType > & x,
-				grb::Vector< IOType > & smoother_temp,
-				const grb::Vector< bool > & color_mask,
-				const Ring & ring
+			grb::RC red_black_gauss_seidel_single_step(
+				SmootherInputType &data,
+				SmootherDataType &smoothing_info,
+				size_t color
 			) {
-				RC ret = SUCCESS;
-
-				// smoother_temp[color_mask] = A[color_mask] * x[color_mask]
+				const grb::Matrix< NonzeroType > & A = data.A;
+				const grb::Vector< IOType > & A_diagonal = smoothing_info.A_diagonal;
+				const grb::Vector< IOType > & r = data.r;
+				grb::Vector< IOType > & z = data.z;
+				grb::Vector< IOType > & smoother_temp = smoothing_info.smoother_temp;
+				const grb::Vector< bool > & color_mask = smoothing_info.color_masks[ color ];
+
+				// smoother_temp[color_mask] = A[color_mask] * z[color_mask]
 				// use the structural descriptors, assuming ONLY the values of the current color are set
 				// note that if this assumption does not hold, also the following eWiseLambda() is wrong
-				ret = ret ? ret : grb::mxv< grb::descriptors::safe_overlap | grb::descriptors::structural >(
-					smoother_temp, color_mask, A, x, ring );
+				RC ret = grb::mxv< grb::descriptors::safe_overlap | grb::descriptors::structural >(
+					smoother_temp, color_mask, A, z, ring );
 				assert( ret == SUCCESS );
 
 				// TODO internal issue #201
 				// Replace below with masked calls:
-				// x[mask] = r[mask] - smoother_temp[mask] + x[mask] .* diagonal[mask]
-				// x[mask] = x[maks] ./ diagonal[mask]
+				// z[mask] = r[mask] - smoother_temp[mask] + z[mask] .* diagonal[mask]
+				// z[mask] = z[mask] ./ diagonal[mask]
 				ret = ret ? ret :
 					grb::eWiseLambda(
-						[ &x, &r, &smoother_temp, &color_mask, &A_diagonal ]( const size_t i ) {
+						[ &z, &r, &smoother_temp, &color_mask, &A_diagonal ]( const size_t i ) {
 							// if the mask was properly initialized, the check on the mask value is unnecessary;
 							// if( color_mask[ i ] ) {
 							IOType d = A_diagonal[ i ];
-							IOType v = r[ i ] - smoother_temp[ i ] + x[ i ] * d;
-							x[ i ] = v / d;
+							IOType v = r[ i ] - smoother_temp[ i ] + z[ i ] * d;
+							z[ i ] = v / d;
 							// }
 						},
-						color_mask, x, r, smoother_temp, A_diagonal );
+						color_mask, z, r, smoother_temp, A_diagonal );
 				assert( ret == SUCCESS );
 				return ret;
 			}
@@ -131,25 +192,13 @@ namespace grb {
 			 * and no check is performed to ensure these assumptions hold. Hence, it is up to user logic
 			 * to pass correct coloring information. Otherwise, \b no guarantees hold on the result.
 			 *
-			 * @tparam descr descriptor for static information
-			 * @tparam IOType type of result and intermediate vectors used during computation
-			 * @tparam NonzeroType type of matrix values
-			 * @tparam Ring the ring of algebraic operators zero-values
-			 *
 			 * @param[in,out] data structure with the data of a single grid level
-			 * @param[in] ring the ring to perform the operations on
 			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
 			 *                          unsuccessful operation otherwise
 			 */
-			template<
-				Descriptor descr,
-				typename IOType,
-				typename NonzeroType,
-				class Ring
-			> grb::RC red_black_gauss_seidel(
-				MultiGridData< IOType, NonzeroType > &data,
-				SmootherData< IOType > &smoothing_info,
-				const Ring & ring
+			grb::RC red_black_gauss_seidel(
+				SmootherInputType &data,
+				SmootherDataType &smoothing_info
 			) {
 				RC ret = SUCCESS;
 				// zero the temp output just once, assuming proper masking avoids
@@ -158,88 +207,16 @@ namespace grb {
 					ring. template getZero< IOType >() );
 
 				// forward step
-				using cit_t = typename std::vector< grb::Vector< bool > >::const_iterator;
-				cit_t end = smoothing_info.color_masks.cend();
-				for( cit_t it = smoothing_info.color_masks.cbegin(); it != end && ret == SUCCESS; ++it ) {
-					ret = rbgs_single_step< descr >( data.A, smoothing_info.A_diagonal, data.r,
-						data.z, smoothing_info.smoother_temp, *it, ring );
+				for( size_t color = 0; color < smoothing_info.color_masks.size() && ret == SUCCESS; ++color ) {
+					ret = red_black_gauss_seidel_single_step( data, smoothing_info, color ); // a failure ends the sweep
 				}
 				ret = ret ? ret : grb::set< descr >( smoothing_info.smoother_temp,
 					ring. template getZero< IOType >() );
 
 				// backward step
-				using crit_t = typename std::vector< grb::Vector< bool > >::const_reverse_iterator;
-				crit_t rend = smoothing_info.color_masks.crend();
-				for( crit_t rit = smoothing_info.color_masks.crbegin(); rit != rend && ret == SUCCESS; ++rit ) {
-					ret = rbgs_single_step< descr >( data.A, smoothing_info.A_diagonal, data.r,
-						data.z, smoothing_info.smoother_temp, *rit, ring );
-				}
-				return ret;
-			}
-
-		} // namespace internal
-
-		/**
-		 * Runner object for the RBGS smoother, with multiple methods for each type of smoothing step:
-		 * pre-, post- and non-recursive, as invoked during a full run of a multi-grid V-cycle.
-		 *
-		 * It stores the information to smooth each level of the grid, to be initalized separately.
-		 *
-		 * @tparam IOType type of result and intermediate vectors used during computation
-		 * @tparam NonzeroType type of matrix values
-		 * @tparam Ring the ring of algebraic operators
-		 * @tparam descr descriptors with statically-known data for computation and containers
-		 */
-		template <
-			typename IOType,
-			typename NonzeroType,
-			class Ring,
-			Descriptor descr = descriptors::no_operation
-		> struct RedBlackGSSmootherRunner {
-
-			size_t presmoother_steps; ///< number of pre-smoother steps
-			size_t postsmoother_steps;  ///< number of post-smoother steps
-			size_t non_recursive_smooth_steps;  ///< number of smoother steps for the last grid level
-			std::vector< std::unique_ptr< SmootherData< IOType > > > levels;  ///< for each grid level,
-				///< the smoothing data (finest first)
-			Ring ring;  ///< the algebraic ring
-
-			static_assert( std::is_default_constructible< Ring >::value,
-				"cannot construct the Ring operator with default values" );
-
-			using SmootherInputType = MultiGridData< IOType, NonzeroType >;
-
-			inline grb::RC pre_smooth( SmootherInputType& data ) {
-				return __run_smoother( data, presmoother_steps );
-			}
-
-			inline grb::RC post_smooth( SmootherInputType& data ) {
-				return __run_smoother( data, postsmoother_steps );
-			}
+				for( size_t color = smoothing_info.color_masks.size(); color > 0 && ret == SUCCESS; --color ) {
+					ret = red_black_gauss_seidel_single_step( data, smoothing_info, color - 1 ); // a failure ends the sweep
 
-			inline grb::RC nonrecursive_smooth( SmootherInputType& data ) {
-				return __run_smoother( data, non_recursive_smooth_steps );
-			}
-
-			/**
-			 * Runs \p smoother_steps iteration of the Red-Black Gauss-Seidel smoother,
-			 * with inputs and outputs stored inside \p data.
-			 *
-			 * This is an internal method called by all user-facing methods, because this specific
-			 * smoother performs all smoothing steps the same way.
-			 */
-			grb::RC __run_smoother(
-				SmootherInputType &data,
-				const size_t smoother_steps
-			) {
-				RC ret = SUCCESS;
-
-				SmootherData< IOType > &smoothing_info = *( levels.at( data.level ).get() );
-
-				for( size_t i = 0; i < smoother_steps && ret == SUCCESS; i++ ) {
-					ret = ret ? ret : internal::red_black_gauss_seidel< descr >(
-						data, smoothing_info, ring );
-					assert( ret == SUCCESS );
 				}
 				return ret;
 			}
diff --git a/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp b/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
index f2b008e6f..3d1fee648 100644
--- a/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
+++ b/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
@@ -61,39 +61,82 @@ namespace grb {
 			}
 		};
 
-		namespace internal {
+		/**
+		 * Runner structure, holding the data to coarsen the levels of a multi-grid simulation.
+		 *
+		 * This coarsener just uses the same matrix to perform the coarsening (via an mxv())
+		 * and the prolongation, using it transposed.
+		 */
+		template<
+			class CoarsenerTypes,
+			typename TelTokenType,
+			Descriptor descr = descriptors::no_operation
+		> struct SingleMatrixCoarsener {
+
+			using IOType = typename CoarsenerTypes::IOType;
+			using NonzeroType = typename CoarsenerTypes::NonzeroType;
+			using Ring = typename CoarsenerTypes::Ring;
+			using Minus = typename CoarsenerTypes::Minus;
+			using MultiGridInputType = MultiGridData< IOType, NonzeroType, TelTokenType >;
+			using CoarseningDataType = CoarseningData< IOType, NonzeroType >;
+
+			static_assert( std::is_default_constructible< Ring >::value,
+				"cannot construct the Ring with default values" );
+			static_assert( std::is_default_constructible< Minus >::value,
+				"cannot construct the Minus operator with default values" );
+
+			/**
+			 * Data to coarsen each level, from finer to coarser.
+			 */
+			std::vector< std::unique_ptr< grb::algorithms::CoarseningData< IOType,
+				NonzeroType > > > coarsener_levels;
+			Ring ring;
+			Minus minus;
+
+			/**
+			 * Method required by MultiGridRunner before the recursive call, to coarsen
+			 * the residual vector of \p finer (the finer system) into the residual of
+			 * \p coarser (the coarser system).
+			 */
+			inline grb::RC coarsen_residual(
+				const MultiGridInputType &finer,
+				MultiGridInputType &coarser
+			) {
+				// first compute the residual
+				CoarseningData< IOType, NonzeroType > &coarsener = *coarsener_levels[ finer.level ];
+				grb::RC ret = grb::set< descr >( coarsener.Ax_finer, ring. template getZero< IOType >() );
+				ret = ret ? ret : grb::mxv< descr >( coarsener.Ax_finer, finer.A, finer.z, ring );
+
+				return ret ? ret : compute_coarsening( finer.r, coarser.r, coarsener );
+			}
 
+			/**
+			 * Method required by MultiGridRunner after the recursive call, to "prolong" the coarser solution
+			 * into the finer solution.
+			 */
+			inline grb::RC prolong_solution(
+				const MultiGridInputType &coarser,
+				MultiGridInputType &finer
+			) {
+				return compute_prolongation( coarser.z, finer.z, *coarsener_levels[ finer.level ] );
+			}
+
+		protected:
 			/**
 			 * computes the coarser residual vector \p CoarseningData.r by coarsening
 			 *        \p coarsening_data.Ax_finer - \p r_fine via \p coarsening_data.coarsening_matrix.
 			 *
 			 * The coarsening information are stored inside \p CoarseningData.
 			 *
-			 * @tparam descr descriptor for static information
-			 * @tparam IOType type of result and intermediate vectors used during computation
-			 * @tparam NonzeroType type of matrix values
-			 * @tparam Ring the ring of algebraic operators zero-values
-			 * @tparam Minus the minus operator for subtractions
-			 *
 			 * @param[in] r_fine fine residual vector
 			 * @param[in,out] coarsening_data \ref MultiGridData data structure storing the information for coarsening
-			 * @param[in] ring the ring to perform the operations on
-			 * @param[in] minus the \f$ - \f$ operator for vector subtractions
 			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
 			 *                          unsuccessful operation otherwise
 			 */
-			template<
-				Descriptor descr,
-				typename IOType,
-				typename NonzeroType,
-				class Ring,
-				class Minus
-			> grb::RC compute_coarsening(
+			grb::RC compute_coarsening(
 				const grb::Vector< IOType > & r_fine, // fine residual
 				grb::Vector< IOType > & r_coarse, // fine residual
-				CoarseningData< IOType, NonzeroType > & coarsening_data,
-				const Ring & ring,
-				const Minus & minus
+				CoarseningData< IOType, NonzeroType > & coarsening_data
 			) {
 				RC ret = SUCCESS;
 				ret = ret ? ret : grb::eWiseApply< descr >( coarsening_data.Ax_finer, r_fine,
@@ -110,104 +153,33 @@ namespace grb {
 
 			/**
 			 * computes the prolongation of the coarser solution \p coarsening_data.z and stores it into
-			 * \p x_fine.
+			 * \p z_fine.
 			 *
 			 * For prolongation, this function uses the matrix \p coarsening_data.coarsening_matrix by transposing it.
 			 *
-			 * @tparam descr descriptor for static information
-			 * @tparam IOType type of result and intermediate vectors used during computation
-			 * @tparam NonzeroType type of matrix values
-			 * @tparam Ring the ring of algebraic operators zero-values
-			 *
-			 * @param[out] x_fine the solution vector to store the prolonged solution into
+			 * @param[out] z_fine the solution vector to store the prolonged solution into
 			 * @param[in,out] coarsening_data information for coarsening
-			 * @param[in] ring the ring to perform the operations on
 			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
 			 * unsuccessful operation otherwise
 			 */
-			template<
-				Descriptor descr,
-				typename IOType,
-				typename NonzeroType,
-				class Ring
-			> grb::RC compute_prolongation(
+			grb::RC compute_prolongation(
 				const grb::Vector< IOType > & z_coarse,
-				grb::Vector< IOType > & x_fine, // fine residual
-				grb::algorithms::CoarseningData< IOType, NonzeroType > & coarsening_data,
-				const Ring & ring
+				grb::Vector< IOType > & z_fine, // fine solution
+				grb::algorithms::CoarseningData< IOType, NonzeroType > & coarsening_data
 			) {
 				RC ret = SUCCESS;
 				// actual refining, from  *coarsening_data->syztem_size == nrows(*coarsening_data->A) / 8
-				// to nrows(x_fine)
+				// to nrows(z_fine)
 				ret = ret ? ret : grb::set< descr >( coarsening_data.Ax_finer, ring.template getZero< IOType >() );
 
 				ret = ret ? ret : grb::mxv< descr | grb::descriptors::transpose_matrix >(
 					coarsening_data.Ax_finer, coarsening_data.coarsening_matrix, z_coarse, ring );
 				assert( ret == SUCCESS );
 
-				ret = ret ? ret : grb::foldl< descr >( x_fine, coarsening_data.Ax_finer, ring.getAdditiveMonoid() ); // x_fine += Ax_finer;
+				ret = ret ? ret : grb::foldl< descr >( z_fine, coarsening_data.Ax_finer, ring.getAdditiveMonoid() ); // z_fine += Ax_finer;
 				assert( ret == SUCCESS );
 				return ret;
 			}
-
-		} // namespace internal
-
-		/**
-		 * Runner structure, holding the data to coarsen the levels of a multi-grid simulation.
-		 *
-		 * This coarsener just uses the same matrix to perform the coarsening (via an mxv())
-		 * and the prolongation, using it transposed.
-		 */
-		template<
-			typename IOType,
-			typename NonzeroType,
-			class Ring,
-			class Minus,
-			Descriptor descr = descriptors::no_operation
-		> struct SingleMatrixCoarsener {
-
-			static_assert( std::is_default_constructible< Ring >::value,
-				"cannot construct the Ring with default values" );
-			static_assert( std::is_default_constructible< Minus >::value,
-				"cannot construct the Minus operator with default values" );
-
-			using MultiGridInputType = MultiGridData< IOType, NonzeroType >;
-
-			/**
-			 * Data to coarsen each level, from finer to coarser.
-			 */
-			std::vector< std::unique_ptr< grb::algorithms::CoarseningData< IOType,
-				NonzeroType > > > coarsener_levels;
-			Ring ring;
-			Minus minus;
-
-			/**
-			 * Method required by MultiGridRunner before the recursive call, to coarsen
-			 * the residual vector of \p finer (the finer system) into the residual of
-			 * \p coarser (the coarser system).
-			 */
-			inline grb::RC coarsen_residual(
-				const MultiGridInputType &finer,
-				MultiGridInputType &coarser
-			) {
-				// first compute the residual
-				CoarseningData< IOType, NonzeroType > &coarsener = *coarsener_levels[ finer.level ];
-				grb::RC ret = grb::set< descr >( coarsener.Ax_finer, ring. template getZero< IOType >() );
-				ret = ret ? ret : grb::mxv< descr >( coarsener.Ax_finer, finer.A, finer.z, ring );
-
-				return internal::compute_coarsening< descr >( finer.r, coarser.r, coarsener, ring, minus );
-			}
-
-			/**
-			 * Method required by MultiGridRunner after the recursive call, to "prolong" the coarser solution
-			 * into the finer solution.
-			 */
-			inline grb::RC prolong_solution(
-				const MultiGridInputType &coarser,
-				MultiGridInputType &finer
-			) {
-				return internal::compute_prolongation< descr >( coarser.z, finer.z, *coarsener_levels[ finer.level ], ring );
-			}
 		};
 
 	} // namespace algorithms
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index f7cd05787..adba0339a 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -31,33 +31,23 @@
 #include <memory>
 #include <type_traits>
 #include <algorithm>
+#include <array>
+#include <cstring>
+#include <iomanip>
+#include <locale>
 
 #include <graphblas.hpp>
 
-//========== TRACE SOLVER STEPS =========
-// to easily trace the steps of the solver, just define this symbol
-// #define HPCG_PRINT_STEPS
+#include <graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp>
+#include <graphblas/algorithms/multigrid/single_matrix_coarsener.hpp>
+#include <graphblas/algorithms/multigrid/multigrid_v_cycle.hpp>
+#include <graphblas/algorithms/multigrid/multigrid_cg.hpp>
 
-// here we define a custom macro, which enables tracing only for HPCG code
-#ifdef HPCG_PRINT_STEPS
-#include <cstdio>
-
-// HPCG_PRINT_STEPS requires defining the following symbols
-
-// prints args on a dedicated line
-#define DBG_println( args ) std::cout << args << std::endl;
-// forward declaration for the tracing facility
-template< typename T > void print_norm( const grb::Vector< T > &r, const char * head );
-// prints head and the norm of r
-#define DBG_print_norm( vec, head ) print_norm( vec, head )
-#endif
-//============================================
-
-#include <graphblas/algorithms/hpcg/hpcg.hpp>
 #include <graphblas/algorithms/multigrid/multigrid_building_utils.hpp>
 #include <graphblas/algorithms/hpcg/system_building_utils.hpp>
 
 #include <graphblas/utils/Timer.hpp>
+#include <graphblas/utils/telemetry/Telemetry.hpp>
 
 #include <utils/argument_parser.hpp>
 #include <utils/assertions.hpp>
@@ -84,33 +74,88 @@ using namespace grb;
 using namespace algorithms;
 
 static const char * const TEXT_HIGHLIGHT = "===> ";
-#define thcout ( std::cout << TEXT_HIGHLIGHT )
-#define thcerr ( std::cerr << TEXT_HIGHLIGHT )
-#define MASTER_PRINT( pid, txt ) if( pid == 0 ) { std::cout << txt; }
 
 // default types
-using IOType = double;
-using NonzeroType = double;
-using InputType = double;
-using ResidualType = double;
-using StdRing = Semiring< grb::operators::add< NonzeroType >, grb::operators::mul< NonzeroType >,
-	grb::identities::zero, grb::identities::one >;
-using StdMinus = operators::subtract< NonzeroType >;
+using value_t = double;
+
+struct HPCGTypes {
+	using IOType = value_t;
+	using NonzeroType = value_t;
+	using InputType = value_t;
+	using ResidualType = value_t;
+	using Ring = Semiring< grb::operators::add< NonzeroType >, grb::operators::mul< NonzeroType >,
+		grb::identities::zero, grb::identities::one >;
+	using Minus = operators::subtract< NonzeroType >;
+	using Divide = operators::divide< NonzeroType >;
+};
+
+using IOType = typename HPCGTypes::IOType;
+using NonzeroType = typename HPCGTypes::NonzeroType;
+using InputType = typename HPCGTypes::InputType;
+using ResidualType = typename HPCGTypes::ResidualType;
+using Ring = typename HPCGTypes::Ring;
+
 using coord_t = size_t;
+
 constexpr Descriptor hpcg_desc = descriptors::dense;
 
+DECLARE_TELEMETRY_TOKEN( DistOut )
+ACTIVATE_TOKEN( DistOut )
+using dist_token_t = TELEMETRY_TOKEN_TYPE( DistOut );
+using DistStream = grb::utils::telemetry::OutputStream< dist_token_t >;
+
+DECLARE_TELEMETRY_TOKEN( HPCGTelemetry )
+ACTIVATE_TOKEN( HPCGTelemetry )
+using hpcg_token_t = TELEMETRY_TOKEN_TYPE( HPCGTelemetry );
+
+DECLARE_TELEMETRY_TOKEN( MGTelemetry )
+ACTIVATE_TOKEN( MGTelemetry )
+using mg_token_t = TELEMETRY_TOKEN_TYPE( MGTelemetry );
+
+DECLARE_TELEMETRY_TOKEN( DBGToken )
+// ACTIVATE_TOKEN( DBGToken )
+using dbg_token_t = TELEMETRY_TOKEN_TYPE( DBGToken );
+using DBGStream = grb::utils::telemetry::OutputStream< dbg_token_t >;
+
+using duration_t = utils::telemetry::duration_nano_t;
+using hpcg_csv_t = utils::telemetry::CSVWriter< hpcg_token_t, hpcg_token_t::enabled, size_t, duration_t >;
+using mg_csv_t = utils::telemetry::CSVWriter< mg_token_t, mg_token_t::enabled, size_t, size_t, duration_t, duration_t >;
+
 // assembled types for simulation runners and input/output structures
-using hpcg_runner_t = HPCGRunnerType< hpcg_desc, IOType, NonzeroType, InputType, ResidualType,
-	StdRing, StdMinus >;
-using mg_data_t = MultiGridData< IOType, NonzeroType >;
-using coarsening_data_t = CoarseningData< IOType, NonzeroType >;
-using smoothing_data_t = SmootherData< IOType >;
-using hpcg_data_t = MultiGridCGData< IOType, NonzeroType, InputType >;
-
-static const IOType io_zero = StdRing(). template getZero< IOType >();
-static const NonzeroType nz_zero = StdRing(). template getZero< NonzeroType >();
-static const InputType input_zero = StdRing(). template getZero< InputType >();
-static const ResidualType residual_zero = StdRing(). template getZero< ResidualType >();
+using smoother_runner_t = grb::algorithms::RedBlackGSSmootherRunner< HPCGTypes, mg_token_t, hpcg_desc >;
+using smoothing_data_t = typename smoother_runner_t::SmootherDataType;
+
+using coarsener_runner_t = grb::algorithms::SingleMatrixCoarsener< HPCGTypes, mg_token_t, hpcg_desc >;
+using coarsening_data_t = typename coarsener_runner_t::CoarseningDataType;
+
+using mg_runner_t = MultiGridRunner< HPCGTypes, smoother_runner_t, coarsener_runner_t, mg_token_t, hpcg_desc, DBGStream >;
+using mg_data_t = typename mg_runner_t::MultiGridInputType;
+
+using hpcg_runner_t = MultiGridCGRunner< HPCGTypes, mg_runner_t, hpcg_token_t, hpcg_desc, DBGStream >;
+using hpcg_data_t = typename hpcg_runner_t::HPCGInputType;
+
+struct dotter : grb::utils::telemetry::OutputStreamLazy {
+	const grb::Vector< IOType > &v;
+	dotter( const grb::Vector< IOType > &_v ) : v( _v ) {}
+	ResidualType operator()() const {
+		Ring ring;
+		ResidualType r = 0;
+		grb::dot( r, v, v, ring );
+		return r;
+	}
+};
+
+static inline DBGStream & operator<<( DBGStream & stream, const grb::Vector< IOType > & v ) {
+	stream << std::setprecision( 7 );
+	return stream << dotter( v );
+}
+
+static const IOType io_zero = Ring(). template getZero< IOType >();
+static const NonzeroType nz_zero = Ring(). template getZero< NonzeroType >();
+static const InputType input_zero = Ring(). template getZero< InputType >();
+static const ResidualType residual_zero = Ring(). template getZero< ResidualType >();
+
+static constexpr size_t MAX_CSV_PATH_LENGTH = 255;
 
 /**
  * Container for the parameters for the HPCG simulation.
@@ -126,7 +171,18 @@ struct simulation_input {
 	size_t smoother_steps;
 	bool evaluation_run;
 	bool no_preconditioning;
-	bool print_iter_stats;
+	// logging options: these are serializable for launcher invocation
+	std::array< char, MAX_CSV_PATH_LENGTH + 1 > hpcg_csv;
+	std::array< char, MAX_CSV_PATH_LENGTH + 1 > mg_csv;
+	bool hpcg_log;
+	bool mg_log;
+
+	simulation_input() {
+		hpcg_csv[ 0 ] = '\0';
+		mg_csv[ 0 ] = '\0';
+	}
+
+	simulation_input( const simulation_input & ) = default;
 };
 
 /**
@@ -155,35 +211,6 @@ static void print_system(
 }
 #endif
 
-//========== ROUTINES TO TRACE SOLVER STEPS =========
-#ifdef HPCG_PRINT_STEPS
-template<
-	typename T,
-	class Ring
-> void print_norm( const grb::Vector< T > & r, const char * head, const Ring & ring ) {
-	T norm = ring. template getZero< T >();
-	RC ret = grb::dot( norm, r, r, ring ); // norm = r' * r;
-	(void)ret;
-	assert( ret == SUCCESS );
-	if( spmd<>::pid() != 0 ) {
-		return;
-	}
-	// printf makes more likely to get single lineas in output with multiple processes
-	// additionally, it doesn't approximate double values
-	if( head != nullptr ) {
-		printf(">>> %s: %lf\n", head, norm );
-	} else {
-		printf(">>> %lf\n", norm );
-	}
-}
-
-template< typename T > void print_norm( const grb::Vector< T > & r, const char * head ) {
-	return print_norm( r, head, StdRing() );
-}
-#endif
-//============================================
-
-
 /**
  * Allocates the data structure input to the various simulation steps (CG, multi-grid, coarsening, smoothing)
  * for each level of the multi-grid. The input is the vector of system sizes \p mg_sizes, with sizes in
@@ -212,25 +239,26 @@ template< typename T > T static next_pow_2( T n ) {
  * explained in \ref multigrid_allocate_data().
  */
 static void allocate_system_structures(
-	const std::vector< size_t > &mg_sizes,
 	std::vector< std::unique_ptr< mg_data_t > > &system_levels,
 	std::vector< std::unique_ptr< coarsening_data_t > > &coarsener_levels,
 	std::vector< std::unique_ptr< smoothing_data_t > > &smoother_levels,
-	std::unique_ptr< hpcg_data_t > &cg_system_data
+	std::unique_ptr< hpcg_data_t > &cg_system_data,
+	const std::vector< size_t > &mg_sizes,
+	const mg_token_t & mg_token,
+	DistStream & logger
 ) {
-	const size_t pid = spmd<>::pid() ;
 	grb::utils::Timer timer;
 
 	hpcg_data_t *data = new hpcg_data_t( mg_sizes[ 0 ] );
 	cg_system_data = std::unique_ptr< hpcg_data_t >( data );
-	MASTER_PRINT( pid, "allocating data for the MultiGrid simulation...");
+	logger << "allocating data for the MultiGrid simulation...";
 	timer.reset();
-	multigrid_allocate_data( mg_sizes, system_levels, coarsener_levels, smoother_levels );
+	multigrid_allocate_data( system_levels, coarsener_levels, smoother_levels, mg_sizes, mg_token );
 	double time = timer.time();
-	MASTER_PRINT( pid, " time (ms) " << time << std::endl )
+	logger << " time (ms) " << time << std::endl;
 
 	// zero all vectors
-	MASTER_PRINT( pid, "zeroing all vectors...");
+	logger << "zeroing all vectors...";
 	timer.reset();
 	grb::RC rc = data->init_vectors( io_zero );
 	ASSERT_RC_SUCCESS( rc );
@@ -241,23 +269,25 @@ static void allocate_system_structures(
 	std::for_each( smoother_levels.begin(), smoother_levels.end(),
 		[]( std::unique_ptr< smoothing_data_t > &s) { ASSERT_RC_SUCCESS( s->init_vectors( io_zero ) ); } );
 	time = timer.time();
-	MASTER_PRINT( pid, " time (ms) " << time << std::endl );
+	logger << " time (ms) " << time << std::endl;
 }
 
+
 /**
  * Builds and initializes a 3D system for an HPCG simulation according to the given 3D system sizes.
  * It allocates the data structures and populates them according to the algorithms chosen for HPCG.
  */
 static void build_3d_system(
-	const simulation_input & in,
 	std::vector< std::unique_ptr< mg_data_t > > &system_levels,
 	std::vector< std::unique_ptr< coarsening_data_t > > &coarsener_levels,
 	std::vector< std::unique_ptr< smoothing_data_t > > &smoother_levels,
-	std::unique_ptr< hpcg_data_t > &cg_system_data
+	std::unique_ptr< hpcg_data_t > &cg_system_data,
+	const simulation_input & in,
+	const mg_token_t & tt,
+	DistStream & logger
 ) {
 	constexpr size_t DIMS = 3;
 	using builder_t = grb::algorithms::HPCGSystemBuilder< DIMS, coord_t, NonzeroType >;
-	const size_t pid = spmd<>::pid();
 	grb::utils::Timer timer;
 
 	HPCGSystemParams< DIMS, NonzeroType > params = {
@@ -266,22 +296,21 @@ static void build_3d_system(
 	};
 
 	std::vector< builder_t > mg_generators;
-	MASTER_PRINT( pid, "building HPCG generators for " << ( in.max_coarsening_levels + 1 )
-		<< " levels..." );
+	logger << "building HPCG generators for " << ( in.max_coarsening_levels + 1 ) << " levels...";
 	timer.reset();
 	// construct the builder_t generator for each grid level, which depends on the system physics
 	hpcg_build_multigrid_generators( params, mg_generators );
 	double time = timer.time();
-	MASTER_PRINT( pid, " time (ms) " << time << std::endl );
-	MASTER_PRINT( pid, "built HPCG generators for " << mg_generators.size()
-		<< " levels" << std::endl );
+	logger << " time (ms) " << time << std::endl;
+	logger << "built HPCG generators for " << mg_generators.size()
+		<< " levels" << std::endl;
 
 	// extract the size for each level
 	std::vector< size_t > mg_sizes;
 	std::transform( mg_generators.cbegin(), mg_generators.cend(), std::back_inserter( mg_sizes  ),
 		[] ( const builder_t &b ) { return b.system_size(); } );
 	// given the sizes, allocate the data structures for all the inputs of the algorithms
-	allocate_system_structures( mg_sizes, system_levels, coarsener_levels, smoother_levels, cg_system_data );
+	allocate_system_structures( system_levels, coarsener_levels, smoother_levels, cg_system_data, mg_sizes, tt, logger );
 	assert( mg_generators.size() == system_levels.size() );
 	assert( mg_generators.size() == smoother_levels.size() );
 	assert( mg_generators.size() - 1 == coarsener_levels.size() ); // coarsener acts between two levels
@@ -289,29 +318,29 @@ static void build_3d_system(
 	// for each grid level, populate the data structures according to the specific algorithm
 	// and track the time for diagnostics purposes
 	for( size_t i = 0; i < mg_generators.size(); i++) {
-		MASTER_PRINT( pid, "SYSTEM LEVEL " << i << std::endl );
+		logger << "SYSTEM LEVEL " << i << std::endl;
 		auto& sizes = mg_generators[ i ].get_generator().get_sizes();
-		MASTER_PRINT( pid, " sizes: " );
+		logger << " sizes: ";
 		for( size_t s = 0; s < DIMS - 1; s++ ) {
-			MASTER_PRINT( pid,sizes[ s ] << " x " );
+			logger <<sizes[ s ] << " x ";
 		}
-		MASTER_PRINT( pid, sizes[ DIMS - 1 ] << std::endl );
-		MASTER_PRINT( pid, " populating system matrix: " );
+		logger << sizes[ DIMS - 1 ] << std::endl;
+		logger << " populating system matrix: ";
 		timer.reset();
-		grb::RC rc = hpcg_populate_system_matrix( mg_generators[ i ], system_levels.at(i)->A );
+		grb::RC rc = hpcg_populate_system_matrix( mg_generators[ i ], system_levels.at(i)->A, logger );
 		time = timer.time();
 		ASSERT_RC_SUCCESS( rc );
-		MASTER_PRINT( pid, " time (ms) " << time << std::endl )
+		logger << " time (ms) " << time << std::endl;
 
-		MASTER_PRINT( pid, " populating smoothing data: " );
+		logger << " populating smoothing data: ";
 		timer.reset();
-		rc = hpcg_populate_smoothing_data( mg_generators[ i ], *smoother_levels[ i ] );
+		rc = hpcg_populate_smoothing_data( mg_generators[ i ], *smoother_levels[ i ], logger );
 		time = timer.time();
 		ASSERT_RC_SUCCESS( rc );
-		MASTER_PRINT( pid, " time (ms) " << time << std::endl )
+		logger << " time (ms) " << time << std::endl;
 
 		if( i > 0 ) {
-			MASTER_PRINT( pid, " populating coarsening data: " );
+			logger << " populating coarsening data: ";
 			timer.reset();
 			if( !in.use_average_coarsener ) {
 				rc = hpcg_populate_coarsener( mg_generators[ i - 1 ], mg_generators[ i ], *coarsener_levels[ i - 1 ] );
@@ -320,11 +349,12 @@ static void build_3d_system(
 			}
 			time = timer.time();
 			ASSERT_RC_SUCCESS( rc );
-			MASTER_PRINT( pid, " time (ms) " << time << std::endl )
+			logger << " time (ms) " << time << std::endl;
 		}
 	}
 }
 
+
 /**
  * Main test, building an HPCG problem and running the simulation closely following the
  * parameters in the reference HPCG test.
@@ -333,25 +363,44 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	// get user process ID
 	const size_t pid = spmd<>::pid();
 	grb::utils::Timer timer;
-	MASTER_PRINT( pid, "beginning input generation..." << std::endl );
+
+	dist_token_t dist( pid == 0 );
+	class MyNumPunct : public std::numpunct<char> {
+	// protected:
+		char do_thousands_sep() const override { return '\''; }
+		std::string do_grouping() const override { return "\03"; }
+	};
+	std::locale old_locale = std::cout.imbue( std::locale( std::cout.getloc(), new MyNumPunct ) );
+	DistStream logger( dist, std::cout );
+
+	logger << "beginning input generation..." << std::endl;
 
 	// wrap hpcg_data inside a unique_ptr to forget about cleaning chores
 	std::unique_ptr< hpcg_data_t > hpcg_state;
 
+	// log HPCG by default on master
+	hpcg_token_t hpcg_token( pid == 0 );
+	// log Mg and smoother only if the user requested it
+	mg_token_t mg_token( pid == 0 && in.mg_log );
+
+	dbg_token_t dbg_token( pid == 0 );
+	DBGStream dbg_stream( dbg_token, std::cout );
+
 	// define the main HPCG runner and initialize the options of its components
-	hpcg_runner_t hpcg_runner( build_hpcg_runner< hpcg_desc, IOType, NonzeroType, InputType, ResidualType,
-		StdRing, StdMinus >( in.smoother_steps ) );
-	auto &mg_runner = hpcg_runner.mg_runner;
-	auto &coarsener = mg_runner.coarsener_runner;
-	auto &smoother = mg_runner.smoother_runner;
-	hpcg_runner.cg_opts.tolerance = residual_zero;
-	hpcg_runner.cg_opts.with_preconditioning = ! in.no_preconditioning;
+	coarsener_runner_t coarsener;
+	smoother_runner_t smoother;
+	smoother.presmoother_steps = smoother.postsmoother_steps = in.smoother_steps;
+	smoother.non_recursive_smooth_steps = 1UL;
+	mg_runner_t mg_runner( smoother, coarsener, dbg_stream );
+	hpcg_runner_t hpcg_runner( hpcg_token, mg_runner, dbg_stream );
+	hpcg_runner.tolerance = residual_zero;
+	hpcg_runner.with_preconditioning = ! in.no_preconditioning;
 
 	timer.reset();
 	// build the entire multi-grid system
-	build_3d_system( in, mg_runner.system_levels, coarsener.coarsener_levels, smoother.levels, hpcg_state );
+	build_3d_system( mg_runner.system_levels, coarsener.coarsener_levels, smoother.levels, hpcg_state, in, mg_token, logger );
 	double input_duration = timer.time();
-	MASTER_PRINT( pid, "input generation time (ms): " << input_duration << std::endl );
+	logger << "input generation time (ms): " << input_duration << std::endl;
 
 #ifdef HPCG_PRINT_SYSTEM
 	if( pid == 0 ) {
@@ -367,7 +416,7 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	// set vectors as from standard HPCG benchmark
 	set( x, 1.0 );
 	set( b, nz_zero );
-	rc = grb::mxv( b, A, x, StdRing() );
+	rc = grb::mxv( b, A, x, Ring() );
 	set( x, io_zero );
 
 #ifdef HPCG_PRINT_SYSTEM
@@ -382,32 +431,43 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	mg_data_t &grid_base = *mg_runner.system_levels[ 0 ];
 
 	// do a cold run to warm the system up
-	MASTER_PRINT( pid, TEXT_HIGHLIGHT << "beginning cold run..." << std::endl );
-	hpcg_runner.cg_opts.max_iterations = 1;
+	logger << TEXT_HIGHLIGHT << "beginning cold run..." << std::endl;
+	hpcg_runner.max_iterations = 1;
 	timer.reset();
 	rc = hpcg_runner( grid_base, *hpcg_state, out.cg_out );
 	double iter_duration = timer.time();
 	ASSERT_RC_SUCCESS( rc );
-	MASTER_PRINT( pid, " time (ms): " << iter_duration << std::endl );
+	logger << " time (ms): " << iter_duration << std::endl;
 
 	// restore CG options to user-given values
-	hpcg_runner.cg_opts.max_iterations = in.max_iterations;
-	hpcg_runner.cg_opts.print_iter_residual = in.print_iter_stats;
-	mg_runner.print_duration = in.print_iter_stats;
-	MASTER_PRINT( pid, TEXT_HIGHLIGHT << "beginning solver..." << std::endl );
+	hpcg_runner.max_iterations = in.max_iterations;
+	logger << TEXT_HIGHLIGHT << "beginning solver..." << std::endl;
 	out.inner_test_repetitions = 0;
 	out.times.useful = 0.0;
+
+	hpcg_csv_t hpcg_csv( hpcg_token, { "repetition", "time" } );
+	mg_csv_t mg_csv( mg_token, { "repetition", "level", "mg time", "smoother time" } );
+
 	// do benchmark
 	for( size_t i = 0; i < in.inner_test_repetitions; ++i ) {
 		rc = set( x, io_zero );
 		ASSERT_RC_SUCCESS( rc );
-		MASTER_PRINT( pid, TEXT_HIGHLIGHT << "beginning iteration: " << i << std::endl );
+		logger << TEXT_HIGHLIGHT << "beginning iteration: " << i << std::endl;
 		timer.reset();
 		rc = hpcg_runner( grid_base, *hpcg_state, out.cg_out );
 		iter_duration = timer.time();
 		out.times.useful += iter_duration;
 		ASSERT_RC_SUCCESS( rc );
-		MASTER_PRINT( pid, "repetition,duration (ms): " << i << "," << iter_duration << std::endl );
+		hpcg_csv.add_line( i, hpcg_runner.getElapsedNano() );
+		logger << "repetition,duration (ns): " << hpcg_csv.last_line() << std::endl;
+		for( const auto & mg_level : mg_runner.system_levels ) {
+			mg_csv.add_line( i, mg_level->level, mg_level->mg_stopwatch.getElapsedNano(),
+				mg_level->sm_stopwatch.getElapsedNano() );
+			mg_level->mg_stopwatch.reset();
+			mg_level->sm_stopwatch.reset();
+		}
+		hpcg_runner.reset();
+
 		out.inner_test_repetitions++;
 	}
 	if( in.evaluation_run ) {
@@ -417,8 +477,9 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	}
 	out.times.useful /= static_cast< double >( in.inner_test_repetitions );
 
-	MASTER_PRINT( pid, TEXT_HIGHLIGHT << "repetitions,average time (ms): " << out.inner_test_repetitions
-				<< ", " << out.times.useful << std::endl );
+	logger << TEXT_HIGHLIGHT << "repetitions,average time (ms): " << out.inner_test_repetitions
+				<< ", " << out.times.useful << std::endl;
+	std::cout.imbue( old_locale );
 
 	// start postamble
 	timer.reset();
@@ -426,16 +487,26 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	out.error_code = rc;
 
 	grb::set( b, 1.0 );
-	grb::eWiseMul( b, -1.0, x, StdRing() );
+	grb::eWiseMul( b, -1.0, x, Ring() );
 	out.square_norm_diff = nz_zero;
-	grb::dot( out.square_norm_diff, b, b, StdRing() );
+	grb::dot( out.square_norm_diff, b, b, Ring() );
 
 	// output
 	out.pinnedVector.reset( new PinnedVector< NonzeroType >( x, SEQUENTIAL ) );
 	// finish timing
 	out.times.postamble = timer.time();
+
+	// write measurements into CSV files
+	if ( in.hpcg_log ) {
+		hpcg_csv.write_to_file( in.hpcg_csv.data() );
+	}
+	if ( in.mg_log ) {
+		mg_csv.write_to_file( in.mg_csv.data() );
+	}
 }
 
+#define thcout ( std::cout << TEXT_HIGHLIGHT )
+
 /**
  * Parser the command-line arguments to extract the simulation information and checks they are valid.
  */
@@ -456,7 +527,6 @@ int main( int argc, char ** argv ) {
 	thcout << "Max iterations: " << sim_in.max_iterations << std::endl;
 	thcout << "Direct launch: " << std::boolalpha << sim_in.evaluation_run << std::noboolalpha << std::endl;
 	thcout << "No conditioning: " << std::boolalpha << sim_in.no_preconditioning << std::noboolalpha << std::endl;
-	thcout << "Print iteration residual: " << std::boolalpha << sim_in.print_iter_stats << std::noboolalpha << std::endl;
 	thcout << "Smoother steps: " << sim_in.smoother_steps << std::endl;
 	thcout << "Test outer iterations: " << test_outer_iterations << std::endl;
 	thcout << "Maximum norm for residual: " << max_diff_norm << std::endl;
@@ -500,19 +570,19 @@ int main( int argc, char ** argv ) {
 	// check result vector, stored inside a pinned vector
 	ASSERT_TRUE( out.pinnedVector );
 	const PinnedVector< double > &solution = *out.pinnedVector;
-	thcout << "Size of x is " << solution.size() << std::endl;
-	ASSERT_GT( solution.size(), 0 );
-	print_vector( solution, 30, "SOLUTION" );
+	ASSERT_EQ( solution.size(), sim_in.nx * sim_in.ny * sim_in.nz );
 
 	// check norm of solution w.r.t. expected solution (i.e. vector of all 1)
 	double diff_norm = sqrt( out.square_norm_diff );
-	thcout << "Norm of difference vector |<exact solution> - <actual solution>|: " << diff_norm << std::endl;
+	thcout << "Norm of difference vector: |<exact solution> - <actual solution>| = " << diff_norm << std::endl;
 	ASSERT_LT( diff_norm, max_diff_norm );
 
 	thcout << "Test OK" << std::endl;
 	return 0;
 }
 
+static const char * const empty = "";
+
 static void parse_arguments(
 	simulation_input & sim_in,
 	size_t & outer_iterations,
@@ -520,8 +590,9 @@ static void parse_arguments(
 	int argc,
 	char ** argv
 ) {
-
 	argument_parser parser;
+	const char * hpcg_csv, * mg_csv;
+
 	parser.add_optional_argument( "--nx", sim_in.nx, PHYS_SYSTEM_SIZE_DEF, "physical system size along x" )
 		.add_optional_argument( "--ny", sim_in.ny, PHYS_SYSTEM_SIZE_DEF, "physical system size along y" )
 		.add_optional_argument( "--nz", sim_in.nz, PHYS_SYSTEM_SIZE_DEF, "physical system size along z" )
@@ -543,8 +614,10 @@ static void parse_arguments(
 			"launch single run directly, without benchmarker (ignore repetitions)" )
 		.add_option( "--no-preconditioning", sim_in.no_preconditioning, false,
 			"do not apply pre-conditioning via multi-grid V cycle" )
-		.add_option( "--print-iter-stats", sim_in.print_iter_stats, false,
-			"on each iteration, print more statistics" )
+		.add_optional_argument( "--hpcg-csv", hpcg_csv , empty,
+			"file for HPCG run measurements (overwrites any previous)" )
+		.add_optional_argument( "--mg-csv", mg_csv , empty,
+			"file for Multigrid run measurements (overwrites any previous)" )
 		.add_option( "--use-average-coarsener", sim_in.use_average_coarsener, false,
 			"coarsen by averaging instead of by sampling a single point (slower, but more accurate)" );
 
@@ -564,6 +637,7 @@ static void parse_arguments(
 		std::exit( -1 );
 	}
 
+	// check sizes
 	const size_t max_system_divider = 1 << sim_in.max_coarsening_levels;
 	for( size_t s : { sim_in.nx, sim_in.ny, sim_in.nz } ) {
 		std::lldiv_t div_res = std::div( static_cast< long long >( s ), static_cast< long long >( max_system_divider ) );
@@ -582,4 +656,22 @@ static void parse_arguments(
 			std::exit( -1 );
 		}
 	}
+
+	// check output CSVs: a non-empty path enables the log and must fit the fixed-size buffer
+	size_t len = std::strlen( hpcg_csv );
+	if( ( sim_in.hpcg_log = len > 0 ) ) {
+		if ( len > MAX_CSV_PATH_LENGTH ) {
+			std::cerr << "HPCG CSV file name is too long!" << std::endl;
+			std::exit( -1 );
+		}
+		std::strncpy( sim_in.hpcg_csv.data(), hpcg_csv, MAX_CSV_PATH_LENGTH + 1 ); // NUL-pads: always terminated
+	}
+	len = std::strlen( mg_csv );
+	if( ( sim_in.mg_log = len > 0 ) ) {
+		if ( len > MAX_CSV_PATH_LENGTH ) {
+			std::cerr << "Multigrid CSV file name is too long!" << std::endl;
+			std::exit( -1 );
+		}
+		std::strncpy( sim_in.mg_csv.data(), mg_csv, MAX_CSV_PATH_LENGTH + 1 ); // NUL-pads: always terminated
+	}
 }

From 74283e6209a71460df21f634240dd611010ab6c5 Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Thu, 2 Mar 2023 15:26:46 +0100
Subject: [PATCH 21/28] renaming telemetry API elements

---
 .../multigrid/multigrid_building_utils.hpp    |  14 +-
 .../algorithms/multigrid/multigrid_cg.hpp     |  14 +-
 .../algorithms/multigrid/multigrid_data.hpp   |  20 +-
 .../multigrid/multigrid_v_cycle.hpp           |   8 +-
 .../multigrid/red_black_gauss_seidel.hpp      |  12 +-
 .../multigrid/single_matrix_coarsener.hpp     |   7 +-
 .../graphblas/utils/telemetry/CSVWriter.hpp   | 145 ++++----
 .../utils/telemetry/OutputStream.hpp          |  28 +-
 .../graphblas/utils/telemetry/Stopwatch.hpp   |  40 +--
 .../graphblas/utils/telemetry/Telemetry.hpp   |   2 +-
 .../utils/telemetry/TelemetryBase.hpp         |  50 +--
 .../utils/telemetry/TelemetryController.hpp   | 319 ++++++++++++++++++
 .../utils/telemetry/TelemetryToken.hpp        | 145 --------
 .../graphblas/utils/telemetry/Timeable.hpp    |  18 +-
 tests/smoke/hpcg.cpp                          | 121 ++++---
 15 files changed, 539 insertions(+), 404 deletions(-)
 create mode 100644 include/graphblas/utils/telemetry/TelemetryController.hpp
 delete mode 100644 include/graphblas/utils/telemetry/TelemetryToken.hpp

diff --git a/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp b/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp
index ad09f4c9f..f46b8e558 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp
@@ -56,24 +56,26 @@ namespace grb {
 		 *  the coarser system matrix (in this order)
 		 * @tparam SmootherInfoType type holding the information for the smoother;
 		 *  its constructor must take in input the size of the system matrix for that level
+		 * @tparam TelControllerType telemetry controller type, to (de)activate time measurement at compile-time
 		 *
 		 * @param mg_sizes sizes of the system matrix for each level of the multi-grid
 		 * @param system_levels system data (system matrix, residual, solution, ...) for each level
 		 * @param coarsener_levels at position \a i of this vector, data to coarsen from level \a i
 		 *  (system size \p mg_sizes [i] ) to level \a i+1 (system size \p mg_sizes [i+1] )
 		 * @param smoother_levels smoother data for each level
+		 * @param tt telemetry controller to control time tracing
 		 */
 		template<
 			typename MGInfoType,
 			typename CoarsenerInfoType,
 			typename SmootherInfoType,
-			typename TelTokenType
+			typename TelControllerType
 		> void multigrid_allocate_data(
-			std::vector< std::unique_ptr< MGInfoType > > &system_levels,
-			std::vector< std::unique_ptr< CoarsenerInfoType > > &coarsener_levels,
-			std::vector< std::unique_ptr< SmootherInfoType > > &smoother_levels,
-			const std::vector< size_t > &mg_sizes,
-			const TelTokenType & tt
+			std::vector< std::unique_ptr< MGInfoType > > & system_levels,
+			std::vector< std::unique_ptr< CoarsenerInfoType > > & coarsener_levels,
+			std::vector< std::unique_ptr< SmootherInfoType > > & smoother_levels,
+			const std::vector< size_t > & mg_sizes,
+			const TelControllerType & tt
 		) {
 			if( mg_sizes.size() == 0 ) {
 				throw std::invalid_argument( "at least one size should be available" );
diff --git a/include/graphblas/algorithms/multigrid/multigrid_cg.hpp b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
index 2bb936a1c..3099e7d4e 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
@@ -113,10 +113,10 @@ namespace grb {
 		template<
 			typename MGCGTypes,
 			typename MultiGridRunnerType,
-			typename TelTokenType,
+			typename TelControllerType,
 			Descriptor descr = descriptors::no_operation,
 			typename DbgOutputStreamType = grb::utils::telemetry::OutputStreamOff
-		> struct MultiGridCGRunner : public grb::utils::telemetry::Timeable< TelTokenType > {
+		> struct MultiGridCGRunner : public grb::utils::telemetry::Timeable< TelControllerType > {
 
 			using IOType = typename MGCGTypes::IOType;
 			using NonzeroType = typename MGCGTypes::NonzeroType;
@@ -152,10 +152,10 @@ namespace grb {
 			 * as the state of the MG runner is managed automatically with this object.
 			 */
 			MultiGridCGRunner(
-				const TelTokenType & tt,
-				MultiGridRunnerType &_mg_runner
+				const TelControllerType & tt,
+				MultiGridRunnerType & _mg_runner
 			) :
-				grb::utils::telemetry::Timeable< TelTokenType >( tt ),
+				grb::utils::telemetry::Timeable< TelControllerType >( tt ),
 				mg_runner( _mg_runner ),
 				dbg_logger()
 			{
@@ -163,11 +163,11 @@ namespace grb {
 			}
 
 			MultiGridCGRunner(
-				const TelTokenType & tt,
+				const TelControllerType & tt,
 				MultiGridRunnerType & _mg_runner,
 				DbgOutputStreamType & _dbg_logger
 			) :
-				grb::utils::telemetry::Timeable< TelTokenType >( tt ),
+				grb::utils::telemetry::Timeable< TelControllerType >( tt ),
 				mg_runner( _mg_runner ),
 				dbg_logger( _dbg_logger )
 			{}
diff --git a/include/graphblas/algorithms/multigrid/multigrid_data.hpp b/include/graphblas/algorithms/multigrid/multigrid_data.hpp
index ed580da3d..67fe7bb8f 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_data.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_data.hpp
@@ -47,17 +47,19 @@ namespace grb {
 		 * @tparam IOType Type of values of the vectors for intermediate results
 		 * @tparam NonzeroType Type of the values stored inside the system matrix \p A
 		 *                     and the coarsening matrix #Ax_finer
+		 * @tparam TelControllerType type of the telemetry controller, to (de)activate at compile-time
+		 * 	the #mg_stopwatch and #sm_stopwatch members
 		 */
 		template<
 			typename IOType,
 			typename NonzeroType,
-			typename TelTokenType
+			typename TelControllerType
 		> struct MultiGridData {
 
-			grb::utils::telemetry::Stopwatch< TelTokenType > mg_stopwatch;
-			grb::utils::telemetry::Stopwatch< TelTokenType > sm_stopwatch;
-			const size_t level; ///< level of the grid (0 for the finest physical system)
-			const size_t system_size; ///< size of the system, i.e. side of the #A system matrix
+			grb::utils::telemetry::Stopwatch< TelControllerType > mg_stopwatch; ///< stopwatch to measure the execution time in MG
+			grb::utils::telemetry::Stopwatch< TelControllerType > sm_stopwatch; ///< stopwatch to measure the execution time in the smoother
+			const size_t level;           ///< level of the grid (0 for the finest physical system)
+			const size_t system_size;     ///< size of the system, i.e. side of the #A system matrix
 			grb::Matrix< NonzeroType > A; ///< system matrix
 			grb::Vector< IOType > z; ///< multi-grid solution
 			grb::Vector< IOType > r; ///< residual
@@ -66,7 +68,7 @@ namespace grb {
 			 * Construct a new multigrid data object from level information and system size.
 			 */
 			MultiGridData(
-				const TelTokenType & _tt,
+				const TelControllerType & _tt,
 				size_t _level,
 				size_t sys_size
 			) :
@@ -79,10 +81,10 @@ namespace grb {
 				r( sys_size ) {}
 
 			// for safety, disable copy semantics
-			MultiGridData( const MultiGridData< IOType, NonzeroType, TelTokenType > & o ) = delete;
+			MultiGridData( const MultiGridData< IOType, NonzeroType, TelControllerType > & o ) = delete;
 
-			MultiGridData<IOType, NonzeroType, TelTokenType > & operator=(
-				const MultiGridData< IOType, NonzeroType, TelTokenType > & ) = delete;
+			MultiGridData< IOType, NonzeroType, TelControllerType > & operator=(
+					const MultiGridData< IOType, NonzeroType, TelControllerType > & ) = delete;
 
 			grb::RC init_vectors( IOType zero ) {
 				grb::RC rc = grb::set( z, zero );
diff --git a/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp b/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
index 6ab53b469..dbe15d2b8 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
@@ -61,17 +61,19 @@ namespace grb {
 			typename MGTypes,
 			typename MGSmootherType,
 			typename CoarsenerType,
-			typename TelTokenType,
+			typename TelControllerType,
 			Descriptor descr = descriptors::no_operation,
 			typename DbgOutputStreamType = grb::utils::telemetry::OutputStreamOff
 		> struct MultiGridRunner {
 
-			using self_t = MultiGridRunner< MGTypes, MGSmootherType, CoarsenerType, TelTokenType, descr >;
+			using self_t = MultiGridRunner< MGTypes, MGSmootherType, CoarsenerType, TelControllerType, descr >;
+			// algebraic types
 			using IOType = typename MGTypes::IOType;
 			using NonzeroType = typename MGTypes::NonzeroType;
 			using Ring = typename MGTypes::Ring;
 			using Minus = typename MGTypes::Minus;
-			using MultiGridInputType = MultiGridData< IOType, NonzeroType, TelTokenType >;
+			using MultiGridInputType = MultiGridData< IOType, NonzeroType, TelControllerType >;
+			// runners
 			using SmootherRunnerType = MGSmootherType;
 			using CoarsenerRunnerType = CoarsenerType;
 
diff --git a/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp b/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
index 02d0c5dd4..305fa30d7 100644
--- a/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
+++ b/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
@@ -66,22 +66,22 @@ namespace grb {
 		 *
 		 * It stores the information to smooth each level of the grid, to be initalized separately.
 		 *
-		 * @tparam IOType type of result and intermediate vectors used during computation
-		 * @tparam NonzeroType type of matrix values
-		 * @tparam Ring the ring of algebraic operators
+		 * @tparam SmootherTypes container of algebraic types for the smoother (IOType, NonzeroType, Ring, Minus)
+		 * @tparam TelControllerType telemetry controller to (de)activate time tracing within passed MultiGridData objects
 		 * @tparam descr descriptors with statically-known data for computation and containers
 		 */
 		template <
 			class SmootherTypes,
-			typename TelTokenType,
+			typename TelControllerType,
 			Descriptor descr = descriptors::no_operation
 		> struct RedBlackGSSmootherRunner {
 
 			using IOType = typename SmootherTypes::IOType;
 			using NonzeroType = typename SmootherTypes::NonzeroType;
 			using Ring = typename SmootherTypes::Ring;
-			using SmootherInputType = MultiGridData< IOType, NonzeroType, TelTokenType >;
-			using SmootherDataType = SmootherData< IOType >;
+			using Minus = typename SmootherTypes::Minus;
+			using SmootherInputType = MultiGridData< IOType, NonzeroType, TelControllerType >; ///< external input structure
+			using SmootherDataType = SmootherData< IOType >; ///< smoothing information and temporary variables (per MG level)
 
 			size_t presmoother_steps = 1UL; ///< number of pre-smoother steps
 			size_t postsmoother_steps = 1UL;  ///< number of post-smoother steps
diff --git a/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp b/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
index 3d1fee648..0e2ee58af 100644
--- a/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
+++ b/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
@@ -69,7 +69,7 @@ namespace grb {
 		 */
 		template<
 			class CoarsenerTypes,
-			typename TelTokenType,
+			typename TelControllerType,
 			Descriptor descr = descriptors::no_operation
 		> struct SingleMatrixCoarsener {
 
@@ -77,8 +77,9 @@ namespace grb {
 			using NonzeroType = typename CoarsenerTypes::NonzeroType;
 			using Ring = typename CoarsenerTypes::Ring;
 			using Minus = typename CoarsenerTypes::Minus;
-			using MultiGridInputType = MultiGridData< IOType, NonzeroType, TelTokenType >;
-			using CoarseningDataType = CoarseningData< IOType, NonzeroType >;
+
+			using MultiGridInputType = MultiGridData< IOType, NonzeroType, TelControllerType >; ///< input data from MG
+			using CoarseningDataType = CoarseningData< IOType, NonzeroType >; ///< internal data with coarsening information
 
 			static_assert( std::is_default_constructible< Ring >::value,
 				"cannot construct the Ring with default values" );
diff --git a/include/graphblas/utils/telemetry/CSVWriter.hpp b/include/graphblas/utils/telemetry/CSVWriter.hpp
index 969b73be8..94a7111b6 100644
--- a/include/graphblas/utils/telemetry/CSVWriter.hpp
+++ b/include/graphblas/utils/telemetry/CSVWriter.hpp
@@ -23,15 +23,15 @@
 #ifndef _H_GRB_UTILS_TELEMETRY_CSV_WRITER
 #define _H_GRB_UTILS_TELEMETRY_CSV_WRITER
 
-#include <type_traits>
-#include <tuple>
-#include <vector>
-#include <string>
+#include <fstream>
 #include <initializer_list>
 #include <ostream>
 #include <stdexcept>
-#include <fstream>
+#include <string>
+#include <tuple>
+#include <type_traits>
 #include <utility>
+#include <vector>
 
 #include "TelemetryBase.hpp"
 
@@ -41,46 +41,35 @@ namespace grb {
 
 			static constexpr char STD_CSV_SEP = ',';
 
-			template<
-				typename TelTokenType,
-				bool enabled,
-				class T1,
-				class ...Ts
-			> class CSVWriter : public TelemetryBase< TelTokenType, enabled > {
+			template< typename TelControllerType, bool enabled, class T1, class... Ts >
+			class CSVWriter : public TelemetryBase< TelControllerType, enabled > {
 			public:
-				template< class U, class ...Us > struct is_csv_printable {
+				template< class U, class... Us >
+				struct is_csv_printable {
 					static constexpr bool value = std::is_arithmetic< U >::value;
 				};
 
-				template< class U1, class U2, class ...Us > struct is_csv_printable< U1, U2, Us...>  {
+				template< class U1, class U2, class... Us >
+				struct is_csv_printable< U1, U2, Us... > {
 					static constexpr bool value = is_csv_printable< U1 >::value && is_csv_printable< U2, Us... >::value;
 				};
 
 				static_assert( is_csv_printable< T1, Ts... >::value, "not all types are printable" );
 
-				using self_t = CSVWriter< TelTokenType, enabled, T1, Ts... >;
+				using self_t = CSVWriter< TelControllerType, enabled, T1, Ts... >;
 
-				using base_t = TelemetryBase< TelTokenType, enabled >;
+				using base_t = TelemetryBase< TelControllerType, enabled >;
 
 				CSVWriter() = delete;
 
-				CSVWriter(
-					const TelTokenType & tt,
-					std::initializer_list< const char * > _headers,
-					char _separator,
-					size_t size
-				) :
-					base_t( tt )
-				 {
-					( void ) tt;
-					( void ) _headers;
-					( void ) _separator;
-					( void ) size;
+				CSVWriter( const TelControllerType & tt, std::initializer_list< const char * > _headers, char _separator, size_t size ) : base_t( tt ) {
+					(void)tt;
+					(void)_headers;
+					(void)_separator;
+					(void)size;
 				}
 
-				CSVWriter( const TelTokenType & tt, std::initializer_list< const char * > _headers ) :
-					CSVWriter( tt, _headers, STD_CSV_SEP, 10 )
-				{}
+				CSVWriter( const TelControllerType & tt, std::initializer_list< const char * > _headers ) : CSVWriter( tt, _headers, STD_CSV_SEP, 10 ) {}
 
 				CSVWriter( const self_t & ) = delete;
 
@@ -90,8 +79,8 @@ namespace grb {
 
 				self_t & operator=( self_t && ) = delete;
 
-				template< class... UTypes > void add_line( UTypes&&... ) {
-				}
+				template< class... UTypes >
+				void add_line( UTypes &&... ) {}
 
 				void clear() {}
 
@@ -100,37 +89,37 @@ namespace grb {
 				}
 
 				// print nothing
-				char last_line() const { return '\0'; }
+				char last_line() const {
+					return '\0';
+				}
 
 				std::ostream & write_to_stream( std::ostream & stream ) const {
 					return stream;
 				}
 
 				void write_to_file( const char * name ) const {
-					( void ) name;
+					(void)name;
 				}
 			};
 
-
-			template<
-				typename TelTokenType,
-				class T1,
-				class ...Ts
-			> class CSVWriter< TelTokenType, true, T1, Ts... >  : public TelemetryBase< TelTokenType, true > {
+			template< typename TelControllerType, class T1, class... Ts >
+			class CSVWriter< TelControllerType, true, T1, Ts... > : public TelemetryBase< TelControllerType, true > {
 			public:
-				template< class U, class ...Us > struct is_csv_printable {
+				template< class U, class... Us >
+				struct is_csv_printable {
 					static constexpr bool value = std::is_arithmetic< U >::value;
 				};
 
-				template< class U1, class U2, class ...Us > struct is_csv_printable< U1, U2, Us...>  {
+				template< class U1, class U2, class... Us >
+				struct is_csv_printable< U1, U2, Us... > {
 					static constexpr bool value = is_csv_printable< U1 >::value && is_csv_printable< U2, Us... >::value;
 				};
 
 				static_assert( is_csv_printable< T1, Ts... >::value, "not all types are printable" );
 
-				using self_t = CSVWriter< TelTokenType, true, T1, Ts... >;
+				using self_t = CSVWriter< TelControllerType, true, T1, Ts... >;
 
-				using base_t = TelemetryBase< TelTokenType, true >;
+				using base_t = TelemetryBase< TelControllerType, true >;
 
 				class CSVLastTuple {
 				public:
@@ -138,10 +127,7 @@ namespace grb {
 
 					CSVLastTuple( const CSVLastTuple & clt ) : csv( clt.csv ) {}
 
-					inline friend std::ostream & operator<<(
-						std::ostream & stream,
-						const CSVLastTuple & t
-					) {
+					inline friend std::ostream & operator<<( std::ostream & stream, const CSVLastTuple & t ) {
 						return t.csv.write_last_line_to_stream( stream );
 					}
 
@@ -151,15 +137,7 @@ namespace grb {
 
 				CSVWriter() = delete;
 
-				CSVWriter(
-					const TelTokenType & tt,
-					std::initializer_list< const char * > _headers,
-					char _separator,
-					size_t size
-				) :
-					base_t( tt ),
-					separator( _separator )
-				{
+				CSVWriter( const TelControllerType & tt, std::initializer_list< const char * > _headers, char _separator, size_t size ) : base_t( tt ), separator( _separator ) {
 					if( _headers.size() != NUM_FIELDS ) {
 						throw std::runtime_error( "wrong number of headers, it must match the unmber of line elements" );
 					}
@@ -168,17 +146,15 @@ namespace grb {
 					for( const auto & h : _headers ) {
 						headers.emplace_back( h );
 					}
-					if ( !tt.is_active() ) {
+					if( ! tt.is_active() ) {
 						return;
 					}
 					lines.reserve( size );
 					// zero to force physical allocation
-					//std::memset( reinterpret_cast< void * >( lines.data() ), 0, lines.size() * sizeof( tuple_t ) );
+					// std::memset( reinterpret_cast< void * >( lines.data() ), 0, lines.size() * sizeof( tuple_t ) );
 				}
 
-				CSVWriter( const TelTokenType & tt, std::initializer_list< const char * > _headers ) :
-					CSVWriter( tt, _headers, STD_CSV_SEP, 10 )
-				{}
+				CSVWriter( const TelControllerType & tt, std::initializer_list< const char * > _headers ) : CSVWriter( tt, _headers, STD_CSV_SEP, 10 ) {}
 
 				CSVWriter( const self_t & ) = delete;
 
@@ -188,9 +164,10 @@ namespace grb {
 
 				self_t & operator=( self_t && ) = delete;
 
-				template< class... UTypes > void add_line( UTypes&&... vs ) {
-					if ( this->is_active() ) {
-						lines.emplace_back( std::forward<UTypes>( vs )...  );
+				template< class... UTypes >
+				void add_line( UTypes &&... vs ) {
+					if( this->is_active() ) {
+						lines.emplace_back( std::forward< UTypes >( vs )... );
 					}
 				}
 
@@ -199,21 +176,21 @@ namespace grb {
 				}
 
 				std::ostream & write_last_line_to_stream( std::ostream & stream ) const {
-					if ( lines.size() > 0 && this->is_active() ) {
+					if( lines.size() > 0 && this->is_active() ) {
 						write_line( stream, lines.back() );
 					}
 					return stream;
 				}
 
 				CSVLastTuple last_line() const {
-					if ( lines.size() == 0 ) {
+					if( lines.size() == 0 ) {
 						throw std::runtime_error( "no measures" );
 					}
 					return CSVLastTuple( *this );
 				}
 
 				std::ostream & write_to_stream( std::ostream & stream ) const {
-					if ( !this->is_active() ) {
+					if( ! this->is_active() ) {
 						return stream;
 					}
 					write_header( stream );
@@ -226,11 +203,11 @@ namespace grb {
 				}
 
 				void write_to_file( const char * name ) const {
-					if ( !this->is_active() ) {
+					if( ! this->is_active() ) {
 						return;
 					}
 					std::ofstream file( name );
-					if( !file.is_open() ) {
+					if( ! file.is_open() ) {
 						throw std::runtime_error( "cannot open file" );
 					}
 					write_to_stream( file );
@@ -261,33 +238,25 @@ namespace grb {
 				}
 
 				// recursive case
-				template< size_t OFFS > inline void write_val(
-					std::ostream & stream,
-					typename std::enable_if< OFFS < NUM_FIELDS - 1, const tuple_t &>::type _tup
-				) const {
+				template< size_t OFFS >
+				inline void write_val( std::ostream & stream, typename std::enable_if < OFFS< NUM_FIELDS - 1, const tuple_t & >::type _tup ) const {
 					stream << std::get< OFFS >( _tup ) << separator;
 					write_val< OFFS + 1 >( stream, _tup ); // tail recursion
 				}
 
 				// base case
-				template< size_t OFFS > inline void write_val(
-					std::ostream & stream,
-					typename std::enable_if< OFFS == NUM_FIELDS - 1, const tuple_t &>::type _tup
-				) const {
-					(void) separator;
+				template< size_t OFFS >
+				inline void write_val( std::ostream & stream, typename std::enable_if< OFFS == NUM_FIELDS - 1, const tuple_t & >::type _tup ) const {
+					(void)separator;
 					stream << std::get< OFFS >( _tup );
 				}
-
 			};
 
-			template<
-				class T1,
-				class ...Ts
-			> using StaticCSVWriter = CSVWriter< TelemetryTokenAlwaysOn, true, T1, Ts... >;
-
-		}
-	}
-}
+			template< class T1, class... Ts >
+			using StaticCSVWriter = CSVWriter< TelemetryControllerAlwaysOn, true, T1, Ts... >;
 
+		} // namespace telemetry
+	}     // namespace utils
+} // namespace grb
 
 #endif // _H_GRB_UTILS_TELEMETRY_CSV_WRITER
diff --git a/include/graphblas/utils/telemetry/OutputStream.hpp b/include/graphblas/utils/telemetry/OutputStream.hpp
index 35622b11a..8ec0606d7 100644
--- a/include/graphblas/utils/telemetry/OutputStream.hpp
+++ b/include/graphblas/utils/telemetry/OutputStream.hpp
@@ -57,16 +57,16 @@ namespace grb {
 			};
 
 			template<
-				typename TelTokenType,
-				bool enabled = TelTokenType::enabled
-			> class OutputStream : public TelemetryBase< TelTokenType, enabled > {
+				typename TelControllerType,
+				bool enabled = TelControllerType::enabled
+			> class OutputStream : public TelemetryBase< TelControllerType, enabled > {
 			public:
-				using self_t = OutputStream< TelTokenType, enabled >;
+				using self_t = OutputStream< TelControllerType, enabled >;
 
 				OutputStream() = default;
 
-				OutputStream( const TelTokenType & _tt, std::ostream & _out ) :
-					TelemetryBase< TelTokenType, enabled >( _tt )
+				OutputStream( const TelControllerType & _tt, std::ostream & _out ) :
+					TelemetryBase< TelControllerType, enabled >( _tt )
 				{
 					( void ) _out;
 				}
@@ -96,15 +96,15 @@ namespace grb {
 				}
 			};
 
-			template< typename TelTokenType > class OutputStream< TelTokenType, true > :
-				public TelemetryBase< TelTokenType, true > {
+			template< typename TelControllerType > class OutputStream< TelControllerType, true > :
+				public TelemetryBase< TelControllerType, true > {
 			public:
-				using self_t = OutputStream< TelTokenType, true >;
+				using self_t = OutputStream< TelControllerType, true >;
 
-				using base_t = TelemetryBase< TelTokenType, true >;
+				using base_t = TelemetryBase< TelControllerType, true >;
 
-				OutputStream( const TelTokenType & _tt, std::ostream & _out ) :
-					TelemetryBase< TelTokenType, true >( _tt ),
+				OutputStream( const TelControllerType & _tt, std::ostream & _out ) :
+					TelemetryBase< TelControllerType, true >( _tt ),
 					out( _out )
 				{}
 
@@ -142,9 +142,9 @@ namespace grb {
 				std::ostream & out;
 			};
 
-			using OutputStreamOff = OutputStream< TelemetryTokenAlwaysOff, false >;
+			using OutputStreamOff = OutputStream< TelemetryControllerAlwaysOff, false >;
 
-			using OutputStreamOn = OutputStream< TelemetryTokenAlwaysOn, true >;
+			using OutputStreamOn = OutputStream< TelemetryControllerAlwaysOn, true >;
 		}
 	}
 }
diff --git a/include/graphblas/utils/telemetry/Stopwatch.hpp b/include/graphblas/utils/telemetry/Stopwatch.hpp
index 2cc900b61..1faa2e186 100644
--- a/include/graphblas/utils/telemetry/Stopwatch.hpp
+++ b/include/graphblas/utils/telemetry/Stopwatch.hpp
@@ -48,19 +48,12 @@ namespace grb {
 				static inline duration_float_t nano2Sec( duration_nano_t nano ) {
 					return static_cast< duration_float_t >( nano ) / 1000000000UL;
 				}
-
 			};
 
-			template<
-				typename TelTokenType,
-				bool enabled = TelTokenType::enabled
-			> class Stopwatch:
-				public StopwatchBase, public TelemetryBase< TelTokenType, enabled > {
+			template< typename TelControllerType, bool enabled = TelControllerType::enabled >
+			class Stopwatch : public StopwatchBase, public TelemetryBase< TelControllerType, enabled > {
 			public:
-				Stopwatch( const TelTokenType & tt ) :
-					StopwatchBase(),
-					TelemetryBase< TelTokenType, enabled >( tt )
-					{}
+				Stopwatch( const TelControllerType & tt ) : StopwatchBase(), TelemetryBase< TelControllerType, enabled >( tt ) {}
 
 				Stopwatch( const Stopwatch & ) = default;
 
@@ -79,11 +72,8 @@ namespace grb {
 				}
 			};
 
-
-			template<
-				typename TelTokenType
-			> class Stopwatch< TelTokenType, true >:
-				public StopwatchBase, public TelemetryBase< TelTokenType, true > {
+			template< typename TelControllerType >
+			class Stopwatch< TelControllerType, true > : public StopwatchBase, public TelemetryBase< TelControllerType, true > {
 
 				typedef typename std::chrono::high_resolution_clock clock_t;
 
@@ -96,23 +86,19 @@ namespace grb {
 				time_point_t beginning;
 
 			public:
-				Stopwatch( const TelTokenType & tt ) :
-					StopwatchBase(),
-					TelemetryBase< TelTokenType, true >( tt ),
-					elapsedTime( duration_t::zero() )
-					{}
+				Stopwatch( const TelControllerType & tt ) : StopwatchBase(), TelemetryBase< TelControllerType, true >( tt ), elapsedTime( duration_t::zero() ) {}
 
 				Stopwatch( const Stopwatch & s ) = default;
 
 				inline void start() {
-					if ( this->is_active() ) {
+					if( this->is_active() ) {
 						beginning = clock_t::now();
 					}
 				}
 
 				inline duration_nano_t stop() {
 					duration_nano_t count = 0;
-					if ( this->is_active() ) {
+					if( this->is_active() ) {
 						time_point_t end = clock_t::now();
 						duration_t d = end - beginning;
 						count = d.count();
@@ -123,7 +109,7 @@ namespace grb {
 
 				inline duration_nano_t reset() {
 					duration_t r = duration_t::zero();
-					if ( this->is_active() ) {
+					if( this->is_active() ) {
 						r = elapsedTime;
 						elapsedTime = duration_t::zero();
 					}
@@ -135,9 +121,9 @@ namespace grb {
 				}
 			};
 
-			using StaticStopwatch = Stopwatch< TelemetryTokenAlwaysOn, true >;
-		}
-	}
-}
+			using StaticStopwatch = Stopwatch< TelemetryControllerAlwaysOn, true >;
+		} // namespace telemetry
+	}     // namespace utils
+} // namespace grb
 
 #endif // _H_GRB_UTILS_TELEMETRY_STOPWATCH
diff --git a/include/graphblas/utils/telemetry/Telemetry.hpp b/include/graphblas/utils/telemetry/Telemetry.hpp
index f8369d1d1..0bb35909b 100644
--- a/include/graphblas/utils/telemetry/Telemetry.hpp
+++ b/include/graphblas/utils/telemetry/Telemetry.hpp
@@ -23,7 +23,7 @@
 #ifndef _H_GRB_UTILS_TELEMETRY_TELEMETRY
 #define _H_GRB_UTILS_TELEMETRY_TELEMETRY
 
-#include "TelemetryToken.hpp"
+#include "TelemetryController.hpp"
 #include "Stopwatch.hpp"
 #include "Timeable.hpp"
 #include "CSVWriter.hpp"
diff --git a/include/graphblas/utils/telemetry/TelemetryBase.hpp b/include/graphblas/utils/telemetry/TelemetryBase.hpp
index 969f93213..fcb9f5105 100644
--- a/include/graphblas/utils/telemetry/TelemetryBase.hpp
+++ b/include/graphblas/utils/telemetry/TelemetryBase.hpp
@@ -17,31 +17,37 @@
 
 /*
  * @author Alberto Scolari
- * @date 14th February, 2023
+ * @date 1st March, 2023
  */
 
 #ifndef _H_GRB_UTILS_TELEMETRY_TELEMETRY_BASE
 #define _H_GRB_UTILS_TELEMETRY_TELEMETRY_BASE
 
-#include "TelemetryToken.hpp"
+#include "TelemetryController.hpp"
 
 namespace grb {
 	namespace utils {
 		namespace telemetry {
 
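+			/**
+			 * Base class for telemetry functionalities, whose activation is driven by a telemetry controller.
+			 *
+			 * @tparam TelControllerType type of the telemetry controller; it must satisfy is_telemetry_controller
+			 * @tparam enabled compile-time switch for telemetry, by default taken from TelControllerType::enabled
+			 */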
 			template<
-				typename TelTokenType,
-				bool enabled = TelTokenType::enabled
+				typename TelControllerType,
+				bool enabled = TelControllerType::enabled
 			> class TelemetryBase {
 			public:
-				static_assert( is_telemetry_token< TelTokenType >::value,
-					"type TelTokenType does not implement Telemetry Token interface" );
+				static_assert( is_telemetry_controller< TelControllerType >::value,
+					"type TelControllerType does not implement Telemetry Controller interface" );
 
-				using self_t = TelemetryBase< TelTokenType, enabled >;
+				using self_t = TelemetryBase< TelControllerType, enabled >;
 
 				TelemetryBase() = default;
 
-				TelemetryBase( const TelTokenType & tt ) {
+				TelemetryBase( const TelControllerType & tt ) {
 					( void ) tt;
 				}
 
@@ -54,35 +60,35 @@ namespace grb {
 
 
 			template<
-				typename TelTokenType
-			> class TelemetryBase< TelTokenType, true > {
+				typename TelControllerType
+			> class TelemetryBase< TelControllerType, true > {
 
-				const TelTokenType & telemetry_token;
+				const TelControllerType & telemetry_Controller;
 
 			public:
-				static_assert( is_telemetry_token< TelTokenType >::value,
-					"type TelTokenType does not implement Telemetry Token interface" );
+				static_assert( is_telemetry_controller< TelControllerType >::value,
+					"type TelControllerType does not implement Telemetry Controller interface" );
 
-				using self_t = TelemetryBase< TelTokenType, true >;
+				using self_t = TelemetryBase< TelControllerType, true >;
 
-				TelemetryBase( const TelTokenType & tt ): telemetry_token( tt ) {}
+				TelemetryBase( const TelControllerType & tt ): telemetry_Controller( tt ) {}
 
-				TelemetryBase( const self_t & tb ) : telemetry_token( tb.telemetry_token ) {}
+				TelemetryBase( const self_t & tb ) : telemetry_Controller( tb.telemetry_Controller ) {}
 
 				self_t & operator=( const self_t & ) = delete;
 
-				bool is_active() const { return telemetry_token.is_active(); }
+				bool is_active() const { return telemetry_Controller.is_active(); }
 			};
 
 			// always actibe base, especially for prototyping scenarios
-			template<> class TelemetryBase< TelemetryTokenAlwaysOn, true > {
+			template<> class TelemetryBase< TelemetryControllerAlwaysOn, true > {
 			public:
-				static_assert( is_telemetry_token< TelemetryTokenAlwaysOn >::value,
-					"type TelTokenType does not implement Telemetry Token interface" );
+				static_assert( is_telemetry_controller< TelemetryControllerAlwaysOn >::value,
+					"type TelControllerType does not implement Telemetry Controller interface" );
 
-				using self_t = TelemetryBase< TelemetryTokenAlwaysOn, true >;
+				using self_t = TelemetryBase< TelemetryControllerAlwaysOn, true >;
 
-				TelemetryBase( const TelemetryTokenAlwaysOn & tt ) { (void) tt; }
+				TelemetryBase( const TelemetryControllerAlwaysOn & tt ) { (void) tt; }
 
 				TelemetryBase( const self_t & tb ) = default;
 
diff --git a/include/graphblas/utils/telemetry/TelemetryController.hpp b/include/graphblas/utils/telemetry/TelemetryController.hpp
new file mode 100644
index 000000000..63a013eab
--- /dev/null
+++ b/include/graphblas/utils/telemetry/TelemetryController.hpp
@@ -0,0 +1,319 @@
+
+/*
+ *   Copyright 2023 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Alberto Scolari
+ * @date 1st March, 2023
+ *
+ * This file defines the basic functionalities for <b>Telemetry Controllers</b>, i.e.,
+ * objects that enable/disable telemetry at compile-time and runtime.
+ *
+ * A telemetry controller can be \b enabled (at compile-time) to produce the code for telemetry and must be
+ * \b activated at runtime to emit actual telemetry information. Activation depends on runtime information
+ * (e.g., user's input) and may change dynamically \a after the controller is instantiated.
+ * If a controller is \b disabled, no telemetry code is generated at compile-time in any compliant telemetry functionality;
+ * hence, any (de)activation of a disabled telemetry controller is simply ignored and produces no result.
+ * In any case, the code must compile under all conditions, in order to avoid verbose
+ * pre-processing \a #if conditions.
+ *
+ * A typical instantiation of a telemetry controller in a user's application looks as follows:
+ *
+ * \code{.cpp}
+ * ENABLE_TELEMETRY_CONTROLLER( my_controller_t )
+ * DEFINE_TELEMETRY_CONTROLLER( my_controller_t )
+ *
+ * int main() {
+ * 		my_controller_t my_controller( true );
+ * 		if( my_controller.is_active() ) {
+ * 			std::cout << "my_controller is active";
+ * 		} else {
+ * 			std::cout << "my_controller is NOT active";
+ * 			if( !my_controller_t::enabled ) {
+ * 				std::cout << ", because it was deactivated at compile-time";
+ * 			}
+ * 		}
+ * 		std::cout << std::endl;
+ * 		return 0;
+ * }
+ * \endcode
+ *
+ * where the enabling directive \a ENABLE_TELEMETRY_CONTROLLER is present only if the controller
+ * is to be enabled. Users should indeed comment/uncomment this directive to disable/enable telemetry
+ * while debugging, or may add extra pre-processing logic to control it during compilation, like
+ *
+ * \code{.cpp}
+ * #ifdef __I_WANT_my_controller_t_ENABLED__
+ * 		ENABLE_TELEMETRY_CONTROLLER( my_controller_t )
+ * #endif
+ * DEFINE_TELEMETRY_CONTROLLER( my_controller_t )
+ * \endcode
+ *
+ * Note that the \a ENABLE_TELEMETRY_CONTROLLER directive (if present) must come \b before the
+ * \a DEFINE_TELEMETRY_CONTROLLER directive, otherwise compilation errors occur.
+ */
+
+#ifndef _H_GRB_UTILS_TELEMETRY_TELEMETRY_CONTROLLER
+#define _H_GRB_UTILS_TELEMETRY_TELEMETRY_CONTROLLER
+
+#include <type_traits>
+#include <utility> // std::declval< T >()
+
+namespace grb {
+	namespace utils {
+		namespace telemetry {
+
+			/**
+			 * Returns whether a telemetry controller is enabled <b>at compile-time</b>. By default
+			 * it is \b not.
+			 *
+			 * @tparam T type associated to the telemetry controller
+			 * @return true never
+			 * @return false always
+			 */
+			template< typename T > constexpr bool is_controller_enabled() { return false; }
+
+			/**
+			 * Class that encapsulates the logic to enable/disable telemetry at compile-time
+			 * or at runtime.
+			 *
+			 * Telemetry can be completely disabled at compile-time (e.g., to avoid any code generation
+			 * and overhead) or can be controlled at runtime, based on external conditions (e.g.,
+			 * user's input, cluster node number, ...).
+			 *
+			 * In the following, the field #enabled encodes the compile-time information, while
+			 * the field \a active (if present) and the corresponding getter #is_active() tell
+			 * whether the controller is \a active at runtime. Hence, users of telemetry controllers should always
+			 * use the #is_active() method to check whether telemetry is active, while implementations
+			 * of telemetry controllers should implement this method also based on the value of the #enabled
+			 * field, possibly "short-circuiting" when #enabled is \a false. This implementation does
+			 * exactly this, disabling telemetry at compile-time and ignoring any runtime information.
+			 *
+			 * @tparam en whether telemetry is enabled (\p en = \a true has a dedicated template specialization)
+			 */
+			template< bool en > class TelemetryControllerBase {
+			public:
+				using self_t = TelemetryControllerBase< en >;
+
+				/**
+				 * Construct a new Telemetry Controller Base object with runtime information.
+				 *
+				 * Here, runtime information is ignored, as this implementation disables any telemetry.
+				 *
+				 * @param _enabled whether telemetry is runtime-enabled (ignored here)
+				 */
+				TelemetryControllerBase( bool _enabled ) {
+					(void) _enabled;
+				}
+
+				TelemetryControllerBase() = delete;
+
+				TelemetryControllerBase( const self_t & ) = delete;
+
+				TelemetryControllerBase& operator=( const self_t & ) = delete;
+
+				/**
+				 * Whether telemetry is runtime-active.
+				 *
+				 * @return true never here
+				 * @return false always
+				 */
+				constexpr bool inline is_active() const { return false; }
+
+				/**
+				 * Set the active status of the telemetry controller.
+				 *
+				 * This \a disabled implementation ignores the input \p _active.
+				 */
+				void inline set_active( bool _active ) {
+					( void ) _active;
+				}
+
+				/**
+				 * Whether telemetry is enabled at compile-time (never here).
+				 */
+				static constexpr bool enabled = false;
+			};
+
+			/**
+			 * Convenience definition for an always-off telemetry controller.
+			 */
+			using TelemetryControllerAlwaysOff = TelemetryControllerBase< false >;
+
+			/**
+			 * Template specialization for compile-time enabled telemetry, which
+			 * can be controlled at runtime.
+			 *
+			 * The controller is always \b enabled at compile-time, and its \a active status can be controlled
+			 * at runtime via the constructor and the #set_active(bool) method.
+			 */
+			template<> class TelemetryControllerBase< true > {
+			public:
+				using self_t = TelemetryControllerBase< true >;
+
+				/**
+				 * Construct a new Telemetry Controller Base object, specifying the \a active state.
+				 *
+				 * @param _active whether the controller is \a active or not
+				 */
+				TelemetryControllerBase( bool _active ) : active( _active ) {}
+
+				TelemetryControllerBase() = delete;
+
+				TelemetryControllerBase( const self_t & ) = default;
+
+				TelemetryControllerBase& operator=( const self_t & ) = delete;
+
+				/**
+				 * Tells whether the controller is \a active.
+				*/
+				bool is_active() const { return this->active; }
+
+				/**
+				 * Set the \a active status of the controller at runtime.
+				 *
+				 * @param _active whether to activate the controller
+				 */
+				void inline set_active( bool _active ) {
+					this->active = _active;
+				}
+
+				/**
+				 * Whether telemetry is enabled at compile-time (here always).
+				*/
+				static constexpr bool enabled = true;
+
+			protected:
+				bool active;
+			};
+
+			/**
+			 * Always active controller, useful especially for prototyping scenarios.
+			 */
+			class TelemetryControllerAlwaysOn {
+			public:
+				TelemetryControllerAlwaysOn( bool _enabled ) {
+					(void) _enabled;
+				}
+
+				TelemetryControllerAlwaysOn() = default;
+
+				TelemetryControllerAlwaysOn( const TelemetryControllerAlwaysOn & ) = default;
+
+				TelemetryControllerAlwaysOn& operator=( const TelemetryControllerAlwaysOn & ) = delete;
+
+				/**
+				 * Tells whether the controller is \a active, which is in this case always true.
+				*/
+				constexpr bool is_active() const { return true; }
+
+				/**
+				 * Set the active status of the telemetry controller.
+				 *
+				 * This \a always-on implementation ignores the input \p _active, as the controller is always active.
+				 */
+				void inline set_active( bool _active ) {
+					( void ) _active;
+				}
+
+				/**
+				 * Whether telemetry is enabled at compile-time (here always).
+				 */
+				static constexpr bool enabled = true;
+			};
+
+			/**
+			 * SFINAE-based structure to check whether \p T is a telemetry controller, i.e.
+			 *   - it has a \a constexpr static field named \a enabled
+			 *   - it has an \a is_active() method
+			 *   - it has a \a set_active(bool) method
+			 */
+			template< typename T > struct is_telemetry_controller {
+			private:
+				template< typename U > static constexpr bool has_enabled_field(
+					typename std::enable_if<
+						std::is_same< typename std::decay< decltype( U::enabled ) >::type, bool >::value,
+							bool * >::type
+					) {
+						return true;
+					}
+
+				template< typename U > static constexpr bool has_enabled_field( ... ) { return false; }
+
+				template< typename U > static constexpr bool has_is_active_method(
+					typename std::enable_if<
+						std::is_same< typename std::decay<decltype( std::declval< U >().is_active() )
+							>::type, bool >::value, bool * >::type
+				) {
+					return true;
+				}
+
+				template< typename U > static constexpr bool has_is_active_method( ... ) { return false; }
+
+				template< typename U > static constexpr bool has_set_active_method(
+					typename std::enable_if<
+						std::is_same< decltype( std::declval< U >().set_active( true ) ), void >::value,
+						bool * >::type
+				) {
+					return true;
+				}
+
+				template< typename U > static constexpr bool has_set_active_method( ... ) { return false; }
+
+			public:
+				static constexpr bool value = has_enabled_field< T >( nullptr )
+					&& has_is_active_method< T >( nullptr ) && has_set_active_method< T >( nullptr ) ;
+			};
+		}
+
+	}
+}
+
+// Name of the Controller Enabler, i.e., a type that controls whether a telemetry controller is enabled
+#define __TELEMETRY_CONTROLLER_ENABLER_NAME( name ) __ ## name ## _Enabler
+
+// Name of the Telemetry Controller type
+#define __TELEMETRY_CONTROLLER_NAME( name ) name ## _cls
+
+/**
+ * Defines a telemetry controller, i.e., a custom type derived from TelemetryControllerBase.
+ *
+ * This declaration requires the declaration of an associated controller enabler type, which controls
+ * whether the controller is enabled at compile-time; the controller is by default \b disabled.
+ */
+#define DEFINE_TELEMETRY_CONTROLLER( name ) 																\
+	class __TELEMETRY_CONTROLLER_ENABLER_NAME( name ) {};												\
+	using name = class __TELEMETRY_CONTROLLER_NAME( name ) :												\
+		public grb::utils::telemetry::TelemetryControllerBase<											\
+			grb::utils::telemetry::is_controller_enabled< __TELEMETRY_CONTROLLER_ENABLER_NAME( name ) >() > {	\
+	public:																							\
+		using base_t = grb::utils::telemetry::TelemetryControllerBase<									\
+			grb::utils::telemetry::is_controller_enabled< __TELEMETRY_CONTROLLER_ENABLER_NAME( name ) >() >;	\
+		__TELEMETRY_CONTROLLER_NAME( name )( bool _enabled ) : base_t( _enabled ) {}						\
+	};
+
+/**
+ * Enables a telemetry controller through its associated enabler type.
+ *
+ * Once enabled, it can be runtime activated.
+ */
+#define ENABLE_TELEMETRY_CONTROLLER( name ) class __TELEMETRY_CONTROLLER_ENABLER_NAME( name );	\
+	namespace grb { namespace utils { namespace telemetry {						\
+		template<> constexpr bool is_controller_enabled<								\
+			__TELEMETRY_CONTROLLER_ENABLER_NAME( name ) >() { return true; } 		\
+	} } }
+
+#endif // _H_GRB_UTILS_TELEMETRY_TELEMETRY_CONTROLLER
diff --git a/include/graphblas/utils/telemetry/TelemetryToken.hpp b/include/graphblas/utils/telemetry/TelemetryToken.hpp
deleted file mode 100644
index dabac3c2e..000000000
--- a/include/graphblas/utils/telemetry/TelemetryToken.hpp
+++ /dev/null
@@ -1,145 +0,0 @@
-
-/*
- *   Copyright 2023 Huawei Technologies Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * @author Alberto Scolari
- * @date 14th February, 2023
- */
-
-#ifndef _H_GRB_UTILS_TELEMETRY_TELEMETRY_TOKEN
-#define _H_GRB_UTILS_TELEMETRY_TELEMETRY_TOKEN
-
-#include <type_traits>
-#include <utility>
-
-namespace grb {
-	namespace utils {
-		namespace telemetry {
-
-			template< typename T > constexpr bool is_token_enabled() { return false; }
-
-			// OFF
-			template< bool en > class TelemetryTokenBase {
-			public:
-				using self_t = TelemetryTokenBase< en >;
-
-				TelemetryTokenBase( bool _enabled ) {
-					(void) _enabled;
-				}
-
-				TelemetryTokenBase() = delete;
-
-				TelemetryTokenBase( const self_t & ) = delete;
-
-				TelemetryTokenBase& operator=( const self_t & ) = delete;
-
-				constexpr bool is_active() const { return false; }
-
-				static constexpr bool enabled = false;
-			};
-
-			using TelemetryTokenAlwaysOff = TelemetryTokenBase< false >;
-
-			template<> class TelemetryTokenBase< true > {
-			public:
-				using self_t = TelemetryTokenBase< true >;
-
-				TelemetryTokenBase( bool _active ) : active( _active ) {}
-
-				TelemetryTokenBase() = delete;
-
-				TelemetryTokenBase( const self_t & ) = delete;
-
-				TelemetryTokenBase& operator=( const self_t & ) = delete;
-
-				bool is_active() const { return this->active; }
-
-				static constexpr bool enabled = true;
-
-			protected:
-				const bool active;
-			};
-
-			// always active token, especially for prototyping scenarios
-			class TelemetryTokenAlwaysOn {
-			public:
-				TelemetryTokenAlwaysOn( bool _enabled ) {
-					(void) _enabled;
-				}
-
-				TelemetryTokenAlwaysOn() = delete;
-
-				TelemetryTokenAlwaysOn( const TelemetryTokenAlwaysOn & ) = delete;
-
-				TelemetryTokenAlwaysOn& operator=( const TelemetryTokenAlwaysOn & ) = delete;
-
-				constexpr bool is_active() const { return true; }
-
-				static constexpr bool enabled = true;
-			};
-
-
-			template< typename T > struct is_telemetry_token {
-			private:
-				template< typename U > static constexpr bool has_enabled_field(
-					typename std::enable_if<
-						std::is_same< typename std::decay< decltype( U::enabled ) >::type, bool >::value,
-							bool * >::type
-					) {
-						return true;
-					}
-
-				template< typename U > static constexpr bool has_enabled_field( ... ) { return false; }
-
-				template< typename U > static constexpr bool has_is_active_method(
-					typename std::enable_if<
-						std::is_same< typename std::decay< decltype( std::declval< U >().is_active() ) >::type, bool >::value,
-						bool * >::type
-				) {
-					return true;
-				}
-
-				template< typename U > static constexpr bool has_is_active_method( ... ) { return false; }
-
-			public:
-				static constexpr bool value = has_enabled_field< T >( nullptr ) && has_is_active_method< T >( nullptr );
-			};
-		}
-
-	}
-}
-
-#define __TELEMETRY_TOKEN_ENABLER_NAME( name ) __ ## name ## Enabler
-#define __TELEMETRY_TOKEN_NAME( name ) name
-
-#define DECLARE_TELEMETRY_TOKEN( name ) 																			\
-	class __TELEMETRY_TOKEN_ENABLER_NAME( name ) {};																\
-	template< typename T > class __TELEMETRY_TOKEN_NAME( name ) :													\
-		public grb::utils::telemetry::TelemetryTokenBase< grb::utils::telemetry::is_token_enabled< T >() > {		\
-	public:																											\
-		using base_t = grb::utils::telemetry::TelemetryTokenBase< grb::utils::telemetry::is_token_enabled< T >() >;	\
-		__TELEMETRY_TOKEN_NAME( name )( bool _enabled ) : base_t( _enabled ) {}										\
-	};
-
-
-#define ACTIVATE_TOKEN( name ) namespace grb { namespace utils { namespace telemetry {					\
-	template<> constexpr bool is_token_enabled< __TELEMETRY_TOKEN_ENABLER_NAME( name ) >() { return true; } \
-} } }
-
-#define TELEMETRY_TOKEN_TYPE( name ) __TELEMETRY_TOKEN_NAME( name )< __TELEMETRY_TOKEN_ENABLER_NAME( name ) >
-
-#endif // _H_GRB_UTILS_TELEMETRY_TELEMETRY_TOKEN
diff --git a/include/graphblas/utils/telemetry/Timeable.hpp b/include/graphblas/utils/telemetry/Timeable.hpp
index 02dd85b9e..95d1bdfa2 100644
--- a/include/graphblas/utils/telemetry/Timeable.hpp
+++ b/include/graphblas/utils/telemetry/Timeable.hpp
@@ -30,13 +30,13 @@ namespace grb {
 		namespace telemetry {
 
 			template<
-				typename TelTokenType,
-				bool enabled = TelTokenType::enabled
+				typename TelControllerType,
+				bool enabled = TelControllerType::enabled
 			> class Timeable {
 			public:
-				using self_t = Timeable< TelTokenType, enabled >;
+				using self_t = Timeable< TelControllerType, enabled >;
 
-				Timeable( const TelTokenType & tt ) {
+				Timeable( const TelControllerType & tt ) {
 					(void) tt;
 				}
 
@@ -61,11 +61,11 @@ namespace grb {
 
 			};
 
-			template< typename TelTokenType > class Timeable< TelTokenType, true > {
+			template< typename TelControllerType > class Timeable< TelControllerType, true > {
 			public:
-				using self_t = Timeable< TelTokenType, true >;
+				using self_t = Timeable< TelControllerType, true >;
 
-				Timeable( const TelTokenType & tt ) : swatch( tt ) {}
+				Timeable( const TelControllerType & tt ) : swatch( tt ) {}
 
 				Timeable( const self_t & ) = default;
 
@@ -89,10 +89,10 @@ namespace grb {
 				}
 
 			private:
-				Stopwatch< TelTokenType > swatch;
+				Stopwatch< TelControllerType > swatch;
 			};
 
-			using StaticTimeable = Timeable< TelemetryTokenAlwaysOn, true >;
+			using StaticTimeable = Timeable< TelemetryControllerAlwaysOn, true >;
 
 		}
 	}
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index adba0339a..07c38cc99 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -99,39 +99,43 @@ using coord_t = size_t;
 
 constexpr Descriptor hpcg_desc = descriptors::dense;
 
-DECLARE_TELEMETRY_TOKEN( DistOut )
-ACTIVATE_TOKEN( DistOut )
-using dist_token_t = TELEMETRY_TOKEN_TYPE( DistOut );
-using DistStream = grb::utils::telemetry::OutputStream< dist_token_t >;
+// telemetry control: controllers and output stream types for telemetry
+// they can be enabled/disabled at compile-time by uncommenting/commenting the respective ENABLE_TELEMETRY_CONTROLLER() macro
+ENABLE_TELEMETRY_CONTROLLER( dist_controller_t )
+DEFINE_TELEMETRY_CONTROLLER( dist_controller_t )
+using DistStream = grb::utils::telemetry::OutputStream< dist_controller_t >;
 
-DECLARE_TELEMETRY_TOKEN( HPCGTelemetry )
-ACTIVATE_TOKEN( HPCGTelemetry )
-using hpcg_token_t = TELEMETRY_TOKEN_TYPE( HPCGTelemetry );
+ENABLE_TELEMETRY_CONTROLLER( hpcg_controller_t )
+DEFINE_TELEMETRY_CONTROLLER( hpcg_controller_t )
 
-DECLARE_TELEMETRY_TOKEN( MGTelemetry )
-ACTIVATE_TOKEN( MGTelemetry )
-using mg_token_t = TELEMETRY_TOKEN_TYPE( MGTelemetry );
+ENABLE_TELEMETRY_CONTROLLER( mg_controller_t )
+DEFINE_TELEMETRY_CONTROLLER( mg_controller_t )
 
-DECLARE_TELEMETRY_TOKEN( DBGToken )
-// ACTIVATE_TOKEN( DBGToken )
-using dbg_token_t = TELEMETRY_TOKEN_TYPE( DBGToken );
-using DBGStream = grb::utils::telemetry::OutputStream< dbg_token_t >;
+// ENABLE_TELEMETRY_CONTROLLER( dbg_controller_t )
+DEFINE_TELEMETRY_CONTROLLER( dbg_controller_t )
+using DBGStream = grb::utils::telemetry::OutputStream< dbg_controller_t >;
 
 using duration_t = utils::telemetry::duration_nano_t;
-using hpcg_csv_t = utils::telemetry::CSVWriter< hpcg_token_t, hpcg_token_t::enabled, size_t, duration_t >;
-using mg_csv_t = utils::telemetry::CSVWriter< mg_token_t, mg_token_t::enabled, size_t, size_t, duration_t, duration_t >;
+using hpcg_csv_t = utils::telemetry::CSVWriter< hpcg_controller_t, hpcg_controller_t::enabled,
+	size_t, duration_t >;
+using mg_csv_t = utils::telemetry::CSVWriter< mg_controller_t, mg_controller_t::enabled,
+	size_t, size_t, duration_t, duration_t >;
 
 // assembled types for simulation runners and input/output structures
-using smoother_runner_t = grb::algorithms::RedBlackGSSmootherRunner< HPCGTypes, mg_token_t, hpcg_desc >;
+using smoother_runner_t = grb::algorithms::RedBlackGSSmootherRunner< HPCGTypes,
+	mg_controller_t, hpcg_desc >;
 using smoothing_data_t = typename smoother_runner_t::SmootherDataType;
 
-using coarsener_runner_t = grb::algorithms::SingleMatrixCoarsener< HPCGTypes, mg_token_t, hpcg_desc >;
+using coarsener_runner_t = grb::algorithms::SingleMatrixCoarsener< HPCGTypes,
+	mg_controller_t, hpcg_desc >;
 using coarsening_data_t = typename coarsener_runner_t::CoarseningDataType;
 
-using mg_runner_t = MultiGridRunner< HPCGTypes, smoother_runner_t, coarsener_runner_t, mg_token_t, hpcg_desc, DBGStream >;
+using mg_runner_t = MultiGridRunner< HPCGTypes, smoother_runner_t, coarsener_runner_t,
+	mg_controller_t, hpcg_desc, DBGStream >;
 using mg_data_t = typename mg_runner_t::MultiGridInputType;
 
-using hpcg_runner_t = MultiGridCGRunner< HPCGTypes, mg_runner_t, hpcg_token_t, hpcg_desc, DBGStream >;
+using hpcg_runner_t = MultiGridCGRunner< HPCGTypes, mg_runner_t, hpcg_controller_t,
+	hpcg_desc, DBGStream >;
 using hpcg_data_t = typename hpcg_runner_t::HPCGInputType;
 
 struct dotter : grb::utils::telemetry::OutputStreamLazy {
@@ -219,32 +223,12 @@ static void print_system(
  * This routine is algorithm-agnositc, as long as the constructors of the data types meet the requirements
  * explained in \ref multigrid_allocate_data().
  */
-template< typename T > T static next_pow_2( T n ) {
-	static_assert( std::is_integral< T >::value, "Integral required." );
-	--n;
-	n |= ( n >> 1 );
-	for( unsigned i = 1; i <= sizeof( T ) * 4; i *= 2 ) {
-		const unsigned shift = static_cast< T >( 1U ) << i;
-		n |= ( n >> shift );
-	}
-	return n + 1;
-}
-
-/**
- * Allocates the data structure input to the various simulation steps (CG, multi-grid, coarsening, smoothing)
- * for each level of the multi-grid. The input is the vector of system sizes \p mg_sizes, with sizes in
- * monotonically \b decreasing order (finest system first).
- *
- * This routine is algorithm-agnositc, as long as the constructors of the data types meet the requirements
- * explained in \ref multigrid_allocate_data().
- */
-static void allocate_system_structures(
-	std::vector< std::unique_ptr< mg_data_t > > &system_levels,
-	std::vector< std::unique_ptr< coarsening_data_t > > &coarsener_levels,
-	std::vector< std::unique_ptr< smoothing_data_t > > &smoother_levels,
-	std::unique_ptr< hpcg_data_t > &cg_system_data,
-	const std::vector< size_t > &mg_sizes,
-	const mg_token_t & mg_token,
+static void allocate_system_structures( std::vector< std::unique_ptr< mg_data_t > > & system_levels,
+	std::vector< std::unique_ptr< coarsening_data_t > > & coarsener_levels,
+	std::vector< std::unique_ptr< smoothing_data_t > > & smoother_levels,
+	std::unique_ptr< hpcg_data_t > & cg_system_data,
+	const std::vector< size_t > & mg_sizes,
+	const mg_controller_t & mg_controller,
 	DistStream & logger
 ) {
 	grb::utils::Timer timer;
@@ -253,7 +237,7 @@ static void allocate_system_structures(
 	cg_system_data = std::unique_ptr< hpcg_data_t >( data );
 	logger << "allocating data for the MultiGrid simulation...";
 	timer.reset();
-	multigrid_allocate_data( system_levels, coarsener_levels, smoother_levels, mg_sizes, mg_token );
+	multigrid_allocate_data( system_levels, coarsener_levels, smoother_levels, mg_sizes, mg_controller );
 	double time = timer.time();
 	logger << " time (ms) " << time << std::endl;
 
@@ -283,7 +267,7 @@ static void build_3d_system(
 	std::vector< std::unique_ptr< smoothing_data_t > > &smoother_levels,
 	std::unique_ptr< hpcg_data_t > &cg_system_data,
 	const simulation_input & in,
-	const mg_token_t & tt,
+	const mg_controller_t & tt,
 	DistStream & logger
 ) {
 	constexpr size_t DIMS = 3;
@@ -364,13 +348,19 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	const size_t pid = spmd<>::pid();
 	grb::utils::Timer timer;
 
-	dist_token_t dist( pid == 0 );
-	class MyNumPunct : public std::numpunct<char> {
-	// protected:
-		char do_thousands_sep() const override { return '\''; }
-		std::string do_grouping() const override { return "\03"; }
+	// standard logger: active only on master node
+	dist_controller_t dist( pid == 0 );
+	// separate thousands when printing integers
+	class IntegerSeparation : public std::numpunct< char > {
+		// protected:
+		char do_thousands_sep() const override {
+			return '\'';
+		}
+		std::string do_grouping() const override {
+			return "\03";
+		}
 	};
-	std::locale old_locale = std::cout.imbue( std::locale( std::cout.getloc(), new MyNumPunct ) );
+	std::locale old_locale = std::cout.imbue( std::locale( std::cout.getloc(), new IntegerSeparation ) );
 	DistStream logger( dist, std::cout );
 
 	logger << "beginning input generation..." << std::endl;
@@ -378,13 +368,14 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	// wrap hpcg_data inside a unique_ptr to forget about cleaning chores
 	std::unique_ptr< hpcg_data_t > hpcg_state;
 
-	// log HPCG by default on master
-	hpcg_token_t hpcg_token( pid == 0 );
-	// log Mg and smoother only if the user requested it
-	mg_token_t mg_token( pid == 0 && in.mg_log );
+	// measure HPCG execution time by default on master
+	hpcg_controller_t hpcg_controller( pid == 0 );
+	// measure MG and smoother only if the user requested it
+	mg_controller_t mg_controller( pid == 0 && in.mg_log );
 
-	dbg_token_t dbg_token( pid == 0 );
-	DBGStream dbg_stream( dbg_token, std::cout );
+	// trace execution of CG and MG only on master
+	dbg_controller_t dbg_controller( pid == 0 );
+	DBGStream dbg_stream( dbg_controller, std::cout );
 
 	// define the main HPCG runner and initialize the options of its components
 	coarsener_runner_t coarsener;
@@ -392,13 +383,14 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	smoother.presmoother_steps = smoother.postsmoother_steps = in.smoother_steps;
 	smoother.non_recursive_smooth_steps = 1UL;
 	mg_runner_t mg_runner( smoother, coarsener, dbg_stream );
-	hpcg_runner_t hpcg_runner( hpcg_token, mg_runner, dbg_stream );
+	hpcg_runner_t hpcg_runner( hpcg_controller, mg_runner, dbg_stream );
 	hpcg_runner.tolerance = residual_zero;
 	hpcg_runner.with_preconditioning = ! in.no_preconditioning;
 
 	timer.reset();
 	// build the entire multi-grid system
-	build_3d_system( mg_runner.system_levels, coarsener.coarsener_levels, smoother.levels, hpcg_state, in, mg_token, logger );
+	build_3d_system( mg_runner.system_levels, coarsener.coarsener_levels, smoother.levels,
+		hpcg_state, in, mg_controller, logger );
 	double input_duration = timer.time();
 	logger << "input generation time (ms): " << input_duration << std::endl;
 
@@ -445,8 +437,9 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	out.inner_test_repetitions = 0;
 	out.times.useful = 0.0;
 
-	hpcg_csv_t hpcg_csv( hpcg_token, { "repetition", "time" } );
-	mg_csv_t mg_csv( mg_token, { "repetition", "level", "mg time", "smoother time" } );
+	// initialize CSV writers (if activated)
+	hpcg_csv_t hpcg_csv( hpcg_controller, { "repetition", "time" } );
+	mg_csv_t mg_csv( mg_controller, { "repetition", "level", "mg time", "smoother time" } );
 
 	// do benchmark
 	for( size_t i = 0; i < in.inner_test_repetitions; ++i ) {

From d10adaca43f212665856bac2e55302e3663873af Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Thu, 23 Feb 2023 17:54:16 +0100
Subject: [PATCH 22/28] polishing a few telemetry functionalities, linting
 added code, documenting telemetry

---
 .../algorithms/hpcg/average_coarsener.hpp     |  68 ++---
 .../algorithms/hpcg/greedy_coloring.hpp       |  33 ++-
 .../hpcg/single_point_coarsener.hpp           |  61 ++---
 .../algorithms/hpcg/system_builder.hpp        |  23 +-
 .../algorithms/hpcg/system_building_utils.hpp | 146 +++++------
 .../multigrid/multigrid_building_utils.hpp    |   6 +-
 .../algorithms/multigrid/multigrid_cg.hpp     |  56 ++--
 .../algorithms/multigrid/multigrid_data.hpp   |   8 +-
 .../multigrid/multigrid_v_cycle.hpp           |  73 ++----
 .../multigrid/red_black_gauss_seidel.hpp      |  66 +++--
 .../multigrid/single_matrix_coarsener.hpp     |  30 ++-
 .../utils/iterators/IteratorValueAdaptor.hpp  |   6 +-
 .../utils/multigrid/array_vector_storage.hpp  |  23 +-
 .../multigrid/dynamic_vector_storage.hpp      |  42 ++-
 .../halo_matrix_generator_iterator.hpp        |  39 +--
 .../linearized_halo_ndim_iterator.hpp         |  77 +++---
 .../multigrid/linearized_halo_ndim_system.hpp | 166 ++++++------
 .../multigrid/linearized_ndim_iterator.hpp    |  57 ++--
 .../multigrid/linearized_ndim_system.hpp      |  86 +++---
 .../graphblas/utils/multigrid/ndim_system.hpp |  28 +-
 .../graphblas/utils/multigrid/ndim_vector.hpp |  46 ++--
 .../graphblas/utils/telemetry/CSVWriter.hpp   | 248 ++++++++++++------
 .../utils/telemetry/OutputStream.hpp          | 211 +++++++++++----
 .../graphblas/utils/telemetry/Stopwatch.hpp   | 186 ++++++++++---
 .../graphblas/utils/telemetry/Telemetry.hpp   |  25 +-
 .../utils/telemetry/TelemetryBase.hpp         |  65 +++--
 .../utils/telemetry/TelemetryController.hpp   | 143 +++++-----
 .../graphblas/utils/telemetry/Timeable.hpp    |  40 ++-
 tests/smoke/hpcg.cpp                          | 190 +++++++-------
 29 files changed, 1293 insertions(+), 955 deletions(-)

diff --git a/include/graphblas/algorithms/hpcg/average_coarsener.hpp b/include/graphblas/algorithms/hpcg/average_coarsener.hpp
index 6af5e5ff7..eb3853c61 100644
--- a/include/graphblas/algorithms/hpcg/average_coarsener.hpp
+++ b/include/graphblas/algorithms/hpcg/average_coarsener.hpp
@@ -24,12 +24,12 @@
 #ifndef _H_GRB_ALGORITHMS_AVERAGE_COARSENER
 #define _H_GRB_ALGORITHMS_AVERAGE_COARSENER
 
-#include <cstddef>
 #include <array>
-#include <iterator>
-#include <stdexcept>
 #include <cmath>
+#include <cstddef>
+#include <iterator>
 #include <numeric>
+#include <stdexcept>
 
 #include <graphblas/utils/multigrid/array_vector_storage.hpp>
 #include <graphblas/utils/multigrid/linearized_ndim_system.hpp>
@@ -87,16 +87,21 @@ namespace grb {
 				) noexcept :
 					_i( i ),
 					_j( j ),
-					_value( value )
-				{}
+					_value( value ) {}
 
 				_ValueGenerator( const _ValueGenerator & ) = default;
 
 				_ValueGenerator & operator=( const _ValueGenerator & ) = default;
 
-				inline RowIndexType i() const { return _i; }
-				inline ColumnIndexType j() const { return _j; }
-				inline ValueType v() const { return _value; }
+				inline RowIndexType i() const {
+					return _i;
+				}
+				inline ColumnIndexType j() const {
+					return _j;
+				}
+				inline ValueType v() const {
+					return _value;
+				}
 
 			private:
 				RowIndexType _i;
@@ -108,12 +113,12 @@ namespace grb {
 			using iterator_category = std::random_access_iterator_tag;
 			using value_type = _ValueGenerator;
 			using pointer = const value_type;
-			using reference = const value_type&;
+			using reference = const value_type &;
 			using difference_type = typename LinearSystemIterType::difference_type;
 
-			AverageGeneratorIterator( const SelfType &o ) = default;
+			AverageGeneratorIterator( const SelfType & o ) = default;
 
-			AverageGeneratorIterator( SelfType &&o ) = default;
+			AverageGeneratorIterator( SelfType && o ) = default;
 
 			SelfType & operator=( const SelfType & ) = default;
 
@@ -123,11 +128,11 @@ namespace grb {
 			 * Advances \c this by 1 in constant time.
 			 */
 			SelfType & operator++() noexcept {
-				(void) ++_subspace_iter;
+				(void)++_subspace_iter;
 				size_t subspace_position = _subspace_iter->get_linear_position();
 				// std::cout << "subspace_position " << subspace_position << std::endl;
 				if( subspace_position == _num_neighbors ) {
-					(void) ++_sys_iter;
+					(void)++_sys_iter;
 					_subspace_iter = _finer_subspace->begin();
 				}
 				update_coords();
@@ -150,21 +155,21 @@ namespace grb {
 			/**
 			 * Computes the difference between \c this and \p o as integer.
 			 */
-			difference_type operator-( const SelfType &o ) const {
+			difference_type operator-( const SelfType & o ) const {
 				return this->_sys_iter - o._sys_iter;
 			}
 
 			/**
 			 * Returns whether \c this and \p o differ.
 			 */
-			bool operator!=( const SelfType &o ) const {
+			bool operator!=( const SelfType & o ) const {
 				return this->_sys_iter != o._sys_iter;
 			}
 
 			/**
 			 * Returns whether \c this and \p o are equal.
 			 */
-			bool operator==( const SelfType &o ) const {
+			bool operator==( const SelfType & o ) const {
 				return ! this->operator!=( o );
 			}
 
@@ -198,9 +203,9 @@ namespace grb {
 			}
 
 		private:
-			const LinearSystemType *_lin_sys;
-			const LinearSystemType *_finer_subspace;
-			const ArrayType *_steps;
+			const LinearSystemType * _lin_sys;
+			const LinearSystemType * _finer_subspace;
+			const ArrayType * _steps;
 			CoordType _num_neighbors;
 			LinearSystemIterType _sys_iter;
 			LinearSystemIterType _subspace_iter;
@@ -217,9 +222,9 @@ namespace grb {
 			 * @param steps ratios per dimension between finer and coarser system
 			 */
 			AverageGeneratorIterator(
-				const LinearSystemType &system,
-				const LinearSystemType &finer_subspace,
-				const ArrayType &steps
+				const LinearSystemType & system,
+				const LinearSystemType & finer_subspace,
+				const ArrayType & steps
 			) noexcept :
 				_lin_sys( &system ),
 				_finer_subspace( &finer_subspace ),
@@ -246,7 +251,7 @@ namespace grb {
 				ColumnIndexType s = 1;
 				for( size_t i = 0; i < DIMS; i++ ) {
 					finer += s * _subspace_iter->get_position()[ i ];
-					s *= (*_steps)[ i ];
+					s *= ( *_steps )[ i ];
 					finer += s * _sys_iter->get_position()[ i ];
 					s *= _lin_sys->get_sizes()[ i ];
 				}
@@ -280,8 +285,8 @@ namespace grb {
 			 * otherwise an exception is raised.
 			 */
 			AverageCoarsenerBuilder(
-				const ArrayType &_finer_sizes,
-				const ArrayType &_coarser_sizes
+				const ArrayType & _finer_sizes,
+				const ArrayType & _coarser_sizes
 			) :
 				system( _coarser_sizes.begin(), _coarser_sizes.end() ),
 				_finer_subspace( _coarser_sizes.cbegin(), _coarser_sizes.cend() ),
@@ -291,10 +296,8 @@ namespace grb {
 					// finer size MUST be an exact multiple of coarser_size
 					std::ldiv_t ratio = std::ldiv( _finer_sizes[ i ], _coarser_sizes[ i ] );
 					if( ratio.quot < 2 || ratio.rem != 0 ) {
-						throw std::invalid_argument(
-							std::string( "finer size of dimension " ) + std::to_string( i ) +
-							std::string( "is not an exact multiple of coarser size" )
-						);
+						throw std::invalid_argument( std::string( "finer size of dimension " )
+							+ std::to_string( i ) + std::string( "is not an exact multiple of coarser size" ) );
 					}
 					steps[ i ] = ratio.quot;
 				}
@@ -338,10 +341,9 @@ namespace grb {
 				grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > > system;
 			grb::utils::multigrid::LinearizedNDimSystem< CoordType,
 				grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > > _finer_subspace;
-
-			grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > steps; ///< array of steps, i.e. how much each column coordinate (finer system) must be
-			//// incremented when incrementing the row coordinates; is is the ration between
-			//// #finer_sizes and row_generator#physical_sizes
+			grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > steps;///< array of steps, i.e. how much each column coordinate (finer system) must be
+																				//// incremented when incrementing the row coordinates; it is the ratio between
+			                                                                    //// #finer_sizes and row_generator#physical_sizes
 		};
 
 	} // namespace algorithms
diff --git a/include/graphblas/algorithms/hpcg/greedy_coloring.hpp b/include/graphblas/algorithms/hpcg/greedy_coloring.hpp
index 5519a6504..bb4759d6f 100644
--- a/include/graphblas/algorithms/hpcg/greedy_coloring.hpp
+++ b/include/graphblas/algorithms/hpcg/greedy_coloring.hpp
@@ -24,8 +24,8 @@
 #ifndef _H_GRB_ALGORITHMS_HPCG_GREEDY_COLORING
 #define _H_GRB_ALGORITHMS_HPCG_GREEDY_COLORING
 
-#include <vector>
 #include <cstddef>
+#include <vector>
 
 #include <graphblas/utils/multigrid/linearized_halo_ndim_system.hpp>
 
@@ -75,16 +75,15 @@ namespace grb {
 			typename CoordType,
 			bool lowest_color_first = true
 		> void hpcg_greedy_color_ndim_system(
-			const grb::utils::multigrid::LinearizedHaloNDimSystem< DIMS, CoordType > &system,
-			std::vector< CoordType > &row_colors,
-			std::vector< CoordType > &color_counters,
+			const grb::utils::multigrid::LinearizedHaloNDimSystem< DIMS, CoordType > & system,
+			std::vector< CoordType > & row_colors,
+			std::vector< CoordType > & color_counters,
 			bool reorder_rows_per_color = false
 		) {
-
 			CoordType nrows = system.system_size();
 			row_colors.insert( row_colors.begin(), nrows, nrows ); // value `nrows' means `uninitialized'; initialized colors go from 0 to nrow-1
 			CoordType totalColors = 1;
-			row_colors[0] = 0; // first point gets color 0
+			row_colors[ 0 ] = 0; // first point gets color 0
 
 			// Finds colors in a greedy (a likely non-optimal) fashion.
 			typename grb::utils::multigrid::LinearizedHaloNDimSystem< DIMS, CoordType >::Iterator begin = system.begin();
@@ -106,9 +105,9 @@ namespace grb {
 					if( curCol < curRow ) {
 						assert( row_colors[ curCol ] < nrows ); // if curCol < curRow, curCol has already a color assigned
 						std::vector< bool >::reference color_is_assigned = assigned[ row_colors[ curCol ] ];
-						if( !color_is_assigned ) {
+						if( ! color_is_assigned ) {
 							// count how many colors are already assigned
-							(void) currentlyAssigned++;
+							(void)currentlyAssigned++;
 						}
 						// track which colors are assigned
 						color_is_assigned = true;
@@ -122,7 +121,7 @@ namespace grb {
 					if( lowest_color_first ) {
 						// here, assign colors greedily starting from the lowest available one
 						for( CoordType j = 0; j < totalColors; ++j ) {
-							if( !assigned[ j ] ) {
+							if( ! assigned[ j ] ) {
 								// if no neighbor with this color, use it for this row
 								row_colors[ curRow ] = j;
 								break;
@@ -132,7 +131,7 @@ namespace grb {
 						// here, assign colors greedily starting from the highest available one
 						for( CoordType j = totalColors; j > 0; --j ) {
 							CoordType color = j - 1;
-							if( !assigned[ color ] ) {
+							if( ! assigned[ color ] ) {
 								// if no neighbor with this color, use it for this row
 								row_colors[ curRow ] = color;
 								break;
@@ -143,7 +142,7 @@ namespace grb {
 					assert( row_colors[ curRow ] == nrows );
 					if( row_colors[ curRow ] == nrows ) {
 						row_colors[ curRow ] = totalColors;
-						(void) totalColors++;
+						(void)totalColors++;
 					} else {
 						assert( 0 ); // should never get here
 					}
@@ -153,7 +152,7 @@ namespace grb {
 
 #ifdef _DEBUG
 			std::cout << "assigned colors: " << totalColors << " [ <row> -> <color>]\n";
-			for( size_t i = 0; i < row_colors.size(); i++ ){
+			for( size_t i = 0; i < row_colors.size(); i++ ) {
 				std::cout << i << " -> " << row_colors[ i ] << ", ";
 			}
 			std::cout << std::endl;
@@ -162,21 +161,21 @@ namespace grb {
 			// count number of vertices per color
 			color_counters.insert( color_counters.begin(), totalColors, 0 );
 			for( CoordType i = 0; i < nrows; ++i ) {
-				(void) color_counters[ row_colors[ i ] ]++;
+				(void)color_counters[ row_colors[ i ] ]++;
 			}
 
-			if( !reorder_rows_per_color ) {
+			if( ! reorder_rows_per_color ) {
 				return;
 			}
 
 			// form in-place prefix scan
 			CoordType old = 0, old0;
 			for( CoordType i = 1; i < totalColors; ++i ) {
-				old0 = color_counters[i];
-				color_counters[i] = color_counters[i-1] + old;
+				old0 = color_counters[ i ];
+				color_counters[ i ] = color_counters[ i - 1 ] + old;
 				old = old0;
 			}
-			color_counters[0] = 0;
+			color_counters[ 0 ] = 0;
 
 			// translate `colors' into a permutation
 			for( CoordType i = 0; i < nrows; ++i ) {
diff --git a/include/graphblas/algorithms/hpcg/single_point_coarsener.hpp b/include/graphblas/algorithms/hpcg/single_point_coarsener.hpp
index a3826c9c0..92ef47263 100644
--- a/include/graphblas/algorithms/hpcg/single_point_coarsener.hpp
+++ b/include/graphblas/algorithms/hpcg/single_point_coarsener.hpp
@@ -24,11 +24,11 @@
 #ifndef _H_GRB_ALGORITHMS_HPCG_SINGLE_POINT_COARSENER
 #define _H_GRB_ALGORITHMS_HPCG_SINGLE_POINT_COARSENER
 
-#include <cstddef>
 #include <array>
+#include <cmath>
+#include <cstddef>
 #include <iterator>
 #include <stdexcept>
-#include <cmath>
 
 #include <graphblas/utils/multigrid/array_vector_storage.hpp>
 #include <graphblas/utils/multigrid/linearized_ndim_system.hpp>
@@ -70,8 +70,7 @@ namespace grb {
 
 			using RowIndexType = CoordType; ///< numeric type of rows
 			using ColumnIndexType = CoordType;
-			using LinearSystemType = grb::utils::multigrid::LinearizedNDimSystem< CoordType,
-				grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > >;
+			using LinearSystemType = grb::utils::multigrid::LinearizedNDimSystem< CoordType, grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > >;
 			using LinearSystemIterType = typename LinearSystemType::Iterator;
 			using SelfType = SinglePointCoarsenerIterator< DIMS, CoordType, ValueType >;
 			using ArrayType = std::array< CoordType, DIMS >;
@@ -85,15 +84,18 @@ namespace grb {
 					ColumnIndexType j
 				) noexcept :
 					_i( i ),
-					_j( j )
-				{}
+					_j( j ) {}
 
 				_HPCGValueGenerator( const _HPCGValueGenerator & ) = default;
 
 				_HPCGValueGenerator & operator=( const _HPCGValueGenerator & ) = default;
 
-				inline RowIndexType i() const { return _i; }
-				inline ColumnIndexType j() const { return _j; }
+				inline RowIndexType i() const {
+					return _i;
+				}
+				inline ColumnIndexType j() const {
+					return _j;
+				}
 				inline ValueType v() const {
 					return static_cast< ValueType >( 1 );
 				}
@@ -107,12 +109,12 @@ namespace grb {
 			using iterator_category = std::random_access_iterator_tag;
 			using value_type = _HPCGValueGenerator;
 			using pointer = const value_type;
-			using reference = const value_type&;
+			using reference = const value_type &;
 			using difference_type = typename LinearSystemIterType::difference_type;
 
-			SinglePointCoarsenerIterator( const SelfType &o ) = default;
+			SinglePointCoarsenerIterator( const SelfType & o ) = default;
 
-			SinglePointCoarsenerIterator( SelfType &&o ) = default;
+			SinglePointCoarsenerIterator( SelfType && o ) = default;
 
 			SelfType & operator=( const SelfType & ) = default;
 
@@ -122,7 +124,7 @@ namespace grb {
 			 * Advances \c this by 1 in constant time.
 			 */
 			SelfType & operator++() noexcept {
-				(void) ++_sys_iter;
+				(void)++_sys_iter;
 				update_coords();
 				return *this;
 			}
@@ -139,21 +141,21 @@ namespace grb {
 			/**
 			 * Computes the difference between \c this and \p o as integer.
 			 */
-			difference_type operator-( const SelfType &o ) const {
+			difference_type operator-( const SelfType & o ) const {
 				return this->_sys_iter - o._sys_iter;
 			}
 
 			/**
 			 * Returns whether \c this and \p o differ.
 			 */
-			bool operator!=( const SelfType &o ) const {
+			bool operator!=( const SelfType & o ) const {
 				return this->_sys_iter != o._sys_iter;
 			}
 
 			/**
 			 * Returns whether \c this and \p o are equal.
 			 */
-			bool operator==( const SelfType &o ) const {
+			bool operator==( const SelfType & o ) const {
 				return ! this->operator!=( o );
 			}
 
@@ -187,8 +189,8 @@ namespace grb {
 			}
 
 		private:
-			const LinearSystemType *_lin_sys;
-			const ArrayType *_steps;
+			const LinearSystemType * _lin_sys;
+			const ArrayType * _steps;
 			LinearSystemIterType _sys_iter;
 			value_type _val;
 
@@ -201,8 +203,8 @@ namespace grb {
 			 * @param steps ratios per dimension between finer and coarser system
 			 */
 			SinglePointCoarsenerIterator(
-				const LinearSystemType &system,
-				const ArrayType &steps
+				const LinearSystemType & system,
+				const ArrayType & steps
 			) noexcept :
 				_lin_sys( &system ),
 				_steps( &steps ),
@@ -225,7 +227,7 @@ namespace grb {
 				ColumnIndexType finer = 0;
 				ColumnIndexType s = 1;
 				for( size_t i = 0; i < DIMS; i++ ) {
-					s *= (*_steps)[ i ];
+					s *= ( *_steps )[ i ];
 					finer += s * _sys_iter->get_position()[ i ];
 					s *= _lin_sys->get_sizes()[ i ];
 				}
@@ -259,17 +261,17 @@ namespace grb {
 			 * otherwise an exception is raised.
 			 */
 			SinglePointCoarsenerBuilder(
-				const ArrayType &_finer_sizes,
-				const ArrayType &_coarser_sizes
-			) : system( _coarser_sizes.begin(), _coarser_sizes.end() ) {
+				const ArrayType & _finer_sizes,
+				const ArrayType & _coarser_sizes
+			) :
+				system( _coarser_sizes.begin(),
+				_coarser_sizes.end() )
+			{
 				for( size_t i = 0; i < DIMS; i++ ) {
 					// finer size MUST be an exact multiple of coarser_size
 					std::ldiv_t ratio = std::ldiv( _finer_sizes[ i ], _coarser_sizes[ i ] );
 					if( ratio.quot < 2 || ratio.rem != 0 ) {
-						throw std::invalid_argument(
-							std::string( "finer size of dimension " ) + std::to_string( i ) +
-							std::string( "is not an exact multiple of coarser size" )
-						);
+						throw std::invalid_argument( std::string( "finer size of dimension " ) + std::to_string( i ) + std::string( "is not an exact multiple of coarser size" ) );
 					}
 					steps[ i ] = ratio.quot;
 				}
@@ -311,11 +313,10 @@ namespace grb {
 				grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > > system;
 
 			ArrayType steps; ///< array of steps, i.e. how much each column coordinate (finer system) must be
-			//// incremented when incrementing the row coordinates; is is the ration between
-			//// #finer_sizes and row_generator#physical_sizes
+							 //// incremented when incrementing the row coordinates; it is the ratio between
+			                 //// #finer_sizes and row_generator#physical_sizes
 		};
 
 	} // namespace algorithms
 } // namespace grb
 #endif // _H_GRB_ALGORITHMS_HPCG_SINGLE_POINT_COARSENER
-
diff --git a/include/graphblas/algorithms/hpcg/system_builder.hpp b/include/graphblas/algorithms/hpcg/system_builder.hpp
index 94d1565f2..84600414c 100644
--- a/include/graphblas/algorithms/hpcg/system_builder.hpp
+++ b/include/graphblas/algorithms/hpcg/system_builder.hpp
@@ -34,14 +34,13 @@
 #include <array>
 #include <cstddef>
 #include <initializer_list>
+#include <iterator>
 #include <numeric>
 #include <stdexcept>
 #include <string>
 #include <type_traits>
 #include <utility>
 #include <vector>
-#include <cstddef>
-#include <iterator>
 
 #include <graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp>
 
@@ -73,8 +72,11 @@ namespace grb {
 
 				HPCGDiagGenerator & operator=( const HPCGDiagGenerator & ) = default;
 
-				inline ValueType operator()( const CoordType &i, const CoordType &j ) const noexcept {
-					return j == i ? _diag: _non_diag;
+				inline ValueType operator()(
+					const CoordType & i,
+					const CoordType & j
+				) const noexcept {
+					return j == i ? _diag : _non_diag;
 				}
 
 				ValueType _diag;
@@ -82,8 +84,8 @@ namespace grb {
 			};
 
 			using HaloSystemType = grb::utils::multigrid::LinearizedHaloNDimSystem< DIMS, CoordType >;
-			using Iterator = grb::utils::multigrid::HaloMatrixGeneratorIterator< DIMS, CoordType,
-				ValueType, HPCGDiagGenerator >;
+			using Iterator = grb::utils::multigrid::HaloMatrixGeneratorIterator< DIMS,
+				CoordType, ValueType, HPCGDiagGenerator >;
 
 			/**
 			 * Construct a new HPCGSystemBuilder object from the data of the physical system.
@@ -94,7 +96,7 @@ namespace grb {
 			 * @param non_diag value outside the diagonal, for element-element interaction
 			 */
 			HPCGSystemBuilder(
-				const std::array< CoordType, DIMS > &sizes,
+				const std::array< CoordType, DIMS > & sizes,
 				CoordType halo,
 				ValueType diag,
 				ValueType non_diag
@@ -116,9 +118,11 @@ namespace grb {
 
 			HPCGSystemBuilder( HPCGSystemBuilder< DIMS, CoordType, ValueType > && ) = default;
 
-			HPCGSystemBuilder< DIMS, CoordType, ValueType > & operator=( const HPCGSystemBuilder< DIMS, CoordType, ValueType > & ) = default;
+			HPCGSystemBuilder< DIMS, CoordType, ValueType > & operator=(
+				const HPCGSystemBuilder< DIMS, CoordType, ValueType > & ) = default;
 
-			HPCGSystemBuilder< DIMS, CoordType, ValueType > & operator=( HPCGSystemBuilder< DIMS, CoordType, ValueType > && ) = default;
+			HPCGSystemBuilder< DIMS, CoordType, ValueType > & operator=(
+				HPCGSystemBuilder< DIMS, CoordType, ValueType > && ) = default;
 
 			/**
 			 * Number of elements of the mesh.
@@ -175,4 +179,3 @@ namespace grb {
 } // namespace grb
 
 #endif // _H_GRB_ALGORITHMS_HPCG_SYSTEM_BUILDER
-
diff --git a/include/graphblas/algorithms/hpcg/system_building_utils.hpp b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
index b86564def..9f3fdf583 100644
--- a/include/graphblas/algorithms/hpcg/system_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
@@ -24,23 +24,25 @@
 #ifndef _H_GRB_ALGORITHMS_HPCG_SYSTEM_BUILDING_UTILS
 #define _H_GRB_ALGORITHMS_HPCG_SYSTEM_BUILDING_UTILS
 
+#include <algorithm>
 #include <array>
 #include <cassert>
+#include <cmath>
 #include <cstddef>
-#include <memory>
-#include <algorithm>
 #include <cstdlib>
+#include <memory>
 #include <stdexcept>
-#include <cmath>
 #include <string>
 
 #include <graphblas.hpp>
+#include <graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp>
+#include <graphblas/algorithms/multigrid/single_matrix_coarsener.hpp>
 #include <graphblas/utils/iterators/partition_range.hpp>
 
-#include "system_builder.hpp"
-#include "single_point_coarsener.hpp"
 #include "average_coarsener.hpp"
 #include "greedy_coloring.hpp"
+#include "single_point_coarsener.hpp"
+#include "system_builder.hpp"
 
 namespace grb {
 	namespace algorithms {
@@ -82,8 +84,8 @@ namespace grb {
 			typename CoordType,
 			typename NonzeroType
 		> void hpcg_build_multigrid_generators(
-			const HPCGSystemParams< DIMS, NonzeroType > &params,
-			std::vector< grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > > &mg_generators
+			const HPCGSystemParams< DIMS, NonzeroType > & params,
+			std::vector< grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > > & mg_generators
 		) {
 			static_assert( DIMS > 0, "DIMS must be > 0" );
 
@@ -104,28 +106,23 @@ namespace grb {
 				coord_sizes.begin() );
 
 			// generate hierarchical coarseners
-			for( size_t coarsening_level = 0UL;
-				min_physical_size >= params.min_phys_size && coarsening_level <= params.max_levels;
-				coarsening_level++ ) {
+			for( size_t coarsening_level = 0UL; min_physical_size >= params.min_phys_size
+				&& coarsening_level <= params.max_levels; coarsening_level++ ) {
 
 				// build generator
-				mg_generators.emplace_back( coord_sizes, params.halo_size,
-					params.diag_value, params.non_diag_value );
+				mg_generators.emplace_back( coord_sizes, params.halo_size, params.diag_value,
+					params.non_diag_value );
 
 				// prepare for new iteration
 				min_physical_size /= params.coarsening_step;
-				std::for_each( coord_sizes.begin(), coord_sizes.end(),
-					[ &params ]( CoordType &v ) {
-						std::ldiv_t ratio = std::ldiv( v, params.coarsening_step );
-						if( ratio.rem != 0 ) {
-							throw std::invalid_argument(
-								std::string( "system size " ) + std::to_string( v ) +
-								std::string( " is not divisible by " ) +
-								std::to_string( params.coarsening_step )
-							);
-						}
-						v = ratio.quot;
-					});
+				std::for_each( coord_sizes.begin(), coord_sizes.end(), [ &params ]( CoordType & v ) {
+					std::ldiv_t ratio = std::ldiv( v, params.coarsening_step );
+					if( ratio.rem != 0 ) {
+						throw std::invalid_argument( std::string( "system size " ) + std::to_string( v )
+							+ std::string( " is not divisible by " ) + std::to_string( params.coarsening_step ) );
+					}
+					v = ratio.quot;
+				} );
 			}
 		}
 
@@ -138,23 +135,20 @@ namespace grb {
 		 * This function takes care of the parallelism by employing random-access iterators and by
 		 * \b parallelizing the generation across multiple processes in case of distributed execution.
 		 */
-		template <
+		template<
 			size_t DIMS,
 			typename CoordType,
 			typename NonzeroType,
 			typename Logger
 		> grb::RC hpcg_populate_system_matrix(
-			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &system_generator,
-			grb::Matrix< NonzeroType > &M,
-			Logger & logger
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > & system_generator,
+			grb::Matrix< NonzeroType > & M, Logger & logger
 		) {
-
 			logger << "- generating system matrix...";
-			typename grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType >::Iterator begin(
-				system_generator.make_begin_iterator() );
-			typename grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType >::Iterator end(
-				system_generator.make_end_iterator()
-			);
+			typename grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType >::Iterator
+				begin( system_generator.make_begin_iterator() );
+			typename grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType >::Iterator
+				end( system_generator.make_end_iterator() );
 			grb::utils::partition_iteration_range_on_procs( spmd<>::nprocs(), spmd<>::pid(),
 				system_generator.num_neighbors(), begin, end );
 			return buildMatrixUnique( M, begin, end, grb::IOMode::PARALLEL );
@@ -184,19 +178,19 @@ namespace grb {
 			typename IOType,
 			typename NonzeroType
 		> grb::RC hpcg_populate_coarsener_any_builder(
-			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &finer_system_generator,
-			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &coarser_system_generator,
-			CoarseningData< IOType, NonzeroType > &coarsener
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > & finer_system_generator,
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > & coarser_system_generator,
+			CoarseningData< IOType, NonzeroType > & coarsener
 		) {
 			static_assert( DIMS > 0, "DIMS must be > 0" );
 
-			const std::array< CoordType, DIMS > &finer_sizes = finer_system_generator.get_generator().get_sizes();
-			const std::array< CoordType, DIMS > &coarser_sizes = coarser_system_generator.get_generator().get_sizes();
+			const std::array< CoordType, DIMS > & finer_sizes = finer_system_generator.get_generator().get_sizes();
+			const std::array< CoordType, DIMS > & coarser_sizes = coarser_system_generator.get_generator().get_sizes();
 			const size_t finer_size = finer_system_generator.system_size();
 			const size_t coarser_size = coarser_system_generator.system_size();
 
 			if( coarser_size >= finer_size ) {
-				throw std::invalid_argument( "wrong sizes");
+				throw std::invalid_argument( "wrong sizes" );
 			}
 
 			size_t const rows = coarser_size;
@@ -204,17 +198,15 @@ namespace grb {
 
 			assert( finer_sizes.size() == coarser_sizes.size() );
 
-			grb::Matrix< NonzeroType > &M = coarsener.coarsening_matrix;
+			grb::Matrix< NonzeroType > & M = coarsener.coarsening_matrix;
 			if( grb::nrows( M ) != rows || grb::ncols( M ) != cols ) {
 				throw std::invalid_argument( "wrong matrix dimensions: matrix should be rectangular"
-											" with rows == <coarser size> and cols == <finer size>" );
+											 " with rows == <coarser size> and cols == <finer size>" );
 			}
 
 			IterBuilderType coarsener_builder( finer_sizes, coarser_sizes );
-			typename IterBuilderType::Iterator begin( coarsener_builder.make_begin_iterator() ),
-				end( coarsener_builder.make_end_iterator() );
-			grb::utils::partition_iteration_range_on_procs( spmd<>::nprocs(), spmd<>::pid(),
-				coarsener_builder.system_size(), begin, end );
+			typename IterBuilderType::Iterator begin( coarsener_builder.make_begin_iterator() ), end( coarsener_builder.make_end_iterator() );
+			grb::utils::partition_iteration_range_on_procs( spmd<>::nprocs(), spmd<>::pid(), coarsener_builder.system_size(), begin, end );
 			return buildMatrixUnique( M, begin, end, grb::IOMode::PARALLEL );
 		}
 
@@ -227,13 +219,13 @@ namespace grb {
 			typename IOType,
 			typename NonzeroType
 		> grb::RC hpcg_populate_coarsener(
-			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &finer_system_generator,
-			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &coarser_system_generator,
-			CoarseningData< IOType, NonzeroType > &coarsener
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > & finer_system_generator,
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > & coarser_system_generator,
+			CoarseningData< IOType, NonzeroType > & coarsener
 		) {
 			return hpcg_populate_coarsener_any_builder<
-				grb::algorithms::SinglePointCoarsenerBuilder< DIMS, CoordType, NonzeroType > >
-				( finer_system_generator, coarser_system_generator, coarsener );
+				grb::algorithms::SinglePointCoarsenerBuilder< DIMS, CoordType, NonzeroType > >(
+					finer_system_generator, coarser_system_generator, coarsener );
 		}
 
 		/**
@@ -245,13 +237,13 @@ namespace grb {
 			typename IOType,
 			typename NonzeroType
 		> grb::RC hpcg_populate_coarsener_avg(
-			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &finer_system_generator,
-			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &coarser_system_generator,
-			CoarseningData< IOType, NonzeroType > &coarsener
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > & finer_system_generator,
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > & coarser_system_generator,
+			CoarseningData< IOType, NonzeroType > & coarsener
 		) {
 			return hpcg_populate_coarsener_any_builder<
-				grb::algorithms::AverageCoarsenerBuilder< DIMS, CoordType, NonzeroType > >
-				( finer_system_generator, coarser_system_generator, coarsener );
+			grb::algorithms::AverageCoarsenerBuilder< DIMS, CoordType, NonzeroType > >(
+				finer_system_generator, coarser_system_generator, coarsener );
 		}
 
 		namespace internal {
@@ -264,10 +256,10 @@ namespace grb {
 			 * @param[out] per_color_rows for each position \a i it stores an std::vector with all rows
 			 *  of color \a i inside \p row_colors
 			 */
-			template< typename CoordType > void hpcg_split_rows_by_color(
+			template< typename CoordType >
+			void hpcg_split_rows_by_color(
 				const std::vector< CoordType > & row_colors,
-				size_t num_colors,
-				std::vector< std::vector< CoordType > > & per_color_rows
+				size_t num_colors, std::vector< std::vector< CoordType > > & per_color_rows
 			) {
 				per_color_rows.resize( num_colors );
 				for( CoordType i = 0; i < row_colors.size(); i++ ) {
@@ -284,7 +276,8 @@ namespace grb {
 			 *
 			 * @tparam CoordType type of the internal coordinate
 			 */
-			template< typename CoordType > struct true_iter {
+			template< typename CoordType >
+			struct true_iter {
 
 				// static const bool __TRUE;
 
@@ -292,12 +285,12 @@ namespace grb {
 				using iterator_category = std::random_access_iterator_tag;
 				using value_type = bool;
 				using pointer = const bool *;
-				using reference = const bool&;
+				using reference = const bool &;
 				using difference_type = long;
 
 				true_iter() = delete;
 
-				true_iter( CoordType first ): index( first ) {}
+				true_iter( CoordType first ) : index( first ) {}
 
 				true_iter( const self_t & ) = default;
 
@@ -308,7 +301,7 @@ namespace grb {
 				}
 
 				self_t & operator++() noexcept {
-					(void) index++;
+					(void)index++;
 					return *this;
 				}
 
@@ -326,7 +319,7 @@ namespace grb {
 				}
 
 				reference operator*() const {
-					return *(this->operator->());
+					return *( this->operator->() );
 				}
 
 			private:
@@ -353,8 +346,8 @@ namespace grb {
 			 */
 			grb::RC hpcg_build_static_color_masks(
 				size_t matrix_size,
-				const std::vector< std::vector< size_t > > &per_color_rows,
-				std::vector< grb::Vector< bool> > &masks
+				const std::vector< std::vector< size_t > > & per_color_rows,
+				std::vector< grb::Vector< bool > > & masks
 			) {
 				if( ! masks.empty() ) {
 					throw std::invalid_argument( "vector of masks is expected to be empty" );
@@ -375,11 +368,9 @@ namespace grb {
 					std::vector< size_t >::const_iterator begin = rows.cbegin();
 					std::vector< size_t >::const_iterator end = rows.cend();
 					// partition_iteration_range( rows.size(), begin, end );
-					grb::RC rc = grb::buildVectorUnique( output_mask, begin , end, true_iter< size_t >( 0 ),
-						true_iter< size_t >( rows.size() ), IOMode::SEQUENTIAL );
+					grb::RC rc = grb::buildVectorUnique( output_mask, begin, end, true_iter< size_t >( 0 ), true_iter< size_t >( rows.size() ), IOMode::SEQUENTIAL );
 					if( rc != SUCCESS ) {
-						std::cerr << "error while creating output mask for color " << i << ": "
-							<< toString( rc ) << std::endl;
+						std::cerr << "error while creating output mask for color " << i << ": " << toString( rc ) << std::endl;
 						return rc;
 					}
 #ifdef _DEBUG
@@ -389,7 +380,8 @@ namespace grb {
 						for( const auto & v : output_mask ) {
 							std::cout << v.first << " ";
 							count++;
-							if( count > 20 ) break;
+							if( count > 20 )
+								break;
 						}
 						std::cout << std::endl;
 					}
@@ -423,9 +415,8 @@ namespace grb {
 			typename NonzeroType,
 			typename Logger
 		> grb::RC hpcg_populate_smoothing_data(
-			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > &system_generator,
-			SmootherData< NonzeroType > &smoothing_info,
-			Logger & logger
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > & system_generator,
+			SmootherData< NonzeroType > & smoothing_info, Logger & logger
 		) {
 			grb::RC rc = set( smoothing_info.A_diagonal, system_generator.get_diag_value() );
 			if( rc != grb::SUCCESS ) {
@@ -444,10 +435,9 @@ namespace grb {
 				logger << "error: " << __LINE__ << std::endl;
 				return rc;
 			}
-			logger <<"- found " << color_counters.size() << " colors,"
-				<< " generating color masks...";
-			return internal::hpcg_build_static_color_masks( system_generator.system_size(),
-				per_color_rows, smoothing_info.color_masks );
+			logger << "- found " << color_counters.size() << " colors,"
+				   << " generating color masks...";
+			return internal::hpcg_build_static_color_masks( system_generator.system_size(), per_color_rows, smoothing_info.color_masks );
 		}
 
 	} // namespace algorithms
diff --git a/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp b/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp
index f46b8e558..9c95b50cc 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp
@@ -24,9 +24,9 @@
 #ifndef _H_GRB_ALGORITHMS_MULTIGRID_BUILDING_UTILS
 #define _H_GRB_ALGORITHMS_MULTIGRID_BUILDING_UTILS
 
-#include <vector>
-#include <memory>
 #include <cstddef>
+#include <memory>
+#include <vector>
 
 namespace grb {
 	namespace algorithms {
@@ -81,7 +81,7 @@ namespace grb {
 				throw std::invalid_argument( "at least one size should be available" );
 			}
 			size_t finer_size = mg_sizes[ 0 ];
-			system_levels.emplace_back( new MGInfoType( tt, 0, finer_size ) ); // create main system
+			system_levels.emplace_back( new MGInfoType( tt, 0, finer_size ) );  // create main system
 			smoother_levels.emplace_back( new SmootherInfoType( finer_size ) ); // create smoother for main
 			for( size_t i = 1; i < mg_sizes.size(); i++ ) {
 				size_t coarser_size = mg_sizes[ i ];
diff --git a/include/graphblas/algorithms/multigrid/multigrid_cg.hpp b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
index 3099e7d4e..f465ba8da 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
@@ -36,12 +36,11 @@
 #include <utility>
 
 #include <graphblas.hpp>
-#include <graphblas/utils/telemetry/Timeable.hpp>
 #include <graphblas/utils/telemetry/OutputStream.hpp>
+#include <graphblas/utils/telemetry/Timeable.hpp>
 
 #include "multigrid_data.hpp"
 
-
 namespace grb {
 	namespace algorithms {
 
@@ -87,8 +86,8 @@ namespace grb {
 		/**
 		 * Structure for the output information of a CG run.
 		 */
-		template < typename ResidualType > struct CGOutInfo {
-			size_t iterations; ///< number of iterations performed
+		template< typename ResidualType > struct CGOutInfo {
+			size_t iterations;          ///< number of iterations performed
 			ResidualType norm_residual; ///< norm of the final residual
 		};
 
@@ -134,15 +133,15 @@ namespace grb {
 			static_assert( std::is_move_constructible< MultiGridRunnerType >::value,
 				"cannot construct the Multi-Grid runner by move" );
 
-			Ring ring; ///< algebraic ring to be used
-			Minus minus; ///< minus operator to be used
-			bool with_preconditioning = true; ///<  whether preconditioning is enabled
-			size_t max_iterations = 10; ///< max number of allowed iterations for CG: after that, the solver is halted
-									///< and the result achieved so far returned
-			ResidualType tolerance = ring. template getZero< ResidualType >(); ///< ratio between initial residual and current residual that halts the solver
-										///< if reached, for the solution is to be considered "good enough"
+			Ring ring;                                                        ///< algebraic ring to be used
+			Minus minus;                                                      ///< minus operator to be used
+			bool with_preconditioning = true;                                 ///<  whether preconditioning is enabled
+			size_t max_iterations = 10;                                       ///< maximum number of CG iterations: once it is reached, the solver is halted
+			                                                                  ///< and the result achieved so far is returned
+			ResidualType tolerance = ring.template getZero< ResidualType >(); ///< ratio of the current residual norm to the initial one: the solver halts
+			                                                                  ///< once the ratio is no longer above it, the solution being "good enough"
 
-			MultiGridRunnerType &mg_runner;
+			MultiGridRunnerType & mg_runner;
 			DbgOutputStreamType dbg_logger;
 
 			/**
@@ -169,8 +168,7 @@ namespace grb {
 			) :
 				grb::utils::telemetry::Timeable< TelControllerType >( tt ),
 				mg_runner( _mg_runner ),
-				dbg_logger( _dbg_logger )
-			{}
+				dbg_logger( _dbg_logger ) {}
 
 			/**
 			 * Functional operator to invoke a full CG-MG computation.
@@ -181,9 +179,9 @@ namespace grb {
 			 * @return grb::RC indicating the success or the error occurred
 			 */
 			inline grb::RC operator()(
-				typename MultiGridRunnerType::MultiGridInputType &grid_base,
-				MultiGridCGData< IOType, NonzeroType, InputType > &cg_data,
-				CGOutInfo< ResidualType > &out_info
+				typename MultiGridRunnerType::MultiGridInputType & grid_base,
+				MultiGridCGData< IOType, NonzeroType, InputType > & cg_data,
+				CGOutInfo< ResidualType > & out_info
 			) {
 				this->start();
 				grb::RC ret = multigrid_conjugate_gradient( cg_data, grid_base, out_info );
@@ -209,17 +207,17 @@ namespace grb {
 			 * @return grb::RC SUCCESS in case of succesful run
 			 */
 			grb::RC multigrid_conjugate_gradient(
-				HPCGInputType &cg_data,
-				typename MultiGridRunnerType::MultiGridInputType &grid_base,
-				CGOutInfo< ResidualType > &out_info
+				HPCGInputType & cg_data,
+				typename MultiGridRunnerType::MultiGridInputType & grid_base,
+				CGOutInfo< ResidualType > & out_info
 			) {
-				const grb::Matrix< NonzeroType > &A = grid_base.A; // system matrix
-				grb::Vector< IOType > &r = grid_base.r;  // residual vector
-				grb::Vector< IOType > &z = grid_base.z;  // pre-conditioned residual vector
-				grb::Vector< IOType > &x = cg_data.x; // initial (and final) solution
-				const grb::Vector< InputType > &b = cg_data.b; // right-side value
-				grb::Vector< IOType > &p = cg_data.p;  // direction vector
-				grb::Vector< IOType > &Ap = cg_data.u; // temp vector
+				const grb::Matrix< NonzeroType > & A = grid_base.A; // system matrix
+				grb::Vector< IOType > & r = grid_base.r;            // residual vector
+				grb::Vector< IOType > & z = grid_base.z;            // pre-conditioned residual vector
+				grb::Vector< IOType > & x = cg_data.x;              // initial (and final) solution
+				const grb::Vector< InputType > & b = cg_data.b;     // right-side value
+				grb::Vector< IOType > & p = cg_data.p;              // direction vector
+				grb::Vector< IOType > & Ap = cg_data.u;             // temp vector
 				grb::RC ret = SUCCESS;
 
 				const IOType io_zero = ring.template getZero< IOType >();
@@ -329,12 +327,10 @@ namespace grb {
 					++iter;
 					out_info.iterations = iter;
 					out_info.norm_residual = norm_residual;
-				} while( iter < max_iterations &&
-					norm_residual / norm_residual_initial > tolerance && ret == SUCCESS );
+				} while( iter < max_iterations && norm_residual / norm_residual_initial > tolerance && ret == SUCCESS );
 
 				return ret;
 			}
-
 		};
 
 	} // namespace algorithms
diff --git a/include/graphblas/algorithms/multigrid/multigrid_data.hpp b/include/graphblas/algorithms/multigrid/multigrid_data.hpp
index 67fe7bb8f..4f0d0eed4 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_data.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_data.hpp
@@ -24,13 +24,12 @@
 #ifndef _H_GRB_ALGORITHMS_HPCG_DATA
 #define _H_GRB_ALGORITHMS_HPCG_DATA
 
-#include <vector>
 #include <cstddef>
+#include <vector>
 
 #include <graphblas.hpp>
 #include <graphblas/utils/telemetry/Stopwatch.hpp>
 
-
 namespace grb {
 
 	namespace algorithms {
@@ -61,8 +60,8 @@ namespace grb {
 			const size_t level;           ///< level of the grid (0 for the finest physical system)
 			const size_t system_size;     ///< size of the system, i.e. side of the #A system matrix
 			grb::Matrix< NonzeroType > A; ///< system matrix
-			grb::Vector< IOType > z; ///< multi-grid solution
-			grb::Vector< IOType > r; ///< residual
+			grb::Vector< IOType > z;      ///< multi-grid solution
+			grb::Vector< IOType > r;      ///< residual
 
 			/**
 			 * Construct a new multigrid data object from level information and system size.
@@ -98,4 +97,3 @@ namespace grb {
 } // namespace grb
 
 #endif // _H_GRB_ALGORITHMS_HPCG_DATA
-
diff --git a/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp b/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
index dbe15d2b8..31b623024 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
@@ -26,10 +26,10 @@
 #define _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
 
 #include <cassert>
-#include <vector>
-#include <type_traits>
 #include <memory>
+#include <type_traits>
 #include <utility>
+#include <vector>
 
 #include <graphblas.hpp>
 #include <graphblas/utils/iterators/IteratorValueAdaptor.hpp>
@@ -37,6 +37,7 @@
 
 #include "multigrid_data.hpp"
 
+
 namespace grb {
 	namespace algorithms {
 
@@ -57,8 +58,7 @@ namespace grb {
 		 * @tparam Minus the minus operator for subtractions
 		 * @tparam descr descriptors with statically-known data for computation and containers
 		 */
-		template<
-			typename MGTypes,
+		template< typename MGTypes,
 			typename MGSmootherType,
 			typename CoarsenerType,
 			typename TelControllerType,
@@ -83,63 +83,60 @@ namespace grb {
 				"cannot construct the Minus operator with default values" );
 
 			// check the interface between HPCG and MG match
-			static_assert( std::is_base_of< typename MGSmootherType::SmootherInputType,
-				MultiGridInputType >::value, "input type of the Smoother kernel must match the input from Multi-Grid" );
+			static_assert( std::is_base_of< typename MGSmootherType::SmootherInputType, MultiGridInputType >::value,
+				"input type of the Smoother kernel must match the input from Multi-Grid" );
 
 			MGSmootherType & smoother_runner; ///< object to run the smoother
 			CoarsenerType & coarsener_runner; ///< object to run the coarsener
 			DbgOutputStreamType dbg_logger;
 
 			std::vector< std::unique_ptr< MultiGridInputType > > system_levels; ///< levels of the grid (finest first)
-			Ring ring; ///< algebraic ring
-			Minus minus; ///< minus operator
+			Ring ring;                                                          ///< algebraic ring
+			Minus minus;                                                        ///< minus operator
 
 			// operator to extract the reference out of an std::unique_ptr object
 			struct __extractor {
-				MultiGridInputType* operator()(
-					typename std::vector< std::unique_ptr< MultiGridInputType > >::reference &ref
-				) {
+				MultiGridInputType * operator()(
+					typename std::vector< std::unique_ptr< MultiGridInputType > >::reference & ref ) {
 					return ref.get();
 				}
 
-				const MultiGridInputType* operator()(
-					typename std::vector< std::unique_ptr< MultiGridInputType > >::const_reference &ref
-				) const {
+				const MultiGridInputType * operator()(
+					typename std::vector< std::unique_ptr< MultiGridInputType > >::const_reference & ref ) const {
 					return ref.get();
 				}
 			};
 
 			using __unique_ptr_extractor = grb::utils::IteratorValueAdaptor<
-				typename std::vector< std::unique_ptr< MultiGridInputType > >::iterator,
-				__extractor
-			>;
+				typename std::vector< std::unique_ptr< MultiGridInputType > >::iterator, __extractor >;
 
 			/**
 			 * Construct a new MultiGridRunner object by moving in the state of the pre-built
 			 * smoother and coarsener.
 			 */
 			MultiGridRunner(
-				MGSmootherType &_smoother_runner,
-				CoarsenerType &_coarsener_runner
-			) : smoother_runner( _smoother_runner ),
-				coarsener_runner(  _coarsener_runner )
+				MGSmootherType & _smoother_runner,
+				CoarsenerType & _coarsener_runner
+			) :
+				smoother_runner( _smoother_runner ),
+				coarsener_runner( _coarsener_runner )
 			{
 				static_assert( std::is_default_constructible< DbgOutputStreamType >::value );
 			}
 
 			MultiGridRunner(
-				MGSmootherType &_smoother_runner,
-				CoarsenerType &_coarsener_runner,
+				MGSmootherType & _smoother_runner,
+				CoarsenerType & _coarsener_runner,
 				DbgOutputStreamType & _dbg_logger
-			) : smoother_runner( _smoother_runner ),
-				coarsener_runner(  _coarsener_runner ),
-				dbg_logger( _dbg_logger )
-			{}
+			) :
+				smoother_runner( _smoother_runner ),
+				coarsener_runner( _coarsener_runner ),
+				dbg_logger( _dbg_logger ) {}
 
 			/**
 			 * Operator to invoke a full multi-grid run starting from the given level.
 			 */
-			inline grb::RC operator()( MultiGridInputType &system ) {
+			inline grb::RC operator()( MultiGridInputType & system ) {
 				return this->operator()( __unique_ptr_extractor( system_levels.begin() += system.level ),
 					__unique_ptr_extractor( system_levels.end() ) );
 			}
@@ -172,17 +169,6 @@ namespace grb {
 			 * Failuers of GraphBLAS operations are handled by immediately stopping the execution
 			 * and returning the failure code.
 			 *
-			 * @tparam descr descriptor for static information
-			 * @tparam IOType type of result and intermediate vectors used during computation
-			 * @tparam NonzeroType type of matrix values
-			 * @tparam MGSysIterType type of the iterator across grid levels
-			 * @tparam MGSmootherType type of the smoother runner, with prescribed methods for the various
-			 *  smoothing steps
-			 * @tparam CoarsenerType type of the coarsener runner, with prescribed methods for coarsening
-			 *  and prolongation
-			 * @tparam Ring the ring of algebraic operators zero-values
-			 * @tparam Minus the minus operator for subtractions
-			 *
 			 * @param mgiter_begin iterator pointing to the current level of the multi-grid
 			 * @param mgiter_end end iterator, indicating the end of the recursion
 			 * @param smoother callable object to invoke the smoothing steps
@@ -198,16 +184,16 @@ namespace grb {
 			) {
 				RC ret = SUCCESS;
 				assert( mgiter_begin != mgiter_end );
-				MultiGridInputType &finer_system = *mgiter_begin;
+				MultiGridInputType & finer_system = *mgiter_begin;
 				++mgiter_begin;
 
 				dbg_logger << "mg BEGINNING {" << std::endl;
 
 				// clean destination vector
-				ret = ret ? ret : grb::set< descr >( finer_system.z, ring. template getZero< IOType >() );
+				ret = ret ? ret : grb::set< descr >( finer_system.z, ring.template getZero< IOType >() );
 				dbg_logger << ">>> initial r: " << finer_system.r << std::endl;
 
-				if( !( mgiter_begin != mgiter_end ) ) {
+				if( ! ( mgiter_begin != mgiter_end ) ) {
 					// compute one round of Gauss Seidel and return
 					ret = ret ? ret : smoother_runner.nonrecursive_smooth( finer_system );
 					assert( ret == SUCCESS );
@@ -215,7 +201,7 @@ namespace grb {
 					dbg_logger << "} mg END" << std::endl;
 					return ret;
 				}
-				MultiGridInputType &coarser_system = *mgiter_begin;
+				MultiGridInputType & coarser_system = *mgiter_begin;
 
 				// pre-smoother
 				ret = ret ? ret : smoother_runner.pre_smooth( finer_system );
@@ -241,7 +227,6 @@ namespace grb {
 
 				return ret;
 			}
-
 		};
 
 	} // namespace algorithms
diff --git a/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp b/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
index 305fa30d7..aa7157de7 100644
--- a/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
+++ b/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
@@ -38,8 +38,8 @@ namespace grb {
 		 */
 		template< typename IOType > struct SmootherData {
 
-			grb::Vector< IOType > A_diagonal; ///< vector with the diagonal of #A
-			grb::Vector< IOType > smoother_temp; ///< for smoother's intermediate results
+			grb::Vector< IOType > A_diagonal;               ///< vector with the diagonal of #A
+			grb::Vector< IOType > smoother_temp;            ///< for smoother's intermediate results
 			std::vector< grb::Vector< bool > > color_masks; ///< for color masks
 
 			/**
@@ -59,7 +59,6 @@ namespace grb {
 			}
 		};
 
-
 		/**
 		 * Runner object for the RBGS smoother, with multiple methods for each type of smoothing step:
 		 * pre-, post- and non-recursive, as invoked during a full run of a multi-grid V-cycle.
@@ -70,7 +69,7 @@ namespace grb {
 		 * @tparam TelControllerType telemetry controller to (de)activate time tracing within passed MultiGridData objects
 		 * @tparam descr descriptors with statically-known data for computation and containers
 		 */
-		template <
+		template<
 			class SmootherTypes,
 			typename TelControllerType,
 			Descriptor descr = descriptors::no_operation
@@ -83,27 +82,25 @@ namespace grb {
 			using SmootherInputType = MultiGridData< IOType, NonzeroType, TelControllerType >; ///< external input structure
 			using SmootherDataType = SmootherData< IOType >; ///< smoothing information and temporary variables (per MG level)
 
-			size_t presmoother_steps = 1UL; ///< number of pre-smoother steps
-			size_t postsmoother_steps = 1UL;  ///< number of post-smoother steps
-			size_t non_recursive_smooth_steps = 1UL;  ///< number of smoother steps for the last grid level
-			std::vector< std::unique_ptr< SmootherDataType > > levels;  ///< for each grid level,
-				///< the smoothing data (finest first)
-			Ring ring;  ///< the algebraic ring
+			size_t presmoother_steps = 1UL;                            ///< number of pre-smoother steps
+			size_t postsmoother_steps = 1UL;                           ///< number of post-smoother steps
+			size_t non_recursive_smooth_steps = 1UL;                   ///< number of smoother steps for the last grid level
+			std::vector< std::unique_ptr< SmootherDataType > > levels; ///< for each grid level,
+			                                                           ///< the smoothing data (finest first)
+			Ring ring;                                                 ///< the algebraic ring
 
 			static_assert( std::is_default_constructible< Ring >::value,
 				"cannot construct the Ring operator with default values" );
 
-
-
-			inline grb::RC pre_smooth( SmootherInputType& data ) {
+			inline grb::RC pre_smooth( SmootherInputType & data ) {
 				return run_smoother( data, presmoother_steps );
 			}
 
-			inline grb::RC post_smooth( SmootherInputType& data ) {
+			inline grb::RC post_smooth( SmootherInputType & data ) {
 				return run_smoother( data, postsmoother_steps );
 			}
 
-			inline grb::RC nonrecursive_smooth( SmootherInputType& data ) {
+			inline grb::RC nonrecursive_smooth( SmootherInputType & data ) {
 				return run_smoother( data, non_recursive_smooth_steps );
 			}
 
@@ -116,12 +113,12 @@ namespace grb {
 			 * smoother performs all smoothing steps the same way.
 			 */
 			grb::RC run_smoother(
-				SmootherInputType &data,
+				SmootherInputType & data,
 				const size_t smoother_steps
 			) {
 				RC ret = SUCCESS;
 
-				SmootherDataType &smoothing_info = *( levels.at( data.level ).get() );
+				SmootherDataType & smoothing_info = *( levels.at( data.level ).get() );
 
 				data.sm_stopwatch.start();
 				for( size_t i = 0; i < smoother_steps && ret == SUCCESS; i++ ) {
@@ -145,8 +142,8 @@ namespace grb {
 			 *  unsuccessful operation otherwise
 			 */
 			grb::RC red_black_gauss_seidel_single_step(
-				SmootherInputType &data,
-				SmootherDataType &smoothing_info,
+				SmootherInputType & data,
+				SmootherDataType & smoothing_info,
 				size_t color
 			) {
 				const grb::Matrix< NonzeroType > & A = data.A;
@@ -168,16 +165,16 @@ namespace grb {
 				// z[mask] = r[mask] - smoother_temp[mask] + z[mask] .* diagonal[mask]
 				// z[mask] = z[maks] ./ diagonal[mask]
 				ret = ret ? ret :
-					grb::eWiseLambda(
-						[ &z, &r, &smoother_temp, &color_mask, &A_diagonal ]( const size_t i ) {
-							// if the mask was properly initialized, the check on the mask value is unnecessary;
-							// if( color_mask[ i ] ) {
-							IOType d = A_diagonal[ i ];
-							IOType v = r[ i ] - smoother_temp[ i ] + z[ i ] * d;
-							z[ i ] = v / d;
-							// }
-						},
-						color_mask, z, r, smoother_temp, A_diagonal );
+                            grb::eWiseLambda(
+								[ &z, &r, &smoother_temp, &color_mask, &A_diagonal ]( const size_t i ) {
+									// if the mask was properly initialized, the check on the mask value is unnecessary;
+					                // if( color_mask[ i ] ) {
+									IOType d = A_diagonal[ i ];
+									IOType v = r[ i ] - smoother_temp[ i ] + z[ i ] * d;
+									z[ i ] = v / d;
+									// }
+								},
+								color_mask, z, r, smoother_temp, A_diagonal );
 				assert( ret == SUCCESS );
 				return ret;
 			}
@@ -197,32 +194,31 @@ namespace grb {
 			 *                          unsuccessful operation otherwise
 			 */
 			grb::RC red_black_gauss_seidel(
-				SmootherInputType &data,
-				SmootherDataType &smoothing_info
+				SmootherInputType & data,
+				SmootherDataType & smoothing_info
 			) {
 				RC ret = SUCCESS;
 				// zero the temp output just once, assuming proper masking avoids
 				// interference among different colors
 				ret = ret ? ret : grb::set< descr >( smoothing_info.smoother_temp,
-					ring. template getZero< IOType >() );
+					ring.template getZero< IOType >() );
 
 				// forward step
 				for( size_t color = 0; color < smoothing_info.color_masks.size(); ++color ) {
 					ret = red_black_gauss_seidel_single_step( data, smoothing_info, color );
 				}
 				ret = ret ? ret : grb::set< descr >( smoothing_info.smoother_temp,
-					ring. template getZero< IOType >() );
+					ring.template getZero< IOType >() );
 
 				// backward step
 				for( size_t color = smoothing_info.color_masks.size(); color > 0; --color ) {
 					ret = red_black_gauss_seidel_single_step( data, smoothing_info, color - 1 );
-
 				}
 				return ret;
 			}
 		};
 
-	}     // namespace algorithms
+	} // namespace algorithms
 } // namespace grb
 
 #endif // H_GRB_ALGORITHMS_RED_BLACK_GAUSS_SEIDEL
diff --git a/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp b/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
index 0e2ee58af..4a19f9deb 100644
--- a/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
+++ b/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
@@ -24,8 +24,8 @@
 #ifndef _H_GRB_ALGORITHMS_HPCG_SINGLE_MATRIX_COARSENER
 #define _H_GRB_ALGORITHMS_HPCG_SINGLE_MATRIX_COARSENER
 
-#include <vector>
 #include <memory>
+#include <vector>
 
 #include <graphblas.hpp>
 
@@ -52,7 +52,10 @@ namespace grb {
 			 * @param[in] _finer_size  size of the finer system, i.e. size of external objects \b before coarsening
 			 * @param[in] coarser_size size of the current system, i.e. size \b after coarsening
 			 */
-			CoarseningData( size_t _finer_size, size_t coarser_size ) :
+			CoarseningData(
+				size_t _finer_size,
+				size_t coarser_size
+			) :
 				coarsening_matrix( coarser_size, _finer_size ),
 				Ax_finer( _finer_size ) {}
 
@@ -89,8 +92,7 @@ namespace grb {
 			/**
 			 * Data to coarsen each level, from finer to coarser.
 			 */
-			std::vector< std::unique_ptr< grb::algorithms::CoarseningData< IOType,
-				NonzeroType > > > coarsener_levels;
+			std::vector< std::unique_ptr< grb::algorithms::CoarseningData< IOType, NonzeroType > > > coarsener_levels;
 			Ring ring;
 			Minus minus;
 
@@ -100,12 +102,12 @@ namespace grb {
 			 * \p coarser (the coarser system).
 			 */
 			inline grb::RC coarsen_residual(
-				const MultiGridInputType &finer,
-				MultiGridInputType &coarser
+				const MultiGridInputType & finer,
+				MultiGridInputType & coarser
 			) {
 				// first compute the residual
-				CoarseningData< IOType, NonzeroType > &coarsener = *coarsener_levels[ finer.level ];
-				grb::RC ret = grb::set< descr >( coarsener.Ax_finer, ring. template getZero< IOType >() );
+				CoarseningData< IOType, NonzeroType > & coarsener = *coarsener_levels[ finer.level ];
+				grb::RC ret = grb::set< descr >( coarsener.Ax_finer, ring.template getZero< IOType >() );
 				ret = ret ? ret : grb::mxv< descr >( coarsener.Ax_finer, finer.A, finer.z, ring );
 
 				return ret ? ret : compute_coarsening( finer.r, coarser.r, coarsener );
@@ -116,8 +118,8 @@ namespace grb {
 			 * into the finer solution.
 			 */
 			inline grb::RC prolong_solution(
-				const MultiGridInputType &coarser,
-				MultiGridInputType &finer
+				const MultiGridInputType & coarser,
+				MultiGridInputType & finer
 			) {
 				return compute_prolongation( coarser.z, finer.z, *coarsener_levels[ finer.level ] );
 			}
@@ -136,7 +138,7 @@ namespace grb {
 			 */
 			grb::RC compute_coarsening(
 				const grb::Vector< IOType > & r_fine, // fine residual
-				grb::Vector< IOType > & r_coarse, // fine residual
+				grb::Vector< IOType > & r_coarse, // coarse residual
 				CoarseningData< IOType, NonzeroType > & coarsening_data
 			) {
 				RC ret = SUCCESS;
@@ -171,13 +173,15 @@ namespace grb {
 				RC ret = SUCCESS;
 				// actual refining, from  *coarsening_data->syztem_size == nrows(*coarsening_data->A) / 8
 				// to nrows(z_fine)
-				ret = ret ? ret : grb::set< descr >( coarsening_data.Ax_finer, ring.template getZero< IOType >() );
+				ret = ret ? ret : grb::set< descr >( coarsening_data.Ax_finer,
+					ring.template getZero< IOType >() );
 
 				ret = ret ? ret : grb::mxv< descr | grb::descriptors::transpose_matrix >(
 					coarsening_data.Ax_finer, coarsening_data.coarsening_matrix, z_coarse, ring );
 				assert( ret == SUCCESS );
 
-				ret = ret ? ret : grb::foldl< descr >( z_fine, coarsening_data.Ax_finer, ring.getAdditiveMonoid() ); // z_fine += Ax_finer;
+				ret = ret ? ret : grb::foldl< descr >( z_fine, coarsening_data.Ax_finer,
+					ring.getAdditiveMonoid() ); // z_fine += Ax_finer;
 				assert( ret == SUCCESS );
 				return ret;
 			}
diff --git a/include/graphblas/utils/iterators/IteratorValueAdaptor.hpp b/include/graphblas/utils/iterators/IteratorValueAdaptor.hpp
index 2c0383325..ebac6ca02 100644
--- a/include/graphblas/utils/iterators/IteratorValueAdaptor.hpp
+++ b/include/graphblas/utils/iterators/IteratorValueAdaptor.hpp
@@ -56,7 +56,8 @@ namespace grb {
 			static_assert( std::is_copy_assignable< AdaptorType >::value,
 				"AdaptorType must be copy-assignable" );
 
-			typedef typename std::decay< decltype( *std::declval< AdaptorType >()( *std::declval< InnerIterType >() ) ) >::type value_type;
+			typedef typename std::decay<
+				decltype( *std::declval< AdaptorType >()( *std::declval< InnerIterType >() ) )>::type value_type;
 			typedef value_type & reference;
 			typedef value_type * pointer;
 			typedef const value_type * const_pointer;
@@ -96,8 +97,7 @@ namespace grb {
 			 *
 			 * @param _iter the underlying iterator, to be moved
 			 */
-			IteratorValueAdaptor(
-				typename std::enable_if< std::is_default_constructible< AdaptorType >::value,
+			IteratorValueAdaptor( typename std::enable_if< std::is_default_constructible< AdaptorType >::value,
 				InnerIterType && >::type _iter
 			) :
 				iter( std::move( _iter ) ),
diff --git a/include/graphblas/utils/multigrid/array_vector_storage.hpp b/include/graphblas/utils/multigrid/array_vector_storage.hpp
index a40850f77..cfca1dda2 100644
--- a/include/graphblas/utils/multigrid/array_vector_storage.hpp
+++ b/include/graphblas/utils/multigrid/array_vector_storage.hpp
@@ -25,10 +25,10 @@
 #ifndef _H_GRB_ALGORITHMS_MULTIGRID_ARRAY_VECTOR_STORAGE
 #define _H_GRB_ALGORITHMS_MULTIGRID_ARRAY_VECTOR_STORAGE
 
-#include <array>
-#include <stdexcept>
 #include <algorithm>
+#include <array>
 #include <cstddef>
+#include <stdexcept>
 
 namespace grb {
 	namespace utils {
@@ -46,11 +46,10 @@ namespace grb {
 			template<
 				size_t DIMS,
 				typename DataType
-			> class ArrayVectorStorage: public std::array< DataType, DIMS > {
+			> class ArrayVectorStorage : public std::array< DataType, DIMS > {
 			public:
-
-				using VectorStorageType = std::array< DataType, DIMS >&;
-				using ConstVectorStorageType = const std::array< DataType, DIMS >&;
+				using VectorStorageType = std::array< DataType, DIMS > &;
+				using ConstVectorStorageType = const std::array< DataType, DIMS > &;
 				using SelfType = ArrayVectorStorage< DIMS, DataType >;
 
 				/**
@@ -62,25 +61,25 @@ namespace grb {
 				ArrayVectorStorage( size_t _dimensions ) {
 					static_assert( DIMS > 0, "cannot allocate 0-sized array" );
 					if( _dimensions != DIMS ) {
-						throw std::invalid_argument("given dimensions must match the type dimensions");
+						throw std::invalid_argument( "given dimensions must match the type dimensions" );
 					}
 				}
 
 				ArrayVectorStorage() = delete;
 
 				// only copy constructor/assignment, since there's no external storage
-				ArrayVectorStorage( const SelfType &o ) noexcept {
+				ArrayVectorStorage( const SelfType & o ) noexcept {
 					std::copy_n( o.cbegin(), DIMS, this->begin() );
 				}
 
-				ArrayVectorStorage( SelfType &&o ) = delete;
+				ArrayVectorStorage( SelfType && o ) = delete;
 
-				SelfType& operator=( const SelfType &original ) noexcept {
+				SelfType & operator=( const SelfType & original ) noexcept {
 					std::copy_n( original.begin(), DIMS, this->begin() );
 					return *this;
 				}
 
-				SelfType & operator=( SelfType &&original ) = delete;
+				SelfType & operator=( SelfType && original ) = delete;
 
 				/**
 				 * Returns the geometrical dimensions of this vector, i.e. of the
@@ -106,7 +105,7 @@ namespace grb {
 			};
 
 		} // namespace multigrid
-	} // namespace utils
+	}     // namespace utils
 } // namespace grb
 
 #endif // _H_GRB_ALGORITHMS_MULTIGRID_ARRAY_VECTOR_STORAGE
diff --git a/include/graphblas/utils/multigrid/dynamic_vector_storage.hpp b/include/graphblas/utils/multigrid/dynamic_vector_storage.hpp
index 9e5b7f92e..fff89b6db 100644
--- a/include/graphblas/utils/multigrid/dynamic_vector_storage.hpp
+++ b/include/graphblas/utils/multigrid/dynamic_vector_storage.hpp
@@ -24,9 +24,8 @@
 #ifndef _H_GRB_ALGORITHMS_MULTIGRID_DYNAMIC_VECTOR_STORAGE
 #define _H_GRB_ALGORITHMS_MULTIGRID_DYNAMIC_VECTOR_STORAGE
 
-#include <cstddef>
-#include <cstddef>
 #include <algorithm>
+#include <cstddef>
 
 namespace grb {
 	namespace utils {
@@ -43,7 +42,7 @@ namespace grb {
 			template< typename DataType > class DynamicVectorStorage {
 
 				size_t _dimensions;
-				DataType* _storage;
+				DataType * _storage;
 
 				void clean() {
 					if( this->_storage != nullptr ) {
@@ -53,47 +52,46 @@ namespace grb {
 
 			public:
 				// iterator fields
-				using reference = DataType&;
-				using const_reference = const DataType&;
-				using iterator = DataType*;
-				using const_iterator = const DataType*;
-				using pointer = DataType*;
-				using const_pointer = const DataType*;
-
-				using VectorStorageType = DataType*;
-				using ConstVectorStorageType = DataType*;
+				using reference = DataType &;
+				using const_reference = const DataType &;
+				using iterator = DataType *;
+				using const_iterator = const DataType *;
+				using pointer = DataType *;
+				using const_pointer = const DataType *;
+
+				using VectorStorageType = DataType *;
+				using ConstVectorStorageType = DataType *;
 				using SelfType = DynamicVectorStorage< DataType >;
 
-				DynamicVectorStorage( size_t __dimensions ):
-					_dimensions( __dimensions ) {
+				DynamicVectorStorage( size_t __dimensions ) : _dimensions( __dimensions ) {
 					if( __dimensions == 0 ) {
-						throw std::invalid_argument("dimensions cannot be 0");
+						throw std::invalid_argument( "dimensions cannot be 0" );
 					}
 					this->_storage = new DataType[ __dimensions ];
 				}
 
 				DynamicVectorStorage() = delete;
 
-				DynamicVectorStorage( const SelfType &o ):
+				DynamicVectorStorage( const SelfType & o ) :
 					_dimensions( o._dimensions ),
 					_storage( new DataType[ o._dimensions ] )
 				{
 					std::copy_n( o._storage, o._dimensions, this->_storage );
 				}
 
-				DynamicVectorStorage( SelfType &&o ) = delete;
+				DynamicVectorStorage( SelfType && o ) = delete;
 
-				SelfType& operator=( const SelfType &original ) {
+				SelfType & operator=( const SelfType & original ) {
 					if( original._dimensions != this->_dimensions ) {
 						this->clean();
-						this->_storage = new DataType[ original._dimensions];
+						this->_storage = new DataType[ original._dimensions ];
 					}
 					this->_dimensions = original._dimensions;
 					std::copy_n( original._storage, original._dimensions, this->_storage );
 					return *this;
 				}
 
-				SelfType& operator=( SelfType &&original ) = delete;
+				SelfType & operator=( SelfType && original ) = delete;
 
 				~DynamicVectorStorage() {
 					this->clean();
@@ -136,7 +134,7 @@ namespace grb {
 				}
 
 				inline reference operator[]( size_t pos ) {
-					return *( this->_storage + pos);
+					return *( this->_storage + pos );
 				}
 
 				inline const_reference operator[]( size_t pos ) const {
@@ -145,7 +143,7 @@ namespace grb {
 			};
 
 		} // namespace multigrid
-	} // namespace utils
+	}     // namespace utils
 } // namespace grb
 
 #endif // _H_GRB_ALGORITHMS_MULTIGRID_DYNAMIC_VECTOR_STORAGE
diff --git a/include/graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp b/include/graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp
index 2404cdf00..e51d7d6df 100644
--- a/include/graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp
+++ b/include/graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp
@@ -37,8 +37,8 @@
 
 #include "array_vector_storage.hpp"
 #include "linearized_halo_ndim_system.hpp"
-#include "linearized_ndim_system.hpp"
 #include "linearized_ndim_iterator.hpp"
+#include "linearized_ndim_system.hpp"
 
 namespace grb {
 	namespace utils {
@@ -67,11 +67,9 @@ namespace grb {
 				typename CoordType,
 				typename ValueType,
 				typename ValueCallable
-			>
-			struct HaloMatrixGeneratorIterator {
+			> struct HaloMatrixGeneratorIterator {
 
-				static_assert( std::is_copy_constructible< ValueCallable >::value,
-					"ValueCallable must be copy-constructible" );
+				static_assert( std::is_copy_constructible< ValueCallable >::value, "ValueCallable must be copy-constructible" );
 
 				using RowIndexType = CoordType; ///< numeric type of rows
 				using ColumnIndexType = CoordType;
@@ -84,23 +82,26 @@ namespace grb {
 					friend SelfType;
 
 					HaloPoint(
-						const ValueCallable &value_producer,
+						const ValueCallable & value_producer,
 						RowIndexType i,
 						ColumnIndexType j
 					) noexcept :
 						_value_producer( value_producer ),
 						_i( i ),
-						_j( j )
-					{}
+						_j( j ) {}
 
 					HaloPoint( const HaloPoint & ) = default;
 
 					HaloPoint & operator=( const HaloPoint & ) = default;
 
-					inline RowIndexType i() const { return _i; }
-					inline ColumnIndexType j() const { return _j; }
+					inline RowIndexType i() const {
+						return _i;
+					}
+					inline ColumnIndexType j() const {
+						return _j;
+					}
 					inline ValueType v() const {
-						return _value_producer( _i, _j);
+						return _value_producer( _i, _j );
 					}
 
 				private:
@@ -126,8 +127,8 @@ namespace grb {
 				 * @param non_diag value to emit outside the diagonal
 				 */
 				HaloMatrixGeneratorIterator(
-					const LinearSystemType &system,
-					const ValueCallable &value_producer
+					const LinearSystemType & system,
+					const ValueCallable & value_producer
 				) noexcept :
 					_val( value_producer, 0, 0 ),
 					_lin_system( &system ),
@@ -150,7 +151,7 @@ namespace grb {
 				 * @return HaloMatrixGeneratorIterator<DIMS, T>& \c this object, with the updated state
 				 */
 				SelfType & operator++() noexcept {
-					(void) ++_sys_iter;
+					(void)++_sys_iter;
 					update_coords();
 					return *this;
 				}
@@ -161,7 +162,7 @@ namespace grb {
 					return *this;
 				}
 
-				difference_type operator-( const SelfType &other ) const {
+				difference_type operator-( const SelfType & other ) const {
 					return this->_sys_iter - other._sys_iter;
 				}
 
@@ -172,7 +173,7 @@ namespace grb {
 				 * @return true of the row or the column is different between \p o and \c this
 				 * @return false if both row and column of \p o and \c this are equal
 				 */
-				bool operator!=( const SelfType &o ) const {
+				bool operator!=( const SelfType & o ) const {
 					return this->_sys_iter != o._sys_iter;
 				}
 
@@ -183,7 +184,7 @@ namespace grb {
 				 * @return true of the row or the column is different between \p o and \c this
 				 * @return false if both row and column of \p o and \c this are equal
 				 */
-				bool operator==( const SelfType &o ) const {
+				bool operator==( const SelfType & o ) const {
 					return ! operator!=( o );
 				}
 
@@ -227,7 +228,7 @@ namespace grb {
 
 			private:
 				value_type _val;
-				const LinearSystemType *_lin_system;
+				const LinearSystemType * _lin_system;
 				Iterator _sys_iter;
 
 				void update_coords() {
@@ -237,7 +238,7 @@ namespace grb {
 			};
 
 		} // namespace multigrid
-	} // namespace utils
+	}     // namespace utils
 } // namespace grb
 
 #endif // _H_GRB_ALGORITHMS_MULTIGRID_HALO_MATRIX_GENRATOR_ITERATOR
diff --git a/include/graphblas/utils/multigrid/linearized_halo_ndim_iterator.hpp b/include/graphblas/utils/multigrid/linearized_halo_ndim_iterator.hpp
index 62e4dcd4a..3a5047277 100644
--- a/include/graphblas/utils/multigrid/linearized_halo_ndim_iterator.hpp
+++ b/include/graphblas/utils/multigrid/linearized_halo_ndim_iterator.hpp
@@ -25,16 +25,15 @@
 #define _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_ITERATOR
 
 #include <cstddef>
-#include <vector>
 #include <iterator>
 #include <limits>
-#include <cstddef>
+#include <vector>
 
 #include <graphblas/utils/iterators/utils.hpp>
 
-#include "linearized_ndim_system.hpp"
 #include "array_vector_storage.hpp"
 #include "linearized_ndim_iterator.hpp"
+#include "linearized_ndim_system.hpp"
 
 namespace grb {
 	namespace utils {
@@ -100,10 +99,8 @@ namespace grb {
 			 * @tparam SizeType type of coordinates and of sizes (must be large enough to describe the size
 			 * of the system along each direction)
 			 */
-			template<
-				size_t DIMS,
-				typename SizeType
-			> class LinearizedHaloNDimIterator {
+			template< size_t DIMS, typename SizeType >
+			class LinearizedHaloNDimIterator {
 
 				using SystemType = LinearizedHaloNDimSystem< DIMS, SizeType >;
 				using VectorType = ArrayVectorStorage< DIMS, SizeType >;
@@ -120,14 +117,13 @@ namespace grb {
 				 */
 				struct HaloNDimElement {
 				private:
-
 					// for linearization
-					const SystemType* _system;
+					const SystemType * _system;
 
 					// for iteration
 					VectorIteratorType _element_iter; // coordinates iterator
 
-					VectorType _neighbor; //the current neighbor
+					VectorType _neighbor; // the current neighbor
 					SizeType _position;
 
 				public:
@@ -135,11 +131,11 @@ namespace grb {
 
 					HaloNDimElement() = delete;
 
-					HaloNDimElement( const HaloNDimElement& ) = default;
+					HaloNDimElement( const HaloNDimElement & ) = default;
 
-					HaloNDimElement( HaloNDimElement&& ) = delete;
+					HaloNDimElement( HaloNDimElement && ) = delete;
 
-					HaloNDimElement( const SystemType& system ) noexcept :
+					HaloNDimElement( const SystemType & system ) noexcept :
 						_system( &system ),
 						_element_iter( system ),
 						_neighbor( DIMS ),
@@ -148,7 +144,7 @@ namespace grb {
 						std::fill_n( this->_neighbor.begin(), DIMS, 0 );
 					}
 
-					HaloNDimElement& operator=( const HaloNDimElement& ) = default;
+					HaloNDimElement & operator=( const HaloNDimElement & ) = default;
 
 					/**
 					 * Get the element as vector coordinates.
@@ -189,8 +185,8 @@ namespace grb {
 				// interface for std::random_access_iterator
 				using iterator_category = std::random_access_iterator_tag;
 				using value_type = HaloNDimElement;
-				using pointer = const HaloNDimElement*;
-				using reference = const HaloNDimElement&;
+				using pointer = const HaloNDimElement *;
+				using reference = const HaloNDimElement &;
 				using difference_type = signed long;
 
 				LinearizedHaloNDimIterator() = delete;
@@ -203,7 +199,7 @@ namespace grb {
 				 *
 				 * IF \p system is not valid anymore, then also \c this is not.
 				 */
-				LinearizedHaloNDimIterator( const SystemType& system ) noexcept :
+				LinearizedHaloNDimIterator( const SystemType & system ) noexcept :
 					_point( system ),
 					_neighbors_subspace( DIMS, system.halo() + 1 ),
 					_neighbors_start( DIMS ),
@@ -217,7 +213,7 @@ namespace grb {
 
 				SelfType & operator=( const SelfType & ) = default;
 
-				bool operator!=( const SelfType &other ) const {
+				bool operator!=( const SelfType & other ) const {
 					return this->_point._position != other._point._position; // use linear coordinate
 				}
 
@@ -226,7 +222,7 @@ namespace grb {
 				}
 
 				pointer operator->() const {
-					return &(this->_point);
+					return &( this->_point );
 				}
 
 				/**
@@ -243,10 +239,10 @@ namespace grb {
 				 * Does \b not advance the element, which should be done manually via #next_element().
 				 */
 				void next_neighbour() {
-					if( !has_more_neighbours() ) {
-						throw std::out_of_range("the current element has no more neighbors");
+					if( ! has_more_neighbours() ) {
+						throw std::out_of_range( "the current element has no more neighbors" );
 					}
-					++(this->_neighbor_iter);
+					++( this->_neighbor_iter );
 					this->on_neighbor_iter_update();
 					this->_point._position++;
 				}
@@ -255,20 +251,19 @@ namespace grb {
 				 * Tells whether the system has more elements.
 				 */
 				bool has_more_elements() const {
-					return this->_point.get_element_linear() != (this->_point._system)->base_system_size();
+					return this->_point.get_element_linear() != ( this->_point._system )->base_system_size();
 				}
 
 				/**
 				 * Moves \c this to point to the next element, setting the neighbor as the first one.
 				 */
 				void next_element() {
-					if( !has_more_elements() ) {
-						throw std::out_of_range("the system has no more elements");
+					if( ! has_more_elements() ) {
+						throw std::out_of_range( "the system has no more elements" );
 					}
 					size_t num_neighbours = this->_neighbors_subspace.system_size();
-					size_t neighbour_position_offset =
-						this->_neighbors_subspace.ndim_to_linear( this->_neighbor_iter->get_position() );
-					++(this->_point._element_iter);
+					size_t neighbour_position_offset = this->_neighbors_subspace.ndim_to_linear( this->_neighbor_iter->get_position() );
+					++( this->_point._element_iter );
 					this->on_element_advance();
 					this->_point._position -= neighbour_position_offset;
 					this->_point._position += num_neighbours;
@@ -278,9 +273,9 @@ namespace grb {
 				 * Moves \c this to point to the next neighbor, also advancing the element if needed.
 				 */
 				SelfType & operator++() noexcept {
-					++(this->_neighbor_iter);
-					if( !has_more_neighbours() ) {
-						++(this->_point._element_iter);
+					++( this->_neighbor_iter );
+					if( ! has_more_neighbours() ) {
+						++( this->_point._element_iter );
 						this->on_element_advance();
 
 					} else {
@@ -302,7 +297,7 @@ namespace grb {
 						throw std::range_error( "neighbor linear value beyond system" );
 					}
 					VectorType final_element( DIMS );
-					size_t neighbor_index = (this->_point._system->neighbour_linear_to_element( final_position, final_element ));
+					size_t neighbor_index = ( this->_point._system->neighbour_linear_to_element( final_position, final_element ) );
 
 					this->_point._element_iter = VectorIteratorType( *this->_point._system, final_element.cbegin() );
 					this->_point._position = final_position;
@@ -323,9 +318,8 @@ namespace grb {
 				 *
 				 * It throws if the result cannot be stored as a difference_type variable.
 				 */
-				difference_type operator-( const SelfType &other ) const {
-					return grb::utils::compute_signed_distance< difference_type, SizeType >(
-						_point.get_position(), other._point.get_position() );
+				difference_type operator-( const SelfType & other ) const {
+					return grb::utils::compute_signed_distance< difference_type, SizeType >( _point.get_position(), other._point.get_position() );
 				}
 
 				/**
@@ -333,7 +327,7 @@ namespace grb {
 				 *
 				 * The implementation depends on the logic of operator++.
 				 */
-				static SelfType make_system_end_iterator( const SystemType& system ) {
+				static SelfType make_system_end_iterator( const SystemType & system ) {
 					SelfType result( system );
 					// go to the very first point outside of space
 					result._point._element_iter = VectorIteratorType::make_system_end_iterator( system );
@@ -355,8 +349,7 @@ namespace grb {
 				 */
 				inline void on_neighbor_iter_update() {
 					for( size_t i = 0; i < DIMS; i++ ) {
-						this->_point._neighbor[i] = this->_neighbors_start[i]
-							+ this->_neighbor_iter->get_position()[i];
+						this->_point._neighbor[ i ] = this->_neighbors_start[ i ] + this->_neighbor_iter->get_position()[ i ];
 					}
 				}
 
@@ -367,11 +360,7 @@ namespace grb {
 				void on_element_update() {
 					// reset everything
 					VectorType neighbors_range( DIMS );
-					this->_point._system->compute_neighbors_range(
-						this->_point._element_iter->get_position(),
-						this->_neighbors_start,
-						neighbors_range
-					);
+					this->_point._system->compute_neighbors_range( this->_point._element_iter->get_position(), this->_neighbors_start, neighbors_range );
 					// re-target _neighbors_subspace
 					this->_neighbors_subspace.retarget( neighbors_range );
 				}
@@ -391,7 +380,7 @@ namespace grb {
 			};
 
 		} // namespace multigrid
-	} // namespace utils
+	}     // namespace utils
 } // namespace grb
 
 #endif // _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_ITERATOR
diff --git a/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp b/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp
index 400fdd3ab..1ebe04b73 100644
--- a/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp
+++ b/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp
@@ -24,20 +24,19 @@
 #ifndef _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_SYSTEM
 #define _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_SYSTEM
 
-#include <cstddef>
-#include <vector>
 #include <array>
 #include <cassert>
 #include <cstddef>
+#include <vector>
 #ifdef _DEBUG
 #include <iostream>
 #endif
 
 #include "array_vector_storage.hpp"
 #include "dynamic_vector_storage.hpp"
-#include "ndim_vector.hpp"
-#include "linearized_ndim_system.hpp"
 #include "linearized_halo_ndim_iterator.hpp"
+#include "linearized_ndim_system.hpp"
+#include "ndim_vector.hpp"
 
 namespace grb {
 	namespace utils {
@@ -84,7 +83,7 @@ namespace grb {
 			template<
 				size_t DIMS,
 				typename SizeType
-			> class LinearizedHaloNDimSystem:
+			> class LinearizedHaloNDimSystem :
 				public LinearizedNDimSystem< SizeType, ArrayVectorStorage< DIMS, SizeType > > {
 			public:
 				using VectorType = ArrayVectorStorage< DIMS, SizeType >;
@@ -108,16 +107,16 @@ namespace grb {
 					_halo( halo )
 				{
 					for( SizeType __size : sizes ) {
-						if ( __size < halo + 1 ) {
+						if( __size < halo + 1 ) {
 							throw std::invalid_argument(
-								std::string( "the halo (" + std::to_string(halo) +
-								std::string( ") goes beyond a system size (" ) +
-								std::to_string( __size) + std::string( ")" ) ) );
+								std::string( "the halo (" + std::to_string( halo )
+								+ std::string( ") goes beyond a system size (" )
+								+ std::to_string( __size ) + std::string( ")" ) ) );
 						}
 					}
 
-					this->_system_size = init_neigh_to_base_search( this->get_sizes(),
-						_halo, this->_dimension_limits );
+					this->_system_size = init_neigh_to_base_search(
+						this->get_sizes(), _halo, this->_dimension_limits );
 					assert( this->_dimension_limits.size() == DIMS );
 				}
 
@@ -181,16 +180,12 @@ namespace grb {
 				 * otherwise (on corner, edge, or face).
 				 */
 				void compute_neighbors_range(
-					const VectorType &element_coordinates,
-					VectorType &neighbors_start,
-					VectorType &neighbors_range
+					const VectorType & element_coordinates,
+					VectorType & neighbors_start,
+					VectorType & neighbors_range
 				) const noexcept {
 					compute_first_neigh_and_range( this->get_sizes(),
-						this->_halo,
-						element_coordinates,
-						neighbors_start,
-						neighbors_range
-					);
+						this->_halo, element_coordinates, neighbors_start, neighbors_range );
 				}
 
 				/**
@@ -206,9 +201,9 @@ namespace grb {
 				 * \a 0<=i<n is the the index of \p neighbor_linear among \a e's neighbors, computed w.r.t. the
 				 * iteration order.
 				 */
-				size_t neighbour_linear_to_element (
+				size_t neighbour_linear_to_element(
 					SizeType neighbor_linear,
-					VectorType &base_element_vector
+					VectorType & base_element_vector
 				) const noexcept {
 					return map_neigh_to_base_and_index( this->get_sizes(), this->_system_size,
 						this->_dimension_limits, this->_halo, neighbor_linear, base_element_vector );
@@ -216,7 +211,8 @@ namespace grb {
 
 			private:
 				const SizeType _halo;
-				std::vector< NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > > > _dimension_limits;
+				std::vector< NDimVector< SizeType, SizeType,
+					DynamicVectorStorage< SizeType > > > _dimension_limits;
 				size_t _system_size;
 
 				/**
@@ -242,12 +238,12 @@ namespace grb {
 				 * @return size_t the total number of neighbors for this configuration and this dimension
 				 */
 				static size_t accumulate_dimension_neighbours(
-					const NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > > &prev_neighs,
-					SizeType* coords_buffer,
+					const NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > > & prev_neighs,
+					SizeType * coords_buffer,
 					size_t halo,
 					size_t local_size
 				) {
-					size_t neighs =0;
+					size_t neighs = 0;
 					size_t h = 0;
 					for( ; h < halo && local_size > 1; h++ ) {
 						*coords_buffer = h;
@@ -278,14 +274,14 @@ namespace grb {
 				 */
 				static void compute_dim0_neighbors(
 					size_t halo,
-					NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > >& config_neighbors
+					NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > > & config_neighbors
 				) {
-					using it_type = typename NDimVector< SizeType, SizeType,
-						DynamicVectorStorage< SizeType > >::DomainIterator;
+					using it_type = typename NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > >::DomainIterator;
 					it_type end = config_neighbors.domain_end();
 					for( it_type it = config_neighbors.domain_begin(); it != end; ++it ) {
 						size_t res = 1;
-						for( size_t h: it->get_position() ) res *= (h + 1 + halo);
+						for( size_t h : it->get_position() )
+							res *= ( h + 1 + halo );
 						config_neighbors.at( it->get_position() ) = res;
 					}
 				}
@@ -328,42 +324,43 @@ namespace grb {
 				 * @return size_t the number of neighbors of the entire system
 				 */
 				static size_t init_neigh_to_base_search(
-					typename LinearizedNDimSystem< SizeType,
-						ArrayVectorStorage< DIMS, SizeType > >::ConstVectorReference
-						sizes,
+					typename LinearizedNDimSystem< SizeType, ArrayVectorStorage< DIMS, SizeType >
+						>::ConstVectorReference sizes,
 					size_t halo,
-					std::vector< NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > > >& dimension_limits
+					std::vector< NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > > > & dimension_limits
 				) {
 					using nd_vec = NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > >;
 					using nd_vec_iterator = typename nd_vec::DomainIterator;
 
-					std::vector<size_t> halo_sizes( DIMS, halo + 1);
-					dimension_limits.emplace_back(halo_sizes);
+					std::vector< size_t > halo_sizes( DIMS, halo + 1 );
+					dimension_limits.emplace_back( halo_sizes );
 					// initialize values
-					compute_dim0_neighbors( halo, dimension_limits[0] );
+					compute_dim0_neighbors( halo, dimension_limits[ 0 ] );
 					for( size_t i = 1; i < DIMS; i++ ) {
-						std::vector<size_t> halos( DIMS - i, halo + 1 );
-						dimension_limits.emplace_back(halos);
+						std::vector< size_t > halos( DIMS - i, halo + 1 );
+						dimension_limits.emplace_back( halos );
 					}
 
 					std::array< SizeType, DIMS > prev_coords_buffer; // store at most DIMS values
-					SizeType* const prev_coords = prev_coords_buffer.data();
-					SizeType* const second = prev_coords + 1; // store previous coordinates from second position
+					SizeType * const prev_coords = prev_coords_buffer.data();
+					SizeType * const second = prev_coords + 1; // store previous coordinates from second position
 					for( size_t dimension = 1; dimension < DIMS; dimension++ ) {
-						const nd_vec& prev_neighs{dimension_limits[dimension - 1]};
-						nd_vec& current_neighs{dimension_limits[dimension]};
+						const nd_vec & prev_neighs { dimension_limits[ dimension - 1 ] };
+						nd_vec & current_neighs { dimension_limits[ dimension ] };
 
 						nd_vec_iterator end = current_neighs.domain_end();
 						for( nd_vec_iterator it = current_neighs.domain_begin(); it != end; ++it ) {
 							typename nd_vec::ConstDomainVectorReference current_halo_coords = it->get_position();
 
 							std::copy( it->get_position().cbegin(), it->get_position().cend(), second );
-							size_t local_size = sizes[dimension - 1];
-							const size_t neighs = accumulate_dimension_neighbours(prev_neighs, prev_coords, halo, local_size);
-							current_neighs.at(current_halo_coords) = neighs;
+							size_t local_size = sizes[ dimension - 1 ];
+							const size_t neighs = accumulate_dimension_neighbours( prev_neighs,
+								prev_coords, halo, local_size );
+							current_neighs.at( current_halo_coords ) = neighs;
 						}
 					}
-					return accumulate_dimension_neighbours( dimension_limits[DIMS - 1], prev_coords, halo, sizes.back() );
+					return accumulate_dimension_neighbours( dimension_limits[ DIMS - 1 ],
+						prev_coords, halo, sizes.back() );
 				}
 
 				/**
@@ -382,22 +379,24 @@ namespace grb {
 				 * @param[out] neighbors_range stores the range of neighbors around \p element_coordinates
 				 */
 				static void compute_first_neigh_and_range(
-					const ArrayVectorStorage< DIMS, SizeType > &_system_sizes,
+					const ArrayVectorStorage< DIMS, SizeType > & _system_sizes,
 					const SizeType halo,
-					const ArrayVectorStorage< DIMS, SizeType > &element_coordinates,
-					ArrayVectorStorage< DIMS, SizeType > &neighbors_start,
-					ArrayVectorStorage< DIMS, SizeType > &neighbors_range
+					const ArrayVectorStorage< DIMS, SizeType > & element_coordinates,
+					ArrayVectorStorage< DIMS, SizeType > & neighbors_start,
+					ArrayVectorStorage< DIMS, SizeType > & neighbors_range
 				) {
-					for( SizeType i = 0; i < DIMS/* - 1*/; i++ ) {
-						const SizeType start = element_coordinates[i] <= halo ? 0 : element_coordinates[i] - halo;
-						const SizeType end = std::min( element_coordinates[i] + halo, _system_sizes[i] - 1 );
-						neighbors_start[i] = start;
-						neighbors_range[i] = end - start + 1;
+					for( SizeType i = 0; i < DIMS /* - 1*/; i++ ) {
+						const SizeType start = element_coordinates[ i ] <= halo ? 0 :
+							element_coordinates[ i ] - halo;
+						const SizeType end = std::min( element_coordinates[ i ] + halo, _system_sizes[ i ] - 1 );
+						neighbors_start[ i ] = start;
+						neighbors_range[ i ] = end - start + 1;
 					}
 				}
 
 #ifdef _DEBUG
-				template< typename IterType > static std::ostream & print_sequence( IterType begin, IterType end ) {
+				template< typename IterType >
+				static std::ostream & print_sequence( IterType begin, IterType end ) {
 					for( ; begin != end; ++begin ) {
 						std::cout << *begin << ' ';
 					}
@@ -423,17 +422,17 @@ namespace grb {
 				 * @return size_t the index of the neighbor within the element's neighbors
 				 */
 				static size_t map_neigh_to_base_and_index(
-					const std::array< SizeType, DIMS > &sizes,
+					const std::array< SizeType, DIMS > & sizes,
 					size_t system_size,
-					const std::vector< NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > > >
-						&neighbors_per_dimension,
+					const std::vector< NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > >
+						> & neighbors_per_dimension,
 					SizeType halo,
 					SizeType neighbor_linear,
-					ArrayVectorStorage< DIMS, SizeType > &element_vector
-				){
+					ArrayVectorStorage< DIMS, SizeType > & element_vector
+				) {
 					if( neighbor_linear > system_size ) {
 						throw std::invalid_argument( "neighbor number ( " + std::to_string( neighbor_linear )
-							+ " ) >= system size ( " + std::to_string( system_size ) + " )");
+							+ " ) > system size ( " + std::to_string( system_size ) + " )" );
 					}
 					ArrayVectorStorage< DIMS, SizeType > configuration( DIMS );
 #ifdef _DEBUG
@@ -441,8 +440,7 @@ namespace grb {
 #endif
 					std::fill_n( configuration.begin(), DIMS, 0 );
 
-					for( size_t _dim = DIMS; _dim > 0; _dim--) {
-
+					for( size_t _dim = DIMS; _dim > 0; _dim-- ) {
 						// each iteration looks for the base element along a dimension via the number of neighbors
 						// each element has: once previous_neighs reaches neighbor_linear, the corresponding
 						// base element is found; if the control reaches the end, this means it must explore
@@ -453,21 +451,20 @@ namespace grb {
 						// start from highest dimension
 						const size_t dimension = _dim - 1;
 						// how many elements along this dimension
-						const size_t dimension_size = sizes[dimension];
+						const size_t dimension_size = sizes[ dimension ];
 						// configurations of neighbors along this dimension
 						// (e.g., corner, edge; or edge, inner element)
-						const NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > >
-							& neighbors = neighbors_per_dimension[dimension];
+						const NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > > & neighbors =
+							neighbors_per_dimension[ dimension ];
 
 						// coordinate to modify to identify each configuration
 						SizeType * const halo_coords_begin = configuration.data() + dimension;
 #ifdef _DEBUG
-						std::cout << "DIMENSION " << dimension << std::endl
-							<< "- setup - neighbour " << neighbor_linear << std::endl
-							<< "\thalo : ";
+						std::cout << "DIMENSION " << dimension << std::endl << "- setup - neighbour "
+							<< neighbor_linear << std::endl << "\thalo : ";
 						print_sequence( halo_coords_begin, halo_coords_end ) << std::endl;
 #endif
-						size_t h =0; // configuration type along this dimension
+						size_t h = 0; // configuration type along this dimension
 						size_t previous_neighs = 0;
 						*halo_coords_begin = h;
 						// account for neighbors in the first elements along the dimension, within halo distance:
@@ -481,16 +478,14 @@ namespace grb {
 							halo_max_neighs = neighbors.at( halo_coords_begin );
 						}
 #ifdef _DEBUG
-						std::cout << "- initial halo - neighbour " << neighbor_linear << std::endl
-							<< "\th " << h << std::endl
-							<< "\thalo : ";
+						std::cout << "- initial halo - neighbour " << neighbor_linear << std::endl << "\th " << h << std::endl << "\thalo : ";
 						print_sequence( halo_coords_begin, halo_coords_end ) << std::endl;
 						std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
 #endif
-						if ( h < halo ){
+						if( h < halo ) {
 							// we have already counted enough neighbors: neighbor_linear is thus a neighbor
 							// of one of the first (< halo) elements along this dimension: go to next dimension
-							element_vector[dimension] = h;
+							element_vector[ dimension ] = h;
 							neighbor_linear -= previous_neighs;
 #ifdef _DEBUG
 							std::cout << "end neighbour " << neighbor_linear << std::endl;
@@ -504,17 +499,17 @@ namespace grb {
 						const size_t distance_from_halo = ( neighbor_linear - previous_neighs ) / halo_max_neighs;
 #ifdef _DEBUG
 						std::cout << "- before middle elements - neighbour " << neighbor_linear << std::endl
-							<< "\tprevious_neighs " << previous_neighs << std::endl
-							<< "\thalo_max_neighs " << halo_max_neighs << std::endl
-							<< "\tdistance_from_halo " << distance_from_halo << std::endl
-							<< "\tdimension_size " << dimension_size << std::endl;
+								  << "\tprevious_neighs " << previous_neighs << std::endl
+								  << "\thalo_max_neighs " << halo_max_neighs << std::endl
+								  << "\tdistance_from_halo " << distance_from_halo << std::endl
+								  << "\tdimension_size " << dimension_size << std::endl;
 #endif
-						if ( distance_from_halo < dimension_size - 2 * halo ) {
+						if( distance_from_halo < dimension_size - 2 * halo ) {
 							// the base element is one of the internal elements along this dimension:
 							// hence return its diatance from the halo + the halo itself (= distance from
 							// beginning of the space)
-							element_vector[dimension] =  distance_from_halo + halo;
-							neighbor_linear -= (previous_neighs + distance_from_halo * halo_max_neighs) ;
+							element_vector[ dimension ] = distance_from_halo + halo;
+							neighbor_linear -= ( previous_neighs + distance_from_halo * halo_max_neighs );
 #ifdef _DEBUG
 							std::cout << "end neighbour " << neighbor_linear << std::endl;
 #endif
@@ -546,18 +541,17 @@ namespace grb {
 #endif
 						// ( dimension_size - 1 ) because coordinates are 0-based and neighbor
 						// is "inside" range [ previous_neighs, previous_neighs + halo_max_neighs ]
-						element_vector[dimension] = dimension_size - 1 - h;
+						element_vector[ dimension ] = dimension_size - 1 - h;
 #ifdef _DEBUG
 						std::cout << "end neighbour " << neighbor_linear << std::endl;
 #endif
 					}
 					return neighbor_linear;
 				}
-
 			};
 
 		} // namespace multigrid
-	} // namespace utils
+	}     // namespace utils
 } // namespace grb
 
 #endif // _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_SYSTEM
diff --git a/include/graphblas/utils/multigrid/linearized_ndim_iterator.hpp b/include/graphblas/utils/multigrid/linearized_ndim_iterator.hpp
index 199d08926..9b0e61a8a 100644
--- a/include/graphblas/utils/multigrid/linearized_ndim_iterator.hpp
+++ b/include/graphblas/utils/multigrid/linearized_ndim_iterator.hpp
@@ -24,12 +24,11 @@
 #ifndef _H_GRB_ALGORITHMS_MULTIGRID_NDIM_ITERATOR
 #define _H_GRB_ALGORITHMS_MULTIGRID_NDIM_ITERATOR
 
-#include <cstddef>
 #include <algorithm>
+#include <cstddef>
+#include <limits>
 #include <stdexcept>
 #include <type_traits>
-#include <limits>
-#include <cstddef>
 
 #include <graphblas/utils/iterators/utils.hpp>
 
@@ -63,7 +62,7 @@ namespace grb {
 			public:
 				using VectorType = InternalVectorType;
 				using LinNDimSysType = LinearizedNDimSystem< SizeType, VectorType >;
-				using ConstVectorReference = const VectorType&;
+				using ConstVectorReference = const VectorType &;
 				using SelfType = LinearizedNDimIterator< SizeType, InternalVectorType >;
 
 				/**
@@ -73,7 +72,7 @@ namespace grb {
 				 */
 				struct NDimPoint {
 				private:
-					const LinNDimSysType* system; // pointer because of copy assignment
+					const LinNDimSysType * system; // pointer because of copy assignment
 					VectorType coords;
 
 				public:
@@ -81,18 +80,15 @@ namespace grb {
 
 					NDimPoint() = delete;
 
-					NDimPoint( const NDimPoint& ) = default;
+					NDimPoint( const NDimPoint & ) = default;
 
-					NDimPoint( NDimPoint&& ) = delete;
+					NDimPoint( NDimPoint && ) = delete;
 
-					NDimPoint( const LinNDimSysType& _system ) noexcept :
-						system( &_system ),
-						coords( _system.dimensions() )
-					{
+					NDimPoint( const LinNDimSysType & _system ) noexcept : system( &_system ), coords( _system.dimensions() ) {
 						std::fill_n( this->coords.begin(), _system.dimensions(), 0 );
 					}
 
-					NDimPoint& operator=( const NDimPoint& ) = default;
+					NDimPoint & operator=( const NDimPoint & ) = default;
 
 					inline ConstVectorReference get_position() const {
 						return coords;
@@ -106,8 +102,8 @@ namespace grb {
 				// interface for std::random_access_iterator
 				using iterator_category = std::random_access_iterator_tag;
 				using value_type = NDimPoint;
-				using pointer = const value_type*;
-				using reference = const value_type&;
+				using pointer = const value_type *;
+				using reference = const value_type &;
 				using difference_type = signed long;
 
 				/**
@@ -118,9 +114,7 @@ namespace grb {
 				 * If \p _system is not a valid object anymore, all iterators created from it are also
 				 * not valid.
 				 */
-				LinearizedNDimIterator( const LinNDimSysType &_system ) noexcept :
-					_p( _system )
-				{}
+				LinearizedNDimIterator( const LinNDimSysType & _system ) noexcept : _p( _system ) {}
 
 				/**
 				 * Construct a new LinearizedNDimIterator object from the original LinNDimSysType
@@ -132,7 +126,8 @@ namespace grb {
 				 * not valid.
 				 */
 				template< typename IterT > LinearizedNDimIterator(
-					const LinNDimSysType &_system, IterT begin
+					const LinNDimSysType & _system,
+					IterT begin
 				) noexcept :
 					_p( _system )
 				{
@@ -141,10 +136,9 @@ namespace grb {
 
 				LinearizedNDimIterator() = delete;
 
-				LinearizedNDimIterator( const SelfType &original ):
-					_p( original._p ) {}
+				LinearizedNDimIterator( const SelfType & original ) : _p( original._p ) {}
 
-				SelfType& operator=( const SelfType &original ) = default;
+				SelfType & operator=( const SelfType & original ) = default;
 
 				~LinearizedNDimIterator() {}
 
@@ -156,7 +150,7 @@ namespace grb {
 					bool rewind = true;
 					// rewind only the first N-1 coordinates
 					for( size_t i = 0; i < this->_p.system->dimensions() - 1 && rewind; i++ ) {
-						SizeType& coord = this->_p.coords[ i ];
+						SizeType & coord = this->_p.coords[ i ];
 						// must rewind dimension if we wrap-around
 						SizeType plus = coord + 1;
 						rewind = plus >= this->_p.system->get_sizes()[ i ];
@@ -180,7 +174,7 @@ namespace grb {
 				SelfType & operator+=( size_t offset ) {
 					size_t linear = _p.get_linear_position() + offset;
 					if( linear > _p.system->system_size() ) {
-						throw std::invalid_argument("increment is too large");
+						throw std::invalid_argument( "increment is too large" );
 					}
 					if( offset == 1 ) {
 						return operator++();
@@ -194,10 +188,9 @@ namespace grb {
 				 *
 				 * It throws if the result cannot be stored as a difference_type variable.
 				 */
-				difference_type operator-( const SelfType &other ) const {
+				difference_type operator-( const SelfType & other ) const {
 					return grb::utils::compute_signed_distance< difference_type, SizeType >(
 						_p.get_linear_position(), other._p.get_linear_position() );
-
 				}
 
 				reference operator*() const {
@@ -208,16 +201,16 @@ namespace grb {
 					return &( this->_p );
 				}
 
-				bool operator!=( const SelfType &o ) const {
+				bool operator!=( const SelfType & o ) const {
 					const size_t dims = this->_p.system->dimensions();
 					if( dims != o._p.system->dimensions() ) {
-						throw std::invalid_argument("system sizes do not match");
+						throw std::invalid_argument( "system sizes do not match" );
 					}
 					bool equal = true;
-					for( size_t i =0; i < dims && equal; i++) {
-						equal &= ( this->_p.coords[i] == o._p.coords[i] );
+					for( size_t i = 0; i < dims && equal; i++ ) {
+						equal &= ( this->_p.coords[ i ] == o._p.coords[ i ] );
 					}
-					return !equal;
+					return ! equal;
 				}
 
 				/**
@@ -225,7 +218,7 @@ namespace grb {
 				 *
 				 * Its implementation depending on the logic in operator++.
 				 */
-				static SelfType make_system_end_iterator( const LinNDimSysType &_system ) {
+				static SelfType make_system_end_iterator( const LinNDimSysType & _system ) {
 					// fill with 0s
 					SelfType iter( _system );
 					size_t last = iter->system->dimensions() - 1;
@@ -239,7 +232,7 @@ namespace grb {
 			};
 
 		} // namespace multigrid
-	} // namespace utils
+	}     // namespace utils
 } // namespace grb
 
 #endif // _H_GRB_ALGORITHMS_MULTIGRID_NDIM_ITERATOR
diff --git a/include/graphblas/utils/multigrid/linearized_ndim_system.hpp b/include/graphblas/utils/multigrid/linearized_ndim_system.hpp
index 7b3c94341..a02a0c631 100644
--- a/include/graphblas/utils/multigrid/linearized_ndim_system.hpp
+++ b/include/graphblas/utils/multigrid/linearized_ndim_system.hpp
@@ -24,17 +24,16 @@
 #ifndef _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM_LINEARIZER
 #define _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM_LINEARIZER
 
-#include <cstddef>
 #include <algorithm>
-#include <vector>
-#include <utility>
-#include <stdexcept>
 #include <cassert>
-#include <string>
 #include <cstddef>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
 
-#include "ndim_system.hpp"
 #include "linearized_ndim_iterator.hpp"
+#include "ndim_system.hpp"
 
 namespace grb {
 	namespace utils {
@@ -60,9 +59,9 @@ namespace grb {
 			template<
 				typename SizeType,
 				typename InternalVectorType
-			> class LinearizedNDimSystem: public NDimSystem< SizeType, InternalVectorType > {
+			> class LinearizedNDimSystem : public NDimSystem< SizeType, InternalVectorType > {
 			public:
-				static_assert( std::is_integral< SizeType >::value, "SizeType must be an integral type");
+				static_assert( std::is_integral< SizeType >::value, "SizeType must be an integral type" );
 
 				using BaseType = NDimSystem< SizeType, InternalVectorType >;
 				using SelfType = LinearizedNDimSystem< SizeType, InternalVectorType >;
@@ -78,32 +77,39 @@ namespace grb {
 				 * where each iterator's position stores the size along each dimension; example:
 				 * *begin is the size along dimension 0, *(++begin) is the size along dimension 1 ...
 				 */
-				template< typename IterT > LinearizedNDimSystem( IterT begin, IterT end) noexcept :
+				template< typename IterT >
+				LinearizedNDimSystem(
+					IterT begin,
+					IterT end
+				) noexcept :
 					BaseType( begin, end ),
 					_offsets( std::distance( begin, end ) )
 				{
-					this->_system_size = compute_range_product( begin, end, this->_offsets.begin() ) ;
+					this->_system_size = compute_range_product( begin, end, this->_offsets.begin() );
 				}
 
 				/**
 				 * Construct a new LinearizedNDimSystem object with dimensions \p _sizes.size()
 				 * and sizes stored in \p _sizes.
 				 */
-				LinearizedNDimSystem( const std::vector< size_t > &_sizes ) noexcept :
+				LinearizedNDimSystem( const std::vector< size_t > & _sizes ) noexcept :
 					LinearizedNDimSystem( _sizes.cbegin(), _sizes.cend() ) {}
 
 				/**
 				 * Construct a new LinearizedNDimSystem object with \p _dimensions dimensions
 				 * and sizes all equal to \p max_value.
 				 */
-				LinearizedNDimSystem( size_t _dimensions, size_t _size ) noexcept :
+				LinearizedNDimSystem(
+					size_t _dimensions,
+					size_t _size
+				) noexcept :
 					BaseType( _dimensions, _size ),
 					_offsets( _dimensions ),
 					_system_size( _dimensions )
 				{
 					SizeType v = 1;
-					for( size_t i =0; i < _dimensions; i++ ) {
-						this->_offsets[i] = v;
+					for( size_t i = 0; i < _dimensions; i++ ) {
+						this->_offsets[ i ] = v;
 						v *= _size;
 					}
 					this->_system_size = v;
@@ -111,19 +117,21 @@ namespace grb {
 
 				LinearizedNDimSystem() = delete;
 
-				LinearizedNDimSystem( const SelfType &original ) = default;
+				LinearizedNDimSystem( const SelfType & original ) = default;
 
-				LinearizedNDimSystem( SelfType &&original ) noexcept:
-					BaseType( std::move(original) ), _offsets( std::move( original._offsets ) ),
-					_system_size( original._system_size ) {
-						original._system_size = 0;
+				LinearizedNDimSystem( SelfType && original ) noexcept :
+					BaseType( std::move( original ) ),
+					_offsets( std::move( original._offsets ) ),
+					_system_size( original._system_size )
+				{
+					original._system_size = 0;
 				}
 
 				~LinearizedNDimSystem() {}
 
-				SelfType& operator=( const SelfType & ) = default;
+				SelfType & operator=( const SelfType & ) = default;
 
-				SelfType& operator=( SelfType &&original ) = delete;
+				SelfType & operator=( SelfType && original ) = delete;
 
 				/**
 				 * Computes the size of the system, i.e. its number of elements;
@@ -147,15 +155,18 @@ namespace grb {
 				 * @param[in] linear linear index
 				 * @param[out] output output vector \p linear corresponds to
 				 */
-				void linear_to_ndim( size_t linear, VectorReference output ) const {
+				void linear_to_ndim(
+					size_t linear,
+					VectorReference output
+				) const {
 					if( linear > this->_system_size ) {
 						throw std::range_error( "linear value beyond system" );
 					}
 					for( size_t _i = this->_offsets.dimensions(); _i > 0; _i-- ) {
 						const size_t dim = _i - 1;
-						const size_t coord = linear / this->_offsets[dim];
-						output[dim] = coord;
-						linear -= ( coord * this->_offsets[dim] );
+						const size_t coord = linear / this->_offsets[ dim ];
+						output[ dim ] = coord;
+						linear -= ( coord * this->_offsets[ dim ] );
 					}
 					assert( linear == 0 );
 				}
@@ -165,7 +176,7 @@ namespace grb {
 				 * a const reference to \p InternalVectorType and checks whether each value in the input
 				 * vector \p ndim_vector is within the system sizes (otherwise it throws).
 				 */
-				size_t ndim_to_linear_check( ConstVectorReference ndim_vector) const {
+				size_t ndim_to_linear_check( ConstVectorReference ndim_vector ) const {
 					return this->ndim_to_linear_check( ndim_vector.storage() );
 				}
 
@@ -178,7 +189,7 @@ namespace grb {
 				size_t ndim_to_linear_check( ConstVectorStorageType ndim_vector ) const {
 					size_t linear = 0;
 					for( size_t i = 0; i < this->dimensions(); i++ ) {
-						if( ndim_vector[i] >= this->get_sizes()[i] ) {
+						if( ndim_vector[ i ] >= this->get_sizes()[ i ] ) {
 							throw std::invalid_argument( "input vector beyond system sizes" );
 						}
 					}
@@ -190,7 +201,7 @@ namespace grb {
 				 * a const reference to \p InternalVectorType but does not check whether each value in the input
 				 * vector \p ndim_vector is within the system sizes.
 				 */
-				size_t ndim_to_linear( ConstVectorReference ndim_vector) const {
+				size_t ndim_to_linear( ConstVectorReference ndim_vector ) const {
 					return this->ndim_to_linear( ndim_vector.storage() );
 				}
 
@@ -202,7 +213,7 @@ namespace grb {
 				size_t ndim_to_linear( ConstVectorStorageType ndim_vector ) const {
 					size_t linear = 0;
 					for( size_t i = 0; i < this->dimensions(); i++ ) {
-						linear += this->_offsets[i] * ndim_vector[i];
+						linear += this->_offsets[ i ] * ndim_vector[ i ];
 					}
 					return linear;
 				}
@@ -215,12 +226,13 @@ namespace grb {
 				 */
 				void retarget( ConstVectorReference _new_sizes ) {
 					if( _new_sizes.dimensions() != this->_sizes.dimensions() ) {
-						throw std::invalid_argument("new system must have same dimensions as previous: new "
-							+ std::to_string( _new_sizes.dimensions() ) + ", old "
-							+ std::to_string( this->_sizes.dimensions() ) );
+						throw std::invalid_argument(
+							"new system must have same dimensions as previous: new " + std::to_string( _new_sizes.dimensions() )
+								+ ", old " + std::to_string( this->_sizes.dimensions() ) );
 					}
 					this->_sizes = _new_sizes; // copy
-					this->_system_size = compute_range_product( _new_sizes.begin(), _new_sizes.end(), this->_offsets.begin() ) ;
+					this->_system_size = compute_range_product( _new_sizes.begin(), _new_sizes.end(),
+						this->_offsets.begin() );
 				}
 
 				/**
@@ -254,7 +266,11 @@ namespace grb {
 				template<
 					typename IterIn,
 					typename IterOut
-				> static size_t compute_range_product( IterIn in_begin, IterIn in_end, IterOut out_begin ) {
+				> static size_t compute_range_product(
+					IterIn in_begin,
+					IterIn in_end,
+					IterOut out_begin
+				) {
 					size_t prod = 1;
 					for( ; in_begin != in_end; ++in_begin, ++out_begin ) {
 						*out_begin = prod;
@@ -265,7 +281,7 @@ namespace grb {
 			};
 
 		} // namespace multigrid
-	} // namespace utils
+	}     // namespace utils
 } // namespace grb
 
 #endif // _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM_LINEARIZER
diff --git a/include/graphblas/utils/multigrid/ndim_system.hpp b/include/graphblas/utils/multigrid/ndim_system.hpp
index f184a7042..5df62ace2 100644
--- a/include/graphblas/utils/multigrid/ndim_system.hpp
+++ b/include/graphblas/utils/multigrid/ndim_system.hpp
@@ -24,11 +24,10 @@
 #ifndef _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM
 #define _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM
 
-#include <cstddef>
 #include <algorithm>
-#include <vector>
-#include <type_traits>
 #include <cstddef>
+#include <type_traits>
+#include <vector>
 
 namespace grb {
 	namespace utils {
@@ -48,11 +47,11 @@ namespace grb {
 				typename InternalVectorType
 			> class NDimSystem {
 			public:
-				static_assert( std::is_integral< SizeType >::value, "SizeType must be an integral type");
+				static_assert( std::is_integral< SizeType >::value, "SizeType must be an integral type" );
 
 				using VectorType = InternalVectorType;
-				using VectorReference = VectorType&;
-				using ConstVectorReference = const VectorType&;
+				using VectorReference = VectorType &;
+				using ConstVectorReference = const VectorType &;
 				using SelfType = NDimSystem< SizeType, InternalVectorType >;
 
 				/**
@@ -66,9 +65,8 @@ namespace grb {
 				 * @param begin range begin
 				 * @param end end of range
 				 */
-				template< typename IterType > NDimSystem( IterType begin, IterType end) noexcept :
-					_sizes( std::distance( begin, end ) )
-				{
+				template< typename IterType >
+				NDimSystem( IterType begin, IterType end ) noexcept : _sizes( std::distance( begin, end ) ) {
 					std::copy( begin, end, this->_sizes.begin() );
 				}
 
@@ -76,16 +74,14 @@ namespace grb {
 				 * Construct a new NDimSystem object from an std::vector<>, taking its values
 				 * as system sizes and its length as number of dimensions.
 				 */
-				NDimSystem( const std::vector< size_t > &_sizes ) noexcept :
+				NDimSystem( const std::vector< size_t > & _sizes ) noexcept :
 					SelfType( _sizes.cbegin(), _sizes.cend() ) {}
 
 				/**
 				 * Construct a new NDimSystem object of dimensions \p dimensions
 				 *  and with all sizes initialized to \p max_size
 				 */
-				NDimSystem( size_t _dimensions, size_t max_size ) noexcept :
-					_sizes( _dimensions )
-				{
+				NDimSystem( size_t _dimensions, size_t max_size ) noexcept : _sizes( _dimensions ) {
 					std::fill_n( this->_sizes.begin(), _dimensions, max_size );
 				}
 
@@ -95,9 +91,9 @@ namespace grb {
 
 				NDimSystem( SelfType && ) = delete;
 
-				SelfType & operator=( const SelfType &original ) = default;
+				SelfType & operator=( const SelfType & original ) = default;
 
-				SelfType & operator=( SelfType &&original ) = delete;
+				SelfType & operator=( SelfType && original ) = delete;
 
 				inline size_t dimensions() const noexcept {
 					return _sizes.dimensions();
@@ -116,7 +112,7 @@ namespace grb {
 			};
 
 		} // namespace multigrid
-	} // namespace utils
+	}     // namespace utils
 } // namespace grb
 
 #endif // _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM
diff --git a/include/graphblas/utils/multigrid/ndim_vector.hpp b/include/graphblas/utils/multigrid/ndim_vector.hpp
index 7992f23f6..5a3ef4144 100644
--- a/include/graphblas/utils/multigrid/ndim_vector.hpp
+++ b/include/graphblas/utils/multigrid/ndim_vector.hpp
@@ -24,11 +24,11 @@
 #ifndef _H_GRB_ALGORITHMS_MULTIGRID_NDIM_VECTOR
 #define _H_GRB_ALGORITHMS_MULTIGRID_NDIM_VECTOR
 
+#include <algorithm>
+#include <cstddef>
+#include <type_traits>
 #include <utility>
 #include <vector>
-#include <type_traits>
-#include <cstddef>
-#include <algorithm>
 
 #include "linearized_ndim_system.hpp"
 
@@ -64,8 +64,8 @@ namespace grb {
 					"the stored type is not default constructible" );
 				static_assert( std::is_integral< SizeType >::value, "SizeType must be integral" );
 
-				using ConstDomainVectorReference =
-					typename LinearizedNDimSystem< SizeType, InternalVectorType >::ConstVectorReference;
+				using ConstDomainVectorReference = typename LinearizedNDimSystem< SizeType,
+					InternalVectorType >::ConstVectorReference;
 				using ConstDomainVectorStorageType = typename InternalVectorType::ConstVectorStorageType;
 				using DomainIterator = typename LinearizedNDimSystem< SizeType, InternalVectorType >::Iterator;
 				using Selftype = NDimVector< DataType, SizeType, InternalVectorType >;
@@ -77,9 +77,10 @@ namespace grb {
 				 * and number of dimensions equal to the range distance; the data values are
 				 * \b not initialized.
 				 */
-				template< typename IterT > NDimVector( IterT begin, IterT end) :
-					_linearizer( begin, end )
-				{
+				template< typename IterT > NDimVector(
+					IterT begin,
+					IterT end
+				) : _linearizer( begin, end ) {
 					this->data = new DataType[ _linearizer.system_size() ];
 				}
 
@@ -88,26 +89,25 @@ namespace grb {
 				 * and number of dimensions equal to \p _sizes.size(); the data values are
 				 * \b not initialized.
 				 */
-				NDimVector( const std::vector< size_t > &_sizes ) :
+				NDimVector( const std::vector< size_t > & _sizes ) :
 					NDimVector( _sizes.cbegin(), _sizes.cend() ) {}
 
-				NDimVector( const Selftype& original ):
+				NDimVector( const Selftype & original ) :
 					_linearizer( original._linearizer ),
-				    data( new DataType[ original.data_size() ] )
+					data( new DataType[ original.data_size() ] )
 				{
 					std::copy_n( original.data, original.data_size(), this->data );
 				}
 
-				NDimVector( Selftype&& original ) noexcept:
-					_linearizer( std::move( original._linearizer ) )
-				{
+				NDimVector( Selftype && original ) noexcept :
+					_linearizer( std::move( original._linearizer ) ) {
 					this->data = original.data;
 					original.data = nullptr;
 				}
 
-				Selftype& operator=( const Selftype &original ) = delete;
+				Selftype & operator=( const Selftype & original ) = delete;
 
-				Selftype& operator=( Selftype &&original ) = delete;
+				Selftype & operator=( Selftype && original ) = delete;
 
 				~NDimVector() {
 					this->clean_mem();
@@ -131,7 +131,7 @@ namespace grb {
 				 * Access the data element at N-dimension coordinate given by the iterable
 				 * \p coordinates.
 				 */
-				inline DataType& at( ConstDomainVectorReference coordinates ) {
+				inline DataType & at( ConstDomainVectorReference coordinates ) {
 					return this->data[ this->get_coordinate( coordinates.storage() ) ];
 				}
 
@@ -139,7 +139,7 @@ namespace grb {
 				 * Const-access the data element at N-dimension coordinate given by the iterable
 				 * \p coordinates.
 				 */
-				inline const DataType& at( ConstDomainVectorReference coordinates ) const {
+				inline const DataType & at( ConstDomainVectorReference coordinates ) const {
 					return this->data[ this->get_coordinate( coordinates.storage() ) ];
 				}
 
@@ -147,7 +147,7 @@ namespace grb {
 				 * Access the data element at N-dimension coordinate given by the vector
 				 * storage object \p coordinates.
 				 */
-				inline DataType& at( ConstDomainVectorStorageType coordinates ) {
+				inline DataType & at( ConstDomainVectorStorageType coordinates ) {
 					return this->data[ this->get_coordinate( coordinates ) ];
 				}
 
@@ -155,7 +155,7 @@ namespace grb {
 				 * Const-access the data element at N-dimension coordinate given by the vector
 				 * storage object \p coordinates.
 				 */
-				inline const DataType& at( ConstDomainVectorStorageType coordinates ) const {
+				inline const DataType & at( ConstDomainVectorStorageType coordinates ) const {
 					return this->data[ this->get_coordinate( coordinates ) ];
 				}
 
@@ -177,7 +177,7 @@ namespace grb {
 
 			private:
 				const LinearizedNDimSystem< SizeType, InternalVectorType > _linearizer;
-				DataType* data;
+				DataType * data;
 
 				inline size_t get_coordinate( ConstDomainVectorStorageType coordinates ) const {
 					return this->_linearizer.ndim_to_linear( coordinates );
@@ -188,14 +188,14 @@ namespace grb {
 				}
 
 				void clean_mem() {
-					if ( this->data == nullptr ) {
+					if( this->data != nullptr ) { // release storage unless it was moved away (data reset to nullptr)
 						delete[] this->data;
 					}
 				}
 			};
 
 		} // namespace multigrid
-	} // namespace utils
+	}     // namespace utils
 } // namespace grb
 
 #endif // _H_GRB_ALGORITHMS_MULTIGRID_NDIM_VECTOR
diff --git a/include/graphblas/utils/telemetry/CSVWriter.hpp b/include/graphblas/utils/telemetry/CSVWriter.hpp
index 94a7111b6..d92d5efd1 100644
--- a/include/graphblas/utils/telemetry/CSVWriter.hpp
+++ b/include/graphblas/utils/telemetry/CSVWriter.hpp
@@ -15,9 +15,11 @@
  * limitations under the License.
  */
 
-/*
- * @author Alberto Scolari
- * @date 14th February, 2023
+/**
+ * @file CSVWriter.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ *
+ * Definition for the CSVWriter class.
  */
 
 #ifndef _H_GRB_UTILS_TELEMETRY_CSV_WRITER
@@ -39,87 +41,54 @@ namespace grb {
 	namespace utils {
 		namespace telemetry {
 
+			/// standard CSV separator
 			static constexpr char STD_CSV_SEP = ',';
 
-			template< typename TelControllerType, bool enabled, class T1, class... Ts >
-			class CSVWriter : public TelemetryBase< TelControllerType, enabled > {
-			public:
-				template< class U, class... Us >
-				struct is_csv_printable {
-					static constexpr bool value = std::is_arithmetic< U >::value;
-				};
-
-				template< class U1, class U2, class... Us >
-				struct is_csv_printable< U1, U2, Us... > {
-					static constexpr bool value = is_csv_printable< U1 >::value && is_csv_printable< U2, Us... >::value;
-				};
-
-				static_assert( is_csv_printable< T1, Ts... >::value, "not all types are printable" );
-
-				using self_t = CSVWriter< TelControllerType, enabled, T1, Ts... >;
-
-				using base_t = TelemetryBase< TelControllerType, enabled >;
-
-				CSVWriter() = delete;
-
-				CSVWriter( const TelControllerType & tt, std::initializer_list< const char * > _headers, char _separator, size_t size ) : base_t( tt ) {
-					(void)tt;
-					(void)_headers;
-					(void)_separator;
-					(void)size;
-				}
-
-				CSVWriter( const TelControllerType & tt, std::initializer_list< const char * > _headers ) : CSVWriter( tt, _headers, STD_CSV_SEP, 10 ) {}
-
-				CSVWriter( const self_t & ) = delete;
-
-				CSVWriter( self_t && ) = delete;
-
-				self_t & operator=( const self_t & ) = delete;
-
-				self_t & operator=( self_t && ) = delete;
-
-				template< class... UTypes >
-				void add_line( UTypes &&... ) {}
-
-				void clear() {}
-
-				std::ostream & write_last_line_to_stream( std::ostream & stream ) const {
-					return stream;
-				}
-
-				// print nothing
-				char last_line() const {
-					return '\0';
-				}
-
-				std::ostream & write_to_stream( std::ostream & stream ) const {
-					return stream;
-				}
+			template< class U, class... Us >
+			struct __is_csv_printable {
+				static constexpr bool value = std::is_arithmetic< U >::value;
+			};
 
-				void write_to_file( const char * name ) const {
-					(void)name;
-				}
+			template< class U1, class U2, class... Us >
+			struct __is_csv_printable< U1, U2, Us... > {
+				static constexpr bool value = __is_csv_printable< U1 >::value
+					&& __is_csv_printable< U2, Us... >::value;
 			};
 
-			template< typename TelControllerType, class T1, class... Ts >
-			class CSVWriter< TelControllerType, true, T1, Ts... > : public TelemetryBase< TelControllerType, true > {
+			/**
+			 * Class to store numerical information in form of lines and emit it as a CSV, with
+			 * heading, field separator and newlines.
+			 *
+			 * The user should add one line at a time via #add_line( UTypes && ) and can
+			 * then output it to an \a std::ostream or a file, together with the
+			 * heading specified at construction. The output is a fully compliant CSV file
+			 * that can be read by common tools like spreadsheets and parsers (e.g. Pandas,
+			 * https://pandas.pydata.org/). This class allows easily emitting telemetry
+			 * information and importing it into advanced tools for thorough analysis.
+			 *
+			 * This implementation assumes telemetry is enabled, since a specialization for
+			 * disabled telemetry follows.
+			 * It internally allocates memory dynamically to store the lines.
+			 * Only numerical information can be stored.
+			 *
+			 * @tparam TelControllerType type for the telemetry controller
+			 * @tparam enabled whether telemetry is enabled
+			 * @tparam T1 numerical type of the first field to store (at least one is required)
+			 * @tparam Ts numerical types of the following fields to store
+			 */
+			template<
+				typename TelControllerType,
+				bool enabled,
+				class T1,
+				class... Ts
+			> class CSVWriter :
+				public TelemetryBase< TelControllerType, enabled > {
 			public:
-				template< class U, class... Us >
-				struct is_csv_printable {
-					static constexpr bool value = std::is_arithmetic< U >::value;
-				};
-
-				template< class U1, class U2, class... Us >
-				struct is_csv_printable< U1, U2, Us... > {
-					static constexpr bool value = is_csv_printable< U1 >::value && is_csv_printable< U2, Us... >::value;
-				};
+				static_assert( __is_csv_printable< T1, Ts... >::value, "not all types are printable" );
 
-				static_assert( is_csv_printable< T1, Ts... >::value, "not all types are printable" );
-
-				using self_t = CSVWriter< TelControllerType, true, T1, Ts... >;
+				using self_t = CSVWriter< TelControllerType, enabled, T1, Ts... >;
 
-				using base_t = TelemetryBase< TelControllerType, true >;
+				using base_t = TelemetryBase< TelControllerType, enabled >;
 
 				class CSVLastTuple {
 				public:
@@ -137,7 +106,23 @@ namespace grb {
 
 				CSVWriter() = delete;
 
-				CSVWriter( const TelControllerType & tt, std::initializer_list< const char * > _headers, char _separator, size_t size ) : base_t( tt ), separator( _separator ) {
+				/**
+				 * Full constructor for a CSVWriter.
+				 *
+				 * @param tt telemetry controller
+				 * @param _headers CSV headers, whose number must match the number of T types to print
+				 * @param _separator field separator for printing
+				 * @param size hint size for initial memory allocation (dynamic allocation may occur anyway)
+				 */
+				CSVWriter(
+					const TelControllerType & tt,
+					std::initializer_list< const char * > _headers,
+					char _separator,
+					size_t size
+				) :
+					base_t( tt ),
+					separator( _separator )
+				{
 					if( _headers.size() != NUM_FIELDS ) {
 						throw std::runtime_error( "wrong number of headers, it must match the unmber of line elements" );
 					}
@@ -154,7 +139,14 @@ namespace grb {
 					// std::memset( reinterpret_cast< void * >( lines.data() ), 0, lines.size() * sizeof( tuple_t ) );
 				}
 
-				CSVWriter( const TelControllerType & tt, std::initializer_list< const char * > _headers ) : CSVWriter( tt, _headers, STD_CSV_SEP, 10 ) {}
+				/**
+				 * Construct a new CSVWriter object assuming a comma separator and an initial
+				 * amount of lines to store.
+				 */
+				CSVWriter(
+					const TelControllerType & tt,
+					std::initializer_list< const char * > _headers
+				) : CSVWriter( tt, _headers, STD_CSV_SEP, 10 ) {}
 
 				CSVWriter( const self_t & ) = delete;
 
@@ -164,6 +156,12 @@ namespace grb {
 
 				self_t & operator=( self_t && ) = delete;
 
+				/**
+				 * Add a line to the CSV, i.e., store the numerical information internally.
+				 *
+				 * @tparam UTypes information types whose number must match the number of fields in the CSV;
+				 * 	these types must also be implicitly convertible to the corresponding T1, Ts... types
+				 */
 				template< class... UTypes >
 				void add_line( UTypes &&... vs ) {
 					if( this->is_active() ) {
@@ -171,10 +169,22 @@ namespace grb {
 					}
 				}
 
+				/**
+				 * Remove all lines from the CSV.
+				 */
 				void clear() {
 					lines.clear();
 				}
 
+				/**
+				 * Emit the last line of the CSV into \p stream as actual text, i.e. with the fields separated.
+				 * Does not print the newline.
+				 *
+				 * If there is no line stored or telemetry is not active, nothing is written.
+				 *
+				 * @param stream stream to write into
+				 * @return std::ostream& \p stream itself
+				 */
 				std::ostream & write_last_line_to_stream( std::ostream & stream ) const {
 					if( lines.size() > 0 && this->is_active() ) {
 						write_line( stream, lines.back() );
@@ -182,6 +192,12 @@ namespace grb {
 					return stream;
 				}
 
+				/**
+				 * Returns an object that can be streamed into an std::cout stream via the \a << operator
+				 * in order to print the last line stored.
+				 *
+				 * If there is no line stored, an std::runtime_error is thrown.
+				 */
 				CSVLastTuple last_line() const {
 					if( lines.size() == 0 ) {
 						throw std::runtime_error( "no measures" );
@@ -189,6 +205,9 @@ namespace grb {
 					return CSVLastTuple( *this );
 				}
 
+				/**
+				 * Write the entire CSV into \p stream, with heading (heading, separated fields with newline).
+				 */
 				std::ostream & write_to_stream( std::ostream & stream ) const {
 					if( ! this->is_active() ) {
 						return stream;
@@ -202,6 +221,10 @@ namespace grb {
 					return stream;
 				}
 
+				/**
+				 * Creates a new file named \p name (or overwrites an existing one) and stores the entire CSV
+				 * into it.
+				 */
 				void write_to_file( const char * name ) const {
 					if( ! this->is_active() ) {
 						return;
@@ -252,6 +275,75 @@ namespace grb {
 				}
 			};
 
+			/**
+			 * Template specialization that assumes disabled telemetry: no state is kept,
+			 * operations produce no result when invoked (no output into streams, no file creation).
+			 *
+			 * @tparam TelControllerType
+			 * @tparam T1
+			 * @tparam Ts
+			 */
+			template<
+				typename TelControllerType,
+				class T1,
+				class... Ts
+			> class CSVWriter< TelControllerType, false, T1, Ts... > :
+				public TelemetryBase< TelControllerType, false > {
+			public:
+				static_assert( __is_csv_printable< T1, Ts... >::value, "not all types are printable" );
+
+				using self_t = CSVWriter< TelControllerType, false, T1, Ts... >;
+
+				using base_t = TelemetryBase< TelControllerType, false >;
+
+				CSVWriter() = delete;
+
+				CSVWriter(
+					const TelControllerType & tt,
+					std::initializer_list< const char * >,
+					char,
+					size_t
+				) : base_t( tt ) {}
+
+				CSVWriter(
+					const TelControllerType & tt,
+					std::initializer_list< const char * > _headers
+				) : CSVWriter( tt, _headers, STD_CSV_SEP, 10 ) {}
+
+				CSVWriter( const self_t & ) = delete;
+
+				CSVWriter( self_t && ) = delete;
+
+				self_t & operator=( const self_t & ) = delete;
+
+				self_t & operator=( self_t && ) = delete;
+
+				template< class... UTypes > void add_line( UTypes &&... ) {
+					static_assert( sizeof...( UTypes ) == sizeof...( Ts ) + 1 );
+				}
+
+				void clear() {}
+
+				std::ostream & write_last_line_to_stream( std::ostream & stream ) const {
+					return stream;
+				}
+
+				char last_line() const {
+					return '\0';
+				}
+
+				std::ostream & write_to_stream( std::ostream & stream ) const {
+					return stream;
+				}
+
+				void write_to_file( const char * name ) const {
+					(void)name;
+				}
+			};
+
+			/**
+			 * Implementation of CSVWriter for enabled telemetry, with implemented operations.
+			 */
 			template< class T1, class... Ts >
 			using StaticCSVWriter = CSVWriter< TelemetryControllerAlwaysOn, true, T1, Ts... >;
 
diff --git a/include/graphblas/utils/telemetry/OutputStream.hpp b/include/graphblas/utils/telemetry/OutputStream.hpp
index 8ec0606d7..3d7c9fb1b 100644
--- a/include/graphblas/utils/telemetry/OutputStream.hpp
+++ b/include/graphblas/utils/telemetry/OutputStream.hpp
@@ -15,9 +15,11 @@
  * limitations under the License.
  */
 
-/*
- * @author Alberto Scolari
- * @date 14th February, 2023
+/**
+ * @file OutputStream.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ *
+ * Definition for the OutputStream class.
  */
 
 #ifndef _H_GRB_UTILS_TELEMETRY_OUTPUT_STREAM
@@ -34,7 +36,12 @@ namespace grb {
 	namespace utils {
 		namespace telemetry {
 
+			/**
+			 * SFINAE-based class to check whether the type \p T can be input to an std::ostream
+			 * via the \a << operator.
+			 */
 			template< typename T > struct is_ostream_input {
+			private:
 
 				template< typename U > static constexpr bool is_input(
 					typename std::enable_if< std::is_same<
@@ -49,13 +56,22 @@ namespace grb {
 					return false;
 				}
 
+			public:
 				static constexpr bool value = is_input< T >( nullptr );
 			};
 
-			class OutputStreamLazy {
-				constexpr char operator()() const { return '\0'; }
-			};
-
+			/**
+			 * Telemetry-controllable output stream with basic interface, based on the \a << operator.
+			 *
+			 * It accepts as input any type \a std::ostream accepts. In addition, it also accepts
+			 * the internal #OutputStreamLazy<RetType> type, which marks callable objects and allows
+			 * lazy evaluation of their result if the telemetry is active; if not, the object is
+			 * not called, avoiding runtime costs. This functionality allows paying time and memory
+			 * costs of computation only if really needed.
+			 *
+			 * @tparam TelControllerType type of the telemetry controller
+			 * @tparam enabled whether telemetry is enabled for this type
+			 */
 			template<
 				typename TelControllerType,
 				bool enabled = TelControllerType::enabled
@@ -63,64 +79,77 @@ namespace grb {
 			public:
 				using self_t = OutputStream< TelControllerType, enabled >;
 
-				OutputStream() = default;
-
-				OutputStream( const TelControllerType & _tt, std::ostream & _out ) :
-					TelemetryBase< TelControllerType, enabled >( _tt )
-				{
-					( void ) _out;
+				using base_t = TelemetryBase< TelControllerType, enabled >;
+
+				/**
+				 * Marker object to indicate that the stored callable object is to be called
+				 * in a lazy way, i.e., only if output is active.
+				 *
+				 * @tparam RetType return type of the callable object, to be printed
+				 */
+				template< typename RetType > class OutputStreamLazy {
+
+					const std::function< RetType() > f;
+
+				public:
+					static_assert( is_ostream_input< RetType >::value );
+
+					template< class F > OutputStreamLazy( F&& _f ) : f( std::forward< F >( _f ) ) {}
+
+					RetType operator()() const { return f(); }
+				};
+
+				/**
+				 * Convenience function to create an #OutputStreamLazy<RetType> object from
+				 * a callable one, inferring all template parameters automatically.
+				 *
+				 * @tparam CallableType type of the given callable object
+				 * @tparam RetType return type of the callable object, to be printed
+				 * @param f callable object
+				 * @return OutputStreamLazy< RetType > object marking lazy evaluation for printing
+				 */
+				template<
+					typename CallableType,
+					typename RetType = decltype( std::declval< CallableType >()() )
+				> static OutputStreamLazy< RetType > makeLazy( CallableType&& f ) {
+					static_assert( is_ostream_input< RetType >::value );
+					return OutputStreamLazy< RetType >( std::forward< CallableType >( f ) );
 				}
 
-				OutputStream( const self_t & _out ) = default;
-
-				OutputStream & operator=( const self_t & _out ) = delete;
-
-				template< typename T > inline typename std::enable_if<
-					is_ostream_input< T >::value,
-				self_t & >::type operator<<( T&& v ) {
-					( void ) v;
-					return *this;
-				}
-
-				inline self_t & operator<<( std::ostream& (*func)( std::ostream& ) ) {
-					( void ) func;
-					return *this;
-				}
-
-				template< class F > inline typename std::enable_if<
-					is_ostream_input< decltype( std::declval< F >()() ) >::value
-					&& std::is_base_of< OutputStreamLazy, F >::value,
-				self_t & >::type operator<<( F&& fun ) {
-					( void ) fun;
-					return *this;
-				}
-			};
-
-			template< typename TelControllerType > class OutputStream< TelControllerType, true > :
-				public TelemetryBase< TelControllerType, true > {
-			public:
-				using self_t = OutputStream< TelControllerType, true >;
-
-				using base_t = TelemetryBase< TelControllerType, true >;
-
-				OutputStream( const TelControllerType & _tt, std::ostream & _out ) :
-					TelemetryBase< TelControllerType, true >( _tt ),
+				/**
+				 * Construct a new Output Stream object from a telemetry controller \p _tt
+				 * and an output stream \p _out (usually \a std::cout)
+				 */
+				OutputStream(
+					const TelControllerType & _tt,
+					std::ostream & _out
+				) :
+					TelemetryBase< TelControllerType, enabled >( _tt ),
 					out( _out )
 				{}
 
+				/**
+				 * Copy constructor.
+				 */
 				OutputStream( const self_t & _outs ) = default;
 
 				OutputStream & operator=( const self_t & _out ) = delete;
 
-				template< typename T > inline typename std::enable_if<
-					is_ostream_input< T >::value,
-				self_t & >::type operator<<( T&& v ) {
+				/**
+				 * Stream input operator, enabled for all types std::ostream supports.
+				 */
+				template< typename T > inline typename std::enable_if< is_ostream_input< T >::value,
+					self_t & >::type operator<<( T&& v ) {
 					if ( this->is_active() ) {
 						out << std::forward< T >( v );
 					}
 					return *this;
 				}
 
+				/**
+				 * Specialization of the \a << operator for stream manipulators, to support
+				 * \a std::endl and similar manipulators.
+				 */
 				inline self_t & operator<<( std::ostream& (*func)( std::ostream& ) ) {
 					if ( this->is_active() ) {
 						out << func;
@@ -128,10 +157,24 @@ namespace grb {
 					return *this;
 				}
 
+				/**
+				 * Specialization of the \a << operator for lazy evaluation of callable objects.
+				 *
+				 * A callable object can be wrapped into an #OutputStreamLazy<F> object in order
+				 * to be called only if necessary, i.e., only if the stream \a this is active.
+				 * In this case, the internal callable object is called, its result is materialized
+				 * and sent into the stream.
+				 *
+				 * To conveniently instantiate an #OutputStreamLazy<F> to pass to this operator,
+				 * see #makeLazy(CallableType&&).
+				 *
+				 * @tparam F return type of the wrapped callable object
+				 * @param fun lazy-evaluation wrapper around the callable object
+				 * @return self_t & the stream itself
+				 */
 				template< class F > inline typename std::enable_if<
-					is_ostream_input< decltype( std::declval< F >()() ) >::value
-					&& std::is_base_of< OutputStreamLazy, F >::value,
-				self_t & >::type operator<<( F&& fun ) {
+					is_ostream_input< decltype( std::declval< OutputStreamLazy< F > >()() ) >::value,
+				self_t & >::type operator<<( const OutputStreamLazy< F >& fun ) {
 					if ( this->is_active() ) {
 						out << fun();
 					}
@@ -142,9 +185,69 @@ namespace grb {
 				std::ostream & out;
 			};
 
-			using OutputStreamOff = OutputStream< TelemetryControllerAlwaysOff, false >;
+			/**
+			 * Template specialization of OutputStream<TelControllerType,enabled>
+			 * for deactivated telemetry: no information is stored, no output produced.
+			 */
+			template<
+				typename TelControllerType
+			> class OutputStream< TelControllerType, false > :
+				public TelemetryBase< TelControllerType, false > {
+			public:
+				using self_t = OutputStream< TelControllerType, false >;
+
+
+				template< typename RetType > struct OutputStreamLazy {
+
+					static_assert( is_ostream_input< RetType >::value );
+
+					template< class F > OutputStreamLazy( F&& ) {}
 
+					constexpr char operator()() const { return '\0'; }
+				};
+
+				template<
+					typename CallableType,
+					typename RetType = decltype( std::declval< CallableType >()() )
+				> static OutputStreamLazy< RetType > makeLazy( CallableType&& f ) {
+					static_assert( is_ostream_input< RetType >::value );
+					return OutputStreamLazy< RetType >( std::forward< CallableType >( f ) );
+				}
+
+				OutputStream() = default;
+
+				OutputStream( const TelControllerType & _tt, std::ostream & ) :
+					TelemetryBase< TelControllerType, false >( _tt ) {}
+
+				OutputStream( const self_t & _out ) = default;
+
+				OutputStream & operator=( const self_t & _out ) = delete;
+
+				inline self_t & operator<<( std::ostream& (*)( std::ostream& ) ) {
+					return *this;
+				}
+
+				/**
+				 * All-capturing implementation for the input stream operator, printing nothing.
+				 *
+				 * This operator is convenient especially for debugging cases.
+				 * In case of "normal" stream types used with custom data types, the user
+				 * must extend them manually to print the custom data type. If the user uses a
+				 * deactivated stream (for example as a default template parameter to disable
+				 * logging by default), she needs not extend it for custom types in order
+				 * to make it compile, which is especially nonsensical when the output is deactivated.
+				*/
+				template< typename T > self_t & operator<<( T&& ) {
+					return *this;
+				}
+			};
+
+			/// Always active output stream, mainly for debugging purposes.
 			using OutputStreamOn = OutputStream< TelemetryControllerAlwaysOn, true >;
+
+			/// Always inactive output stream
+			using OutputStreamOff = OutputStream< TelemetryControllerAlwaysOff, false >;
+
 		}
 	}
 }
diff --git a/include/graphblas/utils/telemetry/Stopwatch.hpp b/include/graphblas/utils/telemetry/Stopwatch.hpp
index 1faa2e186..a607a3cbd 100644
--- a/include/graphblas/utils/telemetry/Stopwatch.hpp
+++ b/include/graphblas/utils/telemetry/Stopwatch.hpp
@@ -15,9 +15,11 @@
  * limitations under the License.
  */
 
-/*
- * @author Alberto Scolari
- * @date 14th February, 2023
+/**
+ * @file Stopwatch.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ *
+ * Definition for the Stopwatch class.
  */
 
 #ifndef _H_GRB_UTILS_TELEMETRY_STOPWATCH
@@ -31,49 +33,62 @@ namespace grb {
 	namespace utils {
 		namespace telemetry {
 
+			/**
+			 * Type to store time duration in nanoseconds, which is the default time granularity.
+			 */
 			using duration_nano_t = size_t;
 
+			/**
+			 * Duration as floating point type, for time granularities coarser than nanoseconds.
+			 */
 			using duration_float_t = double;
 
+			/**
+			 * Base class for Stopwatch, with common logic.
+			 */
 			class StopwatchBase {
 			public:
+
+				/**
+				 * Convert nanoseconds to microseconds, returned as floating point type duration_float_t.
+				 */
 				static inline duration_float_t nano2Micro( duration_nano_t nano ) {
 					return static_cast< duration_float_t >( nano ) / 1000UL;
 				}
 
+				/**
+				 * Convert nanoseconds to milliseconds, returned as floating point type duration_float_t.
+				 */
 				static inline duration_float_t nano2Milli( duration_nano_t nano ) {
 					return static_cast< duration_float_t >( nano ) / 1000000UL;
 				}
 
+				/**
+				 * Convert nanoseconds to seconds, returned as floating point type duration_float_t.
+				 */
 				static inline duration_float_t nano2Sec( duration_nano_t nano ) {
 					return static_cast< duration_float_t >( nano ) / 1000000000UL;
 				}
 			};
 
-			template< typename TelControllerType, bool enabled = TelControllerType::enabled >
-			class Stopwatch : public StopwatchBase, public TelemetryBase< TelControllerType, enabled > {
-			public:
-				Stopwatch( const TelControllerType & tt ) : StopwatchBase(), TelemetryBase< TelControllerType, enabled >( tt ) {}
-
-				Stopwatch( const Stopwatch & ) = default;
-
-				constexpr inline void start() {}
-
-				constexpr inline duration_nano_t stop() {
-					return static_cast< duration_nano_t >( 0 );
-				}
-
-				constexpr inline duration_nano_t reset() {
-					return static_cast< duration_nano_t >( 0 );
-				}
-
-				constexpr inline duration_nano_t getElapsedNano() const {
-					return static_cast< duration_nano_t >( 0 );
-				}
-			};
-
-			template< typename TelControllerType >
-			class Stopwatch< TelControllerType, true > : public StopwatchBase, public TelemetryBase< TelControllerType, true > {
+			/**
+			 * Class with functionalities to measure elapsed time for telemetry purposes: start, stop, reset.
+			 *
+			 * The time granularity is nanoseconds.
+			 *
+			 * Copy semantics is not available.
+			 *
+			 * This implementation assumes telemetry is enabled and the active state is controlled via
+			 * a telemetry controller of type \p TelControllerType.
+			 *
+			 * @tparam TelControllerType underlying telemetry controller type
+			 * @tparam enabled whether it is compile-time enabled
+			 */
+			template<
+				typename TelControllerType,
+				bool enabled = TelControllerType::enabled
+			> class Stopwatch :
+				public StopwatchBase, public TelemetryBase< TelControllerType, enabled > {
 
 				typedef typename std::chrono::high_resolution_clock clock_t;
 
@@ -81,21 +96,40 @@ namespace grb {
 
 				typedef typename std::chrono::high_resolution_clock::time_point time_point_t;
 
-				duration_t elapsedTime;
+				duration_t elapsedTime; ///< measured elapsed time so far, i.e., accumulated time periods between successive calls to #start() and #stop()
 
-				time_point_t beginning;
+				time_point_t beginning; ///< time instant of last call to #start()
 
 			public:
-				Stopwatch( const TelControllerType & tt ) : StopwatchBase(), TelemetryBase< TelControllerType, true >( tt ), elapsedTime( duration_t::zero() ) {}
-
-				Stopwatch( const Stopwatch & s ) = default;
-
+				/**
+				 * Construct a new Stopwatch object from a telemetry controller.
+				 *
+				 * @param tt underlying telemetry controller, to be (de)activated at runtime
+				 */
+				Stopwatch( const TelControllerType & tt ) :
+					StopwatchBase(),
+					TelemetryBase< TelControllerType, true >( tt ),
+					elapsedTime( duration_t::zero() ) {}
+
+				Stopwatch( const Stopwatch< TelControllerType, enabled > &  ) = delete;
+
+				/**
+				 * Start measuring time.
+				 *
+				 * Subsequent calls to this method "reset" the measure of elapsed time: if the user calls #start()
+				 * twice and then #stop(), the elapsed time accumulated internally after the call to #stop() is
+				 * the time elapsed from the \b second call of #start() to the call to #stop().
+				 */
 				inline void start() {
 					if( this->is_active() ) {
 						beginning = clock_t::now();
 					}
 				}
 
+				/**
+				 * Stops time measurement, returning the elapsed time since the last #start() invocation.
+				 * Elapsed time is internally accounted only if this method is invoked.
+				 */
 				inline duration_nano_t stop() {
 					duration_nano_t count = 0;
 					if( this->is_active() ) {
@@ -107,21 +141,95 @@ namespace grb {
 					return count;
 				}
 
+				/**
+				 * Returns the elapsed time, which is accounted \b only if #stop() is called.
+				 *
+				 * The value of the elapsed time is not erased, so that successive calls return
+				 * the same value.
+				 */
+				inline duration_nano_t getElapsedNano() const {
+					return static_cast< duration_nano_t >( elapsedTime.count() );
+				}
+
+				/**
+				 * To be called on a stopped watch, it returns the elapsed time and sets it to 0.
+				 */
 				inline duration_nano_t reset() {
-					duration_t r = duration_t::zero();
+					duration_nano_t r = getElapsedNano();
 					if( this->is_active() ) {
-						r = elapsedTime;
 						elapsedTime = duration_t::zero();
 					}
-					return static_cast< duration_nano_t >( r.count() );
+					return r;
 				}
 
-				inline duration_nano_t getElapsedNano() const {
-					return static_cast< duration_nano_t >( elapsedTime.count() );
+				/**
+				 * Stops the watch, sets the elapsed time to 0, starts it again
+				 * and returns the time elapsed between the previous #start()
+				 * and the #stop() internally called.
+				*/
+				inline duration_nano_t restart() {
+					stop();
+					duration_nano_t r = reset();
+					start();
+					return r;
 				}
 			};
 
-			using StaticStopwatch = Stopwatch< TelemetryControllerAlwaysOn, true >;
+			/**
+			 * Template specialization of Stopwatch<TelControllerType, enabled> for disabled telemetry:
+			 * no state is stored, all functions are inactive.
+			 */
+			template<
+				typename TelControllerType
+			> class Stopwatch< TelControllerType, false > :
+				public StopwatchBase, public TelemetryBase< TelControllerType, false > {
+			public:
+				Stopwatch( const TelControllerType & tt ) :
+					StopwatchBase(),
+					TelemetryBase< TelControllerType, false >( tt ) {}
+
+				Stopwatch( const Stopwatch< TelControllerType, false > & ) = delete;
+
+				constexpr inline void start() {}
+
+				constexpr inline duration_nano_t stop() {
+					return static_cast< duration_nano_t >( 0 );
+				}
+
+				constexpr inline duration_nano_t getElapsedNano() const {
+					return static_cast< duration_nano_t >( 0 );
+				}
+
+				constexpr inline duration_nano_t reset() {
+					return static_cast< duration_nano_t >( 0 );
+
+				}
+
+				constexpr inline duration_nano_t restart() {
+					return static_cast< duration_nano_t >( 0 );
+				}
+
+			};
+
+			/**
+			 * Always active stopwatch, requiring no telemetry controller for construction.
+			 * Mainly for debugging purposes.
+			 */
+			class ActiveStopwatch : public Stopwatch< TelemetryControllerAlwaysOn, true > {
+			public:
+
+				using base_t = Stopwatch< TelemetryControllerAlwaysOn, true >;
+
+				ActiveStopwatch():
+					base_t( tt ),
+					tt( true ) {}
+
+				ActiveStopwatch( const ActiveStopwatch & ) = delete;
+
+			private:
+				TelemetryControllerAlwaysOn tt;
+			};
+
 		} // namespace telemetry
 	}     // namespace utils
 } // namespace grb
diff --git a/include/graphblas/utils/telemetry/Telemetry.hpp b/include/graphblas/utils/telemetry/Telemetry.hpp
index 0bb35909b..3da512b82 100644
--- a/include/graphblas/utils/telemetry/Telemetry.hpp
+++ b/include/graphblas/utils/telemetry/Telemetry.hpp
@@ -15,9 +15,28 @@
  * limitations under the License.
  */
 
-/*
- * @author Alberto Scolari
- * @date 14th February, 2023
+/**
+ * @dir include/graphblas/utils/telemetry
+ * This folder contains all telemetry functionalities, i.e., those meant to measure
+ * and report code execution in detail. They are designed with three goals in mind:
+ *   -# <b>compile-time control</b>: all functionalities can be activated or deactivated
+ * 		at compile-time; if deactivated, they incur no runtime and memory cost
+ *   -# <b>fine granularity</b>: since telemetry is complex and very application-specific,
+ * 		they allow fine-grained measurement and reporting; hence, they are also meant
+ * 		to be conveniently integrated into an existing application at fine granularity
+ *   -# <b>no pre-processor cluttering</b>: multiple specializations may exist for the same functionality,
+ * 		for example to avoid memory or runtime costs if telemetry is deactivated; all
+ * 		implementations \b must compile against the same code paths, to avoid verbose
+ * 		insertion of #ifdef or similar directives on user's behalf.
+ *
+ * See the documentation of TelemetryController.hpp for some basic examples.
+ */
+
+/**
+ * @file Telemetry.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ *
+ * Convenience all-include header for all telemetry-related functionalities.
  */
 
 #ifndef _H_GRB_UTILS_TELEMETRY_TELEMETRY
diff --git a/include/graphblas/utils/telemetry/TelemetryBase.hpp b/include/graphblas/utils/telemetry/TelemetryBase.hpp
index fcb9f5105..04773591a 100644
--- a/include/graphblas/utils/telemetry/TelemetryBase.hpp
+++ b/include/graphblas/utils/telemetry/TelemetryBase.hpp
@@ -15,9 +15,11 @@
  * limitations under the License.
  */
 
-/*
- * @author Alberto Scolari
- * @date 1st March, 2023
+/**
+ * @file TelemetryBase.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ *
+ * Definition for the TelemetryBase class.
  */
 
 #ifndef _H_GRB_UTILS_TELEMETRY_TELEMETRY_BASE
@@ -30,57 +32,74 @@ namespace grb {
 		namespace telemetry {
 
 			/**
+			 * Base class provided as a convenience, exposing whether the telemetry is active.
 			 *
+			 * Default construction is unavailable, because telemetry functionalities need an
+			 * underlying telemetry controller to know whether they are enabled and active.
 			 *
-			 * @tparam TelControllerType
-			 * @tparam enabled
+			 * Instead, copy construction is available for inheriting classes to easily implement copy semantics
+			 * if needed; the copy shares the same telemetry controller of the original object via a reference.
+			 *
+			 * This implementation corresponds to enabled telemetry and stores an actual
+			 * telemetry controller at runtime to be notified about its active state.
+			 *
+			 * @tparam TelControllerType type of the underlying telemetry controller,
+			 * 	usually derived from TelemetryControllerBase
+			 * @tparam enabled whether the current type is enabled (usually equal to TelControllerType::enabled)
 			 */
 			template<
 				typename TelControllerType,
 				bool enabled = TelControllerType::enabled
 			> class TelemetryBase {
+
+				const TelControllerType & telemetry_Controller;
+
 			public:
 				static_assert( is_telemetry_controller< TelControllerType >::value,
 					"type TelControllerType does not implement Telemetry Controller interface" );
 
 				using self_t = TelemetryBase< TelControllerType, enabled >;
 
-				TelemetryBase() = default;
-
-				TelemetryBase( const TelControllerType & tt ) {
-					( void ) tt;
-				}
+				TelemetryBase( const TelControllerType & tt ): telemetry_Controller( tt ) {}
 
-				TelemetryBase( const self_t & ) = default;
+				TelemetryBase( const self_t & tb ) : telemetry_Controller( tb.telemetry_Controller ) {}
 
 				self_t & operator=( const self_t & ) = delete;
 
-				constexpr bool is_active() const { return false; }
+				bool is_active() const { return telemetry_Controller.is_active(); }
 			};
 
-
-			template<
+			/**
+			 * Template specialization for disabled telemetry: no state, no activity.
+			 *
+			 * @tparam TelControllerType
+			 */
+			template <
 				typename TelControllerType
-			> class TelemetryBase< TelControllerType, true > {
-
-				const TelControllerType & telemetry_Controller;
-
+			> class TelemetryBase< TelControllerType, false > {
 			public:
 				static_assert( is_telemetry_controller< TelControllerType >::value,
 					"type TelControllerType does not implement Telemetry Controller interface" );
 
-				using self_t = TelemetryBase< TelControllerType, true >;
+				using self_t = TelemetryBase< TelControllerType, false >;
 
-				TelemetryBase( const TelControllerType & tt ): telemetry_Controller( tt ) {}
+				TelemetryBase() = default;
 
-				TelemetryBase( const self_t & tb ) : telemetry_Controller( tb.telemetry_Controller ) {}
+				TelemetryBase( const TelControllerType & ) {}
+
+				TelemetryBase( const self_t & ) = default;
 
 				self_t & operator=( const self_t & ) = delete;
 
-				bool is_active() const { return telemetry_Controller.is_active(); }
+				constexpr bool is_active() const { return false; }
 			};
 
-			// always actibe base, especially for prototyping scenarios
+			/**
+			 * Specialization of TelemetryBase for enabled and always active telemetry,
+			 * mainly for debugging purposes: it is always active.
+			 *
+			 * For API compliance, it accepts an always-on telemetry controller, but does not store it.
+			 */
 			template<> class TelemetryBase< TelemetryControllerAlwaysOn, true > {
 			public:
 				static_assert( is_telemetry_controller< TelemetryControllerAlwaysOn >::value,
diff --git a/include/graphblas/utils/telemetry/TelemetryController.hpp b/include/graphblas/utils/telemetry/TelemetryController.hpp
index 63a013eab..f32c9ca21 100644
--- a/include/graphblas/utils/telemetry/TelemetryController.hpp
+++ b/include/graphblas/utils/telemetry/TelemetryController.hpp
@@ -16,8 +16,8 @@
  */
 
 /**
- * @author Alberto Scolari
- * @date 1st March, 2023
+ * @file TelemetryController.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
  *
  * This file defines the basic functionalities for <b>Telemetry Controllers</b>, i.e.,
  * objects that enable/disable telemetry at compile-time and runtime.
@@ -102,104 +102,113 @@ namespace grb {
 			 * field, possibly "short-circuiting" when #enabled is \a false. This implementation does
 			 * exactly this, disabling telemetry at compile-time and ignoring any runtime information.
 			 *
-			 * @tparam en whether telemetry is enabled (\p en = \a true has a dedicated template specialization)
+			 * Copy semantics is not available, because a controller stores just one piece of information
+			 * (whether it is active) and a copy would essentially behave as a new object.
+			 * Therefore, users should rather create new controllers themselves or pass around references
+			 * to the same controller, in order to centralize control via a single controller object.
+			 *
+			 * Also move semantics is not available, since an "empty" controller makes no sense.
+			 *
+			 * This implementation assumes \p en = \a true, because a specialization for
+			 * \p en = \a false exists (hence #enabled is set as \a true at compile-time).
+			 *
+			 * @tparam en whether telemetry is enabled (\p en = \a false has a
+			 * dedicated template specialization)
 			 */
 			template< bool en > class TelemetryControllerBase {
 			public:
 				using self_t = TelemetryControllerBase< en >;
 
 				/**
-				 * Construct a new Telemetry Controller Base object with runtime information.
-				 *
-				 * HEre, runtime information is ignored, as this implementation disables any telemetry.
+				 * Construct a new Telemetry Controller Base object, specifying the \a active state.
 				 *
-				 * @param _enabled whether telemetry is runtime-enabled (ignored here)
+				 * @param _active whether the controller is \a active or not
 				 */
-				TelemetryControllerBase( bool _enabled ) {
-					(void) _enabled;
-				}
+				TelemetryControllerBase( bool _active ) : active( _active ) {}
 
 				TelemetryControllerBase() = delete;
 
-				TelemetryControllerBase( const self_t & ) = delete;
+				TelemetryControllerBase( const self_t & ) = default;
 
 				TelemetryControllerBase& operator=( const self_t & ) = delete;
 
 				/**
-				 * Whether telemetry is runtime-active.
-				 *
-				 * @return true never here
-				 * @return false always
-				 */
-				constexpr bool inline is_active() const { return false; }
+				 * Tells whether the controller is \a active.
+				*/
+				bool is_active() const { return this->active; }
 
 				/**
-				 * Set the active status of the telemetry controller.
+				 * Set the \a active status of the controller at runtime.
 				 *
-				 * This \a disabled implementation ignores the input \p _active.
+				 * @param _active whether to activate the controller
 				 */
 				void inline set_active( bool _active ) {
-					( void ) _active;
+					this->active = _active;
 				}
 
 				/**
-				 * Whether telemetry is compile-time active (never here).
-				 */
-				static constexpr bool enabled = false;
-			};
+				 * Whether telemetry is compile-time active (here always).
+				*/
+				static constexpr bool enabled = true;
 
-			/**
-			 * Convenience definition fo an always-off telemetry controller.
-			 */
-			using TelemetryControllerAlwaysOff = TelemetryControllerBase< false >;
+			protected:
+				bool active;
+			};
 
 			/**
-			 * Template specialization for compile-time enabled telemetry, which
-			 * can be controlled at runtime.
+			 * Template specialization for compile-time disabled telemetry,
+			 * whose functionalities are all disabled.
 			 *
-			 * The controller is \b enabled by default, and its \a active status can be controlled
-			 * at runtime via the constructor and the #set_active(bool) method.
+			 * The controller is \b disabled by default, and modifications to
+			 * its \a active status are ignored.
 			 */
-			template<> class TelemetryControllerBase< true > {
+			template< > class TelemetryControllerBase< false > {
 			public:
-				using self_t = TelemetryControllerBase< true >;
+				using self_t = TelemetryControllerBase< false >;
 
 				/**
-				 * Construct a new Telemetry oCntroller Base object, specifying the \a active state.
+				 * Construct a new Telemetry Controller Base object with runtime information.
 				 *
-				 * @param _active whether the controller is \a active or not
+				 * Here, runtime information is ignored, as this implementation disables any telemetry.
+				 *
+				 * @param _enabled whether telemetry is runtime-enabled (ignored here)
 				 */
-				TelemetryControllerBase( bool _active ) : active( _active ) {}
+				TelemetryControllerBase( bool _enabled ) {
+					(void) _enabled;
+				}
 
 				TelemetryControllerBase() = delete;
 
-				TelemetryControllerBase( const self_t & ) = default;
+				TelemetryControllerBase( const self_t & ) = delete;
 
 				TelemetryControllerBase& operator=( const self_t & ) = delete;
 
 				/**
-				 * Tells whether the controller is \a active.
-				*/
-				bool is_active() const { return this->active; }
+				 * Whether telemetry is runtime-active.
+				 *
+				 * @return true never here
+				 * @return false always
+				 */
+				constexpr bool inline is_active() const { return false; }
 
 				/**
-				 * Set the \a active status of the controller at runtime.
+				 * Set the active status of the telemetry controller.
 				 *
-				 * @param _active whether to activate the controller
+				 * This \a disabled implementation ignores the input \p _active.
 				 */
-				void inline set_active( bool _active ) {
-					this->active = _active;
-				}
+				void inline set_active( bool ) {}
 
 				/**
-				 * Whether telemetry is compile-time active (here always).
-				*/
-				static constexpr bool enabled = true;
-
-			protected:
-				bool active;
+				 * Whether telemetry is compile-time active (never here).
+				 */
+				static constexpr bool enabled = false;
 			};
 
+			/**
+			 * Convenience definition for an always-off telemetry controller.
+			 */
+			using TelemetryControllerAlwaysOff = TelemetryControllerBase< false >;
+
 			/**
 			 * Always active controller, useful especially for prototyping scenarios.
 			 */
@@ -225,9 +234,7 @@ namespace grb {
 				 *
 				 * This \a disabled implementation ignores the input \p _active.
 				 */
-				void inline set_active( bool _active ) {
-					( void ) _active;
-				}
+				void inline set_active( bool ) {}
 
 				/**
 				 * Whether telemetry is compile-time active (here always).
@@ -294,15 +301,17 @@ namespace grb {
  * This declaration requires the declaration of an associated controller enabler type, which controls
  * whether the controller is enabled at compile-time; the controller is by default \b deactivated.
  */
-#define DEFINE_TELEMETRY_CONTROLLER( name ) 																\
-	class __TELEMETRY_CONTROLLER_ENABLER_NAME( name ) {};												\
-	using name = class __TELEMETRY_CONTROLLER_NAME( name ) :												\
-		public grb::utils::telemetry::TelemetryControllerBase<											\
-			grb::utils::telemetry::is_controller_enabled< __TELEMETRY_CONTROLLER_ENABLER_NAME( name ) >() > {	\
-	public:																							\
-		using base_t = grb::utils::telemetry::TelemetryControllerBase<									\
-			grb::utils::telemetry::is_controller_enabled< __TELEMETRY_CONTROLLER_ENABLER_NAME( name ) >() >;	\
-		__TELEMETRY_CONTROLLER_NAME( name )( bool _enabled ) : base_t( _enabled ) {}						\
+#define DEFINE_TELEMETRY_CONTROLLER( name ) 											\
+	class __TELEMETRY_CONTROLLER_ENABLER_NAME( name ) {};								\
+	using name = class __TELEMETRY_CONTROLLER_NAME( name ) :							\
+		public grb::utils::telemetry::TelemetryControllerBase<							\
+			grb::utils::telemetry::is_controller_enabled<								\
+				__TELEMETRY_CONTROLLER_ENABLER_NAME( name ) >() > {						\
+	public:																				\
+		using base_t = grb::utils::telemetry::TelemetryControllerBase<					\
+			grb::utils::telemetry::is_controller_enabled<								\
+				__TELEMETRY_CONTROLLER_ENABLER_NAME( name ) >() >;						\
+		__TELEMETRY_CONTROLLER_NAME( name )( bool _enabled ) : base_t( _enabled ) {}	\
 	};
 
 /**
@@ -311,9 +320,9 @@ namespace grb {
  * Once enabled, it can be runtime activated.
  */
 #define ENABLE_TELEMETRY_CONTROLLER( name ) class __TELEMETRY_CONTROLLER_ENABLER_NAME( name );	\
-	namespace grb { namespace utils { namespace telemetry {						\
-		template<> constexpr bool is_controller_enabled<								\
-			__TELEMETRY_CONTROLLER_ENABLER_NAME( name ) >() { return true; } 		\
+	namespace grb { namespace utils { namespace telemetry {										\
+		template<> constexpr bool is_controller_enabled<										\
+			__TELEMETRY_CONTROLLER_ENABLER_NAME( name ) >() { return true; } 					\
 	} } }
 
 #endif // _H_GRB_UTILS_TELEMETRY_TELEMETRY_CONTROLLER
diff --git a/include/graphblas/utils/telemetry/Timeable.hpp b/include/graphblas/utils/telemetry/Timeable.hpp
index 95d1bdfa2..2ffb97723 100644
--- a/include/graphblas/utils/telemetry/Timeable.hpp
+++ b/include/graphblas/utils/telemetry/Timeable.hpp
@@ -15,13 +15,15 @@
  * limitations under the License.
  */
 
-/*
- * @author Alberto Scolari
- * @date 14th February, 2023
+/**
+ * @file Timeable.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ *
+ * Definition for the Timeable class.
  */
 
-#ifndef _H_GRB_UTILS_TIMEABLE
-#define _H_GRB_UTILS_TIMEABLE
+#ifndef _H_GRB_UTILS_TELEMETRY_TIMEABLE
+#define _H_GRB_UTILS_TELEMETRY_TIMEABLE
 
 #include "Stopwatch.hpp"
 
@@ -29,6 +31,14 @@ namespace grb {
 	namespace utils {
 		namespace telemetry {
 
+			/**
+			 * Facility for inheriting classes that want to time internal operations:
+			 * this class provides protected methods to measure elapsed time and public methods to expose
+			 * elapsed time and allow resetting the internal elapsed time.
+			 *
+			 * @tparam TelControllerType type of telemetry controller
+			 * @tparam enabled whether telemetry is enabled
+			 */
 			template<
 				typename TelControllerType,
 				bool enabled = TelControllerType::enabled
@@ -44,23 +54,41 @@ namespace grb {
 
 				Timeable& operator=( const self_t & ) = delete;
 
+				/**
+				 * Get the elapsed time, in nanoseconds.
+				 */
 				constexpr inline duration_nano_t getElapsedNano() const {
 					return static_cast< duration_nano_t >( 0 );
 				}
 
+				/**
+				 * Reset the internal value of elapsed time.
+				 */
 				constexpr inline duration_nano_t reset() {
 					return static_cast< duration_nano_t >( 0 );
 				}
 
 			protected:
+
+				/**
+				 * Starts measuring the elapsed time.
+				 */
 				inline void start() {}
 
+				/**
+				 * Stops measuring elapsed time.
+				 */
 				constexpr inline duration_nano_t stop() {
 					return static_cast< duration_nano_t >( 0 );
 				}
 
 			};
 
+			/**
+			 * Implementation of Timeable for enabled telemetry.
+			 *
+			 * @tparam TelControllerType type of telemetry controller.
+			 */
 			template< typename TelControllerType > class Timeable< TelControllerType, true > {
 			public:
 				using self_t = Timeable< TelControllerType, true >;
@@ -98,4 +126,4 @@ namespace grb {
 	}
 }
 
-#endif // _H_GRB_UTILS_TIMEABLE
+#endif // _H_GRB_UTILS_TELEMETRY_TIMEABLE
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index 07c38cc99..45c89cd29 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -24,28 +24,25 @@
  * benchmark impementation in https://github.com/hpcg-benchmark/hpcg.
  */
 
+#include <algorithm>
+#include <array>
 #include <cassert>
 #include <cmath>
 #include <cstdlib>
-#include <iostream>
-#include <memory>
-#include <type_traits>
-#include <algorithm>
-#include <array>
 #include <cstring>
 #include <iomanip>
+#include <iostream>
 #include <locale>
+#include <memory>
+#include <type_traits>
 
 #include <graphblas.hpp>
-
+#include <graphblas/algorithms/hpcg/system_building_utils.hpp>
+#include <graphblas/algorithms/multigrid/multigrid_building_utils.hpp>
+#include <graphblas/algorithms/multigrid/multigrid_cg.hpp>
+#include <graphblas/algorithms/multigrid/multigrid_v_cycle.hpp>
 #include <graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp>
 #include <graphblas/algorithms/multigrid/single_matrix_coarsener.hpp>
-#include <graphblas/algorithms/multigrid/multigrid_v_cycle.hpp>
-#include <graphblas/algorithms/multigrid/multigrid_cg.hpp>
-
-#include <graphblas/algorithms/multigrid/multigrid_building_utils.hpp>
-#include <graphblas/algorithms/hpcg/system_building_utils.hpp>
-
 #include <graphblas/utils/Timer.hpp>
 #include <graphblas/utils/telemetry/Telemetry.hpp>
 
@@ -83,7 +80,7 @@ struct HPCGTypes {
 	using NonzeroType = value_t;
 	using InputType = value_t;
 	using ResidualType = value_t;
-	using Ring = Semiring< grb::operators::add< NonzeroType >, grb::operators::mul< NonzeroType >,
+	using Ring = Semiring< grb::operators::add< NonzeroType >,grb::operators::mul< NonzeroType >,
 		grb::identities::zero, grb::identities::one >;
 	using Minus = operators::subtract< NonzeroType >;
 	using Divide = operators::divide< NonzeroType >;
@@ -139,8 +136,8 @@ using hpcg_runner_t = MultiGridCGRunner< HPCGTypes, mg_runner_t, hpcg_controller
 using hpcg_data_t = typename hpcg_runner_t::HPCGInputType;
 
 struct dotter : grb::utils::telemetry::OutputStreamLazy {
-	const grb::Vector< IOType > &v;
-	dotter( const grb::Vector< IOType > &_v ) : v( _v ) {}
+	const grb::Vector< IOType > & v;
+	dotter( const grb::Vector< IOType > & _v ) : v( _v ) {}
 	ResidualType operator()() const {
 		Ring ring;
 		ResidualType r = 0;
@@ -154,10 +151,10 @@ static inline DBGStream & operator<<( DBGStream & stream, const grb::Vector< IOT
 	return stream << dotter( v );
 }
 
-static const IOType io_zero = Ring(). template getZero< IOType >();
-static const NonzeroType nz_zero = Ring(). template getZero< NonzeroType >();
-static const InputType input_zero = Ring(). template getZero< InputType >();
-static const ResidualType residual_zero = Ring(). template getZero< ResidualType >();
+static const IOType io_zero = Ring().template getZero< IOType >();
+static const NonzeroType nz_zero = Ring().template getZero< NonzeroType >();
+static const InputType input_zero = Ring().template getZero< InputType >();
+static const ResidualType residual_zero = Ring().template getZero< ResidualType >();
 
 static constexpr size_t MAX_CSV_PATH_LENGTH = 255;
 
@@ -202,14 +199,12 @@ struct output {
 };
 
 #ifdef HPCG_PRINT_SYSTEM
-static void print_system(
-	const std::vector< std::unique_ptr< mg_data_t > > &system_levels,
-	const std::vector< std::unique_ptr< coarsening_data_t > > &coarsener_levels
-) {
+static void print_system( const std::vector< std::unique_ptr< mg_data_t > > & system_levels,
+	const std::vector< std::unique_ptr< coarsening_data_t > > & coarsener_levels ) {
 	assert( spmd<>::nprocs() == 1 ); // distributed printin of system not implemented
 	print_matrix( system_levels[ 0 ]->A, 70, "A" );
 	for( size_t i = 0; i < coarsener_levels.size(); i++ ) {
-		print_matrix( coarsener_levels[i ] ->coarsening_matrix, 50, "COARSENING MATRIX" );
+		print_matrix( coarsener_levels[ i ]->coarsening_matrix, 50, "COARSENING MATRIX" );
 		print_matrix( system_levels[ i + 1 ]->A, 50, "COARSER SYSTEM MATRIX" );
 	}
 }
@@ -233,7 +228,7 @@ static void allocate_system_structures( std::vector< std::unique_ptr< mg_data_t
 ) {
 	grb::utils::Timer timer;
 
-	hpcg_data_t *data = new hpcg_data_t( mg_sizes[ 0 ] );
+	hpcg_data_t * data = new hpcg_data_t( mg_sizes[ 0 ] );
 	cg_system_data = std::unique_ptr< hpcg_data_t >( data );
 	logger << "allocating data for the MultiGrid simulation...";
 	timer.reset();
@@ -247,25 +242,29 @@ static void allocate_system_structures( std::vector< std::unique_ptr< mg_data_t
 	grb::RC rc = data->init_vectors( io_zero );
 	ASSERT_RC_SUCCESS( rc );
 	std::for_each( system_levels.begin(), system_levels.end(),
-		[]( std::unique_ptr< mg_data_t > &s) { ASSERT_RC_SUCCESS( s->init_vectors( io_zero ) ); } );
+		[]( std::unique_ptr< mg_data_t > & s ) {
+		ASSERT_RC_SUCCESS( s->init_vectors( io_zero ) );
+	} );
 	std::for_each( coarsener_levels.begin(), coarsener_levels.end(),
-		[]( std::unique_ptr< coarsening_data_t > &s) { ASSERT_RC_SUCCESS( s->init_vectors( io_zero ) ); } );
+		[]( std::unique_ptr< coarsening_data_t > & s ) {
+		ASSERT_RC_SUCCESS( s->init_vectors( io_zero ) );
+	} );
 	std::for_each( smoother_levels.begin(), smoother_levels.end(),
-		[]( std::unique_ptr< smoothing_data_t > &s) { ASSERT_RC_SUCCESS( s->init_vectors( io_zero ) ); } );
+		[]( std::unique_ptr< smoothing_data_t > & s ) {
+		ASSERT_RC_SUCCESS( s->init_vectors( io_zero ) );
+	} );
 	time = timer.time();
 	logger << " time (ms) " << time << std::endl;
 }
 
-
 /**
  * Builds and initializes a 3D system for an HPCG simulation according to the given 3D system sizes.
  * It allocates the data structures and populates them according to the algorithms chosen for HPCG.
  */
-static void build_3d_system(
-	std::vector< std::unique_ptr< mg_data_t > > &system_levels,
-	std::vector< std::unique_ptr< coarsening_data_t > > &coarsener_levels,
-	std::vector< std::unique_ptr< smoothing_data_t > > &smoother_levels,
-	std::unique_ptr< hpcg_data_t > &cg_system_data,
+static void build_3d_system( std::vector< std::unique_ptr< mg_data_t > > & system_levels,
+	std::vector< std::unique_ptr< coarsening_data_t > > & coarsener_levels,
+	std::vector< std::unique_ptr< smoothing_data_t > > & smoother_levels,
+	std::unique_ptr< hpcg_data_t > & cg_system_data,
 	const simulation_input & in,
 	const mg_controller_t & tt,
 	DistStream & logger
@@ -274,10 +273,8 @@ static void build_3d_system(
 	using builder_t = grb::algorithms::HPCGSystemBuilder< DIMS, coord_t, NonzeroType >;
 	grb::utils::Timer timer;
 
-	HPCGSystemParams< DIMS, NonzeroType > params = {
-		{ in.nx, in.ny, in.nz }, HALO_RADIUS, SYSTEM_DIAG_VALUE, SYSTEM_NON_DIAG_VALUE,
-			PHYS_SYSTEM_SIZE_MIN, in.max_coarsening_levels, 2
-	};
+	HPCGSystemParams< DIMS, NonzeroType > params = { { in.nx, in.ny, in.nz }, HALO_RADIUS,
+		SYSTEM_DIAG_VALUE, SYSTEM_NON_DIAG_VALUE, PHYS_SYSTEM_SIZE_MIN, in.max_coarsening_levels, 2 };
 
 	std::vector< builder_t > mg_generators;
 	logger << "building HPCG generators for " << ( in.max_coarsening_levels + 1 ) << " levels...";
@@ -286,39 +283,43 @@ static void build_3d_system(
 	hpcg_build_multigrid_generators( params, mg_generators );
 	double time = timer.time();
 	logger << " time (ms) " << time << std::endl;
-	logger << "built HPCG generators for " << mg_generators.size()
-		<< " levels" << std::endl;
+	logger << "built HPCG generators for " << mg_generators.size() << " levels" << std::endl;
 
 	// extract the size for each level
 	std::vector< size_t > mg_sizes;
-	std::transform( mg_generators.cbegin(), mg_generators.cend(), std::back_inserter( mg_sizes  ),
-		[] ( const builder_t &b ) { return b.system_size(); } );
+	std::transform( mg_generators.cbegin(), mg_generators.cend(), std::back_inserter( mg_sizes ),
+		[]( const builder_t & b ) {
+		return b.system_size();
+	} );
 	// given the sizes, allocate the data structures for all the inputs of the algorithms
-	allocate_system_structures( system_levels, coarsener_levels, smoother_levels, cg_system_data, mg_sizes, tt, logger );
+	allocate_system_structures( system_levels, coarsener_levels, smoother_levels,
+		cg_system_data, mg_sizes, tt, logger );
 	assert( mg_generators.size() == system_levels.size() );
 	assert( mg_generators.size() == smoother_levels.size() );
 	assert( mg_generators.size() - 1 == coarsener_levels.size() ); // coarsener acts between two levels
 
 	// for each grid level, populate the data structures according to the specific algorithm
 	// and track the time for diagnostics purposes
-	for( size_t i = 0; i < mg_generators.size(); i++) {
+	for( size_t i = 0; i < mg_generators.size(); i++ ) {
 		logger << "SYSTEM LEVEL " << i << std::endl;
-		auto& sizes = mg_generators[ i ].get_generator().get_sizes();
+		auto & sizes = mg_generators[ i ].get_generator().get_sizes();
 		logger << " sizes: ";
 		for( size_t s = 0; s < DIMS - 1; s++ ) {
-			logger <<sizes[ s ] << " x ";
+			logger << sizes[ s ] << " x ";
 		}
 		logger << sizes[ DIMS - 1 ] << std::endl;
 		logger << " populating system matrix: ";
 		timer.reset();
-		grb::RC rc = hpcg_populate_system_matrix( mg_generators[ i ], system_levels.at(i)->A, logger );
+		grb::RC rc = hpcg_populate_system_matrix( mg_generators[ i ],
+			system_levels.at( i )->A, logger );
 		time = timer.time();
 		ASSERT_RC_SUCCESS( rc );
 		logger << " time (ms) " << time << std::endl;
 
 		logger << " populating smoothing data: ";
 		timer.reset();
-		rc = hpcg_populate_smoothing_data( mg_generators[ i ], *smoother_levels[ i ], logger );
+		rc = hpcg_populate_smoothing_data( mg_generators[ i ], *smoother_levels[ i ],
+			logger );
 		time = timer.time();
 		ASSERT_RC_SUCCESS( rc );
 		logger << " time (ms) " << time << std::endl;
@@ -326,10 +327,12 @@ static void build_3d_system(
 		if( i > 0 ) {
 			logger << " populating coarsening data: ";
 			timer.reset();
-			if( !in.use_average_coarsener ) {
-				rc = hpcg_populate_coarsener( mg_generators[ i - 1 ], mg_generators[ i ], *coarsener_levels[ i - 1 ] );
+			if( ! in.use_average_coarsener ) {
+				rc = hpcg_populate_coarsener( mg_generators[ i - 1 ], mg_generators[ i ],
+					*coarsener_levels[ i - 1 ] );
 			} else {
-				rc = hpcg_populate_coarsener_avg( mg_generators[ i - 1 ], mg_generators[ i ], *coarsener_levels[ i - 1 ] );
+				rc = hpcg_populate_coarsener_avg( mg_generators[ i - 1 ], mg_generators[ i ],
+					*coarsener_levels[ i - 1 ] );
 			}
 			time = timer.time();
 			ASSERT_RC_SUCCESS( rc );
@@ -338,7 +341,6 @@ static void build_3d_system(
 	}
 }
 
-
 /**
  * Main test, building an HPCG problem and running the simulation closely following the
  * parameters in the reference HPCG test.
@@ -400,9 +402,9 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	}
 #endif
 
-	Matrix< NonzeroType > &A = mg_runner.system_levels[ 0 ]->A;
-	Vector< IOType > &x = hpcg_state->x;
-	Vector< NonzeroType > &b = hpcg_state->b;
+	Matrix< NonzeroType > & A = mg_runner.system_levels[ 0 ]->A;
+	Vector< IOType > & x = hpcg_state->x;
+	Vector< NonzeroType > & b = hpcg_state->b;
 
 	RC rc = SUCCESS;
 	// set vectors as from standard HPCG benchmark
@@ -420,7 +422,7 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 
 	out.times.preamble = timer.time();
 
-	mg_data_t &grid_base = *mg_runner.system_levels[ 0 ];
+	mg_data_t & grid_base = *mg_runner.system_levels[ 0 ];
 
 	// do a cold run to warm the system up
 	logger << TEXT_HIGHLIGHT << "beginning cold run..." << std::endl;
@@ -471,7 +473,7 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	out.times.useful /= static_cast< double >( in.inner_test_repetitions );
 
 	logger << TEXT_HIGHLIGHT << "repetitions,average time (ms): " << out.inner_test_repetitions
-				<< ", " << out.times.useful << std::endl;
+		<< ", " << out.times.useful << std::endl;
 	std::cout.imbue( old_locale );
 
 	// start postamble
@@ -490,10 +492,10 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	out.times.postamble = timer.time();
 
 	// write measurements into CSV files
-	if ( in.hpcg_log ) {
+	if( in.hpcg_log ) {
 		hpcg_csv.write_to_file( in.hpcg_csv.data() );
 	}
-	if ( in.mg_log ) {
+	if( in.mg_log ) {
 		mg_csv.write_to_file( in.mg_csv.data() );
 	}
 }
@@ -514,17 +516,19 @@ int main( int argc, char ** argv ) {
 	thcout << "System size x: " << sim_in.nx << std::endl;
 	thcout << "System size y: " << sim_in.ny << std::endl;
 	thcout << "System size z: " << sim_in.nz << std::endl;
-	thcout << "Coarsener: " << (sim_in.use_average_coarsener ? "average" : "single point sampler" ) << std::endl;
+	thcout << "Coarsener: " << ( sim_in.use_average_coarsener ? "average" :
+		"single point sampler" ) << std::endl;
 	thcout << "System max coarsening levels " << sim_in.max_coarsening_levels << std::endl;
 	thcout << "Test repetitions: " << sim_in.inner_test_repetitions << std::endl;
 	thcout << "Max iterations: " << sim_in.max_iterations << std::endl;
-	thcout << "Direct launch: " << std::boolalpha << sim_in.evaluation_run << std::noboolalpha << std::endl;
-	thcout << "No conditioning: " << std::boolalpha << sim_in.no_preconditioning << std::noboolalpha << std::endl;
+	thcout << "Direct launch: " << std::boolalpha << sim_in.evaluation_run
+		<< std::noboolalpha << std::endl;
+	thcout << "No conditioning: " << std::boolalpha << sim_in.no_preconditioning
+		<< std::noboolalpha << std::endl;
 	thcout << "Smoother steps: " << sim_in.smoother_steps << std::endl;
 	thcout << "Test outer iterations: " << test_outer_iterations << std::endl;
 	thcout << "Maximum norm for residual: " << max_diff_norm << std::endl;
 
-
 	// the output struct
 	struct output out;
 
@@ -543,10 +547,11 @@ int main( int argc, char ** argv ) {
 		// compute number of inner repetitions to achieve at least 1s duration
 		sim_in.inner_test_repetitions = static_cast< size_t >( 1000.0 / out.times.useful ) + 1;
 		thcout << "Evaluation run" << std::endl
-			<< "  computed residual: " << out.cg_out.norm_residual << std::endl
-			<< "  iterations: " << out.cg_out.iterations << std::endl
-			<< "  time taken (ms): " << out.times.useful << std::endl
-			<< "  deduced inner repetitions for 1s duration: " << sim_in.inner_test_repetitions << std::endl;
+				<< "  computed residual: " << out.cg_out.norm_residual << std::endl
+				<< "  iterations: " << out.cg_out.iterations << std::endl
+				<< "  time taken (ms): " << out.times.useful << std::endl
+				<< "  deduced inner repetitions for 1s duration: " << sim_in.inner_test_repetitions
+				<< std::endl;
 	}
 
 	// launch full benchmark
@@ -556,18 +561,19 @@ int main( int argc, char ** argv ) {
 	ASSERT_RC_SUCCESS( rc );
 	ASSERT_RC_SUCCESS( out.error_code );
 	thcout << "completed successfully!" << std::endl
-		<< "  final residual: " << out.cg_out.norm_residual << std::endl
-		<< "  solver iterations: " << out.cg_out.iterations << std::endl
-		<< "  total time (ms): " << out.times.useful << std::endl;
+		   << "  final residual: " << out.cg_out.norm_residual << std::endl
+		   << "  solver iterations: " << out.cg_out.iterations << std::endl
+		   << "  total time (ms): " << out.times.useful << std::endl;
 
 	// check result vector, stored inside a pinned vector
 	ASSERT_TRUE( out.pinnedVector );
-	const PinnedVector< double > &solution = *out.pinnedVector;
+	const PinnedVector< double > & solution = *out.pinnedVector;
 	ASSERT_EQ( solution.size(), sim_in.nx * sim_in.ny * sim_in.nz );
 
 	// check norm of solution w.r.t. expected solution (i.e. vector of all 1)
 	double diff_norm = sqrt( out.square_norm_diff );
-	thcout << "Norm of difference vector: |<exact solution> - <actual solution>| = " << diff_norm << std::endl;
+	thcout << "Norm of difference vector: |<exact solution> - <actual solution>| = "
+		<< diff_norm << std::endl;
 	ASSERT_LT( diff_norm, max_diff_norm );
 
 	thcout << "Test OK" << std::endl;
@@ -576,15 +582,10 @@ int main( int argc, char ** argv ) {
 
 static const char * const empty = "";
 
-static void parse_arguments(
-	simulation_input & sim_in,
-	size_t & outer_iterations,
-	double & max_diff_norm,
-	int argc,
-	char ** argv
-) {
+static void parse_arguments( simulation_input & sim_in, size_t & outer_iterations,
+	double & max_diff_norm, int argc, char ** argv ) {
 	argument_parser parser;
-	const char * hpcg_csv, * mg_csv;
+	const char *hpcg_csv, *mg_csv;
 
 	parser.add_optional_argument( "--nx", sim_in.nx, PHYS_SYSTEM_SIZE_DEF, "physical system size along x" )
 		.add_optional_argument( "--ny", sim_in.ny, PHYS_SYSTEM_SIZE_DEF, "physical system size along y" )
@@ -598,18 +599,17 @@ static void parse_arguments(
 			"test repetitions with complete initialization" )
 		.add_optional_argument( "--max-cg-iterations", sim_in.max_iterations, MAX_ITERATIONS_DEF,
 			"maximum number of CG iterations" )
-		.add_optional_argument( "--max-difference-norm", max_diff_norm, MAX_NORM,
-			"maximum acceptable norm |<exact solution> - <actual solution>| (does NOT limit "
-			"the execution of the algorithm)" )
+		.add_optional_argument( "--max-difference-norm", max_diff_norm, MAX_NORM, "maximum acceptable"
+			" norm |<exact solution> - <actual solution>| (does NOT limit the execution of the algorithm)" )
 		.add_optional_argument( "--smoother-steps", sim_in.smoother_steps, SMOOTHER_STEPS_DEF,
 			"number of pre/post-smoother steps; 0 disables smoothing" )
 		.add_option( "--evaluation-run", sim_in.evaluation_run, false,
 			"launch single run directly, without benchmarker (ignore repetitions)" )
 		.add_option( "--no-preconditioning", sim_in.no_preconditioning, false,
 			"do not apply pre-conditioning via multi-grid V cycle" )
-		.add_optional_argument( "--hpcg-csv", hpcg_csv , empty,
+		.add_optional_argument( "--hpcg-csv", hpcg_csv, empty,
 			"file for HPCG run measurements (overwrites any previous)" )
-		.add_optional_argument( "--mg-csv", mg_csv , empty,
+		.add_optional_argument( "--mg-csv", mg_csv, empty,
 			"file for Multigrid run measurements (overwrites any previous)" )
 		.add_option( "--use-average-coarsener", sim_in.use_average_coarsener, false,
 			"coarsen by averaging instead of by sampling a single point (slower, but more accurate)" );
@@ -634,18 +634,18 @@ static void parse_arguments(
 	const size_t max_system_divider = 1 << sim_in.max_coarsening_levels;
 	for( size_t s : { sim_in.nx, sim_in.ny, sim_in.nz } ) {
 		std::lldiv_t div_res = std::div( static_cast< long long >( s ), static_cast< long long >( max_system_divider ) );
-		if ( div_res.rem != 0) {
-			std::cerr << "ERROR: system size " << s << " cannot be coarsened "
-				<< sim_in.max_coarsening_levels << " times because it is not exactly divisible" << std::endl;
+		if( div_res.rem != 0 ) {
+			std::cerr << "ERROR: system size " << s << " cannot be coarsened " << sim_in.max_coarsening_levels
+				<< " times because it is not exactly divisible" << std::endl;
 			std::exit( -1 );
 		}
-		if ( div_res.quot < static_cast< long long >( PHYS_SYSTEM_SIZE_MIN ) ) {
-			std::cerr << "ERROR: system size " << s << " cannot be coarsened "
-				<< sim_in.max_coarsening_levels << " times because it is too small" << std::endl;
+		if( div_res.quot < static_cast< long long >( PHYS_SYSTEM_SIZE_MIN ) ) {
+			std::cerr << "ERROR: system size " << s << " cannot be coarsened " << sim_in.max_coarsening_levels
+				<< " times because it is too small" << std::endl;
 			std::exit( -1 );
 		}
-		if ( div_res.quot % 2 != 0 ) {
-			std::cerr << "ERROR: the coarsest size " << div_res.rem << " is not a multiple of 2" << std::endl;
+		if( div_res.quot % 2 != 0 ) {
+			std::cerr << "ERROR: the coarsest size " << div_res.rem << " is not even" << std::endl;
 			std::exit( -1 );
 		}
 	}
@@ -653,7 +653,7 @@ static void parse_arguments(
 	// check output CSVs
 	size_t len = std::strlen( hpcg_csv );
 	if( ( sim_in.hpcg_log = len > 0 ) ) {
-		if ( len > MAX_CSV_PATH_LENGTH ) {
+		if( len > MAX_CSV_PATH_LENGTH ) {
 			std::cerr << "HPCG CSV file name is too long!" << std::endl;
 			std::exit( -1 );
 		}
@@ -661,7 +661,7 @@ static void parse_arguments(
 	}
 	len = std::strlen( mg_csv );
 	if( ( sim_in.mg_log = len > 0 ) ) {
-		if ( len > MAX_CSV_PATH_LENGTH ) {
+		if( len > MAX_CSV_PATH_LENGTH ) {
 			std::cerr << "HPCG CSV file name is too long!" << std::endl;
 			std::exit( -1 );
 		}

From a727e993819bcd622d55f77e155f95aa5c86ae5f Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Fri, 24 Feb 2023 17:25:51 +0100
Subject: [PATCH 23/28] adding comments to refactored classes

---
 .../algorithms/hpcg/greedy_coloring.hpp       |  1 +
 .../algorithms/hpcg/system_building_utils.hpp |  9 ++---
 .../algorithms/multigrid/multigrid_cg.hpp     | 30 ++++++++------
 .../multigrid/multigrid_v_cycle.hpp           | 27 ++++++++-----
 .../multigrid/red_black_gauss_seidel.hpp      | 12 +++---
 .../multigrid/single_matrix_coarsener.hpp     |  9 +++--
 tests/smoke/hpcg.cpp                          | 40 ++++++++++---------
 7 files changed, 70 insertions(+), 58 deletions(-)

diff --git a/include/graphblas/algorithms/hpcg/greedy_coloring.hpp b/include/graphblas/algorithms/hpcg/greedy_coloring.hpp
index bb4759d6f..24bb1e1e4 100644
--- a/include/graphblas/algorithms/hpcg/greedy_coloring.hpp
+++ b/include/graphblas/algorithms/hpcg/greedy_coloring.hpp
@@ -61,6 +61,7 @@ namespace grb {
 		 * @tparam DIMS dimensions of the system
 		 * @tparam CoordType type of the coordinates
 		 * @tparam lower_color_first start greedy assignment of colors from lowest first
+		 *
 		 * @param[in] system generator for an \p DIMS - dimesional system with halo
 		 * @param[out] row_colors if \p reorder_rows_per_color is false, stores the color of each row;
 		 * 	if \p reorder_rows_per_color is true, stores the new position of each row, so that rows
diff --git a/include/graphblas/algorithms/hpcg/system_building_utils.hpp b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
index 9f3fdf583..ddf9e45a5 100644
--- a/include/graphblas/algorithms/hpcg/system_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
@@ -167,6 +167,7 @@ namespace grb {
 		 * @tparam DIMS number of dimensions
 		 * @tparam CoordType type storing the coordinates and the sizes
 		 * @tparam NonzeroType type of the nonzero
+		 *
 		 * @param finer_system_generator object generating the finer system
 		 * @param coarser_system_generator object generating the finer system
 		 * @param coarsener structure with the matrix to populate
@@ -256,8 +257,7 @@ namespace grb {
 			 * @param[out] per_color_rows for each position \a i it stores an std::vector with all rows
 			 *  of color \a i inside \p row_colors
 			 */
-			template< typename CoordType >
-			void hpcg_split_rows_by_color(
+			template< typename CoordType > void hpcg_split_rows_by_color(
 				const std::vector< CoordType > & row_colors,
 				size_t num_colors, std::vector< std::vector< CoordType > > & per_color_rows
 			) {
@@ -276,10 +276,7 @@ namespace grb {
 			 *
 			 * @tparam CoordType type of the internal coordinate
 			 */
-			template< typename CoordType >
-			struct true_iter {
-
-				// static const bool __TRUE;
+			template< typename CoordType > struct true_iter {
 
 				using self_t = true_iter< CoordType >;
 				using iterator_category = std::random_access_iterator_tag;
diff --git a/include/graphblas/algorithms/multigrid/multigrid_cg.hpp b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
index f465ba8da..cd1761589 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
@@ -100,14 +100,13 @@ namespace grb {
 		 * The \p MultiGridrunnerType must implement a functional interface whose input (from CG)
 		 * is the structure with the system information for one level of the grid.
 		 *
-		 * @tparam IOType type of result and intermediate vectors used during computation
-		 * @tparam NonzeroType type of matrix values
-		 * @tparam InputType type of values of the right-hand side vector b
-		 * @tparam ResidualType type of the residual norm
+		 * @tparam MGCGTypes types container for algebraic information (IOType, NonzeroType,
+		 * 	InputType, ResidualType, Ring, Minus)
 		 * @tparam MultiGridrunnerType type for the multi-grid runner object
-		 * @tparam Ring algebraic ring type
-		 * @tparam Minus minus operator
 		 * @tparam descr descriptors with statically-known data for computation and containers
+		 * @tparam DbgOutputStreamType type for the debugging stream, i.e. the stream to trace simulation
+		 * 	results alongside execution; the default type #grb::utils::telemetry::OutputStreamOff disables
+		 * 	all output at compile time
 		 */
 		template<
 			typename MGCGTypes,
@@ -117,12 +116,14 @@ namespace grb {
 			typename DbgOutputStreamType = grb::utils::telemetry::OutputStreamOff
 		> struct MultiGridCGRunner : public grb::utils::telemetry::Timeable< TelControllerType > {
 
+			// algebraic types
 			using IOType = typename MGCGTypes::IOType;
 			using NonzeroType = typename MGCGTypes::NonzeroType;
 			using InputType = typename MGCGTypes::InputType;
 			using ResidualType = typename MGCGTypes::ResidualType;
 			using Ring = typename MGCGTypes::Ring;
 			using Minus = typename MGCGTypes::Minus;
+			// input types for simulation (CG and MG)
 			using HPCGInputType = MultiGridCGData< IOType, NonzeroType, InputType >;
 			using MGRunnerType = MultiGridRunnerType;
 
@@ -141,14 +142,13 @@ namespace grb {
 			ResidualType tolerance = ring.template getZero< ResidualType >(); ///< ratio between initial residual and current residual that halts the solver
 			                                                                  ///< if reached, for the solution is to be considered "good enough"
 
-			MultiGridRunnerType & mg_runner;
-			DbgOutputStreamType dbg_logger;
+			MultiGridRunnerType & mg_runner; ///< runner object for MG
+			DbgOutputStreamType dbg_logger; ///< logger to trace execution
 
 			/**
-			 * Construct a new MultiGridCGRunner object by moving the required MG runner.
+			 * Construct a new MultiGridCGRunner object with the required MG runner.
 			 *
-			 * Moving the state of the MG is safer in that it avoids use-after-free issues,
-			 * as the state of the MG runner is managed automatically with this object.
+			 * The debug logger is unavailable.
 			 */
 			MultiGridCGRunner(
 				const TelControllerType & tt,
@@ -161,6 +161,10 @@ namespace grb {
 				static_assert( std::is_default_constructible< DbgOutputStreamType >::value );
 			}
 
+			/**
+			 * Construct a new MultiGridCGRunner object with the required MG runner and
+			 * 	the user-given debug logger.
+			 */
 			MultiGridCGRunner(
 				const TelControllerType & tt,
 				MultiGridRunnerType & _mg_runner,
@@ -200,7 +204,6 @@ namespace grb {
 			 * Failures of GraphBLAS operations are handled by immediately stopping the execution and by returning
 			 * the failure code.
 			 *
-			 *
 			 * @param cg_data data for the CG solver only
 			 * @param grid_base base (i.e., finer) level of the multi-grid, with the information of the physical system
 			 * @param out_info solver output information
@@ -327,7 +330,8 @@ namespace grb {
 					++iter;
 					out_info.iterations = iter;
 					out_info.norm_residual = norm_residual;
-				} while( iter < max_iterations && norm_residual / norm_residual_initial > tolerance && ret == SUCCESS );
+				} while( iter < max_iterations && norm_residual / norm_residual_initial > tolerance
+					&& ret == SUCCESS );
 
 				return ret;
 			}
diff --git a/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp b/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
index 31b623024..1a036c1cc 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
@@ -48,17 +48,17 @@ namespace grb {
 		 * It is built by transferring into it the state of both the smoother and the coarsener,
 		 * in order to avoid use-after-free issues.
 		 *
+		 * @tparam MGTypes types container for algebraic information (IOType, NonzeroType, Ring, Minus)
 		 * @tparam MGSmootherType type of the smoother runner, with prescribed methods for the various
 		 *  smoothing steps
 		 * @tparam CoarsenerType type of the coarsener runner, with prescribed methods for coarsening
-		 * @tparam IOType type of result and intermediate vectors used during computation
-		 * @tparam NonzeroType type of matrix values
-		 *  and prolongation
-		 * @tparam Ring the ring of algebraic operators and zero values
-		 * @tparam Minus the minus operator for subtractions
 		 * @tparam descr descriptors with statically-known data for computation and containers
+		 * @tparam DbgOutputStreamType type for the debugging stream, i.e. the stream to trace simulation
+		 * 	results alongside execution; the default type #grb::utils::telemetry::OutputStreamOff disables
+		 * 	all output at compile time
 		 */
-		template< typename MGTypes,
+		template<
+			typename MGTypes,
 			typename MGSmootherType,
 			typename CoarsenerType,
 			typename TelControllerType,
@@ -88,7 +88,7 @@ namespace grb {
 
 			MGSmootherType & smoother_runner; ///< object to run the smoother
 			CoarsenerType & coarsener_runner; ///< object to run the coarsener
-			DbgOutputStreamType dbg_logger;
+			DbgOutputStreamType dbg_logger;   ///< logger to trace execution
 
 			std::vector< std::unique_ptr< MultiGridInputType > > system_levels; ///< levels of the grid (finest first)
 			Ring ring;                                                          ///< algebraic ring
@@ -113,6 +113,8 @@ namespace grb {
 			/**
 			 * Construct a new MultiGridRunner object by moving in the state of the pre-built
 			 * smoother and coarsener.
+			 *
+			 * The debug logger is deactivated.
 			 */
 			MultiGridRunner(
 				MGSmootherType & _smoother_runner,
@@ -124,6 +126,10 @@ namespace grb {
 				static_assert( std::is_default_constructible< DbgOutputStreamType >::value );
 			}
 
+			/**
+			 * Construct a new MultiGridRunner object by moving in the state of the pre-built
+			 * smoother and coarsener and with a user-given debug logger.
+			 */
 			MultiGridRunner(
 				MGSmootherType & _smoother_runner,
 				CoarsenerType & _coarsener_runner,
@@ -141,6 +147,9 @@ namespace grb {
 					__unique_ptr_extractor( system_levels.end() ) );
 			}
 
+			/**
+			 * Operator to invoke a multi-grid run among given levels.
+			 */
 			inline grb::RC operator()(
 				__unique_ptr_extractor begin,
 				const __unique_ptr_extractor end
@@ -171,10 +180,6 @@ namespace grb {
 			 *
 			 * @param mgiter_begin iterator pointing to the current level of the multi-grid
 			 * @param mgiter_end end iterator, indicating the end of the recursion
-			 * @param smoother callable object to invoke the smoothing steps
-			 * @param coarsener callable object to coarsen and prolong (between current and coarser grid levels)
-			 * @param ring the ring to perform the operations on
-			 * @param minus the \f$ - \f$ operator for vector subtractions
 			 * @return grb::RC if the algorithm could correctly terminate, the error code of the first
 			 *  unsuccessful operation otherwise
 			 */
diff --git a/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp b/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
index aa7157de7..d86b2382b 100644
--- a/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
+++ b/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
@@ -75,6 +75,7 @@ namespace grb {
 			Descriptor descr = descriptors::no_operation
 		> struct RedBlackGSSmootherRunner {
 
+			// algebraic types
 			using IOType = typename SmootherTypes::IOType;
 			using NonzeroType = typename SmootherTypes::NonzeroType;
 			using Ring = typename SmootherTypes::Ring;
@@ -132,12 +133,8 @@ namespace grb {
 			/**
 			 * Runs a single step of Red-Black Gauss-Seidel for a specific color.
 			 *
-			 * @param[in] A the system matrix
-			 * @param[in] A_diagonal a vector storing the diagonal elements of \p A
-			 * @param[in] r the residual
-			 * @param[in,out] z the initial solution to start from, and where the smoothed solution is stored to
-			 * @param[out] smoother_temp a vector for temporary values
-			 * @param[in] color_mask the mask of colors to filter the rows to smooth
+			 * @param[in,out] data structure with external containers, corresponding to an MG level: vector to smooth, system matrix, residual
+			 * @param[in,out] smoothing_info smoothing-specific information: temporary vectors, color masks
 			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
 			 *  unsuccessful operation otherwise
 			 */
@@ -189,7 +186,8 @@ namespace grb {
 			 * and no check is performed to ensure these assumptions hold. Hence, it is up to user logic
 			 * to pass correct coloring information. Otherwise, \b no guarantees hold on the result.
 			 *
-			 * @param[in,out] data structure with the data of a single grid level
+			 * @param[in,out] data structure with external containers, corresponding to an MG level: vector to smooth, system matrix, residual
+			 * @param[in,out] smoothing_info smoothing-specific information: temporary vectors, color masks
 			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
 			 *                          unsuccessful operation otherwise
 			 */
diff --git a/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp b/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
index 4a19f9deb..3b2379802 100644
--- a/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
+++ b/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
@@ -35,7 +35,7 @@ namespace grb {
 	namespace algorithms {
 
 		/**
-		 * Structure storing the data for the coarsener
+		 * Structure storing the data for the coarsener.
 		 */
 		template<
 			typename IOType,
@@ -76,6 +76,7 @@ namespace grb {
 			Descriptor descr = descriptors::no_operation
 		> struct SingleMatrixCoarsener {
 
+			// algebraic types
 			using IOType = typename CoarsenerTypes::IOType;
 			using NonzeroType = typename CoarsenerTypes::NonzeroType;
 			using Ring = typename CoarsenerTypes::Ring;
@@ -132,13 +133,14 @@ namespace grb {
 			 * The coarsening information are stored inside \p CoarseningData.
 			 *
 			 * @param[in] r_fine fine residual vector
+			 * @param[out] r_coarse coarse residual vector, the output
 			 * @param[in,out] coarsening_data \ref MultiGridData data structure storing the information for coarsening
 			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
 			 *                          unsuccessful operation otherwise
 			 */
 			grb::RC compute_coarsening(
-				const grb::Vector< IOType > & r_fine, // fine residual
-				grb::Vector< IOType > & r_coarse, // coarse residual
+				const grb::Vector< IOType > & r_fine,
+				grb::Vector< IOType > & r_coarse,
 				CoarseningData< IOType, NonzeroType > & coarsening_data
 			) {
 				RC ret = SUCCESS;
@@ -160,6 +162,7 @@ namespace grb {
 			 *
 			 * For prolongation, this function uses the matrix \p coarsening_data.coarsening_matrix by transposing it.
 			 *
+			 * @param[in] z_coarse input coarse solution vector, to be prolonged
 			 * @param[out] z_fine the solution vector to store the prolonged solution into
 			 * @param[in,out] coarsening_data information for coarsening
 			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index 45c89cd29..86cf798b1 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -135,6 +135,7 @@ using hpcg_runner_t = MultiGridCGRunner< HPCGTypes, mg_runner_t, hpcg_controller
 	hpcg_desc, DBGStream >;
 using hpcg_data_t = typename hpcg_runner_t::HPCGInputType;
 
+// allow DBGStream to print grb::Vector's in a lazy way (i.e., no code generated if deactivated)
 struct dotter : grb::utils::telemetry::OutputStreamLazy {
 	const grb::Vector< IOType > & v;
 	dotter( const grb::Vector< IOType > & _v ) : v( _v ) {}
@@ -151,16 +152,16 @@ static inline DBGStream & operator<<( DBGStream & stream, const grb::Vector< IOT
 	return stream << dotter( v );
 }
 
+// various algebraic zeros
 static const IOType io_zero = Ring().template getZero< IOType >();
 static const NonzeroType nz_zero = Ring().template getZero< NonzeroType >();
 static const InputType input_zero = Ring().template getZero< InputType >();
 static const ResidualType residual_zero = Ring().template getZero< ResidualType >();
 
+// input/output structure (serializable for distributed execution),
+// with the parameters for the HPCG simulation
 static constexpr size_t MAX_CSV_PATH_LENGTH = 255;
 
-/**
- * Container for the parameters for the HPCG simulation.
- */
 struct simulation_input {
 	// physical parameters for the multi-grid
 	size_t nx, ny, nz;
@@ -186,9 +187,6 @@ struct simulation_input {
 	simulation_input( const simulation_input & ) = default;
 };
 
-/**
- * Container for test outputs.
- */
 struct output {
 	RC error_code = SUCCESS;
 	size_t inner_test_repetitions = 0;
@@ -199,6 +197,7 @@ struct output {
 };
 
 #ifdef HPCG_PRINT_SYSTEM
+// routine to print the system matrices
 static void print_system( const std::vector< std::unique_ptr< mg_data_t > > & system_levels,
 	const std::vector< std::unique_ptr< coarsening_data_t > > & coarsener_levels ) {
 	assert( spmd<>::nprocs() == 1 ); // distributed printin of system not implemented
@@ -218,7 +217,8 @@ static void print_system( const std::vector< std::unique_ptr< mg_data_t > > & sy
  * This routine is algorithm-agnositc, as long as the constructors of the data types meet the requirements
  * explained in \ref multigrid_allocate_data().
  */
-static void allocate_system_structures( std::vector< std::unique_ptr< mg_data_t > > & system_levels,
+static void allocate_system_structures(
+	std::vector< std::unique_ptr< mg_data_t > > & system_levels,
 	std::vector< std::unique_ptr< coarsening_data_t > > & coarsener_levels,
 	std::vector< std::unique_ptr< smoothing_data_t > > & smoother_levels,
 	std::unique_ptr< hpcg_data_t > & cg_system_data,
@@ -261,7 +261,8 @@ static void allocate_system_structures( std::vector< std::unique_ptr< mg_data_t
  * Builds and initializes a 3D system for an HPCG simulation according to the given 3D system sizes.
  * It allocates the data structures and populates them according to the algorithms chosen for HPCG.
  */
-static void build_3d_system( std::vector< std::unique_ptr< mg_data_t > > & system_levels,
+static void build_3d_system(
+	std::vector< std::unique_ptr< mg_data_t > > & system_levels,
 	std::vector< std::unique_ptr< coarsening_data_t > > & coarsener_levels,
 	std::vector< std::unique_ptr< smoothing_data_t > > & smoother_levels,
 	std::unique_ptr< hpcg_data_t > & cg_system_data,
@@ -354,7 +355,6 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	dist_controller_t dist( pid == 0 );
 	// separate thousands when printing integers
 	class IntegerSeparation : public std::numpunct< char > {
-		// protected:
 		char do_thousands_sep() const override {
 			return '\'';
 		}
@@ -379,7 +379,7 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	dbg_controller_t dbg_controller( pid == 0 );
 	DBGStream dbg_stream( dbg_controller, std::cout );
 
-	// define the main HPCG runner and initialize the options of its components
+	// define the main runners and initialize the options of its components
 	coarsener_runner_t coarsener;
 	smoother_runner_t smoother;
 	smoother.presmoother_steps = smoother.postsmoother_steps = in.smoother_steps;
@@ -406,12 +406,15 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	Vector< IOType > & x = hpcg_state->x;
 	Vector< NonzeroType > & b = hpcg_state->b;
 
-	RC rc = SUCCESS;
 	// set vectors as from standard HPCG benchmark
-	set( x, 1.0 );
-	set( b, nz_zero );
+	RC rc = set( x, 1.0 );
+	ASSERT_RC_SUCCESS( rc );
+	rc = set( b, nz_zero );
+	ASSERT_RC_SUCCESS( rc );
 	rc = grb::mxv( b, A, x, Ring() );
-	set( x, io_zero );
+	ASSERT_RC_SUCCESS( rc );
+	rc = set( x, io_zero );
+	ASSERT_RC_SUCCESS( rc );
 
 #ifdef HPCG_PRINT_SYSTEM
 	if( pid == 0 ) {
@@ -474,11 +477,12 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 
 	logger << TEXT_HIGHLIGHT << "repetitions,average time (ms): " << out.inner_test_repetitions
 		<< ", " << out.times.useful << std::endl;
+	// restore previous output options
 	std::cout.imbue( old_locale );
 
 	// start postamble
 	timer.reset();
-	// set error code
+	// set error code to caller
 	out.error_code = rc;
 
 	grb::set( b, 1.0 );
@@ -521,9 +525,9 @@ int main( int argc, char ** argv ) {
 	thcout << "System max coarsening levels " << sim_in.max_coarsening_levels << std::endl;
 	thcout << "Test repetitions: " << sim_in.inner_test_repetitions << std::endl;
 	thcout << "Max iterations: " << sim_in.max_iterations << std::endl;
-	thcout << "Direct launch: " << std::boolalpha << sim_in.evaluation_run
+	thcout << "Is evaluation run: " << std::boolalpha << sim_in.evaluation_run
 		<< std::noboolalpha << std::endl;
-	thcout << "No conditioning: " << std::boolalpha << sim_in.no_preconditioning
+	thcout << "Conditioning: " << std::boolalpha << !sim_in.no_preconditioning
 		<< std::noboolalpha << std::endl;
 	thcout << "Smoother steps: " << sim_in.smoother_steps << std::endl;
 	thcout << "Test outer iterations: " << test_outer_iterations << std::endl;
@@ -650,7 +654,7 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 		}
 	}
 
-	// check output CSVs
+	// check output CSV file names
 	size_t len = std::strlen( hpcg_csv );
 	if( ( sim_in.hpcg_log = len > 0 ) ) {
 		if( len > MAX_CSV_PATH_LENGTH ) {

From b58370041890c85d7cd755b0c8fa92e96795cc7d Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Mon, 27 Feb 2023 08:09:00 +0000
Subject: [PATCH 24/28] implementing RBGS with foldl + eWiseApply instead of
 eWiseLambda

---
 .../multigrid/red_black_gauss_seidel.hpp      | 29 +++++++++++++++----
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp b/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
index d86b2382b..f004610f4 100644
--- a/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
+++ b/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
@@ -80,6 +80,7 @@ namespace grb {
 			using NonzeroType = typename SmootherTypes::NonzeroType;
 			using Ring = typename SmootherTypes::Ring;
 			using Minus = typename SmootherTypes::Minus;
+			using Divide = typename SmootherTypes::Divide;
 			using SmootherInputType = MultiGridData< IOType, NonzeroType, TelControllerType >; ///< external input structure
 			using SmootherDataType = SmootherData< IOType >; ///< smoothing information and temporary variables (per MG level)
 
@@ -89,6 +90,8 @@ namespace grb {
 			std::vector< std::unique_ptr< SmootherDataType > > levels; ///< for each grid level,
 			                                                           ///< the smoothing data (finest first)
 			Ring ring;                                                 ///< the algebraic ring
+			Minus minus;
+			Divide divide;
 
 			static_assert( std::is_default_constructible< Ring >::value,
 				"cannot construct the Ring operator with default values" );
@@ -161,17 +164,31 @@ namespace grb {
 				// Replace below with masked calls:
 				// z[mask] = r[mask] - smoother_temp[mask] + z[mask] .* diagonal[mask]
 				// z[mask] = z[maks] ./ diagonal[mask]
+
+// by default use foldl()'s, although eWiseLambda() might perform better
+// TODO: leave this choice for future experimentation
+#if defined(RBGS_EWL)
+				Ring & ri = ring;
+				Minus & mi = minus;
+				Divide & di = divide;
+
 				ret = ret ? ret :
                             grb::eWiseLambda(
-								[ &z, &r, &smoother_temp, &color_mask, &A_diagonal ]( const size_t i ) {
-									// if the mask was properly initialized, the check on the mask value is unnecessary;
-					                // if( color_mask[ i ] ) {
+								[ &z, &r, &smoother_temp, &color_mask, &A_diagonal , &ri, &mi, &di ]( const size_t i ) {
 									IOType d = A_diagonal[ i ];
-									IOType v = r[ i ] - smoother_temp[ i ] + z[ i ] * d;
-									z[ i ] = v / d;
-									// }
+									IOType v;
+									ri.getMultiplicativeOperator().apply( z[ i ], d, v  );
+									ri.getAdditiveOperator().apply( v, r[ i ], v  );
+									mi.apply( v, smoother_temp[ i ], v );
+									di.apply( v, d, z[ i ] );
 								},
 								color_mask, z, r, smoother_temp, A_diagonal );
+#else
+				ret = ret ? ret : grb::foldl( z, color_mask, A_diagonal, ring.getMultiplicativeOperator() ); // z *= d
+				ret = ret ? ret : grb::foldl( z, color_mask, smoother_temp, minus ); // z -= smoother_temp
+				ret = ret ? ret : grb::foldl( z, color_mask, r, ring.getAdditiveOperator() ); // z += r
+				ret = ret ? ret : grb::foldl( z, color_mask, A_diagonal, divide ); // z /= d
+#endif
 				assert( ret == SUCCESS );
 				return ret;
 			}

From be240cb74173fbbabcb133dd685f3e44fd4206b9 Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Fri, 3 Mar 2023 15:49:08 +0100
Subject: [PATCH 25/28] using new Stopwatch facilities

---
 tests/smoke/hpcg.cpp | 66 +++++++++++++++++++-------------------------
 1 file changed, 29 insertions(+), 37 deletions(-)

diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index 86cf798b1..dac62457b 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -43,7 +43,6 @@
 #include <graphblas/algorithms/multigrid/multigrid_v_cycle.hpp>
 #include <graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp>
 #include <graphblas/algorithms/multigrid/single_matrix_coarsener.hpp>
-#include <graphblas/utils/Timer.hpp>
 #include <graphblas/utils/telemetry/Telemetry.hpp>
 
 #include <utils/argument_parser.hpp>
@@ -135,10 +134,14 @@ using hpcg_runner_t = MultiGridCGRunner< HPCGTypes, mg_runner_t, hpcg_controller
 	hpcg_desc, DBGStream >;
 using hpcg_data_t = typename hpcg_runner_t::HPCGInputType;
 
+// Stopwatch type, to measure various setup phases
+using Stw = utils::telemetry::ActiveStopwatch;
+
+
 // allow DBGStream to print grb::Vector's in a lazy way (i.e., no code generated if deactivated)
-struct dotter : grb::utils::telemetry::OutputStreamLazy {
+struct dotter  {
 	const grb::Vector< IOType > & v;
-	dotter( const grb::Vector< IOType > & _v ) : v( _v ) {}
+
 	ResidualType operator()() const {
 		Ring ring;
 		ResidualType r = 0;
@@ -149,7 +152,7 @@ struct dotter : grb::utils::telemetry::OutputStreamLazy {
 
 static inline DBGStream & operator<<( DBGStream & stream, const grb::Vector< IOType > & v ) {
 	stream << std::setprecision( 7 );
-	return stream << dotter( v );
+	return stream << DBGStream::makeLazy( dotter{ v } );
 }
 
 // various algebraic zeros
@@ -226,19 +229,17 @@ static void allocate_system_structures(
 	const mg_controller_t & mg_controller,
 	DistStream & logger
 ) {
-	grb::utils::Timer timer;
+	Stw timer;
 
 	hpcg_data_t * data = new hpcg_data_t( mg_sizes[ 0 ] );
 	cg_system_data = std::unique_ptr< hpcg_data_t >( data );
 	logger << "allocating data for the MultiGrid simulation...";
-	timer.reset();
+	timer.start();
 	multigrid_allocate_data( system_levels, coarsener_levels, smoother_levels, mg_sizes, mg_controller );
-	double time = timer.time();
-	logger << " time (ms) " << time << std::endl;
+	logger << " time (ms) " << Stw::nano2Milli( timer.restart() ) << std::endl;
 
 	// zero all vectors
 	logger << "zeroing all vectors...";
-	timer.reset();
 	grb::RC rc = data->init_vectors( io_zero );
 	ASSERT_RC_SUCCESS( rc );
 	std::for_each( system_levels.begin(), system_levels.end(),
@@ -253,8 +254,7 @@ static void allocate_system_structures(
 		[]( std::unique_ptr< smoothing_data_t > & s ) {
 		ASSERT_RC_SUCCESS( s->init_vectors( io_zero ) );
 	} );
-	time = timer.time();
-	logger << " time (ms) " << time << std::endl;
+	logger << " time (ms) " << Stw::nano2Milli( timer.stop() ) << std::endl;
 }
 
 /**
@@ -272,18 +272,17 @@ static void build_3d_system(
 ) {
 	constexpr size_t DIMS = 3;
 	using builder_t = grb::algorithms::HPCGSystemBuilder< DIMS, coord_t, NonzeroType >;
-	grb::utils::Timer timer;
+	Stw timer;
 
 	HPCGSystemParams< DIMS, NonzeroType > params = { { in.nx, in.ny, in.nz }, HALO_RADIUS,
 		SYSTEM_DIAG_VALUE, SYSTEM_NON_DIAG_VALUE, PHYS_SYSTEM_SIZE_MIN, in.max_coarsening_levels, 2 };
 
 	std::vector< builder_t > mg_generators;
 	logger << "building HPCG generators for " << ( in.max_coarsening_levels + 1 ) << " levels...";
-	timer.reset();
+	timer.start();
 	// construct the builder_t generator for each grid level, which depends on the system physics
 	hpcg_build_multigrid_generators( params, mg_generators );
-	double time = timer.time();
-	logger << " time (ms) " << time << std::endl;
+	logger << " time (ms) " << Stw::nano2Milli( timer.stop() ) << std::endl;
 	logger << "built HPCG generators for " << mg_generators.size() << " levels" << std::endl;
 
 	// extract the size for each level
@@ -310,24 +309,21 @@ static void build_3d_system(
 		}
 		logger << sizes[ DIMS - 1 ] << std::endl;
 		logger << " populating system matrix: ";
-		timer.reset();
+		timer.start();
 		grb::RC rc = hpcg_populate_system_matrix( mg_generators[ i ],
 			system_levels.at( i )->A, logger );
-		time = timer.time();
 		ASSERT_RC_SUCCESS( rc );
-		logger << " time (ms) " << time << std::endl;
+		logger << " time (ms) " << Stw::nano2Milli( timer.restart() ) << std::endl;
 
 		logger << " populating smoothing data: ";
-		timer.reset();
 		rc = hpcg_populate_smoothing_data( mg_generators[ i ], *smoother_levels[ i ],
 			logger );
-		time = timer.time();
+		logger << " time (ms) " << Stw::nano2Milli( timer.stop() ) << std::endl;
 		ASSERT_RC_SUCCESS( rc );
-		logger << " time (ms) " << time << std::endl;
 
 		if( i > 0 ) {
 			logger << " populating coarsening data: ";
-			timer.reset();
+			timer.start();
 			if( ! in.use_average_coarsener ) {
 				rc = hpcg_populate_coarsener( mg_generators[ i - 1 ], mg_generators[ i ],
 					*coarsener_levels[ i - 1 ] );
@@ -335,9 +331,8 @@ static void build_3d_system(
 				rc = hpcg_populate_coarsener_avg( mg_generators[ i - 1 ], mg_generators[ i ],
 					*coarsener_levels[ i - 1 ] );
 			}
-			time = timer.time();
+			logger << " time (ms) " << Stw::nano2Milli( timer.stop() ) << std::endl;
 			ASSERT_RC_SUCCESS( rc );
-			logger << " time (ms) " << time << std::endl;
 		}
 	}
 }
@@ -349,7 +344,7 @@ static void build_3d_system(
 void grbProgram( const simulation_input & in, struct output & out ) {
 	// get user process ID
 	const size_t pid = spmd<>::pid();
-	grb::utils::Timer timer;
+	Stw timer;
 
 	// standard logger: active only on master node
 	dist_controller_t dist( pid == 0 );
@@ -389,12 +384,11 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	hpcg_runner.tolerance = residual_zero;
 	hpcg_runner.with_preconditioning = ! in.no_preconditioning;
 
-	timer.reset();
+	timer.start();
 	// build the entire multi-grid system
 	build_3d_system( mg_runner.system_levels, coarsener.coarsener_levels, smoother.levels,
 		hpcg_state, in, mg_controller, logger );
-	double input_duration = timer.time();
-	logger << "input generation time (ms): " << input_duration << std::endl;
+	logger << "input generation time (ms): " << Stw::nano2Milli( timer.restart() ) << std::endl;
 
 #ifdef HPCG_PRINT_SYSTEM
 	if( pid == 0 ) {
@@ -423,18 +417,16 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	}
 #endif
 
-	out.times.preamble = timer.time();
+	out.times.preamble = Stw::nano2Milli( timer.restart() );
 
 	mg_data_t & grid_base = *mg_runner.system_levels[ 0 ];
 
 	// do a cold run to warm the system up
 	logger << TEXT_HIGHLIGHT << "beginning cold run..." << std::endl;
 	hpcg_runner.max_iterations = 1;
-	timer.reset();
 	rc = hpcg_runner( grid_base, *hpcg_state, out.cg_out );
-	double iter_duration = timer.time();
+	logger << " time (ms): " << Stw::nano2Milli( timer.restart() ) << std::endl;
 	ASSERT_RC_SUCCESS( rc );
-	logger << " time (ms): " << iter_duration << std::endl;
 
 	// restore CG options to user-given values
 	hpcg_runner.max_iterations = in.max_iterations;
@@ -445,16 +437,14 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	// initialize CSV writers (if activated)
 	hpcg_csv_t hpcg_csv( hpcg_controller, { "repetition", "time" } );
 	mg_csv_t mg_csv( mg_controller, { "repetition", "level", "mg time", "smoother time" } );
+	timer.reset();
 
 	// do benchmark
 	for( size_t i = 0; i < in.inner_test_repetitions; ++i ) {
 		rc = set( x, io_zero );
 		ASSERT_RC_SUCCESS( rc );
 		logger << TEXT_HIGHLIGHT << "beginning iteration: " << i << std::endl;
-		timer.reset();
 		rc = hpcg_runner( grid_base, *hpcg_state, out.cg_out );
-		iter_duration = timer.time();
-		out.times.useful += iter_duration;
 		ASSERT_RC_SUCCESS( rc );
 		hpcg_csv.add_line( i, hpcg_runner.getElapsedNano() );
 		logger << "repetition,duration (ns): " << hpcg_csv.last_line() << std::endl;
@@ -468,6 +458,8 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 
 		out.inner_test_repetitions++;
 	}
+	timer.stop();
+	out.times.useful += Stw::nano2Milli( timer.getElapsedNano() );
 	if( in.evaluation_run ) {
 		// get maximum execution time among processes
 		rc = collectives<>::reduce( out.times.useful, 0, operators::max< double >() );
@@ -481,7 +473,7 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	std::cout.imbue( old_locale );
 
 	// start postamble
-	timer.reset();
+	timer.restart();
 	// set error code to caller
 	out.error_code = rc;
 
@@ -493,7 +485,6 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	// output
 	out.pinnedVector.reset( new PinnedVector< NonzeroType >( x, SEQUENTIAL ) );
 	// finish timing
-	out.times.postamble = timer.time();
 
 	// write measurements into CSV files
 	if( in.hpcg_log ) {
@@ -502,6 +493,7 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	if( in.mg_log ) {
 		mg_csv.write_to_file( in.mg_csv.data() );
 	}
+	out.times.postamble = Stw::nano2Milli( timer.stop() );
 }
 
 #define thcout ( std::cout << TEXT_HIGHLIGHT )

From 03783cf6ce9720dd05ca76351148cf120d68748f Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Thu, 30 Mar 2023 18:19:19 +0200
Subject: [PATCH 26/28] removing missing (and useless) header from nonblocking
 matrix

---
 include/graphblas/nonblocking/matrix.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/graphblas/nonblocking/matrix.hpp b/include/graphblas/nonblocking/matrix.hpp
index 251e2037d..5554d78ae 100644
--- a/include/graphblas/nonblocking/matrix.hpp
+++ b/include/graphblas/nonblocking/matrix.hpp
@@ -50,7 +50,6 @@
 #include <graphblas/utils/DMapper.hpp>
 #include <graphblas/type_traits.hpp>
 
-#include <graphblas/algorithms/hpcg/ndim_matrix_builders.hpp>
 #include <graphblas/utils/iterators/utils.hpp>
 
 #include <graphblas/reference/NonzeroWrapper.hpp>

From 8fe366d6c5c9e6c80c2beddabda7798939f7d9d1 Mon Sep 17 00:00:00 2001
From: Alberto Scolari <alberto.scolari@huawei.com>
Date: Fri, 31 Mar 2023 12:25:10 +0200
Subject: [PATCH 27/28] re-flowing long lines

---
 .../algorithms/hpcg/average_coarsener.hpp     |  8 +++---
 .../algorithms/hpcg/greedy_coloring.hpp       |  3 ++-
 .../hpcg/single_point_coarsener.hpp           |  8 +++---
 .../algorithms/hpcg/system_building_utils.hpp |  6 +++--
 .../algorithms/multigrid/multigrid_cg.hpp     | 16 +++++------
 .../algorithms/multigrid/multigrid_data.hpp   | 14 +++++-----
 .../multigrid/multigrid_v_cycle.hpp           |  4 +--
 .../multigrid/red_black_gauss_seidel.hpp      | 27 +++++++++++--------
 .../multigrid/single_matrix_coarsener.hpp     |  3 ++-
 .../multigrid/dynamic_vector_storage.hpp      |  5 ++--
 .../halo_matrix_generator_iterator.hpp        |  8 +++---
 .../linearized_halo_ndim_iterator.hpp         | 15 +++++++----
 .../multigrid/linearized_halo_ndim_system.hpp | 12 +++++----
 .../multigrid/linearized_ndim_iterator.hpp    |  5 +++-
 .../multigrid/linearized_ndim_system.hpp      |  6 ++---
 .../graphblas/utils/telemetry/Stopwatch.hpp   |  3 ++-
 tests/smoke/hpcg.cpp                          |  3 ++-
 17 files changed, 88 insertions(+), 58 deletions(-)

diff --git a/include/graphblas/algorithms/hpcg/average_coarsener.hpp b/include/graphblas/algorithms/hpcg/average_coarsener.hpp
index eb3853c61..983e5ad8f 100644
--- a/include/graphblas/algorithms/hpcg/average_coarsener.hpp
+++ b/include/graphblas/algorithms/hpcg/average_coarsener.hpp
@@ -341,9 +341,11 @@ namespace grb {
 				grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > > system;
 			grb::utils::multigrid::LinearizedNDimSystem< CoordType,
 				grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > > _finer_subspace;
-			grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > steps;///< array of steps, i.e. how much each column coordinate (finer system) must be
-																				//// incremented when incrementing the row coordinates; is is the ration between
-			                                                                    //// #finer_sizes and row_generator#physical_sizes
+			///
+			/// array of steps, i.e. how much each column coordinate (finer system) must be
+			/// incremented when incrementing the row coordinates; it is the ratio between
+			/// #finer_sizes and row_generator#physical_sizes
+			grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > steps;
 		};
 
 	} // namespace algorithms
diff --git a/include/graphblas/algorithms/hpcg/greedy_coloring.hpp b/include/graphblas/algorithms/hpcg/greedy_coloring.hpp
index 24bb1e1e4..366465c41 100644
--- a/include/graphblas/algorithms/hpcg/greedy_coloring.hpp
+++ b/include/graphblas/algorithms/hpcg/greedy_coloring.hpp
@@ -82,7 +82,8 @@ namespace grb {
 			bool reorder_rows_per_color = false
 		) {
 			CoordType nrows = system.system_size();
-			row_colors.insert( row_colors.begin(), nrows, nrows ); // value `nrows' means `uninitialized'; initialized colors go from 0 to nrow-1
+			// value `nrows' means `uninitialized'; initialized colors go from 0 to nrow-1
+			row_colors.insert( row_colors.begin(), nrows, nrows );
 			CoordType totalColors = 1;
 			row_colors[ 0 ] = 0; // first point gets color 0
 
diff --git a/include/graphblas/algorithms/hpcg/single_point_coarsener.hpp b/include/graphblas/algorithms/hpcg/single_point_coarsener.hpp
index 92ef47263..e412a630c 100644
--- a/include/graphblas/algorithms/hpcg/single_point_coarsener.hpp
+++ b/include/graphblas/algorithms/hpcg/single_point_coarsener.hpp
@@ -312,9 +312,11 @@ namespace grb {
 			const grb::utils::multigrid::LinearizedNDimSystem< CoordType,
 				grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > > system;
 
-			ArrayType steps; ///< array of steps, i.e. how much each column coordinate (finer system) must be
-							 //// incremented when incrementing the row coordinates; is is the ration between
-			                 //// #finer_sizes and row_generator#physical_sizes
+			///
+			/// array of steps, i.e. how much each column coordinate (finer system) must be
+			/// incremented when incrementing the row coordinates; it is the ratio between
+			/// #finer_sizes and row_generator#physical_sizes
+			ArrayType steps;
 		};
 
 	} // namespace algorithms
diff --git a/include/graphblas/algorithms/hpcg/system_building_utils.hpp b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
index ddf9e45a5..6ee46c7b3 100644
--- a/include/graphblas/algorithms/hpcg/system_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
@@ -365,7 +365,8 @@ namespace grb {
 					std::vector< size_t >::const_iterator begin = rows.cbegin();
 					std::vector< size_t >::const_iterator end = rows.cend();
 					// partition_iteration_range( rows.size(), begin, end );
-					grb::RC rc = grb::buildVectorUnique( output_mask, begin, end, true_iter< size_t >( 0 ), true_iter< size_t >( rows.size() ), IOMode::SEQUENTIAL );
+					grb::RC rc = grb::buildVectorUnique( output_mask, begin, end,
+						true_iter< size_t >( 0 ), true_iter< size_t >( rows.size() ), IOMode::SEQUENTIAL );
 					if( rc != SUCCESS ) {
 						std::cerr << "error while creating output mask for color " << i << ": " << toString( rc ) << std::endl;
 						return rc;
@@ -434,7 +435,8 @@ namespace grb {
 			}
 			logger << "- found " << color_counters.size() << " colors,"
 				   << " generating color masks...";
-			return internal::hpcg_build_static_color_masks( system_generator.system_size(), per_color_rows, smoothing_info.color_masks );
+			return internal::hpcg_build_static_color_masks( system_generator.system_size(),
+				per_color_rows, smoothing_info.color_masks );
 		}
 
 	} // namespace algorithms
diff --git a/include/graphblas/algorithms/multigrid/multigrid_cg.hpp b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
index cd1761589..5fa1a3772 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
@@ -134,14 +134,14 @@ namespace grb {
 			static_assert( std::is_move_constructible< MultiGridRunnerType >::value,
 				"cannot construct the Multi-Grid runner by move" );
 
-			Ring ring;                                                        ///< algebraic ring to be used
-			Minus minus;                                                      ///< minus operator to be used
-			bool with_preconditioning = true;                                 ///<  whether preconditioning is enabled
-			size_t max_iterations = 10;                                       ///< max number of allowed iterations for CG: after that, the solver is halted
-			                                                                  ///< and the result achieved so far returned
-			ResidualType tolerance = ring.template getZero< ResidualType >(); ///< ratio between initial residual and current residual that halts the solver
-			                                                                  ///< if reached, for the solution is to be considered "good enough"
-
+			Ring ring; ///< algebraic ring to be used
+			Minus minus; ///< minus operator to be used
+			bool with_preconditioning = true; ///<  whether preconditioning is enabled
+			size_t max_iterations = 10; ///< max number of allowed iterations for CG:
+			///< after that, the solver is halted and the result achieved so far returned
+			ResidualType tolerance = ring.template getZero< ResidualType >(); ///< ratio
+			///< between initial residual and current residual that halts the solver
+			///< if reached, for the solution is to be considered "good enough"
 			MultiGridRunnerType & mg_runner; ///< runner object for MG
 			DbgOutputStreamType dbg_logger; ///< logger to trace execution
 
diff --git a/include/graphblas/algorithms/multigrid/multigrid_data.hpp b/include/graphblas/algorithms/multigrid/multigrid_data.hpp
index 4f0d0eed4..a0a76191e 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_data.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_data.hpp
@@ -55,13 +55,15 @@ namespace grb {
 			typename TelControllerType
 		> struct MultiGridData {
 
-			grb::utils::telemetry::Stopwatch< TelControllerType > mg_stopwatch; ///< stopwatch to measure the execution time in MG
-			grb::utils::telemetry::Stopwatch< TelControllerType > sm_stopwatch; ///< stopwatch to measure the execution time in the smoother
-			const size_t level;           ///< level of the grid (0 for the finest physical system)
-			const size_t system_size;     ///< size of the system, i.e. side of the #A system matrix
+			grb::utils::telemetry::Stopwatch< TelControllerType > mg_stopwatch; ///< stopwatch
+			///< to measure the execution time in MG
+			grb::utils::telemetry::Stopwatch< TelControllerType > sm_stopwatch; ///< stopwatch
+			///< to measure the execution time in the smoother
+			const size_t level; ///< level of the grid (0 for the finest physical system)
+			const size_t system_size; ///< size of the system, i.e. side of the #A system matrix
 			grb::Matrix< NonzeroType > A; ///< system matrix
-			grb::Vector< IOType > z;      ///< multi-grid solution
-			grb::Vector< IOType > r;      ///< residual
+			grb::Vector< IOType > z; ///< multi-grid solution
+			grb::Vector< IOType > r; ///< residual
 
 			/**
 			 * Construct a new multigrid data object from level information and system size.
diff --git a/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp b/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
index 1a036c1cc..bd9a393a4 100644
--- a/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
+++ b/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
@@ -91,8 +91,8 @@ namespace grb {
 			DbgOutputStreamType dbg_logger;   ///< logger to trace execution
 
 			std::vector< std::unique_ptr< MultiGridInputType > > system_levels; ///< levels of the grid (finest first)
-			Ring ring;                                                          ///< algebraic ring
-			Minus minus;                                                        ///< minus operator
+			Ring ring; ///< algebraic ring
+			Minus minus; ///< minus operator
 
 			// operator to extract the reference out of an std::unique_ptr object
 			struct __extractor {
diff --git a/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp b/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
index f004610f4..3b558e9f1 100644
--- a/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
+++ b/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
@@ -81,15 +81,17 @@ namespace grb {
 			using Ring = typename SmootherTypes::Ring;
 			using Minus = typename SmootherTypes::Minus;
 			using Divide = typename SmootherTypes::Divide;
-			using SmootherInputType = MultiGridData< IOType, NonzeroType, TelControllerType >; ///< external input structure
-			using SmootherDataType = SmootherData< IOType >; ///< smoothing information and temporary variables (per MG level)
-
-			size_t presmoother_steps = 1UL;                            ///< number of pre-smoother steps
-			size_t postsmoother_steps = 1UL;                           ///< number of post-smoother steps
-			size_t non_recursive_smooth_steps = 1UL;                   ///< number of smoother steps for the last grid level
+			using SmootherInputType = MultiGridData< IOType, NonzeroType, TelControllerType >; ///< external
+			///< input structure
+			using SmootherDataType = SmootherData< IOType >; ///< smoothing information
+			///< and temporary variables (per MG level)
+
+			size_t presmoother_steps = 1UL; ///< number of pre-smoother steps
+			size_t postsmoother_steps = 1UL; ///< number of post-smoother steps
+			size_t non_recursive_smooth_steps = 1UL; ///< number of smoother steps for the last grid level
 			std::vector< std::unique_ptr< SmootherDataType > > levels; ///< for each grid level,
-			                                                           ///< the smoothing data (finest first)
-			Ring ring;                                                 ///< the algebraic ring
+			///< the smoothing data (finest first)
+			Ring ring; ///< the algebraic ring
 			Minus minus;
 			Divide divide;
 
@@ -136,7 +138,8 @@ namespace grb {
 			/**
 			 * Runs a single step of Red-Black Gauss-Seidel for a specific color.
 			 *
-			 * @param[in,out] data structure with external containers, corresponsign to an MG level: vector to smooth, system matrix, residual
+			 * @param[in,out] data structure with external containers, corresponding to an MG level:
+			 * 	vector to smooth, system matrix, residual
 			 * @param[in,out] smoothing_info smoothing-specific information: temporary vectors, color masks
 			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
 			 *  unsuccessful operation otherwise
@@ -174,7 +177,8 @@ namespace grb {
 
 				ret = ret ? ret :
                             grb::eWiseLambda(
-								[ &z, &r, &smoother_temp, &color_mask, &A_diagonal , &ri, &mi, &di ]( const size_t i ) {
+								[ &z, &r, &smoother_temp, &color_mask, &A_diagonal ,
+									&ri, &mi, &di ]( const size_t i ) {
 									IOType d = A_diagonal[ i ];
 									IOType v;
 									ri.getMultiplicativeOperator().apply( z[ i ], d, v  );
@@ -203,7 +207,8 @@ namespace grb {
 			 * and no check is performed to ensure these assumptions hold. Hence, it is up to user logic
 			 * to pass correct coloring information. Otherwise, \b no guarantees hold on the result.
 			 *
-			 * @param[in,out] data structure with external containers, corresponsign to an MG level: vector to smooth, system matrix, residual
+			 * @param[in,out] data structure with external containers, corresponding to an MG level:
+			 * 	vector to smooth, system matrix, residual
 			 * @param[in,out] smoothing_info smoothing-specific information: temporary vectors, color masks
 			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
 			 *                          unsuccessful operation otherwise
diff --git a/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp b/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
index 3b2379802..40f8163f5 100644
--- a/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
+++ b/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
@@ -83,7 +83,8 @@ namespace grb {
 			using Minus = typename CoarsenerTypes::Minus;
 
 			using MultiGridInputType = MultiGridData< IOType, NonzeroType, TelControllerType >; ///< input data from MG
-			using CoarseningDataType = CoarseningData< IOType, NonzeroType >; ///< internal data with coarsening information
+			using CoarseningDataType = CoarseningData< IOType, NonzeroType >; ///< internal data
+			///< with coarsening information
 
 			static_assert( std::is_default_constructible< Ring >::value,
 				"cannot construct the Ring with default values" );
diff --git a/include/graphblas/utils/multigrid/dynamic_vector_storage.hpp b/include/graphblas/utils/multigrid/dynamic_vector_storage.hpp
index fff89b6db..0d6250aae 100644
--- a/include/graphblas/utils/multigrid/dynamic_vector_storage.hpp
+++ b/include/graphblas/utils/multigrid/dynamic_vector_storage.hpp
@@ -32,8 +32,9 @@ namespace grb {
 		namespace multigrid {
 
 			/**
-			 * Array with fixed size (i.e. decided at object creation) allocated on the heap with an interface compliant
-			 * to what other classes in the geometry namespace expect, like storage() and dimensions() methods.
+			 * Array with fixed size (i.e. decided at object creation) allocated on the heap
+			 * with an interface compliant to what other classes in the geometry namespace expect,
+			 * like storage() and dimensions() methods.
 			 *
 			 * It describes a vector of dimensions #dimensions().
 			 *
diff --git a/include/graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp b/include/graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp
index e51d7d6df..ebda27890 100644
--- a/include/graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp
+++ b/include/graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp
@@ -69,7 +69,8 @@ namespace grb {
 				typename ValueCallable
 			> struct HaloMatrixGeneratorIterator {
 
-				static_assert( std::is_copy_constructible< ValueCallable >::value, "ValueCallable must be copy-constructible" );
+				static_assert( std::is_copy_constructible< ValueCallable >::value,
+					"ValueCallable must be copy-constructible" );
 
 				using RowIndexType = CoordType; ///< numeric type of rows
 				using ColumnIndexType = CoordType;
@@ -145,8 +146,9 @@ namespace grb {
 				 * Increments the iterator by moving coordinates to the next (row, column) to iterate on.
 				 *
 				 * This operator internally increments the columns coordinates until wrap-around, when it increments
-				 * the row coordinates and resets the column coordinates to the first possible columns; this column coordinate
-				 * depends on the row coordinates according to the dimensions iteration order and on the parameter \p halo.
+				 * the row coordinates and resets the column coordinates to the first possible columns;
+				 * this column coordinate depends on the row coordinates according to the dimensions
+				 * iteration order and on the parameter \p halo.
 				 *
 				 * @return HaloMatrixGeneratorIterator<DIMS, T>& \c this object, with the updated state
 				 */
diff --git a/include/graphblas/utils/multigrid/linearized_halo_ndim_iterator.hpp b/include/graphblas/utils/multigrid/linearized_halo_ndim_iterator.hpp
index 3a5047277..6c020c39d 100644
--- a/include/graphblas/utils/multigrid/linearized_halo_ndim_iterator.hpp
+++ b/include/graphblas/utils/multigrid/linearized_halo_ndim_iterator.hpp
@@ -262,7 +262,8 @@ namespace grb {
 						throw std::out_of_range( "the system has no more elements" );
 					}
 					size_t num_neighbours = this->_neighbors_subspace.system_size();
-					size_t neighbour_position_offset = this->_neighbors_subspace.ndim_to_linear( this->_neighbor_iter->get_position() );
+					size_t neighbour_position_offset =
+						this->_neighbors_subspace.ndim_to_linear( this->_neighbor_iter->get_position() );
 					++( this->_point._element_iter );
 					this->on_element_advance();
 					this->_point._position -= neighbour_position_offset;
@@ -297,7 +298,8 @@ namespace grb {
 						throw std::range_error( "neighbor linear value beyond system" );
 					}
 					VectorType final_element( DIMS );
-					size_t neighbor_index = ( this->_point._system->neighbour_linear_to_element( final_position, final_element ) );
+					size_t neighbor_index =
+						this->_point._system->neighbour_linear_to_element( final_position, final_element );
 
 					this->_point._element_iter = VectorIteratorType( *this->_point._system, final_element.cbegin() );
 					this->_point._position = final_position;
@@ -319,7 +321,8 @@ namespace grb {
 				 * It throws if the result cannot be stored as a difference_type variable.
 				 */
 				difference_type operator-( const SelfType & other ) const {
-					return grb::utils::compute_signed_distance< difference_type, SizeType >( _point.get_position(), other._point.get_position() );
+					return grb::utils::compute_signed_distance< difference_type, SizeType >( _point.get_position(),
+						other._point.get_position() );
 				}
 
 				/**
@@ -349,7 +352,8 @@ namespace grb {
 				 */
 				inline void on_neighbor_iter_update() {
 					for( size_t i = 0; i < DIMS; i++ ) {
-						this->_point._neighbor[ i ] = this->_neighbors_start[ i ] + this->_neighbor_iter->get_position()[ i ];
+						this->_point._neighbor[ i ] = this->_neighbors_start[ i ]
+							+ this->_neighbor_iter->get_position()[ i ];
 					}
 				}
 
@@ -360,7 +364,8 @@ namespace grb {
 				void on_element_update() {
 					// reset everything
 					VectorType neighbors_range( DIMS );
-					this->_point._system->compute_neighbors_range( this->_point._element_iter->get_position(), this->_neighbors_start, neighbors_range );
+					this->_point._system->compute_neighbors_range( this->_point._element_iter->get_position(),
+						this->_neighbors_start, neighbors_range );
 					// re-target _neighbors_subspace
 					this->_neighbors_subspace.retarget( neighbors_range );
 				}
diff --git a/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp b/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp
index 1ebe04b73..34e16069d 100644
--- a/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp
+++ b/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp
@@ -276,7 +276,8 @@ namespace grb {
 					size_t halo,
 					NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > > & config_neighbors
 				) {
-					using it_type = typename NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > >::DomainIterator;
+					using it_type = typename NDimVector< SizeType, SizeType,
+						DynamicVectorStorage< SizeType > >::DomainIterator;
 					it_type end = config_neighbors.domain_end();
 					for( it_type it = config_neighbors.domain_begin(); it != end; ++it ) {
 						size_t res = 1;
@@ -300,9 +301,9 @@ namespace grb {
 				 *   face slabs or inner slabs)
 				 *  - dimension 1 (y axis) moves along "rows" within each slab, whose total number of neighbors
 				 *	  depends on whether the row is at the extreme sides (top or bottom of the face) or inside;
-				 *   in turn, each type of slab has different geometry (face slabs comprise mesh corners, edges and faces,
-				 *   while inner slabs comprise edges, faces and inner elements), thus resulting in 2*2 different
-				 *   configurations of dimension-1 total neighbors
+				 *   in turn, each type of slab has different geometry (face slabs comprise mesh corners, edges and
+				 * 	 faces, while inner slabs comprise edges, faces and inner elements), thus resulting in
+				 *   2*2 different configurations of dimension-1 total neighbors
 				 *  - dimension 0 (x axis) moves along "column" elements within each row, where the first (or last)
 				 *   column has a different number of neighbors than the inner ones; here again are two configuration
 				 *   for each dimension-1 configuration, leading to a total of 8 dimension-1 configurations
@@ -478,7 +479,8 @@ namespace grb {
 							halo_max_neighs = neighbors.at( halo_coords_begin );
 						}
 #ifdef _DEBUG
-						std::cout << "- initial halo - neighbour " << neighbor_linear << std::endl << "\th " << h << std::endl << "\thalo : ";
+						std::cout << "- initial halo - neighbour " << neighbor_linear
+							<< std::endl << "\th " << h << std::endl << "\thalo : ";
 						print_sequence( halo_coords_begin, halo_coords_end ) << std::endl;
 						std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
 #endif
diff --git a/include/graphblas/utils/multigrid/linearized_ndim_iterator.hpp b/include/graphblas/utils/multigrid/linearized_ndim_iterator.hpp
index 9b0e61a8a..a4ae8af5e 100644
--- a/include/graphblas/utils/multigrid/linearized_ndim_iterator.hpp
+++ b/include/graphblas/utils/multigrid/linearized_ndim_iterator.hpp
@@ -84,7 +84,10 @@ namespace grb {
 
 					NDimPoint( NDimPoint && ) = delete;
 
-					NDimPoint( const LinNDimSysType & _system ) noexcept : system( &_system ), coords( _system.dimensions() ) {
+					NDimPoint( const LinNDimSysType & _system ) noexcept :
+						system( &_system ),
+						coords( _system.dimensions() )
+					{
 						std::fill_n( this->coords.begin(), _system.dimensions(), 0 );
 					}
 
diff --git a/include/graphblas/utils/multigrid/linearized_ndim_system.hpp b/include/graphblas/utils/multigrid/linearized_ndim_system.hpp
index a02a0c631..c4b62707a 100644
--- a/include/graphblas/utils/multigrid/linearized_ndim_system.hpp
+++ b/include/graphblas/utils/multigrid/linearized_ndim_system.hpp
@@ -226,9 +226,9 @@ namespace grb {
 				 */
 				void retarget( ConstVectorReference _new_sizes ) {
 					if( _new_sizes.dimensions() != this->_sizes.dimensions() ) {
-						throw std::invalid_argument(
-							"new system must have same dimensions as previous: new " + std::to_string( _new_sizes.dimensions() )
-								+ ", old " + std::to_string( this->_sizes.dimensions() ) );
+						throw std::invalid_argument( "new system must have same dimensions as previous: new "
+						+ std::to_string( _new_sizes.dimensions() ) + ", old "
+						+ std::to_string( this->_sizes.dimensions() ) );
 					}
 					this->_sizes = _new_sizes; // copy
 					this->_system_size = compute_range_product( _new_sizes.begin(), _new_sizes.end(),
diff --git a/include/graphblas/utils/telemetry/Stopwatch.hpp b/include/graphblas/utils/telemetry/Stopwatch.hpp
index a607a3cbd..f599ede03 100644
--- a/include/graphblas/utils/telemetry/Stopwatch.hpp
+++ b/include/graphblas/utils/telemetry/Stopwatch.hpp
@@ -96,7 +96,8 @@ namespace grb {
 
 				typedef typename std::chrono::high_resolution_clock::time_point time_point_t;
 
-				duration_t elapsedTime; ///< measured elapsed time so far, i.e., accumulated time periods between successive calls to #start() and #stop()
+				duration_t elapsedTime; ///< measured elapsed time so far, i.e.,
+				///< accumulated time periods between successive calls to #start() and #stop()
 
 				time_point_t beginning; ///< time instant of last call to #start()
 
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index dac62457b..e2f5644c2 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -629,7 +629,8 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 	// check sizes
 	const size_t max_system_divider = 1 << sim_in.max_coarsening_levels;
 	for( size_t s : { sim_in.nx, sim_in.ny, sim_in.nz } ) {
-		std::lldiv_t div_res = std::div( static_cast< long long >( s ), static_cast< long long >( max_system_divider ) );
+		std::lldiv_t div_res = std::div( static_cast< long long >( s ),
+			static_cast< long long >( max_system_divider ) );
 		if( div_res.rem != 0 ) {
 			std::cerr << "ERROR: system size " << s << " cannot be coarsened " << sim_in.max_coarsening_levels
 				<< " times because it is not exactly divisible" << std::endl;

From bf1145562466cc2214d276909083d3cb8d9413e6 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Fri, 23 Jun 2023 11:51:36 +0200
Subject: [PATCH 28/28] Code review of average_coarsener.hpp

---
 .../algorithms/hpcg/average_coarsener.hpp     | 609 ++++++++++--------
 .../algorithms/hpcg/system_building_utils.hpp |   2 +-
 2 files changed, 335 insertions(+), 276 deletions(-)

diff --git a/include/graphblas/algorithms/hpcg/average_coarsener.hpp b/include/graphblas/algorithms/hpcg/average_coarsener.hpp
index 983e5ad8f..41abed9e2 100644
--- a/include/graphblas/algorithms/hpcg/average_coarsener.hpp
+++ b/include/graphblas/algorithms/hpcg/average_coarsener.hpp
@@ -34,320 +34,379 @@
 #include <graphblas/utils/multigrid/array_vector_storage.hpp>
 #include <graphblas/utils/multigrid/linearized_ndim_system.hpp>
 
+
 namespace grb {
+
 	namespace algorithms {
 
-		// forward declaration
-		template<
-			size_t DIMS,
-			typename CoordType,
-			typename ValueType
-		> class AverageCoarsenerBuilder;
-
-		/**
-		 * Iterator class to generate the coarsening matrix that averages over the elements of the finer
-		 * domain corresponding to the element of the coarser domain.
-		 *
-		 * The coarsening matrix averages \b all elements that are coarsened into one.
-		 *
-		 * This coarsening method requires some computation but should be relatively robust to noise
-		 * or to partitioning strategies to parallelize the smoother (usually run before coarsening).
-		 *
-		 * This iterator is random-access.
-		 *
-		 * @tparam DIMS number of dimensions
-		 * @tparam CoordType type storing the coordinates and the sizes
-		 * @tparam ValueType type of the nonzero: it must be able to represent 1 /
-		 * 	<number of finer elements per coarser elements>
-		 */
-		template<
-			size_t DIMS,
-			typename CoordType,
-			typename ValueType
-		> struct AverageGeneratorIterator {
-
-			friend AverageCoarsenerBuilder< DIMS, CoordType, ValueType >;
-
-			using RowIndexType = CoordType; ///< numeric type of rows
-			using ColumnIndexType = CoordType;
-			using LinearSystemType = grb::utils::multigrid::LinearizedNDimSystem< CoordType,
-				grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > >;
-			using LinearSystemIterType = typename LinearSystemType::Iterator;
-			using SelfType = AverageGeneratorIterator< DIMS, CoordType, ValueType >;
-			using ArrayType = std::array< CoordType, DIMS >;
-
-			struct _ValueGenerator {
-
-				friend SelfType;
-
-				_ValueGenerator(
-					RowIndexType i,
-					ColumnIndexType j,
-					ValueType value
-				) noexcept :
-					_i( i ),
-					_j( j ),
-					_value( value ) {}
+		namespace hpcg {
 
-				_ValueGenerator( const _ValueGenerator & ) = default;
+			// forward declaration
+			template<
+				size_t DIMS,
+				typename CoordType,
+				typename ValueType
+			>
+			class AverageCoarsenerBuilder;
 
-				_ValueGenerator & operator=( const _ValueGenerator & ) = default;
+			/**
+			 * Iterator class to generate the coarsening matrix that averages over the
+			 * elements of the finer domain corresponding to the element of the coarser
+			 * domain.
+			 *
+			 * The coarsening matrix averages \b all elements that are coarsened into
+			 * one.
+			 *
+			 * This coarsening method requires some computation but should be relatively
+			 * robust to noise or to partitioning strategies that parallelize the
+			 * smoother (usually run before coarsening).
+			 *
+			 * This iterator is random-access.
+			 *
+			 * @tparam DIMS number of dimensions
+			 * @tparam CoordType type storing the coordinates and the sizes
+			 * @tparam ValueType type of the nonzero: it must be able to represent 1 /
+			 *                   <number of finer elements per coarser elements>
+			 */
+			template<
+				size_t DIMS,
+				typename CoordType,
+				typename ValueType
+			>
+			struct AverageGeneratorIterator {
 
-				inline RowIndexType i() const {
-					return _i;
-				}
-				inline ColumnIndexType j() const {
-					return _j;
-				}
-				inline ValueType v() const {
-					return _value;
-				}
+				friend AverageCoarsenerBuilder< DIMS, CoordType, ValueType >;
 
-			private:
-				RowIndexType _i;
-				ColumnIndexType _j;
-				ValueType _value;
-			};
+				/** Numeric type of rows */
+				typedef CoordType RowIndexType;
 
-			// interface for std::random_access_iterator
-			using iterator_category = std::random_access_iterator_tag;
-			using value_type = _ValueGenerator;
-			using pointer = const value_type;
-			using reference = const value_type &;
-			using difference_type = typename LinearSystemIterType::difference_type;
+				/** Numeric type of columns */
+				typedef CoordType ColumnIndexType;
 
-			AverageGeneratorIterator( const SelfType & o ) = default;
+				typedef typename grb::utils::multigrid::LinearizedNDimSystem<
+						CoordType,
+						grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType >
+					> LinearSystemType;
 
-			AverageGeneratorIterator( SelfType && o ) = default;
+				typedef typename LinearSystemType::Iterator LinearSystemIterType;
 
-			SelfType & operator=( const SelfType & ) = default;
+				typedef AverageGeneratorIterator< DIMS, CoordType, ValueType > SelfType;
 
-			SelfType & operator=( SelfType && ) = default;
+				typedef std::array< CoordType, DIMS > ArrayType;
 
-			/**
-			 * Advances \c this by 1 in constant time.
-			 */
-			SelfType & operator++() noexcept {
-				(void)++_subspace_iter;
-				size_t subspace_position = _subspace_iter->get_linear_position();
-				// std::cout << "subspace_position " << subspace_position << std::endl;
-				if( subspace_position == _num_neighbors ) {
-					(void)++_sys_iter;
+				class ValueGenerator {
+
+					friend SelfType;
+
+
+					private:
+
+						RowIndexType _i;
+
+						ColumnIndexType _j;
+
+						ValueType _value;
+
+
+					public:
+
+						ValueGenerator(
+							RowIndexType i,
+							ColumnIndexType j,
+							ValueType value
+						) noexcept :
+							_i( i ),
+							_j( j ),
+							_value( value )
+						{}
+
+						ValueGenerator( const ValueGenerator & ) = default;
+
+						ValueGenerator & operator=( const ValueGenerator & ) = default;
+
+						inline RowIndexType i() const {
+							return _i;
+						}
+
+						inline ColumnIndexType j() const {
+							return _j;
+						}
+
+						inline ValueType v() const {
+							return _value;
+						}
+
+				};
+
+				// interface for std::random_access_iterator
+				typedef std::random_access_iterator_tag iterator_category;
+
+				typedef ValueGenerator value_type;
+
+				typedef const value_type * pointer; // type returned by operator->()
+
+				typedef const value_type & reference;
+
+				typedef typename LinearSystemIterType::difference_type difference_type;
+
+				AverageGeneratorIterator( const SelfType &o ) = default;
+
+				AverageGeneratorIterator( SelfType && ) = default;
+
+				SelfType & operator=( const SelfType & ) = default;
+
+				SelfType & operator=( SelfType && ) = default;
+
+				/**
+				 * Advances \c this by 1 in constant time.
+				 */
+				SelfType & operator++() noexcept {
+					(void) ++_subspace_iter;
+					size_t subspace_position = _subspace_iter->get_linear_position();
+					if( subspace_position == _num_neighbors ) {
+						(void) ++_sys_iter;
+						_subspace_iter = _finer_subspace->begin();
+					}
+					update_coords();
+					return *this;
+				}
+
+				/**
+				 * Advances \c this by \p offset in constant time.
+				 */
+				SelfType & operator+=( size_t offset ) {
+					CoordType sub_offset = _subspace_iter->get_linear_position() + offset;
+					std::ldiv_t res = std::ldiv( sub_offset, _num_neighbors );
+					_sys_iter += res.quot;
 					_subspace_iter = _finer_subspace->begin();
+					_subspace_iter += res.rem;
+					update_coords();
+					return *this;
 				}
-				update_coords();
-				return *this;
-			}
 
-			/**
-			 * Advances \c this by \p offset in constant time.
-			 */
-			SelfType & operator+=( size_t offset ) {
-				CoordType sub_offset = _subspace_iter->get_linear_position() + offset;
-				std::ldiv_t res = std::ldiv( sub_offset, _num_neighbors );
-				_sys_iter += res.quot;
-				_subspace_iter = _finer_subspace->begin();
-				_subspace_iter += res.rem;
-				update_coords();
-				return *this;
-			}
+				/**
+				 * Computes the difference between \c this and \p o as integer.
+				 */
+				difference_type operator-( const SelfType &o ) const {
+					return this->_sys_iter - o._sys_iter;
+				}
 
-			/**
-			 * Computes the difference between \c this and \p o as integer.
-			 */
-			difference_type operator-( const SelfType & o ) const {
-				return this->_sys_iter - o._sys_iter;
-			}
+				/**
+				 * Returns whether \c this and \p o differ.
+				 */
+				bool operator!=( const SelfType &o ) const {
+					return this->_sys_iter != o._sys_iter;
+				}
 
-			/**
-			 * Returns whether \c this and \p o differ.
-			 */
-			bool operator!=( const SelfType & o ) const {
-				return this->_sys_iter != o._sys_iter;
-			}
+				/**
+				 * Returns whether \c this and \p o are equal.
+				 */
+				bool operator==( const SelfType &o ) const {
+					return ! this->operator!=( o );
+				}
 
-			/**
-			 * Returns whether \c this and \p o are equal.
-			 */
-			bool operator==( const SelfType & o ) const {
-				return ! this->operator!=( o );
-			}
+				reference operator*() const {
+					return _val;
+				}
 
-			reference operator*() const {
-				return _val;
-			}
+				pointer operator->() const {
+					return &_val;
+				}
 
-			pointer operator->() const {
-				return &_val;
-			}
+				/**
+				 * Returns the current row, within the coarser system.
+				 */
+				inline RowIndexType i() const {
+					return _val.i();
+				}
 
-			/**
-			 * Returns the current row, within the coarser system.
-			 */
-			inline RowIndexType i() const {
-				return _val.i();
-			}
+				/**
+				 * Returns the current column, within the finer system.
+				 */
+				inline ColumnIndexType j() const {
+					return _val.j();
+				}
 
-			/**
-			 * Returns the current column, within the finer system.
-			 */
-			inline ColumnIndexType j() const {
-				return _val.j();
-			}
+				/**
+				 * Returns the averaging weight, i.e., 1 / (product of the coarsening steps).
+				 */
+				inline ValueType v() const {
+					return _val.v();
+				}
 
-			/**
-			 * Returns always 1, as the coarsening keeps the same value.
-			 */
-			inline ValueType v() const {
-				return _val.v();
-			}
-
-		private:
-			const LinearSystemType * _lin_sys;
-			const LinearSystemType * _finer_subspace;
-			const ArrayType * _steps;
-			CoordType _num_neighbors;
-			LinearSystemIterType _sys_iter;
-			LinearSystemIterType _subspace_iter;
-			value_type _val;
 
-			/**
-			 * Construct a new AverageGeneratorIterator object starting from the LinearizedNDimSystem
-			 * object \p system describing the \b coarser system and the \b ratios \p steps between each finer and
-			 * the corresponding corser dimension.
-			 *
-			 * @param system LinearizedNDimSystem object describing the coarser system
-			 * @param finer_subspace LinearizedNDimSystem object describing the subspace of each element
-			 *  in the finer system
-			 * @param steps ratios per dimension between finer and coarser system
-			 */
-			AverageGeneratorIterator(
-				const LinearSystemType & system,
-				const LinearSystemType & finer_subspace,
-				const ArrayType & steps
-			) noexcept :
-				_lin_sys( &system ),
-				_finer_subspace( &finer_subspace ),
-				_steps( &steps ),
-				_num_neighbors( std::accumulate( steps.cbegin(), steps.cend(), 1UL, std::multiplies< CoordType >() ) ),
-				_sys_iter( system.begin() ),
-				_subspace_iter( finer_subspace.begin() ),
-				_val( 0, 0, static_cast< ValueType >( 1 ) / static_cast< ValueType >( _num_neighbors ) )
-			{
-				update_coords();
-			}
-
-			void update_coords() noexcept {
-				_val._i = _sys_iter->get_linear_position();
-				_val._j = coarse_rows_to_finer_col();
-			}
+			private:
 
-			/**
-			 * Returns the row coordinates converted to the finer system, to compute
-			 * the column value.
-			 */
-			ColumnIndexType coarse_rows_to_finer_col() const noexcept {
-				ColumnIndexType finer = 0;
-				ColumnIndexType s = 1;
-				for( size_t i = 0; i < DIMS; i++ ) {
-					finer += s * _subspace_iter->get_position()[ i ];
-					s *= ( *_steps )[ i ];
-					finer += s * _sys_iter->get_position()[ i ];
-					s *= _lin_sys->get_sizes()[ i ];
+				const LinearSystemType * _lin_sys;
+				const LinearSystemType * _finer_subspace;
+				const ArrayType * _steps;
+				CoordType _num_neighbors;
+				LinearSystemIterType _sys_iter;
+				LinearSystemIterType _subspace_iter;
+				value_type _val;
+
+				/**
+				 * Construct a new AverageGeneratorIterator object starting from the
+				 * LinearizedNDimSystem object \p system describing the \b coarser system
+				 * and the \b ratios \p steps between each finer and the corresponding
+				 * coarser dimension.
+				 *
+				 * @param system LinearizedNDimSystem object describing the coarser system
+				 * @param finer_subspace LinearizedNDimSystem object describing the subspace
+				 *                       of each element in the finer system
+				 * @param steps Ratios per dimension between finer and coarser system
+				 */
+				AverageGeneratorIterator(
+					const LinearSystemType &system,
+					const LinearSystemType &finer_subspace,
+					const ArrayType &steps
+				) noexcept :
+					_lin_sys( &system ),
+					_finer_subspace( &finer_subspace ),
+					_steps( &steps ),
+					_num_neighbors( std::accumulate( steps.cbegin(), steps.cend(), 1UL,
+						std::multiplies< CoordType >() ) ),
+					_sys_iter( system.begin() ),
+					_subspace_iter( finer_subspace.begin() ),
+					_val( 0, 0, static_cast< ValueType >( 1 ) /
+						static_cast< ValueType >( _num_neighbors ) )
+				{
+					update_coords();
+				}
+
+				void update_coords() noexcept {
+					_val._i = _sys_iter->get_linear_position();
+					_val._j = coarse_rows_to_finer_col();
 				}
-				return finer;
-			}
-		};
-
-		/**
-		 * Builder object to create iterators that generate an averaging-coarsening matrix.
-		 *
-		 * It is a facility to generate beginning and end iterators and abstract the logic away from users.
-		 *
-		 * @tparam DIMS number of dimensions
-		 * @tparam CoordType type storing the coordinates and the sizes
-		 * @tparam ValueType type of the nonzero: it must be able to represent 1 (the value to sample
-		 *  the finer value)
-		 */
-		template<
-			size_t DIMS,
-			typename CoordType,
-			typename ValueType
-		> class AverageCoarsenerBuilder {
-		public:
-			using ArrayType = std::array< CoordType, DIMS >;
-			using Iterator = AverageGeneratorIterator< DIMS, CoordType, ValueType >;
-			using SelfType = AverageCoarsenerBuilder< DIMS, CoordType, ValueType >;
+
+				/**
+				 * Returns the row coordinates converted to the finer system, to compute
+				 * the column value.
+				 */
+				ColumnIndexType coarse_rows_to_finer_col() const noexcept {
+					ColumnIndexType finer = 0;
+					ColumnIndexType s = 1;
+					for( size_t i = 0; i < DIMS; i++ ) {
+						finer += s * _subspace_iter->get_position()[ i ];
+						s *= ( *_steps )[ i ];
+						finer += s * _sys_iter->get_position()[ i ];
+						s *= _lin_sys->get_sizes()[ i ];
+					}
+					return finer;
+				}
+
+			};
 
 			/**
-			 * Construct a new AverageCoarsenerBuilder object from the sizes of finer system
-			 * and those of the coarser system; finer sizes must be an exact multiple of coarser sizes,
-			 * otherwise an exception is raised.
+			 * Builder object to create iterators that generate an averaging-coarsening
+			 * matrix.
+			 *
+			 * It is a facility to generate beginning and end iterators and abstract the
+			 * logic away from users.
+			 *
+			 * @tparam DIMS number of dimensions
+			 * @tparam CoordType type storing the coordinates and the sizes
+			 * @tparam ValueType type of the nonzero: it must be able to represent the
+			 *                   averaging weight 1 / (number of finer neighbors)
 			 */
-			AverageCoarsenerBuilder(
-				const ArrayType & _finer_sizes,
-				const ArrayType & _coarser_sizes
-			) :
-				system( _coarser_sizes.begin(), _coarser_sizes.end() ),
-				_finer_subspace( _coarser_sizes.cbegin(), _coarser_sizes.cend() ),
-				steps( DIMS )
-			{
-				for( size_t i = 0; i < DIMS; i++ ) {
-					// finer size MUST be an exact multiple of coarser_size
-					std::ldiv_t ratio = std::ldiv( _finer_sizes[ i ], _coarser_sizes[ i ] );
-					if( ratio.quot < 2 || ratio.rem != 0 ) {
-						throw std::invalid_argument( std::string( "finer size of dimension " )
-							+ std::to_string( i ) + std::string( "is not an exact multiple of coarser size" ) );
+			template<
+				size_t DIMS,
+				typename CoordType,
+				typename ValueType
+			>
+			class AverageCoarsenerBuilder {
+
+				public:
+
+					typedef std::array< CoordType, DIMS > ArrayType;
+					typedef AverageGeneratorIterator< DIMS, CoordType, ValueType > Iterator;
+					typedef AverageCoarsenerBuilder< DIMS, CoordType, ValueType > SelfType;
+
+					/**
+					 * Construct a new AverageCoarsenerBuilder object from the sizes of finer
+					 * system and those of the coarser system; finer sizes must be an exact
+					 * multiple of coarser sizes, otherwise an exception is raised.
+					 */
+					AverageCoarsenerBuilder(
+						const ArrayType &_finer_sizes,
+						const ArrayType &_coarser_sizes
+					) :
+						system( _coarser_sizes.begin(), _coarser_sizes.end() ),
+						_finer_subspace( _coarser_sizes.cbegin(), _coarser_sizes.cend() ),
+						steps( DIMS )
+					{
+						for( size_t i = 0; i < DIMS; i++ ) {
+							// finer size MUST be an exact multiple of coarser_size
+							std::ldiv_t ratio = std::ldiv( _finer_sizes[ i ], _coarser_sizes[ i ] );
+							if( ratio.quot < 2 || ratio.rem != 0 ) {
+								throw std::invalid_argument(
+									std::string( "finer size of dimension " ) + std::to_string( i ) +
+									std::string( " is not an exact multiple of coarser size" ) );
+							}
+							steps[ i ] = ratio.quot;
+						}
+						_finer_subspace.retarget( steps );
 					}
-					steps[ i ] = ratio.quot;
-				}
-				_finer_subspace.retarget( steps );
-			}
 
-			AverageCoarsenerBuilder( const SelfType & ) = delete;
+				AverageCoarsenerBuilder( const SelfType & ) = delete;
 
-			AverageCoarsenerBuilder( SelfType && ) = delete;
+				AverageCoarsenerBuilder( SelfType && ) = delete;
 
-			SelfType & operator=( const SelfType & ) = delete;
+				SelfType & operator=( const SelfType & ) = delete;
 
-			SelfType & operator=( SelfType && ) = delete;
+				SelfType & operator=( SelfType && ) = delete;
 
-			/**
-			 * Returns the size of the finer system, i.e. its number of elements.
-			 */
-			size_t system_size() const {
-				return system.system_size();
-			}
+				/**
+				 * Returns the size of the coarser system, i.e. its number of elements.
+				 */
+				size_t system_size() const {
+					return system.system_size();
+				}
 
-			/**
-			 * Produces a beginning iterator to generate the coarsening matrix.
-			 */
-			Iterator make_begin_iterator() {
-				return Iterator( system, _finer_subspace, steps );
-			}
+				/**
+				 * Produces a beginning iterator to generate the coarsening matrix.
+				 */
+				Iterator make_begin_iterator() {
+					return Iterator( system, _finer_subspace, steps );
+				}
 
-			/**
-			 * Produces an end iteratormto stop the generation of the coarsening matrix.
-			 */
-			Iterator make_end_iterator() {
-				Iterator result( system, _finer_subspace, steps );
-				result += ( system_size() * _finer_subspace.system_size() ); // do not trigger boundary checks
-				// ++result;
-				return result;
-			}
-
-		private:
-			const grb::utils::multigrid::LinearizedNDimSystem< CoordType,
-				grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > > system;
-			grb::utils::multigrid::LinearizedNDimSystem< CoordType,
-				grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > > _finer_subspace;
-			///
-			/// array of steps, i.e. how much each column coordinate (finer system) must be
-			/// incremented when incrementing the row coordinates; it is the ratio between
-			//// #finer_sizes and row_generator#physical_sizes
-			grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > steps;
-		};
+				/**
+				 * Produces an end iterator to stop the generation of the coarsening
+				 * matrix.
+				 */
+				Iterator make_end_iterator() {
+					Iterator result( system, _finer_subspace, steps );
+					// do not trigger boundary checks
+					result += ( system_size() * _finer_subspace.system_size() );
+					return result;
+				}
+
+
+			private:
+
+				const grb::utils::multigrid::LinearizedNDimSystem<
+					CoordType,
+					grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType >
+				> system;
+
+				grb::utils::multigrid::LinearizedNDimSystem<
+					CoordType,
+					grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType >
+				> _finer_subspace;
+
+				/**
+				 * Array of steps, i.e. how much each column coordinate (finer system) must
+				 * be incremented when incrementing the row coordinates; it is the ratio,
+				 * per dimension, between the finer and the coarser sizes
+				 */
+				grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > steps;
+			};
+
+		} // namespace internal
 
 	} // namespace algorithms
+
 } // namespace grb
+
 #endif // _H_GRB_ALGORITHMS_AVERAGE_COARSENER
+
diff --git a/include/graphblas/algorithms/hpcg/system_building_utils.hpp b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
index 6ee46c7b3..37e6da311 100644
--- a/include/graphblas/algorithms/hpcg/system_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
@@ -243,7 +243,7 @@ namespace grb {
 			CoarseningData< IOType, NonzeroType > & coarsener
 		) {
 			return hpcg_populate_coarsener_any_builder<
-			grb::algorithms::AverageCoarsenerBuilder< DIMS, CoordType, NonzeroType > >(
+			grb::algorithms::hpcg::AverageCoarsenerBuilder< DIMS, CoordType, NonzeroType > >(
 				finer_system_generator, coarser_system_generator, coarsener );
 		}