From 474802143b3e3f2f43c558df5d0525d02879c025 Mon Sep 17 00:00:00 2001 From: byjtew Date: Wed, 24 May 2023 17:33:34 +0200 Subject: [PATCH 01/37] Add new unit-test for eWiseApply(matrices) --- tests/unit/CMakeLists.txt | 4 + tests/unit/eWiseApplyMatrix_variants.cpp | 284 +++++++++++++++++++++++ tests/unit/unittests.sh | 14 +- 3 files changed, 298 insertions(+), 4 deletions(-) create mode 100644 tests/unit/eWiseApplyMatrix_variants.cpp diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 16999fd42..815db9d2b 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -85,6 +85,10 @@ add_grb_executables( ewiseapply ewiseapply.cpp BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking ) +add_grb_executables( eWiseApplyMatrix_variants eWiseApplyMatrix_variants.cpp + BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking +) + add_grb_executables( eWiseMatrix eWiseMatrix.cpp BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking ) diff --git a/tests/unit/eWiseApplyMatrix_variants.cpp b/tests/unit/eWiseApplyMatrix_variants.cpp new file mode 100644 index 000000000..0e57b8f58 --- /dev/null +++ b/tests/unit/eWiseApplyMatrix_variants.cpp @@ -0,0 +1,284 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @author Benjamin Lozes + * @date 24th of May, 2023 + * + * @brief Test for eWiseApply(Matrix, Monoid) + * and eWiseApply(Matrix, Operator) variants + * + * This test is meant to ensure the behaviour of the eWiseApply(Matrix, Monoid) + * and eWiseApply(Matrix, Operator) variants is correct. Precisely, we expect + * the following behaviour: + * - eWiseApply(Matrix, Monoid) should apply the monoid to all elements of + * the two matrices, INCLUDING the couples (non_zero, zero), using the + * provided identity value for the zero elements. 
+ * - eWiseApply(Matrix, Operator) should apply the operator to all elements + * of the two matrices, EXCLUDING the couples (non_zero, zero) + * + */ + +#include +#include +#include +#include + +#include + +#define _DEBUG + +using nz_type = int; + +constexpr size_t M = 10; +constexpr size_t N = 10; +constexpr nz_type A_INITIAL_VALUE = 1; +constexpr nz_type B_INITIAL_VALUE = 3; + +namespace utils { + template< class Iterator > + void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) { +#ifndef _DEBUG + return; +#endif + std::cout << "Matrix \"" << name << "\" (" << rows << "x" << cols << "):" << std::endl << "[" << std::endl; + if( rows > 50 || cols > 50 ) { + os << " Matrix too large to print" << std::endl; + } else { + // os.precision( 3 ); + for( size_t y = 0; y < rows; y++ ) { + os << std::string( 3, ' ' ); + for( size_t x = 0; x < cols; x++ ) { + auto nnz_val = std::find_if( begin, end, [ y, x ]( const typename std::iterator_traits< Iterator >::value_type & a ) { + return a.first.first == y && a.first.second == x; + } ); + if( nnz_val != end ) + os << std::fixed << ( *nnz_val ).second; + else + os << '_'; + os << " "; + } + os << std::endl; + } + } + os << "]" << std::endl; + std::flush( os ); + } + + template< typename D > + void printSparseMatrix( const grb::Matrix< D > & mat, const std::string & name = "", std::ostream & os = std::cout ) { + grb::wait( mat ); + printSparseMatrixIterator( grb::nrows( mat ), grb::ncols( mat ), mat.cbegin(), mat.cend(), name, os ); + } + + template< typename D > + bool equals_matrix( const grb::Matrix< D > & A, const grb::Matrix< D > & B ) { + if( grb::nrows( A ) != grb::nrows( B ) || grb::ncols( A ) != grb::ncols( B ) ) + return false; + grb::wait( A ); + grb::wait( B ); + std::vector< std::pair< std::pair< size_t, size_t >, D > > A_vec( A.cbegin(), A.cend() ); + std::vector< std::pair< std::pair< size_t, size_t >, D > > B_vec( B.cbegin(), B.cend() ); + return std::is_permutation( A_vec.cbegin(), A_vec.cend(), B_vec.cbegin() ); + } +} // namespace utils + +template< class Monoid > +struct input_t { + const grb::Matrix< nz_type > & A; + const grb::Matrix< nz_type > & B; + const grb::Matrix< nz_type > & C_monoid; + const grb::Matrix< nz_type > & C_operator; + const Monoid & monoid; + + input_t( + const grb::Matrix< nz_type > & A = {0,0}, + const grb::Matrix< nz_type > & B = {0,0}, + const grb::Matrix< nz_type > & C_monoid = {0,0}, + const grb::Matrix< nz_type > & C_operator = {0,0}, + const Monoid & monoid = Monoid() ) : + A( A ), B( B ), C_monoid( C_monoid ), C_operator( C_operator ), monoid( monoid ) {} +}; + +struct output_t { + grb::RC rc; +}; + +template< class Monoid > +void grb_program( const input_t< Monoid > & input, output_t & output ) { + static_assert( grb::is_monoid< Monoid >::value, "Monoid required" ); + const auto & op = input.monoid.getOperator(); + grb::wait( input.A ); + grb::wait( input.B ); + + auto & rc = output.rc; + + utils::printSparseMatrix( input.A, "A" ); + utils::printSparseMatrix( input.B, "B" ); + + { // Operator variant + std::cout << "-- eWiseApply using Operator, supposed to be annihilating non-zeroes -> INTERSECTION\n"; + grb::Matrix< nz_type > C( grb::nrows( input.A ), grb::ncols( input.A ) ); + rc = grb::eWiseApply( C, input.A, input.B, op, grb::Phase::RESIZE ); + grb::wait( C ); + if( rc != grb::RC::SUCCESS ) { + std::cerr << "Error: Phase::RESIZE\n"; + return; + } + rc = grb::eWiseApply( C, input.A, input.B, op, 
grb::Phase::EXECUTE ); + grb::wait( C ); + if( rc != grb::RC::SUCCESS ) { + std::cerr << "Error: Phase::EXECUTE\n"; + return; + } + + if( ! utils::equals_matrix( C, input.C_operator ) ) { + std::cerr << "Error: Wrong result\n"; + utils::printSparseMatrix( C, "Obtained (operator)", std::cerr ); + utils::printSparseMatrix( input.C_operator, "Truth (operator)", std::cerr ); + rc = grb::RC::FAILED; + return; + } + + std::cout << "Result (operator) is correct\n"; + } + + { // Monoid variant + std::cout << "-- eWiseApply using Monoid, supposed to consider non-zeroes as the identity -> UNION\n"; + grb::Matrix< nz_type > C( grb::nrows( input.A ), grb::ncols( input.A ) ); + rc = grb::eWiseApply( C, input.A, input.B, input.monoid, grb::Phase::RESIZE ); + grb::wait( C ); + if( rc != grb::RC::SUCCESS ) { + std::cerr << "Error: Phase::RESIZE\n"; + return; + } + rc = grb::eWiseApply( C, input.A, input.B, input.monoid, grb::Phase::EXECUTE ); + grb::wait( C ); + if( rc != grb::RC::SUCCESS ) { + std::cerr << "Error: Phase::EXECUTE\n"; + return; + } + + if( ! utils::equals_matrix( C, input.C_monoid ) ) { + std::cerr << "Error: Wrong result\n"; + utils::printSparseMatrix( C, "Obtained (monoid)", std::cerr ); + utils::printSparseMatrix( input.C_monoid, "Truth (monoid)", std::cerr ); + rc = grb::RC::FAILED; + return; + } + + std::cout << "Result (monoid) is correct\n"; + } + + rc = grb::RC::SUCCESS; +} + +int main( int argc, char ** argv ) { + (void) argc; + (void) argv; + + if(argc > 1) std::cout << "Usage: " << argv[ 0 ] << std::endl; + + std::cout << "This is functional test " << argv[ 0 ] << std::endl; + grb::Launcher< grb::EXEC_MODE::AUTOMATIC > launcher; + grb::RC rc = grb::RC::SUCCESS; + + // Create input data + /** Matrix A: Row matrix filled with A_INITIAL_VALUE + * X X X X X + * _ _ _ _ _ + * _ _ _ _ _ (...) + * _ _ _ _ _ + * _ _ _ _ _ + * (...) + */ + grb::Matrix< nz_type > A( M, N, N ); + std::vector< size_t > A_rows( N, 0 ), A_cols( N, 0 ); + std::vector< nz_type > A_values( N, A_INITIAL_VALUE ); + std::iota( A_cols.begin(), A_cols.end(), 0 ); + rc = grb::buildMatrixUnique( A, A_rows.data(), A_cols.data(), A_values.data(), A_values.size(), grb::IOMode::SEQUENTIAL ); + assert( rc == grb::RC::SUCCESS ); + + /** Matrix B: Column matrix filled with B_INITIAL_VALUE + * Y _ _ _ _ + * Y _ _ _ _ + * Y _ _ _ _ (...) + * Y _ _ _ _ + * Y _ _ _ _ + * (...) + */ + grb::Matrix< nz_type > B( M, N, N ); + std::vector< size_t > B_rows( M, 0 ), B_cols( M, 0 ); + std::vector< nz_type > B_values( M, B_INITIAL_VALUE ); + std::iota( B_rows.begin(), B_rows.end(), 0 ); + rc = grb::buildMatrixUnique( B, B_rows.data(), B_cols.data(), B_values.data(), B_values.size(), grb::IOMode::SEQUENTIAL ); + assert( rc == grb::RC::SUCCESS ); + + { + /** Matrix C_monoid_truth: Union of A and B + * X+Y X X X X + * Y ___ ___ ___ ___ + * Y ___ ___ ___ ___ (...) + * Y ___ ___ ___ ___ + * Y ___ ___ ___ ___ + * (...) 
+ */ + grb::Matrix< nz_type > C_monoid_truth( M, N ); + size_t nvalues = grb::nrows( A ) + grb::ncols( B ) - 1; + std::vector< size_t > C_monoid_truth_rows( nvalues, 0 ), C_monoid_truth_cols( nvalues, 0 ); + std::vector< nz_type > C_monoid_truth_values( nvalues, 0 ); + C_monoid_truth_values[ 0 ] = A_INITIAL_VALUE + B_INITIAL_VALUE; + std::iota( C_monoid_truth_rows.begin() + grb::nrows( A ), C_monoid_truth_rows.end(), 1 ); + std::iota( C_monoid_truth_cols.begin() + 1, C_monoid_truth_cols.begin() + grb::nrows( A ), 1 ); + std::fill( C_monoid_truth_values.begin() + 1, C_monoid_truth_values.begin() + grb::nrows( A ), A_INITIAL_VALUE ); + std::fill( C_monoid_truth_values.begin() + grb::nrows( A ), C_monoid_truth_values.end(), B_INITIAL_VALUE ); + rc = grb::buildMatrixUnique( C_monoid_truth, C_monoid_truth_rows.data(), C_monoid_truth_cols.data(), C_monoid_truth_values.data(), C_monoid_truth_values.size(), grb::IOMode::SEQUENTIAL ); + assert( rc == grb::RC::SUCCESS ); + + /** Matrix C_op_truth: Intersection of A and B + * X+Y ___ ___ ___ ___ + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___ (...) + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___ + * (...) + */ + grb::Matrix< nz_type > C_op_truth( M, N ); + std::vector< size_t > C_op_truth_rows( 1, 0 ), C_op_truth_cols( 1, 0 ); + std::vector< nz_type > C_op_truth_values( 1, A_INITIAL_VALUE + B_INITIAL_VALUE ); + rc = grb::buildMatrixUnique( C_op_truth, C_op_truth_rows.data(), C_op_truth_cols.data(), C_op_truth_values.data(), C_op_truth_values.size(), grb::IOMode::SEQUENTIAL ); + assert( rc == grb::RC::SUCCESS ); + + { /** Test using addition operator, same type for lhs and rhs + */ + input_t< grb::Monoid< grb::operators::add< nz_type >, grb::identities::zero > > input { A, B, C_monoid_truth, C_op_truth, + grb::Monoid< grb::operators::add< nz_type >, grb::identities::zero >() }; + output_t output { grb::RC::SUCCESS }; + // Run the test + rc = launcher.exec( &grb_program, input, output, false ); + // Check the result + assert( rc == grb::RC::SUCCESS ); + if( output.rc != grb::RC::SUCCESS ) { + std::cout << "Test FAILED (" << grb::toString( output.rc ) << ")" << std::endl; + return 1; + } + } + } + + std::cout << "Test OK" << std::endl; + return 0; +} diff --git a/tests/unit/unittests.sh b/tests/unit/unittests.sh index 3817164c8..f34229c16 100755 --- a/tests/unit/unittests.sh +++ b/tests/unit/unittests.sh @@ -539,10 +539,16 @@ for MODE in ${MODES}; do grep 'Test OK' ${TEST_OUT_DIR}/eWiseApply_matrix_${MODE}_${BACKEND}_${P}_${T} || echo "Test FAILED" echo " " - echo ">>> [x] [ ] Testing grb::eWiseLambda (matrices)" - $runner ${TEST_BIN_DIR}/eWiseMatrix_${MODE}_${BACKEND} &> ${TEST_OUT_DIR}/eWiseMatrix_${MODE}_${BACKEND}_${P}_${T}.log - head -1 ${TEST_OUT_DIR}/eWiseMatrix_${MODE}_${BACKEND}_${P}_${T}.log - grep 'Test OK' ${TEST_OUT_DIR}/eWiseMatrix_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" + echo ">>> [x] [ ] Testing grb::id on vectors and matrices" + $runner ${TEST_BIN_DIR}/id_${MODE}_${BACKEND} &> ${TEST_OUT_DIR}/id_${MODE}_${BACKEND}_${P}_${T}.log + head -1 ${TEST_OUT_DIR}/id_${MODE}_${BACKEND}_${P}_${T}.log + grep 'Test OK' ${TEST_OUT_DIR}/id_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" + echo " " + + echo ">>> [x] [ ] Testing grb::eWiseApply (matrices, Monoid / Operator)" + $runner ${TEST_BIN_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND} &> ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND}_${P}_${T}.log + head -1 ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND}_${P}_${T}.log + grep 'Test OK' 
${TEST_OUT_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" echo " " echo ">>> [x] [ ] Testing grb::zip on two vectors of doubles and" From 0294312403fe3ac88571e431ad09b3d6974fb76c Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Thu, 29 Jun 2023 14:20:11 +0200 Subject: [PATCH 02/37] Implement Monoid variant of BLAS3::eWiseApply --- include/graphblas/reference/blas3.hpp | 333 ++++++++++++++++++++++---- 1 file changed, 285 insertions(+), 48 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index f3f918734..e77478564 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -26,6 +26,7 @@ #include #include +#include #include "io.hpp" #include "matrix.hpp" @@ -928,22 +929,20 @@ namespace grb { * \a allow_void is true; otherwise, will be ignored. * \endinternal */ - template< bool allow_void, Descriptor descr, - class MulMonoid, class Operator, + class Operator, typename OutputType, typename InputType1, typename InputType2, typename RIT1, typename CIT1, typename NIT1, typename RIT2, typename CIT2, typename NIT2, typename RIT3, typename CIT3, typename NIT3 > - RC eWiseApply_matrix_generic( + RC eWiseApply_matrix_generic_intersection( Matrix< OutputType, reference, RIT1, CIT1, NIT1 > &C, const Matrix< InputType1, reference, RIT2, CIT2, NIT2 > &A, const Matrix< InputType2, reference, RIT3, CIT3, NIT3 > &B, const Operator &oper, - const MulMonoid &mulMonoid, const Phase &phase, const typename std::enable_if< !grb::is_object< OutputType >::value && @@ -958,15 +957,14 @@ namespace grb { std::is_same< InputType1, void >::value || std::is_same< InputType2, void >::value ) ), - "grb::internal::eWiseApply_matrix_generic: the non-monoid version of " + "grb::internal::eWiseApply_matrix_generic_intersection: the non-monoid version of " "elementwise mxm can only be used if neither of the input matrices " "is a pattern matrix (of type void)" ); assert( phase != TRY ); #ifdef _DEBUG - std::cout << "In grb::internal::eWiseApply_matrix_generic\n"; + std::cout << "In grb::internal::eWiseApply_matrix_generic_intersection\n"; #endif - // get whether the matrices should be transposed prior to execution constexpr bool trans_left = descr & descriptors::transpose_left; constexpr bool trans_right = descr & descriptors::transpose_right; @@ -992,31 +990,6 @@ namespace grb { auto &C_raw = internal::getCRS( C ); auto &CCS_raw = internal::getCCS( C ); -#ifdef _DEBUG - std::cout << "\t\t A offset array = { "; - for( size_t i = 0; i <= m_A; ++i ) { - std::cout << A_raw.col_start[ i ] << " "; - } - std::cout << "}\n"; - for( size_t i = 0; i < m_A; ++i ) { - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - std::cout << "\t\t ( " << i << ", " << A_raw.row_index[ k ] << " ) = " - << A_raw.getPrintValue( k ) << "\n"; - } - } - std::cout << "\t\t B offset array = { "; - for( size_t j = 0; j <= m_B; ++j ) { - std::cout << B_raw.col_start[ j ] << " "; - } - std::cout << "}\n"; - for( size_t j = 0; j < m_B; ++j ) { - for( size_t k = B_raw.col_start[ j ]; k < B_raw.col_start[ j + 1 ]; ++k ) { - std::cout << "\t\t ( " << B_raw.row_index[ k ] << ", " << j << " ) = " - << B_raw.getPrintValue( k ) << "\n"; - } - } -#endif - // retrieve buffers char * arr1, * arr2, * arr3, * buf1, * buf2, * buf3; arr1 = arr2 = buf1 = buf2 = nullptr; @@ -1146,11 +1119,9 @@ namespace grb { for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = 
A_raw.row_index[ k ]; coors1.assign( k_col ); - valbuf[ k_col ] = A_raw.getValue( k, - mulMonoid.template getIdentity< typename Operator::D1 >() ); + valbuf[ k_col ] = A_raw.values[ k ]; #ifdef _DEBUG - std::cout << "A( " << i << ", " << k_col << " ) = " << A_raw.getValue( k, - mulMonoid.template getIdentity< typename Operator::D1 >() ) << ", "; + std::cout << "A( " << i << ", " << k_col << " ) = " << A_raw.values[ k ] << ", "; #endif } #ifdef _DEBUG @@ -1160,11 +1131,9 @@ namespace grb { const size_t l_col = B_raw.row_index[ l ]; if( coors1.assigned( l_col ) ) { coors2.assign( l_col ); - (void)grb::apply( valbuf[ l_col ], valbuf[ l_col ], B_raw.getValue( l, - mulMonoid.template getIdentity< typename Operator::D2 >() ), oper ); + (void)grb::apply( valbuf[ l_col ], valbuf[ l_col ], B_raw.values[ l ], oper ); #ifdef _DEBUG - std::cout << "B( " << i << ", " << l_col << " ) = " << B_raw.getValue( l, - mulMonoid.template getIdentity< typename Operator::D2 >() ) + std::cout << "B( " << i << ", " << l_col << " ) = " << B_raw.values[ l ] << " to yield C( " << i << ", " << l_col << " ), "; #endif } @@ -1190,6 +1159,278 @@ namespace grb { #endif } +#ifndef NDEBUG + for( size_t j = 0; j < n; ++j ) { + assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); + } +#endif + + // set final number of nonzeroes in output matrix + internal::setCurrentNonzeroes( C, nzc ); + } + + // done + return SUCCESS; + } + + /** + * \internal general elementwise matrix application that all eWiseApply + * variants refer to. + * @param[in] oper The operator corresponding to \a mulMonoid if + * \a allow_void is true; otherwise, an arbitrary operator + * under which to perform the eWiseApply. + * @param[in] mulMonoid The monoid under which to perform the eWiseApply if + * \a allow_void is true; otherwise, will be ignored. + * \endinternal + */ + template< + bool allow_void, + Descriptor descr, + class Monoid, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 + > + RC eWiseApply_matrix_generic_union( + Matrix< OutputType, reference, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, reference, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, reference, RIT3, CIT3, NIT3 > &B, + const Monoid &monoid, + const Phase &phase, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< Monoid >::value, + void >::type * const = nullptr + ) { + + assert( !(descr & descriptors::force_row_major ) ); + static_assert( allow_void || + ( !( + std::is_same< InputType1, void >::value || + std::is_same< InputType2, void >::value + ) ), + "grb::internal::eWiseApply_matrix_generic_union: the non-monoid version of " + "elementwise mxm can only be used if neither of the input matrices " + "is a pattern matrix (of type void)" ); + assert( phase != TRY ); +#ifdef _DEBUG + std::cout << "In grb::internal::eWiseApply_matrix_generic_union\n"; +#endif + // get whether the matrices should be transposed prior to execution + constexpr bool trans_left = descr & descriptors::transpose_left; + constexpr bool trans_right = descr & descriptors::transpose_right; + + // run-time checks + const size_t m = grb::nrows( C ); + const size_t n = grb::ncols( C ); + const size_t m_A = !trans_left ? grb::nrows( A ) : grb::ncols( A ); + const size_t n_A = !trans_left ? 
grb::ncols( A ) : grb::nrows( A ); + const size_t m_B = !trans_right ? grb::nrows( B ) : grb::ncols( B ); + const size_t n_B = !trans_right ? grb::ncols( B ) : grb::nrows( B ); + + // Identities + const auto identity_A = monoid.template getIdentity< OutputType >(); + const auto identity_B = monoid.template getIdentity< OutputType >(); + + if( m != m_A || m != m_B || n != n_A || n != n_B ) { + return MISMATCH; + } + + const auto oper = monoid.getOperator(); + const auto &A_raw = !trans_left ? + internal::getCRS( A ) : + internal::getCCS( A ); + const auto &B_raw = !trans_right ? + internal::getCRS( B ) : + internal::getCCS( B ); + auto &C_raw = internal::getCRS( C ); + auto &CCS_raw = internal::getCCS( C ); + + + // retrieve buffers + char * arr1, * arr2, * arr3, * buf1, * buf2, * buf3; + arr1 = arr2 = buf1 = buf2 = nullptr; + InputType1 * vbuf1 = nullptr; + InputType2 * vbuf2 = nullptr; + OutputType * valbuf = nullptr; + internal::getMatrixBuffers( arr1, buf1, vbuf1, 1, A ); + internal::getMatrixBuffers( arr2, buf2, vbuf2, 1, B ); + internal::getMatrixBuffers( arr3, buf3, valbuf, 1, C ); + // end buffer retrieval + + // initialisations + internal::Coordinates< reference > coors1, coors2; + coors1.set( arr1, false, buf1, n ); + coors2.set( arr2, false, buf2, n ); +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp parallel + { + size_t start, end; + config::OMP::localRange( start, end, 0, n + 1 ); +#else + const size_t start = 0; + const size_t end = n + 1; +#endif + for( size_t j = start; j < end; ++j ) { + CCS_raw.col_start[ j ] = 0; + } +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + } +#endif + // end initialisations + + // nonzero count + size_t nzc = 0; + + // symbolic phase + if( phase == RESIZE ) { + for( size_t i = 0; i < m; ++i ) { + coors1.clear(); + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; + coors1.assign( k_col ); + (void)++nzc; + } + for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { + const size_t l_col = B_raw.row_index[ l ]; + if( not coors1.assigned( l_col ) ) { + (void)++nzc; + } + } + } + + const RC ret = grb::resize( C, nzc ); +#ifdef _DEBUG + std::cout << "grb::resize( C, " << nzc << " ) = " << ret << "\n"; +#endif + return ret; + } + + // computational phase + if( phase == EXECUTE ) { + // retrieve additional buffer + config::NonzeroIndexType * const C_col_index = internal::template + getReferenceBuffer< typename config::NonzeroIndexType >( n + 1 ); + + // perform column-wise nonzero count + for( size_t i = 0; i < m; ++i ) { + coors1.clear(); + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; + coors1.assign( k_col ); + (void) ++nzc; + (void) ++CCS_raw.col_start[ k_col + 1 ]; + } + for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { + const size_t l_col = B_raw.row_index[ l ]; + if( not coors1.assigned( l_col ) ) { + (void) ++nzc; + (void) ++CCS_raw.col_start[ l_col + 1 ]; + } + } + } + + // check capacity + if( nzc > capacity( C ) ) { +#ifdef _DEBUG + std::cout << "\t detected insufficient capacity " + << "for requested operation\n"; +#endif + const RC clear_rc = clear( C ); + if( clear_rc != SUCCESS ) { + return PANIC; + } else { + return FAILED; + } + } + + // prefix sum for CCS_raw.col_start + assert( CCS_raw.col_start[ 0 ] == 0 ); + for( size_t j = 1; j < n; ++j ) { + CCS_raw.col_start[ j + 1 ] += CCS_raw.col_start[ j ]; + } + assert( CCS_raw.col_start[ n ] == nzc ); + + // set 
C_col_index to all zero +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp parallel for simd +#endif + for( size_t j = 0; j < n; j++ ) { + C_col_index[ j ] = 0; + } + + + // do computations + std::vector< bool > columns( n, false ); + size_t nzc = 0; + C_raw.col_start[ 0 ] = 0; + for( size_t i = 0; i < m; ++i ) { + std::fill( columns.begin(), columns.end(), false ); + +#ifdef _DEBUG + std::cout << " -- i: " << i << "\n"; +#endif + + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; + columns[ k_col ] = true; + valbuf[ k_col ] = A_raw.getValue( k, identity_A ); +#ifdef _DEBUG + std::cout << "Found A( " << i << ", " << k_col << " ) = " << A_raw.getValue( k, identity_A ) << "\n"; +#endif + } + + for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { + const size_t l_col = B_raw.row_index[ l ]; + if( columns[ l_col ] ) { // Intersection case + const auto valbuf_value_before = valbuf[ l_col ]; + (void)grb::apply( valbuf[ l_col ], valbuf_value_before, B_raw.getValue( l, identity_B ), oper ); +#ifdef _DEBUG + std::cout << "Found intersection: B(" << i << ";" << l_col << ")=" << B_raw.getValue( l, identity_B ) + << " && A(" << i << ";" << l_col << ")=" << valbuf_value_before + << " ==> C(" << i << ";" << l_col << ")=" << valbuf[ l_col ] << "\n"; +#endif + } else { // Union case +#ifdef _DEBUG + std::cout << "Found B( " << i << ", " << l_col << " ) = " << B_raw.getValue( l, identity_B ) << "\n"; +#endif + columns[ l_col ] = true; + valbuf[ l_col ] = B_raw.getValue( l, identity_B ); + } + } + + for( size_t j_unsigned = columns.size() ; j_unsigned > 0 ; j_unsigned-- ) { + const size_t j = j_unsigned - 1; + if( not columns[ j ] ) { + continue; + } + // update CRS + C_raw.row_index[ nzc ] = j; + C_raw.setValue( nzc, valbuf[ j ] ); + // update CCS + C_col_index[ j ]++; + const size_t CCS_index = CCS_raw.col_start[ j+1 ] - C_col_index[ j ]; + CCS_raw.row_index[ CCS_index ] = i; + CCS_raw.setValue( CCS_index, valbuf[ j ] ); + // update count + (void)++nzc; + } + C_raw.col_start[ i + 1 ] = nzc; + } + +#ifdef _DEBUG + std::cout << "CCS_raw.col_start = [ "; + for( size_t j = 0; j <= n; ++j ) + std::cout << CCS_raw.col_start[ j ] << " "; + std::cout << "]\n"; + std::cout << "C_col_index = [ "; + for( size_t j = 0; j < n; ++j ) + std::cout << C_col_index[ j ] << " "; + std::cout << "]\n"; +#endif #ifndef NDEBUG for( size_t j = 0; j < n; ++j ) { assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); @@ -1257,8 +1498,8 @@ namespace grb { std::cout << "In grb::eWiseApply_matrix_generic (reference, monoid)\n"; #endif - return internal::eWiseApply_matrix_generic< true, descr >( - C, A, B, mulmono.getOperator(), mulmono, phase + return internal::eWiseApply_matrix_generic_union< true, descr >( + C, A, B, mulmono, phase ); } @@ -1317,12 +1558,8 @@ namespace grb { "input matrices is a pattern matrix (of type void)" ); - typename grb::Monoid< - grb::operators::mul< double >, - grb::identities::one - > dummyMonoid; - return internal::eWiseApply_matrix_generic< false, descr >( - C, A, B, mulOp, dummyMonoid, phase + return internal::eWiseApply_matrix_generic_intersection< false, descr >( + C, A, B, mulOp, phase ); } From 34130f3302d6b1183bdf0be027dc28dc0330603b Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Thu, 29 Jun 2023 14:20:45 +0200 Subject: [PATCH 03/37] Nonblocking implementation fix --- include/graphblas/nonblocking/blas3.hpp | 98 ++++++++++--------------- 1 file changed, 39 insertions(+), 59 
deletions(-) diff --git a/include/graphblas/nonblocking/blas3.hpp b/include/graphblas/nonblocking/blas3.hpp index 5a222c7f2..5ecb1fffa 100644 --- a/include/graphblas/nonblocking/blas3.hpp +++ b/include/graphblas/nonblocking/blas3.hpp @@ -416,56 +416,6 @@ namespace grb { ); } - namespace internal { - - template< - bool allow_void, - Descriptor descr, - class MulMonoid, class Operator, - typename OutputType, typename InputType1, typename InputType2, - typename RIT1, typename CIT1, typename NIT1, - typename RIT2, typename CIT2, typename NIT2, - typename RIT3, typename CIT3, typename NIT3 - > - RC eWiseApply_matrix_generic( - Matrix< OutputType, nonblocking, RIT1, CIT1, NIT1 > &C, - const Matrix< InputType1, nonblocking, RIT2, CIT2, NIT2 > &A, - const Matrix< InputType2, nonblocking, RIT3, CIT3, NIT3 > &B, - const Operator &oper, - const MulMonoid &mulMonoid, - const Phase &phase, - const typename std::enable_if< - !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_operator< Operator >::value, - void >::type * const = nullptr - ) { - if( internal::NONBLOCKING::warn_if_not_native && - config::PIPELINE::warn_if_not_native - ) { - std::cerr << "Warning: eWiseApply (nonblocking) currently delegates to a " - << "blocking implementation.\n" - << " Further similar such warnings will be suppressed.\n"; - internal::NONBLOCKING::warn_if_not_native = false; - } - - // nonblocking execution is not supported - // first, execute any computation that is not completed - le.execution(); - - // second, delegate to the reference backend - return eWiseApply_matrix_generic< - allow_void, descr, - MulMonoid, Operator - >( - getRefMatrix( C ), getRefMatrix( A ), getRefMatrix( B ), - oper, mulMonoid, phase - ); - } - - } // namespace internal - template< Descriptor descr = descriptors::no_operation, class MulMonoid, @@ -507,11 +457,26 @@ namespace grb { ); #ifdef _DEBUG - std::cout << "In grb::eWiseApply_matrix_generic (nonblocking, monoid)\n"; + std::cout << "In grb::eWiseApply (nonblocking, monoid)\n"; #endif + if( internal::NONBLOCKING::warn_if_not_native && config::PIPELINE::warn_if_not_native ) { + std::cerr << "Warning: eWiseApply (nonblocking) currently delegates to a " + << "blocking implementation.\n" + << " Further similar such warnings will be suppressed.\n"; + internal::NONBLOCKING::warn_if_not_native = false; + } + + // nonblocking execution is not supported + // first, execute any computation that is not completed + internal::le.execution(); - return internal::eWiseApply_matrix_generic< true, descr >( - C, A, B, mulmono.getOperator(), mulmono, phase + // second, delegate to the reference backend + return eWiseApply< descr >( + internal::getRefMatrix( C ), + internal::getRefMatrix( A ), + internal::getRefMatrix( B ), + mulmono, + phase ); } @@ -561,13 +526,28 @@ namespace grb { "the operator version of eWiseApply cannot be used if either of the " "input matrices is a pattern matrix (of type void)" ); + if( internal::NONBLOCKING::warn_if_not_native && config::PIPELINE::warn_if_not_native ) { + std::cerr << "Warning: eWiseApply (nonblocking) currently delegates to a " + << "blocking implementation.\n" + << " Further similar such warnings will be suppressed.\n"; + internal::NONBLOCKING::warn_if_not_native = false; + } - typename grb::Monoid< - grb::operators::mul< double >, - grb::identities::one - > dummyMonoid; - return internal::eWiseApply_matrix_generic< false, descr >( - C, A, B, mulOp, dummyMonoid, phase +#ifdef _DEBUG 
+ std::cout << "In grb::eWiseApply (nonblocking, op)\n"; +#endif + + // nonblocking execution is not supported + // first, execute any computation that is not completed + internal::le.execution(); + + // second, delegate to the reference backend + return eWiseApply< descr >( + internal::getRefMatrix( C ), + internal::getRefMatrix( A ), + internal::getRefMatrix( B ), + mulOp, + phase ); } From 535e89c38d6ccbbd923690b8b78ced86752bfaa2 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Thu, 29 Jun 2023 18:23:29 +0200 Subject: [PATCH 04/37] Fix eWiseApplyMatrixReference unit-test to handle Monoid variant --- tests/unit/eWiseApplyMatrixReference.cpp | 384 ++++++++++++++--------- 1 file changed, 238 insertions(+), 146 deletions(-) diff --git a/tests/unit/eWiseApplyMatrixReference.cpp b/tests/unit/eWiseApplyMatrixReference.cpp index 63c2ad6df..18f98df0d 100644 --- a/tests/unit/eWiseApplyMatrixReference.cpp +++ b/tests/unit/eWiseApplyMatrixReference.cpp @@ -15,221 +15,314 @@ * limitations under the License. */ +#include #include #include +#include #include -// static data corresponding to small matrices - -static const size_t I_A[] = { 0, 0, 1, 1, 2, 2, 3, 3 }; -static const size_t J_A[] = { 0, 2, 1, 2, 2, 3, 0, 2 }; -static const double V_A[] = { 1, 3, 4, 2, 6, 7, 5, 8 }; +#define _DEBUG -static const size_t I_B[] = { 0, 0, 1, 2, 3, 3 }; -static const size_t J_B[] = { 0, 3, 1, 1, 2, 3 }; -static const double V_B[] = { 9, 10, 11, 12, 14, 13 }; - -static const size_t I_C[] = { 0, 1, 3 }; -static const size_t J_C[] = { 0, 1, 2 }; -static const double V_C[] = { 9, 44, 112 }; +template< class Iterator > +void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) { +#ifndef _DEBUG + return; +#endif + std::cout << "Matrix \"" << name << "\" (" << rows << "x" << cols << "):" << std::endl << "[" << std::endl; + if( rows > 50 || cols > 50 ) { + os << " Matrix too large to print" << std::endl; + } else { + os.precision( 3 ); + for( size_t y = 0; y < rows; y++ ) { + os << std::string( 3, ' ' ); + for( size_t x = 0; x < cols; x++ ) { + auto nnz_val = std::find_if( begin, end, [ y, x ]( const typename std::iterator_traits< Iterator >::value_type & a ) { + return a.first.first == y && a.first.second == x; + } ); + if( nnz_val != end ) + os << std::fixed << std::setw( 3 ) << ( *nnz_val ).second; + else + os << "___"; + os << " "; + } + os << std::endl; + } + } + os << "]" << std::endl; + std::flush( os ); +} -static const size_t rowlens[] = { 1, 1, 0, 1 }; -static const size_t collens[] = { 1, 1, 1, 0 }; +template< typename D > +void printSparseMatrix( const grb::Matrix< D > & mat, const std::string & name = "", std::ostream & os = std::cout ) { + grb::wait( mat ); + printSparseMatrixIterator( grb::nrows( mat ), grb::ncols( mat ), mat.cbegin(), mat.cend(), name, os ); +} -static const double expect1_CRS[] = { 9, 44, 112 }; -static const double expect1_CCS[] = { 9, 44, 112 }; +// static data corresponding to small matrices -static const double expect2_CRS[] = { 1, 4, 8 }; -static const double expect2_CCS[] = { 1, 4, 8 }; +/** + * A: + * 1 _ 3 _ + * _ 4 2 _ + * _ _ 6 7 + * 5 _ _ 8 + */ +static const std::vector< size_t > I_A { 0, 0, 1, 1, 2, 2, 3, 3 }; +static const std::vector< size_t > J_A { 0, 2, 1, 2, 2, 3, 0, 2 }; +static const std::vector< int > V_A { 1, 3, 4, 2, 6, 7, 5, 8 }; + +/** + * B: + * 9 __ __ __ + * __ 11 12 __ + * __ 14 __ __ + * __ __ __ 13 + */ +static const std::vector< size_t > I_B { 0, 0, 1, 2, 3, 3 
}; +static const std::vector< size_t > J_B { 0, 3, 1, 1, 2, 3 }; +static const std::vector< int > V_B { 9, 10, 11, 12, 14, 13 }; + +/** + * C_intersection: + * 9 ___ ___ ___ + * ___ 44 ___ ___ + * ___ ___ ___ ___ + * ___ ___ 112 ___ + */ +static const std::vector< size_t > I_C_intersection { 0, 1, 3 }; +static const std::vector< size_t > J_C_intersection { 0, 1, 2 }; +static const std::vector< int > V_C_intersection { 9, 44, 112 }; + +/** + * C_union_A_B: + * 9 ___ 3 10 + * ___ 44 2 ___ + * ___ 12 6 7 + * 5 ___ 112 13 + */ -static const double expect3_CRS[] = { 9, 11, 14 }; -static const double expect3_CCS[] = { 9, 11, 14 }; +static const std::vector< size_t > I_C_union { 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3 }; +static const std::vector< size_t > J_C_union { 0, 2, 3, 1, 2, 1, 2, 3, 0, 2, 3 }; +static const std::vector< int > V_C_union_A_B { 9, 3, 10, 44, 2, 12, 6, 7, 5, 112, 13 }; -static const double expect4_CRS[] = { 1, 1, 1 }; -static const double expect4_CCS[] = { 1, 1, 1 }; +/** + * C_union_A_B_pattern: + * 1 _ 3 1 + * _ 4 2 _ + * _ 1 6 7 + * 5 _ 8 1 + */ +static const std::vector< size_t > I_C_union_A_B_pattern { 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3 }; +static const std::vector< size_t > J_C_union_A_B_pattern { 0, 2, 3, 1, 2, 1, 2, 3, 0, 2, 3 }; +static const std::vector< int > V_C_union_A_B_pattern { 1, 3, 1, 4, 2, 1, 6, 7, 5, 8, 1 }; + +/** + * C_union_A_pattern_B: + * 9 __ 1 10 + * __ 11 1 __ + * __ 12 1 1 + * 1 __ 14 13 + */ +static const std::vector< size_t > I_C_union_A_pattern_B { 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3 }; +static const std::vector< size_t > J_C_union_A_pattern_B { 0, 2, 3, 1, 2, 1, 2, 3, 0, 2, 3 }; +static const std::vector< int > V_C_union_A_pattern_B { 9, 1, 10, 11, 1, 12, 1, 1, 1, 14, 13 }; + +/** + * C_union_A_pattern_B_pattern: + * 1 _ 1 1 + * _ 1 1 _ + * _ 1 1 1 + * 1 _ 1 1 + */ +static const std::vector< size_t > I_C_union_A_pattern_B_pattern { 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3 }; +static const std::vector< size_t > J_C_union_A_pattern_B_pattern { 0, 2, 3, 1, 2, 1, 2, 3, 0, 2, 3 }; +static const std::vector< int > V_C_union_A_pattern_B_pattern { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; // helper function to check internal data structures // of the reference backend template< typename T > -void checkCRSandCCS( const grb::Matrix< T > &C, - const size_t n, - const size_t * rlens, - const size_t * clens, - const size_t * I, - const size_t * J, - const double * expect_CRS, - const double * expect_CCS, - grb::RC &rc -) { - // check CRS output - const auto &crs1 = grb::internal::getCRS( C ); - for( size_t i = 0; i < n; ++i ) { - const size_t entries = crs1.col_start[ i + 1 ] - crs1.col_start[ i ]; - if( entries != rlens[ i ] ) { - std::cerr << "Error: unexpected number of entries " << entries << ", " - << " expected " << rlens[ i ] << " (CRS).\n"; - rc = grb::FAILED; - } - for( size_t k = crs1.col_start[ i ]; k < crs1.col_start[ i + 1 ]; ++k ) { - if( crs1.row_index[ k ] != J[ k ] ) { - std::cerr << "Error: unexpected entry at ( " << i << ", " - << crs1.row_index[ k ] << " ), " - << "expected one at ( " << i << ", " << J[ k ] << " ) " - << "instead (CRS).\n"; - rc = grb::FAILED; - } - if( crs1.values[ k ] != expect_CRS[ k ] ) { - std::cerr << "Error: unexpected value " << crs1.values[ k ] << "; " - << "expected " << expect_CRS[ k ] << " (CRS).\n"; - rc = grb::FAILED; +void checkCRSandCCS( const grb::Matrix< T > & obtained, const grb::Matrix< T > & expected, grb::RC & rc ) { + { // check CRS output + const auto & crsObtained = grb::internal::getCRS( obtained ); + const auto & 
crsExpected = grb::internal::getCRS( expected ); + for( size_t i = 0; i < grb::nrows( obtained ); ++i ) { + for( size_t k = crsObtained.col_start[ i ]; k < crsObtained.col_start[ i + 1 ]; ++k ) { + if( crsObtained.row_index[ k ] != crsExpected.row_index[ k ] ) { + std::cerr << "Error: unexpected entry at ( " << i << ", " << crsObtained.row_index[ k ] << " ), " + << "expected one at ( " << i << ", " << crsExpected.row_index[ k ] << " ) " + << "instead (CRS).\n"; + rc = grb::FAILED; + } + if( crsObtained.values[ k ] != crsExpected.values[ k ] ) { + std::cerr << "Error: unexpected value " << crsObtained.values[ k ] << "; " + << "expected " << crsExpected.values[ k ] << " (CRS).\n"; + rc = grb::FAILED; + } } } } - // check CCS output - const auto &ccs1 = grb::internal::getCCS( C ); - for( size_t j = 0; j < n; ++j ) { - const size_t entries = ccs1.col_start[ j + 1 ] - ccs1.col_start[ j ]; - if( entries != clens[ j ] ) { - std::cerr << "Error: unexpected number of entries " << entries << ", " - << "expected " << clens[ j ] << " (CCS).\n"; - rc = grb::FAILED; - } - for( size_t k = ccs1.col_start[ j ]; k < ccs1.col_start[ j + 1 ]; ++k ) { - if( ccs1.row_index[ k ] != I[ k ] ) { - std::cerr << "Error: unexpected entry at " - << "( " << ccs1.row_index[ k ] << ", " << j << " ), " - << "expected one at ( " << I[ k ] << ", " << j << " ) " - << "instead (CCS).\n"; - rc = grb::FAILED; - } - if( ccs1.values[ k ] != expect_CCS[ k ] ) { - std::cerr << "Error: unexpected value " << ccs1.values[ k ] << "; " - << "expected " << expect_CCS[ k ] << " (CCS).\n"; - rc = grb::FAILED; + { // check CCS output + const auto & ccsObtained = grb::internal::getCCS( obtained ); + const auto & ccsExpected = grb::internal::getCCS( expected ); + for( size_t j = 0; j < grb::ncols( obtained ); ++j ) { + for( size_t k = ccsExpected.col_start[ j ]; k < ccsExpected.col_start[ j + 1 ]; ++k ) { + if( ccsObtained.row_index[ k ] != ccsExpected.row_index[ k ] ) { + std::cerr << "Error: unexpected entry at " + << "( " << ccsObtained.row_index[ k ] << ", " << j << " ), " + << "expected one at ( " << ccsExpected.row_index[ k ] << ", " << j << " ) " + << "instead (CCS).\n"; + rc = grb::FAILED; + } + if( ccsObtained.values[ k ] != ccsExpected.values[ k ] ) { + std::cerr << "Error: unexpected value " << ccsObtained.values[ k ] << "; " + << "expected " << ccsExpected.values[ k ] << " (CCS).\n"; + rc = grb::FAILED; + } } } } } -void grbProgram( const void *, const size_t, grb::RC &rc ) { +void grbProgram( const void *, const size_t, grb::RC & rc ) { // initialize test - grb::Monoid< grb::operators::mul< double >, grb::identities::one > mulmono; + grb::Monoid< grb::operators::mul< int >, grb::identities::one > mulmono; const size_t n = 4; const size_t nelts_A = 8; const size_t nelts_B = 6; - grb::Matrix< double > A( n, n ); - grb::Matrix< double > B( n, n ); + grb::Matrix< int > A( n, n ); + grb::Matrix< int > B( n, n ); grb::Matrix< void > A_pattern( n, n ); grb::Matrix< void > B_pattern( n, n ); - grb::Matrix< double > C( n, n ); + grb::Matrix< int > C( n, n ); rc = grb::resize( A, nelts_A ); if( rc == grb::SUCCESS ) { - rc = grb::buildMatrixUnique( A, I_A, J_A, V_A, nelts_A, grb::SEQUENTIAL ); + rc = grb::buildMatrixUnique( A, I_A.data(), J_A.data(), V_A.data(), nelts_A, grb::SEQUENTIAL ); } if( rc == grb::SUCCESS ) { rc = grb::resize( B, nelts_B ); } if( rc == grb::SUCCESS ) { - rc = grb::buildMatrixUnique( B, I_B, J_B, V_B, nelts_B, grb::SEQUENTIAL ); + rc = grb::buildMatrixUnique( B, I_B.data(), J_B.data(), V_B.data(), nelts_B, 
grb::SEQUENTIAL ); } if( rc == grb::SUCCESS ) { rc = grb::resize( A_pattern, nelts_A ); } if( rc == grb::SUCCESS ) { - rc = grb::buildMatrixUnique( A_pattern, I_A, J_A, nelts_A, grb::SEQUENTIAL ); + rc = grb::buildMatrixUnique( A_pattern, I_A.data(), J_A.data(), nelts_A, grb::SEQUENTIAL ); } if( rc == grb::SUCCESS ) { rc = grb::resize( B_pattern, nelts_B ); } if( rc == grb::SUCCESS ) { - rc = grb::buildMatrixUnique( B_pattern, I_B, J_B, nelts_B, grb::SEQUENTIAL ); + rc = grb::buildMatrixUnique( B_pattern, I_B.data(), J_B.data(), nelts_B, grb::SEQUENTIAL ); } if( rc != grb::SUCCESS ) { std::cerr << "\tinitialisation FAILED\n"; return; } - // test 1: compute with the monoid mxm_elementwise - std::cout << "\t Verifying the monoid version of mxm_elementwise, " - << "A and B value matrices\n"; - rc = grb::eWiseApply( C, A, B, mulmono, grb::RESIZE ); - rc = rc ? rc : grb::eWiseApply( C, A, B, mulmono ); - if( rc != grb::SUCCESS ) { - std::cerr << "Call to grb::eWiseApply FAILED\n"; - return; - } - - checkCRSandCCS( C, n, rowlens, collens, I_C, J_C, expect1_CRS, expect1_CCS, rc ); - - if( rc != grb::SUCCESS ) { - return; - } + printSparseMatrix( A, "A" ); + printSparseMatrix( B, "B" ); + + { // test 1: compute with the monoid mxm_elementwise + std::cout << "\t Verifying the monoid version of mxm_elementwise, " + << "A and B value matrices\n"; + rc = grb::eWiseApply( C, A, B, mulmono, grb::RESIZE ); + rc = rc ? rc : grb::eWiseApply( C, A, B, mulmono ); + printSparseMatrix( C, "eWiseApply( C, A, B, mulmono )" ); + if( rc != grb::SUCCESS ) { + std::cerr << "Call to grb::eWiseApply FAILED\n"; + return; + } + grb::Matrix< int > union_A_B( n, n ); + grb::buildMatrixUnique( union_A_B, I_C_union.data(), J_C_union.data(), V_C_union_A_B.data(), I_C_union.size(), grb::SEQUENTIAL ); + checkCRSandCCS( C, union_A_B, rc ); - // test 2: compute with the monoid mxm_elementwise, A value matrix, B pattern matrix \n"; - std::cout << "\t Verifying the monoid version of mxm_elementwise, " - << "A value matrix, B pattern matrix\n"; - rc = grb::eWiseApply( C, A, B_pattern, mulmono, grb::RESIZE ); - rc = rc ? rc : grb::eWiseApply( C, A, B_pattern, mulmono ); - if( rc != grb::SUCCESS ) { - std::cerr << "Call to grb::eWiseApply FAILED\n"; - return; + if( rc != grb::SUCCESS ) { + return; + } } - checkCRSandCCS( C, n, rowlens, collens, I_C, J_C, expect2_CRS, expect2_CCS, rc ); - - if( rc != grb::SUCCESS ) { - return; - } + { // test 2: compute with the monoid mxm_elementwise, A value matrix, B pattern matrix \n"; + std::cout << "\t Verifying the monoid version of mxm_elementwise, " + << "A value matrix, B pattern matrix\n"; + rc = grb::eWiseApply( C, A, B_pattern, mulmono, grb::RESIZE ); + rc = rc ? rc : grb::eWiseApply( C, A, B_pattern, mulmono ); + printSparseMatrix( C, "eWiseApply( C, A, B_pattern, mulmono )" ); + if( rc != grb::SUCCESS ) { + std::cerr << "Call to grb::eWiseApply FAILED\n"; + return; + } + grb::Matrix< int > union_A_B_pattern( n, n ); + grb::buildMatrixUnique( union_A_B_pattern, I_C_union_A_B_pattern.data(), J_C_union_A_B_pattern.data(), V_C_union_A_B_pattern.data(), I_C_union_A_B_pattern.size(), grb::SEQUENTIAL ); + checkCRSandCCS( C, union_A_B_pattern, rc ); - // test 3: compute with the monoid mxm_elementwise, A pattern matrix, B value matrix \n"; - std::cout << "\t Verifying the monoid version of mxm_elementwise, " - << "A pattern matrix, B value matrix\n"; - rc = grb::eWiseApply( C, A_pattern, B, mulmono, grb::RESIZE ); - rc = rc ? 
rc : grb::eWiseApply( C, A_pattern, B, mulmono ); - if( rc != grb::SUCCESS ) { - std::cerr << "Call to grb::eWiseApply FAILED\n"; - return; + if( rc != grb::SUCCESS ) { + return; + } } - checkCRSandCCS( C, n, rowlens, collens, I_C, J_C, expect3_CRS, expect3_CCS, rc ); - - if( rc != grb::SUCCESS ) { - return; - } + { // test 3: compute with the monoid mxm_elementwise, A pattern matrix, B value matrix \n"; + std::cout << "\t Verifying the monoid version of mxm_elementwise, " + << "A pattern matrix, B value matrix\n"; + rc = grb::eWiseApply( C, A_pattern, B, mulmono, grb::RESIZE ); + rc = rc ? rc : grb::eWiseApply( C, A_pattern, B, mulmono ); + printSparseMatrix( C, "eWiseApply( C, A_pattern, B, mulmono )" ); + if( rc != grb::SUCCESS ) { + std::cerr << "Call to grb::eWiseApply FAILED\n"; + return; + } + grb::Matrix< int > union_A_pattern_B( n, n ); + grb::buildMatrixUnique( union_A_pattern_B, I_C_union_A_pattern_B.data(), J_C_union_A_pattern_B.data(), V_C_union_A_pattern_B.data(), I_C_union_A_pattern_B.size(), grb::SEQUENTIAL ); + checkCRSandCCS( C, union_A_pattern_B, rc ); - // test 4: compute with the monoid mxm_elementwise, A pattern matrix, B pattern matrix \n"; - std::cout << "\t Verifying the monoid version of mxm_elementwise, " - << "A pattern matrix, B pattern matrix\n"; - rc = grb::eWiseApply( C, A_pattern, B_pattern, mulmono, grb::RESIZE ); - rc = rc ? rc : grb::eWiseApply( C, A_pattern, B_pattern, mulmono ); - if( rc != grb::SUCCESS ) { - std::cerr << "Call to grb::eWiseApply FAILED\n"; - return; + if( rc != grb::SUCCESS ) { + return; + } } - checkCRSandCCS( C, n, rowlens, collens, I_C, J_C, expect4_CRS, expect4_CCS, rc ); + { // test 4: compute with the monoid mxm_elementwise, A pattern matrix, B pattern matrix \n"; + std::cout << "\t Verifying the monoid version of mxm_elementwise, " + << "A pattern matrix, B pattern matrix\n"; + rc = grb::eWiseApply( C, A_pattern, B_pattern, mulmono, grb::RESIZE ); + rc = rc ? rc : grb::eWiseApply( C, A_pattern, B_pattern, mulmono ); + printSparseMatrix( C, "eWiseApply( C, A_pattern, B_pattern, mulmono )" ); + if( rc != grb::SUCCESS ) { + std::cerr << "Call to grb::eWiseApply FAILED\n"; + return; + } + grb::Matrix< int > union_A_pattern_B_pattern( n, n ); + grb::buildMatrixUnique( union_A_pattern_B_pattern, I_C_union_A_pattern_B_pattern.data(), J_C_union_A_pattern_B_pattern.data(), V_C_union_A_pattern_B_pattern.data(), I_C_union_A_pattern_B_pattern.size(), grb::SEQUENTIAL ); + checkCRSandCCS( C, union_A_pattern_B_pattern, rc ); - if( rc != grb::SUCCESS ) { - return; + if( rc != grb::SUCCESS ) { + return; + } } - // test 5: compute with the operator mxm_elementwise (pattern matrices not allowed) \n"; - std::cout << "\t Verifying the operator version of mxm_elementwise " - << "(only value matrices)\n"; - rc = grb::eWiseApply( C, A, B, mulmono.getOperator(), grb::RESIZE ); - rc = rc ? rc : grb::eWiseApply( C, A, B, mulmono.getOperator() ); - if( rc != grb::SUCCESS ) { - std::cerr << "Call to grb::eWiseApply FAILED\n"; - return; + { // test 5: compute with the operator mxm_elementwise (pattern matrices not allowed) \n"; + std::cout << "\t Verifying the operator version of mxm_elementwise " + << "(only value matrices)\n"; + rc = grb::eWiseApply( C, A, B, mulmono.getOperator(), grb::RESIZE ); + rc = rc ? 
rc : grb::eWiseApply( C, A, B, mulmono.getOperator() ); + printSparseMatrix( C, "eWiseApply( C, A, B, mulmono.getOperator() )" ); + if( rc != grb::SUCCESS ) { + std::cerr << "Call to grb::eWiseApply FAILED\n"; + return; + } + grb::Matrix< int > intersection_A_B( n, n ); + grb::buildMatrixUnique( intersection_A_B, I_C_intersection.data(), J_C_intersection.data(), V_C_intersection.data(), I_C_intersection.size(), grb::SEQUENTIAL ); + checkCRSandCCS( C, intersection_A_B, rc ); + if( rc != grb::SUCCESS ) { + return; + } } - - checkCRSandCCS( C, n, rowlens, collens, I_C, J_C, expect1_CRS, expect1_CCS, rc ); } int main( int argc, char ** argv ) { @@ -252,4 +345,3 @@ int main( int argc, char ** argv ) { // done return 0; } - From a7bf8dfb8380719c3031cc7e212fedd376a46ebc Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Mon, 3 Jul 2023 14:50:59 +0200 Subject: [PATCH 05/37] Stackless implementation of the operator variant --- include/graphblas/reference/blas3.hpp | 348 ++++++++++------------- tests/unit/eWiseApplyMatrixReference.cpp | 68 ++++- 2 files changed, 214 insertions(+), 202 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index e77478564..228c2122b 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -929,34 +929,31 @@ namespace grb { * \a allow_void is true; otherwise, will be ignored. * \endinternal */ - template< - bool allow_void, + template< bool allow_void, Descriptor descr, class Operator, - typename OutputType, typename InputType1, typename InputType2, - typename RIT1, typename CIT1, typename NIT1, - typename RIT2, typename CIT2, typename NIT2, - typename RIT3, typename CIT3, typename NIT3 - > - RC eWiseApply_matrix_generic_intersection( - Matrix< OutputType, reference, RIT1, CIT1, NIT1 > &C, - const Matrix< InputType1, reference, RIT2, CIT2, NIT2 > &A, - const Matrix< InputType2, reference, RIT3, CIT3, NIT3 > &B, - const Operator &oper, - const Phase &phase, - const typename std::enable_if< - !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_operator< Operator >::value, - void >::type * const = nullptr - ) { - assert( !(descr & descriptors::force_row_major ) ); - static_assert( allow_void || - ( !( - std::is_same< InputType1, void >::value || - std::is_same< InputType2, void >::value - ) ), + typename OutputType, + typename InputType1, + typename InputType2, + typename RIT1, + typename CIT1, + typename NIT1, + typename RIT2, + typename CIT2, + typename NIT2, + typename RIT3, + typename CIT3, + typename NIT3 > + RC eWiseApply_matrix_generic_intersection( Matrix< OutputType, reference, RIT1, CIT1, NIT1 > & C, + const Matrix< InputType1, reference, RIT2, CIT2, NIT2 > & A, + const Matrix< InputType2, reference, RIT3, CIT3, NIT3 > & B, + const Operator & oper, + const Phase & phase, + const typename std::enable_if< ! grb::is_object< OutputType >::value && ! grb::is_object< InputType1 >::value && ! grb::is_object< InputType2 >::value && + grb::is_operator< Operator >::value, + void >::type * const = nullptr ) { + assert( ! ( descr & descriptors::force_row_major ) ); + static_assert( allow_void || ( ! 
( std::is_same< InputType1, void >::value || std::is_same< InputType2, void >::value ) ), "grb::internal::eWiseApply_matrix_generic_intersection: the non-monoid version of " "elementwise mxm can only be used if neither of the input matrices " "is a pattern matrix (of type void)" ); @@ -965,212 +962,167 @@ namespace grb { #ifdef _DEBUG std::cout << "In grb::internal::eWiseApply_matrix_generic_intersection\n"; #endif - // get whether the matrices should be transposed prior to execution - constexpr bool trans_left = descr & descriptors::transpose_left; - constexpr bool trans_right = descr & descriptors::transpose_right; + RC rc = SUCCESS; - // run-time checks - const size_t m = grb::nrows( C ); - const size_t n = grb::ncols( C ); - const size_t m_A = !trans_left ? grb::nrows( A ) : grb::ncols( A ); - const size_t n_A = !trans_left ? grb::ncols( A ) : grb::nrows( A ); - const size_t m_B = !trans_right ? grb::nrows( B ) : grb::ncols( B ); - const size_t n_B = !trans_right ? grb::ncols( B ) : grb::nrows( B ); + if( grb::nnz( B ) == 0 || grb::nnz( A ) == 0 ) { + return rc; + } - if( m != m_A || m != m_B || n != n_A || n != n_B ) { + const auto & A_raw = descr & grb::descriptors::transpose_left ? internal::getCCS( A ) : internal::getCRS( A ); + const auto & B_raw = descr & grb::descriptors::transpose_right ? internal::getCCS( B ) : internal::getCRS( B ); + const auto & C_crs_raw = internal::getCRS( C ); + const auto & C_ccs_raw = internal::getCCS( C ); + const size_t m_A = descr & grb::descriptors::transpose_left || descr & grb::descriptors::transpose_left ? ncols( A ) : nrows( A ); + const size_t n_A = descr & grb::descriptors::transpose_left || descr & grb::descriptors::transpose_left ? nrows( A ) : ncols( A ); + const size_t m_B = descr & grb::descriptors::transpose_right || descr & grb::descriptors::transpose_matrix ? ncols( B ) : nrows( B ); + const size_t n_B = descr & grb::descriptors::transpose_right || descr & grb::descriptors::transpose_matrix ? nrows( B ) : ncols( B ); + const size_t m_C = nrows( C ); + const size_t n_C = ncols( C ); + + // Check mask dimensions + if( m_A != m_B || n_A != n_B || m_A != m_C || n_A != n_C ) { +#ifdef _DEBUG + std::cout << "Dimensions of matrices do not match!\n"; +#endif return MISMATCH; } - const auto &A_raw = !trans_left ? - internal::getCRS( A ) : - internal::getCCS( A ); - const auto &B_raw = !trans_right ? 
- internal::getCRS( B ) : - internal::getCCS( B ); - auto &C_raw = internal::getCRS( C ); - auto &CCS_raw = internal::getCCS( C ); - - // retrieve buffers - char * arr1, * arr2, * arr3, * buf1, * buf2, * buf3; - arr1 = arr2 = buf1 = buf2 = nullptr; - InputType1 * vbuf1 = nullptr; - InputType2 * vbuf2 = nullptr; - OutputType * valbuf = nullptr; - internal::getMatrixBuffers( arr1, buf1, vbuf1, 1, A ); - internal::getMatrixBuffers( arr2, buf2, vbuf2, 1, B ); - internal::getMatrixBuffers( arr3, buf3, valbuf, 1, C ); - // end buffer retrieval - - // initialisations - internal::Coordinates< reference > coors1, coors2; - coors1.set( arr1, false, buf1, n ); - coors2.set( arr2, false, buf2, n ); -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel - { - size_t start, end; - config::OMP::localRange( start, end, 0, n + 1 ); -#else - const size_t start = 0; - const size_t end = n + 1; -#endif - for( size_t j = start; j < end; ++j ) { - CCS_raw.col_start[ j ] = 0; - } + if( phase == Phase::RESIZE ) { + size_t nzc = 0; #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - } +#pragma omp parallel for reduction( + : nzc ) default( none ) shared( B_raw, A_raw ) firstprivate( m_A ) #endif - // end initialisations + for( size_t i = 0; i < m_A; ++i ) { + auto B_k = B_raw.col_start[ i ]; + for( auto A_k = A_raw.col_start[ i ]; A_k < A_raw.col_start[ i + 1 ]; ++A_k ) { + const auto j = A_raw.row_index[ A_k ]; - // nonzero count - size_t nzc = 0; - - // symbolic phase - if( phase == RESIZE ) { - for( size_t i = 0; i < m; ++i ) { - coors1.clear(); - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; - coors1.assign( k_col ); - } - for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { - const size_t l_col = B_raw.row_index[ l ]; - if( coors1.assigned( l_col ) ) { - (void)++nzc; + while( B_k < B_raw.col_start[ i + 1 ] && B_raw.row_index[ B_k ] > j ) { + B_k++; + } + if( B_k >= B_raw.col_start[ i + 1 ] ) { + break; + } + if( B_raw.row_index[ B_k ] == j ) { + nzc += 1; } } } - - const RC ret = grb::resize( C, nzc ); - if( ret != SUCCESS ) { - return ret; - } +#ifdef _DEBUG + std::cout << "RESIZE phase: resize( C, " << nzc << " )\n"; +#endif + return resize( C, nzc ); } - // computational phase - if( phase == EXECUTE ) { - // retrieve additional buffer - config::NonzeroIndexType * const C_col_index = internal::template - getReferenceBuffer< typename config::NonzeroIndexType >( n + 1 ); + const size_t nzc = capacity( C ); - // perform column-wise nonzero count - for( size_t i = 0; i < m; ++i ) { - coors1.clear(); - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; - coors1.assign( k_col ); + C_crs_raw.col_start[ 0 ] = 0; + C_ccs_raw.col_start[ 0 ] = 0; + // Prefix sum computation into L.CRS.col_start +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 +#pragma omp parallel for default( none ) shared( B_raw, A_raw, C_crs_raw, std::cout ) firstprivate( m_A ) +#endif + for( size_t i = 0; i < m_A; i++ ) { + auto B_k = B_raw.col_start[ i ]; + size_t cumul = 0UL; + for( auto A_k = A_raw.col_start[ i ]; A_k < A_raw.col_start[ i + 1 ]; ++A_k ) { + const auto j = A_raw.row_index[ A_k ]; + + while( B_k < B_raw.col_start[ i + 1 ] && B_raw.row_index[ B_k ] > j ) { + B_k++; } - for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { - const size_t l_col = B_raw.row_index[ l ]; - if( coors1.assigned( l_col ) ) { - (void) ++nzc; - (void) ++CCS_raw.col_start[ l_col + 1 ]; - } + if( 
B_k >= B_raw.col_start[ i + 1 ]) { + break; + } + if( B_raw.row_index[ B_k ] == j ) { + cumul += 1; } } + C_crs_raw.col_start[ i + 1 ] = cumul; + } - // check capacity - if( nzc > capacity( C ) ) { + // Print the CRS prefix sum #ifdef _DEBUG - std::cout << "\t detected insufficient capacity " - << "for requested operation\n"; + std::cout << "CRS prefix sum: "; + for( size_t i = 0; i <= m_A; i++ ) { + std::cout << C_crs_raw.col_start[ i ] << " "; + } + std::cout << "\n"; #endif - const RC clear_rc = clear( C ); - if( clear_rc != SUCCESS ) { - return PANIC; - } else { - return FAILED; - } - } - // prefix sum for CCS_raw.col_start - assert( CCS_raw.col_start[ 0 ] == 0 ); - for( size_t j = 1; j < n; ++j ) { - CCS_raw.col_start[ j + 1 ] += CCS_raw.col_start[ j ]; - } - assert( CCS_raw.col_start[ n ] == nzc ); + // Apply the prefix sum + for( size_t i = 1; i <= m_A; i++ ) { + C_crs_raw.col_start[ i ] += C_crs_raw.col_start[ i - 1 ]; + C_ccs_raw.col_start[ i ] = C_crs_raw.col_start[ i ]; + } - // set C_col_index to all zero -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel - { - size_t start, end; - config::OMP::localRange( start, end, 0, n ); -#else - const size_t start = 0; - const size_t end = n; + // Check if the number of nonzeros is greater than the capacity + if( C_crs_raw.col_start[ m_A ] > nzc || C_ccs_raw.col_start[ m_A ] > nzc ) { +#ifdef _DEBUG + std::cout << "EXECUTE phase: detected insufficient capacity for requested operation.\n" + << "Requested " << C_crs_raw.col_start[ m_A ] << " nonzeros, but capacity is " << nzc << "\n"; #endif - for( size_t j = start; j < end; ++j ) { - C_col_index[ j ] = 0; - } + return RC::MISMATCH; + } + #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - } +#pragma omp parallel for simd #endif + for( size_t i = 0; i < m_A; i++ ) + C_crs_raw.row_index[ i ] = C_ccs_raw.row_index[ i ] = 0; - // do computations - size_t nzc = 0; - C_raw.col_start[ 0 ] = 0; - for( size_t i = 0; i < m; ++i ) { - coors1.clear(); - coors2.clear(); -#ifdef _DEBUG - std::cout << "\t The elements "; -#endif - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; - coors1.assign( k_col ); - valbuf[ k_col ] = A_raw.values[ k ]; -#ifdef _DEBUG - std::cout << "A( " << i << ", " << k_col << " ) = " << A_raw.values[ k ] << ", "; -#endif - } -#ifdef _DEBUG - std::cout << "are multiplied pairwise with "; + RC local_rc = rc; +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 +#pragma omp parallel default( none ) shared( C_ccs_raw, C_crs_raw, A_raw, B_raw, rc, std::cout ) firstprivate( local_rc, m_A, oper ) #endif - for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { - const size_t l_col = B_raw.row_index[ l ]; - if( coors1.assigned( l_col ) ) { - coors2.assign( l_col ); - (void)grb::apply( valbuf[ l_col ], valbuf[ l_col ], B_raw.values[ l ], oper ); -#ifdef _DEBUG - std::cout << "B( " << i << ", " << l_col << " ) = " << B_raw.values[ l ] - << " to yield C( " << i << ", " << l_col << " ), "; + { + size_t start_row = 0; + size_t end_row = m_A; +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + config::OMP::localRange( start_row, end_row, 0, m_A ); #endif + + for( auto i = start_row; i < end_row; ++i ) { + auto B_k = B_raw.col_start[ i ]; + auto C_k = C_crs_raw.col_start[ i ]; + for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const auto j = A_raw.row_index[ k ]; + while( B_k < B_raw.col_start[ i + 1 ] && B_raw.row_index[ B_k ] > j ) { + B_k++; } - } + if( B_k >= B_raw.col_start[ i + 1 ] ) { + break; + } + if( 
B_raw.row_index[ B_k ] != j ) { + continue; + } + + const auto a_val = A_raw.values[ k ]; + const auto b_val = B_raw.values[ B_k ]; + OutputType c_val; + local_rc = local_rc ? local_rc : grb::apply< descr >( c_val, a_val, b_val, oper ); + + C_crs_raw.row_index[ C_k ] = j; + C_crs_raw.values[ C_k ] = c_val; + C_ccs_raw.row_index[ C_k ] = i; + C_ccs_raw.values[ C_k ] = c_val; #ifdef _DEBUG - std::cout << "\n"; + std::cout << "A( " + std::to_string( i ) + ";" + std::to_string( j ) + " ) = " + std::to_string( a_val ) + "\n"; + std::cout << "B( " + std::to_string( i ) + ";" + std::to_string( j ) + " ) = " + std::to_string( b_val ) + "\n"; + std::cerr << "C( " + std::to_string( i ) + ";" + std::to_string( C_crs_raw.row_index[ C_k ] ) + " ) = " + std::to_string( c_val ) + "\n"; #endif - for( size_t k = 0; k < coors2.nonzeroes(); ++k ) { - const size_t j = coors2.index( k ); - // update CRS - C_raw.row_index[ nzc ] = j; - C_raw.setValue( nzc, valbuf[ j ] ); - // update CCS - const size_t CCS_index = C_col_index[ j ]++ + CCS_raw.col_start[ j ]; - CCS_raw.row_index[ CCS_index ] = i; - CCS_raw.setValue( CCS_index, valbuf[ j ] ); - // update count - (void)++nzc; + C_k += 1; } - C_raw.col_start[ i + 1 ] = nzc; -#ifdef _DEBUG - std::cout << "\n"; -#endif - } - -#ifndef NDEBUG - for( size_t j = 0; j < n; ++j ) { - assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); } +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 +#pragma omp critical #endif - - // set final number of nonzeroes in output matrix - internal::setCurrentNonzeroes( C, nzc ); + { rc = rc ? rc : local_rc; } } - // done - return SUCCESS; + internal::setCurrentNonzeroes( C, C_crs_raw.col_start[ m_A ] ); + + return rc; } /** @@ -1356,7 +1308,7 @@ namespace grb { // set C_col_index to all zero #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel for simd + #pragma omp parallel for simd #endif for( size_t j = 0; j < n; j++ ) { C_col_index[ j ] = 0; @@ -1421,7 +1373,7 @@ namespace grb { C_raw.col_start[ i + 1 ] = nzc; } -#ifdef _DEBUG +#ifdef _DEBUG std::cout << "CCS_raw.col_start = [ "; for( size_t j = 0; j <= n; ++j ) std::cout << CCS_raw.col_start[ j ] << " "; diff --git a/tests/unit/eWiseApplyMatrixReference.cpp b/tests/unit/eWiseApplyMatrixReference.cpp index 18f98df0d..27bbe93fb 100644 --- a/tests/unit/eWiseApplyMatrixReference.cpp +++ b/tests/unit/eWiseApplyMatrixReference.cpp @@ -22,8 +22,6 @@ #include -#define _DEBUG - template< class Iterator > void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) { #ifndef _DEBUG @@ -59,6 +57,45 @@ void printSparseMatrix( const grb::Matrix< D > & mat, const std::string & name = printSparseMatrixIterator( grb::nrows( mat ), grb::ncols( mat ), mat.cbegin(), mat.cend(), name, os ); } +template< class Storage, typename D > +void printCompressedStorage( const Storage& storage, const grb::Matrix< D > & mat, std::ostream & os = std::cout ) { + os << " row_index: [ "; + for( size_t i = 0; i < grb::nrows( mat ); ++i ) { + os << storage.row_index[ i ] << " "; + } + os << "]" << std::endl; + os << " col_start: [ "; + for( size_t i = 0; i <= grb::nrows( mat ); ++i ) { + os << storage.col_start[ i ] << " "; + } + os << "]" << std::endl; + os << " values: [ "; + for( size_t i = 0; i < grb::nnz( mat ); ++i ) { + os << storage.values[ i ] << " "; + } + os << "]" << std::endl << std::flush; +} + +template< typename D > +void printCRS( const grb::Matrix< D > & mat, const std::string & label = "", 
std::ostream & os = std::cout ) { +#ifndef _DEBUG + return; +#endif + grb::wait( mat ); + os << "CRS \"" << label << "\" (" << grb::nrows( mat ) << "x" << grb::ncols( mat ) << "):" << std::endl; + printCompressedStorage( grb::internal::getCRS( mat ), mat, os ); +} + +template< typename D > +void printCCS( const grb::Matrix< D > & mat, const std::string & label = "", std::ostream & os = std::cout ) { +#ifndef _DEBUG + return; +#endif + grb::wait( mat ); + os << "CCS \"" << label << "\" (" << grb::nrows( mat ) << "x" << grb::ncols( mat ) << "):" << std::endl; + printCompressedStorage( grb::internal::getCCS( mat ), mat, os ); +} + // static data corresponding to small matrices /** @@ -137,12 +174,22 @@ static const std::vector< int > V_C_union_A_pattern_B { 9, 1, 10, 11, 1, 12, 1, */ static const std::vector< size_t > I_C_union_A_pattern_B_pattern { 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3 }; static const std::vector< size_t > J_C_union_A_pattern_B_pattern { 0, 2, 3, 1, 2, 1, 2, 3, 0, 2, 3 }; -static const std::vector< int > V_C_union_A_pattern_B_pattern { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; +static const std::vector< int > V_C_union_A_pattern_B_pattern { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; // helper function to check internal data structures // of the reference backend template< typename T > void checkCRSandCCS( const grb::Matrix< T > & obtained, const grb::Matrix< T > & expected, grb::RC & rc ) { + printCRS( obtained, "obtained" ); + printCRS( expected, "expected" ); + + if( grb::nnz( obtained ) != grb::nnz( expected ) ) { + std::cerr << "Error: unexpected number of non-zero entries; " + << "expected " << grb::nnz( expected ) << ", " + << "obtained " << grb::nnz( obtained ) << ".\n"; + rc = grb::FAILED; + } + { // check CRS output const auto & crsObtained = grb::internal::getCRS( obtained ); const auto & crsExpected = grb::internal::getCRS( expected ); @@ -163,6 +210,9 @@ void checkCRSandCCS( const grb::Matrix< T > & obtained, const grb::Matrix< T > & } } + printCCS( obtained, "obtained" ); + printCCS( expected, "expected" ); + { // check CCS output const auto & ccsObtained = grb::internal::getCCS( obtained ); const auto & ccsExpected = grb::internal::getCCS( expected ); @@ -228,11 +278,16 @@ void grbProgram( const void *, const size_t, grb::RC & rc ) { } printSparseMatrix( A, "A" ); + printCRS( A, "A" ); + printCCS( A, "A" ); printSparseMatrix( B, "B" ); + printCRS( B, "B" ); + printCCS( B, "B" ); { // test 1: compute with the monoid mxm_elementwise std::cout << "\t Verifying the monoid version of mxm_elementwise, " << "A and B value matrices\n"; + grb::clear( C ); rc = grb::eWiseApply( C, A, B, mulmono, grb::RESIZE ); rc = rc ? rc : grb::eWiseApply( C, A, B, mulmono ); printSparseMatrix( C, "eWiseApply( C, A, B, mulmono )" ); @@ -252,6 +307,7 @@ void grbProgram( const void *, const size_t, grb::RC & rc ) { { // test 2: compute with the monoid mxm_elementwise, A value matrix, B pattern matrix \n"; std::cout << "\t Verifying the monoid version of mxm_elementwise, " << "A value matrix, B pattern matrix\n"; + grb::clear( C ); rc = grb::eWiseApply( C, A, B_pattern, mulmono, grb::RESIZE ); rc = rc ? 
rc : grb::eWiseApply( C, A, B_pattern, mulmono ); printSparseMatrix( C, "eWiseApply( C, A, B_pattern, mulmono )" ); @@ -271,6 +327,7 @@ void grbProgram( const void *, const size_t, grb::RC & rc ) { { // test 3: compute with the monoid mxm_elementwise, A pattern matrix, B value matrix \n"; std::cout << "\t Verifying the monoid version of mxm_elementwise, " << "A pattern matrix, B value matrix\n"; + grb::clear( C ); rc = grb::eWiseApply( C, A_pattern, B, mulmono, grb::RESIZE ); rc = rc ? rc : grb::eWiseApply( C, A_pattern, B, mulmono ); printSparseMatrix( C, "eWiseApply( C, A_pattern, B, mulmono )" ); @@ -290,6 +347,7 @@ void grbProgram( const void *, const size_t, grb::RC & rc ) { { // test 4: compute with the monoid mxm_elementwise, A pattern matrix, B pattern matrix \n"; std::cout << "\t Verifying the monoid version of mxm_elementwise, " << "A pattern matrix, B pattern matrix\n"; + grb::clear( C ); rc = grb::eWiseApply( C, A_pattern, B_pattern, mulmono, grb::RESIZE ); rc = rc ? rc : grb::eWiseApply( C, A_pattern, B_pattern, mulmono ); printSparseMatrix( C, "eWiseApply( C, A_pattern, B_pattern, mulmono )" ); @@ -298,7 +356,8 @@ void grbProgram( const void *, const size_t, grb::RC & rc ) { return; } grb::Matrix< int > union_A_pattern_B_pattern( n, n ); - grb::buildMatrixUnique( union_A_pattern_B_pattern, I_C_union_A_pattern_B_pattern.data(), J_C_union_A_pattern_B_pattern.data(), V_C_union_A_pattern_B_pattern.data(), I_C_union_A_pattern_B_pattern.size(), grb::SEQUENTIAL ); + grb::buildMatrixUnique( union_A_pattern_B_pattern, I_C_union_A_pattern_B_pattern.data(), J_C_union_A_pattern_B_pattern.data(), V_C_union_A_pattern_B_pattern.data(), + I_C_union_A_pattern_B_pattern.size(), grb::SEQUENTIAL ); checkCRSandCCS( C, union_A_pattern_B_pattern, rc ); if( rc != grb::SUCCESS ) { @@ -309,6 +368,7 @@ void grbProgram( const void *, const size_t, grb::RC & rc ) { { // test 5: compute with the operator mxm_elementwise (pattern matrices not allowed) \n"; std::cout << "\t Verifying the operator version of mxm_elementwise " << "(only value matrices)\n"; + grb::clear( C ); rc = grb::eWiseApply( C, A, B, mulmono.getOperator(), grb::RESIZE ); rc = rc ? 
rc : grb::eWiseApply( C, A, B, mulmono.getOperator() ); printSparseMatrix( C, "eWiseApply( C, A, B, mulmono.getOperator() )" ); From 5eb0b44383ff274ee3bc3e6ec968c44a65f0fe58 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Mon, 3 Jul 2023 14:51:14 +0200 Subject: [PATCH 06/37] Convert vector to c-like array for monoid variant --- include/graphblas/reference/blas3.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 228c2122b..1207daaf2 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -26,7 +26,6 @@ #include #include -#include #include "io.hpp" #include "matrix.hpp" @@ -1109,7 +1108,7 @@ namespace grb { #ifdef _DEBUG std::cout << "A( " + std::to_string( i ) + ";" + std::to_string( j ) + " ) = " + std::to_string( a_val ) + "\n"; std::cout << "B( " + std::to_string( i ) + ";" + std::to_string( j ) + " ) = " + std::to_string( b_val ) + "\n"; - std::cerr << "C( " + std::to_string( i ) + ";" + std::to_string( C_crs_raw.row_index[ C_k ] ) + " ) = " + std::to_string( c_val ) + "\n"; + std::cout << "C( " + std::to_string( i ) + ";" + std::to_string( C_crs_raw.row_index[ C_k ] ) + " ) = " + std::to_string( c_val ) + "\n"; #endif C_k += 1; } @@ -1316,12 +1315,10 @@ namespace grb { // do computations - std::vector< bool > columns( n, false ); + bool columns[ n ] = { false }; size_t nzc = 0; C_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { - std::fill( columns.begin(), columns.end(), false ); - #ifdef _DEBUG std::cout << " -- i: " << i << "\n"; #endif @@ -1354,7 +1351,7 @@ namespace grb { } } - for( size_t j_unsigned = columns.size() ; j_unsigned > 0 ; j_unsigned-- ) { + for( size_t j_unsigned = n ; j_unsigned > 0 ; j_unsigned-- ) { const size_t j = j_unsigned - 1; if( not columns[ j ] ) { continue; @@ -1371,6 +1368,9 @@ namespace grb { (void)++nzc; } C_raw.col_start[ i + 1 ] = nzc; + + for(size_t i=0; i Date: Wed, 12 Jul 2023 15:37:10 +0200 Subject: [PATCH 07/37] Style fixes --- include/graphblas/reference/blas3.hpp | 236 ++++++++++++------ tests/unit/eWiseApplyMatrixReference.cpp | 294 +++++++++-------------- tests/unit/eWiseApplyMatrix_variants.cpp | 287 +++++++++++----------- 3 files changed, 423 insertions(+), 394 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 1207daaf2..97c585118 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -928,53 +928,90 @@ namespace grb { * \a allow_void is true; otherwise, will be ignored. * \endinternal */ - template< bool allow_void, + template< + bool allow_void, Descriptor descr, class Operator, - typename OutputType, - typename InputType1, - typename InputType2, - typename RIT1, - typename CIT1, - typename NIT1, - typename RIT2, - typename CIT2, - typename NIT2, - typename RIT3, - typename CIT3, - typename NIT3 > - RC eWiseApply_matrix_generic_intersection( Matrix< OutputType, reference, RIT1, CIT1, NIT1 > & C, - const Matrix< InputType1, reference, RIT2, CIT2, NIT2 > & A, - const Matrix< InputType2, reference, RIT3, CIT3, NIT3 > & B, - const Operator & oper, - const Phase & phase, - const typename std::enable_if< ! grb::is_object< OutputType >::value && ! grb::is_object< InputType1 >::value && ! grb::is_object< InputType2 >::value && - grb::is_operator< Operator >::value, - void >::type * const = nullptr ) { - assert( ! 
( descr & descriptors::force_row_major ) ); - static_assert( allow_void || ( ! ( std::is_same< InputType1, void >::value || std::is_same< InputType2, void >::value ) ), - "grb::internal::eWiseApply_matrix_generic_intersection: the non-monoid version of " - "elementwise mxm can only be used if neither of the input matrices " - "is a pattern matrix (of type void)" ); + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 + > + RC eWiseApply_matrix_generic_intersection( + Matrix< OutputType, reference, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, reference, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, reference, RIT3, CIT3, NIT3 > &B, + const Operator &oper, + const Phase &phase, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< Operator >::value, + void + >::type * const = nullptr + ) { + assert( !( descr & descriptors::force_row_major ) ); + static_assert( allow_void || + ( !( + std::is_same< InputType1, void >::value + || std::is_same< InputType2, void >::value + ) + ), + "grb::internal::eWiseApply_matrix_generic_intersection: the non-monoid" + " version of elementwise mxm can only be used if neither of the input" + " matrices is a pattern matrix (of type void)" ); assert( phase != TRY ); + // get whether the matrices should be transposed prior to execution + constexpr bool trans_left = descr & descriptors::transpose_left; + constexpr bool trans_right = descr & descriptors::transpose_right; + #ifdef _DEBUG std::cout << "In grb::internal::eWiseApply_matrix_generic_intersection\n"; #endif - RC rc = SUCCESS; if( grb::nnz( B ) == 0 || grb::nnz( A ) == 0 ) { - return rc; +#ifdef _DEBUG + std::cout << "No nonzeros in input matrices, nothing to compute.\n"; +#endif + return SUCCESS; + } + + const auto &A_raw = trans_left + ? internal::getCCS( A ) + : internal::getCRS( A ); + const size_t m_A = trans_left + ? ncols( A ) + : nrows( A ); + const size_t n_A = trans_left + ? nrows( A ) + : ncols( A ); + if( m_A == 0 || n_A == 0 ) { +#ifdef _DEBUG + std::cout << "Matrix A is empty, nothing to compute.\n"; +#endif + return SUCCESS; + } + + const auto &B_raw = trans_right + ? internal::getCCS( B ) + : internal::getCRS( B ); + const size_t m_B = trans_right + ? ncols( B ) + : nrows( B ); + const size_t n_B = trans_right + ? nrows( B ) + : ncols( B ); + if( m_A == 0 || n_A == 0 ) { +#ifdef _DEBUG + std::cout << "Matrix B is empty, nothing to compute.\n"; +#endif + return SUCCESS; } - const auto & A_raw = descr & grb::descriptors::transpose_left ? internal::getCCS( A ) : internal::getCRS( A ); - const auto & B_raw = descr & grb::descriptors::transpose_right ? internal::getCCS( B ) : internal::getCRS( B ); - const auto & C_crs_raw = internal::getCRS( C ); - const auto & C_ccs_raw = internal::getCCS( C ); - const size_t m_A = descr & grb::descriptors::transpose_left || descr & grb::descriptors::transpose_left ? ncols( A ) : nrows( A ); - const size_t n_A = descr & grb::descriptors::transpose_left || descr & grb::descriptors::transpose_left ? nrows( A ) : ncols( A ); - const size_t m_B = descr & grb::descriptors::transpose_right || descr & grb::descriptors::transpose_matrix ? ncols( B ) : nrows( B ); - const size_t n_B = descr & grb::descriptors::transpose_right || descr & grb::descriptors::transpose_matrix ? 
nrows( B ) : ncols( B ); + auto &C_crs_raw = internal::getCRS( C ); + auto &C_ccs_raw = internal::getCCS( C ); const size_t m_C = nrows( C ); const size_t n_C = ncols( C ); @@ -986,17 +1023,27 @@ namespace grb { return MISMATCH; } + const auto A_identity = identities::zero< InputType1 >::value(); + const auto B_identity = identities::zero< InputType2 >::value(); + + RC rc = SUCCESS; if( phase == Phase::RESIZE ) { size_t nzc = 0; #ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel for reduction( + : nzc ) default( none ) shared( B_raw, A_raw ) firstprivate( m_A ) +#pragma omp parallel for reduction( + : nzc ) \ + default( none ) shared( B_raw, A_raw ) \ + firstprivate( m_A ) #endif for( size_t i = 0; i < m_A; ++i ) { auto B_k = B_raw.col_start[ i ]; - for( auto A_k = A_raw.col_start[ i ]; A_k < A_raw.col_start[ i + 1 ]; ++A_k ) { + const auto A_k_start = A_raw.col_start[ i ]; + const auto A_k_end = A_raw.col_start[ i + 1 ]; + for( auto A_k = A_k_start; A_k < A_k_end; ++A_k ) { const auto j = A_raw.row_index[ A_k ]; - while( B_k < B_raw.col_start[ i + 1 ] && B_raw.row_index[ B_k ] > j ) { + while( B_k < B_raw.col_start[ i + 1 ] + && B_raw.row_index[ B_k ] > j + ) { B_k++; } if( B_k >= B_raw.col_start[ i + 1 ] ) { @@ -1008,7 +1055,7 @@ namespace grb { } } #ifdef _DEBUG - std::cout << "RESIZE phase: resize( C, " << nzc << " )\n"; + std::cout << "resize( C, " << nzc << " )\n"; #endif return resize( C, nzc ); } @@ -1024,10 +1071,14 @@ namespace grb { for( size_t i = 0; i < m_A; i++ ) { auto B_k = B_raw.col_start[ i ]; size_t cumul = 0UL; - for( auto A_k = A_raw.col_start[ i ]; A_k < A_raw.col_start[ i + 1 ]; ++A_k ) { + const auto A_k_start = A_raw.col_start[ i ]; + const auto A_k_end = A_raw.col_start[ i + 1 ]; + for( auto A_k = A_k_start; A_k < A_k_end; ++A_k ) { const auto j = A_raw.row_index[ A_k ]; - while( B_k < B_raw.col_start[ i + 1 ] && B_raw.row_index[ B_k ] > j ) { + while( B_k < B_raw.col_start[ i + 1 ] + && B_raw.row_index[ B_k ] > j + ) { B_k++; } if( B_k >= B_raw.col_start[ i + 1 ]) { @@ -1040,8 +1091,8 @@ namespace grb { C_crs_raw.col_start[ i + 1 ] = cumul; } - // Print the CRS prefix sum #ifdef _DEBUG + // Print the CRS prefix sum std::cout << "CRS prefix sum: "; for( size_t i = 0; i <= m_A; i++ ) { std::cout << C_crs_raw.col_start[ i ] << " "; @@ -1058,21 +1109,25 @@ namespace grb { // Check if the number of nonzeros is greater than the capacity if( C_crs_raw.col_start[ m_A ] > nzc || C_ccs_raw.col_start[ m_A ] > nzc ) { #ifdef _DEBUG - std::cout << "EXECUTE phase: detected insufficient capacity for requested operation.\n" - << "Requested " << C_crs_raw.col_start[ m_A ] << " nonzeros, but capacity is " << nzc << "\n"; + std::cout << "Insufficient capacity detected for requested operation.\n" + << "Requested " << C_crs_raw.col_start[ m_A ] << " nonzeros" + << " but capacity is " << nzc << "\n"; #endif - return RC::MISMATCH; + return MISMATCH; } #ifdef _H_GRB_REFERENCE_OMP_BLAS3 #pragma omp parallel for simd #endif - for( size_t i = 0; i < m_A; i++ ) + for( size_t i = 0; i < m_A; i++ ) { C_crs_raw.row_index[ i ] = C_ccs_raw.row_index[ i ] = 0; + } RC local_rc = rc; #ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel default( none ) shared( C_ccs_raw, C_crs_raw, A_raw, B_raw, rc, std::cout ) firstprivate( local_rc, m_A, oper ) +#pragma omp parallel default( none ) \ + shared( C_ccs_raw, C_crs_raw, A_raw, B_raw, rc, std::cout ) \ + firstprivate( local_rc, m_A, oper, A_identity, B_identity ) #endif { size_t start_row = 0; @@ -1081,12 +1136,17 @@ namespace grb { 
config::OMP::localRange( start_row, end_row, 0, m_A ); #endif - for( auto i = start_row; i < end_row; ++i ) { + for( size_t i = start_row; i < end_row; ++i ) { auto B_k = B_raw.col_start[ i ]; auto C_k = C_crs_raw.col_start[ i ]; - for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const auto j = A_raw.row_index[ k ]; - while( B_k < B_raw.col_start[ i + 1 ] && B_raw.row_index[ B_k ] > j ) { + + const auto A_k_start = A_raw.col_start[ i ]; + const auto A_k_end = A_raw.col_start[ i + 1 ]; + for( auto A_k = A_k_start; A_k < A_k_end; ++A_k ) { + const auto j = A_raw.row_index[ A_k ]; + while( B_k < B_raw.col_start[ i + 1 ] + && B_raw.row_index[ B_k ] > j + ) { B_k++; } if( B_k >= B_raw.col_start[ i + 1 ] ) { @@ -1096,31 +1156,48 @@ namespace grb { continue; } - const auto a_val = A_raw.values[ k ]; - const auto b_val = B_raw.values[ B_k ]; + const InputType1 a_val = A_raw.getValue( A_k, A_identity ); + const InputType2 b_val = B_raw.getValue( B_k, B_identity ); OutputType c_val; - local_rc = local_rc ? local_rc : grb::apply< descr >( c_val, a_val, b_val, oper ); + local_rc = local_rc + ? local_rc + : grb::apply< descr >( c_val, a_val, b_val, oper ); C_crs_raw.row_index[ C_k ] = j; - C_crs_raw.values[ C_k ] = c_val; + C_crs_raw.setValue( C_k, c_val ); C_ccs_raw.row_index[ C_k ] = i; - C_ccs_raw.values[ C_k ] = c_val; + C_ccs_raw.setValue( C_k, c_val ); #ifdef _DEBUG - std::cout << "A( " + std::to_string( i ) + ";" + std::to_string( j ) + " ) = " + std::to_string( a_val ) + "\n"; - std::cout << "B( " + std::to_string( i ) + ";" + std::to_string( j ) + " ) = " + std::to_string( b_val ) + "\n"; - std::cout << "C( " + std::to_string( i ) + ";" + std::to_string( C_crs_raw.row_index[ C_k ] ) + " ) = " + std::to_string( c_val ) + "\n"; + std::cout << "A( " + std::to_string( i ) + ";" + + std::to_string( j ) + " ) = " + + std::to_string( a_val ) + "\n"; + std::cout << "B( " + std::to_string( i ) + ";" + + std::to_string( j ) + " ) = " + + std::to_string( b_val ) + "\n"; + std::cout << "C.crs( " + std::to_string( i ) + ";" + + std::to_string( j ) + " ) = " + + std::to_string( c_val ) + "\n"; #endif C_k += 1; } } + + if( local_rc != SUCCESS ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 #pragma omp critical #endif - { rc = rc ? rc : local_rc; } + { + rc = rc ? rc : local_rc; + } + } } +#ifdef _DEBUG + std::cout << "internal::setCurrentNonzeroes( C, " + << C_crs_raw.col_start[ m_A ] << " )\n"; +#endif internal::setCurrentNonzeroes( C, C_crs_raw.col_start[ m_A ] ); - + return rc; } @@ -1163,9 +1240,9 @@ namespace grb { std::is_same< InputType1, void >::value || std::is_same< InputType2, void >::value ) ), - "grb::internal::eWiseApply_matrix_generic_union: the non-monoid version of " - "elementwise mxm can only be used if neither of the input matrices " - "is a pattern matrix (of type void)" ); + "grb::internal::eWiseApply_matrix_generic_union: the non-monoid" + " version of elementwise mxm can only be used if neither of the" + " input matrices is a pattern matrix (of type void)" ); assert( phase != TRY ); #ifdef _DEBUG std::cout << "In grb::internal::eWiseApply_matrix_generic_union\n"; @@ -1175,12 +1252,12 @@ namespace grb { constexpr bool trans_right = descr & descriptors::transpose_right; // run-time checks - const size_t m = grb::nrows( C ); - const size_t n = grb::ncols( C ); - const size_t m_A = !trans_left ? grb::nrows( A ) : grb::ncols( A ); - const size_t n_A = !trans_left ? grb::ncols( A ) : grb::nrows( A ); - const size_t m_B = !trans_right ? 
grb::nrows( B ) : grb::ncols( B ); - const size_t n_B = !trans_right ? grb::ncols( B ) : grb::nrows( B ); + const size_t m = nrows( C ); + const size_t n = ncols( C ); + const size_t m_A = !trans_left ? nrows( A ) : ncols( A ); + const size_t n_A = !trans_left ? ncols( A ) : nrows( A ); + const size_t m_B = !trans_right ? nrows( B ) : ncols( B ); + const size_t n_B = !trans_right ? ncols( B ) : nrows( B ); // Identities const auto identity_A = monoid.template getIdentity< OutputType >(); @@ -1243,12 +1320,12 @@ namespace grb { for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; coors1.assign( k_col ); - (void)++nzc; + (void) ++nzc; } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { const size_t l_col = B_raw.row_index[ l ]; if( not coors1.assigned( l_col ) ) { - (void)++nzc; + (void) ++nzc; } } } @@ -1353,7 +1430,7 @@ namespace grb { for( size_t j_unsigned = n ; j_unsigned > 0 ; j_unsigned-- ) { const size_t j = j_unsigned - 1; - if( not columns[ j ] ) { + if( !columns[ j ] ) { continue; } // update CRS @@ -1369,8 +1446,9 @@ namespace grb { } C_raw.col_start[ i + 1 ] = nzc; - for(size_t i=0; i( @@ -1509,6 +1590,9 @@ namespace grb { "the operator version of eWiseApply cannot be used if either of the " "input matrices is a pattern matrix (of type void)" ); +#ifdef _DEBUG + std::cout << "In grb::eWiseApply_matrix_generic( reference, operator )\n"; +#endif return internal::eWiseApply_matrix_generic_intersection< false, descr >( C, A, B, mulOp, phase diff --git a/tests/unit/eWiseApplyMatrixReference.cpp b/tests/unit/eWiseApplyMatrixReference.cpp index 27bbe93fb..6d675aa97 100644 --- a/tests/unit/eWiseApplyMatrixReference.cpp +++ b/tests/unit/eWiseApplyMatrixReference.cpp @@ -22,79 +22,8 @@ #include -template< class Iterator > -void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) { -#ifndef _DEBUG - return; -#endif - std::cout << "Matrix \"" << name << "\" (" << rows << "x" << cols << "):" << std::endl << "[" << std::endl; - if( rows > 50 || cols > 50 ) { - os << " Matrix too large to print" << std::endl; - } else { - os.precision( 3 ); - for( size_t y = 0; y < rows; y++ ) { - os << std::string( 3, ' ' ); - for( size_t x = 0; x < cols; x++ ) { - auto nnz_val = std::find_if( begin, end, [ y, x ]( const typename std::iterator_traits< Iterator >::value_type & a ) { - return a.first.first == y && a.first.second == x; - } ); - if( nnz_val != end ) - os << std::fixed << std::setw( 3 ) << ( *nnz_val ).second; - else - os << "___"; - os << " "; - } - os << std::endl; - } - } - os << "]" << std::endl; - std::flush( os ); -} - -template< typename D > -void printSparseMatrix( const grb::Matrix< D > & mat, const std::string & name = "", std::ostream & os = std::cout ) { - grb::wait( mat ); - printSparseMatrixIterator( grb::nrows( mat ), grb::ncols( mat ), mat.cbegin(), mat.cend(), name, os ); -} - -template< class Storage, typename D > -void printCompressedStorage( const Storage& storage, const grb::Matrix< D > & mat, std::ostream & os = std::cout ) { - os << " row_index: [ "; - for( size_t i = 0; i < grb::nrows( mat ); ++i ) { - os << storage.row_index[ i ] << " "; - } - os << "]" << std::endl; - os << " col_start: [ "; - for( size_t i = 0; i <= grb::nrows( mat ); ++i ) { - os << storage.col_start[ i ] << " "; - } - os << "]" << std::endl; - os << " values: [ "; - for( size_t i = 0; i < grb::nnz( mat ); ++i ) { - os 
<< storage.values[ i ] << " "; - } - os << "]" << std::endl << std::flush; -} - -template< typename D > -void printCRS( const grb::Matrix< D > & mat, const std::string & label = "", std::ostream & os = std::cout ) { -#ifndef _DEBUG - return; -#endif - grb::wait( mat ); - os << "CRS \"" << label << "\" (" << grb::nrows( mat ) << "x" << grb::ncols( mat ) << "):" << std::endl; - printCompressedStorage( grb::internal::getCRS( mat ), mat, os ); -} +using namespace grb; -template< typename D > -void printCCS( const grb::Matrix< D > & mat, const std::string & label = "", std::ostream & os = std::cout ) { -#ifndef _DEBUG - return; -#endif - grb::wait( mat ); - os << "CCS \"" << label << "\" (" << grb::nrows( mat ) << "x" << grb::ncols( mat ) << "):" << std::endl; - printCompressedStorage( grb::internal::getCCS( mat ), mat, os ); -} // static data corresponding to small matrices @@ -179,127 +108,116 @@ static const std::vector< int > V_C_union_A_pattern_B_pattern { 1, 1, 1, 1, 1, 1 // helper function to check internal data structures // of the reference backend template< typename T > -void checkCRSandCCS( const grb::Matrix< T > & obtained, const grb::Matrix< T > & expected, grb::RC & rc ) { - printCRS( obtained, "obtained" ); - printCRS( expected, "expected" ); - - if( grb::nnz( obtained ) != grb::nnz( expected ) ) { +void checkCRSandCCS( + const Matrix< T > & obtained, + const Matrix< T > & expected, + RC & rc +) { + if( nnz( obtained ) != nnz( expected ) ) { std::cerr << "Error: unexpected number of non-zero entries; " - << "expected " << grb::nnz( expected ) << ", " - << "obtained " << grb::nnz( obtained ) << ".\n"; - rc = grb::FAILED; + << "expected " << nnz( expected ) << ", " + << "obtained " << nnz( obtained ) << ".\n"; + rc = FAILED; } { // check CRS output - const auto & crsObtained = grb::internal::getCRS( obtained ); - const auto & crsExpected = grb::internal::getCRS( expected ); - for( size_t i = 0; i < grb::nrows( obtained ); ++i ) { + const auto & crsObtained = internal::getCRS( obtained ); + const auto & crsExpected = internal::getCRS( expected ); + for( size_t i = 0; i < nrows( obtained ); ++i ) { for( size_t k = crsObtained.col_start[ i ]; k < crsObtained.col_start[ i + 1 ]; ++k ) { if( crsObtained.row_index[ k ] != crsExpected.row_index[ k ] ) { std::cerr << "Error: unexpected entry at ( " << i << ", " << crsObtained.row_index[ k ] << " ), " << "expected one at ( " << i << ", " << crsExpected.row_index[ k ] << " ) " << "instead (CRS).\n"; - rc = grb::FAILED; + rc = FAILED; } if( crsObtained.values[ k ] != crsExpected.values[ k ] ) { std::cerr << "Error: unexpected value " << crsObtained.values[ k ] << "; " << "expected " << crsExpected.values[ k ] << " (CRS).\n"; - rc = grb::FAILED; + rc = FAILED; } } } } - printCCS( obtained, "obtained" ); - printCCS( expected, "expected" ); - { // check CCS output - const auto & ccsObtained = grb::internal::getCCS( obtained ); - const auto & ccsExpected = grb::internal::getCCS( expected ); - for( size_t j = 0; j < grb::ncols( obtained ); ++j ) { + const auto & ccsObtained = internal::getCCS( obtained ); + const auto & ccsExpected = internal::getCCS( expected ); + for( size_t j = 0; j < ncols( obtained ); ++j ) { for( size_t k = ccsExpected.col_start[ j ]; k < ccsExpected.col_start[ j + 1 ]; ++k ) { if( ccsObtained.row_index[ k ] != ccsExpected.row_index[ k ] ) { std::cerr << "Error: unexpected entry at " << "( " << ccsObtained.row_index[ k ] << ", " << j << " ), " << "expected one at ( " << ccsExpected.row_index[ k ] << ", " << j << " ) " << 
"instead (CCS).\n"; - rc = grb::FAILED; + rc = FAILED; } if( ccsObtained.values[ k ] != ccsExpected.values[ k ] ) { std::cerr << "Error: unexpected value " << ccsObtained.values[ k ] << "; " << "expected " << ccsExpected.values[ k ] << " (CCS).\n"; - rc = grb::FAILED; + rc = FAILED; } } } } } -void grbProgram( const void *, const size_t, grb::RC & rc ) { +void grbProgram( const void *, const size_t, RC & rc ) { // initialize test - grb::Monoid< grb::operators::mul< int >, grb::identities::one > mulmono; + const grb::Monoid< grb::operators::mul< int >, + grb::identities::one > mulmono; const size_t n = 4; const size_t nelts_A = 8; const size_t nelts_B = 6; - grb::Matrix< int > A( n, n ); - grb::Matrix< int > B( n, n ); - grb::Matrix< void > A_pattern( n, n ); - grb::Matrix< void > B_pattern( n, n ); - grb::Matrix< int > C( n, n ); - - rc = grb::resize( A, nelts_A ); - if( rc == grb::SUCCESS ) { - rc = grb::buildMatrixUnique( A, I_A.data(), J_A.data(), V_A.data(), nelts_A, grb::SEQUENTIAL ); - } - if( rc == grb::SUCCESS ) { - rc = grb::resize( B, nelts_B ); - } - if( rc == grb::SUCCESS ) { - rc = grb::buildMatrixUnique( B, I_B.data(), J_B.data(), V_B.data(), nelts_B, grb::SEQUENTIAL ); - } - if( rc == grb::SUCCESS ) { - rc = grb::resize( A_pattern, nelts_A ); - } - if( rc == grb::SUCCESS ) { - rc = grb::buildMatrixUnique( A_pattern, I_A.data(), J_A.data(), nelts_A, grb::SEQUENTIAL ); - } - if( rc == grb::SUCCESS ) { - rc = grb::resize( B_pattern, nelts_B ); - } - if( rc == grb::SUCCESS ) { - rc = grb::buildMatrixUnique( B_pattern, I_B.data(), J_B.data(), nelts_B, grb::SEQUENTIAL ); - } - if( rc != grb::SUCCESS ) { - std::cerr << "\tinitialisation FAILED\n"; - return; - } - - printSparseMatrix( A, "A" ); - printCRS( A, "A" ); - printCCS( A, "A" ); - printSparseMatrix( B, "B" ); - printCRS( B, "B" ); - printCCS( B, "B" ); + Matrix< int > A( n, n ); + Matrix< int > B( n, n ); + Matrix< void > A_pattern( n, n ); + Matrix< void > B_pattern( n, n ); + Matrix< int > C( n, n ); + + assert( SUCCESS == resize( A, nelts_A ) ); + assert( SUCCESS == + buildMatrixUnique( A, I_A.data(), J_A.data(), V_A.data(), nelts_A, SEQUENTIAL ) + ); + assert( SUCCESS == resize( B, nelts_B ) ); + assert( SUCCESS == + buildMatrixUnique( B, I_B.data(), J_B.data(), V_B.data(), nelts_B, SEQUENTIAL ) + ); + assert( SUCCESS == resize( A_pattern, nelts_A ) ); + assert( SUCCESS == + buildMatrixUnique( A_pattern, I_A.data(), J_A.data(), nelts_A, SEQUENTIAL ) + ); + assert( SUCCESS == resize( B_pattern, nelts_B ) ); + assert( SUCCESS == + buildMatrixUnique( B_pattern, I_B.data(), J_B.data(), nelts_B, SEQUENTIAL ) + ); { // test 1: compute with the monoid mxm_elementwise std::cout << "\t Verifying the monoid version of mxm_elementwise, " << "A and B value matrices\n"; - grb::clear( C ); - rc = grb::eWiseApply( C, A, B, mulmono, grb::RESIZE ); + clear( C ); + rc = grb::eWiseApply( C, A, B, mulmono, RESIZE ); rc = rc ? 
rc : grb::eWiseApply( C, A, B, mulmono ); - printSparseMatrix( C, "eWiseApply( C, A, B, mulmono )" ); - if( rc != grb::SUCCESS ) { + if( rc != SUCCESS ) { std::cerr << "Call to grb::eWiseApply FAILED\n"; return; } - grb::Matrix< int > union_A_B( n, n ); - grb::buildMatrixUnique( union_A_B, I_C_union.data(), J_C_union.data(), V_C_union_A_B.data(), I_C_union.size(), grb::SEQUENTIAL ); + Matrix< int > union_A_B( n, n ); + assert( SUCCESS == + buildMatrixUnique( + union_A_B, + I_C_union.data(), + J_C_union.data(), + V_C_union_A_B.data(), + I_C_union.size(), + SEQUENTIAL ) + ); checkCRSandCCS( C, union_A_B, rc ); - if( rc != grb::SUCCESS ) { + if( rc != SUCCESS ) { return; } } @@ -307,19 +225,26 @@ void grbProgram( const void *, const size_t, grb::RC & rc ) { { // test 2: compute with the monoid mxm_elementwise, A value matrix, B pattern matrix \n"; std::cout << "\t Verifying the monoid version of mxm_elementwise, " << "A value matrix, B pattern matrix\n"; - grb::clear( C ); - rc = grb::eWiseApply( C, A, B_pattern, mulmono, grb::RESIZE ); + clear( C ); + rc = grb::eWiseApply( C, A, B_pattern, mulmono, RESIZE ); rc = rc ? rc : grb::eWiseApply( C, A, B_pattern, mulmono ); - printSparseMatrix( C, "eWiseApply( C, A, B_pattern, mulmono )" ); - if( rc != grb::SUCCESS ) { + if( rc != SUCCESS ) { std::cerr << "Call to grb::eWiseApply FAILED\n"; return; } - grb::Matrix< int > union_A_B_pattern( n, n ); - grb::buildMatrixUnique( union_A_B_pattern, I_C_union_A_B_pattern.data(), J_C_union_A_B_pattern.data(), V_C_union_A_B_pattern.data(), I_C_union_A_B_pattern.size(), grb::SEQUENTIAL ); + Matrix< int > union_A_B_pattern( n, n ); + assert( SUCCESS == + buildMatrixUnique( + union_A_B_pattern, + I_C_union_A_B_pattern.data(), + J_C_union_A_B_pattern.data(), + V_C_union_A_B_pattern.data(), + I_C_union_A_B_pattern.size(), + SEQUENTIAL ) + ); checkCRSandCCS( C, union_A_B_pattern, rc ); - if( rc != grb::SUCCESS ) { + if( rc != SUCCESS ) { return; } } @@ -327,19 +252,26 @@ void grbProgram( const void *, const size_t, grb::RC & rc ) { { // test 3: compute with the monoid mxm_elementwise, A pattern matrix, B value matrix \n"; std::cout << "\t Verifying the monoid version of mxm_elementwise, " << "A pattern matrix, B value matrix\n"; - grb::clear( C ); - rc = grb::eWiseApply( C, A_pattern, B, mulmono, grb::RESIZE ); + clear( C ); + rc = grb::eWiseApply( C, A_pattern, B, mulmono, RESIZE ); rc = rc ? 
rc : grb::eWiseApply( C, A_pattern, B, mulmono ); - printSparseMatrix( C, "eWiseApply( C, A_pattern, B, mulmono )" ); - if( rc != grb::SUCCESS ) { + if( rc != SUCCESS ) { std::cerr << "Call to grb::eWiseApply FAILED\n"; return; } - grb::Matrix< int > union_A_pattern_B( n, n ); - grb::buildMatrixUnique( union_A_pattern_B, I_C_union_A_pattern_B.data(), J_C_union_A_pattern_B.data(), V_C_union_A_pattern_B.data(), I_C_union_A_pattern_B.size(), grb::SEQUENTIAL ); + Matrix< int > union_A_pattern_B( n, n ); + assert( SUCCESS == + buildMatrixUnique( + union_A_pattern_B, + I_C_union_A_pattern_B.data(), + J_C_union_A_pattern_B.data(), + V_C_union_A_pattern_B.data(), + I_C_union_A_pattern_B.size(), + SEQUENTIAL ) + ); checkCRSandCCS( C, union_A_pattern_B, rc ); - if( rc != grb::SUCCESS ) { + if( rc != SUCCESS ) { return; } } @@ -347,20 +279,26 @@ void grbProgram( const void *, const size_t, grb::RC & rc ) { { // test 4: compute with the monoid mxm_elementwise, A pattern matrix, B pattern matrix \n"; std::cout << "\t Verifying the monoid version of mxm_elementwise, " << "A pattern matrix, B pattern matrix\n"; - grb::clear( C ); - rc = grb::eWiseApply( C, A_pattern, B_pattern, mulmono, grb::RESIZE ); + clear( C ); + rc = grb::eWiseApply( C, A_pattern, B_pattern, mulmono, RESIZE ); rc = rc ? rc : grb::eWiseApply( C, A_pattern, B_pattern, mulmono ); - printSparseMatrix( C, "eWiseApply( C, A_pattern, B_pattern, mulmono )" ); - if( rc != grb::SUCCESS ) { + if( rc != SUCCESS ) { std::cerr << "Call to grb::eWiseApply FAILED\n"; return; } - grb::Matrix< int > union_A_pattern_B_pattern( n, n ); - grb::buildMatrixUnique( union_A_pattern_B_pattern, I_C_union_A_pattern_B_pattern.data(), J_C_union_A_pattern_B_pattern.data(), V_C_union_A_pattern_B_pattern.data(), - I_C_union_A_pattern_B_pattern.size(), grb::SEQUENTIAL ); + Matrix< int > union_A_pattern_B_pattern( n, n ); + assert( SUCCESS == + buildMatrixUnique( + union_A_pattern_B_pattern, + I_C_union_A_pattern_B_pattern.data(), + J_C_union_A_pattern_B_pattern.data(), + V_C_union_A_pattern_B_pattern.data(), + I_C_union_A_pattern_B_pattern.size(), + SEQUENTIAL ) + ); checkCRSandCCS( C, union_A_pattern_B_pattern, rc ); - if( rc != grb::SUCCESS ) { + if( rc != SUCCESS ) { return; } } @@ -368,18 +306,26 @@ void grbProgram( const void *, const size_t, grb::RC & rc ) { { // test 5: compute with the operator mxm_elementwise (pattern matrices not allowed) \n"; std::cout << "\t Verifying the operator version of mxm_elementwise " << "(only value matrices)\n"; - grb::clear( C ); - rc = grb::eWiseApply( C, A, B, mulmono.getOperator(), grb::RESIZE ); + clear( C ); + rc = grb::eWiseApply( C, A, B, mulmono.getOperator(), RESIZE ); rc = rc ? 
rc : grb::eWiseApply( C, A, B, mulmono.getOperator() ); - printSparseMatrix( C, "eWiseApply( C, A, B, mulmono.getOperator() )" ); - if( rc != grb::SUCCESS ) { + if( rc != SUCCESS ) { std::cerr << "Call to grb::eWiseApply FAILED\n"; return; } - grb::Matrix< int > intersection_A_B( n, n ); - grb::buildMatrixUnique( intersection_A_B, I_C_intersection.data(), J_C_intersection.data(), V_C_intersection.data(), I_C_intersection.size(), grb::SEQUENTIAL ); + Matrix< int > intersection_A_B( n, n ); + assert( SUCCESS == + buildMatrixUnique( + intersection_A_B, + I_C_intersection.data(), + J_C_intersection.data(), + V_C_intersection.data(), + I_C_intersection.size(), + SEQUENTIAL ) + ); checkCRSandCCS( C, intersection_A_B, rc ); - if( rc != grb::SUCCESS ) { + + if( rc != SUCCESS ) { return; } } @@ -389,13 +335,13 @@ int main( int argc, char ** argv ) { (void)argc; std::cout << "Functional test executable: " << argv[ 0 ] << "\n"; - grb::RC rc; + RC rc; grb::Launcher< grb::AUTOMATIC > launcher; - if( launcher.exec( &grbProgram, NULL, 0, rc ) != grb::SUCCESS ) { + if( launcher.exec( &grbProgram, NULL, 0, rc ) != SUCCESS ) { std::cerr << "Test failed to launch\n"; - rc = grb::FAILED; + rc = FAILED; } - if( rc == grb::SUCCESS ) { + if( rc == SUCCESS ) { std::cout << "Test OK\n" << std::endl; } else { std::cerr << std::flush; diff --git a/tests/unit/eWiseApplyMatrix_variants.cpp b/tests/unit/eWiseApplyMatrix_variants.cpp index 0e57b8f58..ca112ce8e 100644 --- a/tests/unit/eWiseApplyMatrix_variants.cpp +++ b/tests/unit/eWiseApplyMatrix_variants.cpp @@ -18,10 +18,10 @@ /* * @author Benjamin Lozes * @date 24th of May, 2023 - * - * @brief Test for eWiseApply(Matrix, Monoid) + * + * @brief Test for eWiseApply(Matrix, Monoid) * and eWiseApply(Matrix, Operator) variants - * + * * This test is meant to ensure the behaviour of the eWiseApply(Matrix, Monoid) * and eWiseApply(Matrix, Operator) variants is correct. Precisely, we expect * the following behaviour: @@ -30,7 +30,7 @@ * provided identity value for the zero elements. 
* - eWiseApply(Matrix, Operator) should apply the operator to all elements * of the two matrices, EXCLUDING the couples (non_zero, zero) - * + * */ #include @@ -40,162 +40,137 @@ #include -#define _DEBUG +using namespace grb; using nz_type = int; -constexpr size_t M = 10; -constexpr size_t N = 10; constexpr nz_type A_INITIAL_VALUE = 1; constexpr nz_type B_INITIAL_VALUE = 3; -namespace utils { - template< class Iterator > - void printSparseMatrixIterator( size_t rows, size_t cols, Iterator begin, Iterator end, const std::string & name = "", std::ostream & os = std::cout ) { -#ifndef _DEBUG - return; -#endif - std::cout << "Matrix \"" << name << "\" (" << rows << "x" << cols << "):" << std::endl << "[" << std::endl; - if( rows > 50 || cols > 50 ) { - os << " Matrix too large to print" << std::endl; - } else { - // os.precision( 3 ); - for( size_t y = 0; y < rows; y++ ) { - os << std::string( 3, ' ' ); - for( size_t x = 0; x < cols; x++ ) { - auto nnz_val = std::find_if( begin, end, [ y, x ]( const typename std::iterator_traits< Iterator >::value_type & a ) { - return a.first.first == y && a.first.second == x; - } ); - if( nnz_val != end ) - os << std::fixed << ( *nnz_val ).second; - else - os << '_'; - os << " "; - } - os << std::endl; - } - } - os << "]" << std::endl; - std::flush( os ); - } - template< typename D > - void printSparseMatrix( const grb::Matrix< D > & mat, const std::string & name = "", std::ostream & os = std::cout ) { - grb::wait( mat ); - printSparseMatrixIterator( grb::nrows( mat ), grb::ncols( mat ), mat.cbegin(), mat.cend(), name, os ); +template< typename D > +bool equals_matrix( + const Matrix< D > & A, + const Matrix< D > & B +) { + if( nrows( A ) != nrows( B ) || ncols( A ) != ncols( B ) ){ + return false; } - template< typename D > - bool equals_matrix( const grb::Matrix< D > & A, const grb::Matrix< D > & B ) { - if( grb::nrows( A ) != grb::nrows( B ) || grb::ncols( A ) != grb::ncols( B ) ) - return false; - grb::wait( A ); - grb::wait( B ); - std::vector< std::pair< std::pair< size_t, size_t >, D > > A_vec( A.cbegin(), A.cend() ); - std::vector< std::pair< std::pair< size_t, size_t >, D > > B_vec( B.cbegin(), B.cend() ); - return std::is_permutation( A_vec.cbegin(), A_vec.cend(), B_vec.cbegin() ); - } -} // namespace utils + wait( A ); + wait( B ); + + std::vector< std::pair< std::pair< size_t, size_t >, D > > A_vec( A.cbegin(), A.cend() ); + std::vector< std::pair< std::pair< size_t, size_t >, D > > B_vec( B.cbegin(), B.cend() ); + return std::is_permutation( A_vec.cbegin(), A_vec.cend(), B_vec.cbegin() ); +} template< class Monoid > struct input_t { - const grb::Matrix< nz_type > & A; - const grb::Matrix< nz_type > & B; - const grb::Matrix< nz_type > & C_monoid; - const grb::Matrix< nz_type > & C_operator; + const Matrix< nz_type > & A; + const Matrix< nz_type > & B; + const Matrix< nz_type > & C_monoid; + const Matrix< nz_type > & C_operator; const Monoid & monoid; - input_t( - const grb::Matrix< nz_type > & A = {0,0}, - const grb::Matrix< nz_type > & B = {0,0}, - const grb::Matrix< nz_type > & C_monoid = {0,0}, - const grb::Matrix< nz_type > & C_operator = {0,0}, - const Monoid & monoid = Monoid() ) : - A( A ), B( B ), C_monoid( C_monoid ), C_operator( C_operator ), monoid( monoid ) {} -}; + input_t( + const Matrix< nz_type > & A = {0,0}, + const Matrix< nz_type > & B = {0,0}, + const Matrix< nz_type > & C_monoid = {0,0}, + const Matrix< nz_type > & C_operator = {0,0}, + const Monoid & monoid = Monoid() + ) : A( A ), + B( B ), + C_monoid( C_monoid ), + 
C_operator( C_operator ), + monoid( monoid ) {} +}; struct output_t { - grb::RC rc; + RC rc; }; template< class Monoid > void grb_program( const input_t< Monoid > & input, output_t & output ) { - static_assert( grb::is_monoid< Monoid >::value, "Monoid required" ); - const auto & op = input.monoid.getOperator(); - grb::wait( input.A ); - grb::wait( input.B ); + static_assert( is_monoid< Monoid >::value, "Monoid required" ); + const auto &op = input.monoid.getOperator(); + wait( input.A ); + wait( input.B ); - auto & rc = output.rc; - - utils::printSparseMatrix( input.A, "A" ); - utils::printSparseMatrix( input.B, "B" ); + RC &rc = output.rc; { // Operator variant - std::cout << "-- eWiseApply using Operator, supposed to be annihilating non-zeroes -> INTERSECTION\n"; - grb::Matrix< nz_type > C( grb::nrows( input.A ), grb::ncols( input.A ) ); - rc = grb::eWiseApply( C, input.A, input.B, op, grb::Phase::RESIZE ); - grb::wait( C ); - if( rc != grb::RC::SUCCESS ) { + std::cout << "-- eWiseApply using Operator, supposed to be" + << " annihilating non-zeroes -> INTERSECTION\n"; + Matrix< nz_type > C( nrows( input.A ), ncols( input.A ) ); + rc = eWiseApply( C, input.A, input.B, op, RESIZE ); + wait( C ); + if( rc != SUCCESS ) { std::cerr << "Error: Phase::RESIZE\n"; return; } - rc = grb::eWiseApply( C, input.A, input.B, op, grb::Phase::EXECUTE ); - grb::wait( C ); - if( rc != grb::RC::SUCCESS ) { + rc = eWiseApply( C, input.A, input.B, op, EXECUTE ); + wait( C ); + if( rc != SUCCESS ) { std::cerr << "Error: Phase::EXECUTE\n"; return; } - if( ! utils::equals_matrix( C, input.C_operator ) ) { + if( !equals_matrix( C, input.C_operator ) ) { std::cerr << "Error: Wrong result\n"; - utils::printSparseMatrix( C, "Obtained (operator)", std::cerr ); - utils::printSparseMatrix( input.C_operator, "Truth (operator)", std::cerr ); - rc = grb::RC::FAILED; + rc = FAILED; return; } std::cout << "Result (operator) is correct\n"; } - { // Monoid variant - std::cout << "-- eWiseApply using Monoid, supposed to consider non-zeroes as the identity -> UNION\n"; - grb::Matrix< nz_type > C( grb::nrows( input.A ), grb::ncols( input.A ) ); - rc = grb::eWiseApply( C, input.A, input.B, input.monoid, grb::Phase::RESIZE ); - grb::wait( C ); - if( rc != grb::RC::SUCCESS ) { - std::cerr << "Error: Phase::RESIZE\n"; - return; - } - rc = grb::eWiseApply( C, input.A, input.B, input.monoid, grb::Phase::EXECUTE ); - grb::wait( C ); - if( rc != grb::RC::SUCCESS ) { - std::cerr << "Error: Phase::EXECUTE\n"; - return; - } - - if( ! 
utils::equals_matrix( C, input.C_monoid ) ) { - std::cerr << "Error: Wrong result\n"; - utils::printSparseMatrix( C, "Obtained (monoid)", std::cerr ); - utils::printSparseMatrix( input.C_monoid, "Truth (monoid)", std::cerr ); - rc = grb::RC::FAILED; - return; - } - - std::cout << "Result (monoid) is correct\n"; - } - - rc = grb::RC::SUCCESS; + // { // Monoid variant + // std::cout << "-- eWiseApply using Monoid, supposed to consider" + // << " non-zeroes as the identity -> UNION\n"; + // Matrix< nz_type > C( nrows( input.A ), ncols( input.A ) ); + // rc = eWiseApply( C, input.A, input.B, input.monoid, RESIZE ); + // wait( C ); + // if( rc != SUCCESS ) { + // std::cerr << "Error: Phase::RESIZE\n"; + // return; + // } + // rc = eWiseApply( C, input.A, input.B, input.monoid, EXECUTE ); + // wait( C ); + // if( rc != SUCCESS ) { + // std::cerr << "Error: Phase::EXECUTE\n"; + // return; + // } + + // if( !equals_matrix( C, input.C_monoid ) ) { + // std::cerr << "Error: Wrong result\n"; + // rc = FAILED; + // return; + // } + + // std::cout << "Result (monoid) is correct\n"; + // } + + rc = SUCCESS; } int main( int argc, char ** argv ) { (void) argc; (void) argv; - if(argc > 1) std::cout << "Usage: " << argv[ 0 ] << std::endl; + size_t N = 10; - std::cout << "This is functional test " << argv[ 0 ] << std::endl; - grb::Launcher< grb::EXEC_MODE::AUTOMATIC > launcher; - grb::RC rc = grb::RC::SUCCESS; + if( argc > 2 ) { + std::cout << "Usage: " << argv[ 0 ] << std::endl; + return 1; + } + if( argc == 2 ) { + N = std::stoul( argv[ 1 ] ); + } + + std::cout << "This is functional test " << argv[ 0 ] << std::endl << std::flush; + + Launcher< AUTOMATIC > launcher; // Create input data /** Matrix A: Row matrix filled with A_INITIAL_VALUE @@ -206,12 +181,13 @@ int main( int argc, char ** argv ) { * _ _ _ _ _ * (...) */ - grb::Matrix< nz_type > A( M, N, N ); + Matrix< nz_type > A( N, N, N ); std::vector< size_t > A_rows( N, 0 ), A_cols( N, 0 ); std::vector< nz_type > A_values( N, A_INITIAL_VALUE ); std::iota( A_cols.begin(), A_cols.end(), 0 ); - rc = grb::buildMatrixUnique( A, A_rows.data(), A_cols.data(), A_values.data(), A_values.size(), grb::IOMode::SEQUENTIAL ); - assert( rc == grb::RC::SUCCESS ); + if( SUCCESS != + buildMatrixUnique( A, A_rows.data(), A_cols.data(), A_values.data(), A_values.size(), SEQUENTIAL ) + ) { return 2; } /** Matrix B: Column matrix filled with B_INITIAL_VALUE * Y _ _ _ _ @@ -221,12 +197,13 @@ int main( int argc, char ** argv ) { * Y _ _ _ _ * (...) */ - grb::Matrix< nz_type > B( M, N, N ); - std::vector< size_t > B_rows( M, 0 ), B_cols( M, 0 ); - std::vector< nz_type > B_values( M, B_INITIAL_VALUE ); + Matrix< nz_type > B( N, N, N ); + std::vector< size_t > B_rows( N, 0 ), B_cols( N, 0 ); + std::vector< nz_type > B_values( N, B_INITIAL_VALUE ); std::iota( B_rows.begin(), B_rows.end(), 0 ); - rc = grb::buildMatrixUnique( B, B_rows.data(), B_cols.data(), B_values.data(), B_values.size(), grb::IOMode::SEQUENTIAL ); - assert( rc == grb::RC::SUCCESS ); + if( SUCCESS != + buildMatrixUnique( B, B_rows.data(), B_cols.data(), B_values.data(), B_values.size(), SEQUENTIAL ) + ) { return 3; } { /** Matrix C_monoid_truth: Union of A and B @@ -237,17 +214,25 @@ int main( int argc, char ** argv ) { * Y ___ ___ ___ ___ * (...) 
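	 * Here X = A_INITIAL_VALUE and Y = B_INITIAL_VALUE; the single coordinate
	 * (0, 0) that is a nonzero of both A and B holds X + Y under the additive
	 * monoid, while every other union entry keeps its lone operand unchanged
	 * (the monoid identity is zero).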
*/ - grb::Matrix< nz_type > C_monoid_truth( M, N ); - size_t nvalues = grb::nrows( A ) + grb::ncols( B ) - 1; + Matrix< nz_type > C_monoid_truth( N, N ); + size_t nvalues = nrows( A ) + ncols( B ) - 1; std::vector< size_t > C_monoid_truth_rows( nvalues, 0 ), C_monoid_truth_cols( nvalues, 0 ); std::vector< nz_type > C_monoid_truth_values( nvalues, 0 ); C_monoid_truth_values[ 0 ] = A_INITIAL_VALUE + B_INITIAL_VALUE; - std::iota( C_monoid_truth_rows.begin() + grb::nrows( A ), C_monoid_truth_rows.end(), 1 ); - std::iota( C_monoid_truth_cols.begin() + 1, C_monoid_truth_cols.begin() + grb::nrows( A ), 1 ); - std::fill( C_monoid_truth_values.begin() + 1, C_monoid_truth_values.begin() + grb::nrows( A ), A_INITIAL_VALUE ); - std::fill( C_monoid_truth_values.begin() + grb::nrows( A ), C_monoid_truth_values.end(), B_INITIAL_VALUE ); - rc = grb::buildMatrixUnique( C_monoid_truth, C_monoid_truth_rows.data(), C_monoid_truth_cols.data(), C_monoid_truth_values.data(), C_monoid_truth_values.size(), grb::IOMode::SEQUENTIAL ); - assert( rc == grb::RC::SUCCESS ); + std::iota( C_monoid_truth_rows.begin() + nrows( A ), C_monoid_truth_rows.end(), 1 ); + std::iota( C_monoid_truth_cols.begin() + 1, C_monoid_truth_cols.begin() + nrows( A ), 1 ); + std::fill( C_monoid_truth_values.begin() + 1, C_monoid_truth_values.begin() + nrows( A ), A_INITIAL_VALUE ); + std::fill( C_monoid_truth_values.begin() + nrows( A ), C_monoid_truth_values.end(), B_INITIAL_VALUE ); + if( SUCCESS != + buildMatrixUnique( + C_monoid_truth, + C_monoid_truth_rows.data(), + C_monoid_truth_cols.data(), + C_monoid_truth_values.data(), + C_monoid_truth_values.size(), + SEQUENTIAL + ) + ) { return 4; } /** Matrix C_op_truth: Intersection of A and B * X+Y ___ ___ ___ ___ @@ -257,28 +242,42 @@ int main( int argc, char ** argv ) { * ___ ___ ___ ___ ___ * (...) 
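	 * Only coordinate (0, 0) is a nonzero of both A and B, so the operator
	 * (intersection) variant is expected to produce exactly one entry, X + Y.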
*/ - grb::Matrix< nz_type > C_op_truth( M, N ); + Matrix< nz_type > C_op_truth( N, N ); std::vector< size_t > C_op_truth_rows( 1, 0 ), C_op_truth_cols( 1, 0 ); std::vector< nz_type > C_op_truth_values( 1, A_INITIAL_VALUE + B_INITIAL_VALUE ); - rc = grb::buildMatrixUnique( C_op_truth, C_op_truth_rows.data(), C_op_truth_cols.data(), C_op_truth_values.data(), C_op_truth_values.size(), grb::IOMode::SEQUENTIAL ); - assert( rc == grb::RC::SUCCESS ); + if( SUCCESS != + buildMatrixUnique( + C_op_truth, + C_op_truth_rows.data(), + C_op_truth_cols.data(), + C_op_truth_values.data(), + C_op_truth_values.size(), + SEQUENTIAL + ) + ) { return 5; } { /** Test using addition operator, same type for lhs and rhs */ - input_t< grb::Monoid< grb::operators::add< nz_type >, grb::identities::zero > > input { A, B, C_monoid_truth, C_op_truth, - grb::Monoid< grb::operators::add< nz_type >, grb::identities::zero >() }; - output_t output { grb::RC::SUCCESS }; + input_t< + Monoid< operators::add< nz_type >, identities::zero > + > input { A, B, C_monoid_truth, C_op_truth }; + output_t output { SUCCESS }; // Run the test - rc = launcher.exec( &grb_program, input, output, false ); + RC rc = launcher.exec( &grb_program, input, output, false ); // Check the result - assert( rc == grb::RC::SUCCESS ); - if( output.rc != grb::RC::SUCCESS ) { - std::cout << "Test FAILED (" << grb::toString( output.rc ) << ")" << std::endl; - return 1; + if( rc != SUCCESS ) { + std::cerr << "Error: Launcher::exec\n"; + return 6; + } + if( output.rc != SUCCESS ) { + std::cerr << "Test FAILED (" << toString( output.rc ) << ")" << std::endl; + return 7; } } } - std::cout << "Test OK" << std::endl; + std::cerr << std::flush; + std::cout << "Test OK" << std::endl << std::flush; + return 0; } From 8ee55c5be13394512c4fb0bc6e8644b83e9d1a44 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Wed, 12 Jul 2023 18:03:27 +0200 Subject: [PATCH 08/37] Non-square matrix CCS assignment bugfix --- include/graphblas/reference/blas3.hpp | 63 +++++++++++++++++++-------- tests/unit/eWiseApply_matrix.cpp | 47 ++++++++++---------- 2 files changed, 69 insertions(+), 41 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 97c585118..05f445f5d 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1062,11 +1062,19 @@ namespace grb { const size_t nzc = capacity( C ); +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 +#pragma omp parallel for simd +#endif + for( size_t i = 0; i <= n_A; i++ ) { + C_ccs_raw.col_start[ i ] = 0; + } + C_crs_raw.col_start[ 0 ] = 0; - C_ccs_raw.col_start[ 0 ] = 0; // Prefix sum computation into L.CRS.col_start #ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel for default( none ) shared( B_raw, A_raw, C_crs_raw, std::cout ) firstprivate( m_A ) +#pragma omp parallel for default( none ) \ + shared( B_raw, A_raw, C_crs_raw, C_ccs_raw, std::cout ) \ + firstprivate( m_A ) #endif for( size_t i = 0; i < m_A; i++ ) { auto B_k = B_raw.col_start[ i ]; @@ -1086,6 +1094,7 @@ namespace grb { } if( B_raw.row_index[ B_k ] == j ) { cumul += 1; + C_ccs_raw.col_start[ j + 1 ] += 1; } } C_crs_raw.col_start[ i + 1 ] = cumul; @@ -1093,41 +1102,57 @@ namespace grb { #ifdef _DEBUG // Print the CRS prefix sum - std::cout << "CRS prefix sum: "; + std::cout << "before nCRS prefix sum: "; for( size_t i = 0; i <= m_A; i++ ) { std::cout << C_crs_raw.col_start[ i ] << " "; } std::cout << "\n"; + // Print the CCS prefix sum + std::cout << "before nCCS prefix sum: "; + for( size_t i = 
0; i <= n_A; i++ ) { + std::cout << C_ccs_raw.col_start[ i ] << " "; + } + std::cout << "\n"; #endif // Apply the prefix sum for( size_t i = 1; i <= m_A; i++ ) { C_crs_raw.col_start[ i ] += C_crs_raw.col_start[ i - 1 ]; - C_ccs_raw.col_start[ i ] = C_crs_raw.col_start[ i ]; } + for ( size_t i = 1; i <= n_A; i++ ) { + C_ccs_raw.col_start[ i ] += C_ccs_raw.col_start[ i - 1 ]; + } + +#ifdef _DEBUG + // Print the CRS prefix sum + std::cout << "after nCRS prefix sum: "; + for( size_t i = 0; i <= m_A; i++ ) { + std::cout << C_crs_raw.col_start[ i ] << " "; + } + std::cout << "\n"; + // Print the CCS prefix sum + std::cout << "after nCCS prefix sum: "; + for( size_t i = 0; i <= n_A; i++ ) { + std::cout << C_ccs_raw.col_start[ i ] << " "; + } + std::cout << "\n"; +#endif // Check if the number of nonzeros is greater than the capacity - if( C_crs_raw.col_start[ m_A ] > nzc || C_ccs_raw.col_start[ m_A ] > nzc ) { + if( C_crs_raw.col_start[ m_A ] > nzc || C_ccs_raw.col_start[ n_A ] > nzc ) { #ifdef _DEBUG std::cout << "Insufficient capacity detected for requested operation.\n" - << "Requested " << C_crs_raw.col_start[ m_A ] << " nonzeros" + << "Requested " << C_ccs_raw.col_start[ m_A ] << " nonzeros" << " but capacity is " << nzc << "\n"; #endif return MISMATCH; } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel for simd -#endif - for( size_t i = 0; i < m_A; i++ ) { - C_crs_raw.row_index[ i ] = C_ccs_raw.row_index[ i ] = 0; - } - RC local_rc = rc; #ifdef _H_GRB_REFERENCE_OMP_BLAS3 #pragma omp parallel default( none ) \ - shared( C_ccs_raw, C_crs_raw, A_raw, B_raw, rc, std::cout ) \ - firstprivate( local_rc, m_A, oper, A_identity, B_identity ) + shared( C_ccs_raw, C_crs_raw, A_raw, B_raw, rc, std::cout ) \ + firstprivate( local_rc, m_A, oper, A_identity, B_identity ) #endif { size_t start_row = 0; @@ -1144,6 +1169,7 @@ namespace grb { const auto A_k_end = A_raw.col_start[ i + 1 ]; for( auto A_k = A_k_start; A_k < A_k_end; ++A_k ) { const auto j = A_raw.row_index[ A_k ]; + while( B_k < B_raw.col_start[ i + 1 ] && B_raw.row_index[ B_k ] > j ) { @@ -1165,8 +1191,9 @@ namespace grb { C_crs_raw.row_index[ C_k ] = j; C_crs_raw.setValue( C_k, c_val ); - C_ccs_raw.row_index[ C_k ] = i; - C_ccs_raw.setValue( C_k, c_val ); + + C_ccs_raw.row_index[ C_ccs_raw.col_start[ j ] ] = i; + C_ccs_raw.setValue( C_ccs_raw.col_start[ j ], c_val ); #ifdef _DEBUG std::cout << "A( " + std::to_string( i ) + ";" + std::to_string( j ) + " ) = " @@ -1174,7 +1201,7 @@ namespace grb { std::cout << "B( " + std::to_string( i ) + ";" + std::to_string( j ) + " ) = " + std::to_string( b_val ) + "\n"; - std::cout << "C.crs( " + std::to_string( i ) + ";" + std::cout << "C( " + std::to_string( i ) + ";" + std::to_string( j ) + " ) = " + std::to_string( c_val ) + "\n"; #endif diff --git a/tests/unit/eWiseApply_matrix.cpp b/tests/unit/eWiseApply_matrix.cpp index 48db8af3c..98e8c33c7 100644 --- a/tests/unit/eWiseApply_matrix.cpp +++ b/tests/unit/eWiseApply_matrix.cpp @@ -23,23 +23,24 @@ using namespace grb; -void grb_program( const int &, grb::RC &rc ) { +void grb_program( const size_t &n, grb::RC &rc ) { // large non-square mixed-domain matrix check { - grb::Matrix< char > A( 10000000, 2000000 ); - grb::Matrix< float > B( 10000000, 2000000 ); - grb::Matrix< size_t > C( 10000000, 2000000 ); - size_t * I = new size_t[ 2000000 ]; - size_t * J = new size_t[ 2000000 ]; - char * V = new char[ 2000000 ]; - for( size_t k = 0; k < 2000000; ++k ) { - I[ k ] = J[ k ] = k; + grb::Matrix< char > A( n, 2*n ); + grb::Matrix< float > B( n, 2*n ); + 
grb::Matrix< size_t > C( n, 2*n ); + size_t * I = new size_t[ n ]; + size_t * J = new size_t[ n ]; + char * V = new char[ n ]; + for( size_t k = 0; k < n; ++k ) { + I[ k ] = k; + J[ k ] = k+n; V[ k ] = 2; } - rc = grb::buildMatrixUnique( A, I, J, V, 2000000, SEQUENTIAL ); - rc = rc ? rc : grb::buildMatrixUnique( B, I, J, V, 2000000, SEQUENTIAL ); - rc = rc ? rc : grb::buildMatrixUnique( C, I, J, V, 2000000, SEQUENTIAL ); + rc = grb::buildMatrixUnique( A, I, J, V, n, SEQUENTIAL ); + rc = rc ? rc : grb::buildMatrixUnique( B, I, J, V, n, SEQUENTIAL ); + rc = rc ? rc : grb::buildMatrixUnique( C, I, J, V, n, SEQUENTIAL ); rc = rc ? rc : grb::eWiseApply( C, A, B, grb::operators::add< float, size_t, char >(), RESIZE ); rc = rc ? rc : grb::eWiseApply( C, A, B, @@ -49,13 +50,14 @@ void grb_program( const int &, grb::RC &rc ) { << "mixed-domain matrix check\n"; return; } + for( const auto &triple : C ) { - const size_t &i = triple.first.first; - const size_t &j = triple.first.second; - const size_t &v = triple.second; - if( i != j ) { - std::cout << "Unexpected entry at position ( " << i << ", " << j << " ) " - << "-- only expected entries on the diagonal\n"; + const auto &i = triple.first.first; + const auto &j = triple.first.second; + const auto &v = triple.second; + if( j != i+n ) { + std::cout << "Unexpected entry at position ( " << i << ", " << i+n << " ) " + << "-- only expected entries on the n-th diagonal\n"; rc = FAILED; } if( v != 4 ) { @@ -74,15 +76,14 @@ void grb_program( const int &, grb::RC &rc ) { int main( int argc, char ** argv ) { // defaults - bool printUsage = false; - int input = 0; // unused + size_t input = 1000; // unused // error checking if( argc > 1 ) { - printUsage = true; + input = std::strtoul( argv[ 1 ], nullptr, 10 ); } - if( printUsage ) { - std::cerr << "Usage: " << argv[ 0 ] << "\n"; + if( argc > 2 ) { + std::cerr << "Usage: " << argv[ 0 ] << "[n]\n"; return 1; } From 3a70d9b78a49a39ea496cf09179bb99766a00951 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Thu, 13 Jul 2023 09:26:14 +0200 Subject: [PATCH 09/37] Adapt changes in spy unit-test --- tests/unit/spy.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/spy.cpp b/tests/unit/spy.cpp index 71b8e8f28..780216d7f 100644 --- a/tests/unit/spy.cpp +++ b/tests/unit/spy.cpp @@ -82,7 +82,7 @@ void grb_program( const void * const fn_p, const size_t fn_length, grb::RC & rc if( rc == grb::SUCCESS ) { grb::Matrix< double > chk( p, q ); rc = rc ? rc : grb::resize( chk, grb::nnz( spy ) ); - rc = rc ? rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeOperator() ); + rc = rc ? rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeMonoid() ); if( rc == grb::SUCCESS && grb::nnz( chk ) != grb::nnz( spy ) ) { std::cerr << "Unexpected number of nonzeroes for chk: " << grb::nnz(chk) << ", expected " << grb::nnz(spy) << "\n"; rc = grb::FAILED; @@ -114,7 +114,7 @@ void grb_program( const void * const fn_p, const size_t fn_length, grb::RC & rc if( rc == grb::SUCCESS ) { grb::Matrix< double > chk( p, q ); rc = rc ? rc : grb::resize( chk, nnz( spy ) ); - rc = rc ? rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeOperator() ); + rc = rc ? 
rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeMonoid() ); if( rc == grb::SUCCESS && grb::nnz( chk ) != grb::nnz( spy ) ) { std::cerr << "Unexpected number of nonzeroes for chk (pattern): " << grb::nnz(chk) << ", expected " << grb::nnz(spy) << "\n"; rc = grb::FAILED; @@ -146,7 +146,7 @@ void grb_program( const void * const fn_p, const size_t fn_length, grb::RC & rc if( rc == grb::SUCCESS ) { grb::Matrix< double > chk( p, q ); rc = rc ? rc : grb::resize( chk, nnz( spy ) ); - rc = rc ? rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeOperator() ); + rc = rc ? rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeMonoid() ); if( rc == grb::SUCCESS && grb::nnz( chk ) != grb::nnz( spy ) ) { std::cerr << "Unexpected number of nonzeroes for chk (boolean): " << grb::nnz(chk) << ", expected " << grb::nnz(spy) << "\n"; rc = grb::FAILED; From 89ebb160a0ddc1c920c48e8834edfdc12bb64253 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Fri, 28 Jul 2023 11:56:40 +0200 Subject: [PATCH 10/37] Enable test for both variants --- tests/unit/eWiseApplyMatrix_variants.cpp | 50 ++++++++++++------------ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/tests/unit/eWiseApplyMatrix_variants.cpp b/tests/unit/eWiseApplyMatrix_variants.cpp index ca112ce8e..6c1ff2ed0 100644 --- a/tests/unit/eWiseApplyMatrix_variants.cpp +++ b/tests/unit/eWiseApplyMatrix_variants.cpp @@ -125,31 +125,31 @@ void grb_program( const input_t< Monoid > & input, output_t & output ) { std::cout << "Result (operator) is correct\n"; } - // { // Monoid variant - // std::cout << "-- eWiseApply using Monoid, supposed to consider" - // << " non-zeroes as the identity -> UNION\n"; - // Matrix< nz_type > C( nrows( input.A ), ncols( input.A ) ); - // rc = eWiseApply( C, input.A, input.B, input.monoid, RESIZE ); - // wait( C ); - // if( rc != SUCCESS ) { - // std::cerr << "Error: Phase::RESIZE\n"; - // return; - // } - // rc = eWiseApply( C, input.A, input.B, input.monoid, EXECUTE ); - // wait( C ); - // if( rc != SUCCESS ) { - // std::cerr << "Error: Phase::EXECUTE\n"; - // return; - // } - - // if( !equals_matrix( C, input.C_monoid ) ) { - // std::cerr << "Error: Wrong result\n"; - // rc = FAILED; - // return; - // } - - // std::cout << "Result (monoid) is correct\n"; - // } + { // Monoid variant + std::cout << "-- eWiseApply using Monoid, supposed to consider" + << " non-zeroes as the identity -> UNION\n"; + Matrix< nz_type > C( nrows( input.A ), ncols( input.A ) ); + rc = eWiseApply( C, input.A, input.B, input.monoid, RESIZE ); + wait( C ); + if( rc != SUCCESS ) { + std::cerr << "Error: Phase::RESIZE\n"; + return; + } + rc = eWiseApply( C, input.A, input.B, input.monoid, EXECUTE ); + wait( C ); + if( rc != SUCCESS ) { + std::cerr << "Error: Phase::EXECUTE\n"; + return; + } + + if( !equals_matrix( C, input.C_monoid ) ) { + std::cerr << "Error: Wrong result\n"; + rc = FAILED; + return; + } + + std::cout << "Result (monoid) is correct\n"; + } rc = SUCCESS; } From 9f8a43ea634bf43b77b63d3fa9b0f9590ec7476e Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Mon, 31 Jul 2023 17:23:33 +0200 Subject: [PATCH 11/37] Revert to stack implementation of the intersection variant --- include/graphblas/reference/blas3.hpp | 456 ++++++++++------------- tests/unit/eWiseApplyMatrix_variants.cpp | 34 +- 2 files changed, 210 insertions(+), 280 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 05f445f5d..7baa39025 100644 --- a/include/graphblas/reference/blas3.hpp +++ 
b/include/graphblas/reference/blas3.hpp @@ -137,7 +137,7 @@ namespace grb { const auto &B_raw = !trans_right ? internal::getCRS( B ) : internal::getCCS( B ); - auto &C_raw = internal::getCRS( C ); + auto &CRS_raw = internal::getCRS( C ); auto &CCS_raw = internal::getCCS( C ); char * arr = nullptr; @@ -175,7 +175,7 @@ namespace grb { if( crs_only && phase == RESIZE ) { // we are using an auxialiary CRS that we cannot resize ourselves // instead, we update the offset array only - C_raw.col_start[ 0 ] = 0; + CRS_raw.col_start[ 0 ] = 0; } // if crs_only, then the below implements its resize phase // if not crs_only, then the below is both crucial for the resize phase, @@ -202,7 +202,7 @@ namespace grb { if( crs_only && phase == RESIZE ) { // we are using an auxialiary CRS that we cannot resize ourselves // instead, we update the offset array only - C_raw.col_start[ i + 1 ] = nzc; + CRS_raw.col_start[ i + 1 ] = nzc; } } } @@ -259,7 +259,7 @@ namespace grb { // use previously computed CCS offset array to update CCS during the // computational phase nzc = 0; - C_raw.col_start[ 0 ] = 0; + CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { coors.clear(); for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { @@ -302,8 +302,8 @@ namespace grb { assert( nzc < old_nzc ); const size_t j = coors.index( k ); // update CRS - C_raw.row_index[ nzc ] = j; - C_raw.setValue( nzc, valbuf[ j ] ); + CRS_raw.row_index[ nzc ] = j; + CRS_raw.setValue( nzc, valbuf[ j ] ); // update CCS if( !crs_only ) { const size_t CCS_index = C_col_index[ j ]++ + CCS_raw.col_start[ j ]; @@ -313,7 +313,7 @@ namespace grb { // update count (void) ++nzc; } - C_raw.col_start[ i + 1 ] = nzc; + CRS_raw.col_start[ i + 1 ] = nzc; } #ifndef NDEBUG @@ -918,18 +918,7 @@ namespace grb { namespace internal { - /** - * \internal general elementwise matrix application that all eWiseApply - * variants refer to. - * @param[in] oper The operator corresponding to \a mulMonoid if - * \a allow_void is true; otherwise, an arbitrary operator - * under which to perform the eWiseApply. - * @param[in] mulMonoid The monoid under which to perform the eWiseApply if - * \a allow_void is true; otherwise, will be ignored. 
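// ---------------------------------------------------------------------------
// Editorial aside -- illustrative sketch only, NOT part of this patch or of
// the ALP/GraphBLAS API. The doc comment being removed here describes the
// contract this series implements: with a plain operator, only coordinates
// present in BOTH inputs produce an output nonzero (intersection); with a
// monoid, a coordinate present in only one input is paired with the monoid
// identity (union). The standalone sketch below, using hypothetical names and
// std::map as a stand-in for a sparse matrix, shows the two semantics for the
// addition operator with identity zero, as used by the tests in this series.
#include <cstddef>
#include <map>
#include <utility>

using Coord = std::pair< std::size_t, std::size_t >;
using SparseMap = std::map< Coord, int >;

// operator semantics: a coordinate survives only if both inputs hold it
SparseMap eWiseIntersection( const SparseMap &A, const SparseMap &B ) {
	SparseMap C;
	for( const auto &a : A ) {
		const auto b = B.find( a.first );
		if( b != B.end() ) {
			C[ a.first ] = a.second + b->second;
		}
	}
	return C;
}

// monoid semantics: a missing entry behaves as the identity (0 for addition),
// so the output pattern is the union of the two input patterns
SparseMap eWiseUnion( const SparseMap &A, const SparseMap &B ) {
	SparseMap C = A;
	for( const auto &b : B ) {
		C[ b.first ] += b.second; // value-initialised to 0 if absent
	}
	return C;
}
// The identity is what makes the union variant well-defined: it supplies the
// "missing" operand, which a bare operator cannot do.
// ---------------------------------------------------------------------------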
- * \endinternal - */ template< - bool allow_void, Descriptor descr, class Operator, typename OutputType, typename InputType1, typename InputType2, @@ -951,295 +940,240 @@ namespace grb { void >::type * const = nullptr ) { - assert( !( descr & descriptors::force_row_major ) ); - static_assert( allow_void || - ( !( - std::is_same< InputType1, void >::value - || std::is_same< InputType2, void >::value - ) +#ifdef _DEBUG + std::cout << "In grb::internal::eWiseApply_matrix_generic_intersection\n"; +#endif + assert( !(descr & descriptors::force_row_major ) ); + assert( phase != TRY ); + static_assert( + !( + std::is_same< InputType1, void >::value || + std::is_same< InputType2, void >::value ), "grb::internal::eWiseApply_matrix_generic_intersection: the non-monoid" - " version of elementwise mxm can only be used if neither of the input" - " matrices is a pattern matrix (of type void)" ); - assert( phase != TRY ); - + " version of elementwise mxm can only be used if neither of the" + " input matrices is a pattern matrix (of type void)" ); // get whether the matrices should be transposed prior to execution constexpr bool trans_left = descr & descriptors::transpose_left; constexpr bool trans_right = descr & descriptors::transpose_right; -#ifdef _DEBUG - std::cout << "In grb::internal::eWiseApply_matrix_generic_intersection\n"; -#endif - - if( grb::nnz( B ) == 0 || grb::nnz( A ) == 0 ) { -#ifdef _DEBUG - std::cout << "No nonzeros in input matrices, nothing to compute.\n"; -#endif - return SUCCESS; - } + // run-time checks + const size_t m = nrows( C ); + const size_t n = ncols( C ); + const size_t m_A = !trans_left ? nrows( A ) : ncols( A ); + const size_t n_A = !trans_left ? ncols( A ) : nrows( A ); + const size_t m_B = !trans_right ? nrows( B ) : ncols( B ); + const size_t n_B = !trans_right ? ncols( B ) : nrows( B ); - const auto &A_raw = trans_left - ? internal::getCCS( A ) - : internal::getCRS( A ); - const size_t m_A = trans_left - ? ncols( A ) - : nrows( A ); - const size_t n_A = trans_left - ? nrows( A ) - : ncols( A ); - if( m_A == 0 || n_A == 0 ) { -#ifdef _DEBUG - std::cout << "Matrix A is empty, nothing to compute.\n"; -#endif - return SUCCESS; + if( m != m_A || m != m_B || n != n_A || n != n_B ) { + return MISMATCH; } - const auto &B_raw = trans_right - ? internal::getCCS( B ) - : internal::getCRS( B ); - const size_t m_B = trans_right - ? ncols( B ) - : nrows( B ); - const size_t n_B = trans_right - ? nrows( B ) - : ncols( B ); - if( m_A == 0 || n_A == 0 ) { -#ifdef _DEBUG - std::cout << "Matrix B is empty, nothing to compute.\n"; -#endif - return SUCCESS; - } + const auto &A_raw = !trans_left ? + internal::getCRS( A ) : + internal::getCCS( A ); + const auto &B_raw = !trans_right ? 
+ internal::getCRS( B ) : + internal::getCCS( B ); + auto &CRS_raw = internal::getCRS( C ); + auto &CCS_raw = internal::getCCS( C ); + const auto dummy_identity = identities::zero< OutputType >::value(); - auto &C_crs_raw = internal::getCRS( C ); - auto &C_ccs_raw = internal::getCCS( C ); - const size_t m_C = nrows( C ); - const size_t n_C = ncols( C ); + // retrieve buffers + char * arr1, * arr2, * arr3, * buf1, * buf2, * buf3; + arr1 = arr2 = buf1 = buf2 = nullptr; + InputType1 * vbuf1 = nullptr; + InputType2 * vbuf2 = nullptr; + OutputType * valbuf = nullptr; + internal::getMatrixBuffers( arr1, buf1, vbuf1, 1, A ); + internal::getMatrixBuffers( arr2, buf2, vbuf2, 1, B ); + internal::getMatrixBuffers( arr3, buf3, valbuf, 1, C ); + // end buffer retrieval - // Check mask dimensions - if( m_A != m_B || n_A != n_B || m_A != m_C || n_A != n_C ) { -#ifdef _DEBUG - std::cout << "Dimensions of matrices do not match!\n"; + // initialisations + internal::Coordinates< reference > coors1, coors2; + coors1.set( arr1, false, buf1, n ); + coors2.set( arr2, false, buf2, n ); +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 +#pragma omp parallel for simd default(none) shared(CCS_raw) #endif - return MISMATCH; + for( size_t j = 0; j <= n; ++j ) { + CCS_raw.col_start[ j ] = 0; } + // end initialisations - const auto A_identity = identities::zero< InputType1 >::value(); - const auto B_identity = identities::zero< InputType2 >::value(); + // nonzero count + size_t nzc = 0; - RC rc = SUCCESS; - if( phase == Phase::RESIZE ) { - size_t nzc = 0; -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel for reduction( + : nzc ) \ - default( none ) shared( B_raw, A_raw ) \ - firstprivate( m_A ) -#endif - for( size_t i = 0; i < m_A; ++i ) { - auto B_k = B_raw.col_start[ i ]; - const auto A_k_start = A_raw.col_start[ i ]; - const auto A_k_end = A_raw.col_start[ i + 1 ]; - for( auto A_k = A_k_start; A_k < A_k_end; ++A_k ) { - const auto j = A_raw.row_index[ A_k ]; - - while( B_k < B_raw.col_start[ i + 1 ] - && B_raw.row_index[ B_k ] > j - ) { - B_k++; - } - if( B_k >= B_raw.col_start[ i + 1 ] ) { - break; - } - if( B_raw.row_index[ B_k ] == j ) { - nzc += 1; + // symbolic phase + if( phase == RESIZE ) { + for( size_t i = 0; i < m; ++i ) { + coors1.clear(); + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; + coors1.assign( k_col ); + } + for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { + const size_t l_col = B_raw.row_index[ l ]; + if( coors1.assigned( l_col ) ) { + (void) ++nzc; } } } + + const RC ret = grb::resize( C, nzc ); #ifdef _DEBUG - std::cout << "resize( C, " << nzc << " )\n"; + std::cout << "grb::resize( C, " << nzc << " ) = " << ret << "\n"; #endif - return resize( C, nzc ); + return ret; } - const size_t nzc = capacity( C ); + // computational phase + if( phase == EXECUTE ) { + // retrieve additional buffer + config::NonzeroIndexType * const C_col_index = internal::template + getReferenceBuffer< typename config::NonzeroIndexType >( n + 1 ); #ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel for simd +#pragma omp parallel for simd default(none) shared(C_col_index) #endif - for( size_t i = 0; i <= n_A; i++ ) { - C_ccs_raw.col_start[ i ] = 0; - } + for( size_t j = 0; j < n; ++j ) { + C_col_index[ j ] = 0; + } - C_crs_raw.col_start[ 0 ] = 0; - // Prefix sum computation into L.CRS.col_start -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel for default( none ) \ - shared( B_raw, A_raw, C_crs_raw, C_ccs_raw, 
std::cout ) \ - firstprivate( m_A ) -#endif - for( size_t i = 0; i < m_A; i++ ) { - auto B_k = B_raw.col_start[ i ]; - size_t cumul = 0UL; - const auto A_k_start = A_raw.col_start[ i ]; - const auto A_k_end = A_raw.col_start[ i + 1 ]; - for( auto A_k = A_k_start; A_k < A_k_end; ++A_k ) { - const auto j = A_raw.row_index[ A_k ]; - - while( B_k < B_raw.col_start[ i + 1 ] - && B_raw.row_index[ B_k ] > j - ) { - B_k++; - } - if( B_k >= B_raw.col_start[ i + 1 ]) { - break; + // perform column-wise nonzero count + for( size_t i = 0; i < m; ++i ) { + coors1.clear(); + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; + coors1.assign( k_col ); } - if( B_raw.row_index[ B_k ] == j ) { - cumul += 1; - C_ccs_raw.col_start[ j + 1 ] += 1; + for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { + const size_t l_col = B_raw.row_index[ l ]; + if( coors1.assigned( l_col ) ) { + (void) ++nzc; + (void) ++CCS_raw.col_start[ l_col + 1 ]; + } } } - C_crs_raw.col_start[ i + 1 ] = cumul; - } + // check capacity + if( nzc > capacity( C ) ) { #ifdef _DEBUG - // Print the CRS prefix sum - std::cout << "before nCRS prefix sum: "; - for( size_t i = 0; i <= m_A; i++ ) { - std::cout << C_crs_raw.col_start[ i ] << " "; - } - std::cout << "\n"; - // Print the CCS prefix sum - std::cout << "before nCCS prefix sum: "; - for( size_t i = 0; i <= n_A; i++ ) { - std::cout << C_ccs_raw.col_start[ i ] << " "; - } - std::cout << "\n"; + std::cout << "\t detected insufficient capacity " + << "for requested operation\n"; #endif + const RC clear_rc = clear( C ); + if( clear_rc != SUCCESS ) { + return PANIC; + } else { + return FAILED; + } + } + + // prefix sum for CCS_raw.col_start + assert( CCS_raw.col_start[ 0 ] == 0 ); + for( size_t j = 1; j < n; ++j ) { + CCS_raw.col_start[ j + 1 ] += CCS_raw.col_start[ j ]; + } + assert( CCS_raw.col_start[ n ] == nzc ); - // Apply the prefix sum - for( size_t i = 1; i <= m_A; i++ ) { - C_crs_raw.col_start[ i ] += C_crs_raw.col_start[ i - 1 ]; - } - for ( size_t i = 1; i <= n_A; i++ ) { - C_ccs_raw.col_start[ i ] += C_ccs_raw.col_start[ i - 1 ]; - } + // do computations + bool columns[ n ] = { false }; + bool columns2[ n ] = { false }; + size_t nzc = 0; + CRS_raw.col_start[ 0 ] = 0; + for( size_t i = 0; i < m; ++i ) { #ifdef _DEBUG - // Print the CRS prefix sum - std::cout << "after nCRS prefix sum: "; - for( size_t i = 0; i <= m_A; i++ ) { - std::cout << C_crs_raw.col_start[ i ] << " "; - } - std::cout << "\n"; - // Print the CCS prefix sum - std::cout << "after nCCS prefix sum: "; - for( size_t i = 0; i <= n_A; i++ ) { - std::cout << C_ccs_raw.col_start[ i ] << " "; - } - std::cout << "\n"; + std::cout << " -- i: " << i << "\n"; #endif - // Check if the number of nonzeros is greater than the capacity - if( C_crs_raw.col_start[ m_A ] > nzc || C_ccs_raw.col_start[ n_A ] > nzc ) { + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; + columns[ k_col ] = true; + valbuf[ k_col ] = A_raw.getValue( k, dummy_identity ); #ifdef _DEBUG - std::cout << "Insufficient capacity detected for requested operation.\n" - << "Requested " << C_ccs_raw.col_start[ m_A ] << " nonzeros" - << " but capacity is " << nzc << "\n"; + std::cout << "Found A( " << i << ", " << k_col << " ) = " << A_raw.getValue( k, dummy_identity ) << "\n"; #endif - return MISMATCH; - } + } - RC local_rc = rc; -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel default( none ) \ - 
shared( C_ccs_raw, C_crs_raw, A_raw, B_raw, rc, std::cout ) \ - firstprivate( local_rc, m_A, oper, A_identity, B_identity ) + for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { + const size_t l_col = B_raw.row_index[ l ]; + if( !columns[ l_col ] ) { // Union case: ignored + continue; + } + const auto valbuf_value_before = valbuf[ l_col ]; + (void)grb::apply( valbuf[ l_col ], valbuf_value_before, B_raw.getValue( l, dummy_identity ), oper ); + columns2[ l_col ] = true; +#ifdef _DEBUG + std::cout << "Found intersection: B(" << i << ";" << l_col << ")=" << B_raw.getValue( l, dummy_identity ) + << " && A(" << i << ";" << l_col << ")=" << valbuf_value_before + << " ==> C(" << i << ";" << l_col << ")=" << valbuf[ l_col ] << "\n"; #endif - { - size_t start_row = 0; - size_t end_row = m_A; + } + #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - config::OMP::localRange( start_row, end_row, 0, m_A ); + #pragma omp parallel for simd #endif + for( size_t i = 0; i < n; i++ ) { + columns[ i ] = false; + } - for( size_t i = start_row; i < end_row; ++i ) { - auto B_k = B_raw.col_start[ i ]; - auto C_k = C_crs_raw.col_start[ i ]; - - const auto A_k_start = A_raw.col_start[ i ]; - const auto A_k_end = A_raw.col_start[ i + 1 ]; - for( auto A_k = A_k_start; A_k < A_k_end; ++A_k ) { - const auto j = A_raw.row_index[ A_k ]; - - while( B_k < B_raw.col_start[ i + 1 ] - && B_raw.row_index[ B_k ] > j - ) { - B_k++; - } - if( B_k >= B_raw.col_start[ i + 1 ] ) { - break; - } - if( B_raw.row_index[ B_k ] != j ) { + for( size_t j_unsigned = n ; j_unsigned > 0 ; j_unsigned-- ) { + const size_t j = j_unsigned - 1; + if( !columns2[ j ] ) { continue; } + // update CRS + CRS_raw.row_index[ nzc ] = j; + CRS_raw.setValue( nzc, valbuf[ j ] ); + // update CCS + C_col_index[ j ]++; + const size_t CCS_index = CCS_raw.col_start[ j+1 ] - C_col_index[ j ]; + CCS_raw.row_index[ CCS_index ] = i; + CCS_raw.setValue( CCS_index, valbuf[ j ] ); + // update count + (void)++nzc; + } + CRS_raw.col_start[ i + 1 ] = nzc; - const InputType1 a_val = A_raw.getValue( A_k, A_identity ); - const InputType2 b_val = B_raw.getValue( B_k, B_identity ); - OutputType c_val; - local_rc = local_rc - ? local_rc - : grb::apply< descr >( c_val, a_val, b_val, oper ); - - C_crs_raw.row_index[ C_k ] = j; - C_crs_raw.setValue( C_k, c_val ); - - C_ccs_raw.row_index[ C_ccs_raw.col_start[ j ] ] = i; - C_ccs_raw.setValue( C_ccs_raw.col_start[ j ], c_val ); -#ifdef _DEBUG - std::cout << "A( " + std::to_string( i ) + ";" - + std::to_string( j ) + " ) = " - + std::to_string( a_val ) + "\n"; - std::cout << "B( " + std::to_string( i ) + ";" - + std::to_string( j ) + " ) = " - + std::to_string( b_val ) + "\n"; - std::cout << "C( " + std::to_string( i ) + ";" - + std::to_string( j ) + " ) = " - + std::to_string( c_val ) + "\n"; +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp parallel for simd #endif - C_k += 1; + for( size_t i = 0; i < n; i++ ) { + columns2[ i ] = false; } } - if( local_rc != SUCCESS ) { -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp critical +#ifdef _DEBUG + std::cout << "CCS_raw.col_start = [ "; + for( size_t j = 0; j <= n; ++j ) + std::cout << CCS_raw.col_start[ j ] << " "; + std::cout << "]\n"; + std::cout << "C_col_index = [ "; + for( size_t j = 0; j < n; ++j ) + std::cout << C_col_index[ j ] << " "; + std::cout << "]\n"; +#endif +#ifndef NDEBUG + for( size_t j = 0; j < n; ++j ) { + assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); + } #endif - { - rc = rc ? 
rc : local_rc; - } - } - } + // set final number of nonzeroes in output matrix #ifdef _DEBUG - std::cout << "internal::setCurrentNonzeroes( C, " - << C_crs_raw.col_start[ m_A ] << " )\n"; + std::cout << "internal::setCurrentNonzeroes( C, " << nzc << " )\n"; #endif - internal::setCurrentNonzeroes( C, C_crs_raw.col_start[ m_A ] ); + internal::setCurrentNonzeroes( C, nzc ); + } - return rc; + // done + return SUCCESS; } - /** - * \internal general elementwise matrix application that all eWiseApply - * variants refer to. - * @param[in] oper The operator corresponding to \a mulMonoid if - * \a allow_void is true; otherwise, an arbitrary operator - * under which to perform the eWiseApply. - * @param[in] mulMonoid The monoid under which to perform the eWiseApply if - * \a allow_void is true; otherwise, will be ignored. - * \endinternal - */ template< - bool allow_void, Descriptor descr, class Monoid, typename OutputType, typename InputType1, typename InputType2, @@ -1261,19 +1195,11 @@ namespace grb { void >::type * const = nullptr ) { - assert( !(descr & descriptors::force_row_major ) ); - static_assert( allow_void || - ( !( - std::is_same< InputType1, void >::value || - std::is_same< InputType2, void >::value - ) ), - "grb::internal::eWiseApply_matrix_generic_union: the non-monoid" - " version of elementwise mxm can only be used if neither of the" - " input matrices is a pattern matrix (of type void)" ); - assert( phase != TRY ); #ifdef _DEBUG std::cout << "In grb::internal::eWiseApply_matrix_generic_union\n"; #endif + assert( phase != TRY ); + assert( !(descr & descriptors::force_row_major ) ); // get whether the matrices should be transposed prior to execution constexpr bool trans_left = descr & descriptors::transpose_left; constexpr bool trans_right = descr & descriptors::transpose_right; @@ -1301,7 +1227,7 @@ namespace grb { const auto &B_raw = !trans_right ? 
internal::getCRS( B ) : internal::getCCS( B ); - auto &C_raw = internal::getCRS( C ); + auto &CRS_raw = internal::getCRS( C ); auto &CCS_raw = internal::getCCS( C ); @@ -1421,7 +1347,7 @@ namespace grb { // do computations bool columns[ n ] = { false }; size_t nzc = 0; - C_raw.col_start[ 0 ] = 0; + CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { #ifdef _DEBUG std::cout << " -- i: " << i << "\n"; @@ -1461,8 +1387,8 @@ namespace grb { continue; } // update CRS - C_raw.row_index[ nzc ] = j; - C_raw.setValue( nzc, valbuf[ j ] ); + CRS_raw.row_index[ nzc ] = j; + CRS_raw.setValue( nzc, valbuf[ j ] ); // update CCS C_col_index[ j ]++; const size_t CCS_index = CCS_raw.col_start[ j+1 ] - C_col_index[ j ]; @@ -1471,7 +1397,7 @@ namespace grb { // update count (void)++nzc; } - C_raw.col_start[ i + 1 ] = nzc; + CRS_raw.col_start[ i + 1 ] = nzc; for( size_t i = 0; i < n; i++ ) { columns[ i ] = false; @@ -1558,7 +1484,7 @@ namespace grb { std::cout << "In grb::eWiseApply_matrix_generic( reference, monoid )\n"; #endif - return internal::eWiseApply_matrix_generic_union< true, descr >( + return internal::eWiseApply_matrix_generic_union< descr >( C, A, B, mulmono, phase ); } @@ -1621,7 +1547,7 @@ namespace grb { std::cout << "In grb::eWiseApply_matrix_generic( reference, operator )\n"; #endif - return internal::eWiseApply_matrix_generic_intersection< false, descr >( + return internal::eWiseApply_matrix_generic_intersection< descr >( C, A, B, mulOp, phase ); } diff --git a/tests/unit/eWiseApplyMatrix_variants.cpp b/tests/unit/eWiseApplyMatrix_variants.cpp index 6c1ff2ed0..711b8ca15 100644 --- a/tests/unit/eWiseApplyMatrix_variants.cpp +++ b/tests/unit/eWiseApplyMatrix_variants.cpp @@ -50,8 +50,8 @@ constexpr nz_type B_INITIAL_VALUE = 3; template< typename D > bool equals_matrix( - const Matrix< D > & A, - const Matrix< D > & B + const Matrix< D > &A, + const Matrix< D > &B ) { if( nrows( A ) != nrows( B ) || ncols( A ) != ncols( B ) ){ return false; @@ -60,25 +60,29 @@ bool equals_matrix( wait( A ); wait( B ); - std::vector< std::pair< std::pair< size_t, size_t >, D > > A_vec( A.cbegin(), A.cend() ); - std::vector< std::pair< std::pair< size_t, size_t >, D > > B_vec( B.cbegin(), B.cend() ); + std::vector< + std::pair< std::pair< size_t, size_t >, D > + > A_vec( A.cbegin(), A.cend() ); + std::vector< + std::pair< std::pair< size_t, size_t >, D > + > B_vec( B.cbegin(), B.cend() ); return std::is_permutation( A_vec.cbegin(), A_vec.cend(), B_vec.cbegin() ); } template< class Monoid > struct input_t { - const Matrix< nz_type > & A; - const Matrix< nz_type > & B; - const Matrix< nz_type > & C_monoid; - const Matrix< nz_type > & C_operator; - const Monoid & monoid; + const Matrix< nz_type > &A; + const Matrix< nz_type > &B; + const Matrix< nz_type > &C_monoid; + const Matrix< nz_type > &C_operator; + const Monoid &monoid; input_t( - const Matrix< nz_type > & A = {0,0}, - const Matrix< nz_type > & B = {0,0}, - const Matrix< nz_type > & C_monoid = {0,0}, - const Matrix< nz_type > & C_operator = {0,0}, - const Monoid & monoid = Monoid() + const Matrix< nz_type > &A = {0,0}, + const Matrix< nz_type > &B = {0,0}, + const Matrix< nz_type > &C_monoid = {0,0}, + const Matrix< nz_type > &C_operator = {0,0}, + const Monoid &monoid = Monoid() ) : A( A ), B( B ), C_monoid( C_monoid ), @@ -91,7 +95,7 @@ struct output_t { }; template< class Monoid > -void grb_program( const input_t< Monoid > & input, output_t & output ) { +void grb_program( const input_t< Monoid > &input, output_t &output ) { static_assert( 
is_monoid< Monoid >::value, "Monoid required" ); const auto &op = input.monoid.getOperator(); wait( input.A ); From ef2b8da231a6e0be02a0a7b0b4738a6abd10df06 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Mon, 31 Jul 2023 17:30:11 +0200 Subject: [PATCH 12/37] omp bugfix --- include/graphblas/reference/blas3.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 7baa39025..dd85856af 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -995,7 +995,7 @@ namespace grb { coors1.set( arr1, false, buf1, n ); coors2.set( arr2, false, buf2, n ); #ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel for simd default(none) shared(CCS_raw) +#pragma omp parallel for simd default(none) shared(CCS_raw) firstprivate(n) #endif for( size_t j = 0; j <= n; ++j ) { CCS_raw.col_start[ j ] = 0; @@ -1035,7 +1035,7 @@ namespace grb { getReferenceBuffer< typename config::NonzeroIndexType >( n + 1 ); #ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel for simd default(none) shared(C_col_index) +#pragma omp parallel for simd default(none) shared(C_col_index) firstprivate(n) #endif for( size_t j = 0; j < n; ++j ) { C_col_index[ j ] = 0; From 045242e8b7eaaba2d405974422beca15b621aeb4 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Thu, 3 Aug 2023 22:04:57 +0200 Subject: [PATCH 13/37] Replace local buffer with coordinates array --- include/graphblas/reference/blas3.hpp | 38 +++++++++------------------ 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index dd85856af..a7b99c350 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1080,8 +1080,6 @@ namespace grb { // do computations - bool columns[ n ] = { false }; - bool columns2[ n ] = { false }; size_t nzc = 0; CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { @@ -1091,7 +1089,7 @@ namespace grb { for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; - columns[ k_col ] = true; + coors1.assign( k_col ); valbuf[ k_col ] = A_raw.getValue( k, dummy_identity ); #ifdef _DEBUG std::cout << "Found A( " << i << ", " << k_col << " ) = " << A_raw.getValue( k, dummy_identity ) << "\n"; @@ -1100,12 +1098,12 @@ namespace grb { for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { const size_t l_col = B_raw.row_index[ l ]; - if( !columns[ l_col ] ) { // Union case: ignored + if( !coors1.assigned( l_col ) ) { // Union case: ignored continue; } const auto valbuf_value_before = valbuf[ l_col ]; (void)grb::apply( valbuf[ l_col ], valbuf_value_before, B_raw.getValue( l, dummy_identity ), oper ); - columns2[ l_col ] = true; + coors2.assign( l_col ); #ifdef _DEBUG std::cout << "Found intersection: B(" << i << ";" << l_col << ")=" << B_raw.getValue( l, dummy_identity ) << " && A(" << i << ";" << l_col << ")=" << valbuf_value_before @@ -1113,16 +1111,11 @@ namespace grb { #endif } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel for simd -#endif - for( size_t i = 0; i < n; i++ ) { - columns[ i ] = false; - } + coors1.clear(); for( size_t j_unsigned = n ; j_unsigned > 0 ; j_unsigned-- ) { const size_t j = j_unsigned - 1; - if( !columns2[ j ] ) { + if( !coors2.assigned( j ) ) { continue; } // update CRS @@ -1138,12 +1131,7 @@ namespace grb { } CRS_raw.col_start[ i + 1 ] = nzc; -#ifdef 
_H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel for simd -#endif - for( size_t i = 0; i < n; i++ ) { - columns2[ i ] = false; - } + coors2.clear(); } #ifdef _DEBUG @@ -1345,7 +1333,7 @@ namespace grb { // do computations - bool columns[ n ] = { false }; + size_t nzc = 0; CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { @@ -1355,7 +1343,7 @@ namespace grb { for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; - columns[ k_col ] = true; + coors1.assign( k_col ); valbuf[ k_col ] = A_raw.getValue( k, identity_A ); #ifdef _DEBUG std::cout << "Found A( " << i << ", " << k_col << " ) = " << A_raw.getValue( k, identity_A ) << "\n"; @@ -1364,7 +1352,7 @@ namespace grb { for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { const size_t l_col = B_raw.row_index[ l ]; - if( columns[ l_col ] ) { // Intersection case + if( coors1.assigned( l_col ) ) { // Intersection case const auto valbuf_value_before = valbuf[ l_col ]; (void)grb::apply( valbuf[ l_col ], valbuf_value_before, B_raw.getValue( l, identity_B ), oper ); #ifdef _DEBUG @@ -1376,14 +1364,14 @@ namespace grb { #ifdef _DEBUG std::cout << "Found B( " << i << ", " << l_col << " ) = " << B_raw.getValue( l, identity_B ) << "\n"; #endif - columns[ l_col ] = true; + coors1.assign( l_col ); valbuf[ l_col ] = B_raw.getValue( l, identity_B ); } } for( size_t j_unsigned = n ; j_unsigned > 0 ; j_unsigned-- ) { const size_t j = j_unsigned - 1; - if( !columns[ j ] ) { + if( !coors1.assigned( j ) ) { continue; } // update CRS @@ -1399,9 +1387,7 @@ namespace grb { } CRS_raw.col_start[ i + 1 ] = nzc; - for( size_t i = 0; i < n; i++ ) { - columns[ i ] = false; - } + coors1.clear(); } #ifdef _DEBUG From 154c592a44aa02652a6fa9a353ace55f5de09f26 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Fri, 4 Aug 2023 10:44:12 +0200 Subject: [PATCH 14/37] Bugfix for union variant --- include/graphblas/reference/blas3.hpp | 29 ++++++++------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index a7b99c350..e718ac35b 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1340,33 +1340,22 @@ namespace grb { #ifdef _DEBUG std::cout << " -- i: " << i << "\n"; #endif - + coors1.clear(); for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; coors1.assign( k_col ); valbuf[ k_col ] = A_raw.getValue( k, identity_A ); -#ifdef _DEBUG - std::cout << "Found A( " << i << ", " << k_col << " ) = " << A_raw.getValue( k, identity_A ) << "\n"; -#endif } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { const size_t l_col = B_raw.row_index[ l ]; - if( coors1.assigned( l_col ) ) { // Intersection case - const auto valbuf_value_before = valbuf[ l_col ]; - (void)grb::apply( valbuf[ l_col ], valbuf_value_before, B_raw.getValue( l, identity_B ), oper ); -#ifdef _DEBUG - std::cout << "Found intersection: B(" << i << ";" << l_col << ")=" << B_raw.getValue( l, identity_B ) - << " && A(" << i << ";" << l_col << ")=" << valbuf_value_before - << " ==> C(" << i << ";" << l_col << ")=" << valbuf[ l_col ] << "\n"; -#endif - } else { // Union case -#ifdef _DEBUG - std::cout << "Found B( " << i << ", " << l_col << " ) = " << B_raw.getValue( l, identity_B ) << "\n"; -#endif + const auto B_val = B_raw.getValue( l, identity_B ); + if( !coors1.assigned( l_col ) ) { // Union 
case coors1.assign( l_col ); - valbuf[ l_col ] = B_raw.getValue( l, identity_B ); + valbuf[ l_col ] = identity_A; } + const auto valbuf_value_before = valbuf[ l_col ]; + (void)grb::apply( valbuf[ l_col ], valbuf_value_before, B_val, oper ); } for( size_t j_unsigned = n ; j_unsigned > 0 ; j_unsigned-- ) { @@ -1386,8 +1375,6 @@ namespace grb { (void)++nzc; } CRS_raw.col_start[ i + 1 ] = nzc; - - coors1.clear(); } #ifdef _DEBUG @@ -1467,7 +1454,7 @@ namespace grb { ); #ifdef _DEBUG - std::cout << "In grb::eWiseApply_matrix_generic( reference, monoid )\n"; + std::cout << "In grb::eWiseApply( reference, monoid )\n"; #endif return internal::eWiseApply_matrix_generic_union< descr >( @@ -1530,7 +1517,7 @@ namespace grb { "input matrices is a pattern matrix (of type void)" ); #ifdef _DEBUG - std::cout << "In grb::eWiseApply_matrix_generic( reference, operator )\n"; + std::cout << "In grb::eWiseApply( reference, operator )\n"; #endif return internal::eWiseApply_matrix_generic_intersection< descr >( From 13a0ae32bbc57e6450fdc08e3317382bbf8e55e7 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Wed, 9 Aug 2023 17:56:00 +0200 Subject: [PATCH 15/37] Add support for descriptors::force_row_major --- include/graphblas/reference/blas3.hpp | 179 +++++++++++++++----------- 1 file changed, 104 insertions(+), 75 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index e718ac35b..a060ea408 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -943,7 +943,6 @@ namespace grb { #ifdef _DEBUG std::cout << "In grb::internal::eWiseApply_matrix_generic_intersection\n"; #endif - assert( !(descr & descriptors::force_row_major ) ); assert( phase != TRY ); static_assert( !( @@ -953,6 +952,7 @@ namespace grb { "grb::internal::eWiseApply_matrix_generic_intersection: the non-monoid" " version of elementwise mxm can only be used if neither of the" " input matrices is a pattern matrix (of type void)" ); + constexpr bool crs_only = descr & descriptors::force_row_major; // get whether the matrices should be transposed prior to execution constexpr bool trans_left = descr & descriptors::transpose_left; constexpr bool trans_right = descr & descriptors::transpose_right; @@ -965,7 +965,19 @@ namespace grb { const size_t m_B = !trans_right ? nrows( B ) : ncols( B ); const size_t n_B = !trans_right ? 
ncols( B ) : nrows( B ); + if( crs_only && (trans_left || trans_right) ) { +#ifdef _DEBUG + std::cerr << "grb::descriptors::force_row_major and " + << "grb::descriptors::transpose_left/right are mutually " + << "exclusive\n"; +#endif + return ILLEGAL; + } + if( m != m_A || m != m_B || n != n_A || n != n_B ) { +#ifdef _DEBUG + std::cerr << "grb::eWiseApply: dimensions of input matrices do not match\n"; +#endif return MISMATCH; } @@ -994,11 +1006,14 @@ namespace grb { internal::Coordinates< reference > coors1, coors2; coors1.set( arr1, false, buf1, n ); coors2.set( arr2, false, buf2, n ); + if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel for simd default(none) shared(CCS_raw) firstprivate(n) + #pragma omp parallel for simd default(none) \ + shared(CCS_raw) firstprivate(n) #endif - for( size_t j = 0; j <= n; ++j ) { - CCS_raw.col_start[ j ] = 0; + for( size_t j = 0; j <= n; ++j ) { + CCS_raw.col_start[ j ] = 0; + } } // end initialisations @@ -1034,11 +1049,14 @@ namespace grb { config::NonzeroIndexType * const C_col_index = internal::template getReferenceBuffer< typename config::NonzeroIndexType >( n + 1 ); + if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel for simd default(none) shared(C_col_index) firstprivate(n) + #pragma omp parallel for simd default(none) \ + shared(C_col_index) firstprivate(n) #endif - for( size_t j = 0; j < n; ++j ) { - C_col_index[ j ] = 0; + for( size_t j = 0; j < n; ++j ) { + C_col_index[ j ] = 0; + } } // perform column-wise nonzero count @@ -1052,7 +1070,9 @@ namespace grb { const size_t l_col = B_raw.row_index[ l ]; if( coors1.assigned( l_col ) ) { (void) ++nzc; - (void) ++CCS_raw.col_start[ l_col + 1 ]; + if( !crs_only ) { + (void) ++CCS_raw.col_start[ l_col + 1 ]; + } } } } @@ -1072,28 +1092,22 @@ namespace grb { } // prefix sum for CCS_raw.col_start - assert( CCS_raw.col_start[ 0 ] == 0 ); - for( size_t j = 1; j < n; ++j ) { - CCS_raw.col_start[ j + 1 ] += CCS_raw.col_start[ j ]; + if( !crs_only ) { + assert( CCS_raw.col_start[ 0 ] == 0 ); + for( size_t j = 1; j < n; ++j ) { + CCS_raw.col_start[ j + 1 ] += CCS_raw.col_start[ j ]; + } + assert( CCS_raw.col_start[ n ] == nzc ); } - assert( CCS_raw.col_start[ n ] == nzc ); - // do computations size_t nzc = 0; CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { -#ifdef _DEBUG - std::cout << " -- i: " << i << "\n"; -#endif - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; coors1.assign( k_col ); valbuf[ k_col ] = A_raw.getValue( k, dummy_identity ); -#ifdef _DEBUG - std::cout << "Found A( " << i << ", " << k_col << " ) = " << A_raw.getValue( k, dummy_identity ) << "\n"; -#endif } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { @@ -1104,11 +1118,6 @@ namespace grb { const auto valbuf_value_before = valbuf[ l_col ]; (void)grb::apply( valbuf[ l_col ], valbuf_value_before, B_raw.getValue( l, dummy_identity ), oper ); coors2.assign( l_col ); -#ifdef _DEBUG - std::cout << "Found intersection: B(" << i << ";" << l_col << ")=" << B_raw.getValue( l, dummy_identity ) - << " && A(" << i << ";" << l_col << ")=" << valbuf_value_before - << " ==> C(" << i << ";" << l_col << ")=" << valbuf[ l_col ] << "\n"; -#endif } coors1.clear(); @@ -1122,10 +1131,12 @@ namespace grb { CRS_raw.row_index[ nzc ] = j; CRS_raw.setValue( nzc, valbuf[ j ] ); // update CCS - C_col_index[ j ]++; - const size_t CCS_index = CCS_raw.col_start[ j+1 ] - C_col_index[ j ]; - CCS_raw.row_index[ 
CCS_index ] = i; - CCS_raw.setValue( CCS_index, valbuf[ j ] ); + if( !crs_only ) { + C_col_index[ j ]++; + const size_t CCS_index = CCS_raw.col_start[ j+1 ] - C_col_index[ j ]; + CCS_raw.row_index[ CCS_index ] = i; + CCS_raw.setValue( CCS_index, valbuf[ j ] ); + } // update count (void)++nzc; } @@ -1134,21 +1145,23 @@ namespace grb { coors2.clear(); } + if( !crs_only ) { #ifdef _DEBUG - std::cout << "CCS_raw.col_start = [ "; - for( size_t j = 0; j <= n; ++j ) - std::cout << CCS_raw.col_start[ j ] << " "; - std::cout << "]\n"; - std::cout << "C_col_index = [ "; - for( size_t j = 0; j < n; ++j ) - std::cout << C_col_index[ j ] << " "; - std::cout << "]\n"; + std::cout << "CCS_raw.col_start = [ "; + for( size_t j = 0; j <= n; ++j ) + std::cout << CCS_raw.col_start[ j ] << " "; + std::cout << "]\n"; + std::cout << "C_col_index = [ "; + for( size_t j = 0; j < n; ++j ) + std::cout << C_col_index[ j ] << " "; + std::cout << "]\n"; #endif #ifndef NDEBUG - for( size_t j = 0; j < n; ++j ) { - assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); - } + for( size_t j = 0; j < n; ++j ) { + assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); + } #endif + } // set final number of nonzeroes in output matrix #ifdef _DEBUG @@ -1187,11 +1200,20 @@ namespace grb { std::cout << "In grb::internal::eWiseApply_matrix_generic_union\n"; #endif assert( phase != TRY ); - assert( !(descr & descriptors::force_row_major ) ); + constexpr bool crs_only = descr & descriptors::force_row_major; // get whether the matrices should be transposed prior to execution constexpr bool trans_left = descr & descriptors::transpose_left; constexpr bool trans_right = descr & descriptors::transpose_right; + if( crs_only && (trans_left || trans_right) ) { +#ifdef _DEBUG + std::cerr << "grb::descriptors::force_row_major and " + << "grb::descriptors::transpose_left/right are mutually " + << "exclusive\n"; +#endif + return ILLEGAL; + } + // run-time checks const size_t m = nrows( C ); const size_t n = ncols( C ); @@ -1205,6 +1227,9 @@ namespace grb { const auto identity_B = monoid.template getIdentity< OutputType >(); if( m != m_A || m != m_B || n != n_A || n != n_B ) { +#ifdef _DEBUG + std::cerr << "grb::eWiseApply: dimensions of input matrices do not match\n"; +#endif return MISMATCH; } @@ -1234,21 +1259,14 @@ namespace grb { internal::Coordinates< reference > coors1, coors2; coors1.set( arr1, false, buf1, n ); coors2.set( arr2, false, buf2, n ); + if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel - { - size_t start, end; - config::OMP::localRange( start, end, 0, n + 1 ); -#else - const size_t start = 0; - const size_t end = n + 1; + #pragma omp parallel for simd default(none) shared(CCS_raw) firstprivate(n) #endif - for( size_t j = start; j < end; ++j ) { + for( size_t j = 0; j < n + 1; ++j ) { CCS_raw.col_start[ j ] = 0; } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 } -#endif // end initialisations // nonzero count @@ -1291,13 +1309,17 @@ namespace grb { const size_t k_col = A_raw.row_index[ k ]; coors1.assign( k_col ); (void) ++nzc; - (void) ++CCS_raw.col_start[ k_col + 1 ]; + if( !crs_only ) { + (void) ++CCS_raw.col_start[ k_col + 1 ]; + } } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { const size_t l_col = B_raw.row_index[ l ]; if( not coors1.assigned( l_col ) ) { (void) ++nzc; - (void) ++CCS_raw.col_start[ l_col + 1 ]; + if( !crs_only ) { + (void) ++CCS_raw.col_start[ l_col + 1 ]; + } } } } @@ -1317,18 +1339,22 @@ namespace grb { } 
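// ---------------------------------------------------------------------------
// Editorial aside -- illustrative sketch only, NOT part of this patch. The
// EXECUTE-phase hunks around this point build the output CCS with the usual
// three steps: count the nonzeroes of column j into col_start[ j + 1 ], turn
// the counts into offsets with a prefix sum, then scatter row indices and
// values using a per-column cursor (C_col_index above, which fills each
// column from its upper offset downwards). A minimal standalone version of
// that pattern follows; all names are hypothetical, and for simplicity it
// fills each column front-to-back instead.
#include <cstddef>
#include <vector>

struct Entry { std::size_t i, j; double v; }; // one nonzero: row, column, value

void buildCCS(
	const std::vector< Entry > &entries, const std::size_t n,
	std::vector< std::size_t > &col_start, // column offsets, size n + 1
	std::vector< std::size_t > &row_index, // row of each nonzero, size nnz
	std::vector< double > &values          // value of each nonzero, size nnz
) {
	// 1) count the nonzeroes of column j into col_start[ j + 1 ]
	col_start.assign( n + 1, 0 );
	for( const Entry &e : entries ) {
		(void) ++col_start[ e.j + 1 ];
	}
	// 2) prefix sum turns per-column counts into starting offsets
	for( std::size_t j = 1; j <= n; ++j ) {
		col_start[ j ] += col_start[ j - 1 ];
	}
	// 3) scatter, advancing a per-column cursor as slots are consumed
	row_index.resize( entries.size() );
	values.resize( entries.size() );
	std::vector< std::size_t > cursor( n, 0 );
	for( const Entry &e : entries ) {
		const std::size_t pos = col_start[ e.j ] + cursor[ e.j ]++;
		row_index[ pos ] = e.i;
		values[ pos ] = e.v;
	}
}
// The two passes keep the construction at O(nnz + n) work, which is why the
// same count / prefix-sum / scatter shape recurs throughout this series.
// ---------------------------------------------------------------------------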
// prefix sum for CCS_raw.col_start - assert( CCS_raw.col_start[ 0 ] == 0 ); - for( size_t j = 1; j < n; ++j ) { - CCS_raw.col_start[ j + 1 ] += CCS_raw.col_start[ j ]; + if( !crs_only ) { + assert( CCS_raw.col_start[ 0 ] == 0 ); + for( size_t j = 1; j < n; ++j ) { + CCS_raw.col_start[ j + 1 ] += CCS_raw.col_start[ j ]; + } + assert( CCS_raw.col_start[ n ] == nzc ); } - assert( CCS_raw.col_start[ n ] == nzc ); // set C_col_index to all zero + if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel for simd + #pragma omp parallel for simd #endif - for( size_t j = 0; j < n; j++ ) { - C_col_index[ j ] = 0; + for( size_t j = 0; j < n; j++ ) { + C_col_index[ j ] = 0; + } } @@ -1367,31 +1393,34 @@ namespace grb { CRS_raw.row_index[ nzc ] = j; CRS_raw.setValue( nzc, valbuf[ j ] ); // update CCS - C_col_index[ j ]++; - const size_t CCS_index = CCS_raw.col_start[ j+1 ] - C_col_index[ j ]; - CCS_raw.row_index[ CCS_index ] = i; - CCS_raw.setValue( CCS_index, valbuf[ j ] ); + if( !crs_only ) { + const size_t CCS_index = CCS_raw.col_start[ j+1 ] - C_col_index[ j ]++; + CCS_raw.row_index[ CCS_index ] = i; + CCS_raw.setValue( CCS_index, valbuf[ j ] ); + } // update count (void)++nzc; } CRS_raw.col_start[ i + 1 ] = nzc; } + if( !crs_only ) { #ifdef _DEBUG - std::cout << "CCS_raw.col_start = [ "; - for( size_t j = 0; j <= n; ++j ) - std::cout << CCS_raw.col_start[ j ] << " "; - std::cout << "]\n"; - std::cout << "C_col_index = [ "; - for( size_t j = 0; j < n; ++j ) - std::cout << C_col_index[ j ] << " "; - std::cout << "]\n"; + std::cout << "CCS_raw.col_start = [ "; + for( size_t j = 0; j <= n; ++j ) + std::cout << CCS_raw.col_start[ j ] << " "; + std::cout << "]\n"; + std::cout << "C_col_index = [ "; + for( size_t j = 0; j < n; ++j ) + std::cout << C_col_index[ j ] << " "; + std::cout << "]\n"; #endif #ifndef NDEBUG - for( size_t j = 0; j < n; ++j ) { - assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); - } + for( size_t j = 0; j < n; ++j ) { + assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); + } #endif + } // set final number of nonzeroes in output matrix #ifdef _DEBUG From 7eba1fed1c5ff9af73c5c101fd041157e18b4e4a Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Tue, 31 Oct 2023 16:23:47 +0100 Subject: [PATCH 16/37] Bugfix + improvement --- include/graphblas/reference/blas3.hpp | 77 +++++++++++++++++---------- 1 file changed, 48 insertions(+), 29 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index a060ea408..a49b16c48 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -992,20 +992,17 @@ namespace grb { const auto dummy_identity = identities::zero< OutputType >::value(); // retrieve buffers - char * arr1, * arr2, * arr3, * buf1, * buf2, * buf3; - arr1 = arr2 = buf1 = buf2 = nullptr; + char * arr1, * arr3, * buf1, * buf3; + arr1 = buf1 = nullptr; InputType1 * vbuf1 = nullptr; - InputType2 * vbuf2 = nullptr; OutputType * valbuf = nullptr; internal::getMatrixBuffers( arr1, buf1, vbuf1, 1, A ); - internal::getMatrixBuffers( arr2, buf2, vbuf2, 1, B ); internal::getMatrixBuffers( arr3, buf3, valbuf, 1, C ); // end buffer retrieval // initialisations - internal::Coordinates< reference > coors1, coors2; + internal::Coordinates< reference > coors1; coors1.set( arr1, false, buf1, n ); - coors2.set( arr2, false, buf2, n ); if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 #pragma omp parallel for simd default(none) \ @@ 
-1022,6 +1019,7 @@ namespace grb { // symbolic phase if( phase == RESIZE ) { + nzc = 0; for( size_t i = 0; i < m; ++i ) { coors1.clear(); for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { @@ -1045,6 +1043,7 @@ namespace grb { // computational phase if( phase == EXECUTE ) { + nzc = 0; // retrieve additional buffer config::NonzeroIndexType * const C_col_index = internal::template getReferenceBuffer< typename config::NonzeroIndexType >( n + 1 ); @@ -1101,9 +1100,11 @@ namespace grb { } // do computations - size_t nzc = 0; + nzc = 0; CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { + coors1.clear(); + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; coors1.assign( k_col ); @@ -1111,57 +1112,75 @@ namespace grb { } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { - const size_t l_col = B_raw.row_index[ l ]; - if( !coors1.assigned( l_col ) ) { // Union case: ignored + const size_t j = B_raw.row_index[ l ]; + if( !coors1.assigned( j ) ) { // Union case: ignored continue; } - const auto valbuf_value_before = valbuf[ l_col ]; - (void)grb::apply( valbuf[ l_col ], valbuf_value_before, B_raw.getValue( l, dummy_identity ), oper ); - coors2.assign( l_col ); - } - coors1.clear(); + const auto valbuf_value_before = valbuf[ j ]; + OutputType result_value; + (void)grb::apply( result_value, valbuf_value_before, B_raw.getValue( l, dummy_identity ), oper ); - for( size_t j_unsigned = n ; j_unsigned > 0 ; j_unsigned-- ) { - const size_t j = j_unsigned - 1; - if( !coors2.assigned( j ) ) { - continue; - } // update CRS CRS_raw.row_index[ nzc ] = j; - CRS_raw.setValue( nzc, valbuf[ j ] ); + CRS_raw.setValue( nzc, result_value ); + // update CCS if( !crs_only ) { C_col_index[ j ]++; const size_t CCS_index = CCS_raw.col_start[ j+1 ] - C_col_index[ j ]; CCS_raw.row_index[ CCS_index ] = i; - CCS_raw.setValue( CCS_index, valbuf[ j ] ); + CCS_raw.setValue( CCS_index, result_value ); } + // update count (void)++nzc; } + CRS_raw.col_start[ i + 1 ] = nzc; - coors2.clear(); } - if( !crs_only ) { + #ifdef _DEBUG + std::cout << "CRS_raw.col_start = [ "; + for( size_t j = 0; j <= m; ++j ) + std::cout << CRS_raw.col_start[ j ] << " "; + std::cout << "]\n"; + std::cout << "CRS_raw.row_index = [ "; + for( size_t j = 0; j < nzc; ++j ) + std::cout << CRS_raw.row_index[ j ] << " "; + std::cout << "]\n"; + std::cout << "CRS_raw.values = [ "; + for( size_t j = 0; j < nzc; ++j ) + std::cout << CRS_raw.values[ j ] << " "; + std::cout << "]\n"; + if( !crs_only ) { + std::cout << "C_col_index = [ "; + for( size_t j = 0; j < n; ++j ) + std::cout << C_col_index[ j ] << " "; + std::cout << "]\n"; std::cout << "CCS_raw.col_start = [ "; for( size_t j = 0; j <= n; ++j ) std::cout << CCS_raw.col_start[ j ] << " "; std::cout << "]\n"; - std::cout << "C_col_index = [ "; - for( size_t j = 0; j < n; ++j ) - std::cout << C_col_index[ j ] << " "; + std::cout << "CCS_raw.row_index = [ "; + for( size_t j = 0; j < nzc; ++j ) + std::cout << CCS_raw.row_index[ j ] << " "; + std::cout << "]\n"; + std::cout << "CCS_raw.values = [ "; + for( size_t j = 0; j < nzc; ++j ) + std::cout << CCS_raw.values[ j ] << " "; std::cout << "]\n"; + } #endif + #ifndef NDEBUG - for( size_t j = 0; j < n; ++j ) { + if( !crs_only ) { + for( size_t j = 0; j < n; ++j ) assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); - } -#endif } +#endif // set final number of nonzeroes in output matrix #ifdef _DEBUG From 
b84c55dbc59cc29e153a462a6f04256723b24f49 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Wed, 1 Nov 2023 11:56:33 +0100 Subject: [PATCH 17/37] New unit-test cases --- tests/unit/CMakeLists.txt | 1 + tests/unit/eWiseApplyMatrix_variants.cpp | 218 +++++++++++++++++------ 2 files changed, 161 insertions(+), 58 deletions(-) diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 815db9d2b..5ac228625 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -87,6 +87,7 @@ add_grb_executables( ewiseapply ewiseapply.cpp add_grb_executables( eWiseApplyMatrix_variants eWiseApplyMatrix_variants.cpp BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking + ADDITIONAL_LINK_LIBRARIES test_utils_headers ) add_grb_executables( eWiseMatrix eWiseMatrix.cpp diff --git a/tests/unit/eWiseApplyMatrix_variants.cpp b/tests/unit/eWiseApplyMatrix_variants.cpp index 711b8ca15..cab4b4ec7 100644 --- a/tests/unit/eWiseApplyMatrix_variants.cpp +++ b/tests/unit/eWiseApplyMatrix_variants.cpp @@ -39,9 +39,11 @@ #include #include +#include using namespace grb; + using nz_type = int; constexpr nz_type A_INITIAL_VALUE = 1; @@ -53,23 +55,26 @@ bool equals_matrix( const Matrix< D > &A, const Matrix< D > &B ) { - if( nrows( A ) != nrows( B ) || ncols( A ) != ncols( B ) ){ + if( nrows( A ) != nrows( B ) || + ncols( A ) != ncols( B ) || + nnz( A ) != nnz( B ) + ) { return false; } wait( A ); wait( B ); - std::vector< - std::pair< std::pair< size_t, size_t >, D > + std::vector< + std::pair< std::pair< size_t, size_t >, D > > A_vec( A.cbegin(), A.cend() ); - std::vector< - std::pair< std::pair< size_t, size_t >, D > + std::vector< + std::pair< std::pair< size_t, size_t >, D > > B_vec( B.cbegin(), B.cend() ); return std::is_permutation( A_vec.cbegin(), A_vec.cend(), B_vec.cbegin() ); } -template< class Monoid > +template< class Monoid, Descriptor descr = descriptors::no_operation > struct input_t { const Matrix< nz_type > &A; const Matrix< nz_type > &B; @@ -82,11 +87,11 @@ struct input_t { const Matrix< nz_type > &B = {0,0}, const Matrix< nz_type > &C_monoid = {0,0}, const Matrix< nz_type > &C_operator = {0,0}, - const Monoid &monoid = Monoid() - ) : A( A ), - B( B ), + const Monoid &monoid = Monoid() + ) : A( A ), + B( B ), C_monoid( C_monoid ), - C_operator( C_operator ), + C_operator( C_operator ), monoid( monoid ) {} }; @@ -94,32 +99,35 @@ struct output_t { RC rc; }; -template< class Monoid > -void grb_program( const input_t< Monoid > &input, output_t &output ) { +template< class Monoid, Descriptor descr > +void grb_program( const input_t< Monoid, descr > &input, output_t &output ) { static_assert( is_monoid< Monoid >::value, "Monoid required" ); const auto &op = input.monoid.getOperator(); - wait( input.A ); - wait( input.B ); RC &rc = output.rc; { // Operator variant - std::cout << "-- eWiseApply using Operator, supposed to be" + std::cout << " -- eWiseApply using Operator, supposed to be" << " annihilating non-zeroes -> INTERSECTION\n"; Matrix< nz_type > C( nrows( input.A ), ncols( input.A ) ); - rc = eWiseApply( C, input.A, input.B, op, RESIZE ); - wait( C ); + + rc = eWiseApply( C, input.A, input.B, op, RESIZE ); if( rc != SUCCESS ) { std::cerr << "Error: Phase::RESIZE\n"; return; } - rc = eWiseApply( C, input.A, input.B, op, EXECUTE ); - wait( C ); + if( capacity( C ) < nnz( input.C_operator ) ) { + std::cerr << "Error: Capacity should be at least " << nnz( input.C_operator ) << "\n"; + rc = FAILED; + return; + } + + rc = eWiseApply( C, input.A, input.B, op, EXECUTE ); if( 
rc != SUCCESS ) { std::cerr << "Error: Phase::EXECUTE\n"; return; } - + print_matrix( C, 10, "C (intersection)" ); if( !equals_matrix( C, input.C_operator ) ) { std::cerr << "Error: Wrong result\n"; rc = FAILED; @@ -130,22 +138,27 @@ void grb_program( const input_t< Monoid > &input, output_t &output ) { } { // Monoid variant - std::cout << "-- eWiseApply using Monoid, supposed to consider" + std::cout << " -- eWiseApply using Monoid, supposed to consider" << " non-zeroes as the identity -> UNION\n"; Matrix< nz_type > C( nrows( input.A ), ncols( input.A ) ); - rc = eWiseApply( C, input.A, input.B, input.monoid, RESIZE ); - wait( C ); + + rc = eWiseApply( C, input.A, input.B, input.monoid, RESIZE ); if( rc != SUCCESS ) { std::cerr << "Error: Phase::RESIZE\n"; return; } - rc = eWiseApply( C, input.A, input.B, input.monoid, EXECUTE ); - wait( C ); + if( capacity( C ) < nnz( input.C_operator ) ) { + std::cerr << "Error: Capacity should be at least " << nnz( input.C_monoid ) << "\n"; + rc = FAILED; + return; + } + + rc = eWiseApply( C, input.A, input.B, input.monoid, EXECUTE ); if( rc != SUCCESS ) { std::cerr << "Error: Phase::EXECUTE\n"; return; } - + print_matrix( C, 10, "C (union)" ); if( !equals_matrix( C, input.C_monoid ) ) { std::cerr << "Error: Wrong result\n"; rc = FAILED; @@ -165,7 +178,7 @@ int main( int argc, char ** argv ) { size_t N = 10; if( argc > 2 ) { - std::cout << "Usage: " << argv[ 0 ] << std::endl; + std::cout << "Usage: " << argv[ 0 ] << " [n=" << N << "]" << std::endl; return 1; } if( argc == 2 ) { @@ -186,12 +199,14 @@ int main( int argc, char ** argv ) { * (...) */ Matrix< nz_type > A( N, N, N ); - std::vector< size_t > A_rows( N, 0 ), A_cols( N, 0 ); - std::vector< nz_type > A_values( N, A_INITIAL_VALUE ); - std::iota( A_cols.begin(), A_cols.end(), 0 ); - if( SUCCESS != - buildMatrixUnique( A, A_rows.data(), A_cols.data(), A_values.data(), A_values.size(), SEQUENTIAL ) - ) { return 2; } + { + std::vector< size_t > A_rows( N, 0 ), A_cols( N, 0 ); + std::vector< nz_type > A_values( N, A_INITIAL_VALUE ); + std::iota( A_cols.begin(), A_cols.end(), 0 ); + if( SUCCESS != + buildMatrixUnique( A, A_rows.data(), A_cols.data(), A_values.data(), A_values.size(), SEQUENTIAL ) + ) { return 2; } + } /** Matrix B: Column matrix filled with B_INITIAL_VALUE * Y _ _ _ _ @@ -202,14 +217,17 @@ int main( int argc, char ** argv ) { * (...) 
*/ Matrix< nz_type > B( N, N, N ); - std::vector< size_t > B_rows( N, 0 ), B_cols( N, 0 ); - std::vector< nz_type > B_values( N, B_INITIAL_VALUE ); - std::iota( B_rows.begin(), B_rows.end(), 0 ); - if( SUCCESS != - buildMatrixUnique( B, B_rows.data(), B_cols.data(), B_values.data(), B_values.size(), SEQUENTIAL ) - ) { return 3; } - { + std::vector< size_t > B_rows( N, 0 ), B_cols( N, 0 ); + std::vector< nz_type > B_values( N, B_INITIAL_VALUE ); + std::iota( B_rows.begin(), B_rows.end(), 0 ); + if( SUCCESS != + buildMatrixUnique( B, B_rows.data(), B_cols.data(), B_values.data(), B_values.size(), SEQUENTIAL ) + ) { return 3; } + } + + { // C = A .+ B + std::cout << "-- Test C = A .+ B\n"; /** Matrix C_monoid_truth: Union of A and B * X+Y X X X X * Y ___ ___ ___ ___ @@ -260,28 +278,112 @@ int main( int argc, char ** argv ) { ) ) { return 5; } - { /** Test using addition operator, same type for lhs and rhs - */ - input_t< - Monoid< operators::add< nz_type >, identities::zero > - > input { A, B, C_monoid_truth, C_op_truth }; - output_t output { SUCCESS }; - // Run the test - RC rc = launcher.exec( &grb_program, input, output, false ); - // Check the result - if( rc != SUCCESS ) { - std::cerr << "Error: Launcher::exec\n"; - return 6; - } - if( output.rc != SUCCESS ) { - std::cerr << "Test FAILED (" << toString( output.rc ) << ")" << std::endl; - return 7; - } + input_t< + Monoid< operators::add< nz_type >, identities::zero > + > input { A, B, C_monoid_truth, C_op_truth }; + output_t output { SUCCESS }; + // Run the test + RC rc = launcher.exec( &grb_program, input, output, false ); + // Check the result + if( rc != SUCCESS ) { + std::cerr << "Error: Launcher::exec\n"; + return 6; + } + if( output.rc != SUCCESS ) { + std::cerr << "Test FAILED (" << toString( output.rc ) << ")" << std::endl; + return 7; + } + } + + { // C = A .+ A + std::cout << "-- Test C = A .+ A\n"; + /** Matrix C_truth: Union/intersection of A and A + * X+X X+X X+X X+X X+X + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___(...) + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___ + * (...) + */ + Matrix< nz_type > C_truth( N, N ); + size_t nvalues = ncols( A ); + std::vector< size_t > C_truth_rows( nvalues, 0 ), C_truth_cols( nvalues, 0 ); + std::vector< nz_type > C_truth_values( nvalues, A_INITIAL_VALUE+A_INITIAL_VALUE ); + std::iota( C_truth_cols.begin(), C_truth_cols.end(), 0 ); + if( SUCCESS != + buildMatrixUnique( + C_truth, + C_truth_rows.data(), + C_truth_cols.data(), + C_truth_values.data(), + C_truth_values.size(), + SEQUENTIAL + ) + ) { return 8; } + + input_t< + Monoid< operators::add< nz_type >, identities::zero > + > input { A, A, C_truth, C_truth }; + output_t output { SUCCESS }; + // Run the test + RC rc = launcher.exec( &grb_program, input, output, false ); + // Check the result + if( rc != SUCCESS ) { + std::cerr << "Error: Launcher::exec\n"; + return 9; + } + if( output.rc != SUCCESS ) { + std::cerr << "Test FAILED (" << toString( output.rc ) << ")" << std::endl; + return 10; + } + } + + { // C = A .+ Bt + std::cout << "-- Test C = A .+ Bt\n"; + /** Matrix C_truth: Union/intersection of A and Bt + * X+Y X+Y X+Y X+Y X+Y + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___(...) + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___ + * (...) 
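+ *
+ * Since the descriptors::transpose_right descriptor reads B as its transpose,
+ * the nonzero pattern of Bt coincides with that of A (row 0 only); union and
+ * intersection therefore agree, which is why the same C_truth is passed below
+ * as the reference for both the monoid and the operator variant.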
+ */ + Matrix< nz_type > C_truth( N, N ); + size_t nvalues = ncols( A ); + std::vector< size_t > C_truth_rows( nvalues, 0 ), C_truth_cols( nvalues, 0 ); + std::vector< nz_type > C_truth_values( nvalues, A_INITIAL_VALUE+B_INITIAL_VALUE ); + std::iota( C_truth_cols.begin(), C_truth_cols.end(), 0 ); + if( SUCCESS != + buildMatrixUnique( + C_truth, + C_truth_rows.data(), + C_truth_cols.data(), + C_truth_values.data(), + C_truth_values.size(), + SEQUENTIAL + ) + ) { return 8; } + + input_t< + Monoid< operators::add< nz_type >, identities::zero >, + descriptors::transpose_right + > input { A, B, C_truth, C_truth }; + output_t output { SUCCESS }; + // Run the test + RC rc = launcher.exec( &grb_program, input, output, false ); + // Check the result + if( rc != SUCCESS ) { + std::cerr << "Error: Launcher::exec\n"; + return 9; + } + if( output.rc != SUCCESS ) { + std::cerr << "Test FAILED (" << toString( output.rc ) << ")" << std::endl; + return 10; } } std::cerr << std::flush; std::cout << "Test OK" << std::endl << std::flush; - + return 0; } From 4156581548e483d01e7320dda48b50aec60ae6f1 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Wed, 1 Nov 2023 14:53:29 +0100 Subject: [PATCH 18/37] Bugfix for union pattern --- include/graphblas/reference/blas3.hpp | 104 ++++++++++++++++++++------ tests/unit/spy.cpp | 3 + 2 files changed, 85 insertions(+), 22 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index a49b16c48..f4c9fd09c 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1293,6 +1293,7 @@ namespace grb { // symbolic phase if( phase == RESIZE ) { + nzc = 0; for( size_t i = 0; i < m; ++i ) { coors1.clear(); for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { @@ -1322,6 +1323,7 @@ namespace grb { getReferenceBuffer< typename config::NonzeroIndexType >( n + 1 ); // perform column-wise nonzero count + nzc = 0; for( size_t i = 0; i < m; ++i ) { coors1.clear(); for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { @@ -1379,13 +1381,12 @@ namespace grb { // do computations - size_t nzc = 0; + nzc = 0; CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { -#ifdef _DEBUG - std::cout << " -- i: " << i << "\n"; -#endif coors1.clear(); + coors2.clear(); + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; coors1.assign( k_col ); @@ -1393,47 +1394,106 @@ namespace grb { } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { - const size_t l_col = B_raw.row_index[ l ]; + const size_t j = B_raw.row_index[ l ]; const auto B_val = B_raw.getValue( l, identity_B ); - if( !coors1.assigned( l_col ) ) { // Union case - coors1.assign( l_col ); - valbuf[ l_col ] = identity_A; + if( !coors1.assigned( j ) ) { // Union case + valbuf[ j ] = identity_A; + } else { + coors2.assign( j ); + } + + const auto valbuf_value_before = valbuf[ j ]; + OutputType result_value; + (void)grb::apply( result_value, valbuf_value_before, B_val, oper ); + + // update CRS + CRS_raw.row_index[ nzc ] = j; + CRS_raw.setValue( nzc, result_value ); + + // update CCS + if( !crs_only ) { + const size_t CCS_index = CCS_raw.col_start[ j+1 ] - ++C_col_index[ j ]; +#ifdef NDEBUG + assert( CCS_index < capacity( C ) ); + assert( CCS_index < CCS_raw.col_start[ j+1 ] ); + assert( CCS_index >= CCS_raw.col_start[ j ] ); +#endif + CCS_raw.row_index[ CCS_index ] = i; + CCS_raw.setValue( CCS_index, result_value ); } - 
const auto valbuf_value_before = valbuf[ l_col ]; - (void)grb::apply( valbuf[ l_col ], valbuf_value_before, B_val, oper ); + // update count + (void)++nzc; } - for( size_t j_unsigned = n ; j_unsigned > 0 ; j_unsigned-- ) { - const size_t j = j_unsigned - 1; - if( !coors1.assigned( j ) ) { + for( size_t l = A_raw.col_start[ i ]; l < A_raw.col_start[ i + 1 ]; ++l ) { + const size_t j = A_raw.row_index[ l ]; + if( coors2.assigned( j ) ) { // Intersection case: already done before continue; } +#ifdef NDEBUG + assert( !coors1.assigned( j ) ); // Union case: already done before +#endif + + const auto A_val = A_raw.getValue( l, identity_A ); + OutputType result_value; + (void)grb::apply( result_value, A_val, identity_B, oper ); + // update CRS CRS_raw.row_index[ nzc ] = j; - CRS_raw.setValue( nzc, valbuf[ j ] ); + CRS_raw.setValue( nzc, result_value ); + // update CCS if( !crs_only ) { - const size_t CCS_index = CCS_raw.col_start[ j+1 ] - C_col_index[ j ]++; + const size_t CCS_index = CCS_raw.col_start[ j+1 ] - ++C_col_index[ j ]; +#ifdef NDEBUG + assert( CCS_index < capacity( C ) ); + assert( CCS_index < CCS_raw.col_start[ j+1 ] ); + assert( CCS_index >= CCS_raw.col_start[ j ] ); +#endif CCS_raw.row_index[ CCS_index ] = i; - CCS_raw.setValue( CCS_index, valbuf[ j ] ); + CCS_raw.setValue( CCS_index, result_value ); } // update count (void)++nzc; } + CRS_raw.col_start[ i + 1 ] = nzc; } if( !crs_only ) { #ifdef _DEBUG - std::cout << "CCS_raw.col_start = [ "; - for( size_t j = 0; j <= n; ++j ) - std::cout << CCS_raw.col_start[ j ] << " "; + std::cout << "CRS_raw.col_start = [ "; + for( size_t j = 0; j <= m; ++j ) + std::cout << CRS_raw.col_start[ j ] << " "; std::cout << "]\n"; - std::cout << "C_col_index = [ "; - for( size_t j = 0; j < n; ++j ) - std::cout << C_col_index[ j ] << " "; + std::cout << "CRS_raw.row_index = [ "; + for( size_t j = 0; j < nzc; ++j ) + std::cout << CRS_raw.row_index[ j ] << " "; std::cout << "]\n"; + std::cout << "CRS_raw.values = [ "; + for( size_t j = 0; j < nzc; ++j ) + std::cout << CRS_raw.values[ j ] << " "; + std::cout << "]\n"; + if( !crs_only ) { + std::cout << "C_col_index = [ "; + for( size_t j = 0; j < n; ++j ) + std::cout << C_col_index[ j ] << " "; + std::cout << "]\n"; + std::cout << "CCS_raw.col_start = [ "; + for( size_t j = 0; j <= n; ++j ) + std::cout << CCS_raw.col_start[ j ] << " "; + std::cout << "]\n"; + std::cout << "CCS_raw.row_index = [ "; + for( size_t j = 0; j < nzc; ++j ) + std::cout << CCS_raw.row_index[ j ] << " "; + std::cout << "]\n"; + std::cout << "CCS_raw.values = [ "; + for( size_t j = 0; j < nzc; ++j ) + std::cout << CCS_raw.values[ j ] << " "; + std::cout << "]\n"; + } #endif + #ifndef NDEBUG for( size_t j = 0; j < n; ++j ) { assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); diff --git a/tests/unit/spy.cpp b/tests/unit/spy.cpp index 780216d7f..ce6c1759d 100644 --- a/tests/unit/spy.cpp +++ b/tests/unit/spy.cpp @@ -82,6 +82,7 @@ void grb_program( const void * const fn_p, const size_t fn_length, grb::RC & rc if( rc == grb::SUCCESS ) { grb::Matrix< double > chk( p, q ); rc = rc ? rc : grb::resize( chk, grb::nnz( spy ) ); + rc = rc ? rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeMonoid(), grb::Phase::RESIZE ); rc = rc ? 
rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeMonoid() ); if( rc == grb::SUCCESS && grb::nnz( chk ) != grb::nnz( spy ) ) { std::cerr << "Unexpected number of nonzeroes for chk: " << grb::nnz(chk) << ", expected " << grb::nnz(spy) << "\n"; @@ -114,6 +115,7 @@ void grb_program( const void * const fn_p, const size_t fn_length, grb::RC & rc if( rc == grb::SUCCESS ) { grb::Matrix< double > chk( p, q ); rc = rc ? rc : grb::resize( chk, nnz( spy ) ); + rc = rc ? rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeMonoid(), grb::Phase::RESIZE ); rc = rc ? rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeMonoid() ); if( rc == grb::SUCCESS && grb::nnz( chk ) != grb::nnz( spy ) ) { std::cerr << "Unexpected number of nonzeroes for chk (pattern): " << grb::nnz(chk) << ", expected " << grb::nnz(spy) << "\n"; @@ -146,6 +148,7 @@ void grb_program( const void * const fn_p, const size_t fn_length, grb::RC & rc if( rc == grb::SUCCESS ) { grb::Matrix< double > chk( p, q ); rc = rc ? rc : grb::resize( chk, nnz( spy ) ); + rc = rc ? rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeMonoid(), grb::Phase::RESIZE ); rc = rc ? rc : grb::eWiseApply( chk, spy, spy2, ring.getMultiplicativeMonoid() ); if( rc == grb::SUCCESS && grb::nnz( chk ) != grb::nnz( spy ) ) { std::cerr << "Unexpected number of nonzeroes for chk (boolean): " << grb::nnz(chk) << ", expected " << grb::nnz(spy) << "\n"; From 30f8a302b53b0d40d65b14dd6b33d1173ddb3bc7 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Wed, 1 Nov 2023 16:39:29 +0100 Subject: [PATCH 19/37] Parallel iteration of coordinates --- include/graphblas/reference/blas3.hpp | 139 ++++++++++++++++++-------- 1 file changed, 95 insertions(+), 44 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index f4c9fd09c..f8049f370 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1105,10 +1105,38 @@ namespace grb { for( size_t i = 0; i < m; ++i ) { coors1.clear(); - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; - coors1.assign( k_col ); - valbuf[ k_col ] = A_raw.getValue( k, dummy_identity ); +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp parallel default(none) \ + shared(coors1, valbuf) \ + firstprivate(i, A_raw, dummy_identity) +#endif + { +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + auto local_update = coors1.EMPTY_UPDATE(); + const size_t maxAsyncAssigns = coors1.maxAsyncAssigns(); + size_t assigns = 0; + #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait +#endif + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; + +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + if( !coors1.asyncAssign( k_col, local_update ) ) { + valbuf[ k_col ] = A_raw.getValue( k, dummy_identity ); + if( ++assigns == maxAsyncAssigns ) { + coors1.joinUpdate( local_update ); + assigns = 0; + } + } +#else + if( !coors1.assign( k_col ) ) { + valbuf[ k_col ] = A_raw.getValue( k, dummy_identity ); + } +#endif + } +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + while( !coors1.joinUpdate( local_update ) ) {} +#endif } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { @@ -1387,56 +1415,79 @@ namespace grb { coors1.clear(); coors2.clear(); - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; - coors1.assign( k_col ); - valbuf[ k_col ] = 
A_raw.getValue( k, identity_A ); - } +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp parallel default(none) \ + shared(coors1, vbuf1, coors2, vbuf2) \ + firstprivate(i, A_raw, identity_A, B_raw, identity_B ) +#endif + { + auto local_update1 = coors1.EMPTY_UPDATE(); + { +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + const size_t maxAsyncAssigns1 = coors1.maxAsyncAssigns(); + size_t assigns1 = 0; + #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait +#endif + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; - for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { - const size_t j = B_raw.row_index[ l ]; - const auto B_val = B_raw.getValue( l, identity_B ); - if( !coors1.assigned( j ) ) { // Union case - valbuf[ j ] = identity_A; - } else { - coors2.assign( j ); +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + if( !coors1.asyncAssign( k_col, local_update1 ) ) { + vbuf1[ k_col ] = A_raw.getValue( k, identity_A ); + if( ++assigns1 == maxAsyncAssigns1 ) { + coors1.joinUpdate( local_update1 ); + assigns1 = 0; + } + } +#else + if( !coors1.assign( k_col ) ) { + vbuf1[ k_col ] = A_raw.getValue( k, identity_A ); + } +#endif + } } +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + while( !coors1.joinUpdate( local_update1 )) {} +#endif - const auto valbuf_value_before = valbuf[ j ]; - OutputType result_value; - (void)grb::apply( result_value, valbuf_value_before, B_val, oper ); - - // update CRS - CRS_raw.row_index[ nzc ] = j; - CRS_raw.setValue( nzc, result_value ); + auto local_update2 = coors2.EMPTY_UPDATE(); + { +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + const size_t maxAsyncAssigns2 = coors2.maxAsyncAssigns(); + size_t assigns2 = 0; + #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait +#endif + for( size_t k = B_raw.col_start[ i ]; k < B_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = B_raw.row_index[ k ]; - // update CCS - if( !crs_only ) { - const size_t CCS_index = CCS_raw.col_start[ j+1 ] - ++C_col_index[ j ]; -#ifdef NDEBUG - assert( CCS_index < capacity( C ) ); - assert( CCS_index < CCS_raw.col_start[ j+1 ] ); - assert( CCS_index >= CCS_raw.col_start[ j ] ); +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + if( !coors2.asyncAssign( k_col, local_update2 ) ) { + vbuf2[ k_col ] = B_raw.getValue( k, identity_B ); + if( ++assigns2 == maxAsyncAssigns2 ) { + coors2.joinUpdate( local_update2 ); + assigns2 = 0; + } + } +#else + if( !coors2.assign( k_col ) ) { + vbuf2[ k_col ] = B_raw.getValue( k, identity_B ); + } #endif - CCS_raw.row_index[ CCS_index ] = i; - CCS_raw.setValue( CCS_index, result_value ); + } } - // update count - (void)++nzc; +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + while( !coors2.joinUpdate( local_update2 )) {} +#endif } - for( size_t l = A_raw.col_start[ i ]; l < A_raw.col_start[ i + 1 ]; ++l ) { - const size_t j = A_raw.row_index[ l ]; - if( coors2.assigned( j ) ) { // Intersection case: already done before - continue; - } -#ifdef NDEBUG - assert( !coors1.assigned( j ) ); // Union case: already done before -#endif + for( size_t k = 0; k < std::max( coors1.nonzeroes(), coors2.nonzeroes() ); ++k ) { + const auto& assigned_coors = coors1.assigned(k) ? coors1 : coors2; + const auto j = assigned_coors.index( k ); + const auto A_val = coors1.assigned(k) ? vbuf1[ j ] : identity_A; + const auto B_val = coors2.assigned(k) ? 
vbuf2[ j ] : identity_B; - const auto A_val = A_raw.getValue( l, identity_A ); OutputType result_value; - (void)grb::apply( result_value, A_val, identity_B, oper ); + (void)grb::apply( result_value, A_val, B_val, oper ); // update CRS CRS_raw.row_index[ nzc ] = j; From 4b2b27b4cf9888256f8ff907e6cc80ebc816ca39 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Wed, 1 Nov 2023 18:04:03 +0100 Subject: [PATCH 20/37] Void values bugfix for test --- include/graphblas/reference/blas3.hpp | 40 +++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index f8049f370..5e2bf9d0a 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -57,6 +57,34 @@ "********************************************************************" \ "******************************\n" ); +#ifndef _H_GRB_REFERENCE_BLAS3_ACCESSORS +#define _H_GRB_REFERENCE_BLAS3_ACCESSORS + +namespace grb::internal +{ + template< typename D, typename T > + static inline void assignValue( + D *array, size_t i, const T& value, + typename std::enable_if< !std::is_void< D >::value >::type * const = nullptr + ) { array[i] = value; } + + template< typename T > + static inline void assignValue( void *, size_t, const T& ) { /* do nothing */ } + + template< typename D, typename T > + static inline T getValue( + const D *array, size_t i, const T&, + typename std::enable_if< !std::is_void< D >::value >::type * const = nullptr + ) { return array[i]; } + + template< typename T > + static inline T getValue( const void *, size_t, const T& identity ) { return identity; } + +} // namespace grb::internal + +#endif // _H_GRB_REFERENCE_BLAS3_ACCESSORS + + namespace grb { namespace internal { @@ -1433,7 +1461,7 @@ namespace grb { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 if( !coors1.asyncAssign( k_col, local_update1 ) ) { - vbuf1[ k_col ] = A_raw.getValue( k, identity_A ); + assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); if( ++assigns1 == maxAsyncAssigns1 ) { coors1.joinUpdate( local_update1 ); assigns1 = 0; @@ -1441,7 +1469,7 @@ namespace grb { } #else if( !coors1.assign( k_col ) ) { - vbuf1[ k_col ] = A_raw.getValue( k, identity_A ); + assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); } #endif } @@ -1462,7 +1490,7 @@ namespace grb { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 if( !coors2.asyncAssign( k_col, local_update2 ) ) { - vbuf2[ k_col ] = B_raw.getValue( k, identity_B ); + assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); if( ++assigns2 == maxAsyncAssigns2 ) { coors2.joinUpdate( local_update2 ); assigns2 = 0; @@ -1470,7 +1498,7 @@ namespace grb { } #else if( !coors2.assign( k_col ) ) { - vbuf2[ k_col ] = B_raw.getValue( k, identity_B ); + assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); } #endif } @@ -1483,8 +1511,8 @@ namespace grb { for( size_t k = 0; k < std::max( coors1.nonzeroes(), coors2.nonzeroes() ); ++k ) { const auto& assigned_coors = coors1.assigned(k) ? coors1 : coors2; const auto j = assigned_coors.index( k ); - const auto A_val = coors1.assigned(k) ? vbuf1[ j ] : identity_A; - const auto B_val = coors2.assigned(k) ? vbuf2[ j ] : identity_B; + const auto A_val = coors1.assigned(k) ? getValue(vbuf1, j, identity_A) : identity_A; + const auto B_val = coors2.assigned(k) ? 
getValue(vbuf2, j, identity_B) : identity_B; OutputType result_value; (void)grb::apply( result_value, A_val, B_val, oper ); From 02330a8321bcd8d2939013ad73434e3bfbce3041 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Thu, 2 Nov 2023 13:13:32 +0100 Subject: [PATCH 21/37] Logic bugfix --- include/graphblas/reference/blas3.hpp | 121 ++++++++++++++++---------- 1 file changed, 76 insertions(+), 45 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 5e2bf9d0a..1f382f017 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1354,12 +1354,13 @@ namespace grb { coors1.clear(); for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; - coors1.assign( k_col ); - (void) ++nzc; + if( !coors1.assign( k_col ) ) { + (void) ++nzc; + } } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { const size_t l_col = B_raw.row_index[ l ]; - if( not coors1.assigned( l_col ) ) { + if( !coors1.assigned( l_col ) ) { (void) ++nzc; } } @@ -1384,15 +1385,16 @@ namespace grb { coors1.clear(); for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; - coors1.assign( k_col ); - (void) ++nzc; + if( !coors1.assign( k_col ) ) { + (void) ++nzc; + } if( !crs_only ) { (void) ++CCS_raw.col_start[ k_col + 1 ]; } } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { const size_t l_col = B_raw.row_index[ l ]; - if( not coors1.assigned( l_col ) ) { + if( !coors1.assigned( l_col ) ) { (void) ++nzc; if( !crs_only ) { (void) ++CCS_raw.col_start[ l_col + 1 ]; @@ -1449,70 +1451,99 @@ namespace grb { firstprivate(i, A_raw, identity_A, B_raw, identity_B ) #endif { - auto local_update1 = coors1.EMPTY_UPDATE(); - { + #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - const size_t maxAsyncAssigns1 = coors1.maxAsyncAssigns(); - size_t assigns1 = 0; - #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait + auto local_update1 = coors1.EMPTY_UPDATE(); + const size_t maxAsyncAssigns1 = coors1.maxAsyncAssigns(); + size_t assigns1 = 0; + #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait #endif - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - if( !coors1.asyncAssign( k_col, local_update1 ) ) { - assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); - if( ++assigns1 == maxAsyncAssigns1 ) { - coors1.joinUpdate( local_update1 ); - assigns1 = 0; - } + if( !coors1.asyncAssign( k_col, local_update1 ) ) { + assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); + if( ++assigns1 == maxAsyncAssigns1 ) { + coors1.joinUpdate( local_update1 ); + assigns1 = 0; } + } #else - if( !coors1.assign( k_col ) ) { - assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); - } -#endif + if( !coors1.assign( k_col ) ) { + assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); } +#endif } + #ifdef _H_GRB_REFERENCE_OMP_BLAS3 while( !coors1.joinUpdate( local_update1 )) {} #endif - auto local_update2 = coors2.EMPTY_UPDATE(); - { + #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - const size_t maxAsyncAssigns2 = coors2.maxAsyncAssigns(); - size_t assigns2 = 0; - #pragma omp for simd schedule( dynamic, 
config::CACHE_LINE_SIZE::value() ) nowait + auto local_update2 = coors2.EMPTY_UPDATE(); + const size_t maxAsyncAssigns2 = coors2.maxAsyncAssigns(); + size_t assigns2 = 0; + #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait #endif - for( size_t k = B_raw.col_start[ i ]; k < B_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = B_raw.row_index[ k ]; + for( size_t k = B_raw.col_start[ i ]; k < B_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = B_raw.row_index[ k ]; #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - if( !coors2.asyncAssign( k_col, local_update2 ) ) { - assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); - if( ++assigns2 == maxAsyncAssigns2 ) { - coors2.joinUpdate( local_update2 ); - assigns2 = 0; - } + if( !coors2.asyncAssign( k_col, local_update2 ) ) { + assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); + if( ++assigns2 == maxAsyncAssigns2 ) { + coors2.joinUpdate( local_update2 ); + assigns2 = 0; } + } #else - if( !coors2.assign( k_col ) ) { - assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); - } -#endif + if( !coors2.assign( k_col ) ) { + assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); } +#endif } #ifdef _H_GRB_REFERENCE_OMP_BLAS3 while( !coors2.joinUpdate( local_update2 )) {} #endif } - for( size_t k = 0; k < std::max( coors1.nonzeroes(), coors2.nonzeroes() ); ++k ) { - const auto& assigned_coors = coors1.assigned(k) ? coors1 : coors2; - const auto j = assigned_coors.index( k ); - const auto A_val = coors1.assigned(k) ? getValue(vbuf1, j, identity_A) : identity_A; - const auto B_val = coors2.assigned(k) ? getValue(vbuf2, j, identity_B) : identity_B; + for( size_t k = 0; k < coors1.nonzeroes(); ++k ) { + const auto j = coors1.index( k ); + const auto A_val = getValue(vbuf1, j, identity_A); + const auto B_val = coors2.assigned(j) ? getValue(vbuf2, j, identity_B) : identity_B; + std::cout << " * (" << i << ", " << j << ") = " << A_val << " " << B_val << "\n"; + + OutputType result_value; + (void)grb::apply( result_value, A_val, B_val, oper ); + + // update CRS + CRS_raw.row_index[ nzc ] = j; + CRS_raw.setValue( nzc, result_value ); + + // update CCS + if( !crs_only ) { + const size_t CCS_index = CCS_raw.col_start[ j+1 ] - ++C_col_index[ j ]; +#ifdef NDEBUG + assert( CCS_index < capacity( C ) ); + assert( CCS_index < CCS_raw.col_start[ j+1 ] ); + assert( CCS_index >= CCS_raw.col_start[ j ] ); +#endif + CCS_raw.row_index[ CCS_index ] = i; + CCS_raw.setValue( CCS_index, result_value ); + } + // update count + (void)++nzc; + } + for( size_t k = 0; k < coors2.nonzeroes(); ++k ) { + const auto j = coors2.index( k ); + if( coors1.assigned(j) ) { // Intersection case: already handled + continue; + } + const auto A_val = coors1.assigned(j) ? 
getValue(vbuf1, j, identity_A) : identity_A; + const auto B_val = getValue(vbuf2, j, identity_B); + std::cout << " # (" << i << ", " << j << ") = " << A_val << " " << B_val << "\n"; OutputType result_value; (void)grb::apply( result_value, A_val, B_val, oper ); From 585f485ce8de3e69f573baad1b52e581187f4c4e Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Thu, 2 Nov 2023 13:35:35 +0100 Subject: [PATCH 22/37] Merge remote-tracking branch 'origin/develop' into 636-unexpected-behaviour-of-ewiseapply-out-matrix-in-matrix-in-matrix-variants --- include/graphblas/banshee/pinnedvector.hpp | 137 +- include/graphblas/base/benchmark.hpp | 339 +++-- include/graphblas/base/exec.hpp | 301 ++-- include/graphblas/benchmark.hpp | 5 +- include/graphblas/bsp/collectives.hpp | 602 ++++---- include/graphblas/bsp/collectives_blas1.hpp | 509 ++++--- .../graphblas/bsp/collectives_blas1_raw.hpp | 3 +- .../graphblas/bsp/exec_broadcast_routines.hpp | 81 + include/graphblas/bsp1d/benchmark.hpp | 728 +++------ include/graphblas/bsp1d/exec.hpp | 1356 +++++++++++------ include/graphblas/bsp1d/matrix.hpp | 4 +- include/graphblas/exec.hpp | 5 +- include/graphblas/hyperdags/benchmark.hpp | 61 +- include/graphblas/hyperdags/exec.hpp | 54 +- include/graphblas/nonblocking/benchmark.hpp | 50 +- include/graphblas/nonblocking/exec.hpp | 63 +- include/graphblas/nonblocking/io.hpp | 7 +- include/graphblas/nonblocking/spmd.hpp | 1 + include/graphblas/reference/benchmark.hpp | 62 +- include/graphblas/reference/exec.hpp | 60 +- include/graphblas/reference/io.hpp | 4 +- include/graphblas/reference/pinnedvector.hpp | 4 +- include/graphblas/utils/TimerResults.hpp | 17 +- src/graphblas/CMakeLists.txt | 3 +- src/graphblas/bsp/exec_broadcast_routines.cpp | 76 + src/graphblas/bsp1d/CMakeLists.txt | 1 - src/graphblas/bsp1d/exec.cpp | 29 - src/graphblas/nonblocking/io.cpp | 3 +- src/graphblas/reference/io.cpp | 2 +- tests/smoke/label_test.cpp | 7 +- tests/smoke/simple_pagerank_from_mpi.cpp | 5 +- tests/unit/CMakeLists.txt | 26 + tests/unit/auto_launcher.cpp | 3 +- tests/unit/buildVector.cpp | 1 + tests/unit/eWiseApplyMatrixReference.cpp | 51 +- tests/unit/id_distributed.cpp | 307 ++++ tests/unit/launcherAndBenchmarker.cpp | 680 +++++++++ tests/unit/mxv.cpp | 2 +- tests/unit/pinnedVector.cpp | 31 +- tests/unit/sparse_mxv.cpp | 14 +- tests/unit/unittests.sh | 47 + tests/unit/vmxa.cpp | 2 +- tests/unit/vxm.cpp | 2 +- tests/unit/wait.cpp | 3 +- tests/utils/output_verification.hpp | 16 +- tests/utils/print_vec_mat.hpp | 504 +++++- 46 files changed, 4083 insertions(+), 2185 deletions(-) create mode 100644 include/graphblas/bsp/exec_broadcast_routines.hpp create mode 100644 src/graphblas/bsp/exec_broadcast_routines.cpp delete mode 100644 src/graphblas/bsp1d/exec.cpp create mode 100644 tests/unit/id_distributed.cpp create mode 100644 tests/unit/launcherAndBenchmarker.cpp diff --git a/include/graphblas/banshee/pinnedvector.hpp b/include/graphblas/banshee/pinnedvector.hpp index 236884fde..f22566565 100644 --- a/include/graphblas/banshee/pinnedvector.hpp +++ b/include/graphblas/banshee/pinnedvector.hpp @@ -24,7 +24,7 @@ * @author A. N. Yzelman */ -#if ! defined _H_GRB_BANSHEE_PINNEDVECTOR +#if !defined _H_GRB_BANSHEE_PINNEDVECTOR #define _H_GRB_BANSHEE_PINNEDVECTOR #include @@ -33,77 +33,86 @@ #include "coordinates.hpp" #include "vector.hpp" + namespace grb { /** No implementation notes. 
*/ template< typename IOType > class PinnedVector< IOType, banshee > { - private: - /** - * Tell the system to delete \a _buffered_values only when we had its last - * banshee. - */ - utils::AutoDeleter< IOType > _raw_deleter; - - /** - * Tell the system to delete \a _buffered_mask only when we had its last - * banshee. - */ - utils::AutoDeleter< char > _assigned_deleter; - - /** A buffer of the local vector. */ - IOType * _buffered_values; - - /** A buffer of the sparsity pattern of \a _buffered_values. */ - internal::Coordinates< banshee > _buffered_mask; - - public: - /** No implementation notes. */ - PinnedVector() : _buffered_values( NULL ) {} - - /** No implementation notes. */ - template< typename Coords > - PinnedVector( const Vector< IOType, banshee, Coords > & x, IOMode mode ) : - _raw_deleter( x._raw_deleter ), _assigned_deleter( x._assigned_deleter ), _buffered_values( x._raw ), _buffered_mask( x._coordinates ) { - (void)mode; // sequential and parallel IO mode are equivalent for this implementation. - } - - /** No implementation notes. */ - IOType & operator[]( const size_t i ) noexcept { - return _buffered_values[ i ]; - } - - /** No implementation notes. */ - const IOType & operator[]( const size_t i ) const noexcept { - return _buffered_values[ i ]; - } - - /** No implementation notes. */ - bool mask( const size_t i ) const noexcept { - return _buffered_mask.assigned( i ); - } - - /** No implementation notes. */ - size_t length() const noexcept { - return _buffered_mask.size(); - } - - /** No implementation notes. */ - size_t index( const size_t index ) const noexcept { - return index; - } - - /** - * Frees the underlying raw memory area iff the underlying vector was - * destroyed. Otherwise set the underlying vector to unpinned state. - */ - void free() { - _raw_deleter.clear(); - _assigned_deleter.clear(); - } + private: + + /** + * Tell the system to delete \a _buffered_values only when we had its last + * banshee. + */ + utils::AutoDeleter< IOType > _raw_deleter; + + /** + * Tell the system to delete \a _buffered_mask only when we had its last + * banshee. + */ + utils::AutoDeleter< char > _assigned_deleter; + + /** A buffer of the local vector. */ + IOType * _buffered_values; + + /** A buffer of the sparsity pattern of \a _buffered_values. */ + internal::Coordinates< banshee > _buffered_mask; + + + public: + + /** No implementation notes. */ + PinnedVector() : _buffered_values( NULL ) {} + + /** No implementation notes. */ + template< typename Coords > + PinnedVector( const Vector< IOType, banshee, Coords > & x, IOMode mode ) : + _raw_deleter( x._raw_deleter ), _assigned_deleter( x._assigned_deleter ), + _buffered_values( x._raw ), _buffered_mask( x._coordinates + ) { + (void) mode; // sequential and parallel IO mode are equivalent for this + // implementation. + } + + /** No implementation notes. */ + IOType & operator[]( const size_t i ) noexcept { + return _buffered_values[ i ]; + } + + /** No implementation notes. */ + const IOType & operator[]( const size_t i ) const noexcept { + return _buffered_values[ i ]; + } + + /** No implementation notes. */ + bool mask( const size_t i ) const noexcept { + return _buffered_mask.assigned( i ); + } + + /** No implementation notes. */ + size_t length() const noexcept { + return _buffered_mask.size(); + } + + /** No implementation notes. */ + size_t index( const size_t index ) const noexcept { + return index; + } + + /** + * Frees the underlying raw memory area iff the underlying vector was + * destroyed. 
Otherwise set the underlying vector to unpinned state. + */ + void free() { + _raw_deleter.clear(); + _assigned_deleter.clear(); + } + }; } // namespace grb #endif // end ``_H_GRB_BANSHEE_PINNEDVECTOR + diff --git a/include/graphblas/base/benchmark.hpp b/include/graphblas/base/benchmark.hpp index 56a2fade6..f4775f587 100644 --- a/include/graphblas/base/benchmark.hpp +++ b/include/graphblas/base/benchmark.hpp @@ -28,10 +28,19 @@ #ifndef _H_GRB_BENCH_BASE #define _H_GRB_BENCH_BASE -#include -#include +#include // for sqrt #include -#include +#include // warning: normally should not be used in ALP backends(!) + +#ifndef _GRB_NO_STDIO + #include + #include + #include +#endif + +#ifndef _GRB_NO_EXCEPTIONS + #include +#endif #include #include @@ -43,21 +52,11 @@ #include "config.hpp" #include "exec.hpp" -#ifndef _GRB_NO_STDIO - #include -#endif - -#ifndef _GRB_NO_EXCEPTIONS - #include -#endif - -#include - /** * \defgroup benchmarking Benchmarking * - * ALP has a specialised class for benchmarking ALP programs, grb::Benchmarker, + * ALP has a specialised class for benchmarking ALP programs, #grb::Benchmarker, * which is a variant on the #grb::Launcher. It codes a particular benchmarking * strategy of any given ALP program as described below. * @@ -123,7 +122,7 @@ namespace grb { grb::utils::TimerResults &total_times, grb::utils::TimerResults &min_times, grb::utils::TimerResults &max_times, - grb::utils::TimerResults * sdev_times + std::vector< grb::utils::TimerResults > &sdev_times ) { inner_times.normalize( total ); total_times.accum( inner_times ); @@ -140,7 +139,7 @@ namespace grb { grb::utils::TimerResults &total_times, grb::utils::TimerResults &min_times, grb::utils::TimerResults &max_times, - grb::utils::TimerResults * sdev_times, + std::vector< grb::utils::TimerResults > &sdev_times, const size_t pid ) { total_times.normalize( total ); @@ -192,82 +191,93 @@ namespace grb { /** * Benchmarks a given ALP program. * - * This variant applies to input data as a byte blob and output data as a - * user-defined POD struct. + * This variant applies to typed ALP programs. * - * @tparam U Output type of the given user program. - * @tparam backend Which backend the program is using. + * @see #grb::Launcher for more details on type requirements. + * + * @tparam RunnerType The type of the runner, i.e., functor object storing + * the information for running the supplied ALP function. * - * @param[in] alp_program The use rogram to be benchmarked - * @param[in] data_in Input data as a raw data blob - * @param[in] in_size The size, in bytes, of the input data - * @param[out] out_data Output data - * @param[in] inner The number of inner repetitions of the benchmark - * @param[in] outer The number of outer repetitions of the benchmark - * @param[in] pid Unique ID of the calling user process + * @param[in] runner Functor object running the function. + * @param[in] times Data structure with timing information. + * @param[in] inner Number of inner iterations. + * @param[out] outer Number of outer iterations. + * @param[in] pid process Identifier of current user process. 
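+ *
+ * For illustration, \a runner may be any nullary callable; the typed and
+ * untyped wrappers below, for instance, pass a lambda along the lines of
+ * \code
+ * auto runner = [ &data_in, &data_out ] { alp_program( data_in, data_out ); };
+ * \endcode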
* * @see benchmarking * * @ingroup benchmarking */ template< - typename U, - enum Backend implementation = config::default_backend + enum Backend implementation, + typename RunnerType > static RC benchmark( - void ( *alp_program )( const void *, const size_t, U & ), - const void * data_in, - const size_t in_size, - U &data_out, - const size_t inner, - const size_t outer, + RunnerType &runner, + grb::utils::TimerResults ×, + const size_t inner, const size_t outer, const size_t pid ) { const double inf = std::numeric_limits< double >::infinity(); grb::utils::TimerResults total_times, min_times, max_times; - grb::utils::TimerResults * sdev_times = - new grb::utils::TimerResults[ outer ]; + std::vector< grb::utils::TimerResults > sdev_times( outer ); total_times.set( 0 ); min_times.set( inf ); max_times.set( 0 ); + grb::RC ret = grb::SUCCESS; // outer loop - for( size_t out = 0; out < outer; ++out ) { + for( size_t out = 0; out < outer && ret == grb::SUCCESS; ++out ) { grb::utils::TimerResults inner_times; inner_times.set( 0 ); // inner loop - for( size_t in = 0; in < inner; in++ ) { - data_out.times.set( 0 ); - ( *alp_program )( data_in, in_size, data_out ); - grb::collectives< implementation >::reduce( - data_out.times.io, 0, grb::operators::max< double >() ); - grb::collectives< implementation >::reduce( - data_out.times.preamble, 0, grb::operators::max< double >() ); - grb::collectives< implementation >::reduce( - data_out.times.useful, 0, grb::operators::max< double >() ); - grb::collectives< implementation >::reduce( - data_out.times.postamble, 0, grb::operators::max< double >() ); - inner_times.accum( data_out.times ); + for( size_t in = 0; in < inner && ret == grb::SUCCESS; ++in ) { + times.set( 0 ); + + runner(); + + ret = ret ? ret : grb::collectives< implementation >::reduce( + times.io, 0, grb::operators::max< double >() ); + ret = ret ? ret : grb::collectives< implementation >::reduce( + times.preamble, 0, grb::operators::max< double >() ); + ret = ret ? ret : grb::collectives< implementation >::reduce( + times.useful, 0, grb::operators::max< double >() ); + ret = ret ? 
ret : grb::collectives< implementation >::reduce( + times.postamble, 0, grb::operators::max< double >() ); + + if( ret == grb::SUCCESS ) { + inner_times.accum( times ); + } } - // calculate performance stats - benchmark_calc_inner( out, inner, inner_times, total_times, min_times, - max_times, sdev_times ); + if( ret == grb::SUCCESS ) { + // calculate performance stats + benchmark_calc_inner( out, inner, inner_times, total_times, min_times, + max_times, sdev_times ); + } #ifndef _GRB_NO_STDIO // give experiment output line if( pid == 0 ) { - std::cout << "Outer iteration #" << out << " timings (io, preamble, " - << "useful, postamble, time since epoch): "; - std::cout << inner_times.io << ", " << inner_times.preamble << ", " - << inner_times.useful << ", " << inner_times.postamble << ", "; - printTimeSinceEpoch( false ); + if( ret == grb::SUCCESS ) { + std::ios_base::fmtflags prev_cout_state( std::cout.flags() ); + std::cout << "Outer iteration #" << out << " timings " + << "(io, preamble, useful, postamble, time since epoch): " + << std::fixed + << inner_times.io << ", " << inner_times.preamble << ", " + << inner_times.useful << ", " << inner_times.postamble << ", "; + printTimeSinceEpoch( false ); + std::cout.flags( prev_cout_state ); + } else { + std::cerr << "Error during cross-process collection of timing results: " + << "\t" << grb::toString( ret ) << std::endl; + } } #endif // pause for next outer loop - if( sleep( 1 ) != 0 ) { + if( sleep( 1 ) != 0 && ret == grb::SUCCESS ) { #ifndef _GRB_NO_STDIO std::cerr << "Sleep interrupted, assume benchmark is unreliable; " << "exiting.\n"; @@ -276,30 +286,72 @@ namespace grb { } } - // calculate performance stats - benchmark_calc_outer( outer, total_times, min_times, max_times, sdev_times, - pid ); - delete [] sdev_times; + if( ret == grb::SUCCESS ) { + // calculate performance stats + benchmark_calc_outer( outer, total_times, min_times, max_times, + sdev_times, pid ); + } + + return ret; + } - return SUCCESS; + /** + * Benchmarks a given ALP program. + * + * This variant applies to untyped ALP programs. + * + * @see #grb::Launcher for more details on type requirements. + * + * @tparam U Output type of the given user program. + * @tparam backend Which backend the program is using. + * + * @param[in] alp_program The user program to be benchmarked. + * @param[in] data_in Input data as a raw data blob. + * @param[in] in_size The size, in bytes, of the input data. + * @param[out] out_data Output data as a plain-old-data struct \a U. + * @param[in] inner Number of inner repetitions of the benchmark. + * @param[in] outer Number of outer repetitions of the benchmark. + * @param[in] pid Unique ID of the calling user process. + * + * @see benchmarking + * + * @ingroup benchmarking + */ + template< + typename U, + enum Backend implementation + > + static RC benchmark( + AlpUntypedFunc< U > alp_program, + const void * data_in, const size_t in_size, + U &data_out, + const size_t inner, const size_t outer, + const size_t pid + ) { + auto runner = [ alp_program, data_in, in_size, &data_out ] { + alp_program( data_in, in_size, data_out ); + }; + return benchmark< implementation >( runner, data_out.times, inner, outer, + pid ); } /** * Benchmarks a given ALP program. * - * This variant applies to input data as a user-defined POD struct and - * output data as a user-defined POD struct. + * This variant applies to typed ALP programs. + * + * @see #grb::Launcher for more details on type requirements. * * @tparam T Input type of the given user program. 
* @tparam U Output type of the given user program. * - * @param[in] alp_program The use rogram to be benchmarked - * @param[in] data_in Input data as a raw data blob - * @param[in] in_size The size, in bytes, of the input data - * @param[out] out_data Output data - * @param[in] inner The number of inner repetitions of the benchmark - * @param[in] outer The number of outer repetitions of the benchmark - * @param[in] pid Unique ID of the calling user process + * @param[in] alp_program The user program to be benchmarked. + * @param[in] data_in Input data as a raw data blob. + * @param[in] in_size The size, in bytes, of the input data. + * @param[out] out_data Output data. + * @param[in] inner Number of inner repetitions of the benchmark. + * @param[in] outer Number of outer repetitions of the benchmark. + * @param[in] pid Unique ID of the calling user process. * * @see benchmarking * @@ -307,77 +359,19 @@ namespace grb { */ template< typename T, typename U, - enum Backend implementation = config::default_backend + enum Backend implementation > static RC benchmark( - void ( *alp_program )( const T &, U & ), - const T &data_in, - U &data_out, - const size_t inner, - const size_t outer, + AlpTypedFunc< T, U > alp_program, + const T &data_in, U &data_out, + const size_t inner, const size_t outer, const size_t pid ) { - const double inf = std::numeric_limits< double >::infinity(); - grb::utils::TimerResults total_times, min_times, max_times; - grb::utils::TimerResults * sdev_times = - new grb::utils::TimerResults[ outer ]; - total_times.set( 0 ); - min_times.set( inf ); - max_times.set( 0 ); - - // outer loop - for( size_t out = 0; out < outer; ++out ) { - grb::utils::TimerResults inner_times; - inner_times.set( 0 ); - - // inner loop - for( size_t in = 0; in < inner; ++in ) { - data_out.times.set( 0 ); - - ( *alp_program )( data_in, data_out ); - grb::collectives< implementation >::reduce( data_out.times.io, 0, - grb::operators::max< double >() ); - grb::collectives< implementation >::reduce( data_out.times.preamble, 0, - grb::operators::max< double >() ); - grb::collectives< implementation >::reduce( data_out.times.useful, 0, - grb::operators::max< double >() ); - grb::collectives< implementation >::reduce( data_out.times.postamble, 0, - grb::operators::max< double >() ); - inner_times.accum( data_out.times ); - } - - // calculate performance stats - benchmark_calc_inner( out, inner, inner_times, total_times, min_times, - max_times, sdev_times ); - -#ifndef _GRB_NO_STDIO - // give experiment output line - if( pid == 0 ) { - std::cout << "Outer iteration #" << out << " timings " - << "(io, preamble, useful, postamble, time since epoch): " << std::fixed - << inner_times.io << ", " << inner_times.preamble << ", " - << inner_times.useful << ", " << inner_times.postamble << ", "; - printTimeSinceEpoch( false ); - std::cout << std::scientific; - } -#endif - - // pause for next outer loop - if( sleep( 1 ) != 0 ) { -#ifndef _GRB_NO_STDIO - std::cerr << "Sleep interrupted, assume benchmark is unreliable; " - << "exiting.\n"; -#endif - abort(); - } - } - - // calculate performance stats - benchmark_calc_outer( outer, total_times, min_times, max_times, sdev_times, + auto runner = [ alp_program, &data_in, &data_out ] { + alp_program( data_in, data_out ); + }; + return benchmark< implementation >( runner, data_out.times, inner, outer, pid ); - delete[] sdev_times; - - return SUCCESS; } @@ -436,11 +430,14 @@ namespace grb { */ Benchmarker( const size_t process_id = 0, - size_t nprocs = 1, - std::string 
hostname = "localhost", - std::string port = "0" + const size_t nprocs = 1, + const std::string hostname = "localhost", + const std::string port = "0" ) { - (void)process_id; (void)nprocs; (void)hostname; (void)port; + (void) process_id; + (void) nprocs; + (void) hostname; + (void) port; #ifndef _GRB_NO_EXCEPTIONS throw std::logic_error( "Benchmarker class called with unsupported mode or " "implementation" ); @@ -450,17 +447,16 @@ namespace grb { /** * Benchmarks a given ALP program. * - * This variant applies to input data as a user-defined POD struct and - * output data as a user-defined POD struct. + * This variant applies to typed ALP programs. * * @tparam T Input type of the given user program. * @tparam U Output type of the given user program. * - * @param[in] alp_program The ALP program to be benchmarked - * @param[in] data_in Input data as a raw data blob - * @param[out] data_out Output data - * @param[in] inner The number of inner repetitions of the benchmark - * @param[in] outer The number of outer repetitions of the benchmark + * @param[in] alp_program The ALP program to be benchmarked. + * @param[in] data_in Input data. + * @param[out] data_out Output data. + * @param[in] inner Number of inner repetitions of the benchmark. + * @param[in] outer Number of outer repetitions of the benchmark. * @param[in] broadcast An optional argument that dictates whether the * \a data_in argument should be broadcast across all * user processes participating in the benchmark, @@ -469,6 +465,8 @@ namespace grb { * The default value of \a broadcast is false. * * @returns #grb::SUCCESS The benchmarking has completed successfully. + * @returns #grb::ILLEGAL If \a broadcast was false but \a T is not + * default-constructible. * @returns #grb::FAILED An error during benchmarking has occurred. The * benchmark attempt could be retried, and an error * for the failure is reported to the standard error @@ -477,6 +475,8 @@ namespace grb { * starting the benchmark, while benchmarking, or * while aggregating the final results. * + * @see #grb::Launcher for more details. + * * @see benchmarking * * \internal This is the base implementation that should be specialised by @@ -485,10 +485,8 @@ namespace grb { template< typename T, typename U > RC exec( void ( *alp_program )( const T &, U & ), - const T &data_in, - U &data_out, - const size_t inner, - const size_t outer, + const T &data_in, U &data_out, + const size_t inner, const size_t outer, const bool broadcast = false ) const { (void) alp_program; @@ -502,23 +500,26 @@ namespace grb { // furthermore, it should be impossible to call this function without // triggering an exception during construction of this stub class, so we // just return PANIC here +#ifndef _GRB_NO_STDIO + std::cerr << "Error: base Benchmarker::exec called. An implementation-" + << "specific variant should have been called instead.\n"; +#endif return PANIC; } /** * Benchmarks a given ALP program. * - * This variant applies to input data as a byte blob and output data as a - * user-defined POD struct. + * This variant applies to untyped ALP programs. * * @tparam U Output type of the given user program. 
* - * @param[in] alp_program The use rogram to be benchmarked - * @param[in] data_in Input data as a raw data blob - * @param[in] in_size The size, in bytes, of the input data - * @param[out] data_out Output data - * @param[in] inner The number of inner repetitions of the benchmark - * @param[in] outer The number of outer repetitions of the benchmark + * @param[in] alp_program The user program to be benchmarked. + * @param[in] data_in Input data as a raw data blob. + * @param[in] in_size The size, in bytes, of the input data. + * @param[out] data_out Output data. + * @param[in] inner Number of inner repetitions of the benchmark. + * @param[in] outer Number of outer repetitions of the benchmark. * @param[in] broadcast An optional argument that dictates whether the * \a data_in argument should be broadcast across all * user processes participating in the benchmark, @@ -537,6 +538,8 @@ namespace grb { * starting the benchmark, while benchmarking, or * while aggregating the final results. * + * @see #grb::Launcher for more details. + * * @see benchmarking * * \internal This is the base implementation that should be specialised by @@ -562,6 +565,10 @@ namespace grb { // furthermore, it should be impossible to call this function without // triggering an exception during construction of this stub class, so we // just return PANIC here +#ifndef _GRB_NO_STDIO + std::cerr << "Error: base Benchmarker::exec called. An implementation-" + << "specific variant should have been called instead.\n"; +#endif return PANIC; } @@ -570,13 +577,7 @@ namespace grb { * * Calling this function is equivalent to calling #grb::Launcher::finalize. * - * After a call to this function, no further ALP programs may be benchmarked - * nor launched-- i.e., both the #grb::Launcher and #grb::Benchmarker - * functionalities many no longer be used. - * - * A well-behaving program calls this function, or #grb::Launcher::finalize, - * exactly once and just before exiting (or just before the guaranteed last - * invocation of an ALP program). + * @see #grb::Launcher for further details. * * @return #grb::SUCCESS The resources have successfully and permanently been * released. diff --git a/include/graphblas/base/exec.hpp b/include/graphblas/base/exec.hpp index fefb10132..18d7b9d99 100644 --- a/include/graphblas/base/exec.hpp +++ b/include/graphblas/base/exec.hpp @@ -40,38 +40,75 @@ namespace grb { + /** + * Type definition for an ALP function with input type information. + */ + template< typename InputType, typename OutputType > + using AlpTypedFunc = void ( * )( const InputType &, OutputType & ); + + /** + * Type definition for an ALP function without input type information. + */ + template< typename OutputType > + using AlpUntypedFunc = void ( * )( const void *, size_t, OutputType & ); + /** * The various ways in which the #grb::Launcher can be used to execute an * ALP program. * - * \warning An implementation may require different linker commands - * when using different modes. + * \warning An implementation or backend may require different linker commands + * when using different modes, and may require different arguments be + * passed on program launch. Please see the compiler and runner + * wrappers grbcxx, alpcxx, grbrun, and/or + * alprun for more details; or refer to the implementation + * documentation. * - * \warning Depending on the mode given to #grb::Launcher, the parameters - * required for the exec function may differ. 
+ * \warning Depending on the mode given to #grb::Launcher, different parameters + * to the exec function may be required. * - * \note However, the ALP program is unaware of which mode is the launcher - * employs and will not have to change. + * An ALP program remains unaware of which mode the launcher employs. Normally, + * it requires no change depending on how it is launched. An exception is when + * data is passed through and from the caller program: + * -# if the launch mode is #AUTOMATIC, best practice is to minimise the input + * data footprint that requires broadcasting to all user processes + * executing the algorithm; in the base case, no input data requires + * broadcasting. Output is retained only from the first user process, i.e., + * the user process for which #grb::spmd<>::pid() returns zero. + * -# for any other launch mode, multiple user processes may exist before any + * ALP or ALP/GraphBLAS context exists. Each pre-existing process in such + * external context is then mapped to an ALP user process in a one-to-one + * manner. Data, including pointer data, may be passed freely between these + * two mapped processes; this may, in principle and contrary to the + * automatic mode, consider large data. Output is retained at each user + * process and thus is freely available to the mapped external process. In + * best practice, different user processes return different parts of the + * overall output, thereby achieving parallel I/O. */ enum EXEC_MODE { /** - * Automatic mode. The #grb::Launcher can spawn user processes - * which will execute a given program. + * Automatic mode. + * + * The #grb::Launcher may spawn additional user processes which will jointly + * execute a given ALP program. */ AUTOMATIC = 0, /** - * Manual mode. The user controls \a nprocs user processes - * which together should execute a given program, by, for - * example, using the #grb::Launcher. + * Manual mode. + * + * The user controls \a nprocs external processes which jointly should form an + * ALP context and execute one or more given ALP programs. */ MANUAL, /** - * When running from an MPI program. The user controls - * \a nprocs MPI programs, which, together, should execute - * a given ALP program. + * From MPI mode. + * + * The user controls \a nprocs external MPI processes which jointly should + * form an ALP context and execute one or more given ALP programs. The only + * difference with the manual mode is that this mode guarantees that the + * pre-existing external processes are MPI processes. */ FROM_MPI @@ -81,17 +118,24 @@ namespace grb { * A group of user processes that together execute ALP programs. * * Allows an application to run any ALP program. Input data may be passed - * through a user-defined type. Output data will be retrieved via the same - * type. + * through a user-defined type. Output data will be retrieved via another user- + * defined type. * - * For backends that support multiple user processes, the caller may - * explicitly set the process ID and total number of user processes. + * For backends that support multiple user processes, the caller may explicitly + * set the process ID and total number of user processes. If the launcher is + * requested to spawn new user processes, i.e., if it is constructed using the + * #AUTOMATIC mode, then the backend spawns an implementation-defined number of + * additional user processes beyond that corresponding to the process + * constructing the launcher instance, that then jointly execute ALP programs + * in parallel. 
* - * The intended use is to `just call' the exec function, which should be - * accepted by any backend. + * The intended use is to `just call' the exec function, which must be accepted + * by any backend in any implementation, to execute any ALP program. * * @tparam mode Which #EXEC_MODE the Launcher should adhere to. - * @tparam backend Which backend is to be used. + * @tparam backend Which backend to use. This is a hidden template argument that + * defaults to the backend selected at compile time through + * grbcxx or alpcxx. */ template< enum EXEC_MODE mode, enum Backend backend > class Launcher { @@ -99,12 +143,21 @@ namespace grb { public : /** - * Constructs a new #grb::Launcher. This constructor is a collective call; - * all \a nprocs processes that form a single launcher group must make a - * simultaneous call to this constructor. + * Constructs a new #grb::Launcher. + * + * In #AUTOMATIC mode, a single root user processes issues a call to this + * constructor. In all other modes, a call to this constructor is + * \em collective: all \a nprocs processes that are to form a single launcher + * group, must make a simultaneous call to this constructor and must do so + * with consistent arguments. + * + * \note One may note that in all modes, a call to this constructor must be + * collective; it is just that in automatic mode there is but one + * process involved with the collective call (i.e., \a nprocs is one). * * There is an implementation-defined time-out for the creation of a launcher - * group. + * group. The default arguments to the below are consistent with the + * automatic launcher mode. * * @param[in] process_id The user process ID of the calling process. * The value must be larger or equal to 0. This @@ -124,18 +177,32 @@ namespace grb { * if and only if \a nprocs is larger than one. * Optional: the default value is `0'. * - * @throws invalid_argument If \a nprocs is zero. - * @throws invalid_argument If \a process_id is greater than or equal to - * \a nprocs. + * While these arguments are generic and would work with most network + * fabrics, some modes such as indeed #FROM_MPI may require other arguments + * for constructing a launcher. In terms of specification, only #AUTOMATIC + * and #MANUAL are required to implement this specific constructor + * signature, including the specified defaults for each argument. All + * aforementioned default values must be legal for the #AUTOMATIC and + * #MANUAL modes. + * + * Any other mode in #grb::EXEC_MODE, with possibly different constructor + * signatures from those listed here, are both optional and implementation- + * specific. * * \note An implementation or backend may define further constraints on the * input arguments, such as, obviously, on \a hostname and \a port, but * also on \a nprocs and, as a result, on \a process_id. - * \note The most obvious is that backends supporting only one user process - * must not accept \a nprocs larger than 1. + * \note The most obvious such restriction has backends supporting only one + * user process not accepting \a nprocs larger than 1. + * + * @throws invalid_argument If \a nprocs is zero. + * @throws invalid_argument If \a process_id is greater than or equal to + * \a nprocs. * - * All aforementioned default values shall always be legal. + * @throws std::invalid_argument If \a nprocs is zero. + * @throws std::invalid_argument If \a process_id is larger than or equal to + * \a nprocs. 
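+ *
+ * For illustration only (the hostname and port values are arbitrary), four
+ * pre-existing processes may form one launcher group in #MANUAL mode via
+ * \code
+ * grb::Launcher< grb::MANUAL > launcher( s, 4, "localhost", "7777" );
+ * \endcode
+ * with \a s the unique ID (0, 1, 2, or 3) of the calling process, whereas a
+ * single process in #AUTOMATIC mode may simply construct
+ * \code
+ * grb::Launcher< grb::AUTOMATIC > launcher;
+ * \endcode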
 */
 Launcher(
 const size_t process_id = 0,
@@ -145,7 +212,8 @@
 ) {
 // spec does not specify any constrants on hostname and port
 // so accept (and ignore) anything
- (void) hostname; (void) port;
+ (void) hostname;
+ (void) port;

 #ifndef _GRB_NO_EXCEPTIONS
 // sanity checks on process_id and nprocs
@@ -164,38 +232,77 @@
 * Executes a given ALP program using the user processes encapsulated by this
 * launcher group.
 *
- * Calling this function, depending on whether the automatic or manual/MPI
- * mode was selected, will either \em spawn the maximum number of available
- * user processes and \em then execute the given program, \em or it will
- * employ the given processes that are managed by the user application and
- * used to construct this launcher instance to execute the given
- * \a alp_program.
+ * Calling this function, depending on whether the automatic, manual, or from
+ * MPI mode was selected, will either:
+ * -# use processes spawned by the ALP implementation, as well as the process
+ *    which had constructed this launcher instance, to jointly execute the
+ *    given \a alp_program, \em or
+ * -# employ the given processes that are managed by the user application
+ *    and used to construct this launcher instance to execute the given
+ *    \a alp_program.
 *
 * This is a collective function call-- all processes in the launcher group
 * must make a simultaneous call to this function and must do so using
 * consistent arguments.
 *
- * @tparam T The type of the data to pass to the ALP program as input.
- * @tparam U The type of the output data to pass back to the caller.
+ * @tparam T The type of the data to pass to the ALP program as input. This
+ *           must be a POD type that contains no pointers.
+ *
+ * \note In fact, \a T may be standard layout and contain no pointers, or it
+ *       may be trivially copyable and contain no pointers.
+ *
+ * For calls with \a broadcast false, \a T must furthermore be
+ * default-constructible (and have meaningful default values that allow for
+ * successful multi-process execution).
+ *
+ * For programs or entry points that are solely to be called from manual or
+ * from MPI modes with \a broadcast false, there are no constraints
+ * on the type \a T since instances of \a T are only ever passed within the
+ * pre-existing user process, and never communicated across user processes.
+ *
+ * @tparam U The type of the output data to pass back to the caller. This may
+ *           be of any type.
+ *
+ * When \a mode is #AUTOMATIC, the type \a U must be default-constructible.
 *
 * @param[in] alp_program The user program to be executed.
 * @param[in] data_in Input data of user-defined type \a T.
+ * @param[out] data_out Output data of user-defined type \a U.
+ * @param[in] broadcast Whether the input should be broadcast from user
+ *                      process 0 to all other user processes. Optional;
+ *                      the default value is false.
+ *
+ * When in automatic mode and \a broadcast is false, the input data
+ * \a data_in will only be available at user process with ID 0-- any other
+ * user processes will receive a default-constructed \a data_in instead.
+ * When in automatic mode and \a broadcast is true, the input data
+ * \a data_in will be available at all user processes instead.
 *
- * When in automatic mode and \a broadcast is false, the data will
- * only be available at user process with ID 0. When in automatic mode and
- * \a broadcast is true, the data will be available at all user
- * processes. When in manual mode, the data will be available to this user
- * process only, with "this process" corresponding to the process that calls
- * this function.
+ * When in #MANUAL or #FROM_MPI mode, each user process should collectively
+ * call this function. If \a broadcast is false, the input data
+ * will be passed from the external calling process to the corresponding ALP
+ * user processes in a one-to-one manner. Should \a broadcast be
+ * true, then the initial input data passed this way is overwritten
+ * for user processes \f$ s > 0 \f$ with the \a data_in passed at user
+ * process zero.
 *
- * @param[out] data_out Output data of user-defined type \a U. The output
- *                      data should be available at user process with ID
- *                      zero.
- * @param[in] broadcast Whether the input should be broadcast from user
- *                      process 0 to all other user processes. Optional;
- *                      the default value is \a false.
+ * Only in #MANUAL or #FROM_MPI modes will the output of user processes
+ * with ID \f$ s > 0 \f$ be returned to all the processes that collectively
+ * call this function.
+ *
+ * In #AUTOMATIC mode, the output at \f$ s > 0 \f$ is lost. Only the output
+ * of the first user process \f$ s = 0 \f$ will be passed back to the root
+ * process that called this function.
+ *
+ * \note The default for \a broadcast is false as it is the variant
+ *       that implies the least cost when launching a program.
+ *
+ * \note The #FROM_MPI mode is specific to this implementation and need not
+ *       be provided as part of the specification.
 *
 * @return #grb::SUCCESS If the execution proceeded as intended.
+ * @return #grb::ILLEGAL If \a broadcast was false and \a mode was
+ *                       #AUTOMATIC, but \a T is not default-constructible.
 * @return #grb::PANIC If an unrecoverable error was encountered while
 *                     attempting to execute, attempting to terminate, or
 *                     while executing, the given program.
@@ -204,11 +311,12 @@
 * achieve its intended result-- for example, an iterative solver
 * may fail to converge. A good programming pattern has that \a U
 * either a) is an error code for the algorithm used (e.g.,
- * #grb::RC), or b) that \a U contains such an error code.
+ * int or #grb::RC), or b) that \a U is a struct that
+ * contains such an error code.
 */
 template< typename T, typename U >
 RC exec(
- void ( *alp_program )( const T &, U & ),
+ AlpTypedFunc< T, U > alp_program,
 const T &data_in,
 U &data_out,
 const bool broadcast = false
@@ -227,30 +335,43 @@
 * launcher group.
 *
 * This variant of exec has that \a data_in is of a variable byte size,
- * instead of a fixed POD type. If \a broadcast is true and the
- * launcher is instantiated using the #grb::AUTOMATIC mode, all bytes are
- * broadcast to all user processes.
+ * instead of a fixed (POD pointer-less) type. We refer to the given function
+ * as an untyped ALP function (since the input is a raw pointer), whereas the
+ * other variant executes \em typed ALP functions instead.
 *
- * @param[in] alp_program The user program to be executed.
+ * If \a broadcast is true, all bytes are broadcast from the user
+ * process with ID zero to all other user processes.
+ *
+ * \note When in #MANUAL or #FROM_MPI mode, this implies any arguments passed
+ *       in a process-to-process manner will be lost.
+ *
+ * If \a broadcast is false and the launcher is in #AUTOMATIC mode,
+ * then the user processes with ID \f$ s > 0 \f$ will receive \a data_in
+ * equal to nullptr and \a in_size equal to zero.
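+ *
+ * As a purely illustrative sketch, an untyped ALP function and its launch
+ * could read as follows (the program and payload are hypothetical):
+ * \code
+ * void my_untyped_program( const void *in, const size_t in_size, int &out ) {
+ *     // interpret the raw input bytes as needed; this sketch only records the size
+ *     (void) in;
+ *     out = static_cast< int >( in_size );
+ * }
+ *
+ * grb::Launcher< grb::AUTOMATIC > launcher;
+ * int result = 0;
+ * const char payload[] = "raw input";
+ * const grb::RC rc = launcher.exec( &my_untyped_program, payload,
+ *     sizeof( payload ), result, true );
+ * // on success, result equals sizeof( payload ) at the calling process
+ * \endcode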
+ * + * See the \em typed ALP exec variant for more detailed comments, which also + * transfer to this untyped variant. + * + * @param[in] alp_program The (untyped) user program to be executed. * @param[in] data_in Pointer to raw input byte data. * @param[in] in_size The number of bytes the input data consists of. - * @param[out] data_out Output data of user-defined type \a U. The output - * data should be available at user process with ID - * zero. - * @param[in] broadcast Whether the input should be broadcast from user - * process 0 to all other user processes. Optional; - * the default value is \a false. + * @param[out] data_out Output data of user-defined type \a U. The output + * data should be available at user process with ID + * zero. + * @param[in] broadcast Whether the input should be broadcast from user + * process 0 to all other user processes. Optional; + * the default value is \a false. * * @return #grb::SUCCESS If the execution proceeded as intended. + * @return #grb::ILLEGAL If \a in_size is larger than zero but \a data_in is + * equal to nullptr. * @return #grb::PANIC If an unrecoverable error was encountered while * attempting to execute, attempting to terminate, or * while executing, the given program. - * - * For more details, see the other version of this function. */ template< typename U > RC exec( - void ( *alp_program )( const void *, const size_t, U & ), + AlpUntypedFunc< U > alp_program, const void * data_in, const size_t in_size, U &data_out, @@ -268,25 +389,24 @@ namespace grb { * Releases all ALP resources. * * After a call to this function, no further ALP programs may launched using - * the #grb::Launcher and #grb::Benchmarker. Also the use of #grb::init and - * #grb::finalize will no longer be accepted. + * \em any #grb::Launcher or #grb::Benchmarker instance. Implementations and + * backends shall under no circumstance require a call to this function; any + * use of this function shall remain purely optional. * - * \warning #grb::init and #grb::finalize are deprecated. + * \warning After a call to this function, also any subsequent call to the + * deprecated #grb::init and #grb::finalize will no longer be + * accepted. * * \internal - * \todo Remove the above comments once #grb::init and #grb::finalize are + * \todo Remove the above warning once #grb::init and #grb::finalize are * moved to an internal namespace. * \endinternal * * After a call to this function, the only way to once again run ALP programs - * is to use the #grb::Launcher from a new process. + * is to use the #grb::Launcher from a different process. * * \warning Therefore, use this function with care and preferably only just - * before exiting the process. - - * A well-behaving program calls this function, or - * #grb::Benchmarker::finalize, exactly once before its process terminates, - * or just after the guaranteed last invocation of an ALP program. + * before exiting the process-- or not at all. * * @return #grb::SUCCESS The resources have successfully and permanently been * released. @@ -296,25 +416,14 @@ namespace grb { * undefined and should no longer be used. * * \note In the terminology of the Message Passing Interface (MPI), this - * function is the ALP equivalent of the MPI_Finalize(). - * - * \note In #grb::AUTOMATIC mode when using a parallel backend that uses MPI - * to auto-parallelise the ALP computations, MPI is never explicitly - * exposed to the user application. This use case necessitates the - * specification of this function. 
- * - * \note Thus, and in particular, an ALP program launched in #grb::AUTOMATIC - * mode while using the #grb::BSP1D or the #grb::hybrid backends with - * ALP compiled using LPF that in turn is configured to use an - * MPI-based engine, should make sure to call this function before - * program exit. - * - * \note An application that launches ALP programs in #grb::FROM_MPI mode - * must still call this function, even though a proper such application - * makes its own call to MPI_Finalize(). This does \em not - * induce improper behaviour since calling this function using a - * launcher instance in #grb::FROM_MPI mode translates, from an MPI - * perspective, to a no-op. + * function is similar to MPI_Finalize(). + * + * \warning Different from MPI, however, a call to this function at program + * exit is not mandatory. + * + * \warning An application that launches ALP programs in #grb::FROM_MPI mode + * that calls this function, must (afterwards) still make a call to + * MPI_Finalize(). * * \internal This is the base implementation that should be specialised by * each backend separately. diff --git a/include/graphblas/benchmark.hpp b/include/graphblas/benchmark.hpp index ccace7979..81bd67773 100644 --- a/include/graphblas/benchmark.hpp +++ b/include/graphblas/benchmark.hpp @@ -45,7 +45,10 @@ #ifdef _GRB_BACKEND namespace grb { - template< enum EXEC_MODE mode, enum Backend implementation = config::default_backend > + template< + enum EXEC_MODE mode, + enum Backend implementation = config::default_backend + > class Benchmarker; } #endif diff --git a/include/graphblas/bsp/collectives.hpp b/include/graphblas/bsp/collectives.hpp index 098f7f738..e7291734d 100644 --- a/include/graphblas/bsp/collectives.hpp +++ b/include/graphblas/bsp/collectives.hpp @@ -66,331 +66,388 @@ "************************************************************************" \ "**********************\n" ); + namespace grb { /** - * Collective communications using the GraphBLAS operators for - * reduce-style operations. This is the BSP1D implementation. + * Collective communications using ALP operators for reduce-style operations. + * + * This is the BSP1D implementation. * * TODO internal issue #198 */ template<> class collectives< BSP1D > { - private: - /** Disallow instantiation of this class. */ - collectives() {} - - public: - /** - * Schedules an allreduce operation of a single object of type IOType per - * process. The allreduce shall be complete by the end of the call. This is a - * collective graphBLAS operation. - * - * \parblock - * \par Performance semantics: - * -# Problem size N: \f$ P * \mathit{sizeof}(\mathit{IOType}) \f$ - * -# local work: \f$ N*Operator \f$ ; - * -# transferred bytes: \f$ N \f$ ; - * -# BSP cost: \f$ Ng + N*Operator + l \f$; - * \endparblock - * - * This function may place an alloc of \f$ P\mathit{sizeof}(IOType) \f$ bytes - * if the internal buffer was not sufficiently large. - */ - template< - Descriptor descr = descriptors::no_operation, - typename Operator, typename IOType - > - static RC allreduce( IOType &inout, const Operator &op = Operator() ) { - // this is the serial algorithm only - // TODO internal issue #19 + private: + + /** Disallow instantiation of this class. */ + collectives() {} + + + public: + + /** + * Schedules an allreduce operation of a single object of type IOType per + * process. The allreduce shall be complete by the end of the call. This is a + * collective graphBLAS operation. 
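+			 *
+			 * A purely illustrative sketch, assuming a double value and the standard
+			 * plus operator:
+			 * \code
+			 * double local = 1.5; // each user process contributes its own value
+			 * grb::operators::add< double > plus;
+			 * const grb::RC rc = grb::collectives<>::allreduce( local, plus );
+			 * // on success, local now holds the sum over all user processes
+			 * \endcode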
+ * + * \parblock + * \par Performance semantics: + * -# Problem size N: \f$ P * \mathit{sizeof}(\mathit{IOType}) \f$ + * -# local work: \f$ N*Operator \f$ ; + * -# transferred bytes: \f$ N \f$ ; + * -# BSP cost: \f$ Ng + N*Operator + l \f$; + * \endparblock + * + * This function may place an alloc of \f$ P\mathit{sizeof}(IOType) \f$ bytes + * if the internal buffer was not sufficiently large. + */ + template< + Descriptor descr = descriptors::no_operation, + typename Operator, typename IOType + > + static RC allreduce( IOType &inout, const Operator &op = Operator() ) { + // this is the serial algorithm only + // TODO internal issue #19 #ifdef _DEBUG - std::cout << "Entered grb::collectives< BSP1D >::allreduce with inout = " - << inout << " and op = " << &op << std::endl; + std::cout << "Entered grb::collectives< BSP1D >::allreduce with inout = " + << inout << " and op = " << &op << std::endl; #endif - // static sanity check - NO_CAST_ASSERT_BLAS0( ( !( descr & descriptors::no_casting ) || - std::is_same< IOType, typename Operator::D1 >::value || - std::is_same< IOType, typename Operator::D2 >::value || - std::is_same< IOType, typename Operator::D3 >::value - ), - "grb::collectives::allreduce", - "Incompatible given value type and operator domains while " - "no_casting descriptor was set" - ); - - // we need access to LPF context - internal::BSP1D_Data &data = internal::grb_BSP1D.load(); - - // catch trivial case early - if( data.P == 1 ) { - return SUCCESS; - } + // static sanity check + NO_CAST_ASSERT_BLAS0( ( !( descr & descriptors::no_casting ) || + std::is_same< IOType, typename Operator::D1 >::value || + std::is_same< IOType, typename Operator::D2 >::value || + std::is_same< IOType, typename Operator::D3 >::value + ), + "grb::collectives::allreduce", + "Incompatible given value type and operator domains while " + "no_casting descriptor was set" + ); + + // we need access to LPF context + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); + + // catch trivial case early + if( data.P == 1 ) { + return SUCCESS; + } - // we need to register inout - lpf_memslot_t inout_slot = LPF_INVALID_MEMSLOT; - if( data.ensureMemslotAvailable() != grb::SUCCESS ) { + // we need to register inout + lpf_memslot_t inout_slot = LPF_INVALID_MEMSLOT; + if( data.ensureMemslotAvailable() != grb::SUCCESS ) { #ifndef NDEBUG - const bool could_not_ensure_enough_memory_slots_available = false; - assert( could_not_ensure_enough_memory_slots_available ); + const bool could_not_ensure_enough_memory_slots_available = false; + assert( could_not_ensure_enough_memory_slots_available ); #endif - return PANIC; - } - if( lpf_register_local( data.context, - &inout, - sizeof( IOType ), - &inout_slot - ) != LPF_SUCCESS - ) { + return PANIC; + } + if( lpf_register_local( data.context, + &inout, + sizeof( IOType ), + &inout_slot + ) != LPF_SUCCESS + ) { #ifndef NDEBUG - const bool lpf_register_returned_error = false; - assert( lpf_register_returned_error ); + const bool lpf_register_returned_error = false; + assert( lpf_register_returned_error ); #endif - return PANIC; - } else { - data.signalMemslotTaken(); - } + return PANIC; + } else { + data.signalMemslotTaken(); + } - // allgather inout values - // note: buffer size check is done by the below function - if( internal::allgather( - inout_slot, 0, - data.slot, data.s * sizeof( IOType ), - sizeof( IOType ), - data.P * sizeof( IOType ), - true - ) != grb::SUCCESS ) { + // allgather inout values + // note: buffer size check is done by the below function + if( 
internal::allgather( + inout_slot, 0, + data.slot, data.s * sizeof( IOType ), + sizeof( IOType ), + data.P * sizeof( IOType ), + true + ) != grb::SUCCESS ) { #ifndef NDEBUG - const bool allgather_returned_error = false; - assert( allgather_returned_error ); + const bool allgather_returned_error = false; + assert( allgather_returned_error ); #endif - return PANIC; + return PANIC; } - // deregister - if( lpf_deregister( data.context, inout_slot ) != LPF_SUCCESS ) { + // deregister + if( lpf_deregister( data.context, inout_slot ) != LPF_SUCCESS ) { #ifndef NDEBUG - const bool lpf_deregister_returned_error = false; - assert( lpf_deregister_returned_error ); + const bool lpf_deregister_returned_error = false; + assert( lpf_deregister_returned_error ); #endif - return PANIC; - } else { - data.signalMemslotReleased(); - } - - // fold everything - IOType * __restrict__ const buffer = data.getBuffer< IOType >(); - for( size_t i = 0; i < data.P; ++i ) { - if( i == data.s ) { - continue; + return PANIC; + } else { + data.signalMemslotReleased(); } + + // fold everything + IOType * __restrict__ const buffer = data.getBuffer< IOType >(); + for( size_t i = 0; i < data.P; ++i ) { + if( i == data.s ) { + continue; + } #ifdef _DEBUG - std::cout << data.s - << ": in Collectives< BSP1D >::allreduce. Buffer " - "index " - << i << ", folding " << buffer[ i ] << " into " << inout << ", yields "; + std::cout << data.s << ": in Collectives< BSP1D >::allreduce. Buffer " + << "index " << i << ", folding " << buffer[ i ] << " into " << inout + << ", yields "; #endif - // if casting is required to apply op, foldl will take care of this - if( foldl< descr >( inout, buffer[ i ], op ) != SUCCESS ) { - assert( false ); - } + // if casting is required to apply op, foldl will take care of this + if( foldl< descr >( inout, buffer[ i ], op ) != SUCCESS ) { + assert( false ); + } #ifdef _DEBUG - std::cout << inout << std::endl; + std::cout << inout << std::endl; #endif - } + } - // done - return SUCCESS; - } - - /** - * Schedules a reduce operation of a single object of type IOType per process. - * The reduce shall be complete by the end of the call. This is a collective - * graphBLAS operation. The BSP costs are as for the PlatformBSP #reduce. - * - * \parblock - * \par Performance semantics: - * -# Problem size N: \f$ P * \mathit{sizeof}(\mathit{IOType}) \f$ - * -# local work: \f$ N*Operator \f$ ; - * -# transferred bytes: \f$ N \f$ ; - * -# BSP cost: \f$ Ng + N*Operator + l \f$; - * \endparblock - * - */ - template< Descriptor descr = descriptors::no_operation, typename Operator, typename IOType > - static RC reduce( IOType & inout, const lpf_pid_t root = 0, const Operator op = Operator() ) { - // this is the serial algorithm only - // TODO internal issue #19 - - // static sanity check - NO_CAST_ASSERT_BLAS0( ( ! 
( descr & descriptors::no_casting ) || std::is_same< IOType, typename Operator::D1 >::value || std::is_same< IOType, typename Operator::D2 >::value || - std::is_same< IOType, typename Operator::D3 >::value ), - "grb::collectives::reduce", - "Incompatible given value type and operator domains while " - "no_casting descriptor was set" ); - - // we need access to LPF context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); - - // catch trivial case early - if( data.P == 1 ) { + // done return SUCCESS; } - // make sure we can support comms pattern: IOType -> P * IOType - lpf_coll_t coll; - if( commsPreamble( data, &coll, data.P, data.P * sizeof( IOType ), 0, 1 ) != SUCCESS ) { - return PANIC; - } + /** + * Schedules a reduce operation of a single object of type IOType per process. + * The reduce shall be complete by the end of the call. This is a collective + * graphBLAS operation. The BSP costs are as for the PlatformBSP #reduce. + * + * \parblock + * \par Performance semantics: + * -# Problem size N: \f$ P * \mathit{sizeof}(\mathit{IOType}) \f$ + * -# local work: \f$ N*Operator \f$ ; + * -# transferred bytes: \f$ N \f$ ; + * -# BSP cost: \f$ Ng + N*Operator + l \f$; + * \endparblock + */ + template< + Descriptor descr = descriptors::no_operation, + typename Operator, typename IOType + > + static RC reduce( + IOType &inout, const lpf_pid_t root = 0, + const Operator op = Operator() + ) { + // this is the serial algorithm only + // TODO internal issue #19 + + // static sanity check + NO_CAST_ASSERT_BLAS0( ( !(descr & descriptors::no_casting) || + std::is_same< IOType, typename Operator::D1 >::value || + std::is_same< IOType, typename Operator::D2 >::value || + std::is_same< IOType, typename Operator::D3 >::value + ), "grb::collectives::reduce", + "Incompatible given value type and operator domains while " + "no_casting descriptor was set" + ); + + // we need access to LPF context + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); + + // catch trivial case early + if( data.P == 1 ) { + return SUCCESS; + } - // create a local register slot - lpf_memslot_t inout_slot = LPF_INVALID_MEMSLOT; - if( lpf_register_global( data.context, &inout, sizeof( IOType ), &inout_slot ) != LPF_SUCCESS ) { - return PANIC; - } + // make sure we can support comms pattern: IOType -> P * IOType + lpf_coll_t coll; + if( commsPreamble( + data, &coll, data.P, data.P * sizeof( IOType ), 0, 1 + ) != SUCCESS + ) { + return PANIC; + } - if( lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { - return PANIC; - } + // create a local register slot + lpf_memslot_t inout_slot = LPF_INVALID_MEMSLOT; + if( lpf_register_global( + data.context, &inout, sizeof( IOType ), &inout_slot + ) != LPF_SUCCESS + ) { + return PANIC; + } - // gather together values - if( lpf_gather( coll, inout_slot, data.slot, sizeof( IOType ), root ) != LPF_SUCCESS ) { - return PANIC; - } + if( lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { + return PANIC; + } - // finish the communication - if( lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { - return PANIC; - } + // gather together values + if( lpf_gather( + coll, inout_slot, data.slot, sizeof( IOType ), root + ) != LPF_SUCCESS + ) { + return PANIC; + } - // do deregister - if( lpf_deregister( data.context, inout_slot ) != LPF_SUCCESS ) { - return PANIC; - } + // finish the communication + if( lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { + return PANIC; + } - // fold everything: root only - if( data.s == root ) { - IOType * __restrict__ 
const buffer = data.getBuffer< IOType >(); - for( size_t i = 0; i < data.P; ++i ) { - if( i == root ) { - continue; - } - // if casting is required to apply op, foldl will take care of this - // note: the no_casting check could be deferred to foldl but this would result in unclear error messages - if( foldl< descr >( inout, buffer[ i ], op ) != SUCCESS ) { - return PANIC; + // do deregister + if( lpf_deregister( data.context, inout_slot ) != LPF_SUCCESS ) { + return PANIC; + } + + // fold everything: root only + if( data.s == root ) { + IOType * __restrict__ const buffer = data.getBuffer< IOType >(); + for( size_t i = 0; i < data.P; ++i ) { + if( i == root ) { + continue; + } + // if casting is required to apply op, foldl will take care of this + // note: the no_casting check could be deferred to foldl but this would + // result in unclear error messages + if( foldl< descr >( inout, buffer[ i ], op ) != SUCCESS ) { + return PANIC; + } } } - } - if( commsPostamble( data, &coll, data.P, data.P * sizeof( IOType ), 0, 1 ) != SUCCESS ) { - return PANIC; - } + if( commsPostamble( + data, &coll, data.P, data.P * sizeof( IOType ), 0, 1 + ) != SUCCESS + ) { + return PANIC; + } - // done - return SUCCESS; - } - - /** - * Schedules a broadcast operation of a single object of type IOType per process. - * The broadcast shall be complete by the end of the call. This is a collective - * graphBLAS operation. The BSP costs are as for the PlatformBSP #broadcast. - * - * @tparam IOType The type of the to-be broadcast value. - * - * @param[in,out] inout On input: the value at the root process to be broadcast. - * On output at process \a root: the same value. - * On output at non-root processes: the value at root. - * - * \parblock - * \par Performance semantics: common - * Whether system calls will happen depends on the LPF engine compiled with, - * as does whether buffer space is proportional to the payload size is - * required. In principle, when using a fabric like Inifiband and when using - * the LPF ibverbs engine, the intended IB zero-copy behaviour is attained. - * - * All below variants in any backend shall not result in dynamic memory - * allocations. 
- * \endparblock - * - * \parblock - * \par Performance semantics: serial - * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$ - * -# local work: \f$ 0 \f$ ; - * -# transferred bytes: \f$ NP \f$ ; - * -# BSP cost: \f$ NPg + l \f$; - * \endparblock - * - * \parblock - * \par Performance semantics: two phase - * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$ - * -# local work: \f$ 0 \f$ ; - * -# transferred bytes: \f$ 2N \f$ ; - * -# BSP cost: \f$ 2(Ng + l) \f$; - * \endparblock - * - * \parblock - * \par Performance semantics: two level tree - * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$ - * -# local work: \f$ 0 \f$ ; - * -# transferred bytes: \f$ 2\sqrt{P}N \f$ ; - * -# BSP cost: \f$ 2(\sqrt{P}Ng + l) \f$; - * \endparblock - */ - template< typename IOType > - static RC broadcast( IOType & inout, const lpf_pid_t root = 0 ) { - // we need access to LPF context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); - - // make sure we can support comms pattern: IOType -> IOType - lpf_coll_t coll; - if( commsPreamble( data, &coll, data.P, 0, 0, 1 ) != SUCCESS ) { - return PANIC; + // done + return SUCCESS; } - // register inout - lpf_memslot_t slot = LPF_INVALID_MEMSLOT; - if( data.ensureMemslotAvailable() != SUCCESS ) { - return PANIC; - } - if( lpf_register_global( data.context, &inout, sizeof( IOType ), &slot ) != LPF_SUCCESS ) { - return PANIC; - } + /** + * Schedules a broadcast operation of a single object of type IOType per process. + * The broadcast shall be complete by the end of the call. This is a collective + * graphBLAS operation. The BSP costs are as for the PlatformBSP #broadcast. + * + * @tparam IOType The type of the to-be broadcast value. + * + * @param[in,out] inout On input: the value at the root process to be broadcast. + * On output at process \a root: the same value. + * On output at non-root processes: the value at root. + * + * \parblock + * \par Performance semantics: common + * Whether system calls will happen depends on the LPF engine compiled with, + * as does whether buffer space is proportional to the payload size is + * required. In principle, when using a fabric like Inifiband and when using + * the LPF ibverbs engine, the intended IB zero-copy behaviour is attained. + * + * All below variants in any backend shall not result in dynamic memory + * allocations. 
+ * \endparblock + * + * \parblock + * \par Performance semantics: serial + * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$ + * -# local work: \f$ 0 \f$ ; + * -# transferred bytes: \f$ NP \f$ ; + * -# BSP cost: \f$ NPg + l \f$; + * \endparblock + * + * \parblock + * \par Performance semantics: two phase + * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$ + * -# local work: \f$ 0 \f$ ; + * -# transferred bytes: \f$ 2N \f$ ; + * -# BSP cost: \f$ 2(Ng + l) \f$; + * \endparblock + * + * \parblock + * \par Performance semantics: two level tree + * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$ + * -# local work: \f$ 0 \f$ ; + * -# transferred bytes: \f$ 2\sqrt{P}N \f$ ; + * -# BSP cost: \f$ 2(\sqrt{P}Ng + l) \f$; + * \endparblock + */ + template< typename IOType > + static RC broadcast( IOType &inout, const lpf_pid_t root = 0 ) { + // we need access to LPF context + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); + + // make sure we can support comms pattern: IOType -> IOType + lpf_coll_t coll; + if( commsPreamble( data, &coll, data.P, 0, 0, 1 ) != SUCCESS ) { + return PANIC; + } - if( lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { - return PANIC; - } + // register inout + lpf_memslot_t slot = LPF_INVALID_MEMSLOT; + if( data.ensureMemslotAvailable() != SUCCESS ) { + return PANIC; + } + if( lpf_register_global( + data.context, &inout, sizeof( IOType ), &slot + ) != LPF_SUCCESS + ) { + return PANIC; + } - // broadcast value - if( lpf_broadcast( coll, slot, slot, sizeof( IOType ), root ) != LPF_SUCCESS ) { - return PANIC; - } + if( lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { + return PANIC; + } - // finish communication - if( lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { - return PANIC; - } + // broadcast value + if( lpf_broadcast( + coll, slot, slot, sizeof( IOType ), root + ) != LPF_SUCCESS + ) { + return PANIC; + } - // coda - if( lpf_deregister( data.context, slot ) != LPF_SUCCESS ) { - return PANIC; - } + // finish communication + if( lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { + return PANIC; + } - if( commsPostamble( data, &coll, data.P, 0, 0, 1 ) != SUCCESS ) { - return PANIC; + // coda + if( lpf_deregister( data.context, slot ) != LPF_SUCCESS ) { + return PANIC; + } + + if( commsPostamble( data, &coll, data.P, 0, 0, 1 ) != SUCCESS ) { + return PANIC; + } + + // done + return SUCCESS; } - // done - return SUCCESS; - } + /** + * Schedules a broadcast of a raw array of a given type. + * + * @tparam IOType The array element type. + * + * @param[in,out] inout A pointer to the array to broadcast (for the root + * user process), or a pointer where to store the array + * to be broadcast (for all other user processes). + * @param[in] size The size, in number of array elements, of the array + * to be broadcast. Must match across all user processes + * in the collective call. + * @param[in] root Which user process ID is the root. + * + * \parblock + * \par Performance semantics + * + * Please refer to the LPF collectives higher-level library for the + * performance semantics of this call. (This function does not implements + * its own custom logic for this primitive.) + * \endparblock + * + * @returns grb::SUCCESS On successful broadcast of the requested array. + * @returns grb::PANIC If the communication layer has failed. 
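+			 *
+			 * A purely illustrative sketch, using placeholder array contents:
+			 * \code
+			 * double values[ 4 ] = { 1, 2, 3, 4 }; // meaningful at the root only
+			 * const grb::RC rc = grb::collectives<>::broadcast( values, 4 );
+			 * // on success, every user process holds the root's four values
+			 * \endcode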
+ */ + template< Descriptor descr = descriptors::no_operation, typename IOType > + static RC broadcast( + IOType * inout, const size_t size, const size_t root = 0 + ) { + return internal::broadcast< descr >( inout, size, root ); + } - /** TODO documentation */ - template< Descriptor descr = descriptors::no_operation, typename IOType > - static RC broadcast( IOType * inout, const size_t size, const size_t root = 0 ) { - return internal::broadcast< descr >( inout, size, root ); - } }; } // namespace grb @@ -398,3 +455,4 @@ namespace grb { #undef NO_CAST_ASSERT_BLAS0 #endif // end ``_H_GRB_BSP_COLL'' + diff --git a/include/graphblas/bsp/collectives_blas1.hpp b/include/graphblas/bsp/collectives_blas1.hpp index eace13468..7b1633cef 100644 --- a/include/graphblas/bsp/collectives_blas1.hpp +++ b/include/graphblas/bsp/collectives_blas1.hpp @@ -37,13 +37,18 @@ #include "internal-collectives.hpp" -/** The difference between pid and root, modulus P - circumvents weird modulus behaviour under -ve numbers */ -#define DIFF( pid, root, P ) ( ( pid < root ) ? pid + P - root : pid - root ) % P +/** + * The difference between pid and root, modulus P - circumvents weird modulus + * behaviour under -ve numbers + */ +#define DIFF( pid, root, P ) ( (pid < root) ? pid + P - root : pid - root ) % P + namespace grb { /** - * Collective communications using the GraphBLAS operators for reduce-style operations. + * Collective communications using the GraphBLAS operators for reduce-style + * operations. */ namespace internal { @@ -57,8 +62,9 @@ namespace grb { * Default is grb::descriptors::no_operation. * @tparam IOType The type of the to-be gathered value. * - * @param[in] in: The value at the calling process to be gathered. - * @param[out] out: The vector of gathered values, available at the root process. + * @param[in] in The value at the calling process to be gathered. + * @param[out] out The vector of gathered values, available at the root + * process. * * @returns grb::SUCCESS When the operation succeeds as planned. * @returns grb::PANIC When the communication layer unexpectedly fails. 
When @@ -74,22 +80,25 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename IOType #ifndef BLAS1_RAW , typename Coords #endif - > - RC gather( const IOType & in, + > + RC gather( + const IOType &in, #ifdef BLAS1_RAW IOType * out, #else - Vector< IOType, reference, Coords > & out, + Vector< IOType, reference, Coords > &out, #endif - const lpf_pid_t root ) { + const lpf_pid_t root + ) { // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); // run-time sanity check #ifndef BLAS1_RAW @@ -121,9 +130,15 @@ namespace grb { lpf_memslot_t slot = LPF_INVALID_MEMSLOT; RC ret = SUCCESS; #ifndef BLAS1_RAW - if( lpf_register_global( data.context, internal::getRaw( out ), data.P * sizeof( IOType ), &slot ) != LPF_SUCCESS ) { + if( lpf_register_global( + data.context, internal::getRaw( out ), data.P * sizeof( IOType ), &slot + ) != LPF_SUCCESS + ) { #else - if( lpf_register_global( data.context, out, data.P * sizeof( IOType ), &slot ) != LPF_SUCCESS ) { + if( lpf_register_global( + data.context, out, data.P * sizeof( IOType ), &slot + ) != LPF_SUCCESS + ) { #endif // failure at this point will have to be cleaned up as best as possible ret = PANIC; @@ -134,26 +149,42 @@ namespace grb { } // gather values - if( ret == SUCCESS && lpf_gather( coll, slot, slot, sizeof( IOType ), root ) != LPF_SUCCESS ) { + if( ret == SUCCESS && + lpf_gather( coll, slot, slot, sizeof( IOType ), root ) + != LPF_SUCCESS + ) { // failure at this point will have to be cleaned up as best as possible ret = PANIC; } // perform communication - if( ret == SUCCESS && lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { + if( ret == SUCCESS && + lpf_sync( data.context, LPF_SYNC_DEFAULT ) + != LPF_SUCCESS + ) { // failure at this point will have to be cleaned up as best as possible ret = PANIC; } #ifndef BLAS1_RAW // make sure sparsity info is correct - for( size_t i = 0; data.s == root && ret == SUCCESS && internal::getCoordinates( out ).size() != internal::getCoordinates( out ).nonzeroes() && i < data.P; ++i ) { - (void)internal::getCoordinates( out ).assign( i ); + for( + size_t i = 0; + data.s == root && + ret == SUCCESS && + internal::getCoordinates( out ).size() != internal::getCoordinates( out ).nonzeroes() + && i < data.P; + ++i + ) { + (void) internal::getCoordinates( out ).assign( i ); } #endif // deregister slot - if( slot != LPF_INVALID_MEMSLOT && lpf_deregister( data.context, slot ) != LPF_SUCCESS ) { + if( slot != LPF_INVALID_MEMSLOT && + lpf_deregister( data.context, slot ) + != LPF_SUCCESS + ) { // error during cleanup of memslot ret = PANIC; } @@ -168,8 +199,8 @@ namespace grb { } /** - * Schedules a gather operation of a vector of \a N/P elements of type IOType per process - * to a vector of \f$ N \f$ elements. + * Schedules a gather operation of a vector of \a N/P elements of type IOType + * per process to a vector of \f$ N \f$ elements. * The gather shall be complete by the end of the call. This is a collective * graphBLAS operation. The BSP costs are as for the LPF #gather. 
* @@ -194,25 +225,27 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename IOType #ifndef BLAS1_RAW , typename Coords #endif - > + > RC gather( #ifdef BLAS1_RAW const IOType * in, const size_t size, IOType * out, #else - const Vector< IOType, reference, Coords > & in, - Vector< IOType, reference, Coords > & out, + const Vector< IOType, reference, Coords > &in, + Vector< IOType, reference, Coords > &out, #endif - const lpf_pid_t root ) { + const lpf_pid_t root + ) { // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); // make sure we can support comms pattern: Vector IOType -> P * Vector IOType #ifndef BLAS1_RAW @@ -229,9 +262,15 @@ namespace grb { lpf_memslot_t slot = LPF_INVALID_MEMSLOT; RC ret = SUCCESS; #ifndef BLAS1_RAW - if( lpf_register_global( data.context, internal::getRaw( out ), size * data.P * sizeof( IOType ), &slot ) != LPF_SUCCESS ) { + if( lpf_register_global( data.context, internal::getRaw( out ), + size * data.P * sizeof( IOType ), &slot ) + != LPF_SUCCESS + ) { #else - if( lpf_register_global( data.context, out, size * data.P * sizeof( IOType ), &slot ) != LPF_SUCCESS ) { + if( lpf_register_global( data.context, out, size * data.P * sizeof( IOType ), + &slot ) + != LPF_SUCCESS + ) { #endif // failure at this point will have to be cleaned up as best as possible ret = PANIC; @@ -240,38 +279,68 @@ namespace grb { // copy input to buffer const size_t pos = ( data.s == root ) ? data.s : 0; #ifdef BLAS1_RAW - for( size_t i = 0; ret == SUCCESS && ( out + pos * size ) != in && i < size; i++ ) { + for( + size_t i = 0; + ret == SUCCESS && ( out + pos * size ) != in && i < size; + i++ + ) { out[ pos * size + i ] = in[ i ]; } #else - for( size_t i = 0; ret == SUCCESS && ( internal::getRaw( out ) + pos * size ) != internal::getRaw( in ) && i < size; i++ ) { + for( + size_t i = 0; + ret == SUCCESS && + (internal::getRaw( out ) + pos * size) != internal::getRaw( in ) && + i < size; + i++ + ) { internal::getRaw( out )[ pos * size + i ] = internal::getRaw( in )[ i ]; } #endif // activate registrations - if( ret == SUCCESS && lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { + if( ret == SUCCESS && + lpf_sync( data.context, LPF_SYNC_DEFAULT ) + != LPF_SUCCESS + ) { ret = PANIC; } // gather values - if( ret == SUCCESS && lpf_gather( coll, slot, slot, size * sizeof( IOType ), root ) != LPF_SUCCESS ) { + if( ret == SUCCESS && + lpf_gather( coll, slot, slot, size * sizeof( IOType ), root ) + != LPF_SUCCESS + ) { ret = PANIC; } // complete requested communication - if( ret == SUCCESS && lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { + if( ret == SUCCESS && + lpf_sync( data.context, LPF_SYNC_DEFAULT ) + != LPF_SUCCESS + ) { ret = PANIC; } #ifndef BLAS1_RAW // set sparsity of output - for( size_t i = 0; data.s == root && ret == SUCCESS && internal::getCoordinates( out ).size() != internal::getCoordinates( out ).nonzeroes() && i < data.P * size; ++i ) { - (void)internal::getCoordinates( out ).assign( i ); + for( + size_t i = 0; + data.s == root && + ret == SUCCESS && + internal::getCoordinates( out ).size() != + internal::getCoordinates( out ).nonzeroes() && + i < data.P * size; + ++i + ) { + (void) internal::getCoordinates( out ).assign( i ); } #endif // destroy memory slot - if( slot != LPF_INVALID_MEMSLOT && lpf_deregister( data.context, slot ) != LPF_SUCCESS ) 
{ + if( slot != LPF_INVALID_MEMSLOT && + lpf_deregister( data.context, slot ) != + LPF_SUCCESS + ) { ret = PANIC; } @@ -294,8 +363,10 @@ namespace grb { * Default is grb::descriptors::no_operation. * @tparam IOType The type of the to-be scattered value. * - * @param[in] in: The vector of \a P elements at the root process to be scattered. - * @param[out] out: The scattered value of the root process \f$ vector[i] \f$ at process \a i. + * @param[in] in The vector of \a P elements at the root process to be + * scattered. + * @param[out] out The scattered value of the root process \f$ vector[i] \f$ + * at process \a i. * * @returns grb::SUCCESS When the operation succeeds as planned. * @returns grb::PANIC When the communication layer unexpectedly fails. When @@ -311,23 +382,25 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename IOType #ifndef BLAS1_RAW , typename Coords #endif - > + > RC scatter( #ifdef BLAS1_RAW const IOType * in, #else - const Vector< IOType, reference, Coords > & in, + const Vector< IOType, reference, Coords > &in, #endif - IOType & out, - const lpf_pid_t root ) { + IOType &out, + const lpf_pid_t root + ) { // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); // make sure we can support comms pattern: P * IOType -> IOType #ifndef BLAS1_RAW @@ -343,13 +416,27 @@ namespace grb { lpf_memslot_t src, dest; src = dest = LPF_INVALID_MEMSLOT; RC ret = SUCCESS; - if( lpf_register_global( data.context, &out, sizeof( IOType ), &dest ) != LPF_SUCCESS ) { + if( lpf_register_global( data.context, &out, sizeof( IOType ), &dest ) + != LPF_SUCCESS + ) { ret = PANIC; } #ifndef BLAS1_RAW - if( ret == SUCCESS && lpf_register_global( data.context, const_cast< IOType * >( internal::getRaw( in ) ), data.P * sizeof( IOType ), &src ) != LPF_SUCCESS ) { + if( ret == SUCCESS && lpf_register_global( + data.context, + const_cast< IOType * >( internal::getRaw( in ) ), + data.P * sizeof( IOType ), + &src + ) != LPF_SUCCESS + ) { #else - if( ret == SUCCESS && lpf_register_global( data.context, const_cast< IOType * >( in ), data.P * sizeof( IOType ), &src ) != LPF_SUCCESS ) { + if( ret == SUCCESS && lpf_register_global( + data.context, + const_cast< IOType * >( in ), + data.P * sizeof( IOType ), + &src + ) != LPF_SUCCESS + ) { #endif // failure at this point will have to be cleaned up as best as possible ret = PANIC; @@ -357,7 +444,9 @@ namespace grb { // root copies output #ifndef BLAS1_RAW - if( ret == SUCCESS && data.s == root && &out != internal::getRaw( in ) + data.s ) { + if( ret == SUCCESS && data.s == root && + &out != internal::getRaw( in ) + data.s + ) { #else if( ret == SUCCESS && data.s == root && &out != in + data.s ) { #endif @@ -365,25 +454,35 @@ namespace grb { } // activate global regs - if( ret == SUCCESS && lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { + if( ret == SUCCESS && + lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS + ) { ret = PANIC; } // scatter values - if( ret == SUCCESS && lpf_scatter( coll, src, dest, sizeof( IOType ), root ) != LPF_SUCCESS ) { + if( ret == SUCCESS && + lpf_scatter( coll, src, dest, sizeof( IOType ), root ) != LPF_SUCCESS + ) { ret = PANIC; } // wait for completion of requested collective - if( ret == SUCCESS && lpf_sync( data.context, LPF_SYNC_DEFAULT ) != LPF_SUCCESS ) { + if( ret == SUCCESS && + lpf_sync( data.context, 
LPF_SYNC_DEFAULT ) != LPF_SUCCESS + ) { ret = PANIC; } // destroy memory slots - if( src != LPF_INVALID_MEMSLOT && lpf_deregister( data.context, src ) != LPF_SUCCESS ) { + if( src != LPF_INVALID_MEMSLOT && + lpf_deregister( data.context, src ) != LPF_SUCCESS + ) { ret = PANIC; } - if( dest != LPF_INVALID_MEMSLOT && lpf_deregister( data.context, dest ) != LPF_SUCCESS ) { + if( dest != LPF_INVALID_MEMSLOT && + lpf_deregister( data.context, dest ) != LPF_SUCCESS + ) { ret = PANIC; } @@ -398,17 +497,20 @@ namespace grb { /** * Schedules a scatter operation of a vector of \a N elements of type IOType - * to a vector of \f$ N/P elements \f$ per process. It is assumed that \a N is a multiple of \a P. - * The gather shall be complete by the end of the call. This is a collective - * graphBLAS operation. The BSP costs are as for the LPF #gather. + * to a vector of \f$ N/P elements \f$ per process. It is assumed that \a N is + * a multiple of \a P. The gather shall be complete by the end of the call. + * This is a collective graphBLAS operation. The BSP costs are as for the LPF + * #gather. * * @tparam descr The GraphBLAS descriptor. * Default is grb::descriptors::no_operation. * @tparam IOType The type of the to-be scattered value. * - * @param[in] in: The vector of N elements at the root process to be scattered. - * @param[out] out: The scattered vector of the root process, such that process \a i - * has \f$ N/P \f$ elements located at offset \f$ (N/P)*i \f$. + * @param[in] in The vector of N elements at the root process to be + * scattered. + * @param[out] out The scattered vector of the root process, such that process + * \a i has \f$ N/P \f$ elements located at offset + * \f$ (N/P)*i \f$. * * @returns grb::SUCCESS When the operation succeeds as planned. * @returns grb::PANIC When the communication layer unexpectedly fails. When @@ -424,25 +526,27 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename IOType #ifndef BLAS1_RAW , typename Coords #endif - > + > RC scatter( #ifdef BLAS1_RAW const IOType * in, const size_t size, IOType * out, #else - const Vector< IOType, reference, Coords > & in, - Vector< IOType, reference, Coords > & out, + const Vector< IOType, reference, Coords > &in, + Vector< IOType, reference, Coords > &out, #endif - const lpf_pid_t root ) { + const lpf_pid_t root + ) { // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); const size_t procs = data.P; #ifndef BLAS1_RAW @@ -538,10 +642,12 @@ namespace grb { } /** - * Schedules an allgather operation of a single object of type IOType per process - * to a vector of P elements. - * The allgather shall be complete by the end of the call. This is a collective - * graphBLAS operation. The BSP costs are as for the LPF #allgather. + * Schedules an allgather operation of a single object of type IOType per + * process to a vector of P elements. + * + * The allgather shall be complete by the end of the call. This is a + * collective graphBLAS operation. The BSP costs are as for the LPF + * #allgather. * * @tparam descr The GraphBLAS descriptor. * Default is grb::descriptors::no_operation. 
@@ -564,22 +670,24 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename IOType #ifndef BLAS1_RAW , typename Coords #endif - > - RC allgather( IOType & in, + > + RC allgather( + IOType &in, #ifdef BLAS1_RAW IOType * out #else - Vector< IOType, reference, Coords > & out + Vector< IOType, reference, Coords > &out #endif ) { // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); // make sure we can support comms pattern: IOType -> P * IOType #ifndef BLAS1_RAW @@ -656,10 +764,12 @@ namespace grb { } /** - * Schedules an allgather operation of a vector of \a N/P elements of type IOType per process - * to a vector of \f$ N \f$ elements. - * The allgather shall be complete by the end of the call. This is a collective - * graphBLAS operation. The BSP costs are as for the LPF #allgather. + * Schedules an allgather operation of a vector of \a N/P elements of type + * IOType per process to a vector of \f$ N \f$ elements. + * + * The allgather shall be complete by the end of the call. This is a + * collective graphBLAS operation. The BSP costs are as for the LPF + * #allgather. * * @tparam descr The GraphBLAS descriptor. * Default is grb::descriptors::no_operation. @@ -682,25 +792,26 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename IOType #ifndef BLAS1_RAW , typename Coords #endif - > + > RC allgather( #ifdef BLAS1_RAW const IOType * in, const size_t size, IOType * out #else - const Vector< IOType, reference, Coords > & in, - Vector< IOType, reference, Coords > & out + const Vector< IOType, reference, Coords > &in, + Vector< IOType, reference, Coords > &out #endif ) { // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); // make sure we can support comms pattern: IOType -> P * IOType #ifndef BLAS1_RAW @@ -777,8 +888,9 @@ namespace grb { } /** - * Schedules an alltoall operation of a vector of P elements of type IOType per process - * to a vector of \a P elements. + * Schedules an alltoall operation of a vector of P elements of type IOType + * per process to a vector of \a P elements. + * * The alltoall shall be complete by the end of the call. This is a collective * graphBLAS operation. The BSP costs are as for the LPF #alltoall. * @@ -786,9 +898,10 @@ namespace grb { * Default is grb::descriptors::no_operation. * @tparam IOType The type of the vector elements. * - * @param[in] in: The vector of \a P elements at each process. - * @param[out] out: The resulting vector of \a P elements, such that process \f$ i \f$ will - * receive (in order) the element at \f$ vector[i] \f$ from each process. + * @param[in] in The vector of \a P elements at each process. + * @param[out] out The resulting vector of \a P elements, such that process + * \f$ i \f$ will receive (in order) the element at + * \f$ vector[i] \f$ from each process. * * @returns grb::SUCCESS When the operation succeeds as planned. * @returns grb::PANIC When the communication layer unexpectedly fails. 
When @@ -804,24 +917,25 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename IOType #ifndef BLAS1_RAW , typename Coords #endif - > + > RC alltoall( #ifdef BLAS1_RAW IOType * in, IOType * out #else - const Vector< IOType, reference, Coords > & in, - Vector< IOType, reference, Coords > & out + const Vector< IOType, reference, Coords > &in, + Vector< IOType, reference, Coords > &out #endif ) { // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); #ifndef BLAS1_RAW TEST_VEC_SIZE( in, data.P ) TEST_VEC_SIZE( out, data.P ) @@ -899,19 +1013,22 @@ namespace grb { } /** - * Schedules an allcombine operation of a vector of \a N/P elements of type IOType per process - * to a vector of \a N/P elements. - * The allcombine shall be complete by the end of the call. This is a collective - * graphBLAS operation. The BSP costs are as for the LPF #allcombine. + * Schedules an allcombine operation of a vector of \a N/P elements of type + * IOType per process to a vector of \a N/P elements. + * + * The allcombine shall be complete by the end of the call. This is a + * collective graphBLAS operation. The BSP costs are as for the LPF + * #allcombine. * * @tparam descr The GraphBLAS descriptor. * Default is grb::descriptors::no_operation. * @tparam Operator Which operator to use for combining. * @tparam IOType The type of the vector elements. * - * @param[in,out] inout: The vector of \a N/P elements at each process. At the end of - * the call, each process shall hold the combined vectors. - * @param[in] op: The associative operator to combine by. + * @param[in,out] inout The vector of \a N/P elements at each process. At + * the end of the call, each process shall hold the + * combined vectors. + * @param[in] op The associative operator to combine by. * * @returns grb::SUCCESS When the operation succeeds as planned. * @returns grb::PANIC When the communication layer unexpectedly fails. When @@ -935,7 +1052,8 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename Operator, typename IOType #ifndef BLAS1_RAW @@ -949,16 +1067,19 @@ namespace grb { #else Vector< IOType, reference, Coords > &inout, #endif - const Operator op ) { + const Operator op + ) { // static sanity check - NO_CAST_ASSERT_BLAS1( ( ! ( descr & descriptors::no_casting ) || std::is_same< IOType, typename Operator::D1 >::value || std::is_same< IOType, typename Operator::D2 >::value || - std::is_same< IOType, typename Operator::D3 >::value ), - "grb::collectives::allcombine", + NO_CAST_ASSERT_BLAS1( ( !(descr & descriptors::no_casting) || + std::is_same< IOType, typename Operator::D1 >::value || + std::is_same< IOType, typename Operator::D2 >::value || + std::is_same< IOType, typename Operator::D3 >::value + ), "grb::collectives::allcombine", "Incompatible given value type and operator domains while " "no_casting descriptor was set" ); // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); #ifndef BLAS1_RAW const size_t size = internal::getCoordinates( inout ).size(); #endif @@ -1113,8 +1234,9 @@ namespace grb { } /** - * Schedules a combine operation of a vector of N/P elements of type IOType per process - * to a vector of N elements. 
+ * Schedules a combine operation of a vector of N/P elements of type IOType + * per process to a vector of N elements. + * * The combine shall be complete by the end of the call. This is a collective * graphBLAS operation. The BSP costs are as for the LPF #combine. * @@ -1123,10 +1245,11 @@ namespace grb { * @tparam Operator Which operator to use for combining. * @tparam IOType The type of the vector elements. * - * @param[in,out] inout: The vector of \a N/P elements at each process. At the end of - * the call, the root process shall hold the combined vectors. - * @param[in] op: The associative operator to combine by. - * @param[in] root: The root process. + * @param[in,out] inout The vector of \a N/P elements at each process. At + * the end of the call, the root process shall hold the + * combined vectors. + * @param[in] op The associative operator to combine by. + * @param[in] root The root process. * * @returns grb::SUCCESS When the operation succeeds as planned. * @returns grb::PANIC When the communication layer unexpectedly fails. When @@ -1158,32 +1281,37 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename Operator, typename IOType #ifndef BLAS1_RAW , typename Coords #endif - > + > RC combine( #ifdef BLAS1_RAW IOType * inout, const size_t size, #else - Vector< IOType, reference, Coords > & inout, + Vector< IOType, reference, Coords > &inout, #endif const Operator op, - const lpf_pid_t root ) { + const lpf_pid_t root + ) { // static sanity check - NO_CAST_ASSERT_BLAS1( ( ! ( descr & descriptors::no_casting ) || std::is_same< IOType, typename Operator::D1 >::value || std::is_same< IOType, typename Operator::D2 >::value || - std::is_same< IOType, typename Operator::D3 >::value ), - "grb::collectives::combine", + NO_CAST_ASSERT_BLAS1( ( !(descr & descriptors::no_casting) || + std::is_same< IOType, typename Operator::D1 >::value || + std::is_same< IOType, typename Operator::D2 >::value || + std::is_same< IOType, typename Operator::D3 >::value + ), "grb::collectives::combine", "Incompatible given value type and operator domains while " - "no_casting descriptor was set" ); + "no_casting descriptor was set" + ); // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); // make sure we can support comms pattern: IOType -> P * IOType lpf_coll_t coll; @@ -1418,13 +1546,14 @@ namespace grb { } /** - * Schedules a reduce operation of a vector of N/P elements of type IOType per process - * to a single element. + * Schedules a reduce operation of a vector of N/P elements of type IOType per + * process to a single element. + * * The reduce shall be complete by the end of the call. This is a collective * graphBLAS operation. The BSP costs are as for the LPF #reduce. * - * Since this is a collective call, there are \a N/P values \a in at each process - * Let these vectors be denoted by \f$ x_s \f$, with + * Since this is a collective call, there are \a N/P values \a in at each + * process. Let these vectors be denoted by \f$ x_s \f$, with * \f$ s \in \{ 0, 1, \ldots, P-1 \}, \f$ such that \f$ x_s \f$ equals the * argument \a in on input at the user process with ID \a s. 
Let * \f$ \pi:\ \{ 0, 1, \ldots, P-1 \} \to \{ 0, 1, \ldots, P-1 \} \f$ be a @@ -1463,7 +1592,8 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename Operator, typename InputType, typename IOType @@ -1471,26 +1601,30 @@ namespace grb { , typename Coords #endif - > + > RC reduce( #ifdef BLAS1_RAW const InputType * in, const size_t size, #else - const Vector< InputType, reference, Coords > & in, + const Vector< InputType, reference, Coords > &in, #endif - IOType & out, + IOType &out, const Operator op, - const lpf_pid_t root ) { + const lpf_pid_t root + ) { // static sanity check - NO_CAST_ASSERT_BLAS1( ( ! ( descr & descriptors::no_casting ) || std::is_same< InputType, typename Operator::D1 >::value || std::is_same< IOType, typename Operator::D2 >::value || - std::is_same< IOType, typename Operator::D3 >::value ), - "grb::collectives::reduce", + NO_CAST_ASSERT_BLAS1( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Operator::D1 >::value || + std::is_same< IOType, typename Operator::D2 >::value || + std::is_same< IOType, typename Operator::D3 >::value + ), "grb::collectives::reduce", "Incompatible given value type and operator domains while " - "no_casting descriptor was set" ); + "no_casting descriptor was set" + ); // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); // make sure we can support comms pattern: IOType -> P * IOType lpf_coll_t coll; @@ -1579,7 +1713,8 @@ namespace grb { } // reduce to the left - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename Operator, typename InputType, typename IOType @@ -1587,16 +1722,18 @@ namespace grb { , typename Coords #endif - > - RC reducel( IOType & out, + > + RC reducel( + IOType &out, #ifdef BLAS1_RAW const InputType * in, const size_t size, #else - const Vector< InputType, reference, Coords > & in, + const Vector< InputType, reference, Coords > &in, #endif const Operator op, - const lpf_pid_t root ) { + const lpf_pid_t root + ) { #ifdef BLAS1_RAW return reduce( in, size, out, op, root ); #else @@ -1605,7 +1742,8 @@ namespace grb { } // reduce to the right - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename Operator, typename InputType, typename IOType @@ -1613,7 +1751,7 @@ namespace grb { , typename Coords #endif - > + > RC reducer( #ifdef BLAS1_RAW const InputType * in, @@ -1621,9 +1759,10 @@ namespace grb { #else const Vector< InputType, reference, Coords > & in, #endif - IOType & out, + IOType &out, const Operator op, - const lpf_pid_t root ) { + const lpf_pid_t root + ) { #ifdef BLAS1_RAW return reduce( in, size, out, op, root ); #else @@ -1632,8 +1771,9 @@ namespace grb { } /** - * Schedules an allreduce operation of a vector of N/P elements of type IOType per process - * to a single element. + * Schedules an allreduce operation of a vector of N/P elements of type IOType + * per process to a single element. + * * The allreduce shall be complete by the end of the call. This is a collective * graphBLAS operation. The BSP costs are as for the LPF #allreduce. 
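+	 *
+	 * For illustration only, an allreduce under addition might be issued as in
+	 * the following sketch; the vector \a x, its local length \a n, and the
+	 * use of grb::operators::add are assumptions rather than requirements of
+	 * this interface:
+	 *
+	 * \code
+	 * grb::Vector< double > x( n );   // assumed: the process-local N/P elements
+	 * double alpha = 0.0;             // identity of the reduction
+	 * grb::operators::add< double > plus;
+	 * if( allreduce( x, alpha, plus ) != grb::SUCCESS ) {
+	 *     // handle communication failure
+	 * }
+	 * // every process now holds the same reduced value in alpha
+	 * \endcode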
* @@ -1676,8 +1816,8 @@ namespace grb { * \endparblock * */ - - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename Operator, typename InputType, typename IOType @@ -1685,25 +1825,29 @@ namespace grb { , typename Coords #endif - > + > RC allreduce( #ifdef BLAS1_RAW const InputType * in, const size_t size, #else - const Vector< InputType, reference, Coords > & in, + const Vector< InputType, reference, Coords > &in, #endif - IOType & out, - const Operator op ) { + IOType &out, + const Operator op + ) { // static sanity check - NO_CAST_ASSERT_BLAS1( ( ! ( descr & descriptors::no_casting ) || std::is_same< InputType, typename Operator::D1 >::value || std::is_same< IOType, typename Operator::D2 >::value || - std::is_same< IOType, typename Operator::D3 >::value ), - "grb::collectives::allreduce", + NO_CAST_ASSERT_BLAS1( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Operator::D1 >::value || + std::is_same< IOType, typename Operator::D2 >::value || + std::is_same< IOType, typename Operator::D3 >::value + ), "grb::collectives::allreduce", "Incompatible given value type and operator domains while " - "no_casting descriptor was set" ); + "no_casting descriptor was set" + ); // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); // make sure we can support comms pattern: P * IOType lpf_coll_t coll; @@ -1716,7 +1860,8 @@ namespace grb { // reduce our values locally // if casting is required to apply op, foldl will take care of this - // note: the no_casting check could be deferred to foldl but this would result in unclear error messages + // note: the no_casting check could be deferred to foldl but this would + // result in unclear error messages for( size_t i = 0; i < size; i++ ) { #ifdef BLAS1_RAW if( foldl< descr >( out, in[ i ], op ) != SUCCESS ) { @@ -1755,7 +1900,8 @@ namespace grb { continue; } // if casting is required to apply op, foldl will take care of this - // note: the no_casting check could be deferred to foldl but this would result in unclear error messages + // note: the no_casting check could be deferred to foldl but this would + // result in unclear error messages if( foldl< descr >( out, buffer[ i ], op ) != SUCCESS ) { return PANIC; } @@ -1771,7 +1917,8 @@ namespace grb { } // allreduce to the left - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename Operator, typename InputType, typename IOType @@ -1779,15 +1926,17 @@ namespace grb { , typename Coords #endif - > - RC allreducel( IOType & out, + > + RC allreducel( + IOType &out, #ifdef BLAS1_RAW const InputType * in, const size_t size, #else - const Vector< InputType, reference, Coords > & in, + const Vector< InputType, reference, Coords > &in, #endif - const Operator op ) { + const Operator op + ) { #ifdef BLAS1_RAW return allreduce( in, size, out, op ); #else @@ -1796,7 +1945,8 @@ namespace grb { } // allreduce to the right - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename Operator, typename InputType, typename IOType @@ -1804,7 +1954,7 @@ namespace grb { , typename Coords #endif - > + > RC allreducer( #ifdef BLAS1_RAW const InputType * in, @@ -1812,8 +1962,9 @@ namespace grb { #else const Vector< InputType, reference, Coords > & in, #endif - IOType & out, - const Operator op ) { + 
IOType &out, + const Operator op + ) { #ifdef BLAS1_RAW return allreduce( in, size, out, op ); #else @@ -1824,14 +1975,17 @@ namespace grb { /** * Schedules a broadcast operation of a vector of N elements of type IOType * to a vector of N elements per process. - * The broadcast shall be complete by the end of the call. This is a collective - * graphBLAS operation. The BSP costs are as for the LPF #broadcast. + * + * The broadcast shall be complete by the end of the call. This is a + * collective graphBLAS operation. The BSP costs are as for the LPF + * #broadcast. * * @tparam descr The GraphBLAS descriptor. * Default is grb::descriptors::no_operation. * @tparam IOType The type of the to-be broadcast vector element values. * - * @param[in,out] inout On input: the vector at the root process to be broadcast. + * @param[in,out] inout On input: the vector at the root process to be + * broadcast. * On output at process \a root: the same value. * On output at non-root processes: the vector at root. * @@ -1858,23 +2012,25 @@ namespace grb { * \endparblock * */ - template< Descriptor descr = descriptors::no_operation, + template< + Descriptor descr = descriptors::no_operation, typename IOType #ifndef BLAS1_RAW , typename Coords #endif - > + > RC broadcast( #ifdef BLAS1_RAW IOType * inout, const size_t size, #else - Vector< IOType, reference, Coords > & inout, + Vector< IOType, reference, Coords > &inout, #endif - const lpf_pid_t root ) { + const lpf_pid_t root + ) { // we need access to BSP context - internal::BSP1D_Data & data = internal::grb_BSP1D.load(); + internal::BSP1D_Data &data = internal::grb_BSP1D.load(); #ifndef BLAS1_RAW const size_t size = internal::getCoordinates( inout ).size(); @@ -1927,3 +2083,4 @@ namespace grb { } // namespace internal } // namespace grb + diff --git a/include/graphblas/bsp/collectives_blas1_raw.hpp b/include/graphblas/bsp/collectives_blas1_raw.hpp index 71e37fb09..3c61e1b7e 100644 --- a/include/graphblas/bsp/collectives_blas1_raw.hpp +++ b/include/graphblas/bsp/collectives_blas1_raw.hpp @@ -53,9 +53,10 @@ "**********************\n" ); #define BLAS1_RAW -#include "collectives_blas1.hpp" + #include "collectives_blas1.hpp" #undef BLAS1_RAW #undef NO_CAST_ASSERT_BLAS1 #endif // end ``_H_GRB_BSP_COLL_BLAS1_RAW'' + diff --git a/include/graphblas/bsp/exec_broadcast_routines.hpp b/include/graphblas/bsp/exec_broadcast_routines.hpp new file mode 100644 index 000000000..c577fa984 --- /dev/null +++ b/include/graphblas/bsp/exec_broadcast_routines.hpp @@ -0,0 +1,81 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file exec_broadcast_routines.hpp + * + * Routines used in the Launcher for broadcasting data. 
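+ *
+ * The two routines declared below are meant to be used in tandem: first
+ * initialise a collectives context, then register the payload and broadcast
+ * it from user process zero. A minimal sketch follows (illustrative only;
+ * \a ctx, \a s, \a P, and \a payload are assumed to be provided by the
+ * caller, and error handling is reduced to asserts):
+ *
+ * \code
+ * lpf_coll_t coll;
+ * lpf_err_t rc = lpf_init_collectives_for_broadcast( ctx, s, P, 2, coll );
+ * assert( rc == LPF_SUCCESS );
+ * rc = lpf_register_and_broadcast( ctx, coll, &payload, sizeof( payload ) );
+ * assert( rc == LPF_SUCCESS );
+ * \endcode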
+ * + * @author Alberto Scolari + * @date August, 2023 + */ + +#ifndef _H_BSP1D_EXEC_BROADCAST_ROUTINES +#define _H_BSP1D_EXEC_BROADCAST_ROUTINES + +#include + +#include +#include + + +namespace grb { + + namespace internal { + + /** Global internal singleton to track whether MPI was initialized. */ + extern bool grb_mpi_initialized; + + /** + * Initialize collective communication for broadcast. + * + * @param[in,out] ctx Fresh(!) LPF context to work with. + * @param[in] s This user process ID. + * @param[in] P Total number of user processes. + * @param[in] regs Total number of memory slot registrations to be made + * as part of preparing for the broadcast. + * @param[out] coll New collectives context. + * + * \internal We follow here the LPF convention where output arguments are + * ordered last. + */ + lpf_err_t lpf_init_collectives_for_broadcast( + lpf_t &ctx, + const lpf_pid_t s, const lpf_pid_t P, + const size_t regs, + lpf_coll_t &coll + ); + + /** + * Register a memory area as a global one and perform a broadcast. + * + * @param[in,out] ctx The LPF context in which \a coll was initialised. + * @param[in] coll The initialised collectives context. + * @param[in] data Pointer to data to broadcast. + * @param[in[ size The size of the data (in bytes) to broadcast. + */ + lpf_err_t lpf_register_and_broadcast( + lpf_t &ctx, lpf_coll_t &coll, + void * const data, const size_t size + ); + + } // end internal + +} // end grb + +#endif // _H_BSP1D_EXEC_BROADCAST_ROUTINES + diff --git a/include/graphblas/bsp1d/benchmark.hpp b/include/graphblas/bsp1d/benchmark.hpp index 99bf865c5..31717d187 100644 --- a/include/graphblas/bsp1d/benchmark.hpp +++ b/include/graphblas/bsp1d/benchmark.hpp @@ -16,19 +16,24 @@ */ /* - * @author A. N. Yzelman - * @date 17th of April, 2017 + * @author A. N. Yzelman; Alberto Scolari + * @date 17th of April, 2017; 28 of August 2023 */ #ifndef _H_GRB_BSP1D_BENCH #define _H_GRB_BSP1D_BENCH +#include +#include + #include -#include -#include #include +#include + +#include + #include "exec.hpp" @@ -36,530 +41,271 @@ namespace grb { namespace internal { - struct packedBenchmarkerInput { - const void * blob; - size_t blob_size; + /** + * Data structure with input and benchmarking information. + * + * @tparam InputType The input type. + * @tparam OutputType The output type. + * @tparam _mode The #grb::EXEC_MODE of the benchmarker. + * + * In automatic mode, this struct must be broadcast from process 0 to the + * other processes, as it contains the valid number of inner and outer + * iterations. In other modes, all processes must choose the same number + * of inner/outer iterations, otherwise deadlocks may occur. + * + * @tparam _requested_broadcast Whether or not the user has requested input be + * broadcast. + * + * @tparam untyped_call Whether the user has made a benchmark request + * using an untyped ALP program. + */ + template< + typename InputType, typename OutputType, + EXEC_MODE _mode, + bool _requested_broadcast, + bool untyped_call + > + struct BenchmarkDispatcher : + ExecDispatcher< + InputType, OutputType, + _mode, _requested_broadcast, + untyped_call + >, + protected BenchmarkerBase + { + /** Whether the dispatcher requires broadcasting. */ + static constexpr bool needs_initial_broadcast = _mode == AUTOMATIC; + + /** Inner number of experiments. */ size_t inner; - size_t outer; - bool bcast_blob; - }; - - } // namespace internal -} // namespace grb - -/** Global internal function used to call lpf_hook with. 
*/ -template< typename T, typename U > -void _grb_bench_spmd( lpf_t ctx, lpf_pid_t s, lpf_pid_t P, lpf_args_t args ) { - assert( P > 0 ); - assert( s < P ); - - // construct default input type - T data_in_local; - // get input struct - assert( args.input_size == - sizeof( struct grb::internal::packedBenchmarkerInput ) ); - const struct grb::internal::packedBenchmarkerInput input = - *static_cast< const struct grb::internal::packedBenchmarkerInput * >( - args.input - ); - - // get input data from PID 0 - if( input.bcast_blob && P > 1 ) { - // init BSP & collectives - lpf_coll_t coll; - lpf_err_t brc = lpf_resize_message_queue( ctx, 2*(P-1) ); - assert( brc == LPF_SUCCESS ); - brc = lpf_resize_memory_register( ctx, 2 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_collectives_init( ctx, s, P, 0, 0, 0, &coll ); - assert( brc == LPF_SUCCESS ); - - // we need input fields from root - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - lpf_memslot_t global; - if( s == 0 ) { - assert( input.blob_size == sizeof( T ) ); - brc = lpf_register_global( ctx, - const_cast< void * >( input.blob ), input.blob_size, &global ); - } else { - brc = lpf_register_global( ctx, &data_in_local, sizeof( T ), &global ); - } - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_broadcast( coll, global, global, sizeof( T ), 0 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_deregister( ctx, global ); - assert( brc == LPF_SUCCESS ); -#ifdef NDEBUG - (void)brc; -#endif - } else { - // if we do not broadcast then everyone should have their own local input - assert( input.blob_size == sizeof( T ) ); - } - - // get input data - const T &data_in = input.bcast_blob ? - // then get unified view of input data after broadcast - ( s == 0 ? - *static_cast< const T * >( input.blob ) : - data_in_local - ) : - // otherwise just copy from args_in if there is one (to catch automatic mode) - *static_cast< const T * >( input.blob ); - - // we need an output field - U data_out_local = U(); - U &data_out = args.output_size == sizeof( U ) ? - *static_cast< U * >( args.output ) : // if we were passed output area, use it - data_out_local; // otherwise use local empty output area - - // init graphblas - if( grb::init( s, P, ctx ) != grb::SUCCESS ) { - std::cerr << "Could not initialise ALP/GraphBLAS" << std::endl; - assert( false ); - return; // note that there is no way to return error codes - } - - // retrieve and run the function to be executed - assert( args.f_size == 2 ); - // retrieve benchmarking functions - typedef void ( *grb_func_t )( const T &, U & ); - typedef void ( *bench_func_t )( - void ( *grb_program )( const T &, U & ), - const T &, U &, - size_t, size_t, lpf_pid_t - ); - bench_func_t bench_program = - reinterpret_cast< bench_func_t >( args.f_symbols[ 0 ] ); - grb_func_t grb_program = reinterpret_cast< grb_func_t >( args.f_symbols[ 1 ] ); - // execute benchmark - ( *bench_program )( - grb_program, data_in, data_out, input.inner, input.outer, s - ); - - // close GraphBLAS context and done! - if( grb::finalize() != grb::SUCCESS ) { - std::cerr << "Could not finalise ALP/GraphBLAS" << std::endl; - assert( false ); - return; - } -} - -/** Global internal function used to call lpf_hook with. 
*/ -template< typename U > -void _grb_bench_varin_spmd( lpf_t ctx, - lpf_pid_t s, lpf_pid_t P, - lpf_args_t args -) { - assert( P > 0 ); - assert( s < P ); - - // input data to grbProgram - void * data_in = nullptr; - // get input struct - assert( args.input_size == - sizeof( struct grb::internal::packedBenchmarkerInput ) ); - const struct grb::internal::packedBenchmarkerInput input = - *static_cast< const struct grb::internal::packedBenchmarkerInput * >( - args.input - ); - - // size of the data_in block - size_t size; - - // we need input fields from root. First synchronise on input size - if( input.bcast_blob && P > 1 ) { - - // init collectives - lpf_coll_t coll; - lpf_err_t brc = lpf_resize_message_queue( ctx, P - 1 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_resize_memory_register( ctx, 2 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_collectives_init( ctx, s, P, 1, 0, sizeof( size_t ), &coll ); - assert( brc == LPF_SUCCESS ); - - // broadcast the size of data - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - lpf_memslot_t global; - if( s == 0 ) { - size = input.blob_size; - } - brc = lpf_register_global( ctx, &size, sizeof( size_t ), &global ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_broadcast( coll, global, global, sizeof( size_t ), 0 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_deregister( ctx, global ); - assert( brc == LPF_SUCCESS ); - - // now that the input size is known, retrieve the input data - if( s > 0 ) { - data_in = new char[ size ]; - } else { - data_in = const_cast< void * >( input.blob ); - } - brc = lpf_register_global( ctx, data_in, size, &global ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_broadcast( coll, global, global, size, 0 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_deregister( ctx, global ); - assert( brc == LPF_SUCCESS ); - -#ifdef NDEBUG - (void)brc; -#endif - } else { - data_in = const_cast< void * >( input.blob ); - size = input.blob_size; - } - - // we need an output field - U data_out_local = U(); - U &data_out = args.output_size == sizeof( U ) ? - *static_cast< U * >( args.output ) : - data_out_local; - // note: the above switch handily catches automatic mode - - // init graphblas - if( grb::init( s, P, ctx ) != grb::SUCCESS ) { - std::cerr << "Could not initialise ALP/GraphBLAS" << std::endl; - assert( false ); - return; // note that there is no way to return error codes - } - - // retrieve and run the function to be executed - assert( args.f_size == 2 ); - // assume we are performing benchmarks - typedef void ( *grb_func_t )( void *, size_t, U & ); - typedef void ( *bench_func_t )( void ( *grb_program )( void *, size_t, U & ), - void *, size_t, U &, size_t, size_t, lpf_pid_t ); - bench_func_t bench_program = - reinterpret_cast< bench_func_t >( args.f_symbols[ 0 ] ); - grb_func_t grb_program = reinterpret_cast< grb_func_t >( args.f_symbols[ 1 ] ); - // run benchmark - ( *bench_program )( grb_program, (void *)data_in, size, - data_out, input.inner, input.outer, s ); - - // close GraphBLAS context and done! - if( grb::finalize() != grb::SUCCESS ) { - std::cerr << "Could not finalise ALP/GraphBLAS" << std::endl; - assert( false ); - return; - } -} - -/** Global internal function used to call lpf_exec with. 
*/ -template< typename T, typename U, bool varin > -void _grb_bench_exec( lpf_t ctx, lpf_pid_t s, lpf_pid_t P, lpf_args_t args ) { - assert( P > 0 ); - assert( s < P ); - - grb::internal::packedBenchmarkerInput input; - constexpr size_t size = sizeof( struct grb::internal::packedBenchmarkerInput ); - - // only call broadcast if P > 1, or otherwise UB - if( P > 1 ) { - // init and use collectives to broadcast input - lpf_coll_t coll; - const size_t nmsgs = P + 1 > 2 * P - 3 ? - P + 1 : - 2 * P - 3; // see LPF collectives doc - lpf_err_t brc = lpf_resize_message_queue( ctx, nmsgs ); - assert( brc == LPF_SUCCESS ); - brc = lpf_resize_memory_register( ctx, 3 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_collectives_init( ctx, s, P, 1, 0, size, &coll ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - lpf_memslot_t destination, source; - brc = lpf_register_global( ctx, &input, size, &destination ); - assert( brc == LPF_SUCCESS ); - if( s == 0 ) { - assert( args.input_size == size ); - brc = lpf_register_global( ctx, - const_cast< void * >( args.input ), size, &source ); - } else { - brc = lpf_register_global( ctx, - const_cast< void * >( args.input ), 0, &source ); - } - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_broadcast( coll, source, destination, size, 0 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_deregister( ctx, source ); - assert( brc == LPF_SUCCESS ); - brc = lpf_deregister( ctx, destination ); - assert( brc == LPF_SUCCESS ); -#ifdef NDEBUG - (void)brc; -#endif - } - - // non-root processes update args - if( s > 0 ) { - input.blob = nullptr; - input.blob_size = 0; - args.input = &input; - args.input_size = size; - assert( input.bcast_blob ); - } - - // now we are at exactly the equal state as a hook-induced function - if( varin ) { - _grb_bench_varin_spmd< U >( ctx, s, P, args ); - } else { - _grb_bench_spmd< T, U >( ctx, s, P, args ); - } -} - -namespace grb { - - template<> - class Benchmarker< FROM_MPI, BSP1D > : - protected Launcher< FROM_MPI, BSP1D >, protected internal::BenchmarkerBase - { - - public: + /** Outer number of experiments. */ + size_t outer; - Benchmarker( const MPI_Comm comm = MPI_COMM_WORLD ) : - Launcher< FROM_MPI, BSP1D >( comm ) + /** + * Builds dispatcher from basic information. + * + * @param[in] _in Pointer to the input data. + * @param[in] _in_size Byte size of the input data. + * @param[in] _inner The nummer of inner iterations. + * @param[in] _outer The number of outer iterations. + */ + BenchmarkDispatcher( + const InputType *_in, const size_t _in_size, + size_t _inner, size_t _outer + ) : + ExecDispatcher< InputType, OutputType, _mode, _requested_broadcast, + untyped_call >( _in, _in_size ), + inner( _inner ), outer( _outer ) {} - template< typename U > - RC exec( - void ( *grb_program )( const void *, const size_t, U & ), - const void * data_in, const size_t in_size, - U &data_out, - const size_t inner, const size_t outer, - const bool broadcast = false - ) const { - // check arguments - if( in_size > 0 && data_in == nullptr ) { - return ILLEGAL; + /** + * Reconstruct object from LPF args, where it is embedded in its input field. + * + * @param[in] s The process ID. + * @param[in] args The LPF I/O arguments. 
+ */ + BenchmarkDispatcher( const lpf_pid_t s, const lpf_args_t args ) : + ExecDispatcher< + InputType, OutputType, + _mode, _requested_broadcast, + untyped_call + >( nullptr, 0 ) + { + if( s > 0 && _mode == AUTOMATIC ) { + inner = 0; + outer = 0; + return; } + typedef BenchmarkDispatcher< + InputType, OutputType, + _mode, _requested_broadcast, + untyped_call + > self_t; + const self_t *orig = reinterpret_cast< const self_t * >( args.input ); + this->in = orig->in; + this->in_size = orig->in_size; + inner = orig->inner; + outer = orig->outer; + } - // prepare packed input - struct internal::packedBenchmarkerInput input; - input.blob = data_in; - input.blob_size = in_size; - input.inner = inner; - input.outer = outer; - input.bcast_blob = broadcast; - - // prepare args - lpf_func_t fargs[ 2 ]; - lpf_args_t args; - fargs[ 0 ] = reinterpret_cast< lpf_func_t >( benchmark< U > ); - fargs[ 1 ] = reinterpret_cast< lpf_func_t >( grb_program ); - args = { - &input, sizeof( struct internal::packedBenchmarkerInput ), - &data_out, sizeof( U ), - fargs, 2 + /** + * Benchmark the ALP function \a fun with the given input/output parameters. + * + * @param[in] fun The ALP function to run. + * @param[in] s The process ID. + * @param[in] P The total nuber of processes. + * @param[in] in Pointer to the input data. + * @param[in] in_size Byte size of the input data. + * @param[out] out Pointer where to output. + */ + grb::RC operator()( + const lpf_func_t fun, + const lpf_pid_t s, const lpf_pid_t P, + const InputType * const in, const size_t in_size, + OutputType * const out + ) const { + auto runner = [ fun, in_size, in, out, s, P ] () { + ExecDispatcher< + InputType, OutputType, + _mode, _requested_broadcast, + untyped_call + >::lpf_grb_call( fun, s, P, in, in_size, out ); }; + return benchmark< BSP1D >( runner, out->times, inner, outer, s ); + } - // do hook - const lpf_err_t spmdrc = lpf_hook( init, - &(_grb_bench_varin_spmd< U >), args ); + }; - // check error code - if( spmdrc != LPF_SUCCESS ) { - return PANIC; - } + } // namespace internal - // done - return SUCCESS; - } + /** + * Collection of processes that can launch an ALP function and benchmark it. + */ + template< enum EXEC_MODE mode > + class Benchmarker< mode, BSP1D > : protected Launcher< mode, BSP1D > { - template< typename T, typename U > - RC exec( - void ( *grb_program )( const T &, U & ), // user program - const T &data_in, U &data_out, // input & output data + private: + + /** Pack input/output data and run the given ALP function. 
*/ + template< typename T, typename U, bool untyped_call > + RC pack_and_run( + const lpf_func_t alp_program, + const T * const data_in, const size_t in_size, + U * const data_out, const size_t inner, const size_t outer, - const bool broadcast = false - ) { - // prepare packed input - struct internal::packedBenchmarkerInput input; - input.blob = data_in; - input.blob_size = sizeof( T ); - input.inner = inner; - input.outer = outer; - input.bcast_blob = broadcast; - - // prepare args - lpf_func_t fargs[ 2 ]; - lpf_args_t args; - fargs[ 0 ] = reinterpret_cast< lpf_func_t >( benchmark< T, U > ); - fargs[ 1 ] = reinterpret_cast< lpf_func_t >( grb_program ); - args = { &data_in, sizeof( T ), &data_out, sizeof( U ), fargs, 2 }; - - // do hook - const lpf_err_t spmdrc = lpf_hook( init, &(_grb_bench_spmd< T, U >), args ); - - // check error code - if( spmdrc != LPF_SUCCESS ) { - return PANIC; + const bool broadcast + ) const { + if( broadcast ) { + typedef internal::BenchmarkDispatcher< + T, U, mode, true, + untyped_call + > Disp; + Disp disp_info( data_in, in_size, inner, outer ); + return this->template run_lpf< T, U, Disp >( + alp_program, + reinterpret_cast< void * >( &disp_info ), + sizeof( Disp ), data_out + ); + } else { + typedef internal::BenchmarkDispatcher< + T, U, mode, false, + untyped_call + > Disp; + Disp disp_info = { data_in, in_size, inner, outer }; + return this->template run_lpf< T, U, Disp >( + alp_program, + reinterpret_cast< void * >( &disp_info ), + sizeof( Disp ), data_out + ); } - - // done - return SUCCESS; } - /** This implementation needs to release MPI resources in manual mode. */ - static enum RC finalize() { - // done - return Launcher< FROM_MPI, BSP1D >::finalize(); - } - }; - - template< enum EXEC_MODE mode > - class Benchmarker< mode, BSP1D > : - protected Launcher< mode, BSP1D >, protected internal::BenchmarkerBase - { - public: + /** import constructor(s) from base class, implicitly based on mode */ + using Launcher< mode, BSP1D >::Launcher; + /** - * \internal - * @param[in] process_id User process ID - * @param[in] nproces Total number of user processes - * @param[in] hostname One of the process' hostname - * @param[in] port A free port at \a hostname - * @param[in] is_mpi_inited Whether MPI is already initialised - * \endinternal + * Run an untyped ALP function in parallel. + * + * @tparam U The output type. + * + * @param[in] alp_program ALP function to execute in parallel. + * @param[in] data_in Pointer to input data. + * @param[in] in_size Size (in bytes) of the input data. + * @param[out] data_out Output data. + * @param[in] inner Number of inner iterations. + * @param[in] outer Number of outer iterations. + * @param[in] broadcast Whether to broadcast inputs from user process zero + * to all other user processes. + * + * @returns grb::SUCCESS On a successfully completed benchmark call. + * @returns grb::ILLEGAL If \a data_in is nullptr but \a in_size is + * larger than zero. + * @returns grb::PANIC On an unrecoverable critical failure (see base + * specification). 
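+			 *
+			 * A minimal sketch follows (illustrative only; \a myUntypedProgram,
+			 * \a MyOut, \a blob, and \a blob_size are placeholders, and \a MyOut
+			 * is assumed to carry the timing fields the benchmark dispatcher
+			 * above records into):
+			 *
+			 * \code
+			 * void myUntypedProgram( const void *, const size_t, MyOut & );
+			 *
+			 * grb::Benchmarker< grb::AUTOMATIC, grb::BSP1D > bench;
+			 * MyOut out;
+			 * const grb::RC rc = bench.exec(
+			 *     &myUntypedProgram, blob, blob_size, out, 10, 5, true );
+			 * \endcode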
*/ - Benchmarker( - const size_t process_id = 0, - const size_t nprocs = 1, - const std::string hostname = "localhost", - const std::string port = "0", - const bool is_mpi_inited = false - ) : Launcher< mode, BSP1D >( - process_id, nprocs, hostname, port, is_mpi_inited - ) {} - template< typename U > - enum RC exec( - void ( *grb_program )( const void *, const size_t, U & ), - const void * data_in, const size_t in_size, + RC exec( + const AlpUntypedFunc< U > alp_program, + const void * const data_in, const size_t in_size, U &data_out, const size_t inner, const size_t outer, const bool broadcast = false ) const { + static_assert( + mode != AUTOMATIC || + std::is_default_constructible< U >::value, + "The output type U should be default-constructible when using automatic " + "mode launchers." + ); // check input arguments if( in_size > 0 && data_in == nullptr ) { - return ILLEGAL; - } - - // prepare packed input - struct internal::packedBenchmarkerInput input; - input.blob = data_in; - input.blob_size = in_size; - input.inner = inner; - input.outer = outer; - input.bcast_blob = broadcast; - - // prepare args - lpf_func_t fargs[ 2 ]; - lpf_args_t args; - fargs[ 0 ] = reinterpret_cast< lpf_func_t >( benchmark< U > ); - fargs[ 1 ] = reinterpret_cast< lpf_func_t >( grb_program ); - args = { &input, sizeof( struct internal::packedBenchmarkerInput ), - &data_out, sizeof( U ), - fargs, 2 - }; - - // launch - lpf_err_t spmdrc = LPF_SUCCESS; - if( mode == MANUAL ) { - // do hook - spmdrc = lpf_hook( init, &(_grb_bench_varin_spmd< U >), args ); - } else { - assert( mode == AUTOMATIC ); - // do exec - spmdrc = lpf_exec( LPF_ROOT, LPF_MAX_P, - &(_grb_bench_exec< void, U, true >), args ); + return grb::ILLEGAL; } - - // check error code - if( spmdrc != LPF_SUCCESS ) { - return PANIC; - } - - // done - return SUCCESS; + return pack_and_run< void, U, true >( + reinterpret_cast< lpf_func_t >( alp_program ), + data_in, in_size, &data_out, + inner, outer, + broadcast + ); } - /** No implementation notes. */ + /** + * Run a typed ALP function in parallel. + * + * @tparam T Input type. + * @tparam U Output type. + * + * @param[in] alp_program The ALP function to execute in parallel. + * @param[in] data_in Pointer to the input data. + * @param[out] data_out The output data. + * @param[in] inner Number of inner iterations. + * @param[in] outer Number of outer iterations. + * @param[in] broadcast Whether to broadcast inputs from user process zero + * to all other user processes. + * + * @returns grb::SUCCESS On a successfully completed benchmark call. + * @returns grb::ILLEGAL If \a broadcast was false and the benchmarker is in + * #AUTOMATIC mode, while \a T is not default- + * constructible. + * @returns grb::PANIC On unrecoverable errors (see the base specification + * for details). 
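+			 *
+			 * A minimal sketch follows (illustrative only; \a myAlpProgram,
+			 * \a MyIn, and \a MyOut are placeholders, with \a MyOut assumed to
+			 * carry the timing fields the benchmark dispatcher above records
+			 * into):
+			 *
+			 * \code
+			 * void myAlpProgram( const MyIn &, MyOut & );
+			 *
+			 * grb::Benchmarker< grb::AUTOMATIC, grb::BSP1D > bench;
+			 * MyIn in; MyOut out;
+			 * const grb::RC rc = bench.exec( &myAlpProgram, in, out, 10, 5, true );
+			 * \endcode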
+ */ template< typename T, typename U > - enum RC exec( - void ( *grb_program )( const T &, U & ), // user GraphBLAS program - const T &data_in, U &data_out, // input & output data + RC exec( + const AlpTypedFunc< T, U > alp_program, + const T &data_in, U &data_out, const size_t inner, const size_t outer, const bool broadcast = false - ) { - // prepare packed input - struct internal::packedBenchmarkerInput input; - input.blob = &data_in; - input.blob_size = sizeof( T ); - input.inner = inner; - input.outer = outer; - input.bcast_blob = broadcast; - - // prepare args - lpf_func_t fargs[ 2 ]; - lpf_args_t args; - fargs[ 0 ] = reinterpret_cast< lpf_func_t >( benchmark< T, U > ); - fargs[ 1 ] = reinterpret_cast< lpf_func_t >( grb_program ); - args = { &input, sizeof( struct internal::packedBenchmarkerInput ), - &data_out, sizeof( U ), - fargs, 2 - }; - - // launch - lpf_err_t spmdrc = LPF_SUCCESS; - if( mode == MANUAL ) { - // do hook - spmdrc = lpf_hook( this->init, &(_grb_bench_spmd< T, U >), args ); - } else { - assert( mode == AUTOMATIC ); - // do exec - spmdrc = lpf_exec( LPF_ROOT, LPF_MAX_P, - &(_grb_bench_exec< T, U, false >), args ); - } - - // check error code - if( spmdrc != LPF_SUCCESS ) { - return PANIC; + ) const { + static_assert( + mode != AUTOMATIC || + std::is_default_constructible< U >::value, + "The output type U should be default-constructible when using automatic " + "mode launchers." + ); + if( + mode == AUTOMATIC && broadcast == false && + !std::is_default_constructible< T >::value + ) { + std::cerr << "Error: input type of an ALP function must be " + "default-constructible when using automatic mode benchmarkers without " + "broadcasting.\n"; + return grb::ILLEGAL; } - - // done - return SUCCESS; + return pack_and_run< T, U, false >( + reinterpret_cast< lpf_func_t >( alp_program ), + &data_in, sizeof( T ), &data_out, + inner, outer, + broadcast + ); } - /** This implementation needs to release MPI resources in manual mode. */ - static enum RC finalize() { - return Launcher< mode, BSP1D >::finalize(); - } + /** Reuse BSP1D launcher implementation of finalize. */ + using Launcher< mode, BSP1D >::finalize; }; diff --git a/include/graphblas/bsp1d/exec.hpp b/include/graphblas/bsp1d/exec.hpp index e8e627aa9..0d415d636 100644 --- a/include/graphblas/bsp1d/exec.hpp +++ b/include/graphblas/bsp1d/exec.hpp @@ -16,272 +16,800 @@ */ /* - * @author A. N. Yzelman - * @date 17th of April, 2017 + * @author A. N. Yzelman; Alberto Scolari + * @date 17th of April, 2017; 28 of August 2023 */ #ifndef _H_GRB_BSP1D_EXEC #define _H_GRB_BSP1D_EXEC +#include +#include +#include +#include +#include +#include +#include + +#ifndef _GRB_NO_STDIO + #include //for std::cerr +#endif + #include #include #include -#include -#include //for memcpy +#include //for EXEC_MODE::FROM_MPI support + +#include #include + #include + #include -#include #include "init.hpp" -#ifndef _GRB_NO_STDIO - #include //for std::cerr -#endif +#include "../bsp/exec_broadcast_routines.hpp" -/** Global internal singleton to track whether MPI was initialized. */ -extern bool _grb_mpi_initialized; +namespace grb { -/** Global internal function used to call lpf_hook or lpf_exec with. 
*/ -template< typename T, typename U, bool broadcast = true > -void _grb_exec_spmd( lpf_t ctx, lpf_pid_t s, lpf_pid_t P, lpf_args_t args ) { - assert( P > 0 ); - assert( s < P ); + namespace internal { -#ifdef _DEBUG - if( s == 0 ) { - std::cout << "Info: launcher spawned or hooked " << P << " ALP/GraphBLAS " - << "user processes.\n"; - } -#endif + /** + * Base data structure storing necessary data to run an ALP function through + * LPF. + * + * @tparam InputType The type of function input. + * @tparam mode The grb::EXEC_MODE of the launcher. + * @tparam _requested_broadcast Whether inputs shall be broadcast. + */ + template< + typename InputType, + EXEC_MODE _mode, + bool _requested_broadcast + > + struct DispatchInfo { + + /** Make available the launcher mode. */ + static constexpr EXEC_MODE mode = _mode; + + /** Make available whether input broadcast was requested. */ + static constexpr bool requested_broadcast = _requested_broadcast; + + /** Note: benchmarker classes may require initial broadcasts */ + static constexpr bool needs_initial_broadcast = false; + + /** Pointer to input argument. */ + const InputType * in; + + /** Byte size of input argument. */ + size_t in_size; + + /** + * Construct from base information. + * + * @param[in] _in Pointer to the input argument. + * @param[in] _in_size Byte size of the input argument. + */ + DispatchInfo( const InputType * const _in, const size_t _in_size ) : + in( _in ), in_size( _in_size ) + {} + + /** + * Construct from LPF arguments, following a call to lpf_hook() or + * lpf_exec(). + * + * @param[in] s The user process ID. + * @param[in] args The LPF I/O arguments. + */ + DispatchInfo( const lpf_pid_t s, const lpf_args_t args ) { + if( s > 0 && mode == AUTOMATIC ) { + in = nullptr; + in_size = 0; + } else { + in = static_cast< const InputType *>( args.input ); + in_size = args.input_size; + } + } + + /** @returns in */ + const InputType * get_input() const { return in; } + + /** @returns in_size */ + size_t get_input_size() const { return in_size; } + + }; + + /** + * Adaptor to run a typed ALP function: it stores relevant parameters for data + * broadcast. + * + * Inherited from DispatchInfo. + * + * Adapts the function call to the underlying type. + */ + template< + typename InputType, typename OutputType, + EXEC_MODE _mode, + bool _requested_broadcast, bool _variable_input + > + class ExecDispatcher : + public DispatchInfo< InputType, _mode, _requested_broadcast > + { + + protected: + + /** + * Static adapter for typed ALP functions. + * + * Casts and calls the opaque \a fun function. + * + * This function is factored out so as to allow its call from the BSP + * #grb::Benchmarker. + * + * @param[in] fun Pointer to the typed ALP function. + * @param[in] s The user process ID. + * @param[in] P The total number of user processes. + * @param[in] in Pointer to the input argument. + * @param[in] in_size Byte size of the input argument. + * @param[out] out Pointer to where to store the output. 
+ */ + static inline void lpf_grb_call( + const lpf_func_t fun, + const lpf_pid_t s, const lpf_pid_t P, + const InputType * const in, + const size_t in_size, + OutputType *out + ) { + (void) in_size; + (void) s; + (void) P; + reinterpret_cast< AlpTypedFunc< InputType, OutputType > >( fun ) + ( *in, *out ); + } + + + public: + + /** Use base constructor */ + using DispatchInfo< InputType, _mode, _requested_broadcast >::DispatchInfo; + + /** Typed dispatching has static size inputs */ + constexpr static bool is_input_size_variable = false; + + /** + * Functor operator to call a typed ALP function. + * + * @param[in] fun Pointer to the typed ALP function. + * @param[in] s The user process ID. + * @param[in] P The total number of user processes. + * @param[in] in Pointer to the input argument. + * @param[in] in_size Byte size of the input argument. + * @param[out] out Pointer to where to store the output. + */ + inline grb::RC operator()( + const lpf_func_t fun, + const lpf_pid_t s, const lpf_pid_t P, + const InputType *in, const size_t in_size, + OutputType * out + ) const { + lpf_grb_call( fun, s, P, in, in_size, out ); + return grb::SUCCESS; + } + + }; + + /** + * Adaptor to run an untyped ALP function. + * + * It stores relevant parameters for data broadcast (inherited from + * DispatchInfo) and adapts the function call to the underlying type. + */ + template< + typename OutputType, + EXEC_MODE _mode, + bool _requested_broadcast + > + class ExecDispatcher< void, OutputType, _mode, _requested_broadcast, true > : + public DispatchInfo< void, _mode, _requested_broadcast > + { + + protected: + + /** + * Calls an untyped ALP function. + * + * Factored out as a separate function to allow its use from the BSP + * #grb::Benchmarker. + * + * @param[in] fun Pointer to the untyped ALP function. + * @param[in] s The user process ID. + * @param[in] P The total number of user processes. + * @param[in] in Pointer to the input argument. + * @param[in] in_size Byte size of the input argument. + * @param[out] out Pointer to where to store the output. + */ + static inline void lpf_grb_call( + const lpf_func_t fun, + const lpf_pid_t s, const lpf_pid_t P, + const void * const in, const size_t in_size, + OutputType * const out + ) { + (void) s; + (void) P; + reinterpret_cast< AlpUntypedFunc< OutputType > >( fun ) + ( in, in_size, *out ); + } + + + public: + + /** Use base class constructor. */ + using DispatchInfo< void, _mode, _requested_broadcast >::DispatchInfo; + + /** Untyped inputs have variably-sized inputs. */ + constexpr static bool is_input_size_variable = true; + + /** + * Functor operator to call an untyped ALP function. + * + * @param[in] fun Pointer to the untyped ALP function. + * @param[in] s The user process ID. + * @param[in] P The total number of user processes. + * @param[in] in Pointer to the input argument. + * @param[in] in_size Byte size of the input argument. + * @param[out] out Pointer to where to store the output. 
+ */ + inline grb::RC operator()( + const lpf_func_t fun, + lpf_pid_t s, lpf_pid_t P, + const void * const in, const size_t in_size, + OutputType * const out + ) const { + lpf_grb_call( fun, s, P, in, in_size, out ); + return grb::SUCCESS; + } - T data_in_local; // construct default input type - - // get input data from PID 0 - if( broadcast && P > 1 ) { - - // init collectives - lpf_coll_t coll; - lpf_err_t brc = lpf_collectives_init( ctx, s, P, 0, 0, 0, &coll ); - assert( brc == LPF_SUCCESS ); - - // we need input fields from root, prepare for broadcast - brc = lpf_resize_message_queue( ctx, 2*(P-1) ); // two-phase broadcast may - // get up to P-1 messages and - // send up to P-1 messages - // per process - assert( brc == LPF_SUCCESS ); - brc = lpf_resize_memory_register( ctx, 2 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - lpf_memslot_t global; - if( s == 0 ) { - assert( args.input_size == sizeof( T ) ); - brc = lpf_register_global( ctx, - const_cast< void * >( args.input ), - args.input_size, &global + }; + + /** + * Allocator for data structures: if \a typed_allocation is \a true, then + * allocate \a T on the heap via its default contructor \a T(), otherwise as a + * byte array (without construction). + * + * @tparam T The type of the object that should be allocated. + * + * @tparam typed_allocation Whether or not we may rely on the default + * constructor of \a T. + * + * This allocator is only used for typed ALP functions. + */ + template< typename T, bool typed_allocation > + struct ExecAllocator { + + static_assert( std::is_default_constructible< T >::value, + "T must be default constructible" ); + + typedef std::function< void( T * ) > Deleter; + typedef std::unique_ptr< T, Deleter > PointerHolder; + + static PointerHolder make_pointer( size_t ) { + return PointerHolder( + new T(), // allocate with default construction + [] ( T * const ptr ) { delete ptr; } + ); + } + + }; + + /** + * Template specialisation for untyped allocation: data is allocated as a byte + * array and not initialised. + * + * This allocator is used for launching untyped ALP programs \em and may be + * used for launching typed ALP programs where inputs are not-default + * constructible but copiable. The latter only applies in broadcasting mode. + */ + template< typename T > + struct ExecAllocator< T, false > { + + typedef std::function< void( T * ) > Deleter; + typedef std::unique_ptr< T, Deleter > PointerHolder; + + static PointerHolder make_pointer( const size_t size ) { + return PointerHolder( reinterpret_cast< T * >( new char[ size ] ), + [] ( T * const ptr ) { delete [] reinterpret_cast< char * >( ptr ); } ); + } + + }; + + /** + * Dispatcher to be called via LPF for distributed execution of an ALP + * function. + * + * It handles type information of the called function via the + * \a DispatcherType structure. + * + * This call may perform memory allocations and initialisations depending + * on several conditions; in general, it performs these operations only + * if strictly needed. + * + * Depending on the \a mode type parameter, it attempts to create an input + * data structure if this is not available. This is especially important + * in AUTOMATIC mode, where processes with \a s > 0 have no data + * pre-allocated. 
+ * + * In AUTOMATIC mode, indeed, this function does its best to supply the user + * function with input data: + * - if broadcast was requested, data must be copied from the node with + * s == 0 to the other nodes; memory on s > 0 is allocated via \a T's + * default constructor if possible, or as a byte array; in the end, + * data on s > 0 is anyway overwritten by data from s == 0; + * - if broadcast was not requested, this function allocates sensible input + * by calling \a T's default constructor, if possible. If this is not + * possible, the call to this function shall have no other effect than + * (immediately) returning #grb::ILLEGAL. + * + * For modes other than AUTOMATIC, typed ALP functions are assumed to + * always have a pre-allocated input, allocated by the function that + * \em hooked into LPF; no memory is allocated in this case. If broadcast + * is requested, the input for s > 0 is simply overwritten with that from + * s == 0. For untyped functions, memory is allocated only if broadcasting + * is requested (because the size is known a priori only at user process 0), + * otherwise no allocation occurs and each ALP function takes the original + * input from the launching function. + * + * \note Thus, implicitly, if in #grb::MANUAL or in #grb::FROM_MPI modes with + * \a broadcast true, any input pointers at user processes + * \f$ s > 0 \f$ will be ignored. + * + * @tparam T ALP function input type. + * @tparam U ALP function outut type. + * @tparam DispatcherType Information on the ALP function to run. + * + * @param[in,out] ctx LPF context to run in. + * @param[in] s User process identifier (in the range [0, P)). + * @param[in] P Number of parallel processes. + * @param[in,out] args Input and output information for LPF calls. + */ + template< + typename T, typename U, + typename DispatcherType + > + void alp_exec_dispatch( + lpf_t ctx, + const lpf_pid_t s, const lpf_pid_t P, + lpf_args_t args + ) { + static_assert( + std::is_same< T, void >::value || + std::is_trivially_copyable< T >::value || + std::is_standard_layout< T >::value, + "The input type \a T must be void or memcpy-able (trivially copyable or" + "standard layout)." ); - } else { - assert( args.input_size == 0 ); - brc = lpf_register_global( ctx, &data_in_local, sizeof( T ), &global ); - } - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_broadcast( coll, global, global, sizeof( T ), 0 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_deregister( ctx, global ); - assert( brc == LPF_SUCCESS ); - -#ifdef NDEBUG - (void)brc; -#endif - } - - // sanity check - if( !broadcast ) { - // if we do not broadcast then everyone should have their own local input - assert( args.input_size == sizeof( T ) ); - } - - // get input data - const T &data_in = broadcast ? - // then get unified view of input data after broadcast - ( s == 0 ? *static_cast< const T * >( args.input ) : data_in_local ) : - // otherwise just copy from args_in if there is one (to catch automatic mode) - *static_cast< const T * >( args.input ); - - // we need an output field - U data_out_local = U(); - U &data_out = args.output_size == sizeof( U ) ? 
- *static_cast< U * >( args.output ) : // if we were passed output area, use it - data_out_local; // otherwise use local empy output area - - // initialise ALP/GraphBLAS - grb::RC grb_rc = grb::init( s, P, ctx ); - if( grb_rc != grb::SUCCESS ) { - std::cerr << "Error: could not initialise ALP/GraphBLAS" << std::endl; - assert( false ); - return; - } - - // retrieve and run the function to be executed - if( args.f_size == 1 ) { - typedef void ( *grb_func_t )( const T &, U & ); - grb_func_t grb_program = - reinterpret_cast< grb_func_t >( args.f_symbols[ 0 ] ); - ( *grb_program )( data_in, data_out ); - } else { - // assume we are performning benchmarks - typedef void ( *grb_func_t )( const T &, U & ); - typedef void ( *bench_func_t )( void ( *grb_program )( const T &, U & ), - const T &, U &, lpf_pid_t ); - bench_func_t bench_program = reinterpret_cast< bench_func_t >( args.f_symbols[ 0 ] ); - grb_func_t grb_program = reinterpret_cast< grb_func_t >( args.f_symbols[ 1 ] ); - ( *bench_program )( grb_program, data_in, data_out, s ); - } - - // finalise ALP/GraphBLAS - grb_rc = grb::finalize(); - if( grb_rc != grb::SUCCESS ) { - std::cerr << "Error: could not finalise ALP/GraphBLAS" << std::endl; - assert( false ); - } -} - -/** Global internal function used to call lpf_hook or lpf_exec with. */ -template< typename U, bool broadcast = true > -void _grb_exec_varin_spmd( lpf_t ctx, lpf_pid_t s, lpf_pid_t P, lpf_args_t args ) { - assert( P > 0 ); - assert( s < P ); + constexpr bool is_typed_alp_prog = !(DispatcherType::is_input_size_variable); + constexpr bool is_input_def_constructible = + std::is_default_constructible< T >::value; + constexpr grb::EXEC_MODE mode = DispatcherType::mode; + constexpr bool broadcast_input = DispatcherType::requested_broadcast; + constexpr bool dispatcher_needs_broadcast = + DispatcherType::needs_initial_broadcast; + + assert( P > 0 ); + assert( s < P ); #ifdef _DEBUG - // info to stdout - if( s == 0 ) { - std::cout << "Info: launcher spawned " << P << " processes.\n"; - } + if( s == 0 ) { + std::cout << "Info: launcher spawned or hooked " << P << " ALP user " + << "processes.\n"; + } #endif - // input data to grbProgram - void * data_in = NULL; - - // size of the data_in block - size_t size; - - // we need input fields from root. 
First synchronise on input size - if( broadcast && P > 1 ) { - - // init collectives - lpf_coll_t coll; - lpf_err_t brc = lpf_resize_message_queue( ctx, P - 1 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_resize_memory_register( ctx, 2 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_collectives_init( ctx, s, P, 1, 0, sizeof( size_t ), &coll ); - assert( brc == LPF_SUCCESS ); - - // broadcast the size of data - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - lpf_memslot_t global; - if( s == 0 ) { - size = args.input_size; - } - brc = lpf_register_global( ctx, &size, sizeof( size_t ), &global ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_broadcast( coll, global, global, sizeof( size_t ), 0 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_deregister( ctx, global ); - assert( brc == LPF_SUCCESS ); - - // now that the input size is known, retrieve the input data - if( s > 0 ) { - data_in = new char[ size ]; - } else { - data_in = const_cast< void * >( args.input ); + if( + !is_input_def_constructible && + is_typed_alp_prog && + mode == AUTOMATIC && + !broadcast_input && + P > 1 + ) { + std::cerr << "Error: cannot locally construct input type (typeid name \"" + << typeid(T).name() << "\"for an ALP program that is launched " + << "in automatic mode, with broadcasting, and using more than one user" + << "one user process.\n" + << "Additionally, this error should have been caught prior to the " + << "attempted launch of the ALP program-- please submit a bug report." + << std::endl; + assert( false ); + return; + } + + lpf_coll_t coll; + lpf_err_t brc = LPF_SUCCESS; + + // initialise collectives if they are needed + if( P > 1 && (broadcast_input || dispatcher_needs_broadcast) ) { + brc = lpf_init_collectives_for_broadcast( ctx, s, P, 2, coll ); + if( brc != LPF_SUCCESS ) { + std::cerr << __FILE__ << ", " << __LINE__ << ": LPF collective failed" + << std::endl; + } + assert( brc == LPF_SUCCESS ); + } + + // call information for the ALP function, reconstructed from the arguments + DispatcherType dispatcher( s, args ); + + // ensure dispatcher is valid + if( P > 1 && dispatcher_needs_broadcast ) { + // fetch the dispatcher + brc = lpf_register_and_broadcast( + ctx, coll, + static_cast< void * >( &dispatcher ), + sizeof( DispatcherType ) + ); + if( brc != LPF_SUCCESS ) { + std::cerr << __FILE__ << ", " << __LINE__ << ": LPF collective failed" + << std::endl; + } + assert( brc == LPF_SUCCESS ); + } + + // dispatcher is now valid on all processes: assign initial value for size + size_t in_size = dispatcher.get_input_size(); + + // set in_size on user processes with IDs larger than 0 + if( P > 1 ) { + // check if input args should come from PID 0 + if( broadcast_input ) { + // user requested broadcast and the input size is user-given: fetch size + lpf_err_t brc = lpf_register_and_broadcast( + ctx, coll, + reinterpret_cast< void * >( &in_size ), sizeof( size_t ) + ); + if( brc != LPF_SUCCESS ) { + std::cerr << __FILE__ << ", " << __LINE__ << ": LPF collective failed" + << std::endl; + } + assert( brc == LPF_SUCCESS ); + assert( in_size != 0 ); + } else if( mode == AUTOMATIC && !broadcast_input && s > 0 ) { + // AUTOMATIC mode, untyped, no broadcast: pass zero as size + in_size = 0; + } + } + + // now set the input argument (in) itself + constexpr bool typed_alloc = is_typed_alp_prog && is_input_def_constructible; + typedef 
ExecAllocator< T, typed_alloc > InputAllocator; + typename InputAllocator::PointerHolder data_in_holder; + + // set default value + const T * data_in = dispatcher.get_input(); + + // set in on user processes with IDs larger than 0 + if( s > 0 ) { + if( mode == AUTOMATIC && !is_typed_alp_prog && !broadcast_input ) { + // AUTOMATIC mode, untyped, no broadcast: pass nullptr + data_in = nullptr; + } else if( mode == AUTOMATIC || (broadcast_input && !is_typed_alp_prog) ) { + // if no memory exists (mode == AUTOMATIC) or the size was not known and + // the user requested broadcast, then allocate input data + data_in_holder = InputAllocator::make_pointer( in_size ); + data_in = data_in_holder.get(); + } + } + + // set contents of in + if( broadcast_input && P > 1 ) { + // retrieve data + lpf_err_t brc = lpf_register_and_broadcast( + ctx, coll, + const_cast< void * >( reinterpret_cast< const void * >( data_in ) ), + in_size + ); + if( brc != LPF_SUCCESS ) { + std::cerr << __FILE__ << ", " << __LINE__ << ": LPF collective failed" + << std::endl; + } + assert( brc == LPF_SUCCESS ); + } + + // now set the output argument + typedef ExecAllocator< U, std::is_default_constructible< U >::value > + OutputAllocator; + typename OutputAllocator::PointerHolder data_out_holder; + + // set default value + U * data_out = reinterpret_cast< U * >( args.output ); + + // set out on user processes with ID larger than 0 + if( mode == AUTOMATIC && s > 0 ) { + // allocate output if memory does not exist + data_out_holder = OutputAllocator::make_pointer( sizeof( U ) ); + data_out = reinterpret_cast< U * >( data_out_holder.get() ); + } + + // at this point, the dispatcher, input, and output are all good to go + + // now, initialise ALP + grb::RC grb_rc = grb::init< BSP1D >( s, P, ctx ); + if( grb_rc != grb::SUCCESS ) { + std::cerr << "Error: could not initialise ALP/GraphBLAS" << std::endl; + assert( false ); + return; + } + + // retrieve and run the function to be executed + assert( args.f_size == 1 ); + grb_rc = dispatcher( args.f_symbols[ 0 ], s, P, data_in, in_size, data_out ); + if( grb_rc != grb::SUCCESS ) { + std::cerr << "Error: dispatcher failed" << std::endl; + assert( false ); + return; + } + + // finalise ALP/GraphBLAS + grb_rc = grb::finalize< BSP1D >(); + if( grb_rc != grb::SUCCESS ) { + std::cerr << "Error: could not finalise ALP/GraphBLAS" << std::endl; + assert( false ); + } } - brc = lpf_register_global( ctx, data_in, size, &global ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_broadcast( coll, global, global, size, 0 ); - assert( brc == LPF_SUCCESS ); - brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); - assert( brc == LPF_SUCCESS ); - brc = lpf_deregister( ctx, global ); - assert( brc == LPF_SUCCESS ); - -#ifdef NDEBUG - (void)brc; -#endif - } else { - data_in = const_cast< void * >( args.input ); - size = args.input_size; - } - - // we need an output field - U data_out_local = U(); - U &data_out = args.output_size == sizeof( U ) ? 
- *static_cast< U * >( args.output ) : - data_out_local; - // note: the above switch handily catches automatic mode - - // initialise ALP/GraphBLAS - grb::RC grb_rc = grb::init( s, P, ctx ); - if( grb_rc != grb::SUCCESS ) { - std::cerr << "Error: could not initialise ALP/GraphBLAS" << std::endl; - assert( false ); - return; - } - - // retrieve and run the function to be executed - if( args.f_size == 1 ) { - typedef void ( *grb_func_t )( void *, size_t, U & ); - grb_func_t grb_program = - reinterpret_cast< grb_func_t >( args.f_symbols[ 0 ] ); - ( *grb_program )( (void *)data_in, size, data_out ); - } else { - // assume we are performning benchmarks - typedef void ( *grb_func_t )( void *, size_t, U & ); - typedef void ( *bench_func_t )( void ( *grb_program )( void *, size_t, U & ), - void *, size_t, - U &, lpf_pid_t - ); - bench_func_t bench_program = reinterpret_cast< bench_func_t >( args.f_symbols[ 0 ] ); - grb_func_t grb_program = reinterpret_cast< grb_func_t >( args.f_symbols[ 1 ] ); - ( *bench_program )( grb_program, (void *)data_in, size, data_out, s ); - } - - // finalise ALP/GraphBLAS - grb_rc = grb::finalize(); - if( grb_rc != grb::SUCCESS ) { - std::cerr << "Error: could not finalise ALP/GraphBLAS" << std::endl; - assert( false ); - } -} -namespace grb { + /** + * Base class for Launcher's, with common logic and information; mainly + * wrapping user #exec() parameters into internal data structures and calling + * LPF. + * + * @tparam mode grb::EXEC_MODE LPF execution mode + */ + template< enum EXEC_MODE mode > + class BaseLpfLauncher { + + protected: + + /** The LPF init struct. Will be initialised during construction. */ + lpf_init_t init; + + /** Base constructor. */ + BaseLpfLauncher() : init( LPF_INIT_NONE ) {} + + /** Disable copy constructor. */ + BaseLpfLauncher( const BaseLpfLauncher< mode > & ) = delete; + + /** Disable copy constructor. */ + BaseLpfLauncher & operator=( const BaseLpfLauncher< mode > & ) = delete; + + /** + * Run the given \a alp_program with the given pointers to input and output + * arguments. + * + * @tparam T Input type. + * @tparam U Output type. + * @tparam DispatcherType Type of the data structure that holds input and + * call information. + * + * @param[in] alp_program The ALP program to execute. + * @param[in] data_in Pointer to the input argument. + * @param[in] in_size Byte size of the input arugment. + * @param[out] data_out Pointer to where to write output. + * + * @return RC status code of the LPF call. + * + * \warning Issues with default-constructibility of the input type \a T + * (in the case of AUTOMATIC mode and no-broadcasting), while + * caught in the SPMD program itself as a safety measure, should + * be caught before a call to this function in order to comply with + * the specification. + * + * \note This function is factored out for use with the BSP + * #grb::Benchmarker. + */ + template< + typename T, typename U, + typename DispatcherType + > + RC run_lpf( + const lpf_func_t alp_program, + const void * const data_in, + const size_t in_size, + U * const data_out + ) const { + // construct LPF I/O args + lpf_args_t args = { + data_in, in_size, + data_out, sizeof( U ), + &alp_program, 1 + }; + + // get LPF function pointer + lpf_spmd_t fun = reinterpret_cast< lpf_spmd_t >( + internal::alp_exec_dispatch< T, U, DispatcherType > ); + + // execute + const lpf_err_t spmdrc = init == LPF_INIT_NONE + ? 
lpf_exec( LPF_ROOT, LPF_MAX_P, fun, args )
+					: lpf_hook( init, fun, args );
+
+				// check error code
+				if( spmdrc != LPF_SUCCESS ) {
+					return PANIC;
+				}

-	/**
-	 * No implementation notes.
-	 */
-	template<>
-	class Launcher< FROM_MPI, BSP1D > {
+				// done
+				return SUCCESS;
+			}

-		protected:
+		private:
+
+			/**
+			 * Pack data received from user into an internal::ExecDispatcher data
+			 * structure and run the ALP program.
+			 *
+			 * @tparam T            Input type.
+			 * @tparam U            Output type.
+			 * @tparam untyped_call Whether the ALP function call is untyped.
+			 *
+			 * \note If \a untyped_call is true, then \a T must be
+			 *       void.
+			 *
+			 * @param[in] alp_program The ALP program to execute.
+			 * @param[in] data_in     Pointer to input data.
+			 * @param[in] in_size     Size of the input data.
+			 *
+			 * \warning \a in_size must equal sizeof( T ) if \a untyped_call
+			 *          equals false.
+			 *
+			 * @param[out] data_out  Pointer to where to write output data.
+			 * @param[in]  broadcast Whether to broadcast input from node 0 to all
+			 *                       others.
+			 *
+			 * \warning Issues with default-constructibility of the input type \a T
+			 *          (in the case of AUTOMATIC mode and no-broadcasting), while
+			 *          caught in the SPMD program itself as a safety measure, should
+			 *          be caught before a call to this function in order to comply with
+			 *          the specification.
+			 *
+			 * @returns #grb::SUCCESS When the ALP program was launched successfully.
+			 * @returns #grb::PANIC   On error in the communication layer while
+			 *                        launching the program, during program execution,
+			 *                        or while terminating the program.
+			 */
+			template< typename T, typename U, bool untyped_call >
+			RC pack_data_and_run(
+				const lpf_func_t alp_program,
+				const T * const data_in,
+				const size_t in_size,
+				U * const data_out,
+				const bool broadcast
+			) const {
+				static_assert( std::is_void< T >::value || !untyped_call,
+					"If T is not void, this must refer to a typed ALP program call" );
+				if( !untyped_call ) {
+					assert( grb::utils::SizeOf< T >::value == in_size );
+				}
+				if( broadcast ) {
+					typedef internal::ExecDispatcher< T, U, mode, true, untyped_call > Disp;
+					return run_lpf< T, U, Disp >( alp_program, data_in, in_size, data_out );
+				} else {
+					typedef internal::ExecDispatcher< T, U, mode, false, untyped_call > Disp;
+					return run_lpf< T, U, Disp >( alp_program, data_in, in_size, data_out );
+				}
+			}

-		/** The LPF init struct. Will be initialised during construction. */
-		lpf_init_t init;
+		public:
+
+			/**
+			 * Run a typed ALP function distributed via LPF.
+			 *
+			 * In case of AUTOMATIC mode, input data is allocated by default (if the type
+			 * allows) or as a sequence of bytes. This assumes the default allocator does
+			 * not have \b any side effect (like memory allocation). In case of broadcast
+			 * request, data is trivially serialized: hence, non-trivial objects (e.g.,
+			 * objects storing pointers to memory buffers) are no longer valid in
+			 * processes other than the master.
+			 *
+			 * @tparam T Input type.
+			 * @tparam U Output type.
+			 *
+			 * @param[in]  alp_program ALP function to run in parallel.
+			 * @param[in]  data_in     Input data.
+			 * @param[out] data_out    Output data.
+			 * @param[in]  broadcast   Whether to broadcast input from node 0 to the
+			 *                         others.
+			 *
+			 * @returns #grb::SUCCESS When the ALP program was launched successfully.
+			 * @returns #grb::ILLEGAL When the ALP program was launched in AUTOMATIC
+			 *                        mode, without broadcasting, while \a T was not
+			 *                        default-constructible.
+			 * @returns #grb::PANIC   On error in the communication layer while
+			 *                        launching the program, during program execution,
+			 *                        or while terminating the program.
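+			 *
+			 * \par Example
+			 *
+			 * The following sketch is illustrative only: the program
+			 * \a myALPProgram, its argument types, and the chosen input value are
+			 * hypothetical, and the call is shown through a public #grb::Launcher
+			 * that derives from this base class.
+			 *
+			 * \code
+			 * void myALPProgram( const int &in, grb::RC &out ) {
+			 *     // ... any ALP/GraphBLAS code reading in and writing to out ...
+			 *     out = grb::SUCCESS;
+			 * }
+			 *
+			 * grb::Launcher< grb::AUTOMATIC > launcher;
+			 * int in = 7;
+			 * grb::RC out = grb::PANIC;
+			 * // request that the input is broadcast from user process 0 to all others
+			 * const grb::RC rc = launcher.exec( &myALPProgram, in, out, true );
+			 * \endcode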
+			 */
+			template< typename T, typename U >
+			RC exec(
+				const AlpTypedFunc< T, U > alp_program,
+				const T &data_in,
+				U &data_out,
+				const bool broadcast = false
+			) {
+				static_assert(
+					mode != AUTOMATIC ||
+						std::is_default_constructible< U >::value,
+					"The output type U should be default-constructible when using automatic "
+					"mode launchers."
+				);
+				if(
+					mode == AUTOMATIC && broadcast == false &&
+					!std::is_default_constructible< T >::value
+				) {
+					return grb::ILLEGAL;
+				} else {
+					return pack_data_and_run< T, U, false >(
+						reinterpret_cast< lpf_func_t >( alp_program ),
+						&data_in, sizeof( T ),
+						&data_out, broadcast
+					);
+				}
+			}
+
+			/**
+			 * Run an untyped ALP function in parallel via LPF.
+			 *
+			 * Input data has variable size, known only at runtime. Therefore, the input
+			 * data cannot be constructed by default, but is instead serialized and
+			 * replicated as a mere sequence of bytes.
+			 *
+			 * @tparam U Output type.
+			 *
+			 * @param[in]  alp_program ALP function to run in parallel.
+			 * @param[in]  data_in     Pointer to input data.
+			 * @param[in]  in_size     Size of input data.
+			 * @param[out] data_out    Output data.
+			 * @param[in]  broadcast   Whether to broadcast input from node 0 to the
+			 *                         others.
+			 *
+			 * @returns #grb::SUCCESS When the ALP program was launched successfully.
+			 * @returns #grb::PANIC   On error in the communication layer while
+			 *                        launching the program, during program execution,
+			 *                        or while terminating the program.
+			 */
+			template< typename U >
+			RC exec(
+				const AlpUntypedFunc< U > alp_program,
+				const void * const data_in, const size_t in_size,
+				U &data_out,
+				const bool broadcast = false
+			) {
+				static_assert(
+					mode != AUTOMATIC ||
+						std::is_default_constructible< U >::value,
+					"The output type U should be default-constructible when using automatic "
+					"mode launchers."
+				);
+				return pack_data_and_run< void, U, true >(
+					reinterpret_cast< lpf_func_t >( alp_program ),
+					data_in, in_size, &data_out, broadcast
+				);
+			}
+
+	};
+
+	} // end namespace internal
+
+	/**
+	 * Specialisation of Launcher to be used when MPI has already been
+	 * initialised but not LPF.
+	 */
+	template<>
+	class Launcher< FROM_MPI, BSP1D > :
+		public internal::BaseLpfLauncher< FROM_MPI >
+	{

		public:

@@ -293,13 +821,6 @@ namespace grb {
			 * @throws runtime_error When a standard MPI call fails.
			 */
			Launcher( const MPI_Comm comm = MPI_COMM_WORLD ) {
-				// run-time sanity check when using MPI:
-				// we (as in LPF) should NOT be managing MPI
-				if( LPF_MPI_AUTO_INITIALIZE ) {
-					throw std::runtime_error( "Program was not linked with the symbol "
-						"LPF_MPI_AUTO_INITIALIZE set to 0 while an instance of "
-						"Launcher or Launcher is being requested." );
-				}

				// init from communicator
				const lpf_err_t initrc = lpf_mpi_initialize_with_mpicomm( comm, &init );
@@ -307,22 +828,15 @@ namespace grb {
				// check for success
				if( initrc != LPF_SUCCESS ) {
					throw std::runtime_error(
-						"LPF could not connect launcher group over TCP/IP."
+						"LPF could not be initialized via the given MPI communicator."
					);
				}

				// done!
			}

-			/** Disable copy constructor. */
-			Launcher( const Launcher & ) = delete;
-
-			/** Disable copy constructor. */
-			Launcher & operator=( const Launcher & ) = delete;
-
			/**
-			 * Implementation note: this Launcher will clear a field of
-			 * type \a lpf_init_t.
+			 * Implementation note: this Launcher will clear #init.
			 */
			~Launcher() {
				assert( init != LPF_INIT_NONE );
@@ -335,66 +849,6 @@ namespace grb {
				init = LPF_INIT_NONE;
			}

-			/** No implementation notes.
*/ - template< typename U > - RC exec( - void ( *grb_program )( const void *, const size_t, U & ), - const void * data_in, const size_t in_size, - U &data_out, - const bool broadcast = false - ) const { - // check input arguments - if( in_size > 0 && data_in == nullptr ) { - return ILLEGAL; - } - - // prepare args - lpf_func_t fargs[ 2 ]; - lpf_args_t args; - fargs[ 0 ] = reinterpret_cast< lpf_func_t >( grb_program ); - args = { data_in, in_size, &data_out, sizeof( U ), fargs, 1 }; - - // do hook - const lpf_err_t spmdrc = broadcast ? - lpf_hook( init, &(_grb_exec_varin_spmd< U, true >), args ) : - lpf_hook( init, &(_grb_exec_varin_spmd< U, false >), args ); - - // check error code - if( spmdrc != LPF_SUCCESS ) { - return PANIC; - } - - // done - return SUCCESS; - } - - /** No implementation notes. */ - template< typename T, typename U > - RC exec( - void ( *grb_program )( const T &, U & ), // user GraphBLAS program - const T &data_in, U &data_out, // input & output data - const bool broadcast = false - ) { - // prepare args - lpf_func_t fargs[ 2 ]; - lpf_args_t args; - fargs[ 0 ] = reinterpret_cast< lpf_func_t >( grb_program ); - args = { &data_in, sizeof( T ), &data_out, sizeof( U ), fargs, 1 }; - - // do hook - const lpf_err_t spmdrc = broadcast ? - lpf_hook( init, &(_grb_exec_spmd< T, U, true >), args ) : - lpf_hook( init, &(_grb_exec_spmd< T, U, false >), args ); - - // check error code - if( spmdrc != LPF_SUCCESS ) { - return PANIC; - } - - // done - return SUCCESS; - } - /** * Since the user is using ALP/GraphBLAS directly from MPI, the user codes * should call MPI_Finalize. This function thus is a no-op in this particular @@ -407,64 +861,83 @@ namespace grb { }; /** - * No implementation notes. + * Specialisation of Launcher for the automatic mode. + * + * Assumes LPF takes care of any initialisation requirements. */ - template< enum EXEC_MODE mode > - class Launcher< mode, BSP1D > { - - - private: - - // we should never be called for FROM_MPI mode-- the above - // specialisation should be used instead - static_assert( mode != FROM_MPI, - "EXEC_MODE::FROM_MPI for BSP1D is implemented in specialised class" ); - - /** The user process ID in this launcher group. */ - const size_t _s; + template<> + class Launcher< AUTOMATIC, BSP1D > : + public internal::BaseLpfLauncher< AUTOMATIC > + { - /** The total number of user processes in this launcher group. */ - const size_t _P; + public: - /** The connection broker in this launcher group. */ - const std::string _hostname; + Launcher() = default; - /** The port at #_hostname used for brokering connections. */ - const std::string _port; + ~Launcher() { + assert( init == LPF_INIT_NONE ); + } + static RC finalize() { + return grb::SUCCESS; + } - protected: + }; - /** The LPF init struct. Will be initialised during construction. */ - lpf_init_t init; + /** + * Specialisation of Launcher for the manual mode. + * + * The callee here manually connects existing processes into a joint LPF + * context, that is then used to execute (parallel) ALP programs. + * + * Assumes the pre-existing processes may be connected via TCP/IP. + */ + template< enum EXEC_MODE mode > + class Launcher< mode, BSP1D > : public internal::BaseLpfLauncher< mode > { + static_assert( mode == MANUAL, "Expected manual launcher mode" ); public: /** - * When \a mode is #AUTOMATIC, this implementation adheres to - * the base specification. When \a mode is #MANUAL, this - * implementation specifies additionally the following: + * Constructs a manual mode launcher. 
* - * The time-out of this constructor is thirty seconds. + * This implementation specifies the following constraints on the specified + * input arguments. * - * @param[in] hostname May not be empty. Must resolve to an IP. - * @param[in] port May not be empty. Must be either a port - * number of a registered service name. + * @param[in] process_id User process ID. + * @param[in] nprocs Total number of user processes. + * @param[in] hostname Host name (or IP) of one of the user processes + * involved in the collective construction of this + * launcher. May not be empty. + * @param[in] port A free port for connecting to \a hostname during the + * collective construction of this launcher. May not be + * empty. Must be either a port number of a registered + * service name. * - * In addition to the standard-defined exceptions, the following - * may additionally be thrown: - * @throws invalid_argument When hostname or port are empty. - * @throws runtime_error When the requested launcher group - * could not be created. + * The time-out of this constructor is two minutes. + * + * If giving a \a hostname as a string, it must resolve to an IP; if + * resolution fails, this constructor call will fail. + * + * If giving a \a port as a string, it must resolve to a port number; if + * resolution fails, this constructor call will fail. + * + * In addition to the standard-defined exceptions, the following errors may + * additionally be thrown: + * + * @throws invalid_argument When hostname or port are empty but \a nprocs is + * larger than one. + * @throws runtime_error When the requested launcher group could not be + * created. */ Launcher( - const size_t process_id = 0, // user process ID - const size_t nprocs = 1, // total number of user processes - const std::string hostname = "localhost", // one of the process' hostnames - const std::string port = "0", // a free port at hostname + const size_t process_id = 0, + const size_t nprocs = 1, + const std::string &hostname = "localhost", + const std::string &port = "0", const bool is_mpi_inited = false - ) : _s( process_id ), _P( nprocs ), _hostname( hostname ), _port( port ) { + ) { // sanity check if( nprocs == 0 ) { throw std::invalid_argument( "Total number of user processes must be " @@ -474,161 +947,49 @@ namespace grb { throw std::invalid_argument( "Process ID must be strictly smaller than " "total number of user processes." ); } - - // when using MPI in hook mode - if( mode == MANUAL ) { - // run-time sanity check when using MPI: - // we (as in LPF) should NOT be managing MPI - if( LPF_MPI_AUTO_INITIALIZE ) { - throw std::runtime_error( "Program was not linked with the symbol " - "LPF_MPI_AUTO_INITIALIZE set to 0 while an instance of " - "Launcher or Launcher is being requested." ); - } - // initialise MPI if not already done - if( !is_mpi_inited && !_grb_mpi_initialized ) { - if( MPI_Init( NULL, NULL ) != MPI_SUCCESS ) { - throw std::runtime_error( "Call to MPI_Init failed." ); - } else { - _grb_mpi_initialized = true; - } - } + if( nprocs > 1 && (hostname.empty() || port.empty()) ) { + throw std::invalid_argument( "Host or port names may not be empty if the " + "launcher group contains more than one process." ); } - // handle each mode's specifics - if( mode == MANUAL ) { - // additional sanity check - if( hostname.compare( "" ) == 0 || port.compare( "" ) == 0 ) { - throw std::invalid_argument( - "Hostname and/or port name cannot be empty." - ); + // initialise MPI if not already done + // TODO FIXME the MPI_Init should not be here. 
See GitHub issue #240. + if( !is_mpi_inited && !internal::grb_mpi_initialized ) { + if( MPI_Init( NULL, NULL ) != MPI_SUCCESS ) { + throw std::runtime_error( "Call to MPI_Init failed." ); + } else { + internal::grb_mpi_initialized = true; } + } - // try and create a lpf_init_t - const lpf_err_t initrc = lpf_mpi_initialize_over_tcp( - hostname.c_str(), port.c_str(), // server info - 120000, // time out - process_id, nprocs, // process info - &init - ); + // try and create a lpf_init_t + const lpf_err_t initrc = lpf_mpi_initialize_over_tcp( + hostname.c_str(), port.c_str(), // server info + 120000, // time out + process_id, nprocs, // process info + &(this->init) + ); - // check for success - if( initrc != LPF_SUCCESS ) { + // check for success + if( initrc != LPF_SUCCESS ) { #ifndef _GRB_NO_STDIO - throw std::runtime_error( - "LPF could not connect launcher group over TCP/IP." - ); + throw std::runtime_error( + "LPF could not connect launcher group over TCP/IP." + ); #endif - } - } else { - // sanity check: we should be in automatic mode - assert( mode == AUTOMATIC ); - // otherwise, we don't need init - init = LPF_INIT_NONE; } - } - /** Disable copy constructor. */ - Launcher( const Launcher & ) = delete; - - /** Disable copy constructor. */ - Launcher & operator=( const Launcher & ) = delete; - - /** - * Implementation note: this Launcher may need to clear a field of - * type \a lpf_init_t when used in MANUAL mode. - */ ~Launcher() { - if( mode == MANUAL ) { - assert( init != LPF_INIT_NONE ); - // try and destroy the lpf_init_t - const lpf_err_t finrc = lpf_mpi_finalize( init ); - if( finrc != LPF_SUCCESS ) { + assert( this->init != LPF_INIT_NONE ); + // try and destroy the lpf_init_t + const lpf_err_t finrc = lpf_mpi_finalize( this->init ); + if( finrc != LPF_SUCCESS ) { #ifndef _GRB_NO_STDIO - std::cerr << "Warning: could not destroy launcher::init from ~launcher.\n"; + std::cerr << "Warning: could not destroy launcher::init from ~launcher.\n"; #endif - } - init = LPF_INIT_NONE; - } else { - assert( init == LPF_INIT_NONE ); } - } - - /** No implementation notes. */ - template< typename U > - RC exec( - void ( *grb_program )( const void *, const size_t, U & ), - const void * data_in, const size_t in_size, - U &data_out, - const bool broadcast = false - ) const { - // check input arguments - if( in_size > 0 && data_in == nullptr ) { - return ILLEGAL; - } - - // prepare args - lpf_func_t fargs[ 2 ]; - lpf_args_t args; - fargs[ 0 ] = reinterpret_cast< lpf_func_t >( grb_program ); - args = { data_in, in_size, &data_out, sizeof( U ), fargs, 1 }; - - // launch - lpf_err_t spmdrc = LPF_SUCCESS; - if( mode == MANUAL ) { - // do hook - spmdrc = broadcast ? - lpf_hook( init, &(_grb_exec_varin_spmd< U, true >), args ) : - lpf_hook( init, &(_grb_exec_varin_spmd< U, false >), args ); - } else { - assert( mode == AUTOMATIC ); - // do exec - spmdrc = lpf_exec( LPF_ROOT, LPF_MAX_P, - &(_grb_exec_varin_spmd< U >), args ); - } - - // check error code - if( spmdrc != LPF_SUCCESS ) { - return PANIC; - } - - // done - return SUCCESS; - } - - /** No implementation notes. 
*/ - template< typename T, typename U > - RC exec( - void ( *grb_program )( const T &, U & ), // user GraphBLAS program - const T &data_in, U &data_out, // input & output data - const bool broadcast = false - ) { - // prepare args - lpf_func_t fargs[ 2 ]; - lpf_args_t args; - fargs[ 0 ] = reinterpret_cast< lpf_func_t >( grb_program ); - args = { &data_in, sizeof( T ), &data_out, sizeof( U ), fargs, 1 }; - - // launch - lpf_err_t spmdrc = LPF_SUCCESS; - if( mode == MANUAL ) { - // do hook - spmdrc = broadcast ? - lpf_hook( init, &(_grb_exec_spmd< T, U, true >), args ) : - lpf_hook( init, &(_grb_exec_spmd< T, U, false >), args ); - } else { - assert( mode == AUTOMATIC ); - // do exec - spmdrc = lpf_exec( LPF_ROOT, LPF_MAX_P, &(_grb_exec_spmd< T, U >), args ); - } - - // check error code - if( spmdrc != LPF_SUCCESS ) { - return PANIC; - } - - // done - return SUCCESS; + this->init = LPF_INIT_NONE; } /** @@ -636,15 +997,14 @@ namespace grb { */ static RC finalize() { // finalise MPI when in manual mode - if( mode == MANUAL && _grb_mpi_initialized ) { - _grb_mpi_initialized = false; - if( MPI_Finalize() != MPI_SUCCESS ) { + // TODO FIXME the MPI_Finalize should not be here. See GitHub issue #240. + if( internal::grb_mpi_initialized && MPI_Finalize() != MPI_SUCCESS ) { #ifndef _GRB_NO_STDIO - std::cerr << "Warning: MPI_Finalize returned non-SUCCESS exit code.\n"; + std::cerr << "Warning: MPI_Finalize returned non-SUCCESS exit code.\n"; #endif - return grb::PANIC; - } + return grb::PANIC; } + internal::grb_mpi_initialized = false; return grb::SUCCESS; } diff --git a/include/graphblas/bsp1d/matrix.hpp b/include/graphblas/bsp1d/matrix.hpp index 13ce193aa..ea0ed76b3 100644 --- a/include/graphblas/bsp1d/matrix.hpp +++ b/include/graphblas/bsp1d/matrix.hpp @@ -197,7 +197,7 @@ namespace grb { // check default fields that should have been set by public constructor assert( _m == 0 ); assert( _n == 0 ); - assert( _id = std::numeric_limits< uintptr_t >::max() ); + assert( _id == std::numeric_limits< uintptr_t >::max() ); assert( _ptr == nullptr ); assert( _cap == 0 ); // these default values correspond to an empty matrix and which the @@ -265,7 +265,7 @@ namespace grb { size_t global_cap = 0; try { // complete local initialisation - _local.initialize( &_id, local_m, local_n, local_nz ); + _local.initialize( &id, local_m, local_n, local_nz ); // sync global capacity global_cap = capacity( _local ); diff --git a/include/graphblas/exec.hpp b/include/graphblas/exec.hpp index 2bcf796aa..f7ecb8cc2 100644 --- a/include/graphblas/exec.hpp +++ b/include/graphblas/exec.hpp @@ -45,7 +45,10 @@ #ifdef _GRB_BACKEND namespace grb { - template< enum EXEC_MODE mode, enum Backend implementation = config::default_backend > + template< + enum EXEC_MODE mode, + enum Backend implementation = config::default_backend + > class Launcher; } #endif diff --git a/include/graphblas/hyperdags/benchmark.hpp b/include/graphblas/hyperdags/benchmark.hpp index 23502f33c..5492a8b6e 100644 --- a/include/graphblas/hyperdags/benchmark.hpp +++ b/include/graphblas/hyperdags/benchmark.hpp @@ -27,10 +27,8 @@ #ifndef _H_GRB_HYPERDAGS_BENCH #define _H_GRB_HYPERDAGS_BENCH -#include #include - -#include "exec.hpp" +#include namespace grb { @@ -38,60 +36,27 @@ namespace grb { /** \internal Simply wraps around the underlying Benchmarker implementation. 
*/ template< enum EXEC_MODE mode > class Benchmarker< mode, hyperdags > : - protected Launcher< mode, hyperdags >, protected internal::BenchmarkerBase + public Benchmarker< mode, _GRB_WITH_HYPERDAGS_USING > { private: typedef Benchmarker< mode, _GRB_WITH_HYPERDAGS_USING > MyBenchmarkerType; - MyBenchmarkerType benchmarker; - public: - /** \internal Simple delegation. */ - Benchmarker( - const size_t process_id = 0, - const size_t nprocs = 1, - const std::string hostname = "localhost", - const std::string port = "0" - ) : - benchmarker( process_id, nprocs, hostname, port ) - {} - - /** \internal Simple delegation. */ - template< typename U > - RC exec( void ( *grb_program )( const void *, const size_t, U & ), - const void * const data_in, const size_t in_size, - U &data_out, - const size_t inner, const size_t outer, - const bool broadcast = false - ) const { - return benchmarker.exec( - grb_program, - data_in, in_size, - data_out, - inner, outer, - broadcast - ); - } - - /** \internal Simple delegation. */ - template< typename T, typename U > - RC exec( - void ( *grb_program )( const T &, U & ), - const T &data_in, U &data_out, - const size_t inner, const size_t outer, - const bool broadcast = false - ) { - return benchmarker.exec( - grb_program, - data_in, data_out, - inner, outer, - broadcast - ); - } + /** + * \internal Delegates to #grb::Benchmarker constructor. By default, this + * reverts to the reference backend. + */ + using MyBenchmarkerType::Benchmarker; + + /** + * \internal Delegates to #grb::Benchmarker finalize. By default, this + * reverts to the reference backend. + */ + using MyBenchmarkerType::finalize; }; diff --git a/include/graphblas/hyperdags/exec.hpp b/include/graphblas/hyperdags/exec.hpp index 376e78b5b..14001e4be 100644 --- a/include/graphblas/hyperdags/exec.hpp +++ b/include/graphblas/hyperdags/exec.hpp @@ -37,64 +37,28 @@ namespace grb { * No implementation notes. */ template< EXEC_MODE mode > - class Launcher< mode, hyperdags > { + class Launcher< mode, hyperdags > : + public Launcher< mode, _GRB_WITH_HYPERDAGS_USING > + { private: - /** - * Rely on underlying backend. - */ typedef Launcher< mode, _GRB_WITH_HYPERDAGS_USING > MyLauncherType; - /** - * Instantiate the sub-backend. - */ - MyLauncherType launcher; - public: /** - * Default constructor. - * - * Simply calls that of the underlying constructor. - */ - Launcher( - const size_t process_id = 0, const size_t nprocs = 1, - const std::string hostname = "localhost", - const std::string port = "0" - ) : launcher( process_id, nprocs, hostname, port ) {} - - /** - * Variable input-size execution. - * - * Simply calls underlying launcher. + * \internal Delegates to #grb::Launcher (reference) constructor. By + * default, this reverts to the reference backend. */ - template< typename U > - RC exec( - void ( *grb_program )( const void *, const size_t, U & ), - const void * data_in, - const size_t in_size, - U &data_out, - const bool broadcast = false - ) { - return launcher.exec( grb_program, data_in, in_size, data_out, broadcast ); - } + using MyLauncherType::Launcher; /** - * Fixed-size execution. - * - * Simply calls underlying launcher. + * \internal Delegates to #grb::Launcher finalize. By default, this reverts + * to the reference backend. 
*/ - template< typename T, typename U > - RC exec( - void ( *grb_program )( const T &, U & ), - const T &data_in, - U &data_out, - const bool broadcast = false - ) { - return launcher.exec( grb_program, data_in, data_out, broadcast ); - } + using MyLauncherType::finalize; }; diff --git a/include/graphblas/nonblocking/benchmark.hpp b/include/graphblas/nonblocking/benchmark.hpp index 8b62cb016..627002f02 100644 --- a/include/graphblas/nonblocking/benchmark.hpp +++ b/include/graphblas/nonblocking/benchmark.hpp @@ -27,10 +27,8 @@ #ifndef _H_GRB_NONBLOCKING_BENCH #define _H_GRB_NONBLOCKING_BENCH -#include #include - -#include "exec.hpp" +#include namespace grb { @@ -41,51 +39,15 @@ namespace grb { * \internal The public API simply wraps the reference Benchmarker. */ template< enum EXEC_MODE mode > - class Benchmarker< mode, nonblocking > { - - private: - - /** \internal Reuse reference benchmarker. */ - Benchmarker< mode, reference > ref; - + class Benchmarker< mode, nonblocking >: public Benchmarker< mode, reference > { public: - /** \internal Mirror reference constructor. */ - Benchmarker( - size_t process_id = 0, - size_t nprocs = 1, - std::string hostname = "localhost", - std::string port = "0" - ) : - ref(process_id, nprocs, hostname, port) - {} - - /** \internal Mirror reference exec. */ - template< typename U > - RC exec( - void ( *grb_program )( const void *, const size_t, U & ), - const void * data_in, const size_t in_size, - U &data_out, - const size_t inner, const size_t outer, - const bool broadcast = false - ) const { - return ref.exec( - grb_program, data_in, in_size, data_out, inner, outer, broadcast - ); - } + /** \internal Delegates to #grb::Benchmarker (reference) constructor. */ + using Benchmarker< mode, reference >::Benchmarker; - /** \internal Mirror reference exec. */ - template< typename T, typename U > - RC exec( - void ( *grb_program )( const T &, U & ), - const T &data_in, U &data_out, - const size_t inner, - const size_t outer, - const bool broadcast = false - ) { - return ref.exec( grb_program, data_in, data_out, inner, outer, broadcast ); - } + /** \internal Delegates to #grb::Benchmarker (reference) finalize. */ + using Benchmarker< mode, reference >::finalize; }; diff --git a/include/graphblas/nonblocking/exec.hpp b/include/graphblas/nonblocking/exec.hpp index 09f679526..80f46f79d 100644 --- a/include/graphblas/nonblocking/exec.hpp +++ b/include/graphblas/nonblocking/exec.hpp @@ -28,74 +28,23 @@ #define _H_GRB_NONBLOCKING_EXEC #include -#include - -#include "init.hpp" +#include namespace grb { /** The Launcher class is based on that of the reference backend */ template< EXEC_MODE mode > - class Launcher< mode, nonblocking > { - - private: - - Launcher< mode, reference > ref; + class Launcher< mode, nonblocking >: public Launcher< mode, reference > { public: - /** - * This implementation only accepts a single user process. It ignores - * \a hostname and \a port. - */ - Launcher( - const size_t process_id = 0, - const size_t nprocs = 1, - const std::string hostname = "localhost", - const std::string port = "0" - ) { - // ignore hostname and port - (void) hostname; - (void) port; - // sanity checks - if( nprocs != 1 ) { - throw std::invalid_argument( "Total number of user processes must be " - "exactly one when using the nonblocking implementation." - ); - } - if( process_id != 0 ) { - throw std::invalid_argument( "Process ID must always be zero in the " - "nonblocking implementation." - ); - } - } - - /** No implementation notes. 
*/ - ~Launcher() {} - - /** exec is based on that of the reference backend */ - template< typename U > - RC exec( - void ( *grb_program )( const void *, const size_t, U & ), - const void * data_in, const size_t in_size, - U &data_out, const bool broadcast = false - ) const { - return ref.exec( grb_program, data_in, in_size, data_out, broadcast ); - } + /** \internal Delegates to #grb::Launcher (reference) constructor. */ + using Launcher< mode, reference >::Launcher; - /** exec is based on that of the reference backend */ - template< typename T, typename U > - RC exec( - void ( *grb_program )( const T &, U & ), - const T &data_in, U &data_out, - const bool broadcast = false - ) { - return ref.exec( grb_program, data_in, data_out, broadcast ); - } + /** \internal Delegates to #grb::Launcher (reference) finalize. */ + using Launcher< mode, reference >::finalize; - /** finalize is based on that of the reference backend */ - grb::RC finalize() { return ref.finalize(); } }; } // namespace grb diff --git a/include/graphblas/nonblocking/io.hpp b/include/graphblas/nonblocking/io.hpp index ff40be8dd..9b6a58782 100644 --- a/include/graphblas/nonblocking/io.hpp +++ b/include/graphblas/nonblocking/io.hpp @@ -28,8 +28,9 @@ #define _H_GRB_NONBLOCKING_IO #include -#include -#include +#include +#include "vector.hpp" +#include "matrix.hpp" #include "lazy_evaluation.hpp" #include "boolean_dispatcher_io.hpp" @@ -1334,7 +1335,7 @@ namespace grb { } template< typename InputType, typename RIT, typename CIT, typename NIT > - RC wait( const Matrix< InputType, nonblocking > &A ) { + RC wait( const Matrix< InputType, nonblocking, RIT, CIT, NIT > &A ) { (void) A; //TODO: currently, matrices are read only and no action is required // once the level-3 primitives are implemented diff --git a/include/graphblas/nonblocking/spmd.hpp b/include/graphblas/nonblocking/spmd.hpp index 126d50f33..0f169593e 100644 --- a/include/graphblas/nonblocking/spmd.hpp +++ b/include/graphblas/nonblocking/spmd.hpp @@ -30,6 +30,7 @@ #include //size_t #include +#include namespace grb { diff --git a/include/graphblas/reference/benchmark.hpp b/include/graphblas/reference/benchmark.hpp index 226500ecf..a6dd4ad1b 100644 --- a/include/graphblas/reference/benchmark.hpp +++ b/include/graphblas/reference/benchmark.hpp @@ -20,7 +20,7 @@ * @date 17th of April, 2017 */ -#if ! defined _H_GRB_REFERENCE_BENCH || defined _H_GRB_REFERENCE_OMP_BENCH +#if !defined _H_GRB_REFERENCE_BENCH || defined _H_GRB_REFERENCE_OMP_BENCH #define _H_GRB_REFERENCE_BENCH #include @@ -28,6 +28,7 @@ #include "exec.hpp" + namespace grb { /** @@ -38,23 +39,18 @@ namespace grb { */ template< enum EXEC_MODE mode > class Benchmarker< mode, reference > : - protected Launcher< mode, reference >, protected internal::BenchmarkerBase + public Launcher< mode, reference >, protected internal::BenchmarkerBase { public: /** \internal Delegates to #grb::Launcher (reference) constructor. */ - Benchmarker( - const size_t process_id = 0, // user process ID - const size_t nprocs = 1, // total number of user processes - std::string hostname = "localhost", // one of the user process hostnames - std::string port = "0" // a free port at hostname - ) : Launcher< mode, reference >( process_id, nprocs, hostname, port ) {} + using Launcher< mode, reference >::Launcher; - /** \internal No implementation notes. */ + /** \internal Use base benchmarker. 
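+		 *
+		 * An illustrative call, in which the program \a myALPProgram as well as
+		 * the buffer \a data and its byte size \a size are hypothetical, could
+		 * read:
+		 *
+		 * \code
+		 * grb::Benchmarker< grb::AUTOMATIC > bench;
+		 * grb::RC out = grb::PANIC;
+		 * const grb::RC rc = bench.exec(
+		 *     &myALPProgram, data, size, out,
+		 *     10, 5, // ten inner and five outer repetitions
+		 *     false  // no broadcast needed for a single user process
+		 * );
+		 * \endcode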
*/ template< typename U > RC exec( - void ( *grb_program )( const void *, const size_t, U & ), + AlpUntypedFunc< U > alp_program, const void * data_in, const size_t in_size, U &data_out, const size_t inner, const size_t outer, @@ -65,48 +61,32 @@ namespace grb { if( in_size > 0 && data_in == nullptr ) { return ILLEGAL; } - // initialise GraphBLAS - RC ret = grb::init(); - - // call graphBLAS algo - if( ret == SUCCESS ) { - benchmark< U >( grb_program, data_in, in_size, data_out, inner, outer, 0 ); - } - // finalise the GraphBLAS - const RC frc = grb::finalize(); - if( ret == SUCCESS ) { - ret = frc; - } - // and done - return ret; + auto fun = [ data_in, in_size, &data_out, alp_program, inner, outer ] { + benchmark< U, reference >( alp_program, data_in, in_size, data_out, inner, + outer, 0 ); + }; + return Launcher< mode, reference >::init_and_run( fun, broadcast ); } /** \internal No implementation notes. */ template< typename T, typename U > RC exec( - void ( *grb_program )( const T &, U & ), // user GraphBLAS program - const T &data_in, U &data_out, // input & output data + AlpTypedFunc< T, U > alp_program, + const T &data_in, U &data_out, const size_t inner, const size_t outer, const bool broadcast = false ) { - (void) broadcast; // value doesn't matter for a single user process - // initialise GraphBLAS - RC ret = grb::init(); - // call graphBLAS algo - if( ret == SUCCESS ) { - // call graphBLAS algo - benchmark< T, U >( grb_program, data_in, data_out, inner, outer, 0 ); - } - // finalise the GraphBLAS - const RC frc = grb::finalize(); - if( ret == SUCCESS ) { - ret = frc; - } - // and done - return ret; + auto fun = [ &data_in, &data_out, alp_program, inner, outer ] { + benchmark< T, U, reference >( alp_program, data_in, data_out, inner, + outer, 0 ); + }; + return Launcher< mode, reference >::init_and_run( fun, broadcast ); } + /** \internal Use reference Launcher finalize */ + using Launcher< mode, reference >::finalize; + }; } // namespace grb diff --git a/include/graphblas/reference/exec.hpp b/include/graphblas/reference/exec.hpp index d5d705f2c..e08463826 100644 --- a/include/graphblas/reference/exec.hpp +++ b/include/graphblas/reference/exec.hpp @@ -37,6 +37,27 @@ namespace grb { template< EXEC_MODE mode > class Launcher< mode, reference > { + protected: + + template< typename Runner > + RC init_and_run( + Runner &runner, + const bool broadcast + ) const { + // value doesn't matter for a single user process + (void) broadcast; + // intialise + RC ret = grb::init(); + // call algo + if( ret == SUCCESS ) { + runner(); + ret = grb::finalize(); + } + // and done + return ret; + } + + public: /** @@ -76,49 +97,36 @@ namespace grb { /** No implementation notes. */ template< typename U > RC exec( - void ( *grb_program )( const void *, const size_t, U & ), - const void * data_in, const size_t in_size, + AlpUntypedFunc< U > alp_program, + const void * const data_in, const size_t in_size, U &data_out, const bool broadcast = false ) const { - // value doesn't matter for a single user process - (void) broadcast; // check input arguments if( in_size > 0 && data_in == nullptr ) { return ILLEGAL; } - // intialise GraphBLAS - RC ret = grb::init(); - // call graphBLAS algo - if( ret == SUCCESS ) { - (*grb_program)( data_in, in_size, data_out ); - ret = grb::finalize(); - } - // and done - return ret; + auto fun = [ data_in, in_size, &data_out, alp_program ] { + (*alp_program)( data_in, in_size, data_out ); + }; + return init_and_run( fun, broadcast ); } /** No implementation notes. 
*/
			template< typename T, typename U >
			RC exec(
-				void ( *grb_program )( const T &, U & ), // user ALP/GraphBLAS program
-				const T &data_in, U &data_out, // input & output data
+				AlpTypedFunc< T, U > alp_program,
+				const T &data_in, U &data_out,
				const bool broadcast = false
			) {
-				(void) broadcast; // value doesn't matter for a single user process
-				// intialise ALP/GraphBLAS
-				RC ret = grb::init();
-				// call graphBLAS algo
-				if( ret == SUCCESS ) {
-					(*grb_program)( data_in, data_out );
-					ret = grb::finalize();
-				}
-				// and done
-				return ret;
+				auto fun = [ &data_in, &data_out, alp_program ] {
+					(*alp_program)( data_in, data_out );
+				};
+				return init_and_run( fun, broadcast );
			}

			/** No implementation notes. */
-			grb::RC finalize() { return grb::SUCCESS; }
+			static grb::RC finalize() { return grb::SUCCESS; }

	};

diff --git a/include/graphblas/reference/io.hpp b/include/graphblas/reference/io.hpp
index 10229d1c6..891b13488 100644
--- a/include/graphblas/reference/io.hpp
+++ b/include/graphblas/reference/io.hpp
@@ -25,8 +25,8 @@

 #include

-#include
-#include
+#include "vector.hpp"
+#include "matrix.hpp"

 #define NO_CAST_ASSERT( x, y, z ) \
 	static_assert( x, \
diff --git a/include/graphblas/reference/pinnedvector.hpp b/include/graphblas/reference/pinnedvector.hpp
index 7a3332ab7..d51b3f59d 100644
--- a/include/graphblas/reference/pinnedvector.hpp
+++ b/include/graphblas/reference/pinnedvector.hpp
@@ -79,8 +79,8 @@ namespace grb {
 		_raw_deleter( x._raw_deleter ), _stack_deleter( x._buffer_deleter ),
 		_buffered_values( x._raw ), _buffered_coordinates( x._coordinates )
 	{
-		(void)mode; // sequential and parallel IO mode are equivalent for this
-		            // implementation.
+		(void) mode; // sequential and parallel IO mode are equivalent for this
+		             // implementation.
 	}

 	// default destructor is OK
diff --git a/include/graphblas/utils/TimerResults.hpp b/include/graphblas/utils/TimerResults.hpp
index 36e50238b..276579716 100644
--- a/include/graphblas/utils/TimerResults.hpp
+++ b/include/graphblas/utils/TimerResults.hpp
@@ -23,12 +23,16 @@
 #ifndef _H_GRB_TIMERRESULTS
 #define _H_GRB_TIMERRESULTS

+
 namespace grb {
+
 	namespace utils {

 		/**
-		 * A structure holding benchmarking results, with initial io, a preamble time for setup,
-		 * a useful time for actual processing, and a postamble time for cleaning up
+		 * A structure holding benchmark timing results.
+		 *
+		 * It keeps track of initial io, a preamble time for setup, a useful time for
+		 * actual processing, and a postamble time for cleaning up.
		 */
		struct TimerResults {
			double io;
@@ -41,7 +45,7 @@ namespace grb {
				useful = val;
				postamble = val;
			}
-			void accum( TimerResults & times ) {
+			void accum( TimerResults &times ) {
				io += times.io;
				preamble += times.preamble;
				useful += times.useful;
@@ -54,13 +58,13 @@ namespace grb {
				useful /= loops;
				postamble /= loops;
			}
-			void min( const TimerResults & times ) noexcept {
+			void min( const TimerResults &times ) noexcept {
				io = ( times.io < io ) ? times.io : io;
				preamble = ( times.preamble < preamble ) ? times.preamble : preamble;
				useful = ( times.useful < useful ) ? times.useful : useful;
				postamble = ( times.postamble < postamble ) ? times.postamble : postamble;
			}
-			void max( const TimerResults & times ) noexcept {
+			void max( const TimerResults &times ) noexcept {
				io = ( times.io > io ) ? times.io : io;
				preamble = ( times.preamble > preamble ) ? times.preamble : preamble;
				useful = ( times.useful > useful ) ?
times.useful : useful; @@ -69,5 +73,8 @@ namespace grb { }; } // namespace utils + } // namespace grb + #endif // ``_H_GRB_TIMERRESULTS'' + diff --git a/src/graphblas/CMakeLists.txt b/src/graphblas/CMakeLists.txt index a562c2550..5668f5869 100644 --- a/src/graphblas/CMakeLists.txt +++ b/src/graphblas/CMakeLists.txt @@ -73,9 +73,10 @@ set( backend_reference_srcs ${CMAKE_CURRENT_SOURCE_DIR}/rc.cpp ) -# the only source file common to all BSP-based backends +# source files common to all BSP-based backends set( backend_bsp_srcs ${CMAKE_CURRENT_SOURCE_DIR}/bsp/collectives.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/bsp/exec_broadcast_routines.cpp ) # include only selected backends diff --git a/src/graphblas/bsp/exec_broadcast_routines.cpp b/src/graphblas/bsp/exec_broadcast_routines.cpp new file mode 100644 index 000000000..62cf84885 --- /dev/null +++ b/src/graphblas/bsp/exec_broadcast_routines.cpp @@ -0,0 +1,76 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @author A. N. Yzelman + * @date 17th of April, 2017 + */ + +#include "graphblas/bsp/exec_broadcast_routines.hpp" + +#include +#include + +#include + +#include +#include + + +bool grb::internal::grb_mpi_initialized = false; + +lpf_err_t grb::internal::lpf_init_collectives_for_broadcast( + lpf_t &ctx, + const lpf_pid_t s, const lpf_pid_t P, const size_t max_regs, + lpf_coll_t &coll +) { + assert( max_regs >= 2 ); + lpf_err_t brc = lpf_resize_memory_register( ctx, max_regs ); + assert( brc == LPF_SUCCESS ); + // lpf_collectives_init needs at least one slot, and if this call is followed + // by lpf_register_and_broadcast (as is intended), then at least one more slot + // is needed. + brc = lpf_collectives_init( ctx, s, P, 0, 0, 0, &coll ); + assert( brc == LPF_SUCCESS ); + // required messages follows LPF collectives user manual + const size_t nmsgs = P > 1 ? std::max( P + 1, 2 * P - 3 ) : P + 1; + brc = lpf_resize_message_queue( ctx, nmsgs ); + assert( brc == LPF_SUCCESS ); + brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); + assert( brc == LPF_SUCCESS ); + return brc; +} + +lpf_err_t grb::internal::lpf_register_and_broadcast( + lpf_t &ctx, lpf_coll_t &coll, + void * data, size_t size +) { + lpf_memslot_t global; + lpf_err_t brc = lpf_register_global( ctx, data, size, &global ); + assert( brc == LPF_SUCCESS ); + // TODO FIXME: double sync for registrations on launcher::exec necessary? 
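+	// The remaining sequence is: (1) synchronise so that the registration of
+	// the global slot becomes visible on all processes, (2) broadcast `size'
+	// bytes from user process 0 into every process' `data' buffer, (3)
+	// synchronise again to complete the broadcast, and (4) release the
+	// temporary memory slot.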
+ brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); + assert( brc == LPF_SUCCESS ); + brc = lpf_broadcast( coll, global, global, size, 0 ); + assert( brc == LPF_SUCCESS ); + brc = lpf_sync( ctx, LPF_SYNC_DEFAULT ); + assert( brc == LPF_SUCCESS ); + brc = lpf_deregister( ctx, global ); + assert( brc == LPF_SUCCESS ); + return brc; +} + diff --git a/src/graphblas/bsp1d/CMakeLists.txt b/src/graphblas/bsp1d/CMakeLists.txt index 0e62e623c..65b09b24d 100644 --- a/src/graphblas/bsp1d/CMakeLists.txt +++ b/src/graphblas/bsp1d/CMakeLists.txt @@ -75,7 +75,6 @@ endmacro( make_bsp1d_target ) set( backend_bsp1d_srcs "${backend_reference_srcs}" "${backend_bsp_srcs}" - ${CMAKE_CURRENT_SOURCE_DIR}/exec.cpp ${CMAKE_CURRENT_SOURCE_DIR}/init.cpp ${CMAKE_CURRENT_SOURCE_DIR}/config.cpp ${CMAKE_CURRENT_SOURCE_DIR}/io.cpp diff --git a/src/graphblas/bsp1d/exec.cpp b/src/graphblas/bsp1d/exec.cpp deleted file mode 100644 index f156e9dad..000000000 --- a/src/graphblas/bsp1d/exec.cpp +++ /dev/null @@ -1,29 +0,0 @@ - -/* - * Copyright 2021 Huawei Technologies Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * @author A. N. Yzelman - * @date 17th of April, 2017 - */ - -#include - -#ifdef _GRB_MANUAL -const int LPF_MPIRMA_AUTO_INITIALIZE = 0; -#endif - -bool _grb_mpi_initialized = false; diff --git a/src/graphblas/nonblocking/io.cpp b/src/graphblas/nonblocking/io.cpp index 4c7e86885..c2e28980a 100644 --- a/src/graphblas/nonblocking/io.cpp +++ b/src/graphblas/nonblocking/io.cpp @@ -24,8 +24,7 @@ * @date 16th of May, 2022 */ -#include - +#include #include diff --git a/src/graphblas/reference/io.cpp b/src/graphblas/reference/io.cpp index 6d4be2d93..b0e5d073f 100644 --- a/src/graphblas/reference/io.cpp +++ b/src/graphblas/reference/io.cpp @@ -20,7 +20,7 @@ * @date 29th of March, 2022 */ -#include +#include namespace grb { diff --git a/tests/smoke/label_test.cpp b/tests/smoke/label_test.cpp index 246b31e53..3b9025e22 100644 --- a/tests/smoke/label_test.cpp +++ b/tests/smoke/label_test.cpp @@ -179,7 +179,7 @@ int main( int argc, char ** argv ) { grb::Launcher< AUTOMATIC > launcher; - enum grb::RC rc = launcher.exec( &grbProgram, in, out ); + enum grb::RC rc = launcher.exec( &grbProgram, in, out, true ); if( rc != SUCCESS ) { std::cerr << "launcher.exec returns with non-SUCCESS error code " << toString(rc) << std::endl; @@ -190,10 +190,11 @@ int main( int argc, char ** argv ) { // done if( out.error_code != SUCCESS ) { - std::cout << "Test FAILED\n\n"; + std::cerr << std::flush; + std::cout << "Test FAILED\n" << std::endl; return 1; } - std::cout << "Test OK\n\n"; + std::cout << "Test OK\n" << std::endl; return 0; } diff --git a/tests/smoke/simple_pagerank_from_mpi.cpp b/tests/smoke/simple_pagerank_from_mpi.cpp index 9788c36d1..e9560ddd2 100644 --- a/tests/smoke/simple_pagerank_from_mpi.cpp +++ b/tests/smoke/simple_pagerank_from_mpi.cpp @@ -56,7 +56,7 @@ struct output_vector { grb::utils::TimerResults times; }; -void grbProgram( const input_matrix & A, struct output_vector & out ) { +void grbProgram( const 
input_matrix &A, struct output_vector &out ) { // assume successful run out.error_code = 0; @@ -147,7 +147,7 @@ int main( int argc, char ** argv ) { } // create more convenient view of in_size - const struct input_matrix & A = *reinterpret_cast< struct input_matrix * >( data_in ); + const struct input_matrix &A = *reinterpret_cast< struct input_matrix * >( data_in ); // output vector struct output_vector pr; @@ -157,6 +157,7 @@ int main( int argc, char ** argv ) { grb::Launcher< FROM_MPI > launcher( MPI_COMM_WORLD ); + // note: this exec passes pointers within a single process const enum grb::RC rc = launcher.exec( &grbProgram, A, pr ); if( rc != SUCCESS ) { std::cerr << "grb::Launcher< FROM_MPI >::exec returns with " diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 5ac228625..1fb38f7fb 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -73,6 +73,10 @@ add_grb_executables( id id.cpp BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking ) +add_grb_executables( id_distributed id_distributed.cpp + BACKENDS bsp1d hybrid +) + add_grb_executables( dot dot.cpp BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking ) @@ -236,8 +240,11 @@ add_grb_executables( eWiseApply_matrix eWiseApply_matrix.cpp BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking ) +# in the below, test_utils_headers is retained in case CMake is configured to +# include _DEBUG flags add_grb_executables( eWiseApplyMatrixReference eWiseApplyMatrixReference.cpp BACKENDS reference reference_omp hyperdags nonblocking + ADDITIONAL_LINK_LIBRARIES test_utils_headers ) add_grb_executables( outer outer.cpp @@ -288,6 +295,25 @@ add_grb_executables( matrix_type static_asserts/matrix.cpp BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking ) +add_grb_executables( launch_benchmark_auto launcherAndBenchmarker.cpp + BACKENDS bsp1d hybrid + COMPILE_DEFINITIONS DISTRIBUTED_EXECUTION +) + +add_grb_executables( launch_benchmark_auto launcherAndBenchmarker.cpp + BACKENDS reference reference_omp hyperdags nonblocking +) + +add_grb_executables( launch_benchmark_frommpi_manual launcherAndBenchmarker.cpp + BACKENDS bsp1d hybrid + COMPILE_DEFINITIONS DISTRIBUTED_EXECUTION NO_LPF_AUTO_INIT +) + +add_grb_executables( launch_benchmark_frommpi_manual launcherAndBenchmarker.cpp + BACKENDS reference reference_omp hyperdags nonblocking + COMPILE_DEFINITIONS NO_LPF_AUTO_INIT +) + # targets to list and build the test for this category get_property( unit_tests_list GLOBAL PROPERTY tests_category_unit ) add_custom_target( "list_tests_category_unit" diff --git a/tests/unit/auto_launcher.cpp b/tests/unit/auto_launcher.cpp index b2686a957..d5e574acb 100644 --- a/tests/unit/auto_launcher.cpp +++ b/tests/unit/auto_launcher.cpp @@ -54,13 +54,14 @@ int main( int argc, char ** argv ) { grb::Launcher< grb::AUTOMATIC > launcher; // run - if( launcher.exec( &grbProgram, P, exit_status ) != grb::SUCCESS ) { + if( launcher.exec( &grbProgram, P, exit_status, true ) != grb::SUCCESS ) { std::cout << "Test FAILED (launcher did not return SUCCESS).\n" << std::endl; return 200; } // master process reports test success if( exit_status ) { + std::cerr << std::flush; std::cout << "Test FAILED (exit code " << exit_status << ").\n" << std::endl; } else { std::cout << "Test OK\n" << std::endl; diff --git a/tests/unit/buildVector.cpp b/tests/unit/buildVector.cpp index aea50b842..d056af3d5 100644 --- a/tests/unit/buildVector.cpp +++ b/tests/unit/buildVector.cpp @@ -297,6 +297,7 @@ int 
main( int argc, char ** argv ) { if( error == 0 ) { std::cout << "Test OK" << std::endl; } else { + std::cerr << std::flush; std::cout << "Test FAILED" << std::endl; } diff --git a/tests/unit/eWiseApplyMatrixReference.cpp b/tests/unit/eWiseApplyMatrixReference.cpp index 6d675aa97..1a1982b5d 100644 --- a/tests/unit/eWiseApplyMatrixReference.cpp +++ b/tests/unit/eWiseApplyMatrixReference.cpp @@ -125,15 +125,24 @@ void checkCRSandCCS( const auto & crsExpected = internal::getCRS( expected ); for( size_t i = 0; i < nrows( obtained ); ++i ) { for( size_t k = crsObtained.col_start[ i ]; k < crsObtained.col_start[ i + 1 ]; ++k ) { - if( crsObtained.row_index[ k ] != crsExpected.row_index[ k ] ) { - std::cerr << "Error: unexpected entry at ( " << i << ", " << crsObtained.row_index[ k ] << " ), " - << "expected one at ( " << i << ", " << crsExpected.row_index[ k ] << " ) " - << "instead (CRS).\n"; + const auto nValuesInRow = crsObtained.col_start[ i + 1 ] - crsObtained.col_start[ i ]; + const auto expectedValuesInRow = crsExpected.col_start[ i + 1 ] - crsExpected.col_start[ i ]; + if( nValuesInRow != expectedValuesInRow ) { + std::cerr << "Error: unexpected number of non-zero entries in row " << i << "; " + << "expected " << expectedValuesInRow << ", " + << "obtained " << nValuesInRow << " (CRS).\n"; rc = FAILED; } - if( crsObtained.values[ k ] != crsExpected.values[ k ] ) { - std::cerr << "Error: unexpected value " << crsObtained.values[ k ] << "; " - << "expected " << crsExpected.values[ k ] << " (CRS).\n"; + const auto searchedJ = crsObtained.row_index[ k ]; + const auto searchedV = crsObtained.values[ k ]; + bool found = false; + for( size_t l = crsExpected.col_start[ i ]; l < crsExpected.col_start[ i + 1 ]; ++l ) { + found |= ( crsExpected.row_index[ l ] == searchedJ ) && ( crsExpected.values[ l ] == searchedV ); + } + if( !found ) { + std::cerr << "Error: Can not found entry " + << "( " << i << ", " << searchedJ << " ) = " + << searchedV << " (CRS).\n"; rc = FAILED; } } @@ -143,18 +152,26 @@ void checkCRSandCCS( { // check CCS output const auto & ccsObtained = internal::getCCS( obtained ); const auto & ccsExpected = internal::getCCS( expected ); - for( size_t j = 0; j < ncols( obtained ); ++j ) { - for( size_t k = ccsExpected.col_start[ j ]; k < ccsExpected.col_start[ j + 1 ]; ++k ) { - if( ccsObtained.row_index[ k ] != ccsExpected.row_index[ k ] ) { - std::cerr << "Error: unexpected entry at " - << "( " << ccsObtained.row_index[ k ] << ", " << j << " ), " - << "expected one at ( " << ccsExpected.row_index[ k ] << ", " << j << " ) " - << "instead (CCS).\n"; + for( size_t i = 0; i < ncols( obtained ); ++i ) { + for( size_t k = ccsExpected.col_start[ i ]; k < ccsExpected.col_start[ i + 1 ]; ++k ) { + const auto nValuesInRow = ccsObtained.col_start[ i + 1 ] - ccsObtained.col_start[ i ]; + const auto expectedValuesInRow = ccsExpected.col_start[ i + 1 ] - ccsExpected.col_start[ i ]; + if( nValuesInRow != expectedValuesInRow ) { + std::cerr << "Error: unexpected number of non-zero entries in row " << i << "; " + << "expected " << expectedValuesInRow << ", " + << "obtained " << nValuesInRow << " (CCS).\n"; rc = FAILED; } - if( ccsObtained.values[ k ] != ccsExpected.values[ k ] ) { - std::cerr << "Error: unexpected value " << ccsObtained.values[ k ] << "; " - << "expected " << ccsExpected.values[ k ] << " (CCS).\n"; + const auto searchedJ = ccsObtained.row_index[ k ]; + const auto searchedV = ccsObtained.values[ k ]; + bool found = false; + for( size_t l = ccsExpected.col_start[ i ]; l < 
ccsExpected.col_start[ i + 1 ]; ++l ) { + found |= ( ccsExpected.row_index[ l ] == searchedJ ) && ( ccsExpected.values[ l ] == searchedV ); + } + if( !found ) { + std::cerr << "Error: Can not found entry " + << "( " << i << ", " << searchedJ << " ) = " + << searchedV << " (CCS).\n"; rc = FAILED; } } diff --git a/tests/unit/id_distributed.cpp b/tests/unit/id_distributed.cpp new file mode 100644 index 000000000..9a7a6a8d9 --- /dev/null +++ b/tests/unit/id_distributed.cpp @@ -0,0 +1,307 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + + +struct input { + bool check; + std::array< size_t, 3 > values; +}; + +struct output { + grb::RC rc; + std::array< size_t, 3 > IDs; +}; + +// test grb::getID on vectors +void grb_program1( const struct input &in, struct output &out ) { + grb::RC &rc = out.rc; + assert( rc == grb::SUCCESS ); + if( grb::spmd<>::pid() == 0 ) { + if( in.check ) { + std::cerr << "\t in vector check, phase 4/4\n"; + } else { + std::cerr << "\t in initial vector test, phase 1/4\n"; + } + } + + grb::Vector< std::pair< int, float > > one( 1000000 ); + grb::Vector< size_t > two( 5000000 ); + const size_t oneLocalID = grb::getID( grb::internal::getLocal( one ) ); + out.IDs[ 0 ] = oneLocalID; + const size_t twoLocalID = grb::getID( grb::internal::getLocal( two ) ); + out.IDs[ 1 ] = twoLocalID; + if( oneLocalID == twoLocalID ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on different containers result in the " + << "same ID (I)\n"; + rc = grb::FAILED; + return; + } + if( oneLocalID != grb::getID( grb::internal::getLocal( one ) ) ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on the same container produce different " + << "IDs (I)\n"; + rc = grb::FAILED; + return; + } + if( twoLocalID != grb::getID( grb::internal::getLocal( two ) ) ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on the same container produce different " + << "IDs (II)\n"; + rc = grb::FAILED; + return; + } + + if( in.check ) { + if( oneLocalID != in.values[ 0 ] ) { + std::cerr << "\t container ID is not consistent with previous run (IV)\n"; + rc = grb::FAILED; + return; + } + if( twoLocalID != in.values[ 1 ] ) { + std::cerr << "\t container ID is not consistent with previous run (V)\n"; + rc = grb::FAILED; + return; + } + } + + grb::Vector< size_t > three( two ); + const size_t threeLocalID = grb::getID( grb::internal::getLocal( three ) ); + out.IDs[ 2 ] = threeLocalID; + if( threeLocalID != grb::getID( grb::internal::getLocal( three ) ) ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on the same container produce different " + << "IDs (III): " << threeLocalID << " vs. 
" << grb::getID( grb::internal::getLocal( three ) ) << "\n"; + rc = grb::FAILED; + return; + } + if( oneLocalID == threeLocalID ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on different containers result in the " + << "same ID (II)\n"; + rc = grb::FAILED; + return; + } + if( twoLocalID == threeLocalID ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on different containers result in the " + << "same ID (III)\n"; + rc = grb::FAILED; + return; + } + + if( in.check ) { + if( threeLocalID != in.values[ 2 ] ) { + std::cerr << "\t container ID is not consistent with previous run (VI): " + << threeLocalID << " vs. " << in.values[ 2 ] << "\n"; + rc = grb::FAILED; + return; + } + } + + std::swap( two, three ); + if( twoLocalID != grb::getID( grb::internal::getLocal( three ) ) ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on the same container after an std::swap " + << "produce different IDs (I)\n"; + rc = grb::FAILED; + return; + } + if( threeLocalID != grb::getID( grb::internal::getLocal( two ) ) ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on the same container after an std::swap " + << "produce different IDs (II)\n"; + rc = grb::FAILED; + return; + } +} + +// test grb::getID on matrices +void grb_program2( const struct input &in, struct output &out ) { + grb::RC &rc = out.rc; + assert( rc == grb::SUCCESS ); + if( grb::spmd<>::pid() == 0 ) { + if( in.check ) { + std::cerr << "\t in matrix check, phase 4/4\n"; + } else { + std::cerr << "\t in initial matrix test, phase 1/4\n"; + } + } + + grb::Matrix< std::pair< int, float > > one( 1000000, 100000 ); + grb::Matrix< size_t > two( 5000000, 100000 ); + const size_t oneLocalID = grb::getID( grb::internal::getLocal( one ) ); + out.IDs[ 0 ] = oneLocalID; + const size_t twoLocalID = grb::getID( grb::internal::getLocal( two ) ); + out.IDs[ 1 ] = twoLocalID; + if( oneLocalID == twoLocalID ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on different containers result in the " + << "same ID (I)\n"; + rc = grb::FAILED; + return; + } + if( oneLocalID != grb::getID( grb::internal::getLocal( one ) ) ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on the same container produce different " + << "IDs (I)\n"; + rc = grb::FAILED; + return; + } + if( twoLocalID != grb::getID( grb::internal::getLocal( two ) ) ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on the same container produce different " + << "IDs (II)\n"; + rc = grb::FAILED; + return; + } + + if( in.check ) { + if( oneLocalID != in.values[ 0 ] ) { + std::cerr << "\t container ID is not consistent with previous run (IV)\n"; + rc = grb::FAILED; + return; + } + if( twoLocalID != in.values[ 1 ] ) { + std::cerr << "\t container ID is not consistent with previous run (V)\n"; + rc = grb::FAILED; + return; + } + } + + grb::Matrix< size_t > three( two ); + const size_t threeLocalID = grb::getID( grb::internal::getLocal( three ) ); + out.IDs[ 2 ] = threeLocalID; + if( threeLocalID != grb::getID( grb::internal::getLocal( three ) ) ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on the same container produce different " + << "IDs (III): " << threeLocalID << " vs. 
" << grb::getID( grb::internal::getLocal( three ) ) << "\n"; + rc = grb::FAILED; + return; + } + if( oneLocalID == threeLocalID ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on different containers result in the " + << "same ID (II)\n"; + rc = grb::FAILED; + return; + } + if( twoLocalID == threeLocalID ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on different containers result in the " + << "same ID (III)\n"; + rc = grb::FAILED; + return; + } + + if( in.check ) { + if( threeLocalID != in.values[ 2 ] ) { + std::cerr << "\t container ID is not consistent with previous run (VI): " + << threeLocalID << " vs. " << in.values[ 2 ] << "\n"; + rc = grb::FAILED; + return; + } + } + + std::swap( two, three ); + if( twoLocalID != grb::getID( grb::internal::getLocal( three ) ) ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on the same container after an std::swap " + << "produce different IDs (I)\n"; + rc = grb::FAILED; + return; + } + if( threeLocalID != grb::getID( grb::internal::getLocal( two ) ) ) { + std::cerr << "\t two calls to getID(getLocal(mat)) on the same container after an std::swap " + << "produce different IDs (II)\n"; + rc = grb::FAILED; + return; + } +} + +// NOTE: +// the spec does not promise anything when called on empty containers such as +// grb::Vector< T > empty_vector( 0 ) or grb::Matrix< T > empty_matrix( 0 ), +// therefore we cannot unit test the behaviour of grb::getID on such +// containers. + +int main( int argc, char ** argv ) { + // defaults + bool printUsage = false; + + // error checking + if( argc != 1 ) { + printUsage = true; + } + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 1; + } + + std::cout << "This is functional test " << argv[ 0 ] << "\n"; + grb::Launcher< grb::AUTOMATIC > launcher; + struct input in_vector{ false, {0,0,0} }; + struct input in_matrix{ false, {0,0,0} }; + struct output out; + out.rc = grb::SUCCESS; + in_vector.check = in_matrix.check = false; + + if( launcher.exec( &grb_program1, in_vector, out, true ) != grb::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Test 1 FAILED (launcher error)" << std::endl; + return 255; + } + if( out.rc != grb::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Test 1 FAILED (" << grb::toString( out.rc ) << ")" << std::endl; + return 255; + } + std::copy( out.IDs.begin(), out.IDs.end(), in_vector.values.begin() ); + + assert( out.rc == grb::SUCCESS ); + if( launcher.exec( &grb_program2, in_matrix, out, true ) != grb::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Test 2 FAILED (launcher error)" << std::endl; + return 255; + } + if( out.rc != grb::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Test 2 FAILED (" << grb::toString( out.rc ) << ")" << std::endl; + return 255; + } + std::copy( out.IDs.begin(), out.IDs.end(), in_matrix.values.begin() ); + + in_matrix.check = true; + assert( out.rc == grb::SUCCESS ); + if( launcher.exec( &grb_program2, in_matrix, out, true ) != grb::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Test 3 FAILED (launcher error)" << std::endl; + return 255; + } + if( out.rc != grb::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Test 3 FAILED (" << grb::toString( out.rc ) << ")" << std::endl; + return 255; + } + + in_vector.check = true; + assert( out.rc == grb::SUCCESS ); + if( launcher.exec( &grb_program1, in_vector, out, true ) != grb::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Test 4 FAILED (launcher error)" << std::endl; + return 255; + } + if( out.rc != grb::SUCCESS ) { + 
std::cerr << std::flush; + std::cout << "Test 4 FAILED (" << grb::toString( out.rc ) << ")" << std::endl; + return 255; + } + + std::cout << "Test OK" << std::endl; + return 0; +} + diff --git a/tests/unit/launcherAndBenchmarker.cpp b/tests/unit/launcherAndBenchmarker.cpp new file mode 100644 index 000000000..17a4990a1 --- /dev/null +++ b/tests/unit/launcherAndBenchmarker.cpp @@ -0,0 +1,680 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Tests the grb::Launcher abstraction. + * + * @author Alberto Scolari + * @date August 2023 + */ + + +#include +#include +#include +#include +#include +#include + +#include +#include +#ifdef DISTRIBUTED_EXECUTION + #include +#endif + +#include +#include + + +#ifdef NO_LPF_AUTO_INIT + const int LPF_MPI_AUTO_INITIALIZE = 0; +#endif + +constexpr size_t STR_LEN = 1024; + +static const char prelude[ STR_LEN + 1 ] = "O Earth O Earth return!\n" + "Arise from out the dewy grass;"; + +static const char truth[ STR_LEN + 1 ] = "Night is worn,\n" + "and the morn\n" + "rises from the slumberous mass."; + +static const char default_str[ STR_LEN + 1 ] = "Hear the voice of the Bard!\n" + "Who Present, Past, and Future, sees;"; + +struct input { + char str[ STR_LEN + 1 ]; + + input() { + (void) strncpy( str, default_str, STR_LEN + 1 ); + } +}; + +// same as input, just not default-constructible for a testing scenarion +struct nd_input : input { + + nd_input() = delete; // make this non default-constructible + + nd_input( const char * _str ) { + (void) strncpy( this->str, _str, STR_LEN + 1 ); + } +}; + +bool operator==( const struct input &obj, const char * ext ) { + return strnlen( obj.str, STR_LEN + 1 ) == strnlen( ext, STR_LEN + 1 ) && + strncmp( obj.str, ext, STR_LEN + 1 ) == 0; +} + +bool operator==( const char * ext, const struct input &obj ) { + return obj == ext; +} + +struct output { + int exit_code; + size_t P; + grb::utils::TimerResults times; +}; + +template< grb::EXEC_MODE mode, bool broadcasted, typename InputT > +void grbProgram( const InputT &in, struct output &out ) { + static_assert( std::is_base_of< input, InputT >::value ); + out.times.preamble = 2.0; + out.times.useful = 2.0; + out.times.io = out.times.postamble = 2.0; + out.times.postamble = 2.0; + + const size_t P = grb::spmd<>::nprocs(); + const size_t s = grb::spmd<>::pid(); + out.P = P; + + const char * expected = nullptr; + + if( broadcasted ) { + // independently from mode is or process id, every process must have the same + // string + expected = truth; + } else { + // in non-broadcasting mode, what a process has depends on its rank and the + // launcher mode. + switch (mode) { + case grb::AUTOMATIC: + // here, only the master process can have the "new" string + // while the other processes have the "default" string + expected = s == 0 ? 
truth : default_str; + break; + case grb::FROM_MPI: + case grb::MANUAL: + // the master must have the new string, while other processes the prelude + expected = s == 0 ? truth : prelude; + break; + default: + out.exit_code = 1; + printf( "- ERROR: unknown mode %d\n", mode ); + return; + break; + } + } + out.exit_code = in == expected ? 0 : 1; + + std::cout << "--- PID " << s << " of " << P << ": "; + if( out.exit_code == 0 ) { + std::cout << "MATCH\n"; + } else { + std::cout << "ERROR! Input string\n\"" << in.str + << "\"\n!= Expected string\n\"" << expected << "\"\n"; + } +} + +template< grb::EXEC_MODE mode, bool broadcasted, typename InputT > +void vgrbProgram( + const void * const __in, const size_t size, + struct output &out +) { + if( size != STR_LEN + 1 ) { + const size_t P = grb::spmd<>::nprocs(); + const size_t s = grb::spmd<>::pid(); + out.P = P; + std::cout << "--- PID " << s << " of " << P << ": " + << "ERROR! Input size " << size << " !- expected " << (STR_LEN+1) << "\n"; + return; + } + const struct input &in = *reinterpret_cast< const struct input *>( __in ); + grbProgram< mode, broadcasted, InputT >( in, out ); +} + +void autoVgrbProgram( + const void * const __in, const size_t size, + struct output &out +) { + const size_t P = grb::spmd<>::nprocs(); + const size_t s = grb::spmd<>::pid(); + out.P = P; + if( s == 0 ) { + const input &in = *static_cast< const input * >( __in ); + out.exit_code = size == sizeof( input ) && + in == truth ? 0 : 1; + std::cout << "--- PID " << s << " of " << P << ": "; + if( out.exit_code == 0 ) { + std::cout << "MATCH\n"; + } else { + std::cout << "ERROR! Input size is " << size << ", " + << "string\n\"" << in.str << "\"\n!= " + << "expected\n\"" << truth << "\"\n"; + } + } else { + out.exit_code = __in == nullptr && size == 0 ? 0 : 1; + std::cout << "--- PID " << s << " of " << P << ": "; + if( out.exit_code == 0 ) { + std::cout << "MATCH, got expected values (nullptr and 0)\n"; + } else { + std::cout << "ERROR! 
Got " << __in << " != nullptr and " << size + << " != 0\n"; + } + } +} + +template< grb::EXEC_MODE mode, bool broadcasted, typename InputT > +struct caller { + static constexpr grb::AlpTypedFunc< InputT, output > fun = + grbProgram< mode, broadcasted, InputT >; +}; + +template< grb::EXEC_MODE mode, bool broadcasted, typename InputT > +struct vcaller { + static constexpr grb::AlpUntypedFunc< output > fun = + vgrbProgram< mode, broadcasted, input >; +}; + +template< typename InputT > +struct vcaller< grb::AUTOMATIC, false, InputT > { + static constexpr grb::AlpUntypedFunc< output > fun = autoVgrbProgram; +}; + +template< typename InputT > +class Runner { + + public: + + virtual grb::RC launch_typed( + grb::AlpTypedFunc< InputT, output >, + const InputT &, output &, + bool + ) = 0; + + virtual grb::RC launch_untyped( + grb::AlpUntypedFunc< output >, + const void *, size_t, + output &, + bool + ) = 0; + + virtual grb::RC finalize() = 0; + + virtual ~Runner() = default; + +}; + +template< grb::EXEC_MODE mode, typename InputT > +class bsp_launcher : + public grb::Launcher< mode >, public Runner< InputT > +{ + + public: + + using grb::Launcher< mode >::Launcher; + + grb::RC launch_typed( + grb::AlpTypedFunc< InputT, output > grbProgram, + const InputT &in, output &out, bool bc + ) override { + return this->exec( grbProgram, in, out, bc ); + } + + grb::RC launch_untyped( + grb::AlpUntypedFunc< output > grbProgram, + const void * in, size_t in_size, + output &out, bool bc + ) override { + return this->exec( grbProgram, in, in_size, out, bc ); + } + + virtual grb::RC finalize() override { + return grb::Launcher< mode >::finalize(); + } + +}; + +template< grb::EXEC_MODE mode, typename InputT > +class bsp_benchmarker : + public grb::Benchmarker< mode >, public Runner< InputT > +{ + + private: + + size_t inner = 2; + size_t outer = 2; + + + public: + + using grb::Benchmarker< mode >::Benchmarker; + + grb::RC launch_typed( + grb::AlpTypedFunc< InputT, output > grbProgram, + const InputT &in, output &out, + bool bc + ) override { + return this->exec( grbProgram, in, out, inner, outer, bc ); + } + + grb::RC launch_untyped( + const grb::AlpUntypedFunc< output > grbProgram, + const void * const in, const size_t in_size, + output &out, const bool bc + ) override { + return this->exec( grbProgram, in, in_size, out, inner, outer, bc ); + } + + virtual grb::RC finalize() override { + return grb::Benchmarker< mode >::finalize(); + } + +}; + + +enum RunnerType { Launch, Benchmark }; + +template< typename InputT > +std::unique_ptr< Runner< InputT > > make_runner( + grb::EXEC_MODE mode, RunnerType type, + size_t s, size_t P, + const std::string &host, const std::string &port, + const bool mpi_inited +) { + Runner< InputT > *ret = nullptr; +#ifndef DISTRIBUTED_EXECUTION + (void) mpi_inited; +#endif + + switch (type) { + + case Launch: + + switch (mode) { + case grb::AUTOMATIC: + ret = new bsp_launcher< grb::AUTOMATIC, InputT >; + break; +#ifdef DISTRIBUTED_EXECUTION + case grb::FROM_MPI: + ret = new bsp_launcher< grb::FROM_MPI, InputT >( MPI_COMM_WORLD ); + break; + + case grb::MANUAL: + ret = new bsp_launcher< grb::MANUAL, InputT >( s, P, host, port, + mpi_inited ); + break; +#else + case grb::MANUAL: + ret = new bsp_launcher< grb::MANUAL, InputT >( s, P, host, port ); + break; +#endif + default: + break; + } + break; + + case Benchmark: + switch (mode) { + case grb::AUTOMATIC: + ret = new bsp_benchmarker< grb::AUTOMATIC, InputT >; + break; +#ifdef DISTRIBUTED_EXECUTION + case grb::FROM_MPI: + ret = new 
bsp_benchmarker< grb::FROM_MPI, InputT >( MPI_COMM_WORLD ); + break; + + case grb::MANUAL: + ret = new bsp_benchmarker< grb::MANUAL, InputT >( s, P, host, port, + mpi_inited ); + break; +#else + case grb::MANUAL: + ret = new bsp_benchmarker< grb::MANUAL, InputT >( s, P, host, port ); + break; + + case grb::FROM_MPI: +#endif + + default: + break; + } + break; + + default: + // error is caught later + break; + + } + + if( ret == nullptr ) { + throw std::runtime_error( "Error while creating runner" ); + } + return std::unique_ptr< Runner< InputT > >( ret ); +} + +#define ERROR_ON( cond, str ) if( cond ) { \ + std::cerr << __FILE__ ", " << __LINE__ << ": " << str << std::endl; \ + std::cout << "Test FAILED\n" << std::endl; \ + throw std::runtime_error( "check failed" ); \ + } + + +template< + template< grb::EXEC_MODE, bool, typename InputT > class FunT, + grb::EXEC_MODE mode, typename RetT, typename InputT +> +RetT getFun( bool broadcast ) { + return broadcast + ? FunT< mode, true, InputT >::fun + : FunT< mode, false, InputT >::fun; +} + +template< + template< grb::EXEC_MODE, bool, typename InputT > class CallerT, + typename RetT, typename InputT +> +RetT getALPFun( grb::EXEC_MODE mode, bool broadcast ) { + switch (mode) { + case grb::AUTOMATIC: + return getFun< CallerT, grb::AUTOMATIC, RetT, InputT >( broadcast ); + break; + case grb::FROM_MPI: + return getFun< CallerT, grb::FROM_MPI, RetT, InputT >( broadcast ); + break; + case grb::MANUAL: + return getFun< CallerT, grb::MANUAL, RetT, InputT >( broadcast ); + break; + default: + std::cerr << __FILE__ ", " << __LINE__ << ": " << "unknown mode " << mode + << std::endl; + throw std::runtime_error( "unknown mode" ); + break; + } +} + +template< typename InputT > +std::unique_ptr< Runner< InputT > > create_runner( + grb::EXEC_MODE mode, RunnerType rt, + size_t s, size_t P, + const std::string &host, const std::string &port, + bool mpi_inited +) { + try { + return make_runner< InputT >( + mode, rt, s, P, + host, + port, + mpi_inited + ); + } catch( std::runtime_error &e ) { + std::cerr << "got a runtime exception: " << e.what() << std::endl; + std::cout << "Test FAILED\n" << std::endl; + throw e; + } catch( std::exception &e ) { + std::cerr << "got an exception: " << e.what() << std::endl; + std::cout << "Test FAILED\n" << std::endl; + throw e; + } catch( ... 
) { + std::cerr << "got an unknown exception" << std::endl; + std::cout << "Test FAILED\n" << std::endl; + throw std::runtime_error( "unknown exception" ); + } + return std::unique_ptr< Runner< InputT > >(); +} + +int main( int argc, char ** argv ) { + + std::cout << "Functional test executable: " << argv[ 0 ] << "\n"; + +#ifdef DISTRIBUTED_EXECUTION + int lpf_mpi_inited = 0; + int success = MPI_Initialized( &lpf_mpi_inited ); + ERROR_ON( success != MPI_SUCCESS, "cannot determine initalization info" ); +#endif + const char * host = nullptr; + const char * port = nullptr; +#ifdef DISTRIBUTED_EXECUTION + typedef lpf_pid_t test_pid_t; +#else + typedef size_t test_pid_t; +#endif + // default values for shared-memory execution + test_pid_t P = 1; + test_pid_t s = 0; + grb::EXEC_MODE mode = grb::AUTOMATIC; + +#ifdef DISTRIBUTED_EXECUTION + if( lpf_mpi_inited != 0 ) { + mode = grb::AUTOMATIC; + ERROR_ON( argc != 1, "no argument needed" ); + } else { + if( argc == 1 ) { + mode = grb::FROM_MPI; + } else if( argc == 5 ) { + mode = grb::MANUAL; + } else { + ERROR_ON( true, "either no arguments or four arguments expected.\n" + "For the four-argument variant, the following are expected:\n" + " - hostname\n" + " - portname\n" + " - total number of processes\n" + " - unique ID of this process\n" + ); + } + } +#else + if( argc == 1 ) { + mode = grb::AUTOMATIC; + } else if( argc == 5 ) { + mode = grb::MANUAL; + } else { + ERROR_ON( true, "either no arguments or four arguments expected.\n" + "For the four-argument variant, the following are expected:\n" + " - hostname\n" + " - portname\n" + " - total number of processes\n" + " - unique ID of this process\n" + ); + } +#endif + const char *mode_str = nullptr; + + switch( mode ) { + case grb::AUTOMATIC: + mode_str = "AUTOMATIC"; + break; +#ifdef DISTRIBUTED_EXECUTION + case grb::FROM_MPI: + mode_str = "FROM_MPI"; + break; +#endif + case grb::MANUAL: + mode_str = "MANUAL"; + break; + default: + ERROR_ON( true, "unrecognised or invalid option: " << mode ); + break; + } + + std::cout << "\n===> chosen initialisation method: " << mode_str << " <===" + << std::endl; + + if( mode == grb::MANUAL ) { + // read command-line args + host = argv[ 1 ]; + port = argv[ 2 ]; + try { + P = static_cast< test_pid_t >( std::stoi( argv[ 3 ] ) ); + s = static_cast< test_pid_t >( std::stoi( argv[ 4 ] ) ); + } catch( std::exception &e ) { + std::cerr << "Caught exception: " << e.what() << std::endl; + std::cout << "Test FAILED\n" << std::endl; + return EXIT_FAILURE; + } + + // input sanity checks + ERROR_ON( host == nullptr || strlen( host ) == 0, + "Invalid hostname: " << argv[ 1 ] ); + ERROR_ON( port == nullptr || strlen( port ) == 0, + "value for port name or number: " << argv[ 2 ] ); + ERROR_ON( !grb::utils::is_in_normalized_range( s, P ), + "Invalid value for PID: " << argv[ 4 ] ); + } +#ifdef DISTRIBUTED_EXECUTION + if( mode == grb::FROM_MPI || mode == grb::MANUAL ) { + success = MPI_Init( NULL, NULL ); + ERROR_ON( success != MPI_SUCCESS, "Call to MPI_Init failed" ); + } + if( mode == grb::FROM_MPI ) { + int rank; + success = MPI_Comm_rank( MPI_COMM_WORLD, &rank ); + ERROR_ON( success != MPI_SUCCESS, "Call to MPI_Comm_rank failed" ); + s = static_cast< test_pid_t >( rank ); + } +#endif + + const char * input_str = ( mode == grb::AUTOMATIC ) ? truth : + ( s == 0 ) ? truth : prelude; + + struct input in; + struct output out; + for( const bool broadcast : { true, false } ) { + for( const RunnerType rt : { Launch, Benchmark } ) { + const char * const runner_name = rt == Launch ? 
"Launch" : "Benchmark"; + const char * const bc_str = broadcast ? "true" : "false"; + std::cout << "\n ==> runner type: " << runner_name << ", " + << "broadcast: " << bc_str << std::endl; + std::unique_ptr< Runner< input > > runner = create_runner< input >( + mode, rt, s, P, + std::string( (host != nullptr ? host : "" ) ), + std::string( (port != nullptr ? port : "" ) ), + true + ); + std::cout << " => untyped call\n" << std::endl; + (void) strncpy( in.str, input_str, STR_LEN + 1 ); + grb::AlpUntypedFunc< output > vfun = + getALPFun< vcaller, grb::AlpUntypedFunc< output >, input >( + mode, broadcast + ); + out.exit_code = 256; // the ALP function MUST set to 0 + grb::RC ret = runner->launch_untyped( + vfun, + reinterpret_cast< void * >( &in ), sizeof( input ), + out, broadcast + ); + ERROR_ON( ret != grb::SUCCESS, + "untyped test FAILED with code: " << grb::toString( ret ) ); + ERROR_ON( out.exit_code != 0, + "untyped test FAILED with exit code " << out.exit_code ); + + std::cout << "\n => typed call\n" << std::endl; + grb::AlpTypedFunc< input, output > fun = + getALPFun< caller, grb::AlpTypedFunc< input, output >, input >( + mode, broadcast + ); + out.exit_code = 256; + ret = runner->launch_typed( fun, in, out, broadcast ); + ERROR_ON( ret != grb::SUCCESS, + "typed test FAILED with code: " << grb::toString( ret ) ); + ERROR_ON( out.exit_code != 0, + "typed test FAILED with exit code " << out.exit_code ); + + ret = runner->finalize(); + + ERROR_ON( ret != grb::SUCCESS, + "finalisation FAILED with code: " << grb::toString( ret ) ); + std::cout << " => OK" << std::endl; + + if( mode == grb::AUTOMATIC ) { + // AUTOMTIC mode must implement a specific behaviour for + // non-default-constructible input types like nd_input, here tested + + std::unique_ptr< Runner< nd_input > > nd_runner = create_runner< nd_input >( + mode, rt, s, P, + std::string( (host != nullptr ? host : "" ) ), + std::string( (port != nullptr ? port : "" ) ), + true + ); + + std::cout << "\n => untyped call, non-default-constructible input\n" + << std::endl; + out.exit_code = 256; + nd_input ndin( input_str ); + ret = nd_runner->launch_untyped( + vfun, + reinterpret_cast< void * >( &ndin ), sizeof( nd_input ), + out, broadcast + ); + // untyped calls must succeed even with a non-default-constructible input + ERROR_ON( ret != grb::SUCCESS, + "untyped test FAILED with code: " << grb::toString( ret ) ); + ERROR_ON( out.exit_code != 0, + "untyped test FAILED with exit code " << out.exit_code ); + + std::cout << "\n => typed call, non-default-constructible input\n" + << std::endl; + out.exit_code = 256; + grb::AlpTypedFunc< nd_input, output > ndfun = + getALPFun< caller, grb::AlpTypedFunc< nd_input, output >, nd_input >( + mode, broadcast + ); + ret = nd_runner->launch_typed( ndfun, ndin, out, broadcast ); + // get P from process, as it may not be known outside of the + // launcher (e.g., for AUTOMATIC mode) + const bool should_fail = ( !broadcast ) && out.P > 1; + int expected_retval = should_fail ? 256 : 0; + // typed call should fail if ALL of the following conditions are met: + // - AUTOMATIC mode + // - non-default-constructible input + // - no broadcast requested + // - more than one process to run. 
+ // The idea is that process 0 receives the "original" input via + // the launcher, but other processes cannot create a meaningful + // one, because the input is non-default-constructible and + // because broadcast has not been requested (note: broadcast + // occurs ONLY on user's request): in such a case, the call + // cannot proceed and is aborted + ERROR_ON( should_fail && ret == grb::SUCCESS, + "run is successful, but should have failed" ); + ERROR_ON( out.exit_code != expected_retval, + "typed test FAILED with exit code " << out.exit_code ); + } + } + } +#ifdef DISTRIBUTED_EXECUTION + if( mode == grb::FROM_MPI || mode == grb::MANUAL ) { + success = MPI_Finalize(); + ERROR_ON( success != MPI_SUCCESS, "Call to MPI_Finalize failed" ); + } +#endif + + std::cout << "\nTest OK\n" << std::endl; + return EXIT_SUCCESS; +} + diff --git a/tests/unit/mxv.cpp b/tests/unit/mxv.cpp index 226fa5999..c35270376 100644 --- a/tests/unit/mxv.cpp +++ b/tests/unit/mxv.cpp @@ -112,7 +112,7 @@ int main( int argc, char ** argv ) { grb::Launcher< AUTOMATIC > automatic_launcher; - if( automatic_launcher.exec( &grbProgram, in, out ) != SUCCESS ) { + if( automatic_launcher.exec( &grbProgram, in, out, true ) != SUCCESS ) { std::cout << "Test FAILED (launcher did not return SUCCESS).\n" << std::endl; return EXIT_FAILURE; } diff --git a/tests/unit/pinnedVector.cpp b/tests/unit/pinnedVector.cpp index c9a4ce7b4..3a28c895a 100644 --- a/tests/unit/pinnedVector.cpp +++ b/tests/unit/pinnedVector.cpp @@ -342,7 +342,7 @@ int runTests( struct input< T > &in ) { for( const auto &test : AllTests ) { // run test in.test = test; - rc = rc ? rc : launcher.exec( &grbProgram, in, out ); + rc = rc ? rc : launcher.exec( &grbProgram, in, out, true ); if( out.error_code != SUCCESS ) { return offset + 10; } @@ -531,6 +531,26 @@ int runTests( struct input< T > &in ) { return 0; } +// default-constructible and trivially copiable pair of values for launcher +struct Couple { + size_t a; float b; + bool operator==( const struct Couple &c ) const { + return c.a == a && c.b == b; + } + + bool operator!=( const struct Couple &c ) const { + return !((*this) == c ); + } +}; + +#ifdef _DEBUG +// adaptor to output stream +std::ostream & operator<<( std::ostream &out, const struct Couple &c ) { + out << "( " << c.a << ", " << c.b << " )"; + return out; +} +#endif + int main( int argc, char ** argv ) { // sanity check if( argc != 1 ) { @@ -562,12 +582,19 @@ int main( int argc, char ** argv ) { // run tests using a non-fundamental type if( error == 0 ) { - std::cout << "\t running tests with std::pair vector entries...\n"; + std::cout << "\t running tests with DC and SL vector entries...\n"; struct input< std::pair< size_t, float > > in_pair; in_pair.element = std::make_pair< size_t, float >( 17, -2.7 ); in_pair.mode = mode; error = runTests( in_pair ); } + if( error == 0 ) { + std::cout << "\t running tests with DC and TC vector entries...\n"; + struct input< struct Couple > in_pair; + in_pair.element = { 17, -2.7 }; + in_pair.mode = mode; + error = runTests( in_pair ); + } if( error ) { break; } } diff --git a/tests/unit/sparse_mxv.cpp b/tests/unit/sparse_mxv.cpp index ee82484bb..f56c42b7d 100644 --- a/tests/unit/sparse_mxv.cpp +++ b/tests/unit/sparse_mxv.cpp @@ -190,14 +190,15 @@ void grbProgram( const int &, int &error ) { } int main( int argc, char ** argv ) { - (void)argc; - (void)printf( "Functional test executable: %s\n", argv[ 0 ] ); + (void) argc; + std::cout << "Functional test executable: " << argv[ 0 ] << "\n"; // sanity check against 
metabugs int error = 0; for( size_t i = 0; i < 15; ++i ) { if( ! grb::utils::equals( data1[ i ] * data2[ i ], chk[ i ] ) ) { - (void)fprintf( stderr, "Sanity check error at position %zd: %d + %d does not equal %d.\n", i, data1[ i ], data2[ i ], chk[ i ] ); + std::cerr << "Sanity check error at position " << i << ": " << data1[ i ] + << " + " << data2[ i ] << " does not equal " << chk[ i ] << ".\n"; error = 1; } } @@ -205,15 +206,16 @@ int main( int argc, char ** argv ) { if( !error ) { grb::Launcher< AUTOMATIC > launcher; if( launcher.exec( &grbProgram, error, error ) != grb::SUCCESS ) { - (void)fprintf( stderr, "Fatal error: could not launch test.\n" ); + std::cerr << "Fatal error: could not launch test.\n"; error = 2; } } if( !error ) { - (void)printf( "Test OK\n\n" ); + std::cout << "Test OK\n" << std::endl; } else { - (void)printf( "Test FAILED\n\n" ); + std::cerr << std::flush; + std::cout << "Test FAILED\n" << std::endl; } // done diff --git a/tests/unit/unittests.sh b/tests/unit/unittests.sh index f34229c16..5963dc16c 100755 --- a/tests/unit/unittests.sh +++ b/tests/unit/unittests.sh @@ -179,6 +179,14 @@ for MODE in ${MODES}; do grep 'Test OK' ${TEST_OUT_DIR}/id_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" echo " " + if [ "$BACKEND" = "bsp1d" ] || [ "$BACKEND" = "hybrid" ]; then + echo ">>> [x] [ ] Testing grb::id on distributed vectors and matrices" + $runner ${TEST_BIN_DIR}/id_distributed_${MODE}_${BACKEND} &> ${TEST_OUT_DIR}/id_distributed_${MODE}_${BACKEND}_${P}_${T}.log + head -1 ${TEST_OUT_DIR}/id_distributed_${MODE}_${BACKEND}_${P}_${T}.log + grep 'Test OK' ${TEST_OUT_DIR}/id_distributed_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" + echo " " + fi + echo ">>> [x] [ ] Testing grb::capacity, grb::resize, and default" echo " and explicit capacities set during container" echo " construction" @@ -643,6 +651,14 @@ for MODE in ${MODES}; do fi echo " " + echo ">>> [x] [ ] Testing Launcher and Benchmarker, AUTOMATIC mode." + test_name=launch_benchmark_auto_${MODE}_${BACKEND} + test_log=${TEST_OUT_DIR}/${test_name}_${P}_${T}.log + $runner ${TEST_BIN_DIR}/${test_name} &> ${test_log} + head -1 ${test_log} + grep -i 'Test OK' ${test_log} || echo "Test FAILED" + echo " " + #if [ "$BACKEND" = "reference_omp" ]; then # echo "Additional standardised unit tests not yet supported for the ${BACKEND} backend" # echo @@ -651,9 +667,40 @@ for MODE in ${MODES}; do #none here: all unit tests are operational for reference_omp + if [ "$BACKEND" = "bsp1d" ] || [ "$BACKEND" = "hybrid" ]; then + echo ">>> [x] [ ] Testing Launcher and Benchmarker, FROM_MPI mode for distributed backends." + test_name=launch_benchmark_frommpi_manual_${MODE}_${BACKEND} + test_log=${TEST_OUT_DIR}/launch_benchmark_frommpi_${MODE}_${BACKEND}_${P}_${T}.log + $runner ${TEST_BIN_DIR}/${test_name} &> ${test_log} + head -1 ${test_log} + grep -i 'Test OK' ${test_log} || echo "Test FAILED" + echo " " + else + echo ">>> [x] [ ] Testing Launcher and Benchmarker, MANUAL mode for shared-memory backends." + test_log=${TEST_OUT_DIR}/launch_benchmark_manual_${MODE}_${BACKEND}_${P}_${T}.log + $runner ${TEST_BIN_DIR}/launch_benchmark_frommpi_manual_${MODE}_${BACKEND} localhost 77770 1 0 &> ${test_log} + head -1 ${test_log} + grep -i 'Test OK' ${test_log} || echo "Test FAILED" + echo " " + fi + done + done + if [ "$BACKEND" = "bsp1d" ] || [ "$BACKEND" = "hybrid" ]; then + echo ">>> [x] [ ] Testing Launcher and Benchmarker, MANUAL mode for distributed backends." 
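+		# Each of the four manually-launched processes below receives the
+		# arguments <hostname> <port> <total number of processes> <process ID>,
+		# i.e., the four-argument usage documented by the test binary.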
+ test_name=launch_benchmark_frommpi_manual_${MODE}_${BACKEND} + test_log=${TEST_OUT_DIR}/launch_benchmark_manual_${MODE}_${BACKEND}.log + bash -c "${MANUALRUN} ${TEST_BIN_DIR}/${test_name} localhost 77770 4 0 &> ${test_log}.0 & \ + ${MANUALRUN} ${TEST_BIN_DIR}/${test_name} localhost 77770 4 3 &> ${test_log}.3 & \ + ${MANUALRUN} ${TEST_BIN_DIR}/${test_name} localhost 77770 4 1 &> ${test_log}.1 & \ + ${MANUALRUN} ${TEST_BIN_DIR}/${test_name} localhost 77770 4 2 &> ${test_log}.2 & \ + wait" + (grep -q 'Test OK' ${test_log}.1 && grep -q 'Test OK' ${test_log}.2 && grep -q 'Test OK' ${test_log}.3 \ + && grep -q 'Test OK' ${test_log}.0 && printf "Test OK.\n\n") || (printf "Test FAILED.\n\n") + fi + if [ "$BACKEND" = "bsp1d" ]; then echo "Additional unit tests for the BSP1D backend:" echo " " diff --git a/tests/unit/vmxa.cpp b/tests/unit/vmxa.cpp index 030117a3f..cd8048aa4 100644 --- a/tests/unit/vmxa.cpp +++ b/tests/unit/vmxa.cpp @@ -118,7 +118,7 @@ int main( int argc, char ** argv ) { if( error == 0 ) { grb::RC rc = grb::SUCCESS; grb::Launcher< grb::AUTOMATIC > launcher; - rc = launcher.exec( alpProgram, rc, error ); + rc = launcher.exec( alpProgram, rc, error, true ); if( rc != grb::SUCCESS ) { std::cerr << "Could not launch the ALP program.\n"; error = 10; diff --git a/tests/unit/vxm.cpp b/tests/unit/vxm.cpp index 1cccf2ebf..4430aa416 100644 --- a/tests/unit/vxm.cpp +++ b/tests/unit/vxm.cpp @@ -113,7 +113,7 @@ int main( int argc, char ** argv ) { grb::Launcher< AUTOMATIC > automatic_launcher; - if( automatic_launcher.exec( &grbProgram, in, out ) != SUCCESS ) { + if( automatic_launcher.exec( &grbProgram, in, out, true ) != SUCCESS ) { std::cout << "Test FAILED (launcher did not return SUCCESS).\n" << std::endl; return EXIT_FAILURE; } diff --git a/tests/unit/wait.cpp b/tests/unit/wait.cpp index b07920ffc..1456b6473 100644 --- a/tests/unit/wait.cpp +++ b/tests/unit/wait.cpp @@ -155,11 +155,12 @@ int main( int argc, char ** argv ) { std::cout << "This is functional test " << argv[ 0 ] << "\n"; grb::Launcher< grb::AUTOMATIC > launcher; grb::RC out; - if( launcher.exec( &grbProgram, input, out, false ) != grb::SUCCESS ) { + if( launcher.exec( &grbProgram, input, out, true ) != grb::SUCCESS ) { std::cerr << "Launching test FAILED\n"; return 255; } if( out != grb::SUCCESS ) { + std::cerr << std::flush; std::cout << "Test FAILED (" << grb::toString( out ) << ")" << std::endl; return out; } else { diff --git a/tests/utils/output_verification.hpp b/tests/utils/output_verification.hpp index 531695129..c9c3ae403 100644 --- a/tests/utils/output_verification.hpp +++ b/tests/utils/output_verification.hpp @@ -27,13 +27,13 @@ #include -#include #include -#include +#include #include -#include +#include #include #include +#include #include @@ -238,7 +238,7 @@ int vector_verification( // the condition evaluated by the function isless will be false and then // the whole condition of the if-statement will be evaluated to true // making the verification to fail as expected - if( !isless( curInfNorm, c2 * magnitudeInf + eps ) ) { + if( !std::isless( curInfNorm, c2 * magnitudeInf + eps ) ) { std::cerr << "Output vector failed inf-norm verification at index " << i << ":\n" << "\tmeasured absolute error at this index: " << curInfNorm << "\n" @@ -260,7 +260,7 @@ int vector_verification( // isgreaterequal is used to ensure that the condition norm_inf >= 0 // will be evaluated to false when norm_inf is equal to NaN or -NaN - if( !isgreaterequal( norm_inf, 0 ) ) { + if( !std::isgreaterequal( norm_inf, 0 ) ) { std::cerr << 
"Output vector failed inf-norm verification:\n" << "\tinf-norm is neither positive nor zero -- " << "it reads " << norm_inf << " instead\n"; @@ -275,7 +275,7 @@ int vector_verification( // isgreaterequal is used to ensure that the condition norm2 >= 0 // will be evaluated to false when norm2 is equal to NaN or -NaN - if( isgreaterequal( norm2, 0 ) ) { + if( std::isgreaterequal( norm2, 0 ) ) { norm2 = sqrt( norm2 ); } else { std::cerr << "Output vector failed 2-norm verification:\n" @@ -293,7 +293,7 @@ int vector_verification( delete [] raw_output_vector; // perform check and return - if( !isless( norm2, c1 * magnitude2 + n * eps ) ) { + if( !std::isless( norm2, c1 * magnitude2 + n * eps ) ) { std::cerr << "Output vector failed 2-norm verification:\n" << "\t2-norm is " << norm2 << ".\n" << "\t2-norm is larger than the specified relative tolerance of " @@ -306,7 +306,7 @@ int vector_verification( << "\t2-norm is " << norm2 << " which is smaller or equal to the effective " << "relative tolerance of " << (c1 * magnitude2 + n * eps) << "\n"; } - if( !isless( norm_inf, c2 * magnitudeInf + eps ) ) { + if( !std::isless( norm_inf, c2 * magnitudeInf + eps ) ) { std::cerr << "Output vector failed inf-norm verification:\n" << "\tinf-norm is " << norm_inf << " at index " << norm_inf_at << "\n" << "\tinf-norm is larger than the specified relative tolerance of " diff --git a/tests/utils/print_vec_mat.hpp b/tests/utils/print_vec_mat.hpp index 4db9d0afb..761664ed4 100644 --- a/tests/utils/print_vec_mat.hpp +++ b/tests/utils/print_vec_mat.hpp @@ -20,32 +20,58 @@ /** * @file print_vec_mat.hpp - * @author Alberto Scolari (alberto.scolari@huawei.com) - * @brief Routines to print a grb::Vector, a grb::Matrix and a grb::PinnedVector; they are in templated form - * to be generic w.r.t. stored data type and backend implementation. - * @version 0.1 - * @date 2021-04-30 + * + * Utilities to print grb containers and objects. + * + * @authors + * - Alberto Scolari (alberto.scolari@huawei.com) + * - Benjamin Lozes (benjamin.lozes@huawei.com) + * + * Routines to print: + * - grb::Vector, grb::Matrix & grb::PinnedVector: These primitives are in + * templated form to be generic w.r.t. stored data type and + * backend implementation. + * - reference/CompressedStorage (CRS & CCS): These primitives are in + * templated form to be generic w.r.t. stored data type, but only for + * reference and reference_omp backends. + * + * @version 0.2 + * @date 25th of August 2023 */ + #include #include +#include +#include #include +using namespace grb; + /** - * @brief Prints the first \p _limit items (including zeroes) of vector \p x with optional heading \p head. + * Prints the first \p limit items (including zeroes) of vector \p x + * with optional heading \p head. * - * @tparam T vector data type - * @tparam B GraphBLAS backend storing the vector - * @param x vector to print - * @param _limit max number of elements to print; 0 for the entire vector + * Contents will be printed to the standard output stream. + * + * @tparam T Vector data type. + * @tparam B Vector backend. + * + * @param[in] x The vector to print + * @param[in] limit Max. number of elements to print; 0 for the entire vector * @param head optional heading to print \b before the vector + * + * \warning Assumes iterators over \a x are ordered. 
*/ template< typename T, enum grb::Backend B > -void print_vector( const grb::Vector< T, B > & x, size_t _limit = 10UL, const char * head = nullptr ) { - // const T * const raw{grb::internal::getRaw(x)}; - size_t x_size { grb::size( x ) }; - size_t limit { _limit == 0 ? x_size : std::min( x_size, _limit ) }; +void print_vector( + const grb::Vector< T, B > &x, + size_t limit = 10UL, + const char * const head = nullptr +) { + size_t x_size = grb::size( x ); + limit = limit == 0 ? x_size : std::min( x_size, limit ); if( head != nullptr ) { std::cout << "<<< " << head << " >>>" << std::endl; @@ -55,19 +81,21 @@ void print_vector( const grb::Vector< T, B > & x, size_t _limit = 10UL, const ch std::cout << "(size 0 vector)"; } - typename grb::Vector< T, B >::const_iterator it { x.cbegin() }; - typename grb::Vector< T, B >::const_iterator end { x.cend() }; + typename grb::Vector< T, B >::const_iterator it = x.cbegin(); + typename grb::Vector< T, B >::const_iterator end = x.cend(); - size_t previous_nnz { it == end ? limit : it->first }; + size_t previous_nnz = it == end ? limit : it->first; if( previous_nnz == 0 ) { std::cout << it->second; - ++it; + (void) ++it; } else if( x_size > 0 ) { std::cout << 0; } - size_t next_nnz { it == end ? limit : it->first }, position { 1 }; + size_t next_nnz, position; + next_nnz = it == end ? limit : it->first; + position = 1; while( position < limit ) { - size_t zero_streak { std::min( next_nnz, limit ) }; + size_t zero_streak = std::min( next_nnz, limit ); // print sequence of zeroes for( ; position < zero_streak; ++position ) { std::cout << ", "; @@ -76,8 +104,8 @@ void print_vector( const grb::Vector< T, B > & x, size_t _limit = 10UL, const ch if( position < limit ) { std::cout << ", "; std::cout << it->second; - ++position; - ++it; + (void) ++position; + (void) ++it; next_nnz = it->first; } } @@ -85,18 +113,23 @@ void print_vector( const grb::Vector< T, B > & x, size_t _limit = 10UL, const ch } /** - * @brief Prints the first \p limit items of pinned vector \p x with optional + * Prints the first \p limit items of pinned vector \p x with optional * heading \p head. * + * Contents will be printed to the standard output stream. + * * @tparam T vector data type * @tparam B GraphBLAS backend storing the vector * - * @param[in] v pinned vector to print - * @param[in] _limit max number of elements to print; 0 for the entire vector + * @param[in] v Pinned vector to print + * @param[in] limit Max number of elements to print; 0 for the entire vector * @param[in] head optional heading to print \b before the vector + * + * \warning Nonzero values will be printed in an undefined order. 
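+ *
+ * \par Example
+ * An illustrative sketch (it assumes the usual PinnedVector constructor that
+ * takes a vector and an I/O mode):
+ * \code
+ * grb::Vector< double > x( 1000 );
+ * // ... fill x ...
+ * grb::PinnedVector< double > pinned( x, grb::SEQUENTIAL );
+ * print_vector( pinned, 10, "first nonzeroes of x" );
+ * \endcode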
*/ template< typename T, enum grb::Backend B > -void print_vector( const grb::PinnedVector< T, B > &v, +void print_vector( + const grb::PinnedVector< T, B > &v, const size_t limit = 10UL, const char * const head = nullptr ) { @@ -107,83 +140,137 @@ void print_vector( const grb::PinnedVector< T, B > &v, std::cout << "<<< " << head << " >>>" << std::endl; } std::cout << "First " << limit << " nonzeroes of x are: ( "; - size_t k { 0 }; + size_t k = 0; if( k < v.nonzeroes() && limit > 0 ) { std::cout << v.getNonzeroValue( k++ ); } - for( size_t nnzs { 1 }; nnzs < limit && k < v.nonzeroes(); k++ ) { + for( size_t nnzs = 1; nnzs < limit && k < v.nonzeroes(); k++ ) { std::cout << ", " << v.getNonzeroValue( k ); - ++nnzs; + (void) ++nnzs; } std::cout << " )" << std::endl; } /** - * @brief Easy matrix container to store a matrix in a \b dense format, thus also zeroes are stored - * and the memory occupation is proportional to the full size of the matrix; hence, use with case! + * Easy matrix container to store a matrix in a \b dense format. + * + * \warning Thus, also zeroes are stored and the memory occupation is + * proportional to the full size of the matrix. Hence, use this + * function with care! + * + * @tparam T the type of the matrix values. * - * @tparam T the type of the matrix values */ template< typename T > struct dense_mat { - const size_t rows, cols; ///< matrix dimensions - T * const dense; ///< pointer to data, stored in a linear format (row-wise) + + /** The number of rows in the matrix. */ + const size_t rows; + + /** The number of columns in the matrix. */ + const size_t cols; + + /** Pointer to the raw data, row-major storage. */ + T * const dense; /** - * @brief Construct a new dense_mat object of given rows and columns, allocating the necessary - * physical memory for dense storage. + * Construct a new dense_mat object of given rows and columns. + * + * This function allocates the necessary physical memory for dense + * storage. + * + * @param[in] rows The number of matrix rows. + * @param[in] cols The number of matrix columns. + * @param[in] initial_value Optional; by default equal to zero. + * + * \warning This function assumes that zero maps to the literal 0. + * + * @throws Out of memory errors in case #::dense cannot be allocated. */ - dense_mat( size_t _nrows, size_t _ncols ) : - rows( _nrows ), cols( _ncols ), dense( new T[ rows * cols ] ) // we assume new throws if not enough memory + dense_mat( + const size_t _nrows, const size_t _ncols, + const T initial_value = T( 0 ) + ) : + rows( _nrows ), cols( _ncols ), + dense( new T[ rows * cols ] ) { assert( rows != 0 ); assert( cols != 0 ); - memset( dense, T( 0 ), rows * cols * ( sizeof( T ) ) ); + std::fill( dense, dense + rows * cols, initial_value ); } + /** + * Releases the resources corresponding to this instance. + */ ~dense_mat() { delete[] dense; } /** - * @brief Operator to access an entire row, which simply returns the pointer to the first row element; - * this way, one can conveniently write \code mat[i][j]] \endcode to access each element. + * Operator to access an entire row. + * + * @param[in] row The row to access. + * + * Simply returns the pointer to the first row element; this way, one can + * conveniently write \code mat[i][j]] \endcode to access each element. 
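+	 *
+	 * \par Example
+	 * An illustrative sketch:
+	 * \code
+	 * dense_mat< double > tmp( 2, 3, -1.0 ); // 2 x 3, all entries set to -1
+	 * tmp[ 1 ][ 2 ] = 3.14;                  // writes row 1, column 2
+	 * \endcode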
*/ - inline T * operator[]( size_t row ) { + inline T * operator[]( const size_t row ) { return dense + row * cols; } /** - * @brief Operator to access an entire row, which simply returns the const pointer to the first row element; - * this way, one can conveniently write \code mat[i][j]] \endcode to access each element. + * Operator to access an entire row. + * + * @param[in] row The row to access. + * + * Simply returns the const pointer to the first row element; this way, one can + * conveniently write \code mat[i][j]] \endcode to access each element. */ - inline const T * operator[]( size_t row ) const { + inline const T * operator[]( const size_t row ) const { return dense + row * cols; } }; /** - * @brief Prints up to \p _limit rows and columns of matrix \p mat with optional heading \p head. + * Prints up to \p limit rows and columns of matrix \p mat with optional + * heading \p head. * * @tparam T matrix data type - * @tparam B GraphBLAS backend storing the matrix - * @param mat matrix to print - * @param _limit max number of rows and columns to print (0 for all) - * @param head optional heading to print \b before the matrix + * @tparam B ALP/GraphBLAS backend storing the matrix + * + * @param[in] mat Matrix to print + * @param[in] limit Max. number of rows and columns to print (0 for all) + * @param[in] head Optional heading to print \b before the matrix + * + * \warning This first casts \a mat to a dense matrix. + * + * \warning This function does not guard against iterators over \a mat + * (erroneously) returning an element at the same coordinate more + * than once. */ -template< typename T, enum grb::Backend B > -void print_matrix( const grb::Matrix< T, B > & mat, size_t _limit = 0, const char * head = nullptr ) { +template< + typename T, + enum grb::Backend B, + typename std::enable_if< !std::is_void< T >::value >::type * = nullptr +> +void print_matrix( + const grb::Matrix< T, B > &mat, + const size_t limit = 0, + const char * const head = nullptr +) { const size_t rows = grb::nrows( mat ); const size_t cols = grb::ncols( mat ); - size_t row_limit = _limit == 0 ? rows : std::min( _limit, rows ); - size_t col_limit = _limit == 0 ? cols : std::min( _limit, cols ); + size_t row_limit = limit == 0 ? rows : std::min( limit, rows ); + size_t col_limit = limit == 0 ? 
cols : std::min( limit, cols ); // create and dump only relevant portion - dense_mat< T > dump( row_limit, col_limit ); - for( const std::pair< std::pair< size_t, size_t >, T > & t : mat ) { - size_t row { t.first.first }; - size_t col { t.first.second }; + dense_mat< std::pair< bool, T> > dump( + row_limit, col_limit, std::make_pair( false, static_cast< T >( 0 ) ) + ); + for( const std::pair< std::pair< size_t, size_t >, T > &t : mat ) { + size_t row = t.first.first; + size_t col = t.first.second; if( row < row_limit && col < col_limit ) { - dump[ row ][ col ] = t.second; + dump[ row ][ col ] = std::make_pair( true, t.second ); } } @@ -194,18 +281,307 @@ void print_matrix( const grb::Matrix< T, B > & mat, size_t _limit = 0, const cha std::cout << "Size: " << rows << " x " << cols << std::endl; for( size_t i = 0; i < row_limit; ++i ) { for( size_t j = 0; j < col_limit; ++j ) { - double val = dump[ i ][ j ]; - std::cout << val; - if( val == 0.0 ) { - std::cout << " "; + bool assigned = dump[ i ][ j ].first; + auto val = dump[ i ][ j ].second; + if( assigned ) { + std::cout << val; } else { - std::cout << " "; + std::cout << "_"; } + std::cout << " "; } std::cout << std::endl; } std::cout << "==============" << std::endl << std::endl; } +/** + * Prints up to \p limit rows and columns of matrix \p mat with optional header + * \p head. + * + * Specialisation for void matrices. + * + * @tparam T matrix data type + * @tparam B GraphBLAS backend storing the matrix + * + * @param[in] mat Matrix to print + * @param[in] limit Max. number of rows and columns to print (0 for all) + * @param[in] head Optional heading to print \b before the matrix + * + * \warning This first casts \a mat to a dense matrix. + * + * \warning This function does not guard against iterators over \a mat + * (erroneously) returning an element at the same coordinate more + * than once. + */ +template< + typename T, + enum grb::Backend B, + typename std::enable_if< std::is_void< T >::value >::type * = nullptr +> +void print_matrix( + const grb::Matrix< T, B > &mat, + size_t limit = 0, + const char * head = nullptr +) { + const size_t rows = grb::nrows( mat ); + const size_t cols = grb::ncols( mat ); + size_t row_limit = limit == 0 ? rows : std::min( limit, rows ); + size_t col_limit = limit == 0 ? cols : std::min( limit, cols ); + // create and dump only relevant portion + dense_mat< bool > assigned( row_limit, col_limit, false ); + for( const auto &t : mat ) { + auto row = t.first; + auto col = t.second; + assigned[ row ][ col ] = ( row < row_limit && col < col_limit ); + } + + if( head != nullptr ) { + std::cout << "<<< " << head << " >>>" << std::endl; + } + std::cout << "=== PATTERN-MATRIX ===" << std::endl; + std::cout << "Size: " << rows << " x " << cols << std::endl; + for( size_t i = 0; i < row_limit; ++i ) { + for( size_t j = 0; j < col_limit; ++j ) { + if( assigned[ i ][ j ] ) { + std::cout << "X"; + } else { + std::cout << "_"; + } + std::cout << " "; + } + std::cout << std::endl; + } + std::cout << "==============" << std::endl << std::endl; +} + +namespace { + + /** + * \internal + * Helper function for printing a void reference CompressedStorage object. 
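+	 * (Only the \a col_start and \a row_index arrays are printed; a void --
+	 * i.e., pattern -- matrix stores no values.)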
+ * \endinternal + */ + template< typename D, class Storage > + void printCompressedStorage( + const Storage &storage, + const size_t n, + const size_t nnz, + std::ostream &os = std::cout, + const typename std::enable_if< + std::is_void< D >::value, void + >::type * const = nullptr + ) { + os << " col_start (" << n + 1 << "): [ "; + for( size_t i = 0; i <= n; ++i ) { + os << storage.col_start[ i ] << " "; + } + os << "]" << std::endl; + os << " row_index (" << nnz << "): \n[\n"; + for( size_t i = 0; i < n; ++i ) { + os << " " << std::setfill( '0' ) << std::setw( 2 ) << i << ": "; + for( auto t = storage.col_start[ i ]; t < storage.col_start[ i + 1 ]; t++ ) + os << std::setfill( '0' ) << std::setw( 2 ) << storage.row_index[ t ] << " "; + os << std::endl; + } + os << "]" << std::endl; + } + + /** + * \internal + * Helper function for printing a general reference CompressedStorage object. + * \endinternal + */ + template< typename D, class Storage > + void printCompressedStorage( + const Storage &storage, + const size_t n, + const size_t nnz, + std::ostream &os, + const typename std::enable_if< + !std::is_void< D >::value, void + >::type * const = nullptr + ) { + printCompressedStorage< void >( storage, n, nnz, os ); + os << " values (" << nnz << "): [ "; + for( size_t i = 0; i < nnz; ++i ) { + os << storage.values[ i ] << " "; + } + os << "]" << std::endl << std::flush; + } + +} // namespace + +/** + * Print the CRS structure of a grb::Matrix. + * + * @tparam Enabled boolean flag to enable/disable the function. + * + * @param[in] mat Matrix CRS to print. + * @param[in] label Label to print before the matrix. + * @param[in] limit Max number of rows and columns to print (-1 for all). + * @param[in,out] os Output stream (optional; default is std::cout). + * + * \warning This function does \em not convert to CRS; if the implementing + * backend is not back by a CRS-like format, calling this function will + * not compile. + */ +template< + bool Enabled = true, + typename D, typename RIT, typename CIT, typename NIT, + Backend implementation +> +void printCRS( + const Matrix< D, implementation, RIT, CIT, NIT > &, + const std::string & = "", + const size_t limit = 128, + std::ostream & = std::cout, + const typename std::enable_if< + implementation != reference && + implementation != reference_omp, + void >::type * const = nullptr +) { + static_assert( + implementation != reference && + implementation != reference_omp, + "printCRS() is only available for reference and reference_omp backends" + ); +} + +/** + * Print the CRS structure of a grb::Matrix. + * + * This is the specialisation for the reference and reference_omp backends. + * + * @tparam Enabled boolean flag to enable/disable the function + * + * @param[in] mat Matrix CRS to print. + * @param[in] label Label to print before the matrix. + * @param[in] limit Max number of rows and columns to print (-1 for all). + * @param[in,out] os Output stream (optional; default is std::cout). + * + * \note The value -1 for \a limit refers to SIZE_MAX. 
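+ *
+ * \par Example
+ * An illustrative sketch (it assumes \a A has already been populated, for
+ * instance via grb::buildMatrixUnique):
+ * \code
+ * grb::Matrix< double > A( 4, 4 );
+ * // ... build A ...
+ * printCRS( A, "A" ); // dumps col_start, row_index, and values to std::cout
+ * \endcode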
+ */ +template< + bool Enabled = true, + typename D, typename RIT, typename CIT, typename NIT, + Backend implementation +> +void printCRS( + const Matrix< D, implementation, RIT, CIT, NIT > &mat, + const std::string &label = "", + const size_t limit = 128, + std::ostream &os = std::cout, + const typename std::enable_if< + implementation == reference || + implementation == reference_omp, + void >::type * const = nullptr +) { + constexpr const size_t smax = std::numeric_limits< size_t >::max(); + if( !Enabled ) { return; } + if( limit < smax && (nrows( mat ) > limit || ncols( mat ) > limit) ) { return; } + + const grb::RC rc = grb::wait( mat ); + if( rc != grb::SUCCESS ) { + throw std::runtime_error( grb::toString( rc ) ); + } + os << "CRS \"" << label + << "\" (" << nrows( mat ) << "x" << ncols( mat ) << "):\n"; + printCompressedStorage< D >( + internal::getCRS( mat ), + grb::nrows( mat ), + grb::nnz( mat ), + os + ); +} + +/** + * Print the CCS structure of a grb::Matrix. + * + * @tparam Enabled boolean flag to enable/disable the function. + * + * @param[in] mat Matrix CCS to print. + * @param[in] label Label to print before the matrix. + * @param[in] limit Max number of rows and columns to print (-1 for all). + * @param[in,out] os Output stream (optional, default is std::cout. + * + * \note The value -1 for \a limit refers to SIZE_MAX. + * + * \warning This function does \em not convert to CCS; if the implementing + * backend is not back by a CCS-like format, calling this function will + * not compile. + */ +template< + bool Enabled = true, + typename D, typename RIT, typename CIT, typename NIT, + Backend implementation +> +void printCCS( + const Matrix< D, implementation, RIT, CIT, NIT > &mat, + const std::string &label = "", + const size_t limit = 128, + std::ostream &os = std::cout, + const typename std::enable_if< + implementation != reference && + implementation != reference_omp, + void >::type * const = nullptr +) { + static_assert( + implementation != reference && + implementation != reference_omp, + "printCCS() is only available for reference and reference_omp backends" + ); +} + +/** + * Print the CCS structure of a grb::Matrix. + * + * This is the specialisation for the reference and reference_omp backends. + * + * @tparam Enabled boolean flag to enable/disable the function. + * + * @param[in] mat Matrix CCS to print. + * @param[in] label Label to print before the matrix. + * @param[in] limit Max number of rows and columns to print (-1 for all). + * @param[in,out] os Output stream (optional, default is std::cout. + * + * \note The value -1 for \a limit refers to SIZE_MAX. 
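+ *
+ * \note In the printed CCS dump, \a col_start is indexed by column, while
+ *       \a row_index lists the row of each nonzero within that column.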
+ */ +template< + bool Enabled = true, + typename D, typename RIT, typename CIT, typename NIT, + Backend implementation +> +void printCCS( + const Matrix< D, implementation, RIT, CIT, NIT > &mat, + const std::string &label = "", + const size_t limit = 128, + std::ostream &os = std::cout, + const typename std::enable_if< + implementation == reference || + implementation == reference_omp, + void >::type * const = nullptr +) { + constexpr const size_t smax = std::numeric_limits< size_t >::max(); + if( !Enabled ) { return; } + + const long rows = static_cast< long >( nrows( mat ) ); + const long cols = static_cast< long >( ncols( mat ) ); + if( limit < smax && (rows > limit || cols > limit) ) { return; } + + const grb::RC rc = grb::wait( mat ); + if( rc != grb::SUCCESS ) { + throw std::runtime_error( grb::toString( rc ) ); + } + os << "CCS \"" << label + << "\" (" << nrows( mat ) << "x" << ncols( mat ) << "):\n" ; + printCompressedStorage< D >( + internal::getCCS( mat ), + grb::ncols( mat ), + grb::nnz( mat ), + os + ); +} + #endif // _H_TEST_UTILS_PRINT_VEC_MAT From 95e87742c6d816b72fa04b8e9d87be744989355d Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Thu, 2 Nov 2023 13:36:13 +0100 Subject: [PATCH 23/37] Clean blas3.hpp --- include/graphblas/reference/blas3.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 1f382f017..8c17995e7 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1513,7 +1513,6 @@ namespace grb { const auto j = coors1.index( k ); const auto A_val = getValue(vbuf1, j, identity_A); const auto B_val = coors2.assigned(j) ? getValue(vbuf2, j, identity_B) : identity_B; - std::cout << " * (" << i << ", " << j << ") = " << A_val << " " << B_val << "\n"; OutputType result_value; (void)grb::apply( result_value, A_val, B_val, oper ); @@ -1543,7 +1542,6 @@ namespace grb { } const auto A_val = coors1.assigned(j) ? getValue(vbuf1, j, identity_A) : identity_A; const auto B_val = getValue(vbuf2, j, identity_B); - std::cout << " # (" << i << ", " << j << ") = " << A_val << " " << B_val << "\n"; OutputType result_value; (void)grb::apply( result_value, A_val, B_val, oper ); From f58de690ff843524e8600d07388e43d98b77ffca Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Tue, 28 Nov 2023 13:49:49 +0100 Subject: [PATCH 24/37] Refactor test to support bsp1d --- tests/unit/eWiseApplyMatrix_variants.cpp | 188 ++++++++++++----------- 1 file changed, 99 insertions(+), 89 deletions(-) diff --git a/tests/unit/eWiseApplyMatrix_variants.cpp b/tests/unit/eWiseApplyMatrix_variants.cpp index cab4b4ec7..71875d755 100644 --- a/tests/unit/eWiseApplyMatrix_variants.cpp +++ b/tests/unit/eWiseApplyMatrix_variants.cpp @@ -171,25 +171,7 @@ void grb_program( const input_t< Monoid, descr > &input, output_t &output ) { rc = SUCCESS; } -int main( int argc, char ** argv ) { - (void) argc; - (void) argv; - - size_t N = 10; - - if( argc > 2 ) { - std::cout << "Usage: " << argv[ 0 ] << " [n=" << N << "]" << std::endl; - return 1; - } - if( argc == 2 ) { - N = std::stoul( argv[ 1 ] ); - } - - std::cout << "This is functional test " << argv[ 0 ] << std::endl << std::flush; - - Launcher< AUTOMATIC > launcher; - - // Create input data +void test_program( const size_t& N, size_t& ) { /** Matrix A: Row matrix filled with A_INITIAL_VALUE * X X X X X * _ _ _ _ _ @@ -198,14 +180,19 @@ int main( int argc, char ** argv ) { * _ _ _ _ _ * (...) 
*/ - Matrix< nz_type > A( N, N, N ); + Matrix< nz_type > A( N, N ); + grb::resize( A, N ); { std::vector< size_t > A_rows( N, 0 ), A_cols( N, 0 ); std::vector< nz_type > A_values( N, A_INITIAL_VALUE ); std::iota( A_cols.begin(), A_cols.end(), 0 ); - if( SUCCESS != + if( + SUCCESS != buildMatrixUnique( A, A_rows.data(), A_cols.data(), A_values.data(), A_values.size(), SEQUENTIAL ) - ) { return 2; } + ) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + "Test FAILED: buildMatrixUnique" ); + } } /** Matrix B: Column matrix filled with B_INITIAL_VALUE @@ -221,9 +208,13 @@ int main( int argc, char ** argv ) { std::vector< size_t > B_rows( N, 0 ), B_cols( N, 0 ); std::vector< nz_type > B_values( N, B_INITIAL_VALUE ); std::iota( B_rows.begin(), B_rows.end(), 0 ); - if( SUCCESS != - buildMatrixUnique( B, B_rows.data(), B_cols.data(), B_values.data(), B_values.size(), SEQUENTIAL ) - ) { return 3; } + if( + SUCCESS != + buildMatrixUnique( B, B_rows.data(), B_cols.data(), B_values.data(), B_values.size(), SEQUENTIAL + ) ) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + "Test FAILED: buildMatrixUnique" ); + } } { // C = A .+ B @@ -246,15 +237,18 @@ int main( int argc, char ** argv ) { std::fill( C_monoid_truth_values.begin() + 1, C_monoid_truth_values.begin() + nrows( A ), A_INITIAL_VALUE ); std::fill( C_monoid_truth_values.begin() + nrows( A ), C_monoid_truth_values.end(), B_INITIAL_VALUE ); if( SUCCESS != - buildMatrixUnique( - C_monoid_truth, - C_monoid_truth_rows.data(), - C_monoid_truth_cols.data(), - C_monoid_truth_values.data(), - C_monoid_truth_values.size(), - SEQUENTIAL - ) - ) { return 4; } + buildMatrixUnique( + C_monoid_truth, + C_monoid_truth_rows.data(), + C_monoid_truth_cols.data(), + C_monoid_truth_values.data(), + C_monoid_truth_values.size(), + SEQUENTIAL + ) + ) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + "Test FAILED: buildMatrixUnique" ); + } /** Matrix C_op_truth: Intersection of A and B * X+Y ___ ___ ___ ___ @@ -268,30 +262,28 @@ int main( int argc, char ** argv ) { std::vector< size_t > C_op_truth_rows( 1, 0 ), C_op_truth_cols( 1, 0 ); std::vector< nz_type > C_op_truth_values( 1, A_INITIAL_VALUE + B_INITIAL_VALUE ); if( SUCCESS != - buildMatrixUnique( - C_op_truth, - C_op_truth_rows.data(), - C_op_truth_cols.data(), - C_op_truth_values.data(), - C_op_truth_values.size(), - SEQUENTIAL - ) - ) { return 5; } + buildMatrixUnique( + C_op_truth, + C_op_truth_rows.data(), + C_op_truth_cols.data(), + C_op_truth_values.data(), + C_op_truth_values.size(), + SEQUENTIAL + ) + ) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + "Test FAILED: buildMatrixUnique" ); + } input_t< - Monoid< operators::add< nz_type >, identities::zero > + Monoid< operators::add< nz_type >, identities::zero > > input { A, B, C_monoid_truth, C_op_truth }; output_t output { SUCCESS }; // Run the test - RC rc = launcher.exec( &grb_program, input, output, false ); - // Check the result - if( rc != SUCCESS ) { - std::cerr << "Error: Launcher::exec\n"; - return 6; - } + grb_program(input, output ); if( output.rc != SUCCESS ) { - std::cerr << "Test FAILED (" << toString( output.rc ) << ")" << std::endl; - return 7; + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + "Test FAILED (" + toString( output.rc ) + ")" ); } } @@ -311,30 +303,27 @@ int main( int argc, char ** argv ) { std::vector< nz_type > C_truth_values( nvalues, A_INITIAL_VALUE+A_INITIAL_VALUE ); std::iota( C_truth_cols.begin(), C_truth_cols.end(), 0 ); if( 
SUCCESS != - buildMatrixUnique( - C_truth, - C_truth_rows.data(), - C_truth_cols.data(), - C_truth_values.data(), - C_truth_values.size(), - SEQUENTIAL - ) - ) { return 8; } + buildMatrixUnique( + C_truth, + C_truth_rows.data(), + C_truth_cols.data(), + C_truth_values.data(), + C_truth_values.size(), + SEQUENTIAL + )) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + "Test FAILED: buildMatrixUnique" ); + } input_t< - Monoid< operators::add< nz_type >, identities::zero > + Monoid< operators::add< nz_type >, identities::zero > > input { A, A, C_truth, C_truth }; output_t output { SUCCESS }; // Run the test - RC rc = launcher.exec( &grb_program, input, output, false ); - // Check the result - if( rc != SUCCESS ) { - std::cerr << "Error: Launcher::exec\n"; - return 9; - } + grb_program(input, output ); if( output.rc != SUCCESS ) { - std::cerr << "Test FAILED (" << toString( output.rc ) << ")" << std::endl; - return 10; + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + "Test FAILED (" + toString( output.rc ) + ")" ); } } @@ -354,33 +343,54 @@ int main( int argc, char ** argv ) { std::vector< nz_type > C_truth_values( nvalues, A_INITIAL_VALUE+B_INITIAL_VALUE ); std::iota( C_truth_cols.begin(), C_truth_cols.end(), 0 ); if( SUCCESS != - buildMatrixUnique( - C_truth, - C_truth_rows.data(), - C_truth_cols.data(), - C_truth_values.data(), - C_truth_values.size(), - SEQUENTIAL - ) - ) { return 8; } + buildMatrixUnique( + C_truth, + C_truth_rows.data(), + C_truth_cols.data(), + C_truth_values.data(), + C_truth_values.size(), + SEQUENTIAL + )) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + "Test FAILED: buildMatrixUnique" ); + } input_t< - Monoid< operators::add< nz_type >, identities::zero >, - descriptors::transpose_right + Monoid< operators::add< nz_type >, identities::zero >, + descriptors::transpose_right > input { A, B, C_truth, C_truth }; output_t output { SUCCESS }; // Run the test - RC rc = launcher.exec( &grb_program, input, output, false ); - // Check the result - if( rc != SUCCESS ) { - std::cerr << "Error: Launcher::exec\n"; - return 9; - } + grb_program(input, output ); if( output.rc != SUCCESS ) { - std::cerr << "Test FAILED (" << toString( output.rc ) << ")" << std::endl; - return 10; + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + "Test FAILED (" + toString( output.rc ) + ")" ); } } +} + +int main( int argc, char ** argv ) { + (void) argc; + (void) argv; + + std::cerr << __func__ << " is not implemented yet" << std::endl; + + size_t N = 10; + + if( argc > 2 ) { + std::cout << "Usage: " << argv[ 0 ] << " [n=" << N << "]" << std::endl; + return 1; + } + if( argc == 2 ) { + N = std::stoul( argv[ 1 ] ); + } + + std::cout << "This is functional test " << argv[ 0 ] << std::endl << std::flush; + + Launcher< AUTOMATIC > launcher; + + // Create input data + RC rc = launcher.exec( &test_program, N, N, false ); std::cerr << std::flush; std::cout << "Test OK" << std::endl << std::flush; From d0cecde6ae98fc02b1c4487531738926664ab81b Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Tue, 28 Nov 2023 13:58:26 +0100 Subject: [PATCH 25/37] Simplify bsp1d API --- include/graphblas/bsp1d/blas3.hpp | 112 ++++++++++++------------------ 1 file changed, 45 insertions(+), 67 deletions(-) diff --git a/include/graphblas/bsp1d/blas3.hpp b/include/graphblas/bsp1d/blas3.hpp index 386beb164..1cf46e98e 100644 --- a/include/graphblas/bsp1d/blas3.hpp +++ b/include/graphblas/bsp1d/blas3.hpp @@ -119,89 +119,67 @@ namespace grb { /** \internal 
Simply delegates to process-local backend */ template< - Descriptor descr = descriptors::no_operation, - class MulMonoid, - typename OutputType, typename InputType1, typename InputType2, - typename RIT1, typename CIT1, typename NIT1, - typename RIT2, typename CIT2, typename NIT2, - typename RIT3, typename CIT3, typename NIT3 + Descriptor descr = descriptors::no_operation, + class MulMonoid, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 > RC eWiseApply( - Matrix< OutputType, BSP1D, RIT1, CIT1, NIT1 > &C, - const Matrix< InputType1, BSP1D, RIT2, CIT2, NIT2 > &A, - const Matrix< InputType2, BSP1D, RIT3, CIT3, NIT3 > &B, - const MulMonoid &mul, - const Phase phase = EXECUTE, - const typename std::enable_if< - !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_monoid< MulMonoid >::value, - void >::type * const = nullptr + Matrix< OutputType, BSP1D, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, BSP1D, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, BSP1D, RIT3, CIT3, NIT3 > &B, + const MulMonoid &mul, + const Phase phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< MulMonoid >::value, + void >::type * const = nullptr ) { assert( phase != TRY ); - RC local_rc = SUCCESS; - if( phase == RESIZE ) { - RC ret = eWiseApply< descr >( - internal::getLocal( C ), - internal::getLocal( A ), internal::getLocal( B ), - mul, - RESIZE - ); - if( collectives<>::allreduce( ret, operators::any_or< RC >() ) != SUCCESS ) { - return PANIC; - } else { - return ret; - } - } else { - assert( phase == EXECUTE ); - local_rc = eWiseApply< descr >( + RC ret = eWiseApply< descr >( internal::getLocal( C ), - internal::getLocal( A ), internal::getLocal( B ), + internal::getLocal( A ), + internal::getLocal( B ), mul, - EXECUTE - ); - } - return internal::checkGlobalErrorStateOrClear( C, local_rc ); + phase + ); + return internal::checkGlobalErrorStateOrClear( C, ret ); } /** \internal Simply delegates to process-local backend */ template< - Descriptor descr = descriptors::no_operation, - class Operator, - typename OutputType, typename InputType1, typename InputType2, - typename RIT1, typename CIT1, typename NIT1, - typename RIT2, typename CIT2, typename NIT2, - typename RIT3, typename CIT3, typename NIT3 + Descriptor descr = descriptors::no_operation, + class Operator, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 > RC eWiseApply( - Matrix< OutputType, BSP1D, RIT1, CIT1, NIT1 > &C, - const Matrix< InputType1, BSP1D, RIT2, CIT2, NIT2 > &A, - const Matrix< InputType2, BSP1D, RIT3, CIT3, NIT3 > &B, - const Operator &op, - const Phase phase = EXECUTE, - const typename std::enable_if< - !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_operator< Operator >::value, - void >::type * const = nullptr + Matrix< OutputType, BSP1D, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, BSP1D, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, BSP1D, RIT3, CIT3, NIT3 > &B, + const Operator &op, + const Phase phase = 
EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< Operator >::value, + void >::type * const = nullptr ) { assert( phase != TRY ); RC ret = eWiseApply< descr >( - internal::getLocal( C ), - internal::getLocal( A ), internal::getLocal( B ), - op, - phase + internal::getLocal( C ), + internal::getLocal( A ), + internal::getLocal( B ), + op, + phase ); - if( phase == RESIZE ) { - if( collectives<>::allreduce( ret, operators::any_or< RC >() ) != SUCCESS ) { - return PANIC; - } else { - return SUCCESS; - } - } - assert( phase == EXECUTE ); return internal::checkGlobalErrorStateOrClear( C, ret ); } From c524470b3e2c035885eeeca438908db471357c88 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Mon, 4 Dec 2023 14:03:23 +0100 Subject: [PATCH 26/37] Syntax arrangements --- include/graphblas/bsp1d/blas3.hpp | 88 ++++++++++++------------ include/graphblas/reference/blas3.hpp | 66 +++++++++--------- tests/unit/eWiseApplyMatrix_variants.cpp | 47 ++++++------- 3 files changed, 101 insertions(+), 100 deletions(-) diff --git a/include/graphblas/bsp1d/blas3.hpp b/include/graphblas/bsp1d/blas3.hpp index 1cf46e98e..0ddbdf41b 100644 --- a/include/graphblas/bsp1d/blas3.hpp +++ b/include/graphblas/bsp1d/blas3.hpp @@ -119,66 +119,66 @@ namespace grb { /** \internal Simply delegates to process-local backend */ template< - Descriptor descr = descriptors::no_operation, - class MulMonoid, - typename OutputType, typename InputType1, typename InputType2, - typename RIT1, typename CIT1, typename NIT1, - typename RIT2, typename CIT2, typename NIT2, - typename RIT3, typename CIT3, typename NIT3 + Descriptor descr = descriptors::no_operation, + class MulMonoid, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 > RC eWiseApply( - Matrix< OutputType, BSP1D, RIT1, CIT1, NIT1 > &C, - const Matrix< InputType1, BSP1D, RIT2, CIT2, NIT2 > &A, - const Matrix< InputType2, BSP1D, RIT3, CIT3, NIT3 > &B, - const MulMonoid &mul, - const Phase phase = EXECUTE, - const typename std::enable_if< - !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_monoid< MulMonoid >::value, - void >::type * const = nullptr + Matrix< OutputType, BSP1D, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, BSP1D, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, BSP1D, RIT3, CIT3, NIT3 > &B, + const MulMonoid &mul, + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< MulMonoid >::value, void + >::type * const = nullptr ) { assert( phase != TRY ); RC ret = eWiseApply< descr >( - internal::getLocal( C ), - internal::getLocal( A ), - internal::getLocal( B ), - mul, - phase + internal::getLocal( C ), + internal::getLocal( A ), + internal::getLocal( B ), + mul, + phase ); return internal::checkGlobalErrorStateOrClear( C, ret ); } /** \internal Simply delegates to process-local backend */ template< - Descriptor descr = descriptors::no_operation, - class Operator, - typename OutputType, typename InputType1, typename InputType2, - typename RIT1, typename CIT1, typename NIT1, - typename RIT2, typename CIT2, typename NIT2, - typename RIT3, 
typename CIT3, typename NIT3 + Descriptor descr = descriptors::no_operation, + class Operator, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 > RC eWiseApply( - Matrix< OutputType, BSP1D, RIT1, CIT1, NIT1 > &C, - const Matrix< InputType1, BSP1D, RIT2, CIT2, NIT2 > &A, - const Matrix< InputType2, BSP1D, RIT3, CIT3, NIT3 > &B, - const Operator &op, - const Phase phase = EXECUTE, - const typename std::enable_if< - !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_operator< Operator >::value, - void >::type * const = nullptr + Matrix< OutputType, BSP1D, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, BSP1D, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, BSP1D, RIT3, CIT3, NIT3 > &B, + const Operator &op, + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< Operator >::value, void + >::type * const = nullptr ) { assert( phase != TRY ); RC ret = eWiseApply< descr >( - internal::getLocal( C ), - internal::getLocal( A ), - internal::getLocal( B ), - op, - phase + internal::getLocal( C ), + internal::getLocal( A ), + internal::getLocal( B ), + op, + phase ); return internal::checkGlobalErrorStateOrClear( C, ret ); } diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 8c17995e7..219bd93d4 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -961,12 +961,12 @@ namespace grb { const Operator &oper, const Phase &phase, const typename std::enable_if< - !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_operator< Operator >::value, - void - >::type * const = nullptr + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< Operator >::value, + void + >::type * const = nullptr ) { #ifdef _DEBUG std::cout << "In grb::internal::eWiseApply_matrix_generic_intersection\n"; @@ -1264,11 +1264,12 @@ namespace grb { const Monoid &monoid, const Phase &phase, const typename std::enable_if< - !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_monoid< Monoid >::value, - void >::type * const = nullptr + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< Monoid >::value, + void + >::type * const = nullptr ) { #ifdef _DEBUG @@ -1623,11 +1624,11 @@ namespace grb { } // namespace internal /** - * Computes \f$ C = A . B \f$ for a given monoid. + * Computes \f$ C = A . B \f$ for a given monoid (union pattern). * * \internal Allows pattern matrix inputs. 
* - * \internal Dispatches to internal::eWiseApply_matrix_generic + * \internal Dispatches to internal::eWiseApply_matrix_generic_union */ template< Descriptor descr = descriptors::no_operation, @@ -1642,12 +1643,14 @@ namespace grb { const Matrix< InputType1, reference, RIT2, CIT2, NIT2 > &A, const Matrix< InputType2, reference, RIT3, CIT3, NIT3 > &B, const MulMonoid &mulmono, - const Phase phase = EXECUTE, - const typename std::enable_if< !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_monoid< MulMonoid >::value, - void >::type * const = nullptr + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< MulMonoid >::value, + void + >::type * const = nullptr ) { // static checks NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || @@ -1679,13 +1682,12 @@ namespace grb { } /** - * Computes \f$ C = A . B \f$ for a given binary operator. + * Computes \f$ C = A . B \f$ for a given operator (intersection pattern). * - * \internal Pattern matrices not allowed + * \internal Allows pattern matrix inputs. * - * \internal Dispatches to internal::eWiseApply_matrix_generic + * \internal Dispatches to internal::eWiseApply_matrix_generic_intersection */ - template< Descriptor descr = grb::descriptors::no_operation, class Operator, @@ -1698,13 +1700,15 @@ namespace grb { Matrix< OutputType, reference, RIT1, CIT1, NIT1 > &C, const Matrix< InputType1, reference, RIT2, CIT2, NIT2 > &A, const Matrix< InputType2, reference, RIT3, CIT3, NIT3 > &B, - const Operator &mulOp, - const Phase phase = EXECUTE, - const typename std::enable_if< !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_operator< Operator >::value, - void >::type * const = nullptr + const Operator &op, + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< Operator >::value, + void + >::type * const = nullptr ) { // static checks NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || @@ -1737,7 +1741,7 @@ namespace grb { #endif return internal::eWiseApply_matrix_generic_intersection< descr >( - C, A, B, mulOp, phase + C, A, B, op, phase ); } diff --git a/tests/unit/eWiseApplyMatrix_variants.cpp b/tests/unit/eWiseApplyMatrix_variants.cpp index 71875d755..5e92b7d3c 100644 --- a/tests/unit/eWiseApplyMatrix_variants.cpp +++ b/tests/unit/eWiseApplyMatrix_variants.cpp @@ -43,12 +43,13 @@ using namespace grb; - using nz_type = int; constexpr nz_type A_INITIAL_VALUE = 1; constexpr nz_type B_INITIAL_VALUE = 3; +// #define _DEBUG + template< typename D > bool equals_matrix( @@ -127,7 +128,6 @@ void grb_program( const input_t< Monoid, descr > &input, output_t &output ) { std::cerr << "Error: Phase::EXECUTE\n"; return; } - print_matrix( C, 10, "C (intersection)" ); if( !equals_matrix( C, input.C_operator ) ) { std::cerr << "Error: Wrong result\n"; rc = FAILED; @@ -158,7 +158,6 @@ void grb_program( const input_t< Monoid, descr > &input, output_t &output ) { std::cerr << "Error: Phase::EXECUTE\n"; return; } - print_matrix( C, 10, "C (union)" ); if( !equals_matrix( C, input.C_monoid ) ) { std::cerr << "Error: Wrong result\n"; rc = FAILED; @@ -180,8 +179,7 @@ void test_program( const 
size_t& N, size_t& ) { * _ _ _ _ _ * (...) */ - Matrix< nz_type > A( N, N ); - grb::resize( A, N ); + Matrix< nz_type > A( N, N, N ); { std::vector< size_t > A_rows( N, 0 ), A_cols( N, 0 ); std::vector< nz_type > A_values( N, A_INITIAL_VALUE ); @@ -208,10 +206,9 @@ void test_program( const size_t& N, size_t& ) { std::vector< size_t > B_rows( N, 0 ), B_cols( N, 0 ); std::vector< nz_type > B_values( N, B_INITIAL_VALUE ); std::iota( B_rows.begin(), B_rows.end(), 0 ); - if( - SUCCESS != - buildMatrixUnique( B, B_rows.data(), B_cols.data(), B_values.data(), B_values.size(), SEQUENTIAL - ) ) { + if( SUCCESS != + buildMatrixUnique( B, B_rows.data(), B_cols.data(), B_values.data(), B_values.size(), SEQUENTIAL) + ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) + "Test FAILED: buildMatrixUnique" ); } @@ -243,9 +240,8 @@ void test_program( const size_t& N, size_t& ) { C_monoid_truth_cols.data(), C_monoid_truth_values.data(), C_monoid_truth_values.size(), - SEQUENTIAL - ) - ) { + SEQUENTIAL) + ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) + "Test FAILED: buildMatrixUnique" ); } @@ -268,9 +264,8 @@ void test_program( const size_t& N, size_t& ) { C_op_truth_cols.data(), C_op_truth_values.data(), C_op_truth_values.size(), - SEQUENTIAL - ) - ) { + SEQUENTIAL) + ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) + "Test FAILED: buildMatrixUnique" ); } @@ -310,7 +305,8 @@ void test_program( const size_t& N, size_t& ) { C_truth_values.data(), C_truth_values.size(), SEQUENTIAL - )) { + ) + ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) + "Test FAILED: buildMatrixUnique" ); } @@ -350,7 +346,8 @@ void test_program( const size_t& N, size_t& ) { C_truth_values.data(), C_truth_values.size(), SEQUENTIAL - )) { + ) + ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) + "Test FAILED: buildMatrixUnique" ); } @@ -373,9 +370,7 @@ int main( int argc, char ** argv ) { (void) argc; (void) argv; - std::cerr << __func__ << " is not implemented yet" << std::endl; - - size_t N = 10; + size_t N = 1000; if( argc > 2 ) { std::cout << "Usage: " << argv[ 0 ] << " [n=" << N << "]" << std::endl; @@ -387,13 +382,15 @@ int main( int argc, char ** argv ) { std::cout << "This is functional test " << argv[ 0 ] << std::endl << std::flush; + // Launch the test Launcher< AUTOMATIC > launcher; - - // Create input data - RC rc = launcher.exec( &test_program, N, N, false ); + RC rc = launcher.exec( &test_program, N, N, true ); + if( rc != SUCCESS ) { + std::cout << "Test FAILED (" << grb::toString( rc ) << ")" << std::endl; + return static_cast( rc ); + } std::cerr << std::flush; - std::cout << "Test OK" << std::endl << std::flush; - + std::cout << std::flush << "Test OK" << std::endl; return 0; } From 5559540e11a8930f157ee59bcc1dd4b6f4e3845e Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Mon, 4 Dec 2023 14:43:13 +0100 Subject: [PATCH 27/37] Enabling and testing pattern inputs matrices --- include/graphblas/reference/blas3.hpp | 19 +-- tests/unit/eWiseApplyMatrix_variants.cpp | 182 +++++++++++++++++++---- 2 files changed, 162 insertions(+), 39 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 219bd93d4..e47daf074 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -972,14 +972,7 @@ namespace grb { std::cout << "In grb::internal::eWiseApply_matrix_generic_intersection\n"; #endif assert( phase != TRY ); - static_assert( - !( - std::is_same< InputType1, void 
>::value || - std::is_same< InputType2, void >::value - ), - "grb::internal::eWiseApply_matrix_generic_intersection: the non-monoid" - " version of elementwise mxm can only be used if neither of the" - " input matrices is a pattern matrix (of type void)" ); + constexpr bool crs_only = descr & descriptors::force_row_major; // get whether the matrices should be transposed prior to execution constexpr bool trans_left = descr & descriptors::transpose_left; @@ -1729,12 +1722,10 @@ namespace grb { "called with an output matrix C that does not match the output domain " "of the given multiplication operator" ); - static_assert( ( !( - std::is_same< InputType1, void >::value || - std::is_same< InputType2, void >::value ) - ), "grb::eWiseApply (reference, matrix <- matrix x matrix, operator): " - "the operator version of eWiseApply cannot be used if either of the " - "input matrices is a pattern matrix (of type void)" + static_assert( + !std::is_void< OutputType >::value, + "grb::eWiseApply: the elementwise mxm cannot be used if the" + " output matrix is a pattern matrix (of type void)" ); #ifdef _DEBUG std::cout << "In grb::eWiseApply( reference, operator )\n"; diff --git a/tests/unit/eWiseApplyMatrix_variants.cpp b/tests/unit/eWiseApplyMatrix_variants.cpp index 5e92b7d3c..8b8cc7437 100644 --- a/tests/unit/eWiseApplyMatrix_variants.cpp +++ b/tests/unit/eWiseApplyMatrix_variants.cpp @@ -75,19 +75,26 @@ bool equals_matrix( return std::is_permutation( A_vec.cbegin(), A_vec.cend(), B_vec.cbegin() ); } -template< class Monoid, Descriptor descr = descriptors::no_operation > + +template< + class Monoid, + typename ValueTypeA, + typename ValueTypeB, + typename ValueTypeC, + Descriptor descr = descriptors::no_operation +> struct input_t { - const Matrix< nz_type > &A; - const Matrix< nz_type > &B; - const Matrix< nz_type > &C_monoid; - const Matrix< nz_type > &C_operator; + const Matrix< ValueTypeA > &A; + const Matrix< ValueTypeB > &B; + const Matrix< ValueTypeC > &C_monoid; + const Matrix< ValueTypeC > &C_operator; const Monoid &monoid; input_t( - const Matrix< nz_type > &A = {0,0}, - const Matrix< nz_type > &B = {0,0}, - const Matrix< nz_type > &C_monoid = {0,0}, - const Matrix< nz_type > &C_operator = {0,0}, + const Matrix< ValueTypeA > &A = {0,0}, + const Matrix< ValueTypeB > &B = {0,0}, + const Matrix< ValueTypeC > &C_monoid = {0,0}, + const Matrix< ValueTypeC > &C_operator = {0,0}, const Monoid &monoid = Monoid() ) : A( A ), B( B ), @@ -96,12 +103,22 @@ struct input_t { monoid( monoid ) {} }; + struct output_t { RC rc; }; -template< class Monoid, Descriptor descr > -void grb_program( const input_t< Monoid, descr > &input, output_t &output ) { +template< + class Monoid, + typename ValueTypeA, + typename ValueTypeB, + typename ValueTypeC, + Descriptor descr +> +void grb_program( + const input_t< Monoid, ValueTypeA, ValueTypeB, ValueTypeC, descr > &input, + output_t &output +) { static_assert( is_monoid< Monoid >::value, "Monoid required" ); const auto &op = input.monoid.getOperator(); @@ -110,7 +127,7 @@ void grb_program( const input_t< Monoid, descr > &input, output_t &output ) { { // Operator variant std::cout << " -- eWiseApply using Operator, supposed to be" << " annihilating non-zeroes -> INTERSECTION\n"; - Matrix< nz_type > C( nrows( input.A ), ncols( input.A ) ); + Matrix< ValueTypeC > C( nrows( input.A ), ncols( input.A ) ); rc = eWiseApply( C, input.A, input.B, op, RESIZE ); if( rc != SUCCESS ) { @@ -140,7 +157,7 @@ void grb_program( const input_t< Monoid, descr > &input, output_t &output ) { 
{ // Monoid variant std::cout << " -- eWiseApply using Monoid, supposed to consider" << " non-zeroes as the identity -> UNION\n"; - Matrix< nz_type > C( nrows( input.A ), ncols( input.A ) ); + Matrix< ValueTypeC > C( nrows( input.A ), ncols( input.A ) ); rc = eWiseApply( C, input.A, input.B, input.monoid, RESIZE ); if( rc != SUCCESS ) { @@ -171,7 +188,7 @@ void grb_program( const input_t< Monoid, descr > &input, output_t &output ) { } void test_program( const size_t& N, size_t& ) { - /** Matrix A: Row matrix filled with A_INITIAL_VALUE + /** Matrix A: Matrix filled with A_INITIAL_VALUE * X X X X X * _ _ _ _ _ * _ _ _ _ _ (...) @@ -180,6 +197,7 @@ void test_program( const size_t& N, size_t& ) { * (...) */ Matrix< nz_type > A( N, N, N ); + Matrix< void > A_void( N, N, N ); { std::vector< size_t > A_rows( N, 0 ), A_cols( N, 0 ); std::vector< nz_type > A_values( N, A_INITIAL_VALUE ); @@ -189,10 +207,18 @@ void test_program( const size_t& N, size_t& ) { buildMatrixUnique( A, A_rows.data(), A_cols.data(), A_values.data(), A_values.size(), SEQUENTIAL ) ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) - + "Test FAILED: buildMatrixUnique" ); + + ": Test FAILED: buildMatrixUnique" ); + } + if( + SUCCESS != + buildMatrixUnique( A_void, A_rows.data(), A_cols.data(), A_rows.size(), SEQUENTIAL ) + ) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + ": Test FAILED: buildMatrixUnique" ); } } + /** Matrix B: Column matrix filled with B_INITIAL_VALUE * Y _ _ _ _ * Y _ _ _ _ @@ -202,6 +228,7 @@ void test_program( const size_t& N, size_t& ) { * (...) */ Matrix< nz_type > B( N, N, N ); + Matrix< void > B_void( N, N, N ); { std::vector< size_t > B_rows( N, 0 ), B_cols( N, 0 ); std::vector< nz_type > B_values( N, B_INITIAL_VALUE ); @@ -210,7 +237,14 @@ void test_program( const size_t& N, size_t& ) { buildMatrixUnique( B, B_rows.data(), B_cols.data(), B_values.data(), B_values.size(), SEQUENTIAL) ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) - + "Test FAILED: buildMatrixUnique" ); + + ": Test FAILED: buildMatrixUnique" ); + } + if( + SUCCESS != + buildMatrixUnique( B_void, B_rows.data(), B_cols.data(), B_rows.size(), SEQUENTIAL ) + ) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + ": Test FAILED: buildMatrixUnique" ); } } @@ -243,7 +277,7 @@ void test_program( const size_t& N, size_t& ) { SEQUENTIAL) ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) - + "Test FAILED: buildMatrixUnique" ); + + ": Test FAILED: buildMatrixUnique" ); } /** Matrix C_op_truth: Intersection of A and B @@ -267,18 +301,19 @@ void test_program( const size_t& N, size_t& ) { SEQUENTIAL) ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) - + "Test FAILED: buildMatrixUnique" ); + + ": Test FAILED: buildMatrixUnique" ); } input_t< - Monoid< operators::add< nz_type >, identities::zero > + Monoid< operators::add< nz_type >, identities::zero >, + nz_type, nz_type, nz_type > input { A, B, C_monoid_truth, C_op_truth }; output_t output { SUCCESS }; // Run the test grb_program(input, output ); if( output.rc != SUCCESS ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) - + "Test FAILED (" + toString( output.rc ) + ")" ); + + ": Test FAILED (" + toString( output.rc ) + ")" ); } } @@ -308,21 +343,115 @@ void test_program( const size_t& N, size_t& ) { ) ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) - + "Test FAILED: buildMatrixUnique" ); + + ": Test FAILED: buildMatrixUnique" ); } input_t< - Monoid< operators::add< 
nz_type >, identities::zero > + Monoid< operators::add< nz_type >, identities::zero >, + nz_type, nz_type, nz_type > input { A, A, C_truth, C_truth }; output_t output { SUCCESS }; // Run the test grb_program(input, output ); if( output.rc != SUCCESS ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) - + "Test FAILED (" + toString( output.rc ) + ")" ); + + ": Test FAILED (" + toString( output.rc ) + ")" ); } } + { // C = A .+ A(void) + std::cout << "-- Test C = A .+ A(void)\n"; + /** Matrix C_truth: Union/intersection of A and A + * X+0 X+0 X+0 X+0 X+0 + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___(...) + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___ + * (...) + */ + const Matrix< nz_type >& C_truth = A; + + input_t< + Monoid< operators::add< nz_type >, identities::zero >, + nz_type, void, nz_type + > input { A, A_void, C_truth, C_truth }; + output_t output { SUCCESS }; + // Run the test + grb_program(input, output ); + if( output.rc != SUCCESS ) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + ": Test FAILED (" + toString( output.rc ) + ")" ); + } + } + + { // C = A(void) .+ A + std::cout << "-- Test C = A(void) .+ A\n"; + /** Matrix C_truth: Union/intersection of A and A + * 0+X 0+X 0+X 0+X 0+X + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___(...) + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___ + * (...) + */ + const Matrix< nz_type >& C_truth = A; + + input_t< + Monoid< operators::add< nz_type >, identities::zero >, + void, nz_type, nz_type + > input { A_void, A, C_truth, C_truth }; + output_t output { SUCCESS }; + // Run the test + grb_program(input, output ); + if( output.rc != SUCCESS ) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + ": Test FAILED (" + toString( output.rc ) + ")" ); + } + } + + { // C = A(void) .+ A + std::cout << "-- Test C = A(void) .+ A(void)\n"; + /** Matrix C_truth: Union/intersection of A and A + * 0+0 0+0 0+0 0+0 0+0 + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___(...) + * ___ ___ ___ ___ ___ + * ___ ___ ___ ___ ___ + * (...) 
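+		 * (both inputs are pattern matrices and carry no values, so each
+		 *  expected result value below is 0 + 0 = 0)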
+ */ + Matrix< nz_type > C_truth( N, N ); + size_t nvalues = ncols( A ); + std::vector< size_t > C_truth_rows( nvalues, 0 ), C_truth_cols( nvalues, 0 ); + std::vector< nz_type > C_truth_values( nvalues, 0 ); + std::iota( C_truth_cols.begin(), C_truth_cols.end(), 0 ); + if( SUCCESS != + buildMatrixUnique( + C_truth, + C_truth_rows.data(), + C_truth_cols.data(), + C_truth_values.data(), + C_truth_values.size(), + SEQUENTIAL + ) + ) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + ": Test FAILED: buildMatrixUnique" ); + } + + input_t< + Monoid< operators::add< nz_type >, identities::zero >, + void, void, nz_type + > input { A_void, A_void, C_truth, C_truth }; + output_t output { SUCCESS }; + // Run the test + grb_program(input, output ); + if( output.rc != SUCCESS ) { + throw std::runtime_error("(LINE " + std::to_string(__LINE__) + + ": Test FAILED (" + toString( output.rc ) + ")" ); + } + } + + { // C = A .+ Bt std::cout << "-- Test C = A .+ Bt\n"; /** Matrix C_truth: Union/intersection of A and Bt @@ -349,11 +478,12 @@ void test_program( const size_t& N, size_t& ) { ) ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) - + "Test FAILED: buildMatrixUnique" ); + + ": Test FAILED: buildMatrixUnique" ); } input_t< Monoid< operators::add< nz_type >, identities::zero >, + nz_type, nz_type, nz_type, descriptors::transpose_right > input { A, B, C_truth, C_truth }; output_t output { SUCCESS }; @@ -361,9 +491,11 @@ void test_program( const size_t& N, size_t& ) { grb_program(input, output ); if( output.rc != SUCCESS ) { throw std::runtime_error("(LINE " + std::to_string(__LINE__) - + "Test FAILED (" + toString( output.rc ) + ")" ); + + ": Test FAILED (" + toString( output.rc ) + ")" ); } } + + } int main( int argc, char ** argv ) { From fb4e14115526228ffa593e5d38a59a6be5500a3c Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Mon, 4 Dec 2023 14:53:22 +0100 Subject: [PATCH 28/37] Redundancy removal in static assertions --- include/graphblas/nonblocking/blas3.hpp | 26 ------------------------- include/graphblas/reference/blas3.hpp | 6 ++++-- 2 files changed, 4 insertions(+), 28 deletions(-) diff --git a/include/graphblas/nonblocking/blas3.hpp b/include/graphblas/nonblocking/blas3.hpp index 5ecb1fffa..8b6cd732b 100644 --- a/include/graphblas/nonblocking/blas3.hpp +++ b/include/graphblas/nonblocking/blas3.hpp @@ -500,32 +500,6 @@ namespace grb { grb::is_operator< Operator >::value, void >::type * const = nullptr ) { - // static checks - NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || - std::is_same< typename Operator::D1, InputType1 >::value ), - "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator)", - "called with a prefactor input matrix A that does not match the first " - "domain of the given multiplication operator" - ); - NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || - std::is_same< typename Operator::D2, InputType2 >::value ), - "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator)", - "called with a postfactor input matrix B that does not match the first " - "domain of the given multiplication operator" - ); - NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || - std::is_same< typename Operator::D3, OutputType >::value ), - "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator)", - "called with an output matrix C that does not match the output domain " - "of the given multiplication operator" - ); - static_assert( ( !( - std::is_same< InputType1, void >::value || - std::is_same< InputType2, void 
>::value ) - ), "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator): " - "the operator version of eWiseApply cannot be used if either of the " - "input matrices is a pattern matrix (of type void)" - ); if( internal::NONBLOCKING::warn_if_not_native && config::PIPELINE::warn_if_not_native ) { std::cerr << "Warning: eWiseApply (nonblocking) currently delegates to a " << "blocking implementation.\n" diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index e47daf074..aafe136ef 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1703,15 +1703,17 @@ namespace grb { void >::type * const = nullptr ) { + typedef typename std::conditional::value, typename Operator::D1, InputType1>::type ActualInputType1; + typedef typename std::conditional::value, typename Operator::D2, InputType1>::type ActualInputType2; // static checks NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || - std::is_same< typename Operator::D1, InputType1 >::value ), + std::is_same< typename Operator::D1, ActualInputType1 >::value ), "grb::eWiseApply (reference, matrix <- matrix x matrix, operator)", "called with a prefactor input matrix A that does not match the first " "domain of the given multiplication operator" ); NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || - std::is_same< typename Operator::D2, InputType2 >::value ), + std::is_same< typename Operator::D2, ActualInputType2 >::value ), "grb::eWiseApply (reference, matrix <- matrix x matrix, operator)", "called with a postfactor input matrix B that does not match the first " "domain of the given multiplication operator" From 5eea2358a846dfa632dad98ff9b725ac4c06a9b0 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Fri, 8 Dec 2023 16:53:55 +0100 Subject: [PATCH 29/37] temporary bugfix for parallel coordinates assignments --- include/graphblas/reference/blas3.hpp | 85 ++------------------------- 1 file changed, 5 insertions(+), 80 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index e3efe56f6..d5d1eca1a 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1126,38 +1126,11 @@ namespace grb { for( size_t i = 0; i < m; ++i ) { coors1.clear(); -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel default(none) \ - shared(coors1, valbuf) \ - firstprivate(i, A_raw, dummy_identity) -#endif - { -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - auto local_update = coors1.EMPTY_UPDATE(); - const size_t maxAsyncAssigns = coors1.maxAsyncAssigns(); - size_t assigns = 0; - #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait -#endif - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; - -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - if( !coors1.asyncAssign( k_col, local_update ) ) { - valbuf[ k_col ] = A_raw.getValue( k, dummy_identity ); - if( ++assigns == maxAsyncAssigns ) { - coors1.joinUpdate( local_update ); - assigns = 0; - } - } -#else - if( !coors1.assign( k_col ) ) { - valbuf[ k_col ] = A_raw.getValue( k, dummy_identity ); - } -#endif + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; + if( !coors1.assign( k_col ) ) { + valbuf[ k_col ] = A_raw.getValue( k, dummy_identity ); } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - while( !coors1.joinUpdate( local_update ) ) {} -#endif } for( size_t l = B_raw.col_start[ i ]; l < 
B_raw.col_start[ i + 1 ]; ++l ) { @@ -1430,78 +1403,30 @@ namespace grb { } } - // do computations - nzc = 0; + std::cerr << "HERE\n"; CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { coors1.clear(); coors2.clear(); -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel default(none) \ - shared(coors1, vbuf1, coors2, vbuf2) \ - firstprivate(i, A_raw, identity_A, B_raw, identity_B ) -#endif - { -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - auto local_update1 = coors1.EMPTY_UPDATE(); - const size_t maxAsyncAssigns1 = coors1.maxAsyncAssigns(); - size_t assigns1 = 0; - #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait -#endif for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - if( !coors1.asyncAssign( k_col, local_update1 ) ) { - assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); - if( ++assigns1 == maxAsyncAssigns1 ) { - coors1.joinUpdate( local_update1 ); - assigns1 = 0; - } - } -#else if( !coors1.assign( k_col ) ) { assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); } -#endif } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - while( !coors1.joinUpdate( local_update1 )) {} -#endif - - -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - auto local_update2 = coors2.EMPTY_UPDATE(); - const size_t maxAsyncAssigns2 = coors2.maxAsyncAssigns(); - size_t assigns2 = 0; - #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait -#endif for( size_t k = B_raw.col_start[ i ]; k < B_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = B_raw.row_index[ k ]; -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - if( !coors2.asyncAssign( k_col, local_update2 ) ) { - assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); - if( ++assigns2 == maxAsyncAssigns2 ) { - coors2.joinUpdate( local_update2 ); - assigns2 = 0; - } - } -#else if( !coors2.assign( k_col ) ) { assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); } -#endif } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - while( !coors2.joinUpdate( local_update2 )) {} -#endif - } for( size_t k = 0; k < coors1.nonzeroes(); ++k ) { const auto j = coors1.index( k ); From 7872b79fdec8fba67003ada251f4acd1523efdea Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Wed, 17 Jan 2024 11:43:09 +0100 Subject: [PATCH 30/37] Re-enable parallel Coordinates iteration --- include/graphblas/reference/blas3.hpp | 129 ++++++++++++++++++++------ 1 file changed, 99 insertions(+), 30 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index d5d1eca1a..6d8406e5e 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -63,22 +63,22 @@ namespace grb::internal { template< typename D, typename T > - static inline void assignValue( + static void assignValue( D *array, size_t i, const T& value, typename std::enable_if< !std::is_void< D >::value >::type * const = nullptr ) { array[i] = value; } template< typename T > - static inline void assignValue( void *, size_t, const T& ) { /* do nothing */ } + static void assignValue( void *, size_t, const T& ) { /* do nothing */ } template< typename D, typename T > - static inline T getValue( + static T getValue( const D *array, size_t i, const T&, typename std::enable_if< !std::is_void< D >::value >::type * const = nullptr ) { return array[i]; } template< typename T > - static inline T getValue( const void *, size_t, const T& identity ) { return identity; } + static T getValue( const void *, size_t, const T& 
identity ) { return identity; } } // namespace grb::internal @@ -961,11 +961,10 @@ namespace grb { const Operator &oper, const Phase &phase, const typename std::enable_if< - !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_operator< Operator >::value, - void + !is_object< OutputType >::value && + !is_object< InputType1 >::value && + !is_object< InputType2 >::value && + is_operator< Operator >::value >::type * const = nullptr ) { #ifdef _DEBUG @@ -1013,8 +1012,7 @@ namespace grb { const auto dummy_identity = identities::zero< OutputType >::value(); // retrieve buffers - char * arr1, * arr3, * buf1, * buf3; - arr1 = buf1 = nullptr; + char * arr1 = nullptr, * arr3 = nullptr, * buf1 = nullptr, * buf3 = nullptr; InputType1 * vbuf1 = nullptr; OutputType * valbuf = nullptr; internal::getMatrixBuffers( arr1, buf1, vbuf1, 1, A ); @@ -1022,7 +1020,7 @@ namespace grb { // end buffer retrieval // initialisations - internal::Coordinates< reference > coors1; + Coordinates coors1; coors1.set( arr1, false, buf1, n ); if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 @@ -1066,8 +1064,7 @@ namespace grb { if( phase == EXECUTE ) { nzc = 0; // retrieve additional buffer - config::NonzeroIndexType * const C_col_index = internal::template - getReferenceBuffer< typename config::NonzeroIndexType >( n + 1 ); + auto* const C_col_index = getReferenceBuffer< config::NonzeroIndexType >( n + 1 ); if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 @@ -1126,11 +1123,38 @@ namespace grb { for( size_t i = 0; i < m; ++i ) { coors1.clear(); - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; - if( !coors1.assign( k_col ) ) { - valbuf[ k_col ] = A_raw.getValue( k, dummy_identity ); +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp parallel default(none) \ + shared(coors1, valbuf) \ + firstprivate(i, A_raw, dummy_identity) +#endif + { +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + auto local_update = coors1.EMPTY_UPDATE(); + const size_t maxAsyncAssigns = coors1.maxAsyncAssigns(); + size_t assigns = 0; + #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait +#endif + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; + +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + if( !coors1.asyncAssign( k_col, local_update ) ) { + assignValue( valbuf, k_col , A_raw.getValue( k, dummy_identity ) ); + if( ++assigns == maxAsyncAssigns ) { + coors1.joinUpdate( local_update ); + assigns = 0; + } + } +#else + if( !coors1.assign( k_col ) ) { + assignValue( valbuf, k_col, A_raw.getValue( k, dummy_identity ) ); + } +#endif } +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + while( !coors1.joinUpdate( local_update ) ) {} +#endif } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { @@ -1149,7 +1173,7 @@ namespace grb { // update CCS if( !crs_only ) { - C_col_index[ j ]++; + ++C_col_index[ j ]; const size_t CCS_index = CCS_raw.col_start[ j+1 ] - C_col_index[ j ]; CCS_raw.row_index[ CCS_index ] = i; CCS_raw.setValue( CCS_index, result_value ); @@ -1230,11 +1254,10 @@ namespace grb { const Monoid &monoid, const Phase &phase, const typename std::enable_if< - !grb::is_object< OutputType >::value && - !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - grb::is_monoid< Monoid >::value, - void + !is_object< OutputType >::value && + !is_object< InputType1 >::value && + !is_object< 
InputType2 >::value && + is_monoid< Monoid >::value >::type * const = nullptr ) { @@ -1298,7 +1321,7 @@ namespace grb { // end buffer retrieval // initialisations - internal::Coordinates< reference > coors1, coors2; + Coordinates< reference > coors1, coors2; coors1.set( arr1, false, buf1, n ); coors2.set( arr2, false, buf2, n ); if( !crs_only ) { @@ -1343,8 +1366,7 @@ namespace grb { // computational phase if( phase == EXECUTE ) { // retrieve additional buffer - config::NonzeroIndexType * const C_col_index = internal::template - getReferenceBuffer< typename config::NonzeroIndexType >( n + 1 ); + auto* const C_col_index = getReferenceBuffer< config::NonzeroIndexType >( n + 1 ); // perform column-wise nonzero count nzc = 0; @@ -1379,9 +1401,8 @@ namespace grb { const RC clear_rc = clear( C ); if( clear_rc != SUCCESS ) { return PANIC; - } else { - return FAILED; } + return FAILED; } // prefix sum for CCS_raw.col_start @@ -1403,30 +1424,78 @@ namespace grb { } } + // do computations + nzc = 0; - std::cerr << "HERE\n"; CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { coors1.clear(); coors2.clear(); +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp parallel default(none) \ + shared(coors1, vbuf1, coors2, vbuf2) \ + firstprivate(i, A_raw, identity_A, B_raw, identity_B ) +#endif + { +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + auto local_update1 = coors1.EMPTY_UPDATE(); + const size_t maxAsyncAssigns1 = coors1.maxAsyncAssigns(); + size_t assigns1 = 0; + #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait +#endif for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + if( !coors1.asyncAssign( k_col, local_update1 ) ) { + assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); + if( ++assigns1 == maxAsyncAssigns1 ) { + coors1.joinUpdate( local_update1 ); + assigns1 = 0; + } + } +#else if( !coors1.assign( k_col ) ) { assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); } +#endif } +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + while( !coors1.joinUpdate( local_update1 )) {} +#endif + + +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + auto local_update2 = coors2.EMPTY_UPDATE(); + const size_t maxAsyncAssigns2 = coors2.maxAsyncAssigns(); + size_t assigns2 = 0; + #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait +#endif for( size_t k = B_raw.col_start[ i ]; k < B_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = B_raw.row_index[ k ]; +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + if( !coors2.asyncAssign( k_col, local_update2 ) ) { + assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); + if( ++assigns2 == maxAsyncAssigns2 ) { + coors2.joinUpdate( local_update2 ); + assigns2 = 0; + } + } +#else if( !coors2.assign( k_col ) ) { assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); } +#endif } +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + while( !coors2.joinUpdate( local_update2 )) {} +#endif + } for( size_t k = 0; k < coors1.nonzeroes(); ++k ) { const auto j = coors1.index( k ); From ef342a8af6f7d557e8f834b93535a09f2fa2ecd0 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Wed, 24 Jan 2024 14:34:05 +0100 Subject: [PATCH 31/37] Fix potential bug when (A == B) is true. 
Thanks to @aleksamilisavljevic for noticing --- include/graphblas/reference/blas3.hpp | 46 +++++++++++++++++++-------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 6d8406e5e..a113ec91a 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1001,6 +1001,13 @@ namespace grb { return MISMATCH; } + if( getID(A) == getID(C) || getID(B) == getID(C) ) { +#ifdef _DEBUG + std::cerr << "grb::eWiseApply: The output matrix can not simultaneously be " + << "one of the input matrices\n"; +#endif + } + const auto &A_raw = !trans_left ? internal::getCRS( A ) : internal::getCCS( A ); @@ -1019,9 +1026,11 @@ namespace grb { internal::getMatrixBuffers( arr3, buf3, valbuf, 1, C ); // end buffer retrieval - // initialisations + // initialisations of the coordinates Coordinates coors1; coors1.set( arr1, false, buf1, n ); + // end initialisations of the coordinates + if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 #pragma omp parallel for simd default(none) \ @@ -1279,6 +1288,13 @@ namespace grb { return ILLEGAL; } + if( getID(A) == getID(C) || getID(B) == getID(C) ) { +#ifdef _DEBUG + std::cerr << "grb::eWiseApply: The output matrix can not simultaneously be " + << "one of the input matrices\n"; +#endif + } + // run-time checks const size_t m = nrows( C ); const size_t n = ncols( C ); @@ -1310,20 +1326,22 @@ namespace grb { // retrieve buffers - char * arr1, * arr2, * arr3, * buf1, * buf2, * buf3; - arr1 = arr2 = buf1 = buf2 = nullptr; + char *arr1 = nullptr, *arr3 = nullptr; + char *buf1 = nullptr, *buf3 = nullptr; InputType1 * vbuf1 = nullptr; - InputType2 * vbuf2 = nullptr; - OutputType * valbuf = nullptr; + OutputType * vbuf3 = nullptr; internal::getMatrixBuffers( arr1, buf1, vbuf1, 1, A ); - internal::getMatrixBuffers( arr2, buf2, vbuf2, 1, B ); - internal::getMatrixBuffers( arr3, buf3, valbuf, 1, C ); + internal::getMatrixBuffers( arr3, buf3, vbuf3, 1, C ); // end buffer retrieval - // initialisations + // initialisations of the coordinates + // Note: By using the buffer of the output matrix C, we can + // allow A and B to be the same matrix (with the same buffer) Coordinates< reference > coors1, coors2; coors1.set( arr1, false, buf1, n ); - coors2.set( arr2, false, buf2, n ); + coors2.set( arr3, false, buf3, n ); + // end initialisations of the coordinates + if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 #pragma omp parallel for simd default(none) shared(CCS_raw) firstprivate(n) @@ -1435,7 +1453,7 @@ namespace grb { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 #pragma omp parallel default(none) \ - shared(coors1, vbuf1, coors2, vbuf2) \ + shared(coors1, vbuf1, coors2, vbuf3) \ firstprivate(i, A_raw, identity_A, B_raw, identity_B ) #endif { @@ -1480,7 +1498,7 @@ namespace grb { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 if( !coors2.asyncAssign( k_col, local_update2 ) ) { - assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); + assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); if( ++assigns2 == maxAsyncAssigns2 ) { coors2.joinUpdate( local_update2 ); assigns2 = 0; @@ -1488,7 +1506,7 @@ namespace grb { } #else if( !coors2.assign( k_col ) ) { - assignValue( vbuf2, k_col, B_raw.getValue( k, identity_B ) ); + assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); } #endif } @@ -1500,7 +1518,7 @@ namespace grb { for( size_t k = 0; k < coors1.nonzeroes(); ++k ) { const auto j = coors1.index( k ); const auto A_val = getValue(vbuf1, j, identity_A); - 
const auto B_val = coors2.assigned(j) ? getValue(vbuf2, j, identity_B) : identity_B; + const auto B_val = coors2.assigned(j) ? getValue(vbuf3, j, identity_B) : identity_B; OutputType result_value; (void)grb::apply( result_value, A_val, B_val, oper ); @@ -1529,7 +1547,7 @@ namespace grb { continue; } const auto A_val = coors1.assigned(j) ? getValue(vbuf1, j, identity_A) : identity_A; - const auto B_val = getValue(vbuf2, j, identity_B); + const auto B_val = getValue(vbuf3, j, identity_B); OutputType result_value; (void)grb::apply( result_value, A_val, B_val, oper ); From 2d63fdfb3547d86ad415c6d1bebc28a50ef96b78 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Wed, 24 Jan 2024 14:54:34 +0100 Subject: [PATCH 32/37] Increase test size in CI --- tests/unit/unittests.sh | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/unit/unittests.sh b/tests/unit/unittests.sh index 5963dc16c..228b40c0b 100755 --- a/tests/unit/unittests.sh +++ b/tests/unit/unittests.sh @@ -554,10 +554,18 @@ for MODE in ${MODES}; do echo " " echo ">>> [x] [ ] Testing grb::eWiseApply (matrices, Monoid / Operator)" - $runner ${TEST_BIN_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND} &> ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND}_${P}_${T}.log - head -1 ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND}_${P}_${T}.log - grep 'Test OK' ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" - echo " " + echo " using small matrices (100x100)" + $runner ${TEST_BIN_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND} 100 &> ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_small_${MODE}_${BACKEND}_${P}_${T}.log + head -1 ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_small_${MODE}_${BACKEND}_${P}_${T}.log + grep 'Test OK' ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_small_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" + echo " " + + echo ">>> [x] [ ] Testing grb::eWiseApply (matrices, Monoid / Operator)" + echo " using large matrices (100'000x100'000)" + $runner ${TEST_BIN_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND} 100000 &> ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_large_${MODE}_${BACKEND}_${P}_${T}.log + head -1 ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_large_${MODE}_${BACKEND}_${P}_${T}.log + grep 'Test OK' ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_large_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" + echo " " echo ">>> [x] [ ] Testing grb::zip on two vectors of doubles and" echo " ints of size 10 000 000." 
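
A short recap of the semantics these unit tests check: the operator variant of
grb::eWiseApply( C, A, B, op ) only produces entries whose coordinates hold a
nonzero in both A and B (intersection), whereas the monoid variant also
produces entries present in only one of the inputs, substituting the monoid
identity for the missing operand (union). The sketch below illustrates the
difference. It is illustrative only: the function name, the values 1 and 3,
and the plus monoid mirror the unit test, and it assumes n >= 2, the
top-level graphblas.hpp header, and execution through grb::Launcher as in the
other unit tests.

#include <cassert>
#include <cstddef>

#include <graphblas.hpp>

// Minimal sketch: A holds the value 1 at (0,0) only, B holds 3 at (0,1) only.
void union_vs_intersection( const size_t &n, grb::RC &rc ) {
	grb::Matrix< int > A( n, n, 1 ), B( n, n, 1 );
	grb::Matrix< int > C_op( n, n, 2 ), C_mon( n, n, 2 );
	const size_t rows[ 1 ] = { 0 }, colsA[ 1 ] = { 0 }, colsB[ 1 ] = { 1 };
	const int valsA[ 1 ] = { 1 }, valsB[ 1 ] = { 3 };
	rc = grb::buildMatrixUnique( A, rows, colsA, valsA, 1, grb::SEQUENTIAL );
	if( rc == grb::SUCCESS ) {
		rc = grb::buildMatrixUnique( B, rows, colsB, valsB, 1, grb::SEQUENTIAL );
	}
	const grb::Monoid<
		grb::operators::add< int >, grb::identities::zero
	> plus;
	// operator variant: only entries present in both A and B are produced,
	// so C_op stays empty
	if( rc == grb::SUCCESS ) {
		rc = grb::eWiseApply( C_op, A, B, plus.getOperator(), grb::Phase::RESIZE );
	}
	if( rc == grb::SUCCESS ) {
		rc = grb::eWiseApply( C_op, A, B, plus.getOperator(), grb::Phase::EXECUTE );
	}
	assert( rc != grb::SUCCESS || grb::nnz( C_op ) == 0 );
	// monoid variant: the union of both patterns is produced, with the
	// identity substituted for missing values: 1+0 at (0,0) and 0+3 at (0,1)
	if( rc == grb::SUCCESS ) {
		rc = grb::eWiseApply( C_mon, A, B, plus, grb::Phase::RESIZE );
	}
	if( rc == grb::SUCCESS ) {
		rc = grb::eWiseApply( C_mon, A, B, plus, grb::Phase::EXECUTE );
	}
	assert( rc != grb::SUCCESS || grb::nnz( C_mon ) == 2 );
}
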
From 098481f9838fcdee0e7adc312ba7fc4c5ef81388 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Fri, 2 Feb 2024 00:03:35 +0100 Subject: [PATCH 33/37] Minor fix from the merge --- tests/unit/eWiseApply_matrix.cpp | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/tests/unit/eWiseApply_matrix.cpp b/tests/unit/eWiseApply_matrix.cpp index 4fac4fa8d..01f92e9bf 100644 --- a/tests/unit/eWiseApply_matrix.cpp +++ b/tests/unit/eWiseApply_matrix.cpp @@ -42,14 +42,13 @@ void grb_program( const int &, grb::RC &rc ) { << "mixed-domain matrix check\n"; return; } - for( const auto &triple : C ) { - const auto &i = triple.first.first; - const auto &j = triple.first.second; - const auto &v = triple.second; - if( j != i+n ) { - std::cout << "Unexpected entry at position ( " << i << ", " << i+n << " ) " - << "-- only expected entries on the n-th diagonal\n"; + const size_t &i = triple.first.first; + const size_t &j = triple.first.second; + const size_t &v = triple.second; + if( i != j ) { + std::cout << "Unexpected entry at position ( " << i << ", " << j << " ) " + << "-- only expected entries on the diagonal\n"; rc = FAILED; } if( v != 4 ) { @@ -68,14 +67,15 @@ void grb_program( const int &, grb::RC &rc ) { int main( int argc, char ** argv ) { // defaults - size_t input = 1000; // unused + bool printUsage = false; + int input = 0; // unused // error checking if( argc > 1 ) { - input = std::strtoul( argv[ 1 ], nullptr, 10 ); + printUsage = true; } - if( argc > 2 ) { - std::cerr << "Usage: " << argv[ 0 ] << "[n]\n"; + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; return 1; } @@ -94,4 +94,3 @@ int main( int argc, char ** argv ) { return 0; } } - From 40e62c2280806664230756036f87983c60e217a5 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Fri, 2 Feb 2024 16:46:34 +0100 Subject: [PATCH 34/37] Minor changes --- include/graphblas/reference/blas3.hpp | 92 +++++++++++++++++++-------- tests/unit/unittests.sh | 4 +- 2 files changed, 67 insertions(+), 29 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index a113ec91a..445715873 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1358,18 +1358,60 @@ namespace grb { // symbolic phase if( phase == RESIZE ) { nzc = 0; - for( size_t i = 0; i < m; ++i ) { - coors1.clear(); - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; - if( !coors1.assign( k_col ) ) { - (void) ++nzc; + + +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp parallel default(none) \ + shared(coors1, vbuf1, coors2, vbuf3) \ + firstprivate(A_raw, identity_A, B_raw, identity_B, m) \ + reduction(+:nzc) +#endif + { + for( size_t i = 0; i < m; ++i ) { +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp single +#endif + { + coors1.clear(); } - } - for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { - const size_t l_col = B_raw.row_index[ l ]; - if( !coors1.assigned( l_col ) ) { - (void) ++nzc; +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp barrier +#endif +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + auto local_update1 = coors1.EMPTY_UPDATE(); + const size_t maxAsyncAssigns1 = coors1.maxAsyncAssigns(); + size_t assigns1 = 0; + #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait +#endif + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const size_t k_col = A_raw.row_index[ k ]; +#ifdef 
_H_GRB_REFERENCE_OMP_BLAS3 + if( !coors1.asyncAssign( k_col, local_update1 ) ) { + assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); + if( ++assigns1 == maxAsyncAssigns1 ) { + coors1.joinUpdate( local_update1 ); + assigns1 = 0; + } + } +#else + (void)coors1.assign( k_col ); +#endif + (void)++nzc; + } +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + while( !coors1.joinUpdate( local_update1 ) ) {} +#endif + +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp barrier + + #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) +#endif + for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { + const size_t l_col = B_raw.row_index[ l ]; + if( !coors1.assigned( l_col ) ) { + (void)++nzc; + } } } } @@ -1392,9 +1434,9 @@ namespace grb { coors1.clear(); for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; - if( !coors1.assign( k_col ) ) { - (void) ++nzc; - } + (void)coors1.assign( k_col ); + (void)++nzc; + if( !crs_only ) { (void) ++CCS_raw.col_start[ k_col + 1 ]; } @@ -1402,9 +1444,9 @@ namespace grb { for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { const size_t l_col = B_raw.row_index[ l ]; if( !coors1.assigned( l_col ) ) { - (void) ++nzc; + (void)++nzc; if( !crs_only ) { - (void) ++CCS_raw.col_start[ l_col + 1 ]; + (void)++CCS_raw.col_start[ l_col + 1 ]; } } } @@ -1425,11 +1467,13 @@ namespace grb { // prefix sum for CCS_raw.col_start if( !crs_only ) { - assert( CCS_raw.col_start[ 0 ] == 0 ); for( size_t j = 1; j < n; ++j ) { CCS_raw.col_start[ j + 1 ] += CCS_raw.col_start[ j ]; } +#ifndef NDEBUG + assert( CCS_raw.col_start[ 0 ] == 0 ); assert( CCS_raw.col_start[ n ] == nzc ); +#endif } // set C_col_index to all zero @@ -1442,9 +1486,7 @@ namespace grb { } } - // do computations - nzc = 0; CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { @@ -1457,7 +1499,6 @@ namespace grb { firstprivate(i, A_raw, identity_A, B_raw, identity_B ) #endif { - #ifdef _H_GRB_REFERENCE_OMP_BLAS3 auto local_update1 = coors1.EMPTY_UPDATE(); const size_t maxAsyncAssigns1 = coors1.maxAsyncAssigns(); @@ -1469,17 +1510,15 @@ namespace grb { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 if( !coors1.asyncAssign( k_col, local_update1 ) ) { - assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); if( ++assigns1 == maxAsyncAssigns1 ) { coors1.joinUpdate( local_update1 ); assigns1 = 0; } } #else - if( !coors1.assign( k_col ) ) { - assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); - } + (void)coors1.assign( k_col ); #endif + assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); } #ifdef _H_GRB_REFERENCE_OMP_BLAS3 @@ -1505,10 +1544,9 @@ namespace grb { } } #else - if( !coors2.assign( k_col ) ) { - assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); - } + (void)coors2.assign( k_col ); #endif + assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); } #ifdef _H_GRB_REFERENCE_OMP_BLAS3 while( !coors2.joinUpdate( local_update2 )) {} diff --git a/tests/unit/unittests.sh b/tests/unit/unittests.sh index a578e00f5..3b43c7872 100755 --- a/tests/unit/unittests.sh +++ b/tests/unit/unittests.sh @@ -605,8 +605,8 @@ for MODE in ${MODES}; do echo " " echo ">>> [x] [ ] Testing grb::eWiseApply (matrices, Monoid / Operator)" - echo " using large matrices (100'000x100'000)" - $runner ${TEST_BIN_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND} 100000 &> ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_large_${MODE}_${BACKEND}_${P}_${T}.log + echo " using large matrices (10'000x10'000)" 
+ $runner ${TEST_BIN_DIR}/eWiseApplyMatrix_variants_${MODE}_${BACKEND} 10000 &> ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_large_${MODE}_${BACKEND}_${P}_${T}.log head -1 ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_large_${MODE}_${BACKEND}_${P}_${T}.log grep 'Test OK' ${TEST_OUT_DIR}/eWiseApplyMatrix_variants_large_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" echo " " From b2965d939f2a2145bbd95b88196175e7fe144eb1 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Sun, 25 Feb 2024 14:51:30 +0100 Subject: [PATCH 35/37] Review fixes --- include/graphblas/bsp1d/blas3.hpp | 67 +++++++++++++++------ include/graphblas/nonblocking/blas3.hpp | 32 ++++++++-- include/graphblas/reference/blas3.hpp | 79 +++++++++++-------------- include/graphblas/utils.hpp | 18 ++++++ 4 files changed, 128 insertions(+), 68 deletions(-) diff --git a/include/graphblas/bsp1d/blas3.hpp b/include/graphblas/bsp1d/blas3.hpp index 0ddbdf41b..25f5902ab 100644 --- a/include/graphblas/bsp1d/blas3.hpp +++ b/include/graphblas/bsp1d/blas3.hpp @@ -140,14 +140,31 @@ namespace grb { >::type * const = nullptr ) { assert( phase != TRY ); - RC ret = eWiseApply< descr >( - internal::getLocal( C ), - internal::getLocal( A ), - internal::getLocal( B ), - mul, - phase - ); - return internal::checkGlobalErrorStateOrClear( C, ret ); + RC local_rc = SUCCESS; + if( phase == RESIZE ) { + RC ret = eWiseApply< descr >( + internal::getLocal( C ), + internal::getLocal( A ), + internal::getLocal( B ), + mul, + RESIZE + ); + if( collectives<>::allreduce( ret, operators::any_or< RC >() ) != SUCCESS ) { + return PANIC; + } else { + return ret; + } + } else { + assert( phase == EXECUTE ); + local_rc = eWiseApply< descr >( + internal::getLocal( C ), + internal::getLocal( A ), + internal::getLocal( B ), + mul, + EXECUTE + ); + } + return internal::checkGlobalErrorStateOrClear( C, local_rc ); } /** \internal Simply delegates to process-local backend */ @@ -173,15 +190,31 @@ namespace grb { >::type * const = nullptr ) { assert( phase != TRY ); - RC ret = eWiseApply< descr >( - internal::getLocal( C ), - internal::getLocal( A ), - internal::getLocal( B ), - op, - phase - ); - return internal::checkGlobalErrorStateOrClear( C, ret ); - } + RC local_rc = SUCCESS; + if( phase == RESIZE ) { + RC ret = eWiseApply< descr >( + internal::getLocal( C ), + internal::getLocal( A ), + internal::getLocal( B ), + op, + RESIZE + ); + if( collectives<>::allreduce( ret, operators::any_or< RC >() ) != SUCCESS ) { + return PANIC; + } else { + return ret; + } + } else { + assert( phase == EXECUTE ); + local_rc = eWiseApply< descr >( + internal::getLocal( C ), + internal::getLocal( A ), + internal::getLocal( B ), + op, + EXECUTE + ); + } + return internal::checkGlobalErrorStateOrClear( C, local_rc ); } // namespace grb diff --git a/include/graphblas/nonblocking/blas3.hpp b/include/graphblas/nonblocking/blas3.hpp index aaebe569c..50c640ae0 100644 --- a/include/graphblas/nonblocking/blas3.hpp +++ b/include/graphblas/nonblocking/blas3.hpp @@ -472,10 +472,10 @@ namespace grb { // second, delegate to the reference backend return eWiseApply< descr >( - internal::getRefMatrix( C ), - internal::getRefMatrix( A ), + internal::getRefMatrix( C ), + internal::getRefMatrix( A ), internal::getRefMatrix( B ), - mulmono, + mulmono, phase ); } @@ -511,16 +511,36 @@ namespace grb { std::cout << "In grb::eWiseApply (nonblocking, op)\n"; #endif + // static checks + NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || + std::is_same< typename Operator::D1, InputType1 >::value ), + 
"grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator)", + "called with a prefactor input matrix A that does not match the first " + "domain of the given multiplication operator" + ); + NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || + std::is_same< typename Operator::D2, InputType2 >::value ), + "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator)", + "called with a postfactor input matrix B that does not match the first " + "domain of the given multiplication operator" + ); + NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || + std::is_same< typename Operator::D3, OutputType >::value ), + "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator)", + "called with an output matrix C that does not match the output domain " + "of the given multiplication operator" + ); + // nonblocking execution is not supported // first, execute any computation that is not completed internal::le.execution(); // second, delegate to the reference backend return eWiseApply< descr >( - internal::getRefMatrix( C ), - internal::getRefMatrix( A ), + internal::getRefMatrix( C ), + internal::getRefMatrix( A ), internal::getRefMatrix( B ), - mulOp, + mulOp, phase ); } diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 445715873..70d7a3642 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -57,32 +57,6 @@ "********************************************************************" \ "******************************\n" ); -#ifndef _H_GRB_REFERENCE_BLAS3_ACCESSORS -#define _H_GRB_REFERENCE_BLAS3_ACCESSORS - -namespace grb::internal -{ - template< typename D, typename T > - static void assignValue( - D *array, size_t i, const T& value, - typename std::enable_if< !std::is_void< D >::value >::type * const = nullptr - ) { array[i] = value; } - - template< typename T > - static void assignValue( void *, size_t, const T& ) { /* do nothing */ } - - template< typename D, typename T > - static T getValue( - const D *array, size_t i, const T&, - typename std::enable_if< !std::is_void< D >::value >::type * const = nullptr - ) { return array[i]; } - - template< typename T > - static T getValue( const void *, size_t, const T& identity ) { return identity; } - -} // namespace grb::internal - -#endif // _H_GRB_REFERENCE_BLAS3_ACCESSORS namespace grb { @@ -1077,11 +1051,18 @@ namespace grb { if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel for simd default(none) \ - shared(C_col_index) firstprivate(n) +#pragma omp parallel #endif - for( size_t j = 0; j < n; ++j ) { - C_col_index[ j ] = 0; + { + size_t start = 0; + size_t end = n + 1; +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + config::OMP::localRange( start, end, 0, n + 1 ); + #pragma omp parallel for simd +#endif + for( size_t j = start; j < end; ++j ) { + C_col_index[ j ] = 0; + } } } @@ -1149,7 +1130,7 @@ namespace grb { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 if( !coors1.asyncAssign( k_col, local_update ) ) { - assignValue( valbuf, k_col , A_raw.getValue( k, dummy_identity ) ); + utils::assignValue( valbuf, k_col , A_raw.getValue( k, dummy_identity ) ); if( ++assigns == maxAsyncAssigns ) { coors1.joinUpdate( local_update ); assigns = 0; @@ -1157,7 +1138,7 @@ namespace grb { } #else if( !coors1.assign( k_col ) ) { - assignValue( valbuf, k_col, A_raw.getValue( k, dummy_identity ) ); + utils::assignValue( valbuf, k_col, A_raw.getValue( k, dummy_identity ) ); } #endif } @@ -1387,7 +1368,7 @@ namespace grb { const size_t k_col = 
A_raw.row_index[ k ]; #ifdef _H_GRB_REFERENCE_OMP_BLAS3 if( !coors1.asyncAssign( k_col, local_update1 ) ) { - assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); + utils::assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); if( ++assigns1 == maxAsyncAssigns1 ) { coors1.joinUpdate( local_update1 ); assigns1 = 0; @@ -1518,7 +1499,7 @@ namespace grb { #else (void)coors1.assign( k_col ); #endif - assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); + utils::assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); } #ifdef _H_GRB_REFERENCE_OMP_BLAS3 @@ -1537,7 +1518,7 @@ namespace grb { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 if( !coors2.asyncAssign( k_col, local_update2 ) ) { - assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); + utils::assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); if( ++assigns2 == maxAsyncAssigns2 ) { coors2.joinUpdate( local_update2 ); assigns2 = 0; @@ -1546,7 +1527,7 @@ namespace grb { #else (void)coors2.assign( k_col ); #endif - assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); + utils::assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); } #ifdef _H_GRB_REFERENCE_OMP_BLAS3 while( !coors2.joinUpdate( local_update2 )) {} @@ -1555,8 +1536,8 @@ namespace grb { for( size_t k = 0; k < coors1.nonzeroes(); ++k ) { const auto j = coors1.index( k ); - const auto A_val = getValue(vbuf1, j, identity_A); - const auto B_val = coors2.assigned(j) ? getValue(vbuf3, j, identity_B) : identity_B; + const auto A_val = utils::getValue(vbuf1, j, identity_A); + const auto B_val = coors2.assigned(j) ? utils::getValue(vbuf3, j, identity_B) : identity_B; OutputType result_value; (void)grb::apply( result_value, A_val, B_val, oper ); @@ -1584,8 +1565,8 @@ namespace grb { if( coors1.assigned(j) ) { // Intersection case: already handled continue; } - const auto A_val = coors1.assigned(j) ? getValue(vbuf1, j, identity_A) : identity_A; - const auto B_val = getValue(vbuf3, j, identity_B); + const auto A_val = coors1.assigned(j) ? 
utils::getValue(vbuf1, j, identity_A) : identity_A; + const auto B_val = utils::getValue(vbuf3, j, identity_B); OutputType result_value; (void)grb::apply( result_value, A_val, B_val, oper ); @@ -1753,8 +1734,14 @@ namespace grb { void >::type * const = nullptr ) { - typedef typename std::conditional::value, typename Operator::D1, InputType1>::type ActualInputType1; - typedef typename std::conditional::value, typename Operator::D2, InputType1>::type ActualInputType2; + typedef typename std::conditional< + std::is_void::value, + typename Operator::D1, + InputType1>::type ActualInputType1; + typedef typename std::conditional< + std::is_void::value, + typename Operator::D2, + InputType1>::type ActualInputType2; // static checks NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || std::is_same< typename Operator::D1, ActualInputType1 >::value ), @@ -1775,9 +1762,11 @@ namespace grb { "of the given multiplication operator" ); static_assert( - !std::is_void< OutputType >::value, - "grb::eWiseApply: the elementwise mxm cannot be used if the" - " output matrix is a pattern matrix (of type void)" + !std::is_void< OutputType >::value || + ( std::is_void< InputType1 >::value && std::is_void< InputType2 >::value ), + "grb::eWiseApply: the elementwise mxm only support" + " output pattern-matrix (of type void) if both" + " input matrices are also pattern matrices" ); #ifdef _DEBUG std::cout << "In grb::eWiseApply( reference, operator )\n"; diff --git a/include/graphblas/utils.hpp b/include/graphblas/utils.hpp index c5239afdc..a82aa3125 100644 --- a/include/graphblas/utils.hpp +++ b/include/graphblas/utils.hpp @@ -54,6 +54,24 @@ namespace grb { */ namespace utils { + template< typename D, typename T > + static void assignValue( + D *array, size_t i, const T& value, + typename std::enable_if< !std::is_void< D >::value >::type * const = nullptr + ) { array[i] = value; } + + template< typename T > + static void assignValue( void *, size_t, const T& ) { /* do nothing */ } + + template< typename D, typename T > + static T getValue( + const D *array, size_t i, const T&, + typename std::enable_if< !std::is_void< D >::value >::type * const = nullptr + ) { return array[i]; } + + template< typename T > + static T getValue( const void *, size_t, const T& identity ) { return identity; } + /** * Checks whether two values are equal. 
* From 35f3a9b202afb5f377c0891a216a55310f45ed60 Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Mon, 26 Feb 2024 11:14:07 +0100 Subject: [PATCH 36/37] Coordinates assign bugfix --- include/graphblas/reference/blas3.hpp | 292 +++++--------------------- 1 file changed, 49 insertions(+), 243 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 70d7a3642..4167c076e 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1007,8 +1007,7 @@ namespace grb { if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel for simd default(none) \ - shared(CCS_raw) firstprivate(n) + #pragma omp parallel for simd #endif for( size_t j = 0; j <= n; ++j ) { CCS_raw.col_start[ j ] = 0; @@ -1047,22 +1046,15 @@ namespace grb { if( phase == EXECUTE ) { nzc = 0; // retrieve additional buffer - auto* const C_col_index = getReferenceBuffer< config::NonzeroIndexType >( n + 1 ); + config::NonzeroIndexType * const C_col_index = + getReferenceBuffer< config::NonzeroIndexType >( n + 1 ); if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 -#pragma omp parallel -#endif - { - size_t start = 0; - size_t end = n + 1; -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - config::OMP::localRange( start, end, 0, n + 1 ); - #pragma omp parallel for simd + #pragma omp parallel for simd #endif - for( size_t j = start; j < end; ++j ) { - C_col_index[ j ] = 0; - } + for( size_t j = 0; j < n+1; ++j ) { + C_col_index[ j ] = 0; } } @@ -1112,50 +1104,22 @@ namespace grb { CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { coors1.clear(); - -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel default(none) \ - shared(coors1, valbuf) \ - firstprivate(i, A_raw, dummy_identity) -#endif - { -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - auto local_update = coors1.EMPTY_UPDATE(); - const size_t maxAsyncAssigns = coors1.maxAsyncAssigns(); - size_t assigns = 0; - #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait -#endif - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; - -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - if( !coors1.asyncAssign( k_col, local_update ) ) { - utils::assignValue( valbuf, k_col , A_raw.getValue( k, dummy_identity ) ); - if( ++assigns == maxAsyncAssigns ) { - coors1.joinUpdate( local_update ); - assigns = 0; - } - } -#else - if( !coors1.assign( k_col ) ) { - utils::assignValue( valbuf, k_col, A_raw.getValue( k, dummy_identity ) ); - } -#endif + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const auto k_col = A_raw.row_index[ k ]; + if( !coors1.assign( k_col ) ) { + utils::assignValue( valbuf, k_col, A_raw.getValue( k, dummy_identity ) ); } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - while( !coors1.joinUpdate( local_update ) ) {} -#endif } for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { - const size_t j = B_raw.row_index[ l ]; + const auto j = B_raw.row_index[ l ]; if( !coors1.assigned( j ) ) { // Union case: ignored continue; } const auto valbuf_value_before = valbuf[ j ]; OutputType result_value; - (void)grb::apply( result_value, valbuf_value_before, B_raw.getValue( l, dummy_identity ), oper ); + (void) grb::apply( result_value, valbuf_value_before, B_raw.getValue( l, dummy_identity ), oper ); // update CRS CRS_raw.row_index[ nzc ] = j; @@ -1170,51 +1134,17 @@ namespace grb { } // update count - (void)++nzc; + (void) ++nzc; } CRS_raw.col_start[ i + 1 ] = nzc; - - } - 
- -#ifdef _DEBUG - std::cout << "CRS_raw.col_start = [ "; - for( size_t j = 0; j <= m; ++j ) - std::cout << CRS_raw.col_start[ j ] << " "; - std::cout << "]\n"; - std::cout << "CRS_raw.row_index = [ "; - for( size_t j = 0; j < nzc; ++j ) - std::cout << CRS_raw.row_index[ j ] << " "; - std::cout << "]\n"; - std::cout << "CRS_raw.values = [ "; - for( size_t j = 0; j < nzc; ++j ) - std::cout << CRS_raw.values[ j ] << " "; - std::cout << "]\n"; - if( !crs_only ) { - std::cout << "C_col_index = [ "; - for( size_t j = 0; j < n; ++j ) - std::cout << C_col_index[ j ] << " "; - std::cout << "]\n"; - std::cout << "CCS_raw.col_start = [ "; - for( size_t j = 0; j <= n; ++j ) - std::cout << CCS_raw.col_start[ j ] << " "; - std::cout << "]\n"; - std::cout << "CCS_raw.row_index = [ "; - for( size_t j = 0; j < nzc; ++j ) - std::cout << CCS_raw.row_index[ j ] << " "; - std::cout << "]\n"; - std::cout << "CCS_raw.values = [ "; - for( size_t j = 0; j < nzc; ++j ) - std::cout << CCS_raw.values[ j ] << " "; - std::cout << "]\n"; } -#endif #ifndef NDEBUG if( !crs_only ) { - for( size_t j = 0; j < n; ++j ) + for( size_t j = 0; j < n; ++j ) { assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); + } } #endif @@ -1305,7 +1235,6 @@ namespace grb { auto &CRS_raw = internal::getCRS( C ); auto &CCS_raw = internal::getCCS( C ); - // retrieve buffers char *arr1 = nullptr, *arr3 = nullptr; char *buf1 = nullptr, *buf3 = nullptr; @@ -1325,7 +1254,7 @@ namespace grb { if( !crs_only ) { #ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel for simd default(none) shared(CCS_raw) firstprivate(n) + #pragma omp parallel for simd #endif for( size_t j = 0; j < n + 1; ++j ) { CCS_raw.col_start[ j ] = 0; @@ -1339,60 +1268,18 @@ namespace grb { // symbolic phase if( phase == RESIZE ) { nzc = 0; - - -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel default(none) \ - shared(coors1, vbuf1, coors2, vbuf3) \ - firstprivate(A_raw, identity_A, B_raw, identity_B, m) \ - reduction(+:nzc) -#endif - { - for( size_t i = 0; i < m; ++i ) { -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp single -#endif - { - coors1.clear(); - } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp barrier -#endif -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - auto local_update1 = coors1.EMPTY_UPDATE(); - const size_t maxAsyncAssigns1 = coors1.maxAsyncAssigns(); - size_t assigns1 = 0; - #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait -#endif - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - if( !coors1.asyncAssign( k_col, local_update1 ) ) { - utils::assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); - if( ++assigns1 == maxAsyncAssigns1 ) { - coors1.joinUpdate( local_update1 ); - assigns1 = 0; - } - } -#else - (void)coors1.assign( k_col ); -#endif - (void)++nzc; + for( size_t i = 0; i < m; ++i ) { + coors1.clear(); + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const auto k_col = A_raw.row_index[ k ]; + if( !coors1.assign( k_col ) ) { + (void) ++nzc; } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - while( !coors1.joinUpdate( local_update1 ) ) {} -#endif - -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp barrier - - #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) -#endif - for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { - const size_t l_col = B_raw.row_index[ l ]; - if( !coors1.assigned( l_col ) ) { - 
(void)++nzc; - } + } + for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { + const size_t l_col = B_raw.row_index[ l ]; + if( !coors1.assigned( l_col ) ) { + (void) ++nzc; } } } @@ -1407,7 +1294,8 @@ namespace grb { // computational phase if( phase == EXECUTE ) { // retrieve additional buffer - auto* const C_col_index = getReferenceBuffer< config::NonzeroIndexType >( n + 1 ); + config::NonzeroIndexType * const C_col_index = + getReferenceBuffer< config::NonzeroIndexType >( n + 1 ); // perform column-wise nonzero count nzc = 0; @@ -1415,8 +1303,8 @@ namespace grb { coors1.clear(); for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { const size_t k_col = A_raw.row_index[ k ]; - (void)coors1.assign( k_col ); - (void)++nzc; + (void) coors1.assign( k_col ); + (void) ++nzc; if( !crs_only ) { (void) ++CCS_raw.col_start[ k_col + 1 ]; @@ -1425,9 +1313,9 @@ namespace grb { for( size_t l = B_raw.col_start[ i ]; l < B_raw.col_start[ i + 1 ]; ++l ) { const size_t l_col = B_raw.row_index[ l ]; if( !coors1.assigned( l_col ) ) { - (void)++nzc; + (void) ++nzc; if( !crs_only ) { - (void)++CCS_raw.col_start[ l_col + 1 ]; + (void) ++CCS_raw.col_start[ l_col + 1 ]; } } } @@ -1472,66 +1360,17 @@ namespace grb { CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { coors1.clear(); - coors2.clear(); - -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - #pragma omp parallel default(none) \ - shared(coors1, vbuf1, coors2, vbuf3) \ - firstprivate(i, A_raw, identity_A, B_raw, identity_B ) -#endif - { -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - auto local_update1 = coors1.EMPTY_UPDATE(); - const size_t maxAsyncAssigns1 = coors1.maxAsyncAssigns(); - size_t assigns1 = 0; - #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait -#endif - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = A_raw.row_index[ k ]; - -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - if( !coors1.asyncAssign( k_col, local_update1 ) ) { - if( ++assigns1 == maxAsyncAssigns1 ) { - coors1.joinUpdate( local_update1 ); - assigns1 = 0; - } - } -#else - (void)coors1.assign( k_col ); -#endif - utils::assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); - } - -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - while( !coors1.joinUpdate( local_update1 )) {} -#endif - - -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - auto local_update2 = coors2.EMPTY_UPDATE(); - const size_t maxAsyncAssigns2 = coors2.maxAsyncAssigns(); - size_t assigns2 = 0; - #pragma omp for simd schedule( dynamic, config::CACHE_LINE_SIZE::value() ) nowait -#endif - for( size_t k = B_raw.col_start[ i ]; k < B_raw.col_start[ i + 1 ]; ++k ) { - const size_t k_col = B_raw.row_index[ k ]; + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const auto k_col = A_raw.row_index[ k ]; + (void) coors1.assign( k_col ); + utils::assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); + } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - if( !coors2.asyncAssign( k_col, local_update2 ) ) { - utils::assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); - if( ++assigns2 == maxAsyncAssigns2 ) { - coors2.joinUpdate( local_update2 ); - assigns2 = 0; - } - } -#else - (void)coors2.assign( k_col ); -#endif - utils::assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); - } -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - while( !coors2.joinUpdate( local_update2 )) {} -#endif + coors2.clear(); + for( size_t k = B_raw.col_start[ i ]; k < B_raw.col_start[ i + 1 ]; ++k ) { + const auto k_col = B_raw.row_index[ k ]; 
+ (void) coors2.assign( k_col ); + utils::assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); } for( size_t k = 0; k < coors1.nonzeroes(); ++k ) { @@ -1540,7 +1379,7 @@ namespace grb { const auto B_val = coors2.assigned(j) ? utils::getValue(vbuf3, j, identity_B) : identity_B; OutputType result_value; - (void)grb::apply( result_value, A_val, B_val, oper ); + (void) grb::apply( result_value, A_val, B_val, oper ); // update CRS CRS_raw.row_index[ nzc ] = j; @@ -1558,7 +1397,7 @@ namespace grb { CCS_raw.setValue( CCS_index, result_value ); } // update count - (void)++nzc; + (void) ++nzc; } for( size_t k = 0; k < coors2.nonzeroes(); ++k ) { const auto j = coors2.index( k ); @@ -1569,7 +1408,7 @@ namespace grb { const auto B_val = utils::getValue(vbuf3, j, identity_B); OutputType result_value; - (void)grb::apply( result_value, A_val, B_val, oper ); + (void) grb::apply( result_value, A_val, B_val, oper ); // update CRS CRS_raw.row_index[ nzc ] = j; @@ -1587,52 +1426,19 @@ namespace grb { CCS_raw.setValue( CCS_index, result_value ); } // update count - (void)++nzc; + (void) ++nzc; } CRS_raw.col_start[ i + 1 ] = nzc; } - if( !crs_only ) { -#ifdef _DEBUG - std::cout << "CRS_raw.col_start = [ "; - for( size_t j = 0; j <= m; ++j ) - std::cout << CRS_raw.col_start[ j ] << " "; - std::cout << "]\n"; - std::cout << "CRS_raw.row_index = [ "; - for( size_t j = 0; j < nzc; ++j ) - std::cout << CRS_raw.row_index[ j ] << " "; - std::cout << "]\n"; - std::cout << "CRS_raw.values = [ "; - for( size_t j = 0; j < nzc; ++j ) - std::cout << CRS_raw.values[ j ] << " "; - std::cout << "]\n"; - if( !crs_only ) { - std::cout << "C_col_index = [ "; - for( size_t j = 0; j < n; ++j ) - std::cout << C_col_index[ j ] << " "; - std::cout << "]\n"; - std::cout << "CCS_raw.col_start = [ "; - for( size_t j = 0; j <= n; ++j ) - std::cout << CCS_raw.col_start[ j ] << " "; - std::cout << "]\n"; - std::cout << "CCS_raw.row_index = [ "; - for( size_t j = 0; j < nzc; ++j ) - std::cout << CCS_raw.row_index[ j ] << " "; - std::cout << "]\n"; - std::cout << "CCS_raw.values = [ "; - for( size_t j = 0; j < nzc; ++j ) - std::cout << CCS_raw.values[ j ] << " "; - std::cout << "]\n"; - } -#endif - #ifndef NDEBUG + if( !crs_only ) { for( size_t j = 0; j < n; ++j ) { assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] == C_col_index[ j ] ); } -#endif } +#endif // set final number of nonzeroes in output matrix #ifdef _DEBUG From 4eaaf9166e3f0563b340a5deaf4269474286c1cd Mon Sep 17 00:00:00 2001 From: Benjamin Lozes Date: Mon, 26 Feb 2024 11:18:06 +0100 Subject: [PATCH 37/37] Enabling little threadnig using two tasks --- include/graphblas/reference/blas3.hpp | 38 ++++++++++++++++++--------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 4167c076e..563241ca3 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -1359,18 +1359,32 @@ namespace grb { nzc = 0; CRS_raw.col_start[ 0 ] = 0; for( size_t i = 0; i < m; ++i ) { - coors1.clear(); - for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { - const auto k_col = A_raw.row_index[ k ]; - (void) coors1.assign( k_col ); - utils::assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); - } - - coors2.clear(); - for( size_t k = B_raw.col_start[ i ]; k < B_raw.col_start[ i + 1 ]; ++k ) { - const auto k_col = B_raw.row_index[ k ]; - (void) coors2.assign( k_col ); - utils::assignValue( vbuf3, k_col, B_raw.getValue( k, 
identity_B ) ); +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp parallel +#endif + { +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp single nowait +#endif + { + coors1.clear(); + for( size_t k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) { + const auto k_col = A_raw.row_index[ k ]; + (void) coors1.assign( k_col ); + utils::assignValue( vbuf1, k_col, A_raw.getValue( k, identity_A ) ); + } + } +#ifdef _H_GRB_REFERENCE_OMP_BLAS3 + #pragma omp single nowait +#endif + { + coors2.clear(); + for( size_t k = B_raw.col_start[ i ]; k < B_raw.col_start[ i + 1 ]; ++k ) { + const auto k_col = B_raw.row_index[ k ]; + (void) coors2.assign( k_col ); + utils::assignValue( vbuf3, k_col, B_raw.getValue( k, identity_B ) ); + } + } } for( size_t k = 0; k < coors1.nonzeroes(); ++k ) {