From b156ab4004eaf1154d95241c66203705f7d315c8 Mon Sep 17 00:00:00 2001 From: Jeff Eaton Date: Wed, 3 Mar 2021 17:42:19 +0000 Subject: [PATCH 1/6] explicitly reference tmbutils::array to avoid Eigen clash --- TMB/DESCRIPTION | 4 ++-- TMB/NEWS | 33 +++++++++++++++++++++++++++ TMB/inst/include/tmbutils/R_inla.hpp | 2 +- TMB/inst/include/tmbutils/density.hpp | 4 ++-- 4 files changed, 38 insertions(+), 5 deletions(-) diff --git a/TMB/DESCRIPTION b/TMB/DESCRIPTION index 5ccde6b48..fd35233cd 100644 --- a/TMB/DESCRIPTION +++ b/TMB/DESCRIPTION @@ -1,8 +1,8 @@ Package: TMB Type: Package Title: Template Model Builder: A General Random Effect Tool Inspired by 'ADMB' -Version: 1.7.19 -Date: 2021-02-05 +Version: 1.7.19-2 +Date: 2021-03-03 Authors@R: c(person("Kasper","Kristensen",role=c("aut","cre","cph"),email="kaskr@dtu.dk"), person("Brad","Bell",role="cph"), person("Hans","Skaug",role="ctb"), diff --git a/TMB/NEWS b/TMB/NEWS index 3614c7da9..75aa2aa0b 100644 --- a/TMB/NEWS +++ b/TMB/NEWS @@ -1,3 +1,36 @@ +------------------------------------------------------------------------ +TMB 1.7.19-2 (2021-03-03) +*** FORKED PATCH VERSION *** +------------------------------------------------------------------------ + +* Forked repository to work around namespace clash for 'array' between + Eigen:: and tmbutils:: when Eigen/Tensor headers are included. + + > TMB/include/tmbutils/density.hpp:106:3: error: + > reference to 'array' is ambiguous + +1. Explicitly reference tmbutils::array in tmbutils/density.hpp and + tmbutils/R_inla.hpp. + +2. Must include before including . + + If this is included before , compiler throws an + error (compiler Apple clang version 12.0.0): + + > unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h:122:8: error: + > explicit specialization of non-template struct 'Rf_eval' + + Not investigated the cause / solution for this. + +3. Add compiler flag -Wno-invalid-partial-specialization. + + This is an error in an earlier Eigen release that has been patched: + https://github.com/owlbarn/eigen/issues/2#issuecomment-353763166 + + Updating the Eigen headers included with TMB might resolve. + + + ------------------------------------------------------------------------ TMB 1.7.19 (2021-02-05) ------------------------------------------------------------------------ diff --git a/TMB/inst/include/tmbutils/R_inla.hpp b/TMB/inst/include/tmbutils/R_inla.hpp index 64c14dff9..4295e0d16 100644 --- a/TMB/inst/include/tmbutils/R_inla.hpp +++ b/TMB/inst/include/tmbutils/R_inla.hpp @@ -97,7 +97,7 @@ template // Calculate new SPDE matrices // Calculate G1 - pt. 1 - array Gtmp(n_tri,3,3); + tmbutils::array Gtmp(n_tri,3,3); for(i=0; i vectortype; \ typedef matrix matrixtype; \ -typedef array arraytype +typedef tmbutils::array arraytype #define VARIANCE_NOT_YET_IMPLEMENTED \ private: \ @@ -730,7 +730,7 @@ class contAR2_t{ scalartype operator()(vectortype x){ /* x.dim=[2,n] */ vector dim(2); dim << 2 , x.size()/2 ; - array y(x,dim); + tmbutils::array y(x,dim); y=y.transpose(); return this->operator()(y.col(0),y.col(1)); } From 0e2f3941b7155ac4b9ce8d973e545cce4337abb2 Mon Sep 17 00:00:00 2001 From: Jeff Eaton Date: Sat, 12 Feb 2022 18:44:23 +0000 Subject: [PATCH 2/6] Eigen: update inst/include/unsupported to Eigen 3.4.0 To bring the headers in inst/include/unsupported in line with inst/include/Eigen --- TMB/inst/include/unsupported/CMakeLists.txt | 14 +- .../include/unsupported/Eigen/AdolcForward | 5 +- .../include/unsupported/Eigen/AlignedVector3 | 12 +- .../include/unsupported/Eigen/ArpackSupport | 11 +- TMB/inst/include/unsupported/Eigen/AutoDiff | 6 + TMB/inst/include/unsupported/Eigen/BVH | 6 +- .../include/unsupported/Eigen/CXX11/Tensor | 79 +- .../unsupported/Eigen/CXX11/TensorSymmetry | 6 +- .../unsupported/Eigen/CXX11/ThreadPool | 21 +- .../Eigen/CXX11/src/Tensor/README.md | 420 ++- .../Eigen/CXX11/src/Tensor/Tensor.h | 37 +- .../Eigen/CXX11/src/Tensor/TensorArgMax.h | 78 +- .../Eigen/CXX11/src/Tensor/TensorAssign.h | 94 +- .../Eigen/CXX11/src/Tensor/TensorBase.h | 274 +- .../Eigen/CXX11/src/Tensor/TensorBlock.h | 1559 +++++++++ .../CXX11/src/Tensor/TensorBroadcasting.h | 767 +++- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 278 +- .../CXX11/src/Tensor/TensorConcatenation.h | 84 +- .../CXX11/src/Tensor/TensorContraction.h | 623 +++- .../src/Tensor/TensorContractionBlocking.h | 39 +- .../CXX11/src/Tensor/TensorContractionCuda.h | 1393 +------- .../CXX11/src/Tensor/TensorContractionGpu.h | 1413 ++++++++ .../src/Tensor/TensorContractionMapper.h | 242 +- .../CXX11/src/Tensor/TensorContractionSycl.h | 1650 +++++++++ .../src/Tensor/TensorContractionThreadPool.h | 1585 ++++++--- .../Eigen/CXX11/src/Tensor/TensorConversion.h | 269 +- .../CXX11/src/Tensor/TensorConvolution.h | 190 +- .../CXX11/src/Tensor/TensorConvolutionSycl.h | 544 +++ .../Eigen/CXX11/src/Tensor/TensorCostModel.h | 10 +- .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 90 +- .../Eigen/CXX11/src/Tensor/TensorDevice.h | 69 + .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 337 +- .../CXX11/src/Tensor/TensorDeviceDefault.h | 33 +- .../Eigen/CXX11/src/Tensor/TensorDeviceGpu.h | 389 +++ .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 1070 +++++- .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 354 +- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 166 +- .../Eigen/CXX11/src/Tensor/TensorEvalTo.h | 111 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 630 +++- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 695 +++- .../Eigen/CXX11/src/Tensor/TensorExpr.h | 23 +- .../Eigen/CXX11/src/Tensor/TensorFFT.h | 86 +- .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 34 +- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 174 +- .../src/Tensor/TensorForwardDeclarations.h | 90 +- .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 191 +- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 151 +- .../src/Tensor/TensorGpuHipCudaDefines.h | 99 + .../src/Tensor/TensorGpuHipCudaUndefines.h | 44 + .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 164 +- .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 203 +- .../Eigen/CXX11/src/Tensor/TensorInflation.h | 28 +- .../CXX11/src/Tensor/TensorInitializer.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorIntDiv.h | 28 +- .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 71 +- .../Eigen/CXX11/src/Tensor/TensorMacros.h | 52 +- .../Eigen/CXX11/src/Tensor/TensorMap.h | 126 +- .../Eigen/CXX11/src/Tensor/TensorMeta.h | 129 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 568 ++- .../Eigen/CXX11/src/Tensor/TensorPadding.h | 375 +- .../Eigen/CXX11/src/Tensor/TensorPatch.h | 32 +- .../Eigen/CXX11/src/Tensor/TensorRandom.h | 198 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 383 +- .../CXX11/src/Tensor/TensorReductionCuda.h | 750 +--- .../CXX11/src/Tensor/TensorReductionGpu.h | 966 +++++ .../CXX11/src/Tensor/TensorReductionSycl.h | 744 ++-- .../Eigen/CXX11/src/Tensor/TensorRef.h | 37 +- .../Eigen/CXX11/src/Tensor/TensorReverse.h | 265 +- .../Eigen/CXX11/src/Tensor/TensorScan.h | 423 ++- .../Eigen/CXX11/src/Tensor/TensorScanSycl.h | 513 +++ .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 325 +- .../Eigen/CXX11/src/Tensor/TensorStorage.h | 35 +- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 80 +- .../Eigen/CXX11/src/Tensor/TensorSycl.h | 82 - .../TensorSyclConvertToDeviceExpression.h | 121 - .../src/Tensor/TensorSyclExprConstructor.h | 239 -- .../src/Tensor/TensorSyclExtractAccessor.h | 204 -- .../src/Tensor/TensorSyclExtractFunctors.h | 177 - .../CXX11/src/Tensor/TensorSyclLeafCount.h | 114 - .../src/Tensor/TensorSyclPlaceHolderExpr.h | 181 - .../Eigen/CXX11/src/Tensor/TensorSyclRun.h | 70 - .../Eigen/CXX11/src/Tensor/TensorSyclTuple.h | 234 -- .../Eigen/CXX11/src/Tensor/TensorTrace.h | 303 ++ .../Eigen/CXX11/src/Tensor/TensorTraits.h | 44 +- .../Eigen/CXX11/src/Tensor/TensorUInt128.h | 1 + .../CXX11/src/Tensor/TensorVolumePatch.h | 103 +- .../TensorSymmetry/util/TemplateGroupTheory.h | 9 +- .../Eigen/CXX11/src/ThreadPool/Barrier.h | 67 + .../Eigen/CXX11/src/ThreadPool/EventCount.h | 196 +- .../src/ThreadPool/NonBlockingThreadPool.h | 398 ++- .../Eigen/CXX11/src/ThreadPool/RunQueue.h | 100 +- .../CXX11/src/ThreadPool/SimpleThreadPool.h | 154 - .../Eigen/CXX11/src/ThreadPool/ThreadCancel.h | 23 + .../CXX11/src/ThreadPool/ThreadEnvironment.h | 2 + .../Eigen/CXX11/src/ThreadPool/ThreadLocal.h | 289 +- .../src/ThreadPool/ThreadPoolInterface.h | 15 + .../Eigen/CXX11/src/util/CXX11Meta.h | 93 +- .../Eigen/CXX11/src/util/CXX11Workarounds.h | 8 +- .../Eigen/CXX11/src/util/EmulateArray.h | 54 +- .../Eigen/CXX11/src/util/EmulateCXX11Meta.h | 311 -- .../Eigen/CXX11/src/util/MaxSizeVector.h | 51 +- .../include/unsupported/Eigen/EulerAngles | 8 +- TMB/inst/include/unsupported/Eigen/FFT | 17 +- .../unsupported/Eigen/IterativeSolvers | 21 +- .../unsupported/Eigen/LevenbergMarquardt | 16 +- .../include/unsupported/Eigen/MPRealSupport | 6 +- .../include/unsupported/Eigen/MatrixFunctions | 25 +- .../unsupported/Eigen/MoreVectorization | 2 +- .../unsupported/Eigen/NonLinearOptimization | 44 +- .../include/unsupported/Eigen/NumericalDiff | 2 +- .../include/unsupported/Eigen/OpenGLSupport | 6 +- .../include/unsupported/Eigen/Polynomials | 9 +- TMB/inst/include/unsupported/Eigen/Skyline | 6 +- .../include/unsupported/Eigen/SparseExtra | 1 + .../unsupported/Eigen/SpecialFunctions | 46 +- TMB/inst/include/unsupported/Eigen/Splines | 4 + .../Eigen/src/AutoDiff/AutoDiffScalar.h | 111 +- .../include/unsupported/Eigen/src/BVH/KdBVH.h | 3 +- .../ArpackSelfAdjointEigenSolver.h | 25 +- .../Eigen/src/EulerAngles/CMakeLists.txt | 4 +- .../Eigen/src/EulerAngles/EulerAngles.h | 257 +- .../Eigen/src/EulerAngles/EulerSystem.h | 197 +- .../unsupported/Eigen/src/FFT/ei_fftw_impl.h | 4 +- .../Eigen/src/FFT/ei_kissfft_impl.h | 53 +- .../IterativeSolvers/ConstrainedConjGrad.h | 10 +- .../Eigen/src/IterativeSolvers/DGMRES.h | 122 +- .../Eigen/src/IterativeSolvers/GMRES.h | 38 +- .../Eigen/src/IterativeSolvers/IDRS.h | 436 +++ .../IterativeSolvers/IterationController.h | 2 +- .../Eigen/src/IterativeSolvers/MINRES.h | 38 +- .../Eigen/src/IterativeSolvers/Scaling.h | 6 + .../KroneckerProduct/KroneckerTensorProduct.h | 4 +- .../Eigen/src/LevenbergMarquardt/LMqrsolv.h | 2 +- .../LevenbergMarquardt/LevenbergMarquardt.h | 6 +- .../src/MatrixFunctions/MatrixExponential.h | 51 +- .../src/MatrixFunctions/MatrixFunction.h | 35 +- .../src/MatrixFunctions/MatrixLogarithm.h | 22 +- .../Eigen/src/MatrixFunctions/MatrixPower.h | 32 +- .../src/MatrixFunctions/MatrixSquareRoot.h | 34 +- .../HybridNonLinearSolver.h | 6 +- .../Eigen/src/NonLinearOptimization/qrsolv.h | 2 +- .../Eigen/src/NonLinearOptimization/r1updt.h | 2 +- .../Eigen/src/Polynomials/Companion.h | 94 +- .../Eigen/src/Polynomials/PolynomialSolver.h | 46 +- .../Eigen/src/Polynomials/PolynomialUtils.h | 8 +- .../Eigen/src/Skyline/SkylineInplaceLU.h | 4 +- .../Eigen/src/Skyline/SkylineMatrix.h | 18 +- .../Eigen/src/Skyline/SkylineMatrixBase.h | 10 +- .../Eigen/src/Skyline/SkylineStorage.h | 2 +- .../Eigen/src/SparseExtra/BlockSparseMatrix.h | 2 +- .../src/SparseExtra/DynamicSparseMatrix.h | 22 +- .../Eigen/src/SparseExtra/MarketIO.h | 96 +- .../Eigen/src/SparseExtra/RandomSetter.h | 54 +- .../BesselFunctionsArrayAPI.h | 286 ++ .../BesselFunctionsBFloat16.h | 68 + .../BesselFunctionsFunctors.h | 357 ++ .../SpecialFunctions/BesselFunctionsHalf.h | 66 + .../SpecialFunctions/BesselFunctionsImpl.h | 1959 +++++++++++ .../BesselFunctionsPacketMath.h | 118 + .../SpecialFunctions/HipVectorCompatibility.h | 67 + .../SpecialFunctionsArrayAPI.h | 55 +- .../SpecialFunctionsBFloat16.h | 58 + .../SpecialFunctionsFunctors.h | 140 +- .../SpecialFunctions/SpecialFunctionsHalf.h | 11 + .../SpecialFunctions/SpecialFunctionsImpl.h | 1048 ++++-- .../SpecialFunctionsPacketMath.h | 23 +- .../arch/AVX/BesselFunctions.h | 46 + .../arch/AVX/SpecialFunctions.h | 16 + .../arch/AVX512/BesselFunctions.h | 46 + .../arch/AVX512/SpecialFunctions.h | 16 + .../arch/CUDA/CudaSpecialFunctions.h | 165 - .../arch/GPU/SpecialFunctions.h | 369 ++ .../arch/NEON/BesselFunctions.h | 54 + .../arch/NEON/SpecialFunctions.h | 34 + .../unsupported/Eigen/src/Splines/Spline.h | 9 +- .../Eigen/src/Splines/SplineFitting.h | 11 +- .../unsupported/Eigen/src/Splines/SplineFwd.h | 2 +- TMB/inst/include/unsupported/README.txt | 2 +- .../include/unsupported/bench/bench_svd.cpp | 2 +- TMB/inst/include/unsupported/doc/Overview.dox | 3 + TMB/inst/include/unsupported/doc/SYCL.dox | 9 + .../unsupported/doc/examples/CMakeLists.txt | 22 +- .../unsupported/doc/examples/EulerAngles.cpp | 4 +- .../include/unsupported/doc/examples/FFT.cpp | 6 +- .../doc/examples/SYCL/CMakeLists.txt | 37 + .../doc/examples/SYCL/CwiseMul.cpp | 63 + .../unsupported/doc/snippets/CMakeLists.txt | 24 +- TMB/inst/include/unsupported/test/BVH.cpp | 2 +- .../include/unsupported/test/CMakeLists.txt | 378 +- .../include/unsupported/test/EulerAngles.cpp | 308 +- TMB/inst/include/unsupported/test/FFTW.cpp | 2 +- .../test/NonLinearOptimization.cpp | 123 +- .../unsupported/test/NumericalDiff.cpp | 4 +- .../unsupported/test/alignedvector3.cpp | 5 +- .../include/unsupported/test/autodiff.cpp | 42 +- .../unsupported/test/autodiff_scalar.cpp | 5 +- .../unsupported/test/bessel_functions.cpp | 370 ++ .../unsupported/test/cxx11_eventcount.cpp | 12 +- .../unsupported/test/cxx11_maxsizevector.cpp | 77 + .../include/unsupported/test/cxx11_meta.cpp | 2 +- .../test/cxx11_non_blocking_thread_pool.cpp | 83 +- .../unsupported/test/cxx11_runqueue.cpp | 2 +- .../unsupported/test/cxx11_tensor_argmax.cpp | 2 +- ...max_cuda.cu => cxx11_tensor_argmax_gpu.cu} | 93 +- .../test/cxx11_tensor_argmax_sycl.cpp | 258 ++ .../unsupported/test/cxx11_tensor_assign.cpp | 2 +- .../test/cxx11_tensor_block_access.cpp | 576 +++ .../test/cxx11_tensor_block_eval.cpp | 858 +++++ .../test/cxx11_tensor_block_io.cpp | 445 +++ .../test/cxx11_tensor_broadcast_sycl.cpp | 118 +- .../test/cxx11_tensor_broadcasting.cpp | 141 +- .../test/cxx11_tensor_builtins_sycl.cpp | 354 ++ ...da.cu => cxx11_tensor_cast_float16_gpu.cu} | 13 +- .../unsupported/test/cxx11_tensor_casts.cpp | 83 +- .../test/cxx11_tensor_chipping.cpp | 10 +- .../test/cxx11_tensor_chipping_sycl.cpp | 623 ++++ .../test/cxx11_tensor_comparisons.cpp | 2 +- ... => cxx11_tensor_complex_cwise_ops_gpu.cu} | 21 +- ...ex_cuda.cu => cxx11_tensor_complex_gpu.cu} | 49 +- .../test/cxx11_tensor_concatenation.cpp | 10 +- .../test/cxx11_tensor_concatenation_sycl.cpp | 180 + .../unsupported/test/cxx11_tensor_const.cpp | 2 +- ...t_cuda.cu => cxx11_tensor_contract_gpu.cu} | 96 +- .../test/cxx11_tensor_contract_sycl.cpp | 1026 ++++++ .../test/cxx11_tensor_contraction.cpp | 120 +- .../test/cxx11_tensor_convolution.cpp | 5 +- .../test/cxx11_tensor_convolution_sycl.cpp | 469 +++ .../test/cxx11_tensor_custom_index.cpp | 2 +- .../test/cxx11_tensor_custom_op.cpp | 2 +- .../test/cxx11_tensor_custom_op_sycl.cpp | 170 + .../unsupported/test/cxx11_tensor_device.cu | 66 +- .../test/cxx11_tensor_device_sycl.cpp | 64 +- .../test/cxx11_tensor_dimension.cpp | 21 +- .../unsupported/test/cxx11_tensor_empty.cpp | 2 +- .../test/cxx11_tensor_executor.cpp | 731 ++++ .../unsupported/test/cxx11_tensor_expr.cpp | 180 +- .../unsupported/test/cxx11_tensor_fft.cpp | 33 +- .../test/cxx11_tensor_fixed_size.cpp | 4 +- .../test/cxx11_tensor_forced_eval.cpp | 4 +- .../test/cxx11_tensor_forced_eval_sycl.cpp | 63 +- .../test/cxx11_tensor_generator.cpp | 8 +- .../test/cxx11_tensor_generator_sycl.cpp | 147 + ...x11_tensor_cuda.cu => cxx11_tensor_gpu.cu} | 958 +++-- .../unsupported/test/cxx11_tensor_ifft.cpp | 2 +- .../test/cxx11_tensor_image_op_sycl.cpp | 103 + .../test/cxx11_tensor_image_patch.cpp | 54 +- .../test/cxx11_tensor_image_patch_sycl.cpp | 1092 ++++++ .../test/cxx11_tensor_index_list.cpp | 35 +- .../test/cxx11_tensor_inflation.cpp | 2 +- .../test/cxx11_tensor_inflation_sycl.cpp | 136 + .../unsupported/test/cxx11_tensor_intdiv.cpp | 2 +- .../unsupported/test/cxx11_tensor_io.cpp | 2 +- .../test/cxx11_tensor_layout_swap.cpp | 2 +- .../test/cxx11_tensor_layout_swap_sycl.cpp | 126 + .../unsupported/test/cxx11_tensor_lvalue.cpp | 2 +- .../unsupported/test/cxx11_tensor_map.cpp | 68 +- .../unsupported/test/cxx11_tensor_math.cpp | 2 +- .../test/cxx11_tensor_math_sycl.cpp | 105 + .../test/cxx11_tensor_mixed_indices.cpp | 2 +- .../test/cxx11_tensor_morphing.cpp | 198 +- .../test/cxx11_tensor_morphing_sycl.cpp | 386 ++ .../unsupported/test/cxx11_tensor_move.cpp | 76 + .../test/cxx11_tensor_notification.cpp | 39 +- .../test/cxx11_tensor_of_complex.cpp | 2 +- .../test/cxx11_tensor_of_const_values.cpp | 2 +- ...cuda.cu => cxx11_tensor_of_float16_gpu.cu} | 144 +- .../test/cxx11_tensor_of_strings.cpp | 2 +- .../unsupported/test/cxx11_tensor_padding.cpp | 2 +- .../test/cxx11_tensor_padding_sycl.cpp | 157 + .../unsupported/test/cxx11_tensor_patch.cpp | 2 +- .../test/cxx11_tensor_patch_sycl.cpp | 249 ++ .../unsupported/test/cxx11_tensor_random.cpp | 20 +- ...dom_cuda.cu => cxx11_tensor_random_gpu.cu} | 34 +- .../test/cxx11_tensor_random_sycl.cpp | 100 + .../test/cxx11_tensor_reduction.cpp | 88 +- ..._cuda.cu => cxx11_tensor_reduction_gpu.cu} | 13 +- .../test/cxx11_tensor_reduction_sycl.cpp | 1002 +++++- .../unsupported/test/cxx11_tensor_ref.cpp | 2 +- .../unsupported/test/cxx11_tensor_reverse.cpp | 2 +- .../test/cxx11_tensor_reverse_sycl.cpp | 253 ++ .../test/cxx11_tensor_roundings.cpp | 2 +- .../unsupported/test/cxx11_tensor_scan.cpp | 2 +- ..._scan_cuda.cu => cxx11_tensor_scan_gpu.cu} | 29 +- .../test/cxx11_tensor_scan_sycl.cpp | 141 + .../test/cxx11_tensor_shuffling.cpp | 67 +- .../test/cxx11_tensor_shuffling_sycl.cpp | 117 + .../unsupported/test/cxx11_tensor_simple.cpp | 2 +- .../test/cxx11_tensor_striding.cpp | 2 +- .../test/cxx11_tensor_striding_sycl.cpp | 203 ++ .../unsupported/test/cxx11_tensor_sugar.cpp | 2 +- .../unsupported/test/cxx11_tensor_sycl.cpp | 308 +- .../test/cxx11_tensor_symmetry.cpp | 2 +- .../test/cxx11_tensor_thread_local.cpp | 149 + .../test/cxx11_tensor_thread_pool.cpp | 390 ++- .../unsupported/test/cxx11_tensor_trace.cpp | 172 + .../unsupported/test/cxx11_tensor_uint128.cpp | 4 +- .../test/cxx11_tensor_volume_patch.cpp | 8 +- .../test/cxx11_tensor_volume_patch_sycl.cpp | 222 ++ TMB/inst/include/unsupported/test/dgmres.cpp | 4 +- .../unsupported/test/forward_adolc.cpp | 6 +- TMB/inst/include/unsupported/test/gmres.cpp | 2 +- TMB/inst/include/unsupported/test/idrs.cpp | 27 + .../unsupported/test/kronecker_product.cpp | 28 +- .../unsupported/test/levenberg_marquardt.cpp | 2 +- .../unsupported/test/matrix_exponential.cpp | 2 +- .../unsupported/test/matrix_function.cpp | 52 +- .../include/unsupported/test/matrix_power.cpp | 42 +- .../unsupported/test/matrix_square_root.cpp | 2 +- TMB/inst/include/unsupported/test/minres.cpp | 2 +- .../include/unsupported/test/mpreal/mpreal.h | 3104 ----------------- .../unsupported/test/mpreal_support.cpp | 3 +- .../unsupported/test/openglsupport.cpp | 639 +++- .../unsupported/test/polynomialsolver.cpp | 58 +- .../unsupported/test/polynomialutils.cpp | 2 +- .../include/unsupported/test/sparse_extra.cpp | 91 +- .../unsupported/test/special_functions.cpp | 234 +- .../unsupported/test/special_packetmath.cpp | 149 + TMB/inst/include/unsupported/test/splines.cpp | 2 +- 318 files changed, 40173 insertions(+), 13988 deletions(-) create mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h create mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h create mode 100755 TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h create mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h create mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h create mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h create mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h create mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h create mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h delete mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h delete mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h delete mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h delete mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h delete mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h delete mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h delete mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h delete mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h delete mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h create mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h create mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h delete mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h create mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h delete mode 100644 TMB/inst/include/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h create mode 100755 TMB/inst/include/unsupported/Eigen/src/IterativeSolvers/IDRS.h create mode 100644 TMB/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h create mode 100644 TMB/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsBFloat16.h create mode 100644 TMB/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h create mode 100644 TMB/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h create mode 100644 TMB/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h create mode 100644 TMB/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h create mode 100644 TMB/inst/include/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h create mode 100644 TMB/inst/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsBFloat16.h create mode 100644 TMB/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h create mode 100644 TMB/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h create mode 100644 TMB/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h create mode 100644 TMB/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h delete mode 100644 TMB/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h create mode 100644 TMB/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h create mode 100644 TMB/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h create mode 100644 TMB/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h create mode 100644 TMB/inst/include/unsupported/doc/SYCL.dox create mode 100644 TMB/inst/include/unsupported/doc/examples/SYCL/CMakeLists.txt create mode 100644 TMB/inst/include/unsupported/doc/examples/SYCL/CwiseMul.cpp create mode 100644 TMB/inst/include/unsupported/test/bessel_functions.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_maxsizevector.cpp rename TMB/inst/include/unsupported/test/{cxx11_tensor_argmax_cuda.cu => cxx11_tensor_argmax_gpu.cu} (71%) create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_argmax_sycl.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_block_access.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_block_eval.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_block_io.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_builtins_sycl.cpp rename TMB/inst/include/unsupported/test/{cxx11_tensor_cast_float16_cuda.cu => cxx11_tensor_cast_float16_gpu.cu} (88%) create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_chipping_sycl.cpp rename TMB/inst/include/unsupported/test/{cxx11_tensor_complex_cwise_ops_cuda.cu => cxx11_tensor_complex_cwise_ops_gpu.cu} (89%) rename TMB/inst/include/unsupported/test/{cxx11_tensor_complex_cuda.cu => cxx11_tensor_complex_gpu.cu} (76%) create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_concatenation_sycl.cpp rename TMB/inst/include/unsupported/test/{cxx11_tensor_contract_cuda.cu => cxx11_tensor_contract_gpu.cu} (68%) create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_contract_sycl.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_convolution_sycl.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_custom_op_sycl.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_executor.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_generator_sycl.cpp rename TMB/inst/include/unsupported/test/{cxx11_tensor_cuda.cu => cxx11_tensor_gpu.cu} (56%) create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_image_op_sycl.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_image_patch_sycl.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_inflation_sycl.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_math_sycl.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_morphing_sycl.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_move.cpp rename TMB/inst/include/unsupported/test/{cxx11_tensor_of_float16_cuda.cu => cxx11_tensor_of_float16_gpu.cu} (84%) create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_padding_sycl.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_patch_sycl.cpp rename TMB/inst/include/unsupported/test/{cxx11_tensor_random_cuda.cu => cxx11_tensor_random_gpu.cu} (63%) create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_random_sycl.cpp rename TMB/inst/include/unsupported/test/{cxx11_tensor_reduction_cuda.cu => cxx11_tensor_reduction_gpu.cu} (94%) create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_reverse_sycl.cpp rename TMB/inst/include/unsupported/test/{cxx11_tensor_scan_cuda.cu => cxx11_tensor_scan_gpu.cu} (73%) create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_scan_sycl.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_shuffling_sycl.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_striding_sycl.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_thread_local.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_trace.cpp create mode 100644 TMB/inst/include/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp create mode 100644 TMB/inst/include/unsupported/test/idrs.cpp delete mode 100644 TMB/inst/include/unsupported/test/mpreal/mpreal.h create mode 100644 TMB/inst/include/unsupported/test/special_packetmath.cpp diff --git a/TMB/inst/include/unsupported/CMakeLists.txt b/TMB/inst/include/unsupported/CMakeLists.txt index 4fef40a86..34408c017 100644 --- a/TMB/inst/include/unsupported/CMakeLists.txt +++ b/TMB/inst/include/unsupported/CMakeLists.txt @@ -1,7 +1,11 @@ add_subdirectory(Eigen) -add_subdirectory(doc EXCLUDE_FROM_ALL) -if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) - add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest -else() - add_subdirectory(test EXCLUDE_FROM_ALL) +if(EIGEN_BUILD_DOC) + add_subdirectory(doc EXCLUDE_FROM_ALL) +endif() +if(BUILD_TESTING) + if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) + add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest + else() + add_subdirectory(test EXCLUDE_FROM_ALL) + endif() endif() diff --git a/TMB/inst/include/unsupported/Eigen/AdolcForward b/TMB/inst/include/unsupported/Eigen/AdolcForward index 15f5f0731..56caeaebf 100644 --- a/TMB/inst/include/unsupported/Eigen/AdolcForward +++ b/TMB/inst/include/unsupported/Eigen/AdolcForward @@ -40,7 +40,7 @@ # undef realloc #endif -#include +#include "../../Eigen/Core" namespace Eigen { @@ -74,6 +74,9 @@ inline adouble imag(const adouble&) { return 0.; } inline adouble abs(const adouble& x) { return fabs(x); } inline adouble abs2(const adouble& x) { return x*x; } +inline bool (isinf)(const adouble& x) { return (Eigen::numext::isinf)(x.getValue()); } +inline bool (isnan)(const adouble& x) { return (Eigen::numext::isnan)(x.getValue()); } + } namespace Eigen { diff --git a/TMB/inst/include/unsupported/Eigen/AlignedVector3 b/TMB/inst/include/unsupported/Eigen/AlignedVector3 index 47a86d4c0..4fa1842ac 100644 --- a/TMB/inst/include/unsupported/Eigen/AlignedVector3 +++ b/TMB/inst/include/unsupported/Eigen/AlignedVector3 @@ -10,7 +10,9 @@ #ifndef EIGEN_ALIGNED_VECTOR3 #define EIGEN_ALIGNED_VECTOR3 -#include +#include "../../Eigen/Geometry" + +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" namespace Eigen { @@ -76,6 +78,9 @@ template class AlignedVector3 { return m_coeffs.coeffRef(index);} + inline AlignedVector3() + {} + inline AlignedVector3(const Scalar& x, const Scalar& y, const Scalar& z) : m_coeffs(x, y, z, Scalar(0)) {} @@ -129,6 +134,9 @@ template class AlignedVector3 inline AlignedVector3 operator-(const AlignedVector3& other) const { return AlignedVector3(m_coeffs - other.m_coeffs); } + inline AlignedVector3 operator-() const + { return AlignedVector3(-m_coeffs); } + inline AlignedVector3 operator-=(const AlignedVector3& other) { m_coeffs -= other.m_coeffs; return *this; } @@ -221,4 +229,6 @@ struct evaluator > } +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + #endif // EIGEN_ALIGNED_VECTOR3 diff --git a/TMB/inst/include/unsupported/Eigen/ArpackSupport b/TMB/inst/include/unsupported/Eigen/ArpackSupport index 37a2799ef..67c4ac838 100644 --- a/TMB/inst/include/unsupported/Eigen/ArpackSupport +++ b/TMB/inst/include/unsupported/Eigen/ArpackSupport @@ -9,9 +9,7 @@ #ifndef EIGEN_ARPACKSUPPORT_MODULE_H #define EIGEN_ARPACKSUPPORT_MODULE_H -#include - -#include +#include "../../Eigen/Core" /** \defgroup ArpackSupport_Module Arpack support module * @@ -22,10 +20,11 @@ * \endcode */ -#include +#include "../../Eigen/SparseCholesky" + +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" #include "src/Eigenvalues/ArpackSelfAdjointEigenSolver.h" -#include +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_ARPACKSUPPORT_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/TMB/inst/include/unsupported/Eigen/AutoDiff b/TMB/inst/include/unsupported/Eigen/AutoDiff index abf5b7d67..7a4ff460c 100644 --- a/TMB/inst/include/unsupported/Eigen/AutoDiff +++ b/TMB/inst/include/unsupported/Eigen/AutoDiff @@ -28,11 +28,17 @@ namespace Eigen { //@{ } +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + #include "src/AutoDiff/AutoDiffScalar.h" // #include "src/AutoDiff/AutoDiffVector.h" #include "src/AutoDiff/AutoDiffJacobian.h" +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + + + namespace Eigen { //@} } diff --git a/TMB/inst/include/unsupported/Eigen/BVH b/TMB/inst/include/unsupported/Eigen/BVH index 0161a5402..666c9835f 100644 --- a/TMB/inst/include/unsupported/Eigen/BVH +++ b/TMB/inst/include/unsupported/Eigen/BVH @@ -10,9 +10,9 @@ #ifndef EIGEN_BVH_MODULE_H #define EIGEN_BVH_MODULE_H -#include -#include -#include +#include "../../Eigen/Core" +#include "../../Eigen/Geometry" +#include "../../Eigen/StdVector" #include #include diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/Tensor b/TMB/inst/include/unsupported/Eigen/CXX11/Tensor index 7ecb4c74d..0938bb554 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/Tensor +++ b/TMB/inst/include/unsupported/Eigen/CXX11/Tensor @@ -13,21 +13,11 @@ #include "../../../Eigen/Core" -#ifdef EIGEN_USE_SYCL -#undef min -#undef max -#undef isnan -#undef isinf -#undef isfinite -#include -#include -#include -#include -#endif - -#include +#if EIGEN_HAS_CXX11 #include "../SpecialFunctions" + +#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" #include "src/util/CXX11Meta.h" #include "src/util/MaxSizeVector.h" @@ -39,46 +29,29 @@ * \code * #include * \endcode + * + * Much of the documentation can be found \ref eigen_tensors "here". */ +#include +#include #include #include #include - -#ifdef _WIN32 -typedef __int16 int16_t; -typedef unsigned __int16 uint16_t; -typedef __int32 int32_t; -typedef unsigned __int32 uint32_t; -typedef __int64 int64_t; -typedef unsigned __int64 uint64_t; -#else -#include -#endif - -#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900 #include -#endif - -#ifdef _WIN32 -#include -#elif defined(__APPLE__) -#include -#else -#include -#endif +#include -#ifdef EIGEN_USE_THREADS +#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL) #include "ThreadPool" #endif #ifdef EIGEN_USE_GPU -#include -#include -#if __cplusplus >= 201103L -#include -#include -#endif + #include + #if defined(EIGEN_USE_HIP) + #include + #else + #include + #endif #endif #include "src/Tensor/TensorMacros.h" @@ -88,7 +61,10 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorCostModel.h" #include "src/Tensor/TensorDeviceDefault.h" #include "src/Tensor/TensorDeviceThreadPool.h" -#include "src/Tensor/TensorDeviceCuda.h" +#include "src/Tensor/TensorDeviceGpu.h" +#ifndef gpu_assert +#define gpu_assert(x) +#endif #include "src/Tensor/TensorDeviceSycl.h" #include "src/Tensor/TensorIndexList.h" #include "src/Tensor/TensorDimensionList.h" @@ -101,18 +77,19 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorGlobalFunctions.h" #include "src/Tensor/TensorBase.h" +#include "src/Tensor/TensorBlock.h" #include "src/Tensor/TensorEvaluator.h" #include "src/Tensor/TensorExpr.h" #include "src/Tensor/TensorReduction.h" -#include "src/Tensor/TensorReductionCuda.h" +#include "src/Tensor/TensorReductionGpu.h" #include "src/Tensor/TensorArgMax.h" #include "src/Tensor/TensorConcatenation.h" #include "src/Tensor/TensorContractionMapper.h" #include "src/Tensor/TensorContractionBlocking.h" #include "src/Tensor/TensorContraction.h" #include "src/Tensor/TensorContractionThreadPool.h" -#include "src/Tensor/TensorContractionCuda.h" +#include "src/Tensor/TensorContractionGpu.h" #include "src/Tensor/TensorConversion.h" #include "src/Tensor/TensorConvolution.h" #include "src/Tensor/TensorFFT.h" @@ -134,8 +111,15 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorGenerator.h" #include "src/Tensor/TensorAssign.h" #include "src/Tensor/TensorScan.h" +#include "src/Tensor/TensorTrace.h" + +#ifdef EIGEN_USE_SYCL +#include "src/Tensor/TensorReductionSycl.h" +#include "src/Tensor/TensorConvolutionSycl.h" +#include "src/Tensor/TensorContractionSycl.h" +#include "src/Tensor/TensorScanSycl.h" +#endif -#include "src/Tensor/TensorSycl.h" #include "src/Tensor/TensorExecutor.h" #include "src/Tensor/TensorDevice.h" @@ -147,6 +131,7 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorIO.h" -#include +#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" +#endif // EIGEN_HAS_CXX11 //#endif // EIGEN_CXX11_TENSOR_MODULE diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/TensorSymmetry b/TMB/inst/include/unsupported/Eigen/CXX11/TensorSymmetry index fb1b0c0fb..b09c5e472 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/TensorSymmetry +++ b/TMB/inst/include/unsupported/Eigen/CXX11/TensorSymmetry @@ -10,9 +10,9 @@ #ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE #define EIGEN_CXX11_TENSORSYMMETRY_MODULE -#include +#include "Tensor" -#include +#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" #include "src/util/CXX11Meta.h" @@ -33,7 +33,7 @@ #include "src/TensorSymmetry/StaticSymmetry.h" #include "src/TensorSymmetry/DynamicSymmetry.h" -#include +#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/ThreadPool b/TMB/inst/include/unsupported/Eigen/CXX11/ThreadPool index 09d637e9a..c5cafb2a1 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/ThreadPool +++ b/TMB/inst/include/unsupported/Eigen/CXX11/ThreadPool @@ -12,7 +12,7 @@ #include "../../../Eigen/Core" -#include +#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" /** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module * @@ -30,10 +30,9 @@ // The code depends on CXX11, so only include the module if the // compiler supports it. -#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900 +#if (EIGEN_COMP_CXXVER >= 11) #include #include -#include #include #include @@ -44,22 +43,32 @@ #include #include #include +#include + +// There are non-parenthesized calls to "max" in the header, +// which trigger a check in test/main.h causing compilation to fail. +// We work around the check here by removing the check for max in +// the case where we have to emulate thread_local. +#ifdef max +#undef max +#endif +#include #include "src/util/CXX11Meta.h" #include "src/util/MaxSizeVector.h" #include "src/ThreadPool/ThreadLocal.h" #include "src/ThreadPool/ThreadYield.h" +#include "src/ThreadPool/ThreadCancel.h" #include "src/ThreadPool/EventCount.h" #include "src/ThreadPool/RunQueue.h" #include "src/ThreadPool/ThreadPoolInterface.h" #include "src/ThreadPool/ThreadEnvironment.h" -#include "src/ThreadPool/SimpleThreadPool.h" +#include "src/ThreadPool/Barrier.h" #include "src/ThreadPool/NonBlockingThreadPool.h" #endif -#include +#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_CXX11_THREADPOOL_MODULE - diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/README.md b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/README.md index 98e83811b..2f65b1b0e 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/README.md +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -1,14 +1,12 @@ -# Eigen Tensors +# Eigen Tensors {#eigen_tensors} Tensors are multidimensional arrays of elements. Elements are typically scalars, but more complex types such as strings are also supported. -[TOC] - ## Tensor Classes You can manipulate a tensor with one of the following classes. They all are in -the namespace ```::Eigen.``` +the namespace `::Eigen.` ### Class Tensor @@ -23,8 +21,8 @@ different size to a Tensor, that tensor is resized to match its new value. #### Constructor Tensor(size0, size1, ...) -Constructor for a Tensor. The constructor must be passed ```rank``` integers -indicating the sizes of the instance along each of the the ```rank``` +Constructor for a Tensor. The constructor must be passed `rank` integers +indicating the sizes of the instance along each of the the `rank` dimensions. // Create a tensor of rank 3 of sizes 2, 3, 4. This tensor owns @@ -38,7 +36,7 @@ dimensions. Constructor where the sizes for the constructor are specified as an array of values instead of an explicitly list of parameters. The array type to use is -```Eigen::array```. The array can be constructed automatically +`Eigen::array`. The array can be constructed automatically from an initializer list. // Create a tensor of strings of rank 2 with sizes 5, 7. @@ -83,8 +81,8 @@ large enough to hold all the data. // You can also map fixed-size tensors. Here we get a 1d view of // the 2d fixed-size tensor. - Tensor> t_4x3; - TensorMap> t_12(t_4x3, 12); + TensorFixedSize> t_4x3; + TensorMap> t_12(t_4x3.data(), 12); #### Class TensorRef @@ -95,8 +93,8 @@ See Assigning to a TensorRef below. #### tensor(index0, index1...) -Return the element at position ```(index0, index1...)``` in tensor -```tensor```. You must pass as many parameters as the rank of ```tensor```. +Return the element at position `(index0, index1...)` in tensor +`tensor`. You must pass as many parameters as the rank of `tensor`. The expression can be used as an l-value to set the value of the element at the specified position. The value returned is of the datatype of the tensor. @@ -121,8 +119,8 @@ specified position. The value returned is of the datatype of the tensor. ## TensorLayout -The tensor library supports 2 layouts: ```ColMajor``` (the default) and -```RowMajor```. Only the default column major layout is currently fully +The tensor library supports 2 layouts: `ColMajor` (the default) and +`RowMajor`. Only the default column major layout is currently fully supported, and it is therefore not recommended to attempt to use the row major layout at the moment. @@ -136,7 +134,7 @@ All the arguments to an expression must use the same layout. Attempting to mix different layouts will result in a compilation error. It is possible to change the layout of a tensor or an expression using the -```swap_layout()``` method. Note that this will also reverse the order of the +`swap_layout()` method. Note that this will also reverse the order of the dimensions. Tensor col_major(2, 4); @@ -173,35 +171,35 @@ the following code computes the elementwise addition of two tensors: Tensor t3 = t1 + t2; While the code above looks easy enough, it is important to understand that the -expression ```t1 + t2``` is not actually adding the values of the tensors. The +expression `t1 + t2` is not actually adding the values of the tensors. The expression instead constructs a "tensor operator" object of the class TensorCwiseBinaryOp, which has references to the tensors -```t1``` and ```t2```. This is a small C++ object that knows how to add -```t1``` and ```t2```. It is only when the value of the expression is assigned -to the tensor ```t3``` that the addition is actually performed. Technically, -this happens through the overloading of ```operator=()``` in the Tensor class. +`t1` and `t2`. This is a small C++ object that knows how to add +`t1` and `t2`. It is only when the value of the expression is assigned +to the tensor `t3` that the addition is actually performed. Technically, +this happens through the overloading of `operator=()` in the Tensor class. This mechanism for computing tensor expressions allows for lazy evaluation and optimizations which are what make the tensor library very fast. -Of course, the tensor operators do nest, and the expression ```t1 + t2 * -0.3f``` is actually represented with the (approximate) tree of operators: +Of course, the tensor operators do nest, and the expression `t1 + t2 * 0.3f` +is actually represented with the (approximate) tree of operators: TensorCwiseBinaryOp(t1, TensorCwiseUnaryOp(t2, 0.3f)) ### Tensor Operations and C++ "auto" -Because Tensor operations create tensor operators, the C++ ```auto``` keyword +Because Tensor operations create tensor operators, the C++ `auto` keyword does not have its intuitive meaning. Consider these 2 lines of code: Tensor t3 = t1 + t2; auto t4 = t1 + t2; -In the first line we allocate the tensor ```t3``` and it will contain the -result of the addition of ```t1``` and ```t2```. In the second line, ```t4``` +In the first line we allocate the tensor `t3` and it will contain the +result of the addition of `t1` and `t2`. In the second line, `t4` is actually the tree of tensor operators that will compute the addition of -```t1``` and ```t2```. In fact, ```t4``` is *not* a tensor and you cannot get +`t1` and `t2`. In fact, `t4` is *not* a tensor and you cannot get the values of its elements: Tensor t3 = t1 + t2; @@ -210,8 +208,8 @@ the values of its elements: auto t4 = t1 + t2; cout << t4(0, 0, 0); // Compilation error! -When you use ```auto``` you do not get a Tensor as a result but instead a -non-evaluated expression. So only use ```auto``` to delay evaluation. +When you use `auto` you do not get a Tensor as a result but instead a +non-evaluated expression. So only use `auto` to delay evaluation. Unfortunately, there is no single underlying concrete type for holding non-evaluated expressions, hence you have to use auto in the case when you do @@ -257,9 +255,9 @@ There are several ways to control when expressions are evaluated: #### Assigning to a Tensor, TensorFixedSize, or TensorMap. The most common way to evaluate an expression is to assign it to a Tensor. In -the example below, the ```auto``` declarations make the intermediate values +the example below, the `auto` declarations make the intermediate values "Operations", not Tensors, and do not cause the expressions to be evaluated. -The assignment to the Tensor ```result``` causes the evaluation of all the +The assignment to the Tensor `result` causes the evaluation of all the operations. auto t3 = t1 + t2; // t3 is an Operation. @@ -272,7 +270,7 @@ Operation to a TensorFixedSize instead of a Tensor, which is a bit more efficient. // We know that the result is a 4x4x2 tensor! - TensorFixedSize result = t5; + TensorFixedSize> result = t5; Simiarly, assigning an expression to a TensorMap causes its evaluation. Like tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to @@ -282,7 +280,7 @@ have the rank and sizes of the expression that are assigned to them. When you compute large composite expressions, you sometimes want to tell Eigen that an intermediate value in the expression tree is worth evaluating ahead of -time. This is done by inserting a call to the ```eval()``` method of the +time. This is done by inserting a call to the `eval()` method of the expression Operation. // The previous example could have been written: @@ -291,15 +289,15 @@ expression Operation. // If you want to compute (t1 + t2) once ahead of time you can write: Tensor result = ((t1 + t2).eval() * 0.2f).exp(); -Semantically, calling ```eval()``` is equivalent to materializing the value of +Semantically, calling `eval()` is equivalent to materializing the value of the expression in a temporary Tensor of the right size. The code above in effect does: // .eval() knows the size! - TensorFixedSize tmp = t1 + t2; + TensorFixedSize> tmp = t1 + t2; Tensor result = (tmp * 0.2f).exp(); -Note that the return value of ```eval()``` is itself an Operation, so the +Note that the return value of `eval()` is itself an Operation, so the following code does not do what you may think: // Here t3 is an evaluation Operation. t3 has not been evaluated yet. @@ -312,24 +310,24 @@ following code does not do what you may think: // an intermediate tensor to represent t3.x Tensor result = t4; -While in the examples above calling ```eval()``` does not make a difference in +While in the examples above calling `eval()` does not make a difference in performance, in other cases it can make a huge difference. In the expression -below the ```broadcast()``` expression causes the ```X.maximum()``` expression +below the `broadcast()` expression causes the `X.maximum()` expression to be evaluated many times: Tensor<...> X ...; Tensor<...> Y = ((X - X.maximum(depth_dim).reshape(dims2d).broadcast(bcast)) * beta).exp(); -Inserting a call to ```eval()``` between the ```maximum()``` and -```reshape()``` calls guarantees that maximum() is only computed once and +Inserting a call to `eval()` between the `maximum()` and +`reshape()` calls guarantees that maximum() is only computed once and greatly speeds-up execution: Tensor<...> Y = ((X - X.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) * beta).exp(); -In the other example below, the tensor ```Y``` is both used in the expression +In the other example below, the tensor `Y` is both used in the expression and its assignment. This is an aliasing problem and if the evaluation is not done in the right order Y will be updated incrementally during the evaluation resulting in bogus results: @@ -337,8 +335,8 @@ resulting in bogus results: Tensor<...> Y ...; Y = Y / (Y.sum(depth_dim).reshape(dims2d).broadcast(bcast)); -Inserting a call to ```eval()``` between the ```sum()``` and ```reshape()``` -expressions ensures that the sum is computed before any updates to ```Y``` are +Inserting a call to `eval()` between the `sum()` and `reshape()` +expressions ensures that the sum is computed before any updates to `Y` are done. Y = Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); @@ -347,8 +345,8 @@ Note that an eval around the full right hand side expression is not needed because the generated has to compute the i-th value of the right hand side before assigning it to the left hand side. -However, if you were assigning the expression value to a shuffle of ```Y``` -then you would need to force an eval for correctness by adding an ```eval()``` +However, if you were assigning the expression value to a shuffle of `Y` +then you would need to force an eval for correctness by adding an `eval()` call for the right hand side: Y.shuffle(...) = @@ -361,7 +359,7 @@ If you need to access only a few elements from the value of an expression you can avoid materializing the value in a full tensor by using a TensorRef. A TensorRef is a small wrapper class for any Eigen Operation. It provides -overloads for the ```()``` operator that let you access individual values in +overloads for the `()` operator that let you access individual values in the expression. TensorRef is convenient, because the Operation themselves do not provide a way to access individual elements. @@ -390,7 +388,7 @@ such as contractions and convolutions. The implementations are optimized for different environments: single threaded on CPU, multi threaded on CPU, or on a GPU using cuda. Additional implementations may be added later. -You can choose which implementation to use with the ```device()``` call. If +You can choose which implementation to use with the `device()` call. If you do not choose an implementation explicitly the default implementation that uses a single thread on the CPU is used. @@ -406,7 +404,7 @@ single-threaded CPU implementation: Tensor b(30, 40); Tensor c = a + b; -To choose a different implementation you have to insert a ```device()``` call +To choose a different implementation you have to insert a `device()` call before the assignment of the result. For technical C++ reasons this requires that the Tensor for the result be declared on its own. This means that you have to know the size of the result. @@ -414,24 +412,27 @@ have to know the size of the result. Eigen::Tensor c(30, 40); c.device(...) = a + b; -The call to ```device()``` must be the last call on the left of the operator=. +The call to `device()` must be the last call on the left of the operator=. -You must pass to the ```device()``` call an Eigen device object. There are +You must pass to the `device()` call an Eigen device object. There are presently three devices you can use: DefaultDevice, ThreadPoolDevice and GpuDevice. #### Evaluating With the DefaultDevice -This is exactly the same as not inserting a ```device()``` call. +This is exactly the same as not inserting a `device()` call. DefaultDevice my_device; c.device(my_device) = a + b; #### Evaluating with a Thread Pool + // Create the Eigen ThreadPool + Eigen::ThreadPool pool(8 /* number of threads in pool */) + // Create the Eigen ThreadPoolDevice. - Eigen::ThreadPoolDevice my_device(4 /* number of threads to use */); + Eigen::ThreadPoolDevice my_device(&pool, 4 /* number of threads to use */); // Now just use the device when evaluating expressions. Eigen::Tensor c(30, 50); @@ -454,20 +455,20 @@ that are tensor-type specific: #### ::Dimensions -Acts like an array of ints. Has an ```int size``` attribute, and can be +Acts like an array of ints. Has an `int size` attribute, and can be indexed like an array to access individual values. Used to represent the -dimensions of a tensor. See ```dimensions()```. +dimensions of a tensor. See `dimensions()`. #### ::Index -Acts like an ```int```. Used for indexing tensors along their dimensions. See -```operator()```, ```dimension()```, and ```size()```. +Acts like an `int`. Used for indexing tensors along their dimensions. See +`operator()`, `dimension()`, and `size()`. #### ::Scalar Represents the datatype of individual tensor elements. For example, for a -```Tensor```, ```Scalar``` is the type ```float```. See -```setConstant()```. +`Tensor`, `Scalar` is the type `float`. See +`setConstant()`. #### @@ -501,7 +502,7 @@ known as the tensor "rank". ### Dimensions dimensions() Returns an array-like object representing the dimensions of the tensor. -The actual type of the dimensions() result is ::Dimensions. +The actual type of the `dimensions()` result is `::``Dimensions`. Eigen::Tensor a(3, 4); const Eigen::Tensor::Dimensions& d = a.dimensions(); @@ -509,7 +510,7 @@ The actual type of the dimensions() result is ::Dimensions. << ", dim 1: " << d[1]; => Dim size: 2, dim 0: 3, dim 1: 4 -If you use a C++11 compiler, you can use ```auto``` to simplify the code: +If you use a C++11 compiler, you can use `auto` to simplify the code: const auto& d = a.dimensions(); cout << "Dim size: " << d.size << ", dim 0: " << d[0] @@ -519,7 +520,7 @@ If you use a C++11 compiler, you can use ```auto``` to simplify the code: ### Index dimension(Index n) Returns the n-th dimension of the tensor. The actual type of the -```dimension()``` result is ```::Index```, but you can +`dimension()` result is `::``Index`, but you can always use it like an int. Eigen::Tensor a(3, 4); @@ -530,8 +531,8 @@ always use it like an int. ### Index size() Returns the total number of elements in the tensor. This is the product of all -the tensor dimensions. The actual type of the ```size()``` result is -```::Index```, but you can always use it like an int. +the tensor dimensions. The actual type of the `size()` result is +`::``Index`, but you can always use it like an int. Eigen::Tensor a(3, 4); cout << "Size: " << a.size(); @@ -540,11 +541,11 @@ the tensor dimensions. The actual type of the ```size()``` result is ### Getting Dimensions From An Operation -A few operations provide ```dimensions()``` directly, -e.g. ```TensorReslicingOp```. Most operations defer calculating dimensions +A few operations provide `dimensions()` directly, +e.g. `TensorReslicingOp`. Most operations defer calculating dimensions until the operation is being evaluated. If you need access to the dimensions of a deferred operation, you can wrap it in a TensorRef (see Assigning to a -TensorRef above), which provides ```dimensions()``` and ```dimension()``` as +TensorRef above), which provides `dimensions()` and `dimension()` as above. TensorRef can also wrap the plain Tensor types, so this is a useful idiom in @@ -567,11 +568,11 @@ to the rank of the tensor. The content of the tensor is not initialized. ### TensorFixedSize -Creates a tensor of the specified size. The number of arguments in the Size<> +Creates a tensor of the specified size. The number of arguments in the Sizes<> template parameter determines the rank of the tensor. The content of the tensor is not initialized. - Eigen::TensorFixedSize> a; + Eigen::TensorFixedSize> a; cout << "Rank: " << a.rank() << endl; => Rank: 2 cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; @@ -581,14 +582,14 @@ is not initialized. Creates a tensor mapping an existing array of data. The data must not be freed until the TensorMap is discarded, and the size of the data must be large enough -to accomodate of the coefficients of the tensor. +to accommodate the coefficients of the tensor. float data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - Eigen::TensorMap a(data, 3, 4); + Eigen::TensorMap> a(data, 3, 4); cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; => NumRows: 3 NumCols: 4 cout << "a(1, 2): " << a(1, 2) << endl; - => a(1, 2): 9 + => a(1, 2): 7 ## Contents Initialization @@ -604,7 +605,7 @@ result. These are not tensor Operations which delay evaluation. ### setConstant(const Scalar& val) -Sets all elements of the tensor to the constant value ```val```. ```Scalar``` +Sets all elements of the tensor to the constant value `val`. `Scalar` is the type of data stored in the tensor. You can pass any value that is convertible to that type. @@ -618,8 +619,8 @@ Returns the tensor itself in case you want to chain another call. 12.3 12.3 12.3 12.3 12.3 12.3 12.3 12.3 -Note that ```setConstant()``` can be used on any tensor where the element type -has a copy constructor and an ```operator=()```: +Note that `setConstant()` can be used on any tensor where the element type +has a copy constructor and an `operator=()`: Eigen::Tensor a(2, 3); a.setConstant("yolo"); @@ -632,7 +633,7 @@ has a copy constructor and an ```operator=()```: ### setZero() -Fills the tensor with zeros. Equivalent to ```setConstant(Scalar(0))```. +Fills the tensor with zeros. Equivalent to `setConstant(Scalar(0))`. Returns the tensor itself in case you want to chain another call. a.setZero(); @@ -653,10 +654,10 @@ If the tensor has rank N, the initializer list must be nested N times. The most deeply nested lists must contains P scalars of the Tensor type where P is the size of the last dimension of the Tensor. -For example, for a ```TensorFixedSize``` the initializer list must +For example, for a `TensorFixedSize` the initializer list must contains 2 lists of 3 floats each. -```setValues()``` returns the tensor itself in case you want to chain another +`setValues()` returns the tensor itself in case you want to chain another call. Eigen::Tensor a(2, 3); @@ -693,16 +694,16 @@ want to chain another call. -0.211234 0.823295 0.536459 -0.0452059 0.566198 -0.604897 -0.444451 0.257742 -You can customize ```setRandom()``` by providing your own random number +You can customize `setRandom()` by providing your own random number generator as a template argument: a.setRandom(); -Here, ```MyRandomGenerator``` must be a struct with the following member -functions, where Scalar and Index are the same as ```::Scalar``` -and ```::Index```. +Here, `MyRandomGenerator` must be a struct with the following member +functions, where Scalar and Index are the same as `::``Scalar` +and `::``Index`. -See ```struct UniformRandomGenerator``` in TensorFunctors.h for an example. +See `struct UniformRandomGenerator` in TensorFunctors.h for an example. // Custom number generator for use with setRandom(). struct MyRandomGenerator { @@ -767,7 +768,7 @@ Scalar is the type of data stored in the tensor. ## Tensor Operations -All the methods documented below return non evaluated tensor ```Operations```. +All the methods documented below return non evaluated tensor `Operations`. These can be chained: you can apply another Tensor Operation to the value returned by the method. @@ -778,7 +779,7 @@ their evaluation. ### constant(const Scalar& val) Returns a tensor of the same type and dimensions as the original tensor but -where all elements have the value ```val```. +where all elements have the value `val`. This is useful, for example, when you want to add or subtract a constant from a tensor, or multiply every element of a tensor by a scalar. @@ -810,7 +811,7 @@ but where all elements have random values. This is for example useful to add random values to an existing tensor. The generation of random values can be customized in the same manner -as for ```setRandom()```. +as for `setRandom()`. Eigen::Tensor a(2, 3); a.setConstant(1.0f); @@ -1013,16 +1014,23 @@ multidimensional case. Eigen::Tensor a(2, 3); a.setValues({{1, 2, 3}, {6, 5, 4}}); Eigen::Tensor b(3, 2); - a.setValues({{1, 2}, {4, 5}, {5, 6}}); + b.setValues({{1, 2}, {4, 5}, {5, 6}}); // Compute the traditional matrix product - array, 1> product_dims = { IndexPair(1, 0) }; + Eigen::array, 1> product_dims = { Eigen::IndexPair(1, 0) }; Eigen::Tensor AB = a.contract(b, product_dims); // Compute the product of the transpose of the matrices - array, 1> transpose_product_dims = { IndexPair(0, 1) }; + Eigen::array, 1> transposed_product_dims = { Eigen::IndexPair(0, 1) }; Eigen::Tensor AtBt = a.contract(b, transposed_product_dims); + // Contraction to scalar value using a double contraction. + // First coordinate of both tensors are contracted as well as both second coordinates, i.e., this computes the sum of the squares of the elements. + Eigen::array, 2> double_contraction_product_dims = { Eigen::IndexPair(0, 0), Eigen::IndexPair(1, 1) }; + Eigen::Tensor AdoubleContractedA = a.contract(a, double_contraction_product_dims); + + // Extracting the scalar value of the tensor contraction for further usage + int value = AdoubleContractedA(0); ## Reduction Operations @@ -1032,13 +1040,13 @@ original tensor. The values in the returned tensor are computed by applying a the dimensions along which the slices are made. The Eigen Tensor library provides a set of predefined reduction operators such -as ```maximum()``` and ```sum()``` and lets you define additional operators by +as `maximum()` and `sum()` and lets you define additional operators by implementing a few methods from a reductor template. ### Reduction Dimensions All reduction operations take a single parameter of type -```::Dimensions``` which can always be specified as an array of +`::``Dimensions` which can always be specified as an array of ints. These are called the "reduction dimensions." The values are the indices of the dimensions of the input tensor over which the reduction is done. The parameter can have at most as many element as the rank of the input tensor; @@ -1164,10 +1172,62 @@ short-circuiting, so may be significantly inefficient. ### reduce(const Dimensions& new_dims, const Reducer& reducer) -Reduce a tensor using a user-defined reduction operator. See ```SumReducer``` +Reduce a tensor using a user-defined reduction operator. See `SumReducer` in TensorFunctors.h for information on how to implement a reduction operator. +## Trace + +A *Trace* operation returns a tensor with fewer dimensions than the original +tensor. It returns a tensor whose elements are the sum of the elements of the +original tensor along the main diagonal for a list of specified dimensions, the +"trace dimensions". Similar to the `Reduction Dimensions`, the trace dimensions +are passed as an input parameter to the operation, are of type `::``Dimensions` +, and have the same requirements when passed as an input parameter. In addition, +the trace dimensions must have the same size. + +Example: Trace along 2 dimensions. + + // Create a tensor of 3 dimensions + Eigen::Tensor a(2, 2, 3); + a.setValues({{{1, 2, 3}, {4, 5, 6}}, {{7, 8, 9}, {10, 11, 12}}}); + // Specify the dimensions along which the trace will be computed. + // In this example, the trace can only be computed along the dimensions + // with indices 0 and 1 + Eigen::array dims({0, 1}); + // The output tensor contains all but the trace dimensions. + Tensor a_trace = a.trace(dims); + cout << "a_trace:" << endl; + cout << a_trace << endl; + => + a_trace: + 11 + 13 + 15 + + +### trace(const Dimensions& new_dims) +### trace() + +As a special case, if no parameter is passed to the operation, trace is computed +along *all* dimensions of the input tensor. + +Example: Trace along all dimensions. + + // Create a tensor of 3 dimensions, with all dimensions having the same size. + Eigen::Tensor a(3, 3, 3); + a.setValues({{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, + {{10, 11, 12}, {13, 14, 15}, {16, 17, 18}}, + {{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}}); + // Result is a zero dimension tensor + Tensor a_trace = a.trace(); + cout<<"a_trace:"< + a_trace: + 42 + + ## Scan Operations A *Scan* operation returns a tensor with the same dimensions as the original @@ -1191,7 +1251,7 @@ dd a comment to this line => a 1 2 3 - 6 5 4 + 4 5 6 b 1 3 6 @@ -1273,7 +1333,7 @@ the number of elements in the input tensor. This operation does not move any data in the input tensor, so the resulting contents of a reshaped Tensor depend on the data layout of the original Tensor. -For example this is what happens when you ```reshape()``` a 2D ColMajor tensor +For example this is what happens when you `reshape()` a 2D ColMajor tensor to one dimension: Eigen::Tensor a(2, 3); @@ -1314,7 +1374,7 @@ The previous example can be rewritten as follow: Eigen::Tensor a(2, 3); a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); Eigen::array two_dim({2, 3}); - Eigen::Tensor b; + Eigen::Tensor b(6); b.reshape(two_dim) = a; cout << "b" << endl << b << endl; => @@ -1378,7 +1438,7 @@ elements) along each of the dimensions. The argument strides is an array of Index values. The dimensions of the resulting tensor are ceil(input_dimensions[i] / strides[i]). -For example this is what happens when you ```stride()``` a 2D tensor: +For example this is what happens when you `stride()` a 2D tensor: Eigen::Tensor a(4, 3); a.setValues({{0, 100, 200}, {300, 400, 500}, {600, 700, 800}, {900, 1000, 1100}}); @@ -1482,7 +1542,7 @@ values that indicates whether or not the order of the coefficients should be reversed along each of the dimensions. This operation preserves the dimensions of the input tensor. -For example this is what happens when you ```reverse()``` the first dimension +For example this is what happens when you `reverse()` the first dimension of a 2D tensor: Eigen::Tensor a(4, 3); @@ -1568,85 +1628,83 @@ dimension in RowMajor layout. For example, given the following input tensor: - Eigen::Tensor tensor(3,4); - tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f}, - {4.0f, 5.0f, 6.0f, 7.0f}, - {8.0f, 9.0f, 10.0f, 11.0f}}); + Eigen::Tensor tensor(3,4); + tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f}, + {4.0f, 5.0f, 6.0f, 7.0f}, + {8.0f, 9.0f, 10.0f, 11.0f}}); - cout << "tensor: " << endl << tensor << endl; -=> -tensor: - 0 1 2 3 - 4 5 6 7 - 8 9 10 11 + cout << "tensor: " << endl << tensor << endl; + => + tensor: + 0 1 2 3 + 4 5 6 7 + 8 9 10 11 Six 2x2 patches can be extracted and indexed using the following code: - Eigen::Tensor patch; - Eigen::array patch_dims; - patch_dims[0] = 2; - patch_dims[1] = 2; - patch = tensor.extract_patches(patch_dims); - for (int k = 0; k < 6; ++k) { - cout << "patch index: " << k << endl; - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 2; ++j) { - if (DataLayout == ColMajor) { - cout << patch(i, j, k) << " "; - } else { - cout << patch(k, i, j) << " "; - } + Eigen::Tensor patch; + Eigen::array patch_dims; + patch_dims[0] = 2; + patch_dims[1] = 2; + patch = tensor.extract_patches(patch_dims); + for (int k = 0; k < 6; ++k) { + cout << "patch index: " << k << endl; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + if (DataLayout == ColMajor) { + cout << patch(i, j, k) << " "; + } else { + cout << patch(k, i, j) << " "; + } + } + cout << endl; } - cout << endl; } - } This code results in the following output when the data layout is ColMajor: -patch index: 0 -0 1 -4 5 -patch index: 1 -4 5 -8 9 -patch index: 2 -1 2 -5 6 -patch index: 3 -5 6 -9 10 -patch index: 4 -2 3 -6 7 -patch index: 5 -6 7 -10 11 + patch index: 0 + 0 1 + 4 5 + patch index: 1 + 4 5 + 8 9 + patch index: 2 + 1 2 + 5 6 + patch index: 3 + 5 6 + 9 10 + patch index: 4 + 2 3 + 6 7 + patch index: 5 + 6 7 + 10 11 This code results in the following output when the data layout is RowMajor: (NOTE: the set of patches is the same as in ColMajor, but are indexed differently). -patch index: 0 -0 1 -4 5 -patch index: 1 -1 2 -5 6 -patch index: 2 -2 3 -6 7 -patch index: 3 -4 5 -8 9 -patch index: 4 -5 6 -9 10 -patch index: 5 -6 7 -10 11 - -### extract_image_patches(const Index patch_rows, const Index patch_cols, - const Index row_stride, const Index col_stride, - const PaddingType padding_type) + patch index: 0 + 0 1 + 4 5 + patch index: 1 + 1 2 + 5 6 + patch index: 2 + 2 3 + 6 7 + patch index: 3 + 4 5 + 8 9 + patch index: 4 + 5 6 + 9 10 + patch index: 5 + 6 7 + 10 11 + +### extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type) Returns a tensor of coefficient image patches extracted from the input tensor, which is expected to have dimensions ordered as follows (depending on the data @@ -1676,28 +1734,30 @@ sizes: *) columns: 5 *) batch: 7 - Tensor tensor(2,3,5,7); - Tensor tensor_row_major = tensor.swap_layout(); + Tensor tensor(2,3,5,7); + Tensor tensor_row_major = tensor.swap_layout(); 2x2 image patches can be extracted and indexed using the following code: *) 2D patch: ColMajor (patch indexed by second-to-last dimension) - Tensor twod_patch; - twod_patch = tensor.extract_image_patches<2, 2>(); - // twod_patch.dimension(0) == 2 - // twod_patch.dimension(1) == 2 - // twod_patch.dimension(2) == 2 - // twod_patch.dimension(3) == 3*5 - // twod_patch.dimension(4) == 7 + + Tensor twod_patch; + twod_patch = tensor.extract_image_patches<2, 2>(); + // twod_patch.dimension(0) == 2 + // twod_patch.dimension(1) == 2 + // twod_patch.dimension(2) == 2 + // twod_patch.dimension(3) == 3*5 + // twod_patch.dimension(4) == 7 *) 2D patch: RowMajor (patch indexed by the second dimension) - Tensor twod_patch_row_major; - twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>(); - // twod_patch_row_major.dimension(0) == 7 - // twod_patch_row_major.dimension(1) == 3*5 - // twod_patch_row_major.dimension(2) == 2 - // twod_patch_row_major.dimension(3) == 2 - // twod_patch_row_major.dimension(4) == 2 + + Tensor twod_patch_row_major; + twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>(); + // twod_patch_row_major.dimension(0) == 7 + // twod_patch_row_major.dimension(1) == 3*5 + // twod_patch_row_major.dimension(2) == 2 + // twod_patch_row_major.dimension(3) == 2 + // twod_patch_row_major.dimension(4) == 2 ## Special Operations @@ -1737,11 +1797,9 @@ TODO ## Representation of scalar values -Scalar values are often represented by tensors of size 1 and rank 1. It would be -more logical and user friendly to use tensors of rank 0 instead. For example -Tensor::maximum() currently returns a Tensor. Similarly, the inner -product of 2 1d tensors (through contractions) returns a 1d tensor. In the -future these operations might be updated to return 0d tensors instead. +Scalar values are often represented by tensors of size 1 and rank 0.For example +Tensor::maximum() currently returns a Tensor. Similarly, the inner +product of 2 1d tensors (through contractions) returns a 0d tensor. ## Limitations diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 1940a9692..8cac2bb12 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -23,12 +23,12 @@ namespace Eigen { * The %Tensor class encompasses only dynamic-size objects so far. * * The first two template parameters are required: - * \tparam Scalar_ \anchor tensor_tparam_scalar Numeric type, e.g. float, double, int or std::complex. + * \tparam Scalar_ Numeric type, e.g. float, double, int or `std::complex`. * User defined scalar types are supported as well (see \ref user_defined_scalars "here"). * \tparam NumIndices_ Number of indices (i.e. rank of the tensor) * * The remaining template parameters are optional -- in most cases you don't have to worry about them. - * \tparam Options_ \anchor tensor_tparam_options A combination of either \b #RowMajor or \b #ColMajor, and of either + * \tparam Options_ A combination of either \b #RowMajor or \b #ColMajor, and of either * \b #AutoAlign or \b #DontAlign. * The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required * for vectorization. It defaults to aligning tensors. Note that tensors currently do not support any operations that profit from vectorization. @@ -42,13 +42,13 @@ namespace Eigen { * \endcode * * This class can be extended with the help of the plugin mechanism described on the page - * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN. + * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN. * * Some notes: * *
*
Relation to other parts of Eigen:
- *
The midterm developement goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that + *
The midterm development goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that * taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code * by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the %Tensor * class does not provide any of these features and is only available as a stand-alone class that just allows for @@ -112,7 +112,7 @@ class Tensor : public TensorBase - EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -388,6 +388,7 @@ class Tensor : public TensorBase(assign, DefaultDevice()).dimensions()); internal::TensorExecutor::run(assign, DefaultDevice()); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(const TensorBase& other) @@ -398,6 +399,20 @@ class Tensor : public TensorBase::run(assign, DefaultDevice()); } + #if EIGEN_HAS_RVALUE_REFERENCES + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(Self&& other) + : m_storage(std::move(other.m_storage)) + { + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor& operator=(Self&& other) + { + m_storage = std::move(other.m_storage); + return *this; + } + #endif + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) { @@ -462,6 +477,18 @@ class Tensor : public TensorBase + EIGEN_DEVICE_FUNC + void resize(const Eigen::IndexList& dimensions) { + array dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = static_cast(dimensions[i]); + } + resize(dims); + } +#endif + /** Custom Dimension */ #ifdef EIGEN_HAS_SFINAE template > : public traits template struct eval, Eigen::Dense> { - typedef const TensorIndexTupleOp& type; + typedef const TensorIndexTupleOpEIGEN_DEVICE_REF type; }; template @@ -82,28 +82,35 @@ struct TensorEvaluator, Device> typedef typename TensorEvaluator::Dimensions Dimensions; static const int NumDims = internal::array_size::value; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -117,7 +124,13 @@ struct TensorEvaluator, Device> return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: TensorEvaluator m_impl; @@ -147,7 +160,7 @@ struct traits > : public traits struct eval, Eigen::Dense> { - typedef const TensorTupleReducerOp& type; + typedef const TensorTupleReducerOpEIGEN_DEVICE_REF type; }; template @@ -172,7 +185,7 @@ class TensorTupleReducerOp : public TensorBase, Devi typedef typename TensorEvaluator , Device>::Dimensions InputDimensions; static const int NumDims = internal::array_size::value; typedef array StrideDims; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory TupleStorageMem; enum { - IsAligned = /*TensorEvaluator::IsAligned*/ false, - PacketAccess = /*TensorEvaluator::PacketAccess*/ false, - BlockAccess = false, - Layout = TensorEvaluator >, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = /*TensorEvaluator::PacketAccess*/ false, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator >, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_orig_impl(op.expression(), device), m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), - m_return_dim(op.return_dim()) { - + m_return_dim(op.return_dim()) + { gen_strides(m_orig_impl.dimensions(), m_strides); if (Layout == static_cast(ColMajor)) { const Index total_size = internal::array_prod(m_orig_impl.dimensions()); @@ -231,19 +252,22 @@ struct TensorEvaluator, Devi } else { const Index total_size = internal::array_prod(m_orig_impl.dimensions()); m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size; - } - m_stride_div = m_strides[m_return_dim]; + } + // If m_return_dim is not a valid index, returns 1 or this can crash on Windows. + m_stride_div = ((m_return_dim >= 0) && + (m_return_dim < static_cast(m_strides.size()))) + ? m_strides[m_return_dim] : 1; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -252,7 +276,13 @@ struct TensorEvaluator, Devi return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div; } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + m_orig_impl.bind(cgh); + } +#endif EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { @@ -288,7 +318,7 @@ struct TensorEvaluator, Devi protected: TensorEvaluator, Device> m_orig_impl; TensorEvaluator >, Device> m_impl; - const int m_return_dim; + const Index m_return_dim; StrideDims m_strides; Index m_stride_mod; Index m_stride_div; diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 166be200c..e5811d63f 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -34,6 +34,7 @@ struct traits > typedef typename remove_reference::type _RhsNested; static const std::size_t NumDimensions = internal::traits::NumDimensions; static const int Layout = internal::traits::Layout; + typedef typename traits::PointerType PointerType; enum { Flags = 0 @@ -67,6 +68,8 @@ class TensorAssignOp : public TensorBase typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; + static const int NumDims = Eigen::internal::traits::NumDimensions; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} @@ -94,20 +97,41 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; - static const int PacketSize = internal::unpacket_traits::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static const int PacketSize = PacketType::size; + static const int NumDims = XprType::NumDims; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - RawAccess = TensorEvaluator::RawAccess + IsAligned = int(TensorEvaluator::IsAligned) & + int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(TensorEvaluator::PacketAccess), + BlockAccess = int(TensorEvaluator::BlockAccess) & + int(TensorEvaluator::BlockAccess), + PreferBlockAccess = int(TensorEvaluator::PreferBlockAccess) | + int(TensorEvaluator::PreferBlockAccess), + Layout = TensorEvaluator::Layout, + RawAccess = TensorEvaluator::RawAccess }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + RightTensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType& op, const Device& device) : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT( + (static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); } EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -118,7 +142,7 @@ struct TensorEvaluator, Device> return m_rightImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); m_leftImpl.evalSubExprsIfNeeded(NULL); // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non @@ -127,7 +151,19 @@ struct TensorEvaluator, Device> // by the rhs to the lhs. return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data()); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { + m_rightImpl.evalSubExprsIfNeededAsync( + m_leftImpl.data(), [done](bool need_assign) { done(need_assign); }); + }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); } @@ -136,6 +172,7 @@ struct TensorEvaluator, Device> m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; m_leftImpl.template writePacket(i, m_rightImpl.template packet(i)); @@ -163,12 +200,41 @@ struct TensorEvaluator, Device> TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); } - /// required by sycl in order to extract the accessor - const TensorEvaluator& left_impl() const { return m_leftImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& right_impl() const { return m_rightImpl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::merge( + m_leftImpl.getResourceRequirements(), + m_rightImpl.getResourceRequirements()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock( + TensorBlockDesc& desc, TensorBlockScratch& scratch) { + if (TensorEvaluator::RawAccess && + m_leftImpl.data() != NULL) { + // If destination has raw data access, we pass it as a potential + // destination for a block descriptor evaluation. + desc.template AddDestinationBuffer( + /*dst_base=*/m_leftImpl.data() + desc.offset(), + /*dst_strides=*/internal::strides(m_leftImpl.dimensions())); + } + + RightTensorBlock block = m_rightImpl.block(desc, scratch, /*root_of_expr_ast=*/true); + // If block was evaluated into a destination, there is no need to do assignment. + if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { + m_leftImpl.writeBlock(desc, block); + } + block.cleanup(); + } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_leftImpl.bind(cgh); + m_rightImpl.bind(cgh); + } +#endif - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); } private: TensorEvaluator m_leftImpl; diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 7a45a5cf4..35b6458e5 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -20,9 +20,11 @@ namespace Eigen { * \brief The tensor base class. * * This class is the common parent of the Tensor and TensorMap class, thus - * making it possible to use either class interchangably in expressions. + * making it possible to use either class interchangeably in expressions. */ - +#ifndef EIGEN_PARSED_BY_DOXYGEN +// FIXME Doxygen does not like the inheritance with different template parameters +// Since there is no doxygen documentation inside, we disable it for now template class TensorBase { @@ -133,6 +135,78 @@ class TensorBase return unaryExpr(internal::scalar_digamma_op()); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i0() const { + return unaryExpr(internal::scalar_bessel_i0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i0e() const { + return unaryExpr(internal::scalar_bessel_i0e_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i1() const { + return unaryExpr(internal::scalar_bessel_i1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i1e() const { + return unaryExpr(internal::scalar_bessel_i1e_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_j0() const { + return unaryExpr(internal::scalar_bessel_j0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_y0() const { + return unaryExpr(internal::scalar_bessel_y0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_j1() const { + return unaryExpr(internal::scalar_bessel_j1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_y1() const { + return unaryExpr(internal::scalar_bessel_y1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k0() const { + return unaryExpr(internal::scalar_bessel_k0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k0e() const { + return unaryExpr(internal::scalar_bessel_k0e_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k1() const { + return unaryExpr(internal::scalar_bessel_k1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k1e() const { + return unaryExpr(internal::scalar_bessel_k1e_op()); + } + // igamma(a = this, x = other) template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> @@ -140,6 +214,20 @@ class TensorBase return binaryExpr(other.derived(), internal::scalar_igamma_op()); } + // igamma_der_a(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igamma_der_a(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igamma_der_a_op()); + } + + // gamma_sample_der_alpha(alpha = this, sample = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + gamma_sample_der_alpha(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_gamma_sample_der_alpha_op()); + } + // igammac(a = this, x = other) template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> @@ -174,9 +262,15 @@ class TensorBase } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + ndtri() const { + return unaryExpr(internal::scalar_ndtri_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> sigmoid() const { - return unaryExpr(internal::scalar_sigmoid_op()); + return unaryExpr(internal::scalar_logistic_op()); } EIGEN_DEVICE_FUNC @@ -185,6 +279,12 @@ class TensorBase return unaryExpr(internal::scalar_exp_op()); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + expm1() const { + return unaryExpr(internal::scalar_expm1_op()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> log() const { @@ -197,6 +297,12 @@ class TensorBase return unaryExpr(internal::scalar_log1p_op()); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + log2() const { + return unaryExpr(internal::scalar_log2_op()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> abs() const { @@ -204,9 +310,17 @@ class TensorBase } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + clip(Scalar min, Scalar max) const { + return unaryExpr(internal::scalar_clamp_op(min, max)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const typename internal::conditional::IsComplex, + TensorCwiseUnaryOp, const Derived>, + Derived>::type conjugate() const { - return unaryExpr(internal::scalar_conjugate_op()); + return choose(Cond::IsComplex>(), unaryExpr(internal::scalar_conjugate_op()), derived()); } EIGEN_DEVICE_FUNC @@ -287,22 +401,27 @@ class TensorBase return unaryExpr(internal::scalar_mod_op(rhs)); } + template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > cwiseMax(Scalar threshold) const { - return cwiseMax(constant(threshold)); + return cwiseMax(constant(threshold)); } + template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > cwiseMin(Scalar threshold) const { - return cwiseMin(constant(threshold)); + return cwiseMin(constant(threshold)); } - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorConversionOp + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const typename internal::conditional::value, + Derived, + TensorConversionOp >::type cast() const { - return TensorConversionOp(derived()); + return choose(Cond::value>(), derived(), TensorConversionOp(derived())); } EIGEN_DEVICE_FUNC @@ -311,6 +430,12 @@ class TensorBase return unaryExpr(internal::scalar_round_op()); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + rint() const { + return unaryExpr(internal::scalar_rint_op()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> ceil() const { @@ -355,16 +480,16 @@ class TensorBase return binaryExpr(other.derived(), internal::scalar_quotient_op()); } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> cwiseMax(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_max_op()); + return binaryExpr(other.derived(), internal::scalar_max_op()); } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> cwiseMin(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_min_op()); + return binaryExpr(other.derived(), internal::scalar_min_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -479,9 +604,15 @@ class TensorBase typedef Eigen::IndexPair DimensionPair; template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorContractionOp + const TensorContractionOp contract(const OtherDerived& other, const Dimensions& dims) const { - return TensorContractionOp(derived(), other.derived(), dims); + return TensorContractionOp(derived(), other.derived(), dims); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorContractionOp + contract(const OtherDerived& other, const Dimensions& dims, const OutputKernel& output_kernel) const { + return TensorContractionOp(derived(), other.derived(), dims, output_kernel); } // Convolutions. @@ -494,8 +625,8 @@ class TensorBase // Fourier transforms template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorFFTOp - fft(const FFT& fft) const { - return TensorFFTOp(derived(), fft); + fft(const FFT& dims) const { + return TensorFFTOp(derived(), dims); } // Scan. @@ -557,51 +688,53 @@ class TensorBase return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::ProdReducer()); } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const Dims, const Derived> + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> maximum(const Dims& dims) const { - return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MaxReducer()); + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MaxReducer()); } - const TensorReductionOp, const DimensionList, const Derived> + template + const TensorReductionOp, const DimensionList, const Derived> maximum() const { DimensionList in_dims; - return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MaxReducer()); + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MaxReducer()); } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const Dims, const Derived> + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> minimum(const Dims& dims) const { - return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MinReducer()); + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MinReducer()); } - const TensorReductionOp, const DimensionList, const Derived> + template + const TensorReductionOp, const DimensionList, const Derived> minimum() const { DimensionList in_dims; - return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MinReducer()); + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MinReducer()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp > + const TensorReductionOp::value, Derived, TensorConversionOp >::type > all(const Dims& dims) const { return cast().reduce(dims, internal::AndReducer()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const TensorConversionOp > + const TensorReductionOp, const typename internal::conditional::value, Derived, TensorConversionOp >::type > all() const { DimensionList in_dims; return cast().reduce(in_dims, internal::AndReducer()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp > + const TensorReductionOp::value, Derived, TensorConversionOp >::type > any(const Dims& dims) const { return cast().reduce(dims, internal::OrReducer()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const TensorConversionOp > + const TensorReductionOp, const typename internal::conditional::value, Derived, TensorConversionOp >::type > any() const { DimensionList in_dims; return cast().reduce(in_dims, internal::OrReducer()); @@ -613,7 +746,7 @@ class TensorBase const array, const Derived> argmax() const { array in_dims; - for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; + for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; return TensorTupleReducerOp< internal::ArgMaxTupleReducer >, const array, @@ -626,7 +759,7 @@ class TensorBase const array, const Derived> argmin() const { array in_dims; - for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; + for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; return TensorTupleReducerOp< internal::ArgMinTupleReducer >, const array, @@ -637,7 +770,7 @@ class TensorBase const TensorTupleReducerOp< internal::ArgMaxTupleReducer >, const array, const Derived> - argmax(const int return_dim) const { + argmax(const Index return_dim) const { array in_dims; in_dims[0] = return_dim; return TensorTupleReducerOp< @@ -650,7 +783,7 @@ class TensorBase const TensorTupleReducerOp< internal::ArgMinTupleReducer >, const array, const Derived> - argmin(const int return_dim) const { + argmin(const Index return_dim) const { array in_dims; in_dims[0] = return_dim; return TensorTupleReducerOp< @@ -665,10 +798,22 @@ class TensorBase return TensorReductionOp(derived(), dims, reducer); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTraceOp + trace(const Dims& dims) const { + return TensorTraceOp(derived(), dims); + } + + const TensorTraceOp, const Derived> + trace() const { + DimensionList in_dims; + return TensorTraceOp, const Derived>(derived(), in_dims); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorBroadcastingOp - broadcast(const Broadcast& broadcast) const { - return TensorBroadcastingOp(derived(), broadcast); + broadcast(const Broadcast& bcast) const { + return TensorBroadcastingOp(derived(), bcast); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -776,8 +921,8 @@ class TensorBase } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorShufflingOp - shuffle(const Shuffle& shuffle) const { - return TensorShufflingOp(derived(), shuffle); + shuffle(const Shuffle& shfl) const { + return TensorShufflingOp(derived(), shfl); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorStridingOp @@ -818,7 +963,8 @@ class TensorBase protected: template friend class Tensor; template friend class TensorFixedSize; - template friend class TensorBase; + // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0 + template friend class Eigen::TensorBase; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } }; @@ -826,6 +972,7 @@ class TensorBase template::value> class TensorBase : public TensorBase { public: + typedef TensorBase Base; typedef internal::traits DerivedTraits; typedef typename DerivedTraits::Scalar Scalar; typedef typename DerivedTraits::Index Index; @@ -834,7 +981,8 @@ class TensorBase : public TensorBase { template friend class Tensor; template friend class TensorFixedSize; - template friend class TensorBase; + // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0 + template friend class Eigen::TensorBase; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& setZero() { @@ -972,13 +1120,13 @@ class TensorBase : public TensorBase { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorShufflingOp - shuffle(const Shuffle& shuffle) const { - return TensorShufflingOp(derived(), shuffle); + shuffle(const Shuffle& shfl) const { + return TensorShufflingOp(derived(), shfl); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp - shuffle(const Shuffle& shuffle) { - return TensorShufflingOp(derived(), shuffle); + shuffle(const Shuffle& shfl) { + return TensorShufflingOp(derived(), shfl); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -994,17 +1142,35 @@ class TensorBase : public TensorBase { // Select the device on which to evaluate the expression. template - TensorDevice device(const DeviceType& device) { - return TensorDevice(device, derived()); + TensorDevice device(const DeviceType& dev) { + return TensorDevice(dev, derived()); + } + + // Select the async device on which to evaluate the expression. + template + TensorAsyncDevice device(const DeviceType& dev, DoneCallback done) { + return TensorAsyncDevice(dev, derived(), std::move(done)); } protected: + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TensorBase) + EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorBase) + + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(derived(), other.derived()); + internal::TensorExecutor::run(assign, DefaultDevice()); + return derived(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& derived() { return *static_cast(this); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } }; - +#endif // EIGEN_PARSED_BY_DOXYGEN } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h new file mode 100644 index 000000000..1e55d12c4 --- /dev/null +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -0,0 +1,1559 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H +#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H + +namespace Eigen { +namespace internal { + +// -------------------------------------------------------------------------- // +// Forward declarations for templates defined below. +template +class TensorBlockIO; + +// -------------------------------------------------------------------------- // +// Helper function to compute strides for densely stored buffer of given +// dimensions. + +// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use +// this function instead everywhere. +template +EIGEN_ALWAYS_INLINE DSizes strides( + const DSizes& dimensions) { + DSizes strides; + if (NumDims == 0) return strides; + + // TODO(ezhulenev): Use templates to unroll this loop (similar to + // h_array_reduce in CXX11meta.h)? Benchmark it. + if (static_cast(Layout) == static_cast(ColMajor)) { + strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + strides[i] = strides[i - 1] * dimensions[i - 1]; + } + } else { + strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dimensions[i + 1]; + } + } + + return strides; +} + +template +EIGEN_ALWAYS_INLINE DSizes strides( + const Eigen::array& dimensions) { + return strides(DSizes(dimensions)); +} + +template +EIGEN_STRONG_INLINE DSizes strides( + const Sizes& sizes) { + return strides(DSizes(sizes)); +} + +// -------------------------------------------------------------------------- // + +// Tensor block shape type defines what are the shape preference for the blocks +// extracted from the larger tensor. +// +// Example: blocks of 100 elements from the large 100x100 tensor: +// - tensor: 100x100 +// - target_block_size: 100 +// +// TensorBlockShapeType: +// - kUniformAllDims: 100 blocks of size 10x10 +// - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column +// or row major layout) +enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims }; + +struct TensorBlockResourceRequirements { + TensorBlockShapeType shape_type; // target block shape + size_t size; // target block size + TensorOpCost cost_per_coeff; // cost of computing a single block element + +#ifdef EIGEN_HIPCC + // For HIPCC, we need to explicitly declare as a "device fun", the constructor + // which is implicitly invoked in the "merge" / "any" routines. else HIPCC + // errors out complaining about the lack of a matching constructor + EIGEN_DEVICE_FUNC + TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_, + TensorOpCost cost_) + : shape_type(shape_type_), size(size_), cost_per_coeff(cost_) + {} +#endif + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize( + TensorBlockShapeType shape_type, size_t size_in_bytes, + TensorOpCost cost) { + const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar)); + return {shape_type, size, cost}; + } + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize( + TensorBlockShapeType shape_type, size_t size_in_bytes) { + // This default cost per coefficient is valid for most materialized tensor + // block evaluation implementations, because they typically just read + // coefficients from the underlying tensor storage, and write to the tensor + // block buffer (scratch or destination memory, reads and writes have linear + // access pattern). We ignore the fixed cost of block evaluation, because in + // practice it should negligible. + // + // Lazy block evaluation adds the cost of calling a functor for each + // coefficient. + // + // All non-trivial block evaluation implementations must provide their own + // cost approximation (e.g. shuffling inner dimension has a much higher cost + // because it reads memory randomly, although the total number of moved + // bytes is the same). + return withShapeAndSize(shape_type, size_in_bytes, + {/*bytes_loaded=*/sizeof(Scalar), + /*bytes_stored=*/sizeof(Scalar), + /*compute_cycles=*/0}); + } + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed( + size_t size_in_bytes) { + return withShapeAndSize(TensorBlockShapeType::kSkewedInnerDims, + size_in_bytes); + } + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform( + size_t size_in_bytes) { + return withShapeAndSize(TensorBlockShapeType::kUniformAllDims, + size_in_bytes); + } + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE TensorBlockResourceRequirements + merge(const TensorBlockResourceRequirements& lhs, + const TensorBlockResourceRequirements& rhs) { + return {merge(lhs.shape_type, rhs.shape_type), // shape_type + merge(lhs.size, rhs.size), // size + merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff + } + + EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff( + TensorOpCost cost) { + cost_per_coeff += cost; + return *this; + } + + // This is a resource requirement that should be returned from expressions + // that do not have any block evaluation preference (e.g. default tensor + // expression with raw buffer access). + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() { + return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}}; + } + + private: + using Requirements = TensorBlockResourceRequirements; + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) { + return numext::maxi(lhs_size, rhs_size); + } + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE TensorBlockShapeType + merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) { + return (lhs == TensorBlockShapeType::kSkewedInnerDims || + rhs == TensorBlockShapeType::kSkewedInnerDims) + ? TensorBlockShapeType::kSkewedInnerDims + : TensorBlockShapeType::kUniformAllDims; + } + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost, + TensorOpCost rhs_cost) { + return lhs_cost + rhs_cost; + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockDescriptor specifies a block offset within a tensor and the block +// sizes along each of the tensor dimensions. + +template +class TensorBlockDescriptor { + public: + typedef DSizes Dimensions; + + // If we evaluate a Tensor assignment, and expression on the left, already has + // a memory buffer, then we might do performance optimization, and evaluate + // the root expression directly into the final output memory. Some time it's + // possible to reuse it for materializing subexpressions inside an expression + // tree, to to avoid dynamic memory allocation. + // + // The pointer type of the underlying storage is erased, because passing + // Scalar type through all the expression evaluation layers is way too many + // templates. In practice destination buffer type should always match the + // evaluated expression scalar type. + class DestinationBuffer { + public: + enum DestinationBufferKind : int { + // The above explicit specification of "int" as the enum basetype is + // needed to get around a HIPCC link error ("the field type is not + // amp-compatible") + // which is issued for class members with the enum type. + // TODO(rocm): + // remove the "int" basetype once HIPCC has been fixed to not error out + // in the above scenario. + + // Destination buffer is not defined (`m_data` == nullptr). + kEmpty, + + // Tensor block defined by an owning tensor block descriptor can fit + // contiguously into the destination buffer. In this case it's safe to + // materialize tensor block in the destination buffer, wrap it in a + // TensorMap, and use to build Eigen expression on top of it. + kContiguous, + + // Destination buffer strides do not match strides of the contiguously + // stored block, and it's impossible to define a TensorMap over this + // buffer. However if we are evaluating a root of an expression tree, we + // still can materialize an output into this destination, because we can + // guarantee that no one will ever access it through block API. + // + // In theory it is possible to build valid TensorStriding + // expression on top of this destination buffer, however it has + // inefficient coeff/packet access, and defeats the purpose of fast block + // evaluation API. + kStrided + }; + + template + Scalar* data() const { + eigen_assert(m_data_type_size == sizeof(Scalar)); + return static_cast(m_data); + } + + const Dimensions& strides() const { return m_strides; } + const DestinationBufferKind& kind() const { return m_kind; } + + private: + friend class TensorBlockDescriptor; + + DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {} + + template + DestinationBuffer(Scalar* data, const Dimensions& strides, + DestinationBufferKind kind) + : m_data(static_cast(data)), + m_data_type_size(sizeof(Scalar)), + m_strides(strides), + m_kind(kind) {} + + template + static DestinationBuffer make(const TensorBlockDescriptor& desc, + Scalar* data, const Dimensions& strides) { + return DestinationBuffer(data, strides, kind(desc, strides)); + } + + template + static DestinationBufferKind kind(const TensorBlockDescriptor& desc, + const Dimensions& strides) { + const Dimensions& desc_dims = desc.dimensions(); + const Dimensions& desc_strides = internal::strides(desc_dims); + for (int i = 0; i < NumDims; ++i) { + if (desc_dims[i] == 1) continue; + if (desc_strides[i] != strides[i]) return kStrided; + } + return kContiguous; + } + + // Storage pointer is type erased, to reduce template bloat, but we still + // keep the size of the underlying element type for error checking. + void* m_data; + size_t m_data_type_size; + + // Destination buffer dimensions always match the dimensions of a tensor + // block descriptor it belongs to, however strides might be different. + Dimensions m_strides; + + DestinationBufferKind m_kind; + }; + + TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions, + const DestinationBuffer& destination) + : m_offset(offset), + m_dimensions(dimensions), + m_destination(destination) {} + + TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions) + : m_offset(offset), + m_dimensions(dimensions), + m_destination(DestinationBuffer()) {} + + IndexType offset() const { return m_offset; } + const Dimensions& dimensions() const { return m_dimensions; } + IndexType dimension(int index) const { return m_dimensions[index]; } + IndexType size() const { return array_prod(m_dimensions); } + + const DestinationBuffer& destination() const { return m_destination; } + + template + void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) { + eigen_assert(dst_base != NULL); + m_destination = + DestinationBuffer::template make(*this, dst_base, dst_strides); + } + + template + void AddDestinationBuffer( + Scalar* dst_base, + const DSizes& dst_strides) { + // DSizes constructor will do index type promotion if it's safe. + AddDestinationBuffer(dst_base, Dimensions(dst_strides)); + } + + TensorBlockDescriptor& DropDestinationBuffer() { + m_destination.m_data = NULL; + m_destination.m_kind = DestinationBuffer::kEmpty; + return *this; + } + + bool HasDestinationBuffer() const { + return m_destination.kind() != DestinationBuffer::kEmpty; + } + + // Returns a copy of `*this` with updated offset. + TensorBlockDescriptor WithOffset(IndexType offset) const { + return TensorBlockDescriptor(offset, m_dimensions, m_destination); + } + + private: + // Offset and dimensions are immutable after construction. Block descriptor + // can only be mutated by adding or dropping destination. + const IndexType m_offset; + const Dimensions m_dimensions; + DestinationBuffer m_destination; +}; + +// -------------------------------------------------------------------------- // +// TensorBlockMapper is responsible for iterating over the blocks of a tensor. + +template +class TensorBlockMapper { + typedef TensorBlockDescriptor BlockDescriptor; + + public: + typedef DSizes Dimensions; + + TensorBlockMapper() = default; + TensorBlockMapper(const DSizes& dimensions, + const TensorBlockResourceRequirements& requirements) + : m_tensor_dimensions(dimensions), m_requirements(requirements) { + // Compute block dimensions and the total number of blocks. + InitializeBlockDimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const { + return m_total_block_count; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const { + return m_block_dimensions.TotalSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes& + blockDimensions() const { + return m_block_dimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor + blockDescriptor(IndexType block_index) const { + static const bool isColMajor = Layout == static_cast(ColMajor); + + IndexType offset = 0; + DSizes dimensions; + + if (NumDims == 0) return BlockDescriptor(offset, dimensions); + + // Iterate outer -> inner dimensions. + for (int i = NumDims - 1; i >= 0; --i) { + const int dim = isColMajor ? i : NumDims - i - 1; + + const IndexType idx = block_index / m_block_strides[dim]; + block_index -= idx * m_block_strides[dim]; + + const IndexType coord = idx * m_block_dimensions[dim]; + dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord, + m_block_dimensions[dim]); + offset += coord * m_tensor_strides[dim]; + } + + return {offset, dimensions}; + } + + private: + void InitializeBlockDimensions() { + // Requested block shape and size. + const TensorBlockShapeType shape_type = m_requirements.shape_type; + IndexType target_block_size = + numext::maxi(1, static_cast(m_requirements.size)); + + IndexType tensor_size = m_tensor_dimensions.TotalSize(); + + // Corner case: one of the dimensions is zero. Logic below is too complex + // to handle this case on a general basis, just use unit block size. + // Note: we must not yield blocks with zero dimensions (recipe for + // overflows/underflows, divisions by zero and NaNs later). + if (tensor_size == 0) { + for (int i = 0; i < NumDims; ++i) { + m_block_dimensions[i] = 1; + } + m_total_block_count = 0; + return; + } + + // If tensor fits into a target block size, evaluate it as a single block. + if (tensor_size <= target_block_size) { + m_block_dimensions = m_tensor_dimensions; + m_total_block_count = 1; + // The only valid block index is `0`, and in this case we do not need + // to compute real strides for tensor or blocks (see blockDescriptor). + for (int i = 0; i < NumDims; ++i) { + m_tensor_strides[i] = 0; + m_block_strides[i] = 1; + } + return; + } + + static const bool isColMajor = Layout == static_cast(ColMajor); + + // Block shape skewed towards inner dimension. + if (shape_type == TensorBlockShapeType::kSkewedInnerDims) { + IndexType coeff_to_allocate = target_block_size; + + for (int i = 0; i < NumDims; ++i) { + const int dim = isColMajor ? i : NumDims - i - 1; + m_block_dimensions[dim] = + numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]); + coeff_to_allocate = divup( + coeff_to_allocate, + numext::maxi(static_cast(1), m_block_dimensions[dim])); + } + eigen_assert(coeff_to_allocate == 1); + + } else if (shape_type == TensorBlockShapeType::kUniformAllDims) { + // Tensor will not fit within 'target_block_size' budget: calculate tensor + // block dimension sizes based on "square" dimension size target. + const IndexType dim_size_target = convert_index( + std::pow(static_cast(target_block_size), + 1.0f / static_cast(m_block_dimensions.rank()))); + + for (int i = 0; i < NumDims; ++i) { + // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it + // a multiple of the packet size. Note that reducing + // 'block_dim_size' in this manner can increase the number of + // blocks, and so will amplify any per-block overhead. + m_block_dimensions[i] = + numext::mini(dim_size_target, m_tensor_dimensions[i]); + } + + // Add any un-allocated coefficients to inner dimension(s). + IndexType total_size = m_block_dimensions.TotalSize(); + for (int i = 0; i < NumDims; ++i) { + const int dim = isColMajor ? i : NumDims - i - 1; + + if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) { + const IndexType total_size_other_dims = + total_size / m_block_dimensions[dim]; + const IndexType alloc_avail = + divup(target_block_size, total_size_other_dims); + if (alloc_avail == m_block_dimensions[dim]) { + // Insufficient excess coefficients to allocate. + break; + } + m_block_dimensions[dim] = + numext::mini(m_tensor_dimensions[dim], alloc_avail); + total_size = total_size_other_dims * m_block_dimensions[dim]; + } + } + + } else { + eigen_assert(false); // unknown block shape + } + + eigen_assert(m_block_dimensions.TotalSize() >= + numext::mini(target_block_size, + m_tensor_dimensions.TotalSize())); + + // Calculate block counts by dimension and total block count. + DSizes block_count; + for (int i = 0; i < NumDims; ++i) { + block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]); + } + m_total_block_count = array_prod(block_count); + + // Calculate block strides (used for enumerating blocks). + m_tensor_strides = strides(m_tensor_dimensions); + m_block_strides = strides(block_count); + } + + DSizes m_tensor_dimensions; + TensorBlockResourceRequirements m_requirements; + + DSizes m_block_dimensions; + IndexType m_total_block_count; + + DSizes m_tensor_strides; + DSizes m_block_strides; +}; + +// -------------------------------------------------------------------------- // +// TensorBlockScratchAllocator is responsible for allocating temporary buffers +// for block evaluation (output or input block materialization). Given that +// Eigen expression traversal order is deterministic, all temporary allocations +// are happening in the same order, and usually have exactly the same size. +// Scratch allocator keeps a trace of all dynamic allocations, and after the +// first block evaluation is completed, we should be able to reuse all the +// temporary buffers for the next block evaluation. + +template +class TensorBlockScratchAllocator { + public: + explicit TensorBlockScratchAllocator(const Device& device) + : m_device(device), m_allocation_index(0) {} + + ~TensorBlockScratchAllocator() { + for (size_t i = 0; i < m_allocations.size(); ++i) { + m_device.deallocate(m_allocations[i].ptr); + } + } + + void* allocate(size_t size) { + // TODO(ezhulenev): Remove when replaced with inlined vector. + if (m_allocations.capacity() == 0) m_allocations.reserve(8); + + // Check if we already have an existing allocation att current index. + const int num_allocations = static_cast(m_allocations.size()); + const bool has_allocation = m_allocation_index < num_allocations; + + // Allocation index can't be larger than the number of allocations. + eigen_assert(m_allocation_index <= num_allocations); + + // If we have existing allocation, and its size is larger or equal to + // requested size, we do nothing. + + // If current allocation can't fit requested size, we deallocate it, and + // replace with a larger allocation. + if (has_allocation && m_allocations[m_allocation_index].size < size) { + m_device.deallocate(m_allocations[m_allocation_index].ptr); + m_allocations[m_allocation_index].ptr = m_device.allocate(size); + m_allocations[m_allocation_index].size = size; + } + + // Make a new allocation if we don't have and existing one. + if (!has_allocation) { + Allocation allocation; + allocation.ptr = m_device.allocate(size); + allocation.size = size; + m_allocations.push_back(allocation); + } + + eigen_assert(m_allocations[m_allocation_index].ptr != NULL); + eigen_assert(m_allocations[m_allocation_index].size >= size); + + return m_allocations[m_allocation_index++].ptr; + } + + void reset() { m_allocation_index = 0; } + + private: + struct Allocation { + void* ptr; + size_t size; + }; + + const Device& m_device; + int m_allocation_index; + // TODO(ezhulenev): This should be an inlined vector. + std::vector m_allocations; +}; + +// -------------------------------------------------------------------------- // +// TensorBlockKind represents all possible block kinds, that can be produced by +// TensorEvaluator::evalBlock function. +enum TensorBlockKind { + // Tensor block that is a lazy expression that must be assigned to a + // destination using TensorBlockAssign. + kExpr, + + // Tensor block that is a view into a memory buffer owned by an underlying + // Tensor expression (e.g. it can be a view into a Tensor buffer). + kView, + + // Tensor block that was materialized in a scratch memory buffer, allocated + // with TensorBlockScratchAllocator. This block must be copied to a + // destination, similar to a block of `kExpr` type. + kMaterializedInScratch, + + // Tensor block that was materialized directly into the final output memory + // buffer. For example if the left side of an assignment is a Tensor, we can + // directly materialize the block in the destination memory. + // + // If strides in the output buffer do not match tensor block strides, the + // Tensor expression will be invalid, and should not be used by + // TensorBlockAssign or for constructing another block expression. + kMaterializedInOutput +}; + +// -------------------------------------------------------------------------- // +// TensorBlockNotImplemented should be used to defined TensorBlock typedef in +// TensorEvaluators that do not support block evaluation. + +class TensorBlockNotImplemented { + public: + typedef void XprType; +}; + +// -------------------------------------------------------------------------- // +// XprScalar extracts Scalar type from the Eigen expressions (if expression type +// is not void). It's required to be able to define lazy block expression for +// argument types, that do not support block evaluation. + +template +struct XprScalar { + typedef typename XprType::Scalar type; +}; +template <> +struct XprScalar { + typedef void type; +}; + +// -------------------------------------------------------------------------- // +// TensorMaterializedBlock is a fully evaluated block of the original tensor, +// and XprType is just a TensorMap over the data. This block type is typically +// used to materialize blocks of tensor expressions, that can't be efficiently +// represented as lazy Tensor expressions with fast coeff/packet operations, +// e.g. we materialize all broadcasts into evaluated blocks. +// +// TensorMaterializedBlock does not own its memory buffer, it's either a memory +// buffer that backs the original expression (e.g. block is just a view into a +// Tensor), or a memory buffer allocated with scratch allocator, and in this +// case the scratch allocator will deallocate it at the end of block based +// expression execution. +// +// If the block was evaluated directly into the output buffer, and strides in +// the output buffer do not match block strides, the TensorMap expression will +// be invalid, and should never be used in block assignment or any other tensor +// expression. + +template +class TensorMaterializedBlock { + public: + typedef DSizes Dimensions; + typedef TensorMap > XprType; + + TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, + const Dimensions& dimensions, bool valid_expr = true) + : m_kind(kind), + m_data(data), + m_dimensions(dimensions), + m_expr(m_data, m_dimensions), + m_valid_expr(valid_expr) { + eigen_assert(m_kind == internal::TensorBlockKind::kView || + m_kind == internal::TensorBlockKind::kMaterializedInScratch || + m_kind == internal::TensorBlockKind::kMaterializedInOutput); + } + + TensorBlockKind kind() const { return m_kind; } + // NOTE(ezhulenev): Returning XprType by value like in other block types + // causes asan failures. The theory is that XprType::Nested doesn't work + // properly for TensorMap. + const XprType& expr() const { + eigen_assert(m_valid_expr); + return m_expr; + } + const Scalar* data() const { return m_data; } + void cleanup() {} + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + + // TensorMaterializedBlock can be backed by different types of storage: + // + // (1) Contiguous block of memory allocated with scratch allocator. + // (2) Contiguous block of memory reused from tensor block descriptor + // destination buffer. + // (3) Strided block of memory reused from tensor block descriptor + // destination buffer. + // + class Storage { + public: + Scalar* data() const { return m_data; } + const Dimensions& dimensions() const { return m_dimensions; } + const Dimensions& strides() const { return m_strides; } + + TensorMaterializedBlock AsTensorMaterializedBlock() const { + return TensorMaterializedBlock( + m_materialized_in_output + ? internal::TensorBlockKind::kMaterializedInOutput + : internal::TensorBlockKind::kMaterializedInScratch, + m_data, m_dimensions, !m_strided_storage); + } + + private: + friend class TensorMaterializedBlock; + + Storage(Scalar* data, const Dimensions& dimensions, + const Dimensions& strides, bool materialized_in_output, + bool strided_storage) + : m_data(data), + m_dimensions(dimensions), + m_strides(strides), + m_materialized_in_output(materialized_in_output), + m_strided_storage(strided_storage) {} + + Scalar* m_data; + Dimensions m_dimensions; + Dimensions m_strides; + bool m_materialized_in_output; + bool m_strided_storage; + }; + + // Creates a storage for materialized block either from the block descriptor + // destination buffer, or allocates a new buffer with scratch allocator. + template + EIGEN_STRONG_INLINE static Storage prepareStorage( + TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool allow_strided_storage = false) { + // Try to reuse destination as an output block buffer. + typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer; + + if (desc.destination().kind() == DestinationBuffer::kContiguous) { + Scalar* buffer = desc.destination().template data(); + desc.DropDestinationBuffer(); + return Storage(buffer, desc.dimensions(), + internal::strides(desc.dimensions()), + /*materialized_in_output=*/true, + /*strided_storage=*/false); + + } else if (desc.destination().kind() == DestinationBuffer::kStrided && + allow_strided_storage) { + Scalar* buffer = desc.destination().template data(); + desc.DropDestinationBuffer(); + return Storage(buffer, desc.dimensions(), desc.destination().strides(), + /*materialized_in_output=*/true, /*strided_storage=*/true); + + } else { + void* mem = scratch.allocate(desc.size() * sizeof(Scalar)); + return Storage(static_cast(mem), desc.dimensions(), + internal::strides(desc.dimensions()), + /*materialized_in_output=*/false, + /*strided_storage=*/false); + } + } + + // Creates a materialized block for the given descriptor from a memory buffer. + template + EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize( + const Scalar* data, const DataDimensions& data_dims, + TensorBlockDesc& desc, TensorBlockScratch& scratch) { + eigen_assert(array_size::value == desc.dimensions().size()); + + // If a tensor block dimensions covers a contiguous block of the underlying + // memory, we can skip block buffer memory allocation, and construct a block + // from existing `data` memory buffer. + // + // Example: (RowMajor layout) + // data_dims: [11, 12, 13, 14] + // desc.dimensions(): [1, 1, 3, 14] + // + // In this case we can construct a TensorBlock starting at + // `data + desc.offset()`, with a `desc.dimensions()` block sizes. + static const bool is_col_major = Layout == ColMajor; + + // Find out how many inner dimensions have a matching size. + int num_matching_inner_dims = 0; + for (int i = 0; i < NumDims; ++i) { + int dim = is_col_major ? i : NumDims - i - 1; + if (data_dims[dim] != desc.dimensions()[dim]) break; + ++num_matching_inner_dims; + } + + // All the outer dimensions must be of size `1`, except a single dimension + // before the matching inner dimension (`3` in the example above). + bool can_use_direct_access = true; + for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) { + int dim = is_col_major ? i : NumDims - i - 1; + if (desc.dimension(dim) != 1) { + can_use_direct_access = false; + break; + } + } + + if (can_use_direct_access) { + const Scalar* block_start = data + desc.offset(); + return TensorMaterializedBlock(internal::TensorBlockKind::kView, + block_start, desc.dimensions()); + + } else { + // Reuse destination buffer or allocate new buffer with scratch allocator. + const Storage storage = prepareStorage(desc, scratch); + + typedef internal::TensorBlockIO + TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src TensorBlockIOSrc; + + TensorBlockIOSrc src(internal::strides(Dimensions(data_dims)), + data, desc.offset()); + TensorBlockIODst dst(storage.dimensions(), storage.strides(), + storage.data()); + + TensorBlockIO::Copy(dst, src); + return storage.AsTensorMaterializedBlock(); + } + } + + private: + TensorBlockKind m_kind; + const Scalar* m_data; + Dimensions m_dimensions; + XprType m_expr; + bool m_valid_expr; +}; + +// -------------------------------------------------------------------------- // +// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp +// functor to the blocks produced by the underlying Tensor expression. + +template +class TensorCwiseUnaryBlock { + static const bool NoArgBlockAccess = + internal::is_void::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + TensorCwiseUnaryOp >:: + type XprType; + + typedef typename XprScalar::type Scalar; + + TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor) + : m_arg_block(arg_block), m_functor(functor) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + + XprType expr() const { return XprType(m_arg_block.expr(), m_functor); } + const Scalar* data() const { return NULL; } + void cleanup() { m_arg_block.cleanup(); } + + private: + ArgTensorBlock m_arg_block; + UnaryOp m_functor; +}; + +// -------------------------------------------------------------------------- // +// TensorCwiseUnaryBlock is a lazy tensor expression block that applies BinaryOp +// functor to the blocks produced by the underlying Tensor expression. + +template +class TensorCwiseBinaryBlock { + static const bool NoArgBlockAccess = + internal::is_void::value || + internal::is_void::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + TensorCwiseBinaryOp >::type + XprType; + + typedef typename XprScalar::type Scalar; + + TensorCwiseBinaryBlock(const LhsTensorBlock& left_block, + const RhsTensorBlock& right_block, + const BinaryOp& functor) + : m_left_block(left_block), + m_right_block(right_block), + m_functor(functor) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + + XprType expr() const { + return XprType(m_left_block.expr(), m_right_block.expr(), m_functor); + } + + const Scalar* data() const { return NULL; } + + void cleanup() { + m_left_block.cleanup(); + m_right_block.cleanup(); + } + + private: + LhsTensorBlock m_left_block; + RhsTensorBlock m_right_block; + BinaryOp m_functor; +}; + +// -------------------------------------------------------------------------- // +// TensorUnaryExprBlock is a lazy tensor expression block that can construct +// an arbitrary tensor expression from a block of the underlying type (this is a +// generalization of the TensorCwiseUnaryBlock for arbitrary expressions). + +template +class TensorUnaryExprBlock { + typedef typename ArgTensorBlock::XprType ArgXprType; + static const bool NoArgBlockAccess = internal::is_void::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + typename BlockFactory::template XprType::type>::type XprType; + + typedef typename XprScalar::type Scalar; + + TensorUnaryExprBlock(const ArgTensorBlock& arg_block, + const BlockFactory& factory) + : m_arg_block(arg_block), m_factory(factory) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + XprType expr() const { return m_factory.expr(m_arg_block.expr()); } + const Scalar* data() const { return NULL; } + void cleanup() { m_arg_block.cleanup(); } + + private: + ArgTensorBlock m_arg_block; + BlockFactory m_factory; +}; + +// -------------------------------------------------------------------------- // +// TensorTernaryExprBlock is a lazy tensor expression block that can construct +// an arbitrary tensor expression from three blocks of the underlying type. + +template +class TensorTernaryExprBlock { + typedef typename Arg1TensorBlock::XprType Arg1XprType; + typedef typename Arg2TensorBlock::XprType Arg2XprType; + typedef typename Arg3TensorBlock::XprType Arg3XprType; + + static const bool NoArgBlockAccess = internal::is_void::value || + internal::is_void::value || + internal::is_void::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + typename BlockFactory::template XprType::type>::type XprType; + + typedef typename XprScalar::type Scalar; + + TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, + const Arg2TensorBlock& arg2_block, + const Arg3TensorBlock& arg3_block, + const BlockFactory& factory) + : m_arg1_block(arg1_block), + m_arg2_block(arg2_block), + m_arg3_block(arg3_block), + m_factory(factory) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + XprType expr() const { + return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), + m_arg3_block.expr()); + } + const Scalar* data() const { return NULL; } + void cleanup() { + m_arg1_block.cleanup(); + m_arg2_block.cleanup(); + m_arg3_block.cleanup(); + } + + private: + Arg1TensorBlock m_arg1_block; + Arg2TensorBlock m_arg2_block; + Arg3TensorBlock m_arg3_block; + BlockFactory m_factory; +}; + +// -------------------------------------------------------------------------- // +// StridedLinearBufferCopy provides a method to copy data between two linear +// buffers with different strides, with optimized paths for scatter/gather. + +template +class StridedLinearBufferCopy { + typedef typename packet_traits::type Packet; + enum { + Vectorizable = packet_traits::Vectorizable, + PacketSize = packet_traits::size + }; + + public: + // Specifying linear copy kind statically gives ~30% speedup for small sizes. + enum class Kind { + Linear = 0, // src_stride == 1 && dst_stride == 1 + Scatter = 1, // src_stride == 1 && dst_stride != 1 + FillLinear = 2, // src_stride == 0 && dst_stride == 1 + FillScatter = 3, // src_stride == 0 && dst_stride != 1 + Gather = 4, // dst_stride == 1 + Random = 5 // everything else + }; + + struct Dst { + Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {} + + IndexType offset; + IndexType stride; + Scalar* data; + }; + + struct Src { + Src(IndexType o, IndexType s, const Scalar* d) + : offset(o), stride(s), data(d) {} + + IndexType offset; + IndexType stride; + const Scalar* data; + }; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst, + const Src& src, + const size_t count) { + Run(count, dst.offset, dst.stride, dst.data, src.offset, src.stride, + src.data); + } + + private: + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const IndexType count, const IndexType dst_offset, + const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data, + const IndexType src_offset, const IndexType src_stride, + const Scalar* EIGEN_RESTRICT src_data) { + const Scalar* src = &src_data[src_offset]; + Scalar* dst = &dst_data[dst_offset]; + + if (!Vectorizable) { + for (Index i = 0; i < count; ++i) { + dst[i * dst_stride] = src[i * src_stride]; + } + return; + } + + const IndexType vectorized_size = count - PacketSize; + IndexType i = 0; + + if (kind == StridedLinearBufferCopy::Kind::Linear) { + // ******************************************************************** // + // Linear copy from `src` to `dst`. + const IndexType unrolled_size = count - 4 * PacketSize; + eigen_assert(src_stride == 1 && dst_stride == 1); + for (; i <= unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + Packet p = ploadu(src + i + j * PacketSize); + pstoreu(dst + i + j * PacketSize, p); + } + } + for (; i <= vectorized_size; i += PacketSize) { + Packet p = ploadu(src + i); + pstoreu(dst + i, p); + } + for (; i < count; ++i) { + dst[i] = src[i]; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::Scatter) { + // Scatter from `src` to `dst`. + eigen_assert(src_stride == 1 && dst_stride != 1); + for (; i <= vectorized_size; i += PacketSize) { + Packet p = ploadu(src + i); + pscatter(dst + i * dst_stride, p, dst_stride); + } + for (; i < count; ++i) { + dst[i * dst_stride] = src[i]; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) { + // Fill `dst` with value at `*src`. + eigen_assert(src_stride == 0 && dst_stride == 1); + const IndexType unrolled_size = count - 4 * PacketSize; + Packet p = pload1(src); + for (; i <= unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + pstoreu(dst + i + j * PacketSize, p); + } + } + for (; i <= vectorized_size; i += PacketSize) { + pstoreu(dst + i, p); + } + for (; i < count; ++i) { + dst[i] = *src; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) { + // Scatter `*src` into `dst`. + eigen_assert(src_stride == 0 && dst_stride != 1); + Packet p = pload1(src); + for (; i <= vectorized_size; i += PacketSize) { + pscatter(dst + i * dst_stride, p, dst_stride); + } + for (; i < count; ++i) { + dst[i * dst_stride] = *src; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::Gather) { + // Gather from `src` into `dst`. + eigen_assert(dst_stride == 1); + for (; i <= vectorized_size; i += PacketSize) { + Packet p = pgather(src + i * src_stride, src_stride); + pstoreu(dst + i, p); + } + for (; i < count; ++i) { + dst[i] = src[i * src_stride]; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::Random) { + // Random. + for (; i < count; ++i) { + dst[i * dst_stride] = src[i * src_stride]; + } + } else { + eigen_assert(false); + } + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block. +// It's possible to specify src->dst dimension mapping for the copy operation. +// Dimensions of `dst` specify how many elements have to be copied, for the +// `src` we need to know only stride to navigate through source memory buffer. + +template +class TensorBlockIO { + static const bool IsColMajor = (Layout == ColMajor); + + typedef StridedLinearBufferCopy LinCopy; + + public: + typedef DSizes Dimensions; + typedef DSizes DimensionsMap; + + struct Dst { + Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, + IndexType dst_offset = 0) + : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {} + + Dimensions dims; + Dimensions strides; + Scalar* data; + IndexType offset; + }; + + struct Src { + Src(const Dimensions& src_strides, const Scalar* src, + IndexType src_offset = 0) + : strides(src_strides), data(src), offset(src_offset) {} + + Dimensions strides; + const Scalar* data; + IndexType offset; + }; + + // Copies data to `dst` from `src`, using provided dimensions mapping: + // + // src_dimension_index = dst_to_src_dim_map[dst_dimension_index] + // + // Returns the number of copied elements. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy( + const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) { + // Copy single scalar value from `src` to `dst`. + if (NumDims == 0) { + *(dst.data + dst.offset) = *(src.data + src.offset); + return 1; + } + + // Both `dst` and `src` must have contiguous innermost dimension. We also + // accept the special case with stride '0', because it's used as a trick to + // implement broadcasting. + { + int inner_dim = IsColMajor ? 0 : NumDims - 1; + EIGEN_UNUSED_VARIABLE(inner_dim); + eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0); + eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0); + } + + // Give a shorter name to `dst_to_src_dim_map`. + const DimensionsMap& dim_map = dst_to_src_dim_map; + + // Do not squeeze reordered inner dimensions. + int num_squeezable_dims = NumSqueezableInnerDims(dim_map); + + // NOTE: We find the innermost dimension (contiguous in memory) in the dst + // block, and we write data linearly into that dimension, reading it from + // the src. If dimensions are reordered, we might end up reading data from + // the src with `stride != 1`. + // + // NOTE: Random-Read/Linear-Write can be up to ~2X faster than + // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680 + + // Find the innermost dimension in the dst whose size is not 1. This is the + // effective inner dim. + int num_size_one_inner_dims = 0; + for (int i = 0; i < num_squeezable_dims; ++i) { + const int dst_dim = IsColMajor ? i : NumDims - i - 1; + if (dst.dims[dst_dim] != 1) break; + num_size_one_inner_dims++; + } + + // If all dimensions are of size 1, just copy a scalar from `src` to `dst`. + if (num_size_one_inner_dims == NumDims) { + *(dst.data + dst.offset) = *(src.data + src.offset); + return 1; + } + + // Outermost dimension in the dst with `stride == 1` (contiguous in memory). + const int dst_stride1_dim = IsColMajor + ? num_size_one_inner_dims + : NumDims - num_size_one_inner_dims - 1; + + // Dimension in the src that corresponds to the dst innermost dimension. + const int src_dim_for_dst_stride1_dim = + NumDims == 0 ? 1 : dim_map[dst_stride1_dim]; + + // Size of the innermost dimension (length of contiguous blocks of memory). + IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim]; + + // Squeeze multiple inner dims into one if they are contiguous in `dst` and + // `src` memory, so we can do less linear copy calls. + for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) { + const int dst_dim = IsColMajor ? i : NumDims - i - 1; + const IndexType dst_stride = dst.strides[dst_dim]; + const IndexType src_stride = src.strides[dim_map[dst_dim]]; + if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) { + dst_inner_dim_size *= dst.dims[dst_dim]; + ++num_size_one_inner_dims; + } else { + break; + } + } + + // Setup strides to read data from `src` and write to `dst`. + IndexType input_offset = src.offset; + IndexType output_offset = dst.offset; + IndexType input_stride = + NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim]; + IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim]; + + const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; + array it; + + // Initialize block iterator state. Squeeze away any dimension of size 1. + int idx = 0; // currently initialized iterator state index + for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { + const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2; + if (dst.dims[dst_dim] == 1) continue; + + it[idx].size = dst.dims[dst_dim]; + it[idx].input_stride = src.strides[dim_map[dst_dim]]; + it[idx].output_stride = dst.strides[dst_dim]; + + it[idx].input_span = it[idx].input_stride * (it[idx].size - 1); + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + + idx++; + } + + // Iterate copying data from src to dst. + const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize(); + +#define COPY_INNER_DIM(KIND) \ + IndexType num_copied = 0; \ + for (num_copied = 0; num_copied < block_total_size; \ + num_copied += dst_inner_dim_size) { \ + LinCopy::template Run( \ + typename LinCopy::Dst(output_offset, output_stride, dst.data), \ + typename LinCopy::Src(input_offset, input_stride, src.data), \ + dst_inner_dim_size); \ + \ + for (int j = 0; j < idx; ++j) { \ + if (++it[j].count < it[j].size) { \ + input_offset += it[j].input_stride; \ + output_offset += it[j].output_stride; \ + break; \ + } \ + it[j].count = 0; \ + input_offset -= it[j].input_span; \ + output_offset -= it[j].output_span; \ + } \ + } \ + return num_copied; + + if (input_stride == 1 && output_stride == 1) { + COPY_INNER_DIM(LinCopy::Kind::Linear); + } else if (input_stride == 1 && output_stride != 1) { + COPY_INNER_DIM(LinCopy::Kind::Scatter); + } else if (input_stride == 0 && output_stride == 1) { + COPY_INNER_DIM(LinCopy::Kind::FillLinear); + } else if (input_stride == 0 && output_stride != 1) { + COPY_INNER_DIM(LinCopy::Kind::FillScatter); + } else if (output_stride == 1) { + COPY_INNER_DIM(LinCopy::Kind::Gather); + } else { + COPY_INNER_DIM(LinCopy::Kind::Random); + } + +#undef COPY_INNER_DIM + } + + // Copy from `src` to `dst` with an identity src->dst dimension map. Returns + // the number of copied elements. + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst, + const Src& src) { + DimensionsMap dst_to_src_map; + for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i; + return Copy(dst, src, dst_to_src_map); + } + + private: + struct BlockIteratorState { + BlockIteratorState() + : size(0), + count(0), + input_stride(0), + output_stride(0), + input_span(0), + output_span(0) {} + + IndexType size; + IndexType count; + IndexType input_stride; + IndexType output_stride; + IndexType input_span; + IndexType output_span; + }; + + // Compute how many inner dimensions it's allowed to squeeze when doing IO + // between two tensor blocks. It's safe to squeeze inner dimensions, only + // if they are not reordered. + static int NumSqueezableInnerDims(const DimensionsMap& dim_map) { + int num_squeezable_dims = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + if (dim_map[dim] != dim) break; + num_squeezable_dims++; + } + return num_squeezable_dims; + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to +// a Tensor block defined by `desc`, backed by a memory buffer at `target`. +// +// Currently there is no way to write from a Tensor expression to a block of +// memory, if dimensions are reordered. If you need to do that, you should +// materialize a Tensor block expression into a memory buffer, and then use +// TensorBlockIO to copy data between two memory buffers with a custom +// `target->src` dimension map (see definition above). +// +// Also currently the innermost dimension of `target` must have a stride '1' +// (contiguous in memory). This restriction could be lifted with a `pscatter`, +// but in practice it's never needed, and there is a similar TensorBlockIO +// workaround for that. +// +// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO +// where `src` is a tensor expression. Explore if it is possible to rewrite IO +// to use expressions instead of pointers, and after that TensorBlockAssignment +// will become an alias to IO. +template +class TensorBlockAssignment { + // We will use coeff/packet path to evaluate block expressions. + typedef TensorEvaluator + TensorBlockEvaluator; + + typedef DSizes Dimensions; + + enum { + Vectorizable = packet_traits::Vectorizable, + PacketSize = packet_traits::size + }; + + template + struct InnerDimAssign { + EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, + const Evaluator& eval, + IndexType eval_offset) { + for (IndexType i = 0; i < count; ++i) { + target[i] = eval.coeff(eval_offset + i); + } + } + }; + + template + struct InnerDimAssign { + EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, + const Evaluator& eval, + IndexType eval_offset) { + typedef typename packet_traits::type Packet; + + const IndexType unrolled_size = count - 4 * PacketSize; + const IndexType vectorized_size = count - PacketSize; + IndexType i = 0; + + for (; i <= unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + const IndexType idx = eval_offset + i + j * PacketSize; + Packet p = eval.template packet(idx); + pstoreu(target + i + j * PacketSize, p); + } + } + + for (; i <= vectorized_size; i += PacketSize) { + Packet p = eval.template packet(eval_offset + i); + pstoreu(target + i, p); + } + + for (; i < count; ++i) { + target[i] = eval.coeff(eval_offset + i); + } + } + }; + + public: + struct Target { + Target(const Dimensions& target_dims, const Dimensions& target_strides, + Scalar* target_data, IndexType target_offset = 0) + : dims(target_dims), + strides(target_strides), + data(target_data), + offset(target_offset) {} + + Dimensions dims; + Dimensions strides; + Scalar* data; + IndexType offset; + }; + + static Target target(const Dimensions& target_dims, + const Dimensions& target_strides, Scalar* target_data, + IndexType target_offset = 0) { + return Target(target_dims, target_strides, target_data, target_offset); + } + + template + static Target target( + const DSizes& target_dims, + const DSizes& target_strides, + Scalar* target_data, IndexType target_offset = 0) { + // DSizes constructor will do index type promotion if it's safe. + return Target(Dimensions(target_dims), Dimensions(target_strides), + target_data, target_offset); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Target& target, const TensorBlockExpr& expr) { + // Prepare evaluator for block expression. + DefaultDevice default_device; + TensorBlockEvaluator eval(expr, default_device); + + // Tensor block expression dimension should match destination dimensions. + eigen_assert(dimensions_match(target.dims, eval.dimensions())); + + static const int Layout = TensorBlockEvaluator::Layout; + static const bool is_col_major = Layout == ColMajor; + + // Initialize output inner dimension size based on a layout. + const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize(); + const int inner_dim_idx = is_col_major ? 0 : NumDims - 1; + IndexType output_inner_dim_size = target.dims[inner_dim_idx]; + + // Target inner dimension stride must be '1'. + eigen_assert(target.strides[inner_dim_idx] == 1); + + // Squeeze multiple inner dims into one if they are contiguous in `target`. + IndexType num_squeezed_dims = 0; + for (Index i = 1; i < NumDims; ++i) { + const Index dim = is_col_major ? i : NumDims - i - 1; + const IndexType target_stride = target.strides[dim]; + + if (output_inner_dim_size == target_stride) { + output_inner_dim_size *= target.dims[dim]; + num_squeezed_dims++; + } else { + break; + } + } + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + + int idx = 0; // currently initialized iterator state index + for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) { + const Index dim = is_col_major ? i + 1 : NumDims - i - 2; + + it[idx].count = 0; + it[idx].size = target.dims[dim]; + it[idx].output_stride = target.strides[dim]; + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + idx++; + } + + // We read block expression from the beginning, and start writing data to + // `target` at given offset. + IndexType input_offset = 0; + IndexType output_offset = target.offset; + + // Iterate copying data from `eval` to `target`. + for (IndexType i = 0; i < output_size; i += output_inner_dim_size) { + // Assign to `target` at current offset. + InnerDimAssign::Run(target.data + output_offset, + output_inner_dim_size, eval, + input_offset); + + // Move input offset forward by the number of assigned coefficients. + input_offset += output_inner_dim_size; + + // Update index. + for (int j = 0; j < idx; ++j) { + if (++it[j].count < it[j].size) { + output_offset += it[j].output_stride; + break; + } + it[j].count = 0; + output_offset -= it[j].output_span; + } + } + } + + private: + struct BlockIteratorState { + BlockIteratorState() + : count(0), size(0), output_stride(0), output_span(0) {} + + IndexType count; + IndexType size; + IndexType output_stride; + IndexType output_span; + }; +}; + +// -------------------------------------------------------------------------- // + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 4cfe300eb..a354132f6 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -31,12 +31,13 @@ struct traits > : public traits::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template struct eval, Eigen::Dense> { - typedef const TensorBroadcastingOp& type; + typedef const TensorBroadcastingOp EIGEN_DEVICE_REF type; }; template @@ -54,7 +55,7 @@ struct is_input_scalar > { static const bool value = true; }; #ifndef EIGEN_EMULATE_CXX11_META_H -template +template struct is_input_scalar > { static const bool value = (Sizes::total_size == 1); }; @@ -103,27 +104,57 @@ struct TensorEvaluator, Device> typedef typename TensorEvaluator::Dimensions InputDimensions; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; + protected: // all the non-static fields must have the same access control, otherwise the TensorEvaluator wont be standard layout; + bool isCopy, nByOne, oneByN; + public: + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = true, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_broadcast(op.broadcast()),m_impl(op.expression(), device) + typedef typename internal::remove_const::type ScalarNoConst; + + // We do block based broadcasting using a trick with 2x tensor rank and 0 + // strides. See block method implementation for details. + typedef DSizes BroadcastDimensions; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + ArgTensorBlock; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : isCopy(false), nByOne(false), oneByN(false), + m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device) { + // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar // and store the result in a scalar. Instead one should reshape the scalar into a a N-D // tensor with N >= 1 of 1 element first and then broadcast. EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); const InputDimensions& input_dims = m_impl.dimensions(); - const Broadcast& broadcast = op.broadcast(); + isCopy = true; for (int i = 0; i < NumDims; ++i) { eigen_assert(input_dims[i] > 0); - m_dimensions[i] = input_dims[i] * broadcast[i]; + m_dimensions[i] = input_dims[i] * m_broadcast[i]; + if (m_broadcast[i] != 1) { + isCopy = false; + } } if (static_cast(Layout) == static_cast(ColMajor)) { @@ -141,16 +172,58 @@ struct TensorEvaluator, Device> m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; } } + + if (input_dims[0] == 1) { + oneByN = true; + for (int i = 1; i < NumDims; ++i) { + if (m_broadcast[i] != 1) { + oneByN = false; + break; + } + } + } else if (input_dims[NumDims-1] == 1) { + nByOne = true; + for (int i = 0; i < NumDims-1; ++i) { + if (m_broadcast[i] != 1) { + nByOne = false; + break; + } + } + } + + // Handle special format like NCHW, its input shape is '[1, N..., 1]' and + // broadcast shape is '[N, 1..., N]' + if (!oneByN && !nByOne) { + if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) { + nByOne = true; + oneByN = true; + for (int i = 1; i < NumDims-1; ++i) { + if (m_broadcast[i] != 1) { + nByOne = false; + oneByN = false; + break; + } + } + } + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -161,16 +234,24 @@ struct TensorEvaluator, Device> } if (static_cast(Layout) == static_cast(ColMajor)) { - return coeffColMajor(index); + if (isCopy) { + return m_impl.coeff(index); + } else { + return coeffColMajor(index); + } } else { - return coeffRowMajor(index); + if (isCopy) { + return m_impl.coeff(index); + } else { + return coeffRowMajor(index); + } } } // TODO: attempt to speed this up. The integer divisions and modulo are slow - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const - { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const { Index inputIndex = 0; + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq(i, 1)) { @@ -195,12 +276,17 @@ struct TensorEvaluator, Device> inputIndex += (index % m_impl.dimensions()[0]); } } - return m_impl.coeff(inputIndex); + return inputIndex; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const { + return m_impl.coeff(indexColMajor(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const { Index inputIndex = 0; + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq(i, 1)) { @@ -215,17 +301,22 @@ struct TensorEvaluator, Device> } index -= idx * m_outputStrides[i]; } - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index < m_impl.dimensions()[NumDims-1]); + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims - 1]); inputIndex += index; } else { - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0); } else { - inputIndex += (index % m_impl.dimensions()[NumDims-1]); + inputIndex += (index % m_impl.dimensions()[NumDims - 1]); } } - return m_impl.coeff(inputIndex); + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const + { + return m_impl.coeff(indexRowMajor(index)); } template @@ -236,9 +327,148 @@ struct TensorEvaluator, Device> } if (static_cast(Layout) == static_cast(ColMajor)) { - return packetColMajor(index); + if (isCopy) { + #ifdef EIGEN_GPU_COMPILE_PHASE + // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing + // unaligned loads here. The reason is unclear though. + return m_impl.template packet(index); + #else + return m_impl.template packet(index); + #endif + } else if (oneByN && !nByOne) { + return packetNByOne(index); + } else if (!oneByN && nByOne) { + return packetOneByN(index); + } else if (oneByN && nByOne) { + return packetOneByNByOne(index); + } else { + return packetColMajor(index); + } } else { - return packetRowMajor(index); + if (isCopy) { + #ifdef EIGEN_GPU_COMPILE_PHASE + // See above. + return m_impl.template packet(index); + #else + return m_impl.template packet(index); + #endif + } else if (oneByN && !nByOne) { + return packetOneByN(index); + } else if (!oneByN && nByOne) { + return packetNByOne(index); + } else if (oneByN && nByOne) { + return packetOneByNByOne(index); + } else { + return packetRowMajor(index); + } + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne + (Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + Index startDim, endDim; + Index inputIndex, outputOffset, batchedIndex; + + if (static_cast(Layout) == static_cast(ColMajor)) { + startDim = NumDims - 1; + endDim = 1; + } else { + startDim = 0; + endDim = NumDims - 2; + } + + batchedIndex = index % m_outputStrides[startDim]; + inputIndex = batchedIndex / m_outputStrides[endDim]; + outputOffset = batchedIndex % m_outputStrides[endDim]; + + if (outputOffset + PacketSize <= m_outputStrides[endDim]) { + values[0] = m_impl.coeff(inputIndex); + return internal::pload1(values); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { + if (outputOffset + cur < m_outputStrides[endDim]) { + values[i] = m_impl.coeff(inputIndex); + } else { + ++inputIndex; + inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex); + values[i] = m_impl.coeff(inputIndex); + outputOffset = 0; + cur = 0; + } + } + return internal::pload(values); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + Index dim, inputIndex; + + if (static_cast(Layout) == static_cast(ColMajor)) { + dim = NumDims - 1; + } else { + dim = 0; + } + + inputIndex = index % m_inputStrides[dim]; + if (inputIndex + PacketSize <= m_inputStrides[dim]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + if (inputIndex > m_inputStrides[dim]-1) { + inputIndex = 0; + } + values[i] = m_impl.coeff(inputIndex++); + } + return internal::pload(values); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + Index dim, inputIndex, outputOffset; + + if (static_cast(Layout) == static_cast(ColMajor)) { + dim = 1; + } else { + dim = NumDims - 2; + } + + inputIndex = index / m_outputStrides[dim]; + outputOffset = index % m_outputStrides[dim]; + if (outputOffset + PacketSize <= m_outputStrides[dim]) { + values[0] = m_impl.coeff(inputIndex); + return internal::pload1(values); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { + if (outputOffset + cur < m_outputStrides[dim]) { + values[i] = m_impl.coeff(inputIndex); + } else { + values[i] = m_impl.coeff(++inputIndex); + outputOffset = 0; + cur = 0; + } + } + return internal::pload(values); } } @@ -253,6 +483,7 @@ struct TensorEvaluator, Device> const Index originalIndex = index; Index inputIndex = 0; + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq(i, 1)) { @@ -288,8 +519,13 @@ struct TensorEvaluator, Device> } else { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); + EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize; ++i) { - values[i] = coeffColMajor(originalIndex+i); + if (innermostLoc + i < m_impl.dimensions()[0]) { + values[i] = m_impl.coeff(inputIndex+i); + } else { + values[i] = coeffColMajor(originalIndex+i); + } } PacketReturnType rslt = internal::pload(values); return rslt; @@ -305,6 +541,7 @@ struct TensorEvaluator, Device> const Index originalIndex = index; Index inputIndex = 0; + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq(i, 1)) { @@ -340,8 +577,13 @@ struct TensorEvaluator, Device> } else { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); + EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize; ++i) { - values[i] = coeffRowMajor(originalIndex+i); + if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) { + values[i] = m_impl.coeff(inputIndex+i); + } else { + values[i] = coeffRowMajor(originalIndex+i); + } } PacketReturnType rslt = internal::pload(values); return rslt; @@ -351,7 +593,8 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { double compute_cost = TensorOpCost::AddCost(); - if (NumDims > 0) { + if (!isCopy && NumDims > 0) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { compute_cost += TensorOpCost::DivCost(); if (internal::index_statically_eq(i, 1)) { @@ -372,14 +615,472 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large + // tensors. But this might need further tuning. + const size_t target_size = m_device.firstLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + m_impl.getResourceRequirements(), + internal::TensorBlockResourceRequirements::skewed(target_size)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + BlockBroadcastingParams params = blockBroadcastingParams(desc); + + if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) { + return emptyBlock(); + } + + // Prepare storage for the materialized broadcasting result. + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); + ScalarNoConst* materialized_output = block_storage.data(); + + // We potentially will need to materialize input blocks. + size_t materialized_input_size = 0; + ScalarNoConst* materialized_input = NULL; + + // Initialize block broadcating iterator state for outer dimensions (outer + // with regard to bcast dimension). Dimension in this array are always in + // inner_most -> outer_most order (col major layout). + array it; + int idx = 0; + + for (int i = params.inner_dim_count + 1; i < NumDims; ++i) { + const Index dim = IsColMajor ? i : NumDims - 1 - i; + it[idx].size = params.output_dims[dim]; + it[idx].count = 0; + it[idx].output_stride = m_outputStrides[dim]; + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + idx++; + } + + // Write output into the beginning of `materialized_output`. + Index output_offset = 0; + + // We will fill output block by broadcasting along the bcast dim, and + // iterating over outer dimension. + const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize(); + + for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) { + ScalarNoConst* bcast_output = materialized_output + num_output_coeffs; + Index bcast_offset = desc.offset() + output_offset; + + // Broadcast along the bcast dimension. + num_output_coeffs += BroadcastBlockAlongBcastDim( + params, bcast_offset, scratch, bcast_output, &materialized_input, + &materialized_input_size); + + // Switch to the next outer dimension. + for (int j = 0; j < idx; ++j) { + if (++it[j].count < it[j].size) { + output_offset += it[j].output_stride; + break; + } + it[j].count = 0; + output_offset -= it[j].output_span; + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } const TensorEvaluator& impl() const { return m_impl; } Broadcast functor() const { return m_broadcast; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind( + cl::sycl::handler& cgh) const { + m_impl.bind(cgh); + } +#endif + private: + static const bool IsColMajor = + static_cast(Layout) == static_cast(ColMajor); + + // We will build a general case block broadcasting on top of broadcasting + // primitive that will do broadcasting only for the inner dimension(s) along + // the first dimension smaller than the input size (it's called `bcast_dim`). + // + // Example: + // dim: 0 1 2 (ColMajor) + // input size: [9, 3, 6] + // block size: [9, 2, 6] + // + // We will compute broadcasted block by iterating over the outer dimensions + // before `bcast_dim` (only dimension `2` in this example) and computing + // broadcasts along the `bcast_dim` (dimension `1` in this example). + + // BlockBroadcastingParams holds precomputed parameters for broadcasting a + // single block along the broadcasting dimension. Sizes and strides along the + // `bcast_dim` might be invalid, they will be adjusted later in + // `BroadcastBlockAlongBcastDim`. + struct BlockBroadcastingParams { + Dimensions input_dims; // input expression dimensions + Dimensions output_dims; // output block sizes + Dimensions output_strides; // output block strides + + int inner_dim_count; // count inner dimensions matching in size + int bcast_dim; // broadcasting dimension index + Index bcast_dim_size; // broadcasting dimension size + Index inner_dim_size; // inner dimensions size + + // Block sizes and strides for the input block where all dimensions before + // `bcast_dim` are equal to `1`. + Dimensions input_block_sizes; + Dimensions input_block_strides; + + // Block sizes and strides for blocks with extra dimensions and strides `0`. + BroadcastDimensions bcast_block_sizes; + BroadcastDimensions bcast_block_strides; + BroadcastDimensions bcast_input_strides; + }; + + struct BlockBroadcastingIteratorState { + Index size; + Index count; + Index output_stride; + Index output_span; + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams + blockBroadcastingParams(TensorBlockDesc& desc) const { + BlockBroadcastingParams params; + + params.input_dims = Dimensions(m_impl.dimensions()); + + // Output block sizes and strides. + params.output_dims = desc.dimensions(); + params.output_strides = internal::strides(params.output_dims); + + // Find the broadcasting dimension (first dimension with output size smaller + // that the input size). + params.bcast_dim = 0; + params.bcast_dim_size = 1; + params.inner_dim_size = 1; + + // Count the number of inner dimensions that have the same size in the block + // and in the broadcast expression. + params.inner_dim_count = 0; + + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + + if (params.output_dims[dim] == m_dimensions[dim]) { + params.inner_dim_size *= params.output_dims[dim]; + ++params.inner_dim_count; + continue; + } + + // First non-matching dimension is the broadcasting dimension. + eigen_assert(params.output_dims[dim] < m_dimensions[dim]); + params.bcast_dim = dim; + params.bcast_dim_size = params.output_dims[dim]; + break; + } + + // Calculate the input block size for looking into the input. + for (int i = 0; i < params.inner_dim_count; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + params.input_block_sizes[dim] = params.input_dims[dim]; + } + for (int i = params.inner_dim_count; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + params.input_block_sizes[dim] = 1; + } + params.input_block_strides = + internal::strides(params.input_block_sizes); + + // Broadcast with the 0-stride trick: Create 1 extra dim for each + // broadcast, set the input stride to 0. + // + // When ColMajor: + // + // - bcast_block_sizes: + // [d_0, b_0, d_1, b_1, ...] + // + // - bcast_block_strides: + // [output_block_strides[0], output_block_strides[0] * d_0, + // output_block_strides[1], output_block_strides[1] * d_1, + // ...] + // + // - bcast_input_strides: + // [input_block_strides[0], 0, + // input_block_strides[1], 0, + // ...]. + // + for (int i = 0; i < params.inner_dim_count; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + + const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1; + const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1; + + params.bcast_block_sizes[copy_dim] = params.input_dims[dim]; + params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim]; + params.bcast_block_strides[copy_dim] = params.output_strides[dim]; + params.bcast_block_strides[broadcast_dim] = + params.output_strides[dim] * params.input_dims[dim]; + params.bcast_input_strides[copy_dim] = params.input_block_strides[dim]; + params.bcast_input_strides[broadcast_dim] = 0; + } + + for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) { + const int dim = IsColMajor ? i : 2 * NumDims - i - 1; + params.bcast_block_sizes[dim] = 1; + params.bcast_block_strides[dim] = 0; + params.bcast_input_strides[dim] = 0; + } + + return params; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const { + DSizes dimensions; + for (int i = 0; i < NumDims; ++i) dimensions[i] = 0; + return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim( + BlockBroadcastingParams params, Index bcast_offset, + TensorBlockScratch& scratch, ScalarNoConst* materialized_output, + ScalarNoConst** materialized_input, + size_t* materialized_input_size) const { + if (params.bcast_dim_size == 1) { + // We just need one block read using the ready-set values above. + return BroadcastBlock( + params.input_block_sizes, params.input_block_strides, + params.bcast_block_sizes, params.bcast_block_strides, + params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + + } else if (params.input_dims[params.bcast_dim] == 1) { + // Broadcast bcast dimension (< NumDims) by bcast_dim_size. + const int broadcast_bcast_dim = + IsColMajor ? 2 * params.inner_dim_count + 1 + : 2 * NumDims - 2 * params.inner_dim_count - 2; + + params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim]; + + return BroadcastBlock( + params.input_block_sizes, params.input_block_strides, + params.bcast_block_sizes, params.bcast_block_strides, + params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + + } else { + // Keep track of the total number of the coefficients written to the + // output block. + Index num_output_coeffs = 0; + + // The general case. Let's denote the output block as + // + // x[..., a:a+bcast_dim_size, :, ..., :] + // + // where a:a+bcast_dim_size is a slice on the bcast_dim dimension + // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3 + // sub-blocks: + // + // (1) a:b, where b is the smallest multiple of + // input_dims[bcast_dim_start] in [a, a+bcast_dim_size]. + // + // (2) b:c, where c is the largest multiple of input_dims[bcast_dim_start] + // in [a, a+bcast_dim_size]. + // + // (3) c:a+bcast_dim_size . + // + // Or, when b and c do not exist, we just need to process the whole block + // together. + + // Find a. + const Index bcast_dim_left_index = + bcast_offset / m_outputStrides[params.bcast_dim]; + + // Find b and c. + const Index input_bcast_dim_size = params.input_dims[params.bcast_dim]; + + // First multiple after a. This is b when <= bcast_dim_left_index + + // bcast_dim_size. + const Index first_multiple = + divup(bcast_dim_left_index, input_bcast_dim_size) * + input_bcast_dim_size; + + if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) { + // b exists, so does c. Find it. + const Index last_multiple = + (bcast_dim_left_index + params.bcast_dim_size) / + input_bcast_dim_size * input_bcast_dim_size; + const int copy_bcast_dim = + IsColMajor ? 2 * params.inner_dim_count + : 2 * NumDims - 2 * params.inner_dim_count - 1; + const int broadcast_bcast_dim = + IsColMajor ? 2 * params.inner_dim_count + 1 + : 2 * NumDims - 2 * params.inner_dim_count - 2; + + if (first_multiple > bcast_dim_left_index) { + const Index head_size = first_multiple - bcast_dim_left_index; + params.input_block_sizes[params.bcast_dim] = head_size; + params.bcast_block_sizes[copy_bcast_dim] = head_size; + params.bcast_input_strides[copy_bcast_dim] = + params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = + params.output_strides[params.bcast_dim]; + params.bcast_block_sizes[broadcast_bcast_dim] = 1; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim] * + params.input_dims[params.bcast_dim]; + + num_output_coeffs += BroadcastBlock( + params.input_block_sizes, params.input_block_strides, + params.bcast_block_sizes, params.bcast_block_strides, + params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + } + if (first_multiple < last_multiple) { + params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size; + params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size; + params.bcast_input_strides[copy_bcast_dim] = + params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = + params.output_strides[params.bcast_dim]; + params.bcast_block_sizes[broadcast_bcast_dim] = + (last_multiple - first_multiple) / input_bcast_dim_size; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim] * + params.input_dims[params.bcast_dim]; + const Index offset = (first_multiple - bcast_dim_left_index) * + m_outputStrides[params.bcast_dim]; + + num_output_coeffs += BroadcastBlock( + params.input_block_sizes, params.input_block_strides, + params.bcast_block_sizes, params.bcast_block_strides, + params.bcast_input_strides, bcast_offset, offset, scratch, + materialized_output, materialized_input, materialized_input_size); + } + if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) { + const Index tail_size = + bcast_dim_left_index + params.bcast_dim_size - last_multiple; + params.input_block_sizes[params.bcast_dim] = tail_size; + params.bcast_block_sizes[copy_bcast_dim] = tail_size; + params.bcast_input_strides[copy_bcast_dim] = + params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = + params.output_strides[params.bcast_dim]; + params.bcast_block_sizes[broadcast_bcast_dim] = 1; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim] * + params.input_dims[params.bcast_dim]; + const Index offset = (last_multiple - bcast_dim_left_index) * + m_outputStrides[params.bcast_dim]; + + num_output_coeffs += BroadcastBlock( + params.input_block_sizes, params.input_block_strides, + params.bcast_block_sizes, params.bcast_block_strides, + params.bcast_input_strides, bcast_offset, offset, scratch, + materialized_output, materialized_input, materialized_input_size); + } + } else { + // b and c do not exist. + const int copy_bcast_dim = + IsColMajor ? 2 * params.inner_dim_count + : 2 * NumDims - 2 * params.inner_dim_count - 1; + params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size; + params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size; + params.bcast_input_strides[copy_bcast_dim] = + params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = + params.output_strides[params.bcast_dim]; + + num_output_coeffs += BroadcastBlock( + params.input_block_sizes, params.input_block_strides, + params.bcast_block_sizes, params.bcast_block_strides, + params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + } + + return num_output_coeffs; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock( + const Dimensions& input_block_sizes, + const Dimensions& input_block_strides, + const BroadcastDimensions& bcast_block_sizes, + const BroadcastDimensions& bcast_block_strides, + const BroadcastDimensions& bcast_input_strides, Index bcast_offset, + Index offset, TensorBlockScratch& scratch, + ScalarNoConst* materialized_output, ScalarNoConst** materialized_input, + size_t* materialized_input_size) const { + // ---------------------------------------------------------------------- // + // Tensor block descriptor for reading block from the input. + const Index input_offset = bcast_offset + offset; + TensorBlockDesc input_desc( + IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset), + input_block_sizes); + + ArgTensorBlock input_block = m_impl.block(input_desc, scratch); + + // ---------------------------------------------------------------------- // + // Materialize input block into a temporary memory buffer only if it's not + // already available in the arg block. + const ScalarNoConst* input_buffer = NULL; + + if (input_block.data() != NULL) { + // Input block already has raw data, there is no need to materialize it. + input_buffer = input_block.data(); + + } else { + // Otherwise we have to do block assignment into a temporary buffer. + + // Maybe reuse previously allocated buffer, or allocate a new one with a + // scratch allocator. + const size_t input_total_size = input_block_sizes.TotalSize(); + if (*materialized_input == NULL || + *materialized_input_size < input_total_size) { + *materialized_input_size = input_total_size; + void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar)); + *materialized_input = static_cast(mem); + } + + typedef internal::TensorBlockAssignment< + ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index> + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target(input_block_sizes, input_block_strides, + *materialized_input), + input_block.expr()); + + input_buffer = *materialized_input; + } + + // ---------------------------------------------------------------------- // + // Copy data from materialized input block to the materialized output, using + // given broadcast strides (strides with zeroes). + typedef internal::TensorBlockIO + TensorBlockIO; + + typename TensorBlockIO::Src src(bcast_input_strides, input_buffer); + typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides, + materialized_output + offset); + + return TensorBlockIO::Copy(dst, src); + } - protected: - const Broadcast m_broadcast; +protected: + const Device EIGEN_DEVICE_REF m_device; + const typename internal::remove_reference::type m_broadcast; Dimensions m_dimensions; array m_outputStrides; array m_inputStrides; diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 1ba7ef170..376457341 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -32,12 +32,13 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions - 1; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template struct eval, Eigen::Dense> { - typedef const TensorChippingOp& type; + typedef const TensorChippingOp EIGEN_DEVICE_REF type; }; template @@ -50,6 +51,7 @@ template struct DimensionId { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) { + EIGEN_UNUSED_VARIABLE(dim); eigen_assert(dim == DimId); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { @@ -78,44 +80,28 @@ template class TensorChippingOp : public TensorBase > { public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; + typedef TensorBase > Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim) + : m_xpr(expr), m_offset(offset), m_dim(dim) { + } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim) - : m_xpr(expr), m_offset(offset), m_dim(dim) { - } + EIGEN_DEVICE_FUNC + const Index offset() const { return m_offset; } + EIGEN_DEVICE_FUNC + const Index dim() const { return m_dim.actualDim(); } - EIGEN_DEVICE_FUNC - const Index offset() const { return m_offset; } - EIGEN_DEVICE_FUNC - const Index dim() const { return m_dim.actualDim(); } + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorChippingOp& operator = (const TensorChippingOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorChippingOp) protected: typename XprType::Nested m_xpr; @@ -136,20 +122,49 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets. - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + Layout = TensorEvaluator::Layout, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + // Chipping of outer-most dimension is a trivial operation, because we can + // read and write directly from the underlying tensor using single offset. + IsOuterChipping = (static_cast(Layout) == ColMajor && DimId == NumInputDims - 1) || + (static_cast(Layout) == RowMajor && DimId == 0), + // Chipping inner-most dimension. + IsInnerChipping = (static_cast(Layout) == ColMajor && DimId == 0) || + (static_cast(Layout) == RowMajor && DimId == NumInputDims - 1), + // Prefer block access if the underlying expression prefers it, otherwise + // only if chipping is not trivial. + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + !IsOuterChipping, + CoordAccess = false, // to be implemented + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef internal::TensorBlockDescriptor + ArgTensorBlockDesc; + typedef typename TensorEvaluator::TensorBlock + ArgTensorBlock; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) { EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -185,12 +200,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -205,21 +220,20 @@ struct TensorEvaluator, Device> EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + if (isInnerChipping()) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); Index inputIndex = index * m_inputStride + m_inputOffset; EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = m_impl.coeff(inputIndex); inputIndex += m_inputStride; } PacketReturnType rslt = internal::pload(values); return rslt; - } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { - // m_stride is aways greater than index, so let's avoid the integer division. + } else if (isOuterChipping()) { + // m_stride is always greater than index, so let's avoid the integer division. eigen_assert(m_stride > index); return m_impl.template packet(index + m_inputOffset); } else { @@ -231,6 +245,7 @@ struct TensorEvaluator, Device> } else { // Cross the stride boundary. Fallback to slow path. EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index); ++index; @@ -263,29 +278,100 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { - CoeffReturnType* result = const_cast(m_impl.data()); - if (((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumDims) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) && - result) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + internal::TensorBlockResourceRequirements::skewed(target_size), + m_impl.getResourceRequirements()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool root_of_expr_ast = false) const { + const Index chip_dim = m_dim.actualDim(); + + DSizes input_block_dims; + for (int i = 0; i < NumInputDims; ++i) { + input_block_dims[i] + = i < chip_dim ? desc.dimension(i) + : i > chip_dim ? desc.dimension(i - 1) + : 1; + } + + ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims); + + // Try to reuse destination buffer for materializing argument block. + if (desc.HasDestinationBuffer()) { + DSizes arg_destination_strides; + for (int i = 0; i < NumInputDims; ++i) { + arg_destination_strides[i] + = i < chip_dim ? desc.destination().strides()[i] + : i > chip_dim ? desc.destination().strides()[i - 1] + : 0; // for dimensions of size `1` stride should never be used. + } + + arg_desc.template AddDestinationBuffer( + desc.destination().template data(), + arg_destination_strides); + } + + ArgTensorBlock arg_block = m_impl.block(arg_desc, scratch, root_of_expr_ast); + if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); + + if (arg_block.data() != NULL) { + // Forward argument block buffer if possible. + return TensorBlock(arg_block.kind(), arg_block.data(), + desc.dimensions()); + + } else { + // Assign argument block expression to a buffer. + + // Prepare storage for the materialized chipping result. + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); + + typedef internal::TensorBlockAssignment< + ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index> + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target( + arg_desc.dimensions(), + internal::strides(arg_desc.dimensions()), + block_storage.data()), + arg_block.expr()); + + return block_storage.AsTensorMaterializedBlock(); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { + typename Storage::Type result = constCast(m_impl.data()); + if (isOuterChipping() && result) { return result + m_inputOffset; } else { return NULL; } } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex; - if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + if (isInnerChipping()) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); inputIndex = index * m_inputStride + m_inputOffset; - } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims-1) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { - // m_stride is aways greater than index, so let's avoid the integer division. + } else if (isOuterChipping()) { + // m_stride is always greater than index, so let's avoid the integer + // division. eigen_assert(m_stride > index); inputIndex = index + m_inputOffset; } else { @@ -297,13 +383,25 @@ struct TensorEvaluator, Device> return inputIndex; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isInnerChipping() const { + return IsInnerChipping || + (static_cast(Layout) == ColMajor && m_dim.actualDim() == 0) || + (static_cast(Layout) == RowMajor && m_dim.actualDim() == NumInputDims - 1); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isOuterChipping() const { + return IsOuterChipping || + (static_cast(Layout) == ColMajor && m_dim.actualDim() == NumInputDims-1) || + (static_cast(Layout) == RowMajor && m_dim.actualDim() == 0); + } + Dimensions m_dimensions; Index m_stride; Index m_inputOffset; Index m_inputStride; TensorEvaluator m_impl; const internal::DimensionId m_dim; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; }; @@ -321,15 +419,21 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::RawAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -343,20 +447,19 @@ struct TensorEvaluator, Device> { EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == 0) || - (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { + if (this->isInnerChipping()) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(this->m_stride == 1); EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; internal::pstore(values, x); Index inputIndex = index * this->m_inputStride + this->m_inputOffset; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { this->m_impl.coeffRef(inputIndex) = values[i]; inputIndex += this->m_inputStride; } - } else if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) || - (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == 0)) { - // m_stride is aways greater than index, so let's avoid the integer division. + } else if (this->isOuterChipping()) { + // m_stride is always greater than index, so let's avoid the integer division. eigen_assert(this->m_stride > index); this->m_impl.template writePacket(index + this->m_inputOffset, x); } else { @@ -369,6 +472,7 @@ struct TensorEvaluator, Device> // Cross stride boundary. Fallback to slow path. EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; internal::pstore(values, x); + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { this->coeffRef(index) = values[i]; ++index; @@ -376,6 +480,36 @@ struct TensorEvaluator, Device> } } } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { + assert(this->m_impl.data() != NULL); + + const Index chip_dim = this->m_dim.actualDim(); + + DSizes input_block_dims; + for (int i = 0; i < NumInputDims; ++i) { + input_block_dims[i] = i < chip_dim ? desc.dimension(i) + : i > chip_dim ? desc.dimension(i - 1) + : 1; + } + + typedef TensorReshapingOp, + const typename TensorBlock::XprType> + TensorBlockExpr; + + typedef internal::TensorBlockAssignment + TensorBlockAssign; + + TensorBlockAssign::Run( + TensorBlockAssign::target( + input_block_dims, + internal::strides(this->m_impl.dimensions()), + this->m_impl.data(), this->srcCoeff(desc.offset())), + block.expr().reshape(input_block_dims)); + } }; diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 59bf90d93..5235a8e6f 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -37,6 +37,8 @@ struct traits > static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; enum { Flags = 0 }; + typedef typename conditional::val, + typename traits::PointerType, typename traits::PointerType>::type PointerType; }; template @@ -58,6 +60,7 @@ template class TensorConcatenationOp : public TensorBase, WriteAccessors> { public: + typedef TensorBase, WriteAccessors> Base; typedef typename internal::traits::Scalar Scalar; typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::Index Index; @@ -79,25 +82,7 @@ class TensorConcatenationOp : public TensorBase Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorConcatenationOp) protected: typename LhsXprType::Nested m_lhs_xpr; typename RhsXprType::Nested m_rhs_xpr; @@ -117,14 +102,24 @@ struct TensorEvaluator::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) { EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -177,14 +172,14 @@ struct TensorEvaluator(Layout) == static_cast(ColMajor)) { left_index = subs[0]; + EIGEN_UNROLL_LOOP for (int i = 1; i < NumDims; ++i) { left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; } } else { left_index = subs[NumDims - 1]; + EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i >= 0; --i) { left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; } @@ -231,11 +228,13 @@ struct TensorEvaluator(Layout) == static_cast(ColMajor)) { right_index = subs[0]; + EIGEN_UNROLL_LOOP for (int i = 1; i < NumDims; ++i) { right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; } } else { right_index = subs[NumDims - 1]; + EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i >= 0; --i) { right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; } @@ -248,11 +247,12 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits::size; + const int packetSize = PacketType::size; EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < packetSize; ++i) { values[i] = coeff(index+i); } @@ -275,7 +275,15 @@ struct TensorEvaluator XprType; typedef typename Base::Dimensions Dimensions; enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) : Base(op, device) { EIGEN_STATIC_ASSERT((static_cast(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -344,7 +360,7 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - const int packetSize = internal::unpacket_traits::size; + const int packetSize = PacketType::size; EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize()); diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 20b29e5fd..8b35f7985 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -21,8 +21,8 @@ namespace Eigen { */ namespace internal { -template -struct traits > +template +struct traits > { // Type promotion to handle the case where the types of the lhs and the rhs are different. typedef typename gebp_traits::type, @@ -38,53 +38,305 @@ struct traits > typedef typename remove_reference::type _RhsNested; // From NumDims below. - static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; + static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; static const int Layout = traits::Layout; + typedef typename conditional::val, + typename traits::PointerType, + typename traits::PointerType>::type + PointerType; enum { Flags = 0 }; }; -template -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> { - typedef const TensorContractionOp& type; + typedef const TensorContractionOp& type; }; -template -struct nested, 1, typename eval >::type> +template +struct nested, 1, typename eval >::type> { - typedef TensorContractionOp type; + typedef TensorContractionOp type; }; -template -struct traits, Device_> > { +template +struct traits, Device_> > { typedef Indices_ Indices; typedef LeftArgType_ LeftArgType; typedef RightArgType_ RightArgType; + typedef OutputKernelType_ OutputKernelType; typedef Device_ Device; // From NumDims below. static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; }; +// Helper class to allocate and deallocate temporary memory for packed buffers. +template +struct TensorContractionBlockMemAllocator { + typedef void* BlockMemHandle; + + template + EIGEN_DEVICE_FUNC static BlockMemHandle allocate(Device& d, const Index bm, + const Index bk, + const Index bn, + LhsScalar** lhs_block, + RhsScalar** rhs_block) { + eigen_assert(lhs_block); + eigen_assert(rhs_block); + BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn); + char* block_mem = static_cast(d.allocate(sz.lhs_size + sz.rhs_size)); + eigen_assert(block_mem); + *lhs_block = reinterpret_cast(block_mem); + *rhs_block = reinterpret_cast(block_mem + sz.lhs_size); + return block_mem; + } + + template + EIGEN_DEVICE_FUNC static BlockMemHandle allocateSlices( + Device& d, const Index bm, const Index bk, const Index bn, + const Index num_lhs, const Index num_rhs, const Index num_slices, + std::vector* lhs_blocks, + std::vector* rhs_blocks) { + eigen_assert(num_slices > 0); + eigen_assert(num_lhs >= 0 && num_rhs >= 0); + eigen_assert(num_lhs == 0 || lhs_blocks); + eigen_assert(num_rhs == 0 || rhs_blocks); + BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn); + void* block_mem = d.allocate( + (num_lhs * sz.lhs_size + num_rhs * sz.rhs_size) * num_slices); + eigen_assert(block_mem); + char* mem = static_cast(block_mem); + + for (Index x = 0; x < num_slices; x++) { + if (num_lhs > 0) lhs_blocks[x].resize(num_lhs); + for (Index m = 0; m < num_lhs; m++) { + lhs_blocks[x][m] = reinterpret_cast(mem); + mem += sz.lhs_size; + } + if (num_rhs > 0) rhs_blocks[x].resize(num_rhs); + for (Index n = 0; n < num_rhs; n++) { + rhs_blocks[x][n] = reinterpret_cast(mem); + mem += sz.rhs_size; + } + } + + return block_mem; + } + + template + EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) { + d.deallocate(handle); + } + + private: + struct BlockSizes { + Index lhs_size; + Index rhs_size; + }; + EIGEN_DEVICE_FUNC static BlockSizes ComputeLhsRhsBlockSizes(const Index bm, + const Index bk, + const Index bn) { + Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); + BlockSizes sz; + sz.lhs_size = divup(bm * bk * sizeof(LhsScalar), align) * align; + sz.rhs_size = divup(bn * bk * sizeof(RhsScalar), align) * align; + return sz; + } +}; + +// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in +// ColMajor storage order. This property is guaranteed by the +// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack +// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix +// multiplication for these blocks. Default tensor contraction uses +// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see +// GeneralBlocPanelKernel.h for details). +// +// By specializing contraction kernels we can use other low level libraries to +// perform matrix multiplication, and still rely on Eigen contraction evaluator. +// This also includes full support in TensorContractionThreadPool, assuming that +// underlying gemm do not use it's own threading. +// +// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of +// multiplication, lhs tensor and rhs tensor respectively. +// +// - StorageIndex - index type for the tensor expressions. In practice almost +// always is Eigen::Index. +// +// - OutputMapper provides access to the memory of the output matrix. In +// practice it's always column major blas_data_mapper (it must be of ResScalar +// type). +// +// - LhsMapper/RhsMapper similarly to blas_data_mapper provide a two dimensional +// view into the Lhs/Rhs tensor expressions. In practice it's +// TensorContractionInputMapper, or some specialization of it based on the +// type of tensor expression (e.g. TensorImagePatchOp has optimized input +// mapper). +template +struct TensorContractionKernel { + // True if `invoke()` supports `beta` in `C <- alpha * A * B + beta * C` + // (otherwise beta should be always equal to 1). + enum { HasBeta = false }; + + EIGEN_DEVICE_FUNC + TensorContractionKernel(StorageIndex m_, StorageIndex k_, StorageIndex n_, + StorageIndex bm_, StorageIndex bk_, StorageIndex bn_) + : m(m_), k(k_), n(n_), bm(bm_), bk(bk_), bn(bn_) {} + + // Pack blocks of Lhs and Rhs into contiguous blocks in memory. + typedef LhsScalar* LhsBlock; + typedef RhsScalar* RhsBlock; + + // Packed Lhs/Rhs block memory allocator. + typedef TensorContractionBlockMemAllocator + BlockMemAllocator; + typedef typename BlockMemAllocator::BlockMemHandle BlockMemHandle; + + typedef typename internal::gebp_traits Traits; + + typedef internal::gemm_pack_lhs< + LhsScalar, StorageIndex, typename LhsMapper::SubMapper, Traits::mr, + Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor> + LhsPacker; + + typedef internal::gemm_pack_rhs + RhsPacker; + + typedef internal::gebp_kernel + GebpKernel; + + template + EIGEN_DEVICE_FUNC BlockMemHandle allocate(Device& d, LhsBlock* lhs_block, + RhsBlock* rhs_block) { + return BlockMemAllocator::allocate(d, bm, bk, bn, lhs_block, rhs_block); + } + + template + EIGEN_DEVICE_FUNC BlockMemHandle allocateSlices( + Device& d, const StorageIndex num_lhs, const StorageIndex num_rhs, + const StorageIndex num_slices, std::vector* lhs_blocks, + std::vector* rhs_blocks) { + return BlockMemAllocator::allocateSlices( + d, bm, bk, bn, num_lhs, num_rhs, num_slices, lhs_blocks, rhs_blocks); + } + + template + EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) { + BlockMemAllocator::deallocate(d, handle); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packLhs( + LhsBlock* lhsBlock, const typename LhsMapper::SubMapper& data_mapper, + const StorageIndex depth, const StorageIndex rows) { + LhsPacker()(*lhsBlock, data_mapper, depth, rows, /*stride*/ 0, + /*offset*/ 0); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packRhs( + RhsBlock* rhsBlock, const typename RhsMapper::SubMapper& data_mapper, + const StorageIndex depth, const StorageIndex cols) { + RhsPacker()(*rhsBlock, data_mapper, depth, cols); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void invoke( + const OutputMapper& output_mapper, const LhsBlock& lhsBlock, + const RhsBlock& rhsBlock, const StorageIndex rows, + const StorageIndex depth, const StorageIndex cols, + const ResScalar alpha, const ResScalar beta) { + // Default GEBP kernel does not support beta. + eigen_assert(beta == ResScalar(1)); + static const int kComputeStrideFromBlockDimensions = -1; + GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha, + /*strideA*/ kComputeStrideFromBlockDimensions, + /*strideB*/ kComputeStrideFromBlockDimensions, + /*offsetA*/ 0, /*offsetB*/ 0); + } + + private: + // These are dimensions of the original Tensors, and selected block sizes. The + // actual block sizes passed to all function above might be smaller because of + // the partial blocks at the end. + const StorageIndex m; + const StorageIndex k; + const StorageIndex n; + const StorageIndex bm; + const StorageIndex bk; + const StorageIndex bn; +}; + } // end namespace internal -template -class TensorContractionOp : public TensorBase, ReadOnlyAccessors> +// Tensor contraction params that should enable to get from output matrix +// 2-dimensional coordinates to the output tensor dimensions. +struct TensorContractionParams { + // TensorContraction evaluator assumes that both tensors are in ColMajor + // layout, if tensors are in RowMajor evaluator swap lhs with rhs. + bool swapped_arguments; +}; + +// Output kernel allows to fuse operations into the tensor contraction. +// +// Examples: +// 1. Elementwise Relu transformation following Conv2D. +// 2. AddBias to the Conv2D output channels dimension. +// +// The NoOpOutputKernel implements an output kernel that does absolutely nothing. +struct NoOpOutputKernel { + /** + * Tensor contraction evaluator calls this kernel after finishing each block + * of output matrix. Output blocks belong to the 2-dimensional output tensor. + * + * TensorContractionParams contains contraction dimensions information + * required to map output 2-d space into the expected output tensor space + * (potentially higher dimensional). + * + * \param[in] output_mapper Access to output tensor memory + * \param[in] params Tensor contraction parameters + * \param[in] i Index of a first row available through output_mapper + * \param[in] j Index of a first column available through output_mapper + * \param[in] num_rows Number of available rows + * \param[in] num_cols Number of available columns + */ + template + EIGEN_ALWAYS_INLINE void operator()( + const internal::blas_data_mapper& output_mapper, + const TensorContractionParams& params, Index i, + Index j, Index num_rows, Index num_cols) const { + EIGEN_UNUSED_VARIABLE(output_mapper); + EIGEN_UNUSED_VARIABLE(params); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j); + EIGEN_UNUSED_VARIABLE(num_rows); + EIGEN_UNUSED_VARIABLE(num_cols); + } +}; + +template +class TensorContractionOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename internal::gebp_traits::ResScalar CoeffReturnType; + typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp( - const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) - : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} + const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims, + const OutputKernelType& output_kernel = OutputKernelType()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims), + m_output_kernel(output_kernel) {} EIGEN_DEVICE_FUNC const Indices& indices() const { return m_indices; } @@ -98,35 +350,48 @@ class TensorContractionOp : public TensorBase::type& rhsExpression() const { return m_rhs_xpr; } + EIGEN_DEVICE_FUNC + const OutputKernelType& outputKernel() const { return m_output_kernel; } + protected: typename LhsXprType::Nested m_lhs_xpr; typename RhsXprType::Nested m_rhs_xpr; const Indices m_indices; + const OutputKernelType m_output_kernel; }; template -struct TensorContractionEvaluatorBase +struct TensorContractionEvaluatorBase : internal::no_assignment_operator { typedef typename internal::traits::Indices Indices; typedef typename internal::traits::LeftArgType LeftArgType; typedef typename internal::traits::RightArgType RightArgType; + typedef typename internal::traits::OutputKernelType OutputKernelType; typedef typename internal::traits::Device Device; - typedef TensorContractionOp XprType; + typedef TensorContractionOp XprType; typedef typename internal::remove_const::type Scalar; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = true, - PacketAccess = (internal::unpacket_traits::size > 1), - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = true + IsAligned = true, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = true }; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + // Most of the code is assuming that both input tensors are ColMajor. If the // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: // If we want to compute A * B = C, where A is LHS and B is RHS, the code @@ -136,6 +401,9 @@ struct TensorContractionEvaluatorBase typedef typename internal::conditional< static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + typedef TensorEvaluator LeftEvaluatorType; + typedef TensorEvaluator RightEvaluatorType; + static const int LDims = internal::array_size::Dimensions>::value; static const int RDims = @@ -149,16 +417,17 @@ struct TensorContractionEvaluatorBase typedef DSizes Dimensions; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) - : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), + : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.lhsExpression(), op.rhsExpression()), device), - m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), - op.rhsExpression(), op.lhsExpression()), device), + m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), + op.rhsExpression(), op.lhsExpression()), device), m_device(device), + m_output_kernel(op.outputKernel()), m_result(NULL) { EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == - static_cast(TensorEvaluator::Layout)), + static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -233,7 +502,7 @@ struct TensorContractionEvaluatorBase // dimensions and right non-contracting dimensions. m_lhs_inner_dim_contiguous = true; int dim_idx = 0; - unsigned int nocontract_idx = 0; + Index nocontract_idx = 0; for (int i = 0; i < LDims; i++) { // find if we are contracting on index i of left tensor @@ -323,64 +592,144 @@ struct TensorContractionEvaluatorBase numext::swap(m_dimensions[i], m_dimensions[j]); } } + + // A set of parameters that will allow output kernel to get from output + // tensor dimensions (i, j) into the original tensor dimensions. + // TODO(ezhulenev): Add parameters required to infer output tensor index for + // more complex contractions than 2x2 on internal dimension. + m_tensor_contraction_params.swapped_arguments = static_cast(Layout) == RowMajor; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_leftImpl.evalSubExprsIfNeeded(NULL); m_rightImpl.evalSubExprsIfNeeded(NULL); if (data) { evalTo(data); return false; } else { - m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); evalTo(m_result); return true; } } - EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { - if (this->m_lhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } - } - else { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType dest, EvalSubExprsCallback done) { + m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { + m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { + if (dest) { + evalToAsync(dest, [done]() { done(false); }); + } else { + m_result = static_cast( + m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalToAsync(m_result, [done]() { done(true); }); } - } + }); + }); + } +#endif // EIGEN_USE_THREADS + +#ifndef TENSOR_CONTRACTION_DISPATCH +#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \ + if (this->m_lhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } \ + } else { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } \ + } +#endif + +#ifndef TENSOR_CONTRACTION_ASYNC_DISPATCH +#define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \ + if (this->m_lhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } \ + } else { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } \ + } +#endif + + EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { + static_cast(this)->template evalProduct(buffer); + } + +#ifdef EIGEN_USE_THREADS + template + void evalToAsync(Scalar* buffer, EvalToCallback done) const { + static_cast(this) + ->template evalProductAsync(buffer, + std::move(done)); + } +#endif // EIGEN_USE_THREADS + + template + void evalProductSequential(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv(buffer); + } else { + this->template evalGemm(buffer); } } template - EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const { + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + void evalGemv(Scalar* buffer) const { const Index rows = m_i_size; const Index cols = m_k_size; @@ -418,12 +767,41 @@ struct TensorContractionEvaluatorBase internal::general_matrix_vector_product::run( rows, cols, lhs, rhs, buffer, resIncr, alpha); + + typedef internal::blas_data_mapper OutputMapper; + m_output_kernel(OutputMapper(buffer, rows), m_tensor_contraction_params, + static_cast(0), static_cast(0), rows, + static_cast(1)); } template - EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + void evalGemm(Scalar* buffer) const { // columns in left side, rows in right side const Index k = this->m_k_size; + this->template evalGemmPartial(buffer, 0, k, 1); + } + + template + EIGEN_DEVICE_FUNC void evalGemmPartialWithoutOutputKernel( + Scalar* buffer, Index k_start, Index k_end, int num_threads) const { + evalGemmPartial(buffer, k_start, k_end, + num_threads); + } + + template + EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const { + eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= this->m_k_size); + // columns in slice on left side, rows on right side + const Index k_slice = k_end - k_start; // rows in left side const Index m = this->m_i_size; @@ -431,16 +809,9 @@ struct TensorContractionEvaluatorBase // columns in right side const Index n = this->m_j_size; - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - // define mr, nr, and all of my data mapper types + // define data mappers for Lhs and Rhs typedef typename internal::remove_const::type LhsScalar; typedef typename internal::remove_const::type RhsScalar; - typedef typename internal::gebp_traits Traits; - - const Index nr = Traits::nr; - const Index mr = Traits::mr; typedef TensorEvaluator LeftEvaluator; typedef TensorEvaluator RightEvaluator; @@ -462,11 +833,9 @@ struct TensorContractionEvaluatorBase typedef internal::blas_data_mapper OutputMapper; - // Declare GEBP packing and kernel structs - internal::gemm_pack_lhs pack_lhs; - internal::gemm_pack_rhs pack_rhs; - - internal::gebp_kernel gebp; + typedef internal::TensorContractionKernel< + Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper> + TensorContractionKernel; // initialize data mappers LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, @@ -478,42 +847,72 @@ struct TensorContractionEvaluatorBase OutputMapper output(buffer, m); // Sizes of the blocks to load in cache. See the Goto paper for details. - internal::TensorContractionBlocking blocking(k, m, n, 1); + internal::TensorContractionBlocking + blocking(k_slice, m, n, num_threads); const Index kc = blocking.kc(); const Index mc = numext::mini(m, blocking.mc()); const Index nc = numext::mini(n, blocking.nc()); - const Index sizeA = mc * kc; - const Index sizeB = kc * nc; - LhsScalar* blockA = static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar))); - RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar))); + typedef typename TensorContractionKernel::LhsBlock LhsBlock; + typedef typename TensorContractionKernel::RhsBlock RhsBlock; + + LhsBlock blockA; + RhsBlock blockB; + + TensorContractionKernel kernel(m, k_slice, n, mc, kc, nc); + + typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle; + const BlockMemHandle packed_mem = + kernel.allocate(this->m_device, &blockA, &blockB); + + // If a contraction kernel does not support beta, explicitly initialize + // output buffer with zeroes. + if (!TensorContractionKernel::HasBeta) { + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + } for(Index i2=0; i2= k_end) { + m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2, + actual_mc, actual_nc); + } } } } - this->m_device.deallocate(blockA); - this->m_device.deallocate(blockB); + kernel.deallocate(this->m_device, packed_mem); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); @@ -536,11 +935,9 @@ struct TensorContractionEvaluatorBase return internal::ploadt(m_result + index); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; } - protected: - // Prevent assignment - TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); +protected: Dimensions m_dimensions; contract_t m_k_strides; @@ -560,22 +957,25 @@ struct TensorContractionEvaluatorBase Index m_j_size; Index m_k_size; + TensorContractionParams m_tensor_contraction_params; + TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; - const Device& m_device; - Scalar* m_result; + const Device EIGEN_DEVICE_REF m_device; + OutputKernelType m_output_kernel; + EvaluatorPointerType m_result; }; // evaluator for default device -template -struct TensorEvaluator, Device> : +template +struct TensorEvaluator, Device> : public TensorContractionEvaluatorBase< - TensorEvaluator, Device> > { - typedef TensorEvaluator, Device> Self; + TensorEvaluator, Device> > { + typedef TensorEvaluator, Device> Self; typedef TensorContractionEvaluatorBase Base; - typedef TensorContractionOp XprType; + typedef TensorContractionOp XprType; typedef typename internal::remove_const::type Scalar; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; @@ -609,17 +1009,12 @@ struct TensorEvaluator Dimensions; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } - template - EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const { - if (this->m_j_size == 1) { - this->template evalGemv(buffer); - return; - } - - this->template evalGemm(buffer); + template + void evalProduct(Scalar* buffer) const { + TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Alignment, (buffer)); } }; diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h index 5cf7b4f71..974feb0ad 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -21,14 +21,28 @@ enum { // Default Blocking Strategy -template +template class TensorContractionBlocking { public: - typedef typename LhsMapper::Scalar LhsScalar; - typedef typename RhsMapper::Scalar RhsScalar; + /* + adding EIGEN_DEVICE_FUNC unconditionally to 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h` + requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h` + which in turn, requires adding EIGEN_DEVICE_FUNC to `evaluateProductBlockingSizesHeuristic` in `GeneralBlockPanelKernel.h` + which in turn, requires adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h` + (else HIPCC will error out) - EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : + However adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h` + results in NVCC erroring out with the following error + + ../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901: + dynamic initialization is not supported for function-scope static variables within a __device__/__global__ function + */ + + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1) : kc_(k), mc_(m), nc_(n) { if (ShardingType == ShardByCol) { @@ -37,19 +51,22 @@ class TensorContractionBlocking { else { computeProductBlockingSizes(kc_, nc_, mc_, num_threads); } + + const int rhs_packet_size = internal::packet_traits::size; + kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ? + kc_ : (kc_ / rhs_packet_size) * rhs_packet_size; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; } private: - Index kc_; - Index mc_; - Index nc_; + StorageIndex kc_; + StorageIndex mc_; + StorageIndex nc_; }; - } // end namespace internal } // end namespace Eigen diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index d65dbb40f..3f315fedc 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -1,1391 +1,6 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014-2015 Benoit Steiner -// Copyright (C) 2015 Navdeep Jaitly -// Copyright (C) 2014 Eric Martin -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H +#if defined(__clang__) || defined(__GNUC__) +#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorContractionGpu.h file" +#endif -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - -namespace Eigen { - -template -__device__ EIGEN_STRONG_INLINE void -EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem, - const Index m_size, const Index n_size, const Index k_size) { - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - // declare and initialize 64 registers for output 8x8 block - - // prefetch registers - Scalar lhs_pf0; - Scalar lhs_pf1; - Scalar lhs_pf2; - Scalar lhs_pf3; - Scalar lhs_pf4; - Scalar lhs_pf5; - Scalar lhs_pf6; - Scalar lhs_pf7; - - Scalar rhs_pf0; - Scalar rhs_pf1; - Scalar rhs_pf2; - Scalar rhs_pf3; - Scalar rhs_pf4; - Scalar rhs_pf5; - Scalar rhs_pf6; - Scalar rhs_pf7; - - // shared memory is formatted - // (contract idx in block, nocontract idx in block, block idx) - // where block idx is column major. This transposition limits the number of - // bank conflicts when reading the LHS. The core idea is that since the contracting - // index is shared by both sides, then the contracting index should be in threadIdx.x. - - // On the LHS, we pad each row inside of each block with an extra element. This makes - // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts - // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. - - // On the RHS we just add 8 padding elements to the end of each block. This gives no bank - // conflicts on writes and also none on reads. - - // storage indices - const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z; - const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x; - - const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; - const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; - const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; - const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; - const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; - const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; - const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; - const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; - - const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; - const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; - const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; - const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; - const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; - const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; - const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; - const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; - - // in the loading code, the following variables are important: - // threadIdx.x: the vertical position in an 8x8 block - // threadIdx.y: the vertical index of the 8x8 block in the grid - // threadIdx.z: the horizontal position in an 8x8 block - // k: the horizontal index of the 8x8 block in the grid - // - // The k parameter is implicit (it was the loop counter for a loop that went - // from 0 to <8, but now that loop is unrolled in the below code. - - const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; - const Index lhs_vert = base_m + load_idx_vert; - -#define prefetchIntoRegisters(base_k) \ - { \ - lhs_pf0 = conv(0); \ - lhs_pf1 = conv(0); \ - lhs_pf2 = conv(0); \ - lhs_pf3 = conv(0); \ - lhs_pf4 = conv(0); \ - lhs_pf5 = conv(0); \ - lhs_pf6 = conv(0); \ - lhs_pf7 = conv(0); \ - \ - rhs_pf0 = conv(0); \ - rhs_pf1 = conv(0); \ - rhs_pf2 = conv(0); \ - rhs_pf3 = conv(0); \ - rhs_pf4 = conv(0); \ - rhs_pf5 = conv(0); \ - rhs_pf6 = conv(0); \ - rhs_pf7 = conv(0); \ - \ - if (!needs_edge_check || lhs_vert < m_size) { \ - const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ - const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ - const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ - const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ - const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ - const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ - const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ - const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ - \ - if (!needs_edge_check || lhs_horiz_7 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ - lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ - } else if (lhs_horiz_6 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ - } else if (lhs_horiz_5 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - } else if (lhs_horiz_4 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - } else if (lhs_horiz_3 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - } else if (lhs_horiz_2 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - } else if (lhs_horiz_1 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - } else if (lhs_horiz_0 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - } \ - } \ - \ - const Index rhs_vert = base_k + load_idx_vert; \ - if (!needs_edge_check || rhs_vert < k_size) { \ - const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ - const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ - const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ - const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ - const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ - const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ - const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ - const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ - \ - if (rhs_horiz_7 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ - rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ - } else if (rhs_horiz_6 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ - } else if (rhs_horiz_5 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - } else if (rhs_horiz_4 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - } else if (rhs_horiz_3 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - } else if (rhs_horiz_2 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - } else if (rhs_horiz_1 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - } else if (rhs_horiz_0 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - } \ - } \ - } \ - -#define writeRegToShmem(_) \ - lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ - rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ - \ - lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ - rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ - \ - lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ - rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ - \ - lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ - rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ - \ - lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ - rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ - \ - lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ - rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ - \ - lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ - rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ - \ - lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ - rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ - - // declare and initialize result array -#define res(i, j) _res_##i##j -#define initResultRow(i) \ - Scalar res(i, 0) = conv(0); \ - Scalar res(i, 1) = conv(0); \ - Scalar res(i, 2) = conv(0); \ - Scalar res(i, 3) = conv(0); \ - Scalar res(i, 4) = conv(0); \ - Scalar res(i, 5) = conv(0); \ - Scalar res(i, 6) = conv(0); \ - Scalar res(i, 7) = conv(0); \ - - internal::scalar_cast_op conv; - initResultRow(0); - initResultRow(1); - initResultRow(2); - initResultRow(3); - initResultRow(4); - initResultRow(5); - initResultRow(6); - initResultRow(7); -#undef initResultRow - - for (Index base_k = 0; base_k < k_size; base_k += 64) { - // wait for previous iteration to finish with shmem. Despite common sense, - // the code is a bit faster with this here then at bottom of loop - __syncthreads(); - - prefetchIntoRegisters(base_k); - writeRegToShmem(); - - #undef prefetchIntoRegisters - #undef writeRegToShmem - - // wait for shared mem packing to be done before starting computation - __syncthreads(); - - // compute 8x8 matrix product by outer product. This involves packing one column - // of LHS and one row of RHS into registers (takes 16 registers). - -#define lcol(i) _lcol##i - Scalar lcol(0); - Scalar lcol(1); - Scalar lcol(2); - Scalar lcol(3); - Scalar lcol(4); - Scalar lcol(5); - Scalar lcol(6); - Scalar lcol(7); - -#define rrow(j) _rrow##j - Scalar rrow(0); - Scalar rrow(1); - Scalar rrow(2); - Scalar rrow(3); - Scalar rrow(4); - Scalar rrow(5); - Scalar rrow(6); - Scalar rrow(7); - - // Now x corresponds to k, y to m, and z to n - const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; - const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; - -#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] -#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] - -#define loadData(i, j) \ - lcol(0) = lhs_element(0, j); \ - rrow(0) = rhs_element(i, 0); \ - lcol(1) = lhs_element(1, j); \ - rrow(1) = rhs_element(i, 1); \ - lcol(2) = lhs_element(2, j); \ - rrow(2) = rhs_element(i, 2); \ - lcol(3) = lhs_element(3, j); \ - rrow(3) = rhs_element(i, 3); \ - lcol(4) = lhs_element(4, j); \ - rrow(4) = rhs_element(i, 4); \ - lcol(5) = lhs_element(5, j); \ - rrow(5) = rhs_element(i, 5); \ - lcol(6) = lhs_element(6, j); \ - rrow(6) = rhs_element(i, 6); \ - lcol(7) = lhs_element(7, j); \ - rrow(7) = rhs_element(i, 7); \ - -#define computeCol(j) \ - res(0, j) += lcol(0) * rrow(j); \ - res(1, j) += lcol(1) * rrow(j); \ - res(2, j) += lcol(2) * rrow(j); \ - res(3, j) += lcol(3) * rrow(j); \ - res(4, j) += lcol(4) * rrow(j); \ - res(5, j) += lcol(5) * rrow(j); \ - res(6, j) += lcol(6) * rrow(j); \ - res(7, j) += lcol(7) * rrow(j); \ - -#define computePass(i) \ - loadData(i, i); \ - \ - computeCol(0); \ - computeCol(1); \ - computeCol(2); \ - computeCol(3); \ - computeCol(4); \ - computeCol(5); \ - computeCol(6); \ - computeCol(7); \ - - computePass(0); - computePass(1); - computePass(2); - computePass(3); - computePass(4); - computePass(5); - computePass(6); - computePass(7); - -#undef lcol -#undef rrow -#undef lhs_element -#undef rhs_element -#undef loadData -#undef computeCol -#undef computePass - } // end loop over k - - // we've now iterated over all of the large (ie width 64) k blocks and - // accumulated results in registers. At this point thread (x, y, z) contains - // the sum across all big k blocks of the product of little k block of index (x, y) - // with block of index (y, z). To compute the final output, we need to reduce - // the 8 threads over y by summation. -#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) - -#define reduceRow(i, mask) \ - shuffleInc(i, 0, mask); \ - shuffleInc(i, 1, mask); \ - shuffleInc(i, 2, mask); \ - shuffleInc(i, 3, mask); \ - shuffleInc(i, 4, mask); \ - shuffleInc(i, 5, mask); \ - shuffleInc(i, 6, mask); \ - shuffleInc(i, 7, mask); \ - -#define reduceMatrix(mask) \ - reduceRow(0, mask); \ - reduceRow(1, mask); \ - reduceRow(2, mask); \ - reduceRow(3, mask); \ - reduceRow(4, mask); \ - reduceRow(5, mask); \ - reduceRow(6, mask); \ - reduceRow(7, mask); \ - - // actually perform the reduction, now each thread of index (_, y, z) - // contains the correct values in its registers that belong in the output - // block - reduceMatrix(1); - reduceMatrix(2); - reduceMatrix(4); - -#undef shuffleInc -#undef reduceRow -#undef reduceMatrix - - // now we need to copy the 64 values into main memory. We can't split work - // among threads because all variables are in registers. There's 2 ways - // to do this: - // (1) have 1 thread do 64 writes from registers into global memory - // (2) have 1 thread do 64 writes into shared memory, and then 8 threads - // each do 8 writes into global memory. We can just overwrite the shared - // memory from the problem we just solved. - // (2) is slightly faster than (1) due to less branching and more ILP - - // TODO: won't yield much gain, but could just use currently unused shared mem - // and then we won't have to sync - // wait for shared mem to be out of use - __syncthreads(); - -#define writeResultShmem(i, j) \ - lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \ - -#define writeRow(i) \ - writeResultShmem(i, 0); \ - writeResultShmem(i, 1); \ - writeResultShmem(i, 2); \ - writeResultShmem(i, 3); \ - writeResultShmem(i, 4); \ - writeResultShmem(i, 5); \ - writeResultShmem(i, 6); \ - writeResultShmem(i, 7); \ - - if (threadIdx.x == 0) { - writeRow(0); - writeRow(1); - writeRow(2); - writeRow(3); - writeRow(4); - writeRow(5); - writeRow(6); - writeRow(7); - } -#undef writeResultShmem -#undef writeRow - - const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); - const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); - - if (threadIdx.x < max_i_write) { - if (max_j_write == 8) { - // TODO: can i trade bank conflicts for coalesced writes? - Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; - Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; - Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; - Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; - Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; - Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; - Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; - Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; - - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; - } else { -#pragma unroll 7 - for (int j = 0; j < max_j_write; j++) { - Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; - } - } - } -#undef res -} - - -template -__global__ void -__launch_bounds__(512) -EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ Scalar lhs_shmem[72 * 64]; - __shared__ Scalar rhs_shmem[72 * 64]; - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - if (base_m + 63 < m_size && base_n + 63 < n_size) { - EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); - } else { - EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); - } -} - - -template -__device__ EIGEN_STRONG_INLINE void -EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, float2 lhs_shmem2[][16], - float2 rhs_shmem2[][8], const Index m_size, - const Index n_size, const Index k_size, - const Index base_m, const Index base_n) { - typedef float Scalar; - - // prefetch registers - float4 lhs_pf0, rhs_pf0; - - float4 results[4]; - for (int i=0; i < 4; i++) { - results[i].x = results[i].y = results[i].z = results[i].w = 0; - } - - -#define prefetch_lhs(reg, row, col) \ - if (!CHECK_LHS_BOUNDARY) { \ - if (col < k_size) { \ - reg =lhs.loadPacket(row, col); \ - } \ - } else { \ - if (col < k_size) { \ - if (row + 3 < m_size) { \ - reg =lhs.loadPacket(row, col); \ - } else if (row + 2 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - reg.z =lhs(row + 2, col); \ - } else if (row + 1 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - } else if (row < m_size) { \ - reg.x =lhs(row + 0, col); \ - } \ - } \ - } \ - - - Index lhs_vert = base_m+threadIdx.x*4; - - for (Index k = 0; k < k_size; k += 16) { - lhs_pf0 = internal::pset1(0); - rhs_pf0 = internal::pset1(0); - - Index lhs_horiz = threadIdx.y+k; - prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) - - Index rhs_vert = k+(threadIdx.x%4)*4; - Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n; - - if (!CHECK_RHS_BOUNDARY) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if (rhs_vert + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } else { - if (rhs_horiz0 < n_size) { - if ((rhs_vert + 3) < k_size) { - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - } else if ((rhs_vert + 2) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if ((rhs_vert + 1) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } - } - float x1, x2 ; - // the following can be a bitwise operation..... some day. - if((threadIdx.x%8) < 4) { - x1 = rhs_pf0.y; - x2 = rhs_pf0.w; - } else { - x1 = rhs_pf0.x; - x2 = rhs_pf0.z; - } - x1 = __shfl_xor(x1, 4); - x2 = __shfl_xor(x2, 4); - if((threadIdx.x%8) < 4) { - rhs_pf0.y = x1; - rhs_pf0.w = x2; - } else { - rhs_pf0.x = x1; - rhs_pf0.z = x2; - } - - // We have 64 features. - // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. - // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. - // ... - // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 - // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 - // ... - rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); - rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); - - // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // ... - // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) - // ... - - lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); - lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); - - -#define add_vals(fl1, fl2, fr1, fr2)\ - results[0].x += fl1.x * fr1.x;\ - results[0].y += fl1.y * fr1.x;\ - results[0].z += fl2.x * fr1.x;\ - results[0].w += fl2.y * fr1.x;\ -\ - results[1].x += fl1.x * fr1.y;\ - results[1].y += fl1.y * fr1.y;\ - results[1].z += fl2.x * fr1.y;\ - results[1].w += fl2.y * fr1.y;\ -\ - results[2].x += fl1.x * fr2.x;\ - results[2].y += fl1.y * fr2.x;\ - results[2].z += fl2.x * fr2.x;\ - results[2].w += fl2.y * fr2.x;\ -\ - results[3].x += fl1.x * fr2.y;\ - results[3].y += fl1.y * fr2.y;\ - results[3].z += fl2.x * fr2.y;\ - results[3].w += fl2.y * fr2.y;\ - - __syncthreads(); - - // Do the multiplies. - #pragma unroll - for (int koff = 0; koff < 16; koff ++) { - // 32 x threads. - float2 fl1 = lhs_shmem2[koff][threadIdx.x]; - float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; - - int start_feature = threadIdx.y * 4; - float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; - float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; - - add_vals(fl1, fl2, fr1, fr2) - } - __syncthreads(); - } - -#undef prefetch_lhs -#undef add_vals - - Index horiz_base = threadIdx.y*4+base_n; - if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (!CHECK_RHS_BOUNDARY) { - // CHECK LHS - if (lhs_vert + 3 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (lhs_vert + 2 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - } - } else if (lhs_vert + 1 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - } - } else if (lhs_vert < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - } - } - } else if (!CHECK_LHS_BOUNDARY) { - // CHECK RHS - /* - int ncols_rem = fminf(n_size- horiz_base, 4); - for (int i = 0; i < ncols_rem; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - }*/ - for (int i = 0; i < 4; i++) { - if (horiz_base+i < n_size) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } else { - // CHECK both boundaries. - for (int i = 0; i < 4; i++) { - if (horiz_base+i < n_size) { - if (lhs_vert < m_size) - output(lhs_vert, horiz_base + i) = results[i].x; - if (lhs_vert + 1 < m_size) - output(lhs_vert + 1, horiz_base + i) = results[i].y; - if (lhs_vert + 2 < m_size) - output(lhs_vert + 2, horiz_base + i) = results[i].z; - if (lhs_vert + 3 < m_size) - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } -} - - -template -__device__ EIGEN_STRONG_INLINE void -EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, float2 lhs_shmem2[][32], - float2 rhs_shmem2[][8], const Index m_size, - const Index n_size, const Index k_size, - const Index base_m, const Index base_n) { - typedef float Scalar; - - // prefetch registers - float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; - float4 rhs_pf0, rhs_pf1; - - float4 results[8]; - for (int i=0; i < 8; i++) { - results[i].x = results[i].y = results[i].z = results[i].w = 0; - } - - - Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; - for (Index k = 0; k < k_size; k += 32) { - lhs_pf0 = internal::pset1(0); - lhs_pf1 = internal::pset1(0); - lhs_pf2 = internal::pset1(0); - lhs_pf3 = internal::pset1(0); - - rhs_pf0 = internal::pset1(0); - rhs_pf1 = internal::pset1(0); - - if (!CHECK_LHS_BOUNDARY) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - } - } else { - // just CHECK_LHS_BOUNDARY - if (lhs_vert + 3 < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - } - } else if (lhs_vert + 2 < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); - lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); - lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - } - } else if (lhs_vert + 1 < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); - lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - } - } else if (lhs_vert < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - } - } - } - __syncthreads(); - Index rhs_vert = k+threadIdx.x*4; - Index rhs_horiz0 = threadIdx.y*2+base_n; - Index rhs_horiz1 = threadIdx.y*2+1+base_n; - if (!CHECK_RHS_BOUNDARY) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); - } else if (rhs_vert + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - } - } else { - if (rhs_horiz1 < n_size) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); - } else if (k+threadIdx.x*4 + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - } else if (k+threadIdx.x*4 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - } - } else if (rhs_horiz0 < n_size) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - } else if ((rhs_vert + 2) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if ((rhs_vert + 1) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } - } - __syncthreads(); - // Loaded. Do computation - // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. - // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. - // .. - // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 - rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); - // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. - // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. - // .. - rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); - // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. - // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. - rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); - // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. - // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. - rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); - - // LHS. - // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) - // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) - // ... - // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) - // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) - - -#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ - results[0].x += a_feat1.x * f1.x;\ - results[1].x += a_feat1.x * f1.y;\ - results[2].x += a_feat1.x * f2.x;\ - results[3].x += a_feat1.x * f2.y;\ - results[4].x += a_feat1.x * f3.x;\ - results[5].x += a_feat1.x * f3.y;\ - results[6].x += a_feat1.x * f4.x;\ - results[7].x += a_feat1.x * f4.y;\ -\ - results[0].y += a_feat1.y * f1.x;\ - results[1].y += a_feat1.y * f1.y;\ - results[2].y += a_feat1.y * f2.x;\ - results[3].y += a_feat1.y * f2.y;\ - results[4].y += a_feat1.y * f3.x;\ - results[5].y += a_feat1.y * f3.y;\ - results[6].y += a_feat1.y * f4.x;\ - results[7].y += a_feat1.y * f4.y;\ -\ - results[0].z += a_feat2.x * f1.x;\ - results[1].z += a_feat2.x * f1.y;\ - results[2].z += a_feat2.x * f2.x;\ - results[3].z += a_feat2.x * f2.y;\ - results[4].z += a_feat2.x * f3.x;\ - results[5].z += a_feat2.x * f3.y;\ - results[6].z += a_feat2.x * f4.x;\ - results[7].z += a_feat2.x * f4.y;\ -\ - results[0].w += a_feat2.y * f1.x;\ - results[1].w += a_feat2.y * f1.y;\ - results[2].w += a_feat2.y * f2.x;\ - results[3].w += a_feat2.y * f2.y;\ - results[4].w += a_feat2.y * f3.x;\ - results[5].w += a_feat2.y * f3.y;\ - results[6].w += a_feat2.y * f4.x;\ - results[7].w += a_feat2.y * f4.y;\ - - lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); - lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); - lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y); - lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); - - lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); - lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); - lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); - lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); - - __syncthreads(); - - // Do the multiplies. - #pragma unroll - for (int koff = 0; koff < 32; koff ++) { - float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; - float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; - - // first feature is at (threadIdx.y/4) * 8 last is at start + 8. - int start_feature = (threadIdx.y / 4) * 8; - - float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; - float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; - float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; - float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; - - add_vals(a3, a4, br1, br2, br3, br4) - } - __syncthreads(); - } // end loop over k - - - __syncthreads(); - Index horiz_base = (threadIdx.y/4)*8+base_n; - if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (!CHECK_RHS_BOUNDARY) { - if (lhs_vert + 3 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (lhs_vert + 2 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - } - } else if (lhs_vert + 1 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - } - } else if (lhs_vert < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - } - } - } else if (!CHECK_LHS_BOUNDARY) { - // CHECK BOUNDARY_B - for (int i = 0; i < 8; i++) { - if (horiz_base + i < n_size) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } else { - // CHECK both boundaries. - for (int i = 0; i < 8; i++) { - if (horiz_base + i < n_size) { - if (lhs_vert < m_size) - output(lhs_vert, horiz_base + i) = results[i].x; - if (lhs_vert + 1 < m_size) - output(lhs_vert + 1, horiz_base + i) = results[i].y; - if (lhs_vert + 2 < m_size) - output(lhs_vert + 2, horiz_base + i) = results[i].z; - if (lhs_vert + 3 < m_size) - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } -} - - -template -__global__ void -__launch_bounds__(256) -EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ float2 lhs_shmem[64*32]; - __shared__ float2 rhs_shmem[128*8]; - - typedef float2 LHS_MEM[64][32]; - typedef float2 RHS_MEM[128][8]; - - typedef float2 LHS_MEM16x16[32][16]; - typedef float2 RHS_MEM16x16[64][8]; - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 128 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - bool check_rhs = (base_n + 63) >= n_size; - bool check_lhs128 = (base_m + 127) >= m_size; - - if (!check_rhs) { - if (!check_lhs128) { - // >= 128 rows left - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } - } else { - if (!check_lhs128) { - // >= 128 rows left - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } - } -} - -template -__global__ void -__launch_bounds__(256) -EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ float2 lhs_shmem[32][16]; - __shared__ float2 rhs_shmem[64][8]; - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - if (base_m + 63 < m_size) { - if (base_n + 63 < n_size) { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } - } else { - if (base_n + 63 < n_size) { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } - } -} - - -template -struct TensorEvaluator, GpuDevice> : - public TensorContractionEvaluatorBase, GpuDevice> > { - - typedef GpuDevice Device; - - typedef TensorEvaluator, Device> Self; - typedef TensorContractionEvaluatorBase Base; - - typedef TensorContractionOp XprType; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - enum { - Layout = TensorEvaluator::Layout, - }; - - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size::Dimensions>::value; - static const int RDims = - internal::array_size::Dimensions>::value; - static const int ContractDims = internal::array_size::value; - - typedef array left_dim_mapper_t; - typedef array right_dim_mapper_t; - - typedef array contract_t; - typedef array left_nocontract_t; - typedef array right_nocontract_t; - - static const int NumDims = LDims + RDims - 2 * ContractDims; - - typedef DSizes Dimensions; - - // typedefs needed in evalTo - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - - typedef typename LeftEvaluator::Dimensions LeftDimensions; - typedef typename RightEvaluator::Dimensions RightDimensions; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) {} - - // We need to redefine this method to make nvcc happy - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - this->m_leftImpl.evalSubExprsIfNeeded(NULL); - this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { - evalTo(data); - return false; - } else { - this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); - evalTo(this->m_result); - return true; - } - } - - void evalTo(Scalar* buffer) const { - if (this->m_lhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - } - else { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - } - } - - template struct LaunchKernels { - static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { - const Index m_blocks = (m + 63) / 64; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 8, 8); - LAUNCH_CUDA_KERNEL((EigenContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); - } - }; - - template struct LaunchKernels { - static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { - if (m < 768 || n < 768) { - const Index m_blocks = (m + 63) / 64; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(16, 16, 1); - LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); - } else { - const Index m_blocks = (m + 127) / 128; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 32, 1); - LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); - } - } - }; - - template - void evalTyped(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - EIGEN_UNUSED_VARIABLE(k) - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - - typedef internal::blas_data_mapper OutputMapper; - - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - - setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte); - LaunchKernels::Run(lhs, rhs, output, m, n, k, this->m_device); - } -}; - -} // end namespace Eigen - -#endif // EIGEN_USE_GPU and __CUDACC__ -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H +#include "TensorContractionGpu.h" diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h new file mode 100644 index 000000000..c81803827 --- /dev/null +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h @@ -0,0 +1,1413 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014-2015 Benoit Steiner +// Copyright (C) 2015 Navdeep Jaitly +// Copyright (C) 2014 Eric Martin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H + +#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) + +namespace Eigen { + +template +__device__ EIGEN_STRONG_INLINE void +EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem, + const Index m_size, const Index n_size, const Index k_size) { + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + // declare and initialize 64 registers for output 8x8 block + + // prefetch registers + Scalar lhs_pf0; + Scalar lhs_pf1; + Scalar lhs_pf2; + Scalar lhs_pf3; + Scalar lhs_pf4; + Scalar lhs_pf5; + Scalar lhs_pf6; + Scalar lhs_pf7; + + Scalar rhs_pf0; + Scalar rhs_pf1; + Scalar rhs_pf2; + Scalar rhs_pf3; + Scalar rhs_pf4; + Scalar rhs_pf5; + Scalar rhs_pf6; + Scalar rhs_pf7; + + // shared memory is formatted + // (contract idx in block, nocontract idx in block, block idx) + // where block idx is column major. This transposition limits the number of + // bank conflicts when reading the LHS. The core idea is that since the contracting + // index is shared by both sides, then the contracting index should be in threadIdx.x. + + // On the LHS, we pad each row inside of each block with an extra element. This makes + // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts + // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. + + // On the RHS we just add 8 padding elements to the end of each block. This gives no bank + // conflicts on writes and also none on reads. + + // storage indices + const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z; + const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x; + + const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; + const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; + const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; + const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; + const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; + const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; + const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; + const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; + + const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; + const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; + const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; + const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; + const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; + const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; + const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; + const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; + + // in the loading code, the following variables are important: + // threadIdx.x: the vertical position in an 8x8 block + // threadIdx.y: the vertical index of the 8x8 block in the grid + // threadIdx.z: the horizontal position in an 8x8 block + // k: the horizontal index of the 8x8 block in the grid + // + // The k parameter is implicit (it was the loop counter for a loop that went + // from 0 to <8, but now that loop is unrolled in the below code. + + const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; + const Index lhs_vert = base_m + load_idx_vert; + +#define prefetchIntoRegisters(base_k) \ + { \ + lhs_pf0 = conv(0); \ + lhs_pf1 = conv(0); \ + lhs_pf2 = conv(0); \ + lhs_pf3 = conv(0); \ + lhs_pf4 = conv(0); \ + lhs_pf5 = conv(0); \ + lhs_pf6 = conv(0); \ + lhs_pf7 = conv(0); \ + \ + rhs_pf0 = conv(0); \ + rhs_pf1 = conv(0); \ + rhs_pf2 = conv(0); \ + rhs_pf3 = conv(0); \ + rhs_pf4 = conv(0); \ + rhs_pf5 = conv(0); \ + rhs_pf6 = conv(0); \ + rhs_pf7 = conv(0); \ + \ + if (!needs_edge_check || lhs_vert < m_size) { \ + const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ + const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ + const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ + const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ + const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ + const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ + const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ + const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ + \ + if (!needs_edge_check || lhs_horiz_7 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ + } else if (lhs_horiz_6 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + } else if (lhs_horiz_5 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + } else if (lhs_horiz_4 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + } else if (lhs_horiz_3 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + } else if (lhs_horiz_2 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + } else if (lhs_horiz_1 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + } \ + } \ + \ + const Index rhs_vert = base_k + load_idx_vert; \ + if (!needs_edge_check || rhs_vert < k_size) { \ + const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ + const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ + const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ + const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ + const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ + const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ + const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ + const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ + \ + if (rhs_horiz_7 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ + } else if (rhs_horiz_6 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + } else if (rhs_horiz_5 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + } else if (rhs_horiz_4 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + } else if (rhs_horiz_3 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + } else if (rhs_horiz_2 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + } else if (rhs_horiz_1 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + } \ + } \ + } \ + +#define writeRegToShmem(_) \ + lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ + rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ + \ + lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ + rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ + \ + lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ + rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ + \ + lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ + rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ + \ + lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ + rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ + \ + lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ + rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ + \ + lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ + rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ + \ + lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ + rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ + + // declare and initialize result array +#define res(i, j) _res_##i##j +#define initResultRow(i) \ + Scalar res(i, 0) = conv(0); \ + Scalar res(i, 1) = conv(0); \ + Scalar res(i, 2) = conv(0); \ + Scalar res(i, 3) = conv(0); \ + Scalar res(i, 4) = conv(0); \ + Scalar res(i, 5) = conv(0); \ + Scalar res(i, 6) = conv(0); \ + Scalar res(i, 7) = conv(0); \ + + internal::scalar_cast_op conv; + initResultRow(0); + initResultRow(1); + initResultRow(2); + initResultRow(3); + initResultRow(4); + initResultRow(5); + initResultRow(6); + initResultRow(7); +#undef initResultRow + + for (Index base_k = 0; base_k < k_size; base_k += 64) { + // wait for previous iteration to finish with shmem. Despite common sense, + // the code is a bit faster with this here then at bottom of loop + __syncthreads(); + + prefetchIntoRegisters(base_k); + writeRegToShmem(); + + #undef prefetchIntoRegisters + #undef writeRegToShmem + + // wait for shared mem packing to be done before starting computation + __syncthreads(); + + // compute 8x8 matrix product by outer product. This involves packing one column + // of LHS and one row of RHS into registers (takes 16 registers). + +#define lcol(i) _lcol##i + Scalar lcol(0); + Scalar lcol(1); + Scalar lcol(2); + Scalar lcol(3); + Scalar lcol(4); + Scalar lcol(5); + Scalar lcol(6); + Scalar lcol(7); + +#define rrow(j) _rrow##j + Scalar rrow(0); + Scalar rrow(1); + Scalar rrow(2); + Scalar rrow(3); + Scalar rrow(4); + Scalar rrow(5); + Scalar rrow(6); + Scalar rrow(7); + + // Now x corresponds to k, y to m, and z to n + const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; + const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; + +#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] +#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] + +#define loadData(i, j) \ + lcol(0) = lhs_element(0, j); \ + rrow(0) = rhs_element(i, 0); \ + lcol(1) = lhs_element(1, j); \ + rrow(1) = rhs_element(i, 1); \ + lcol(2) = lhs_element(2, j); \ + rrow(2) = rhs_element(i, 2); \ + lcol(3) = lhs_element(3, j); \ + rrow(3) = rhs_element(i, 3); \ + lcol(4) = lhs_element(4, j); \ + rrow(4) = rhs_element(i, 4); \ + lcol(5) = lhs_element(5, j); \ + rrow(5) = rhs_element(i, 5); \ + lcol(6) = lhs_element(6, j); \ + rrow(6) = rhs_element(i, 6); \ + lcol(7) = lhs_element(7, j); \ + rrow(7) = rhs_element(i, 7); \ + +#define computeCol(j) \ + res(0, j) += lcol(0) * rrow(j); \ + res(1, j) += lcol(1) * rrow(j); \ + res(2, j) += lcol(2) * rrow(j); \ + res(3, j) += lcol(3) * rrow(j); \ + res(4, j) += lcol(4) * rrow(j); \ + res(5, j) += lcol(5) * rrow(j); \ + res(6, j) += lcol(6) * rrow(j); \ + res(7, j) += lcol(7) * rrow(j); \ + +#define computePass(i) \ + loadData(i, i); \ + \ + computeCol(0); \ + computeCol(1); \ + computeCol(2); \ + computeCol(3); \ + computeCol(4); \ + computeCol(5); \ + computeCol(6); \ + computeCol(7); \ + + computePass(0); + computePass(1); + computePass(2); + computePass(3); + computePass(4); + computePass(5); + computePass(6); + computePass(7); + +#undef lcol +#undef rrow +#undef lhs_element +#undef rhs_element +#undef loadData +#undef computeCol +#undef computePass + } // end loop over k + + // we've now iterated over all of the large (ie width 64) k blocks and + // accumulated results in registers. At this point thread (x, y, z) contains + // the sum across all big k blocks of the product of little k block of index (x, y) + // with block of index (y, z). To compute the final output, we need to reduce + // the 8 threads over y by summation. +#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000) +#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) +#else +#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask) +#endif + +#define reduceRow(i, mask) \ + shuffleInc(i, 0, mask); \ + shuffleInc(i, 1, mask); \ + shuffleInc(i, 2, mask); \ + shuffleInc(i, 3, mask); \ + shuffleInc(i, 4, mask); \ + shuffleInc(i, 5, mask); \ + shuffleInc(i, 6, mask); \ + shuffleInc(i, 7, mask); \ + +#define reduceMatrix(mask) \ + reduceRow(0, mask); \ + reduceRow(1, mask); \ + reduceRow(2, mask); \ + reduceRow(3, mask); \ + reduceRow(4, mask); \ + reduceRow(5, mask); \ + reduceRow(6, mask); \ + reduceRow(7, mask); \ + + // actually perform the reduction, now each thread of index (_, y, z) + // contains the correct values in its registers that belong in the output + // block + reduceMatrix(1); + reduceMatrix(2); + reduceMatrix(4); + +#undef shuffleInc +#undef reduceRow +#undef reduceMatrix + + // now we need to copy the 64 values into main memory. We can't split work + // among threads because all variables are in registers. There's 2 ways + // to do this: + // (1) have 1 thread do 64 writes from registers into global memory + // (2) have 1 thread do 64 writes into shared memory, and then 8 threads + // each do 8 writes into global memory. We can just overwrite the shared + // memory from the problem we just solved. + // (2) is slightly faster than (1) due to less branching and more ILP + + // TODO: won't yield much gain, but could just use currently unused shared mem + // and then we won't have to sync + // wait for shared mem to be out of use + __syncthreads(); + +#define writeResultShmem(i, j) \ + lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \ + +#define writeRow(i) \ + writeResultShmem(i, 0); \ + writeResultShmem(i, 1); \ + writeResultShmem(i, 2); \ + writeResultShmem(i, 3); \ + writeResultShmem(i, 4); \ + writeResultShmem(i, 5); \ + writeResultShmem(i, 6); \ + writeResultShmem(i, 7); \ + + if (threadIdx.x == 0) { + writeRow(0); + writeRow(1); + writeRow(2); + writeRow(3); + writeRow(4); + writeRow(5); + writeRow(6); + writeRow(7); + } +#undef writeResultShmem +#undef writeRow + + const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); + const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); + + if (threadIdx.x < max_i_write) { + if (max_j_write == 8) { + // TODO: can i trade bank conflicts for coalesced writes? + Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; + Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; + Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; + Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; + Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; + Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; + Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; + Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; + + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; + } else { +#pragma unroll 7 + for (int j = 0; j < max_j_write; j++) { + Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; + } + } + } +#undef res +} + + +template +__global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(512, 1) +#else +__launch_bounds__(512) +#endif +EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ Scalar lhs_shmem[72 * 64]; + __shared__ Scalar rhs_shmem[72 * 64]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } +} + + +template +__device__ __forceinline__ void +EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][16], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + + // prefetch registers + float4 lhs_pf0, rhs_pf0; + + float4 results[4]; + for (int i=0; i < 4; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg =lhs.template loadPacket(row, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg =lhs.template loadPacket(row, col); \ + } else if (row + 2 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + } \ + } \ + + Index lhs_vert = base_m+threadIdx.x*4; + + for (Index k = 0; k < k_size; k += 16) { + + lhs_pf0 = internal::pset1(0); + rhs_pf0 = internal::pset1(0); + + Index lhs_horiz = threadIdx.y+k; + prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) + + Index rhs_vert = k+(threadIdx.x%4)*4; + Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n; + + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } else { + if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + float x1, x2 ; + // the following can be a bitwise operation..... some day. + if((threadIdx.x%8) < 4) { + x1 = rhs_pf0.y; + x2 = rhs_pf0.w; + } else { + x1 = rhs_pf0.x; + x2 = rhs_pf0.z; + } + #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000) + x1 = __shfl_xor(x1, 4); + x2 = __shfl_xor(x2, 4); + #else + x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4); + x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4); + #endif + if((threadIdx.x%8) < 4) { + rhs_pf0.y = x1; + rhs_pf0.w = x2; + } else { + rhs_pf0.x = x1; + rhs_pf0.z = x2; + } + + // We have 64 features. + // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. + // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. + // ... + // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 + // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 + // ... + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); + + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // ... + // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) + // ... + + lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); + + +#define add_vals(fl1, fl2, fr1, fr2)\ + results[0].x += fl1.x * fr1.x;\ + results[0].y += fl1.y * fr1.x;\ + results[0].z += fl2.x * fr1.x;\ + results[0].w += fl2.y * fr1.x;\ +\ + results[1].x += fl1.x * fr1.y;\ + results[1].y += fl1.y * fr1.y;\ + results[1].z += fl2.x * fr1.y;\ + results[1].w += fl2.y * fr1.y;\ +\ + results[2].x += fl1.x * fr2.x;\ + results[2].y += fl1.y * fr2.x;\ + results[2].z += fl2.x * fr2.x;\ + results[2].w += fl2.y * fr2.x;\ +\ + results[3].x += fl1.x * fr2.y;\ + results[3].y += fl1.y * fr2.y;\ + results[3].z += fl2.x * fr2.y;\ + results[3].w += fl2.y * fr2.y;\ + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 16; koff ++) { + // 32 x threads. + float2 fl1 = lhs_shmem2[koff][threadIdx.x]; + float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; + + int start_feature = threadIdx.y * 4; + float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + + add_vals(fl1, fl2, fr1, fr2) + } + __syncthreads(); + } + +#undef prefetch_lhs +#undef add_vals + + Index horiz_base = threadIdx.y*4+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + // CHECK LHS + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK RHS + /* + int ncols_rem = fminf(n_size- horiz_base, 4); + for (int i = 0; i < ncols_rem; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + }*/ + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template +__device__ __forceinline__ void +EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][32], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + + // prefetch registers + float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; + float4 rhs_pf0, rhs_pf1; + + float4 results[8]; + for (int i=0; i < 8; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; + for (Index k = 0; k < k_size; k += 32) { + lhs_pf0 = internal::pset1(0); + lhs_pf1 = internal::pset1(0); + lhs_pf2 = internal::pset1(0); + lhs_pf3 = internal::pset1(0); + + rhs_pf0 = internal::pset1(0); + rhs_pf1 = internal::pset1(0); + + if (!CHECK_LHS_BOUNDARY) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else { + // just CHECK_LHS_BOUNDARY + if (lhs_vert + 3 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 2 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 1 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + } + } else if (lhs_vert < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + } + } + } + __syncthreads(); + Index rhs_vert = k+threadIdx.x*4; + Index rhs_horiz0 = threadIdx.y*2+base_n; + Index rhs_horiz1 = threadIdx.y*2+1+base_n; + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else { + if (rhs_horiz1 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (k+threadIdx.x*4 + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (k+threadIdx.x*4 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + __syncthreads(); + // Loaded. Do computation + // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. + // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. + // .. + // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 + rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); + // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. + // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. + // .. + rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); + // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. + // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. + rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); + // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. + // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. + rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); + + // LHS. + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // ... + // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + + +#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ + results[0].x += a_feat1.x * f1.x;\ + results[1].x += a_feat1.x * f1.y;\ + results[2].x += a_feat1.x * f2.x;\ + results[3].x += a_feat1.x * f2.y;\ + results[4].x += a_feat1.x * f3.x;\ + results[5].x += a_feat1.x * f3.y;\ + results[6].x += a_feat1.x * f4.x;\ + results[7].x += a_feat1.x * f4.y;\ +\ + results[0].y += a_feat1.y * f1.x;\ + results[1].y += a_feat1.y * f1.y;\ + results[2].y += a_feat1.y * f2.x;\ + results[3].y += a_feat1.y * f2.y;\ + results[4].y += a_feat1.y * f3.x;\ + results[5].y += a_feat1.y * f3.y;\ + results[6].y += a_feat1.y * f4.x;\ + results[7].y += a_feat1.y * f4.y;\ +\ + results[0].z += a_feat2.x * f1.x;\ + results[1].z += a_feat2.x * f1.y;\ + results[2].z += a_feat2.x * f2.x;\ + results[3].z += a_feat2.x * f2.y;\ + results[4].z += a_feat2.x * f3.x;\ + results[5].z += a_feat2.x * f3.y;\ + results[6].z += a_feat2.x * f4.x;\ + results[7].z += a_feat2.x * f4.y;\ +\ + results[0].w += a_feat2.y * f1.x;\ + results[1].w += a_feat2.y * f1.y;\ + results[2].w += a_feat2.y * f2.x;\ + results[3].w += a_feat2.y * f2.y;\ + results[4].w += a_feat2.y * f3.x;\ + results[5].w += a_feat2.y * f3.y;\ + results[6].w += a_feat2.y * f4.x;\ + results[7].w += a_feat2.y * f4.y;\ + + lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); + lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y); + lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); + + lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); + lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); + lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); + lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 32; koff ++) { + float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; + float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; + + // first feature is at (threadIdx.y/4) * 8 last is at start + 8. + int start_feature = (threadIdx.y / 4) * 8; + + float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; + float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; + float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; + float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; + + add_vals(a3, a4, br1, br2, br3, br4) + } + __syncthreads(); + } // end loop over k + + __syncthreads(); + Index horiz_base = (threadIdx.y/4)*8+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK BOUNDARY_B + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template +__global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(256, 1) +#else +__launch_bounds__(256) +#endif +EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[64*32]; + __shared__ float2 rhs_shmem[128*8]; + + typedef float2 LHS_MEM[64][32]; + typedef float2 RHS_MEM[128][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 128 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + bool check_rhs = (base_n + 63) >= n_size; + bool check_lhs128 = (base_m + 127) >= m_size; + + if (!check_rhs) { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } else { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } +} + +template +__global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(256, 1) +#else +__launch_bounds__(256) +#endif +EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[32][16]; + __shared__ float2 rhs_shmem[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size) { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } else { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } +} + + +template +struct TensorEvaluator, GpuDevice> : + public TensorContractionEvaluatorBase, GpuDevice> > { + + typedef GpuDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + enum { + Layout = TensorEvaluator::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) + { + EIGEN_STATIC_ASSERT( (internal::is_same::value), + GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS); + } + + // We need to redefine this method to make nvcc happy + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); + evalTo(this->m_result); + return true; + } + } + + void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + } + + template struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + LAUNCH_GPU_KERNEL((EigenContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } + }; + + template struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + if (m < 768 || n < 768) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(16, 16, 1); + LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } else { + const Index m_blocks = (m + 127) / 128; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 32, 1); + LAUNCH_GPU_KERNEL((EigenFloatContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } + } + }; + + template + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + EIGEN_UNUSED_VARIABLE(k) + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + +#if defined(EIGEN_USE_HIP) + setGpuSharedMemConfig(hipSharedMemBankSizeEightByte); +#else + setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte); +#endif + + LaunchKernels::Run(lhs, rhs, output, m, n, k, this->m_device); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_GPU and EIGEN_GPUCC +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index 9b2cb3ff6..9ab900b4a 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -22,8 +22,19 @@ enum { /* * Implementation of the Eigen blas_data_mapper class for tensors. */ - -template struct CoeffLoader { +/// The make pointer class is used by sycl in order to build the mapper class on the device. For other platform the default make pointer is used which +/// is scalar * for CoeffLoader. +template class MakePointer_ = MakePointer> +struct CoeffLoader; + +template class MakePointer_ = MakePointer> +class BaseTensorContractionMapper; + +template class MakePointer_> +struct CoeffLoader { enum { DirectOffsets = false }; @@ -34,6 +45,12 @@ template struct CoeffLoader { eigen_assert(false && "unsupported"); } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_::Type + data() const { + eigen_assert(false && "unsupported"); + return NULL; + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -42,12 +59,19 @@ template struct CoeffLoader { return m_tensor.template packet(index); } + #ifdef EIGEN_USE_SYCL + // The placeholder accessors require to be bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_tensor.bind(cgh); + } + #endif private: const Tensor m_tensor; }; -template struct CoeffLoader { +template class MakePointer_> +struct CoeffLoader { enum { DirectOffsets = true }; @@ -58,6 +82,11 @@ template struct CoeffLoader { m_data += offset; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_::Type + data() const { + return m_data; + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -65,15 +94,23 @@ template struct CoeffLoader { { return internal::ploadt_ro(m_data + index); } + + #ifdef EIGEN_USE_SYCL + // The placeholder accessors require to be bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } + #endif private: typedef typename Tensor::Scalar Scalar; - const Scalar* m_data; + + typename MakePointer_::Type m_data; }; template + int packet_size, bool inner_dim_contiguous, int Alignment, template class MakePointer_ = MakePointer> class SimpleTensorContractionMapper { public: EIGEN_DEVICE_FUNC @@ -89,7 +126,7 @@ class SimpleTensorContractionMapper { m_k_strides(k_strides) { } enum { - DirectOffsets = CoeffLoader::DirectOffsets + DirectOffsets = CoeffLoader::DirectOffsets }; EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { @@ -113,8 +150,10 @@ class SimpleTensorContractionMapper { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { const bool left = (side == Lhs); + EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 Index nocontract_val = left ? row : col; Index linidx = 0; + EIGEN_UNROLL_LOOP for (int i = static_cast(array_size::value) - 1; i > 0; i--) { const Index idx = nocontract_val / m_ij_strides[i]; linidx += idx * m_nocontract_strides[i]; @@ -131,6 +170,7 @@ class SimpleTensorContractionMapper { Index contract_val = left ? col : row; if(array_size::value > 0) { + EIGEN_UNROLL_LOOP for (int i = static_cast(array_size::value) - 1; i > 0; i--) { const Index idx = contract_val / m_k_strides[i]; linidx += idx * m_contract_strides[i]; @@ -151,9 +191,11 @@ class SimpleTensorContractionMapper { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, const Index distance) const { const bool left = (side == Lhs); + EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; Index linidx[2] = {0, 0}; if (array_size::value > array_size::value) { + EIGEN_UNROLL_LOOP for (int i = static_cast(array_size::value) - 1; i > 0; i--) { const Index idx0 = nocontract_val[0] / m_ij_strides[i]; const Index idx1 = nocontract_val[1] / m_ij_strides[i]; @@ -174,6 +216,7 @@ class SimpleTensorContractionMapper { Index contract_val[2] = {left ? col : row, left ? col : row + distance}; if (array_size::value> 0) { + EIGEN_UNROLL_LOOP for (int i = static_cast(array_size::value) - 1; i > 0; i--) { const Index idx0 = contract_val[0] / m_k_strides[i]; const Index idx1 = contract_val[1] / m_k_strides[i]; @@ -205,24 +248,41 @@ class SimpleTensorContractionMapper { return ((side == Lhs) && inner_dim_contiguous && array_size::value > 0) ? m_contract_strides[0] : 1; } + #ifdef EIGEN_USE_SYCL + // The placeholder accessors require to be bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_tensor.bind(cgh); + } + #endif + + const CoeffLoader& tensor() const { + return m_tensor; + } + + const nocontract_t& nocontract_strides() const { + return m_nocontract_strides; + } + const nocontract_t& ij_strides() const { return m_ij_strides; } + const contract_t& contract_strides() const { return m_contract_strides; } + const contract_t& k_strides() const { return m_k_strides; } + protected: - CoeffLoader m_tensor; + CoeffLoader m_tensor; const nocontract_t m_nocontract_strides; const nocontract_t m_ij_strides; const contract_t m_contract_strides; const contract_t m_k_strides; }; - template -class BaseTensorContractionMapper : public SimpleTensorContractionMapper + bool inner_dim_reordered, int Alignment, template class MakePointer_> +class BaseTensorContractionMapper : public SimpleTensorContractionMapper { public: - typedef SimpleTensorContractionMapper ParentMapper; + typedef SimpleTensorContractionMapper ParentMapper; EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, @@ -232,12 +292,11 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper::half HalfPacket; - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if::size==packet_size,PacketT>::type + load(Index i, Index j) const + { // whole method makes column major assumption // don't need to add offsets for now (because operator handles that) @@ -252,7 +311,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper indexPair = this->computeIndexPair(i, j, packet_size - 1); const Index first = indexPair.first; - const Index last = indexPair.second; + const Index lastIdx = indexPair.second; // We can always do optimized packet reads from left hand side right now, because // the vertical matrix dimension on the left hand side is never contracting. @@ -260,7 +319,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper::value <= 1 || !inner_dim_reordered) && - (last - first) == (packet_size - 1)) { + (lastIdx - first) == (packet_size - 1)) { return this->m_tensor.template packet(first); } @@ -268,31 +327,44 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapperm_tensor.coeff(first); + EIGEN_UNROLL_LOOP for (Index k = 1; k < packet_size - 1; k += 2) { const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); data[k] = this->m_tensor.coeff(internal_pair.first); data[k + 1] = this->m_tensor.coeff(internal_pair.second); } - data[packet_size - 1] = this->m_tensor.coeff(last); + data[packet_size - 1] = this->m_tensor.coeff(lastIdx); - return pload(data); + return pload(data); } - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { - // whole method makes column major assumption + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if::size!=packet_size,PacketT>::type + load(Index i, Index j) const + { + const Index requested_packet_size = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX Scalar data[requested_packet_size]; - // don't need to add offsets for now (because operator handles that) - const Index half_packet_size = unpacket_traits::size; - if (half_packet_size == packet_size) { - return loadPacket(i, j); - } - EIGEN_ALIGN_MAX Scalar data[half_packet_size]; - for (Index k = 0; k < half_packet_size; k++) { - data[k] = operator()(i + k, j); + const IndexPair indexPair = this->computeIndexPair(i, j, requested_packet_size - 1); + const Index first = indexPair.first; + const Index lastIdx = indexPair.second; + + data[0] = this->m_tensor.coeff(first); + for (Index k = 1; k < requested_packet_size - 1; k += 2) { + const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); } - return pload(data); + data[requested_packet_size - 1] = this->m_tensor.coeff(lastIdx); + + return pload(data); + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const { + return this->load(i,j); } }; @@ -301,11 +373,12 @@ template -class BaseTensorContractionMapper : public SimpleTensorContractionMapper + bool inner_dim_reordered, int Alignment, template class MakePointer_> +class BaseTensorContractionMapper + : public SimpleTensorContractionMapper { public: - typedef SimpleTensorContractionMapper ParentMapper; + typedef SimpleTensorContractionMapper ParentMapper; EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, @@ -315,16 +388,17 @@ class BaseTensorContractionMapper EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const { EIGEN_ALIGN_MAX Scalar data[1]; data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); - return pload(data); + return pload(data); } - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { - return loadPacket(i, j); + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const { + EIGEN_ALIGN_MAX Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload(data); } }; @@ -333,14 +407,12 @@ template + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template class MakePointer_=MakePointer> class TensorContractionSubMapper { public: - typedef typename Tensor::PacketReturnType Packet; - typedef typename unpacket_traits::half HalfPacket; - typedef BaseTensorContractionMapper ParentMapper; - typedef TensorContractionSubMapper Self; + typedef BaseTensorContractionMapper ParentMapper; + typedef TensorContractionSubMapper Self; typedef Self LinearMapper; enum { @@ -372,27 +444,32 @@ class TensorContractionSubMapper { return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i) const { if (UseDirectOffsets) { - return m_base_mapper.template loadPacket(i, 0); + return m_base_mapper.template loadPacket(i, 0); } - return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const { if (UseDirectOffsets) { - return m_base_mapper.template loadPacket(i, j); + return m_base_mapper.template loadPacket(i, j); } - return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const { if (UseDirectOffsets) { - return m_base_mapper.template loadHalfPacket(i, 0); + return m_base_mapper.template load(i, j); } - return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketT& p) const { if (UseDirectOffsets) { m_base_mapper.storePacket(i, 0, p); } @@ -408,19 +485,30 @@ class TensorContractionSubMapper { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { - EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned; if (UseDirectOffsets) { - return m_base_mapper.template loadPacket(i, 0); + return m_base_mapper.template loadPacket(i, 0); } - return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); } - template + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { return false; } + #ifdef EIGEN_USE_SYCL + // The placeholder accessors require to be bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_base_mapper.bind(cgh); + } + #endif + + const ParentMapper& base_mapper() const { return m_base_mapper; } + Index vert_offset() const { return m_vert_offset; } + Index horiz_offset() const { return m_horiz_offset; } + private: ParentMapper m_base_mapper; const Index m_vert_offset; @@ -432,14 +520,14 @@ template + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template class MakePointer_=MakePointer> class TensorContractionInputMapper - : public BaseTensorContractionMapper { + : public BaseTensorContractionMapper { public: typedef Scalar_ Scalar; - typedef BaseTensorContractionMapper Base; - typedef TensorContractionSubMapper SubMapper; + typedef BaseTensorContractionMapper Base; + typedef TensorContractionSubMapper SubMapper; typedef SubMapper VectorMapper; EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, @@ -457,9 +545,29 @@ class TensorContractionInputMapper EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { return VectorMapper(*this, i, j); } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader& get_tensor() const { + return Base::m_tensor; + } }; +template struct TensorContractionInputMapperTrait; + +template class MakePointer_> +struct TensorContractionInputMapperTrait > { + + typedef Tensor_ XprType; + static const bool inner_dim_contiguous = inner_dim_contiguous_; + static const bool inner_dim_reordered = inner_dim_reordered_; + }; + } // end namespace internal } // end namespace Eigen diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h new file mode 100755 index 000000000..473c22849 --- /dev/null +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -0,0 +1,1650 @@ +// This file is part of Eigen, a lightweight C++ template library for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not +// distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TensorContractionSycl.h + * + * \brief: + * TensorContractionSycl.h, provides various tensor contraction kernel for SYCL backend + * + *****************************************************************/ + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H + +namespace Eigen { + +namespace TensorSycl { +namespace internal { + +#ifndef EIGEN_SYCL_DISABLE_GEMV +/*! + * \brief TVPanelSize, a template class used for setting the panel size required for launching General TensorVector + * contraction kernel on various hardware devices. + * + * \tparam Scalar: determines the element type of the tensor/vector + * + * \tparam StorageIndex determines the Index type. + * + * \tparam NCWindow: determines the number of non-contracting element to be process by each work-group + * + * \tparam CFactor: determines the number of contracting element to be process by each thread + * + * \tparam NCFactor: determines the number of non-contracting element to be process by each thread + */ +template +struct TVPanelSize { + // LocalThreadSizeC: determines total number of thread per workgroup for the contracting dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0; + // LocalThreadSizeNC: determines total number of thread per workgroup for the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1; + // TileSizeDimNC: determines the tile size for the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor; + // TileSizeDimC: determines the tile size for the contracting dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC; + // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC; + // WorkLoadPerThreadC: determines workload per thread for loading the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC; + // BC : determines if supporting bank conflict is required + static EIGEN_CONSTEXPR bool BC = false; +}; +#endif + +/*! + * \brief TTPanelSize, a template class used for setting the panel size required for launching General Tensor Tensor + contraction kernel on various hardware devices. + * + * \tparam Scalar: determines the element type of the tensor + * + * \tparam StorageIndex: determines the Index type. + * + * \tparam REG_SIZE_M: determines workload per thread for loading the M dimension This can be varied based on the + available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro). + * + * \tparam REG_SIZE_N: determines workload per thread for loading the N dimension This can be varied based on the + available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro). + * + * \tparam TSDK: determines Tile size for dimension K. The packet size is assumed to be considered + */ + +template +struct TTPanelSize { + // TileSizeDimK: determines Tile size for dimension K. The packet size is assumed to be considered + static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK; + // WorkLoadPerThreadM : determines workload per thread for loading the M dimension This can be varied based on the + // available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro// +#ifndef EIGEN_SYCL_REG_M + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M; +#else + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M; +#endif +// WorkLoadPerThreadN : determines workload per thread for loading the N dimension This can be varied based on the +// available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro +#ifndef EIGEN_SYCL_REG_N + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N; +#else + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N; +#endif + // LocalThreadSizeM: determines total number of thread per workgroup for the m dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0; + // LocalThreadSizeN: determines total number of thread per workgroup for the n dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1; + // TileSizeDimM: determines the tile size for the m dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM; + // TileSizeDimN: determines the tile size for the n dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN; + // LoadPerThreadLhs: determines workload per thread for loading Lhs Tensor. This must be divisable by packetsize + static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs = + ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN)); + // LoadPerThreadRhs: determines workload per thread for loading Rhs Tensor. This must be divisable by packetsize + static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs = + ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM)); + // BC : determines if supporting bank conflict is required + static EIGEN_CONSTEXPR bool BC = true; + // DoubleBuffer: determines if double buffering technique should be used (This can be disabled by + // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device doesnot have sufficient local memory) + static EIGEN_CONSTEXPR bool DoubleBuffer = +#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER + false; +#else + true; +#endif +}; + +/* ! + * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to + * specialize the contraction algorithm based on device support for dedicated local memory. + */ +enum class contraction_type { local, no_local }; +/* ! + * \brief data_source an enum class determining the location of the data in a memory hierarchy (global, local, private). + */ +enum class data_source { global_mem, local_mem, private_mem }; + +/*! + * \brief read, a template function used for loading the data from global + memory. This function is used to guarantee coalesced and vectorized load whenever possible + * + * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode + * + * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and + vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. + * + * \tparam PacketType: determines the type of packet + * + * \tparam TensorMapper: determines the input tensor mapper type + * + * \tparam StorageIndex: determines the Index type + + * \param tensorMapper: is the input tensor + * + * \param NCIndex: is the non-contracting dim index + * + * \param CIndex is the contracting dim index + * + * \param ld: is the leading dimension of the flattened tensor + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type read( + const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) { + const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex; + const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex; + return tensorMapper.get_tensor().template packet(row + (col * ld)); +} + +/*! + * \brief read, special overload of read function, when the read access is not vectorized + * + * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode + * + * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and + vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. + * + * \tparam PacketType: determines the type of packet + * + * \tparam TensorMapper: determines the input tensor mapper type + * + * \tparam StorageIndex: determines the Index type + + * \param tensorMapper: is the input tensor + * + * \param NCIndex: is the non-contracting dim index + * + * \param CIndex: is the contracting dim index + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type read( + const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) { + const StorageIndex row = (IsRhs) ? CIndex : NCIndex; + const StorageIndex col = (IsRhs) ? NCIndex : CIndex; + return tensorMapper(row, col); +} + +/*! + * \brief write, a template function used for storing the data to local memory. This function is used to guarantee + * coalesced and vectorized store whenever possible. + * + * \tparam StorageIndex: determines the Index type + * + * \param ld is the leading dimension of the local memory. ld is a compile time value for the local memory + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + * + * \param CIndex is the contracting dim index + */ + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if
::type + write(PacketType &packet_data, DataScalar ptr) { + EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits::size; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; i++) { + *ptr = PacketWrapper::scalarize(i, packet_data); + ptr += ld; + } +} + +/*! + * \brief Overloading the write function for storing the data to global memory, when vectorization enabled This function + * is used to guarantee coalesced and vectorized store whenever possible. + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + */ + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if< + Eigen::internal::unpacket_traits::size != 1 && dt == data_source::global_mem, void>::type +write(PacketType &packet_data, DataScalar *ptr) { + ::Eigen::internal::pstoreu(ptr, packet_data); +} + +/*! + * \brief Overloading the write function for storing the data to global memory, when vectorization is disabled. + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if< + Eigen::internal::unpacket_traits::size == 1 && dt == data_source::global_mem, void>::type +write(PacketType &packet_data, DataScalar *ptr) { + *ptr = packet_data; +} + +/*! + * \brief check_boundary: is used to check the edge condition for non-internal blocks. + * + * \tparam is_internal: determines if the block is internal + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) { + return true; +} + +/*! + * \brief check_boundary: specialization of the check_boundary for non-internal blocks. + * + * \param cond: true when the data is in range. Otherwise false + */ +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool cond) { + return cond; +} + +/*! + * \brief BlockProperties is a template class that provides different characteristic of a block of each Tensor processed + * by each workgroup. + * + * \tparam is_transposed: iff true, determines whether or not the block of the Tensor is transposed + * + * \tparam packet_load_: determines if the each element of this tensor block should be loaded in a packet mode + * + * \tparam PacketType: determines the type of packet + * + * \tparam OutType: determines the type of each element for this block of tensor. If packet load is true, it will be + * packetType; Otherwise it will be scalar Type + * + * \param elements_per_access determines the size of each element based on OutType + * + * \param is_coalesced_layout determines whether or not the Tensor data in a memory can be access coalesced and + * vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + * contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + * when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. + * + * \param nc_stride determines the stride of non-contracting dimension to access the next adjustment element within the + * Tensor Block for each workgroup + * + * \param c_stride determines the stride of contracting dimension to access the next adjustment element within the + * Tensor Block for each workgroup + */ +template +struct BlockProperties { + static EIGEN_CONSTEXPR bool packet_load = packet_load_; + typedef typename Eigen::internal::unpacket_traits::type OutScalar; + static EIGEN_CONSTEXPR bool is_rhs = is_rhs_; + typedef typename Eigen::internal::conditional::type OutType; + static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits::size; + static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs); + static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1); + static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access); +}; + +/*! + * \brief ThreadProperties is a template class that provides each thread's properties within a workgroup. Please see + * the sycl-1.2.1 specification (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the workgroup, + * work-items + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \param linearLocalThreadId: determines the linearized location of a thread within a work-group + * + * \param kGroupId: determines the logical group id in a k dimension of the flattened tensor. It will be > 1 when + * tall/skinny algorithm is used + * + * \param mGroupOffset: determines the logical start position of all thread within a workgroup for the m dimension of + * the flattened tensor. + * + * \param kGroupOffset determines the logical start position of all thread within a workgroup for the k dimension of the + * flattened tensor. It will be > 1 when tall/skinny algorithm is used. + * + * \param mLocalOffset: determines the logical start position of each thread within a workgroup for the m dimension of a + * flattened tensor. The position determines the distance of each thread within the workgroup from each other + * independent from their global position. + * + * \param nLocalOffset: determines the logical start position of each thread within a workgroup for the n dimension of a + * flattened tensor. The position determines the distance of each thread within the workgroup from each other + * independent from their global position. + * + * \param mGlobalOffset: determines the logical start position of each thread a thread for the m dimension on a + * flattened tensor + * + * \param nGlobalOffset: determines the logical start position of each thread a thread for the n dimension on a + * flattened tensor + * + * \param kSize : determine the number of the k elements of the flattened Tensor to be processed by each thread for the + * given tensor block. This is !=K dimension of Flattened Tensor when Tall/Skinny matrix is used. + * + * \param is_internal : this will determined if the thread within the work-group computes an internal block of tensor or + * the edge blocks. When it is internal, there is no need to check the boundaries and all the if stantement can be + * resolve by compiler. + */ +template +struct ThreadProperties { + const StorageIndex linearLocalThreadId; + const StorageIndex kGroupId; + const StorageIndex mGroupOffset; + const StorageIndex nGroupOffset; + const StorageIndex kGroupOffset; + const StorageIndex mLocalOffset; + const StorageIndex nLocalOffset; + const StorageIndex mGlobalOffset; + const StorageIndex nGlobalOffset; + StorageIndex kSize; + const bool is_internal; + // this is used to adjust the last block + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties( + const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_, + const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_, + const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_, + StorageIndex kSize_, const bool is_internal_) + : linearLocalThreadId(linearLocalThreadId_), + kGroupId(kGroupId_), + mGroupOffset(mGroupOffset_), + nGroupOffset(nGroupOffset_), + kGroupOffset(kGroupOffset_), + mLocalOffset(mLocalOffset_), + nLocalOffset(nLocalOffset_), + mGlobalOffset(mGlobalOffset_), + nGlobalOffset(nGlobalOffset_), + kSize(kSize_), + is_internal(is_internal_) {} +}; + +/*! + * \brief TensorContractionKernel is a template class that provides Tensor -Tensor contraction operation. + * + * \tparam OutScalar: determines the output scalar type + * + * \tparam LhsScalar: determines the left-hand-side scalar type + * + * \tparam RhsScalar: determines the right-hand-side scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification + (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam LhsMapper determines the tensor contraction mapper type for left-hand-side matrix + * + * \tparam RhsMapper determines the tensor contraction mapper type for right-hand-side matrix + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Properties: determines the Contraction Panel properties + * + * \tparam TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \tparam input_mapper_properties : determine if the input tensors are matrix. If they are matrix, special memory + access is used to guarantee that always the memory access are coalesced. + * + * \tptaram IsFinal : determine if this is the final kernel. If so, the result will be written in a final output. + Otherwise, the result of contraction will be written iin a temporary buffer. This is the case when Tall/Skinny + contraction is used. So in this case, a final reduction step is required to compute final output. + + * \tparam contraction_tp: it is an enum value representing whether the local memroy/no local memory implementation of + the algorithm to be used + * + * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group + * + * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) + * + * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) + * + * \param out_res: determines the output tensor containing the contraction result + * + * \param groupSizeM: a logical number determining the number of work-group for m dimension + * + * \param groupSizeN: a logical number determining the number of work-group for n dimension + * + * \param numTiles: determines total number of tiles on the k dimension + * + * \param TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix + */ +template +class TensorContractionKernel { + public: + typedef typename Eigen::TensorSycl::internal::Vectorise::PacketReturnType + PacketReturnType; + static EIGEN_CONSTEXPR int PacketSize = + Eigen::TensorSycl::internal::Vectorise::PacketSize; + static EIGEN_CONSTEXPR bool is_lhs_transposed = + !::Eigen::internal::TensorContractionInputMapperTrait::inner_dim_contiguous; + static EIGEN_CONSTEXPR bool is_rhs_transposed = + !::Eigen::internal::TensorContractionInputMapperTrait::inner_dim_contiguous; + + typedef BlockProperties + LHSBlockProperties; + + typedef BlockProperties + RHSBlockProperties; + + static EIGEN_CONSTEXPR StorageIndex NStride = + contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride; + + typedef cl::sycl::accessor Scratch; + typedef cl::sycl::multi_ptr local_ptr; + typedef OutScalar * /*cl::sycl::multi_ptr*/ private_ptr; + typedef + typename ::Eigen::internal::conditional::type + tile_ptr; + static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local + ? Properties::TileSizeDimM + Properties::BC + : Properties::WorkLoadPerThreadM; + static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local + ? Properties::TileSizeDimN + Properties::BC + : Properties::WorkLoadPerThreadN; + static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; + + /** + * \brief MemHolder this is a place holder struct for creating memory hierarchy in SYCL. Inside SYCL kernel it is not + * allowed to have dynamic memory allocation. While the local memory is created outside of the kernel and passed to + * the kernel as an accessor, the private memory can only allowed to be allocated statically. Since we are abstracting + * the TiledMemory for both local and private memory, the MemHolder structs is used as a helper to abstract out + * different type of memory needed when local/no_local memory computation is called. + * + * \tparam contraction_type: it is an enum value representing whether the local memroy/no local memory implementation + of the algorithm to be used + * \tparam the private memory size + * \param ptr the tile memory pointer type + */ + template + struct MemHolder { + tile_ptr ptr; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {} + }; + /** + * \brief specialization of memHolder class when no local memory kernel is used. + */ + template + struct MemHolder { + OutScalar ptr[MemSize] = {OutScalar{0}}; + }; + /** + * \brief TiledMemory: contains required memory pointer for loading each tile of the TensorContraction panel from + * global memory to local/private memory when local/no_local algorithm used. + * + * \param lhs_scratch_extract : determines the LHS tile memory. It is either private or local memory based on the + * selected contraction_type. + * + * \param rhs_scratch_extract : determines the RHS tile memory. It is either private or local memory based on the + * selected contraction_type. + * + * \param lhs_extract_index: determins the position of each thread on a local memory for lhs input. When private + * memory is used this is set to zero as this is not applicable in case of private memory. + * + * \param rhs_extract_index: determins the position of each thread on a local memory for rhs input. When private + * memory is used this is set to zero as this is not applicable in case of private memory. + * + * \param lhs_scratch_compute : determines the location to load for computation for lhs_local memory. This is the + * same as lhs_scratch_extract for private memory. + * + * \param rhs_scratch_compute : determines the location to load for computation for rhs_local memory. This is the + * same as rhs_scratch_extract for private memory. + */ + struct TiledMemory { + MemHolder lhs_scratch_extract; + MemHolder rhs_scratch_extract; + tile_ptr lhs_scratch_ptr_compute; + tile_ptr rhs_scratch_ptr_compute; + const std::pair lhs_extract_index; + const std::pair rhs_extract_index; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TiledMemory(const ThreadProperties &, local_ptr, + typename ::Eigen::internal::enable_if::type * = 0) + : lhs_scratch_extract{}, + rhs_scratch_extract{}, + lhs_scratch_ptr_compute(lhs_scratch_extract.ptr), + rhs_scratch_ptr_compute(rhs_scratch_extract.ptr), + lhs_extract_index(std::pair(StorageIndex{0}, StorageIndex{0})), + rhs_extract_index(std::pair(StorageIndex{0}, StorageIndex{0})) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TiledMemory(const ThreadProperties &thread_properties, local_ptr block_start_ptr, + typename ::Eigen::internal::enable_if::type * = 0) + : lhs_scratch_extract{block_start_ptr}, + rhs_scratch_extract{lhs_scratch_extract.ptr + + ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)}, + lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset), + rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset), + lhs_extract_index( + local_id_extract(thread_properties.linearLocalThreadId)), + rhs_extract_index( + local_id_extract(thread_properties.linearLocalThreadId)) {} + }; + + Scratch scratch; + const LhsMapper lhs; + const RhsMapper rhs; + OutAccessor out_res; + const StorageIndex groupSizeM; + const StorageIndex groupSizeN; + const StorageIndex numTiles; + const TripleDim triple_dim; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, + const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex groupSizeM_, + const StorageIndex groupSizeN_, + const StorageIndex numTiles_, + const TripleDim triple_dim_) + : scratch(scratch_), + lhs(lhs_), + rhs(rhs_), + out_res(out_res_), + groupSizeM(groupSizeM_), + groupSizeN(groupSizeN_), + numTiles(numTiles_), + triple_dim(triple_dim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, + const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex groupSizeM_, + const StorageIndex numTiles_, + const TripleDim triple_dim_) + : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + const StorageIndex linearLocalThreadId = itemID.get_local_id(0); + const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM; + const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM; + const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM; + const StorageIndex tmp = itemID.get_group(0) / groupSizeM; + const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN; + const StorageIndex kGroupId = IsFinal ? 0 : tmp / groupSizeN; + const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM; + const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN; + const StorageIndex mLocalOffset = PacketSize * mLocalThreadId; + const StorageIndex nLocalOffset = NStride * nLocalThreadId; + const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset; + const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset; + + const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK; + StorageIndex kGroupOffset = kGroupId * kSizePerWG; + const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM && + triple_dim.N - nGroupOffset >= Properties::TileSizeDimN && + triple_dim.K - kGroupOffset >= kSizePerWG; + // this is used to adjust the last block + StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset); + // This is used to find out the lats K offset so that kGroupOffset -kSize can compute the coffset for loading to + // tile + kGroupOffset += kSize; + + auto thread_properties = + ThreadProperties(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset, + mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal); + + auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N); + + (thread_properties.is_internal) ? compute_panel(itemID, thread_properties, out_ptr) + : compute_panel(itemID, thread_properties, out_ptr); + } + // The compute block computes the contraction operation private block for each thread and store the resutl in the + // privateRes memory of Each computation the compute block function is independent of local and no local concepts as + // it only compute the block on each thread's private memory space + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr, + PacketReturnType *privateRes) { + StorageIndex idx = 0; + EIGEN_CONSTEXPR StorageIndex lhs_stride = + contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) { + auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)}; + StorageIndex lhs_index = 0; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { + PacketReturnType lhsPack{}; + Eigen::TensorSycl::internal::PacketWrapper::set_packet(lhsPack, + lhs_block_ptr + lhs_index); + privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]); + + lhs_index += lhs_stride; + idx++; + } + } + } + // The store function write the computed contraction operation in the private memory of each thread to the global + // memory. The store function is independent of local and no local concepts s that it can be abstract out in the base + // class. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes, + StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) { + auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC { + return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N); + }; + // when local memory is not used M and N are both accessed in a coalesced way. However, when local memory is + // available the k*N is transposed in the local to N*K therefore, each blocks operates on blockId* + // WorkLoadPerThreadN slice of N + EIGEN_CONSTEXPR StorageIndex GlobalNStride = + contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) { + // output leading dimension + StorageIndex outputLD = 0; + // When local memory is used the PrivateNstride is always 1 because the coalesed access on N is loaded into Local + // memory and extracting from local to global is the same as no transposed version. However, when local memory is + // not used and RHS is transposed we packetize the load for RHS. + EIGEN_UNROLL_LOOP + for (StorageIndex nId = 0; nId < PrivateNStride; nId++) { + StorageIndex globalRow = mGlobalOffset; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { + PacketReturnType privetOut = privateRes[wLPTM]; + if (check_boundary(chk_bound(globalRow, nId))) { + // Store the final results in C. The C matrix has always M as a first StorageIndex and N as a second + // StorageIndex Therefore it is always coalesced layout + write(privetOut, out_ptr + outputLD + globalRow); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex mId = 0; mId < PacketSize; mId++) { + StorageIndex mOffset = globalRow + mId; + if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) { + out_ptr[mOffset + outputLD] = + Eigen::TensorSycl::internal::PacketWrapper::scalarize(mId, privetOut); + } + } + } + globalRow += (PacketSize * Properties::LocalThreadSizeM); + } + outputLD += triple_dim.M; + privateRes += Properties::WorkLoadPerThreadM / PacketSize; + } + out_ptr += (GlobalNStride * outputLD); + + nGlobalOffset += (PrivateNStride * GlobalNStride); + } + } + // when no local memory is used the following extract_block will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair &, + const StorageIndex &ncOffset, const StorageIndex cOffset) { + EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = + InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM; + EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = + InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM; + const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M; + + auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { + return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && + (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); + }; + const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K; + StorageIndex cIndex = cOffset; + + EIGEN_UNROLL_LOOP + for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) { + StorageIndex ncIndex = ncOffset; + EIGEN_UNROLL_LOOP + for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) { + if (check_boundary(chk_bound(cIndex, ncIndex))) { + auto val = + read(inpt, ncIndex, cIndex, ld); + + write(val, private_ptr); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); + const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); + OutScalar val = + (ncInd < NC && cInd < triple_dim.K) + ? read( + inpt, ncInd, cInd, ld) + : OutScalar(0); + write( + val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) + + ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC)); + } + } + + // if it is lhs we have to load it packetised when the packet size is > 1, because the output is coalesced. So + // even if M is not accessed in a coalesced mode, we have to load packet_size number of m per thread. + ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1) + ? ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC + : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC); + private_ptr += InputBlockProperties::nc_stride; + } + // the previous for loop ( private_ptr += (ncId * nc_stride)) has already moved ptr with one WorkLoadPerThreadNC + private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC; + cIndex += InputBlockProperties::c_stride; + } + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair local_id_extract( + const StorageIndex &linearLocalThreadId) { + const StorageIndex localThreadNC = + (InputBlockProperties::is_coalesced_layout) + ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride) + : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride); + const StorageIndex localThreadC = + (InputBlockProperties::is_coalesced_layout) + ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride) + : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride); + return std::pair(localThreadNC, localThreadC); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_mem(const cl::sycl::nd_item<1> &, bool &db_offset) noexcept { + db_offset = !db_offset; + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_mem(const cl::sycl::nd_item<1> &itemID, bool &) noexcept { + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_mem(const cl::sycl::nd_item<1> &, bool &) noexcept { + return; + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_thread(const cl::sycl::nd_item<1> & +#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION + itemID +#endif + ) noexcept { +#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION + itemID.barrier(cl::sycl::access::fence_spacce::local_space); +#else + return; +#endif + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_thread(const cl::sycl::nd_item<1> &itemID) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type sync_thread( + const cl::sycl::nd_item<1> &) { + return; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID, + ThreadProperties &thread_properties, + TiledMemory &tiled_input_block, + PacketReturnType *privateRes, bool &db_offset) { + // Tiling the Rhs block from global to local memory + extract_block( + rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR), + tiled_input_block.rhs_extract_index, + contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset, + thread_properties.kGroupOffset - thread_properties.kSize); + + sync_thread(itemID); + + // Tiling the Lhs block from global to local memory + extract_block( + lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK), + tiled_input_block.lhs_extract_index, + contraction_tp == contraction_type::local ? thread_properties.mGroupOffset : thread_properties.mGlobalOffset, + thread_properties.kGroupOffset - thread_properties.kSize); + + // itemID.barrier(cl::sycl::access::fence_space::local_space); + sync_thread(itemID); + // switch to compute mede + StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK); + StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR); + // Loop over the values of a single tile + for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) { + compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset, + tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes); + lhs_offset += LSDL; + rhs_offset += LSDR; + } + // computing the K index for the next tile + thread_properties.kSize -= Properties::TileSizeDimK; + sync_mem(itemID, db_offset); + } + + // when local memory is available the following compute_panel will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID, + ThreadProperties &thread_properties, + OutPtr out_ptr) { + auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()}; + // Allocate register space + PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = { + PacketReturnType{0}}; + bool db_offset = 0; + + while (thread_properties.kSize >= Properties::TileSizeDimK) { + compute_tile_per_panel(itemID, thread_properties, tiled_input_block, privateRes, db_offset); + } + if (thread_properties.kSize > 0) { + compute_tile_per_panel(itemID, thread_properties, tiled_input_block, privateRes, db_offset); + } + + // Storing the final results in the output + store(1) : RHSBlockProperties::nc_stride>( + out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset, + thread_properties.nGlobalOffset); + } + // When local memory is available the following extract_block will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + extract_block(const Input &inpt, Local local_ptr, const std::pair& local_index, + const StorageIndex &ncOffset, const StorageIndex cOffset) { + EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = + InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM; + EIGEN_CONSTEXPR StorageIndex LoadPerThread = + InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs; + EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL; + static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) && + (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)), + " LocalOffset must be divisable by stride"); + const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M; + StorageIndex localThreadNC = local_index.first; + StorageIndex localThreadC = local_index.second; + auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { + return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && + (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); + }; + EIGEN_UNROLL_LOOP + for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) { + const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC); + const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC); + const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K; + if (check_boundary(chk_bound(CIndex, NCIndex))) { + auto val = + read(inpt, NCIndex, CIndex, ld); + write( + val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + + (InputBlockProperties::c_stride * localThreadC * LSD)); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); + const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); + OutScalar val = + (nCInd < NC && cInd < triple_dim.K) + ? read( + inpt, nCInd, cInd, ld) + : OutScalar(0); + + write( + val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + + (InputBlockProperties::is_coalesced_layout ? i : 0) + + ((InputBlockProperties::c_stride * localThreadC + + (InputBlockProperties::is_coalesced_layout ? 0 : i)) * + LSD)); + } + } + localThreadNC += (InputBlockProperties::is_coalesced_layout) + ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) + : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride); + localThreadC += (InputBlockProperties::is_coalesced_layout) + ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride) + : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride); + } + } +}; + +#ifndef EIGEN_SYCL_DISABLE_GEMV + +/*! + * \brief GeneralVectorTensor is a template class that provides Tensor -vector contraction operation, which is a special + * case of Tensor Tensor contraction. + * + * \tparam OutScalar: determines the output scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification + * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam VectorMapper: determines the tensor contraction mapper for the vector input (can be lhs or rhs) + * + * \tparam TensorMapper: determines the tensor contraction mapper for the tensor input (can be lhs or rhs) + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Properties: determines the Contraction Panel properties + * + * \tparam KFactor: determines the number of elements in K dimension in a Tile + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \tparam is_lhs_vec: determines whether lhs is a vector or rhs is a vector + * + * \tparam IsFinal: determine if this is the final kernel. If so, the result will be written in a final output. + * Otherwise, the result of contraction will be written iin a temporary buffer. + * + * \param scratch: determines the local memory containing the vector block for each work-group + * + * \param vec: determines the vector input (tensor mapper) + * + * \param mat: determines the tensor input (tensor mapper) + * + * \param out_res: determines the output vector containing the contraction result + * + * \param nonContractGroupSize: a logical number determining the number of work-group for non-contracting dimension + * + * \param nonContractDim: determines the size of non contracting dimension for the flattened tensor + * + * \param contractDim: determines the size of non contracting dimension for the flattened tensor + * + */ +template +struct GeneralVectorTensor { + typedef typename Eigen::TensorSycl::internal::Vectorise::PacketReturnType + PacketReturnType; + static EIGEN_CONSTEXPR int PacketSize = + Eigen::TensorSycl::internal::Vectorise::PacketSize; + typedef cl::sycl::accessor Scratch; + + static EIGEN_CONSTEXPR StorageIndex OutScratchOffset = + KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; + + // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make + // sure that the !^ is true When RHS is a vector, we pass true and true to make sure that the !^ is true. + typedef BlockProperties + VecBlockProperties; + + Scratch scratch; + const VectorMapper vec; + const TensorMapper mat; + OutAccessor out_res; + const StorageIndex nonContractGroupSize; + const StorageIndex nonContractDim; + const StorageIndex contractDim; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_, + const TensorMapper mat_, OutAccessor out_res_, + const StorageIndex nonContractGroupSize_, + const StorageIndex nonContractDim_, + const StorageIndex contractDim_) + : scratch(scratch_), + vec(vec_), + mat(mat_), + out_res(out_res_), + nonContractGroupSize(nonContractGroupSize_), + nonContractDim(nonContractDim_), + contractDim(contractDim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + auto scratch_ptr = scratch.get_pointer(); + const StorageIndex linearLocalThreadId = itemID.get_local_id(0); + StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC + : linearLocalThreadId % Properties::LocalThreadSizeNC; + StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC + : linearLocalThreadId / Properties::LocalThreadSizeNC; + const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize; + const StorageIndex nonContractGroupId = + is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize; + const StorageIndex contractGroupId = + is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize; + auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : contractGroupId * nonContractDim); + + const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC; + const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC; + auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; + const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId; + const StorageIndex globalContractDimOffset = contractGroupOffset + contractId; + auto local_output = scratch_ptr + OutScratchOffset; + const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC && + contractDim - contractGroupOffset >= Properties::TileSizeDimC; + is_internal + ? compute_panel(itemID, vec, mat, local_output, out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + scratch_ptr, contractGroupOffset, +#endif + nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, + nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex) + : compute_panel(itemID, vec, mat, local_output, out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + scratch_ptr, contractGroupOffset, +#endif + nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, + nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex); + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel( + const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output, + OutPtr out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + OutScalar *scratch_ptr, const StorageIndex contractGroupOffset, +#endif + const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim, + StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId, + StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) { + OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)}; + // Reading the vector +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId; + extract_block(vec, scratch_ptr, linearLocalThreadId, + vectorOffset, contractDim); + + itemID.barrier(cl::sycl::access::fence_space::local_space); + auto in_scratch_ptr = scratch_ptr + contractId; +#endif + + StorageIndex privateOffsetC = 0; + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) { + StorageIndex privateOffsetNC = 0; + bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim); +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + auto vecScalar = *in_scratch_ptr; +#else + auto vecScalar = (check_boundary(contract_conds)) + ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC, + is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0)) + : OutScalar(0); +#endif + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + auto matScalar = (check_boundary( + contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim))) + ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC + : globalNonContractDimOffset + privateOffsetNC, + is_lhs_vec ? globalNonContractDimOffset + privateOffsetNC + : globalContractDimOffset + privateOffsetC) + : OutScalar(0); + + outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]); + privateOffsetNC += Properties::LocalThreadSizeNC; + } + privateOffsetC += Properties::LocalThreadSizeC; +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + in_scratch_ptr += Properties::LocalThreadSizeC; +#endif + } + + auto out_scratch_ptr = local_output + outScratchIndex; + // Each block of 16*16 element in shared memory should reduce to 16*1 + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + *out_scratch_ptr = outScalar[j]; + + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + } + if (is_lhs_vec) { + nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC; + contractId = linearLocalThreadId / Properties::LocalThreadSizeNC; + outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; + } + + out_scratch_ptr = local_output + outScratchIndex; + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + EIGEN_UNROLL_LOOP + for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (contractId < offset) { + StorageIndex myNeigbourId = (Properties::LocalThreadSizeNC * offset); + *out_scratch_ptr += out_scratch_ptr[myNeigbourId]; + } + } + // moving to next 16 by 16 block + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + } + + if (contractId == 0) { + out_scratch_ptr = local_output + nonContractId; + StorageIndex global_final_offset = nonContractGroupOffset + nonContractId; + out_ptr += global_final_offset; + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + if (check_boundary(global_final_offset < nonContractDim)) { + auto res = *out_scratch_ptr; + + *out_ptr = res; + out_ptr += Properties::LocalThreadSizeNC; + } + // moving to next 16 by 16 block to ge the next 16 reduced elements + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC; + } + } + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr, + const StorageIndex &linearLocalThreadId, + const StorageIndex &cOffset, const StorageIndex &C) { + local_ptr += InputBlockProperties::c_stride * linearLocalThreadId; + StorageIndex cIndex = cOffset; + for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) { + if (check_boundary(cIndex + InputBlockProperties::c_stride - 1 < C)) { + auto val = read(inpt, StorageIndex(0), + cIndex, StorageIndex(1)); + write(val, local_ptr); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + OutScalar val = + (cIndex + i < C) + ? read( + inpt, StorageIndex(0), cIndex + i, StorageIndex(1)) + : OutScalar(0); + write(val, local_ptr + i); + } + } + local_ptr += InputBlockProperties::c_stride * GroupSize; + cIndex += InputBlockProperties::c_stride * GroupSize; + } + } +}; +#endif + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + +/*! + * \brief GeneralScalarContraction is a template class that provides the scalar value of Tensor -Tensor contraction + * operation, when all the dimensions are contracting dimensions. This Kernel reduces two tensors to an scalar + * + * \tparam OutScalar: determines the output scalar type + * + * \tparam LhsScalar: determines the left-hand-side scalar type + * + * \tparam RhsScalar: determines the right-hand-side scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification + * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam LhsMapper: determines the tensor contraction mapper type for left-hand-side matrix + * + * \tparam RhsMapper: determines the tensor contraction mapper type for right-hand-side matrix + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group + * + * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) + * + * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) + * + * \param out_res: determines the output tensor containing the contraction result + * + * \param rng: determins the total input data size + */ +template +struct GeneralScalarContraction { + typedef cl::sycl::accessor Scratch; + Scratch scratch; + const LhsMapper lhs; + const RhsMapper rhs; + OutAccessor out_res; + const StorageIndex rng; + + EIGEN_DEVICE_FUNC + GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex rng_) + : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {} + + EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) { + auto out_ptr = out_res.get_pointer(); + auto scratch_ptr = scratch.get_pointer().get(); + + StorageIndex globalid = itemID.get_global_id(0); + StorageIndex localid = itemID.get_local_id(0); + OutScalar accumulator = OutScalar(0); + for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) { + accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator); + } + auto out_scratch_ptr = scratch_ptr + localid; + *out_scratch_ptr = accumulator; + for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]); + } + } + if (localid == 0) { + out_ptr[itemID.get_group(0)] = accumulator; + } + } +}; +#endif + +} // namespace internal +} // namespace TensorSycl + +template +struct TensorEvaluator, + Eigen::SyclDevice> + : public TensorContractionEvaluatorBase, Eigen::SyclDevice>> { + static_assert(std::is_same::value, + "SYCL tensor contraction does not support output kernels."); + + typedef Eigen::SyclDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Index StorageIndex; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Base::Storage Storage; + typedef typename Base::EvaluatorPointerType EvaluatorPointerType; + struct TripleDim { + const StorageIndex M; + const StorageIndex N; + const StorageIndex K; + TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {} + }; + enum { + Layout = TensorEvaluator::Layout, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + }; + + static EIGEN_CONSTEXPR int LDims = Base::LDims; + static EIGEN_CONSTEXPR int RDims = Base::RDims; + static EIGEN_CONSTEXPR int ContractDims = Base::ContractDims; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + typedef typename Eigen::internal::remove_const::type LhsScalar; + typedef typename Eigen::internal::remove_const::type RhsScalar; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + template + struct input_mapper_propertis { + static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous; + static EIGEN_CONSTEXPR bool is_rhs_matrix = + (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered); + }; + + TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {} + + // We need to redefine this method to make nvcc happy + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (!data) { + this->m_result = this->m_device.get( + static_cast(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar)))); + data = this->m_result; + } + evalToSycl(data); + return (this->m_result != NULL); + } + const Eigen::SyclDevice &device() const { return this->m_device; } + void evalToSycl(typename Base::EvaluatorPointerType buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } + } else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } + } + } + + template + void evalTyped(typename Base::EvaluatorPointerType buffer) const { + const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size}; + typedef internal::TensorContractionInputMapper< + LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t, + PacketType::size, lhs_inner_dim_contiguous, false, Unaligned, MakeSYCLPointer> + LhsMapper; + + typedef internal::TensorContractionInputMapper::size, rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned, MakeSYCLPointer> + RhsMapper; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + if (triple_dim.M == 1 && triple_dim.N == 1) { + launchSC(buffer, lhs, rhs, triple_dim.K); + } else +#endif +#ifndef EIGEN_SYCL_DISABLE_GEMV + if (triple_dim.M != 1 && triple_dim.N == 1) { + LaunchVT(buffer, rhs, lhs, triple_dim.M, triple_dim.K); + } else if (triple_dim.M == 1 && triple_dim.N != 1) { + LaunchVT(buffer, lhs, rhs, triple_dim.N, triple_dim.K); + } else // This is equivalent of if (m!=1 && n!=1) +#endif + { + typedef input_mapper_propertis + inpt_mapper_properties; +#ifndef EIGEN_SYCL_DISABLE_SKINNY + bool skinny = false; + auto platform_name = this->device().getPlatformName(); + // This is based on empirical calculation for AMD r9-nano and Fiji + if (platform_name.find("AMD") == 0) { + skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) && + ((triple_dim.M < 1024 && triple_dim.N < 1024) || + (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K))); + } else { + skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) || + ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) || + ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100)); + } + if (skinny) + adjustTT(buffer, lhs, rhs, triple_dim); + else +#endif // EIGEN_SYCL_DISABLE_SKINNY + adjustTT(buffer, lhs, rhs, triple_dim); + } + } + + template + void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + const TripleDim &triple_dim) const { +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + if (device().has_local_memory()) { + typedef TensorSycl::internal::TTPanelSize PanelParameters; + launchTT( + buffer, lhs, rhs, triple_dim); + } +#endif +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF + if (!(device().has_local_memory())) { + typedef TensorSycl::internal::TTPanelSize PanelParameters; + launchTT( + buffer, lhs, rhs, triple_dim); + } +#endif + } + + template + void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + const TripleDim &triple_dim) const { + const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM); + const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN); + const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM; + const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN; + + const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK); + StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK; + StorageIndex groupSizeK = + skinny + ? std::max(std::min(totalTilesK, + (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) / + (groupSizeM * groupSizeN)), + StorageIndex(1)) + : StorageIndex(1); + + const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK; + + const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK; + + const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; + const StorageIndex globalRange = totalGroupSize * localRange; + + const StorageIndex scratchSize = (ct == TensorSycl::internal::contraction_type::local) + ? ((Properties::DoubleBuffer + 1) * + (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) + + ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) * + (Properties::TileSizeDimN + Properties::BC)) + : StorageIndex(1); + + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (groupSizeK == 1) { + typedef TensorSycl::internal::TensorContractionKernel + ContractKernelName; + device().template binary_kernel_launcher( + lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim); + } else { + typedef TensorSycl::internal::TensorContractionKernel + ContractKernelName; + CoeffReturnType *temp_pointer = static_cast( + device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + + device().template binary_kernel_launcher( + lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, + triple_dim); + + typedef Eigen::internal::SumReducer Op; + auto op = Op(); + typedef TensorSycl::internal::SecondStepPartialReduction + ReductionKernel; + + device().template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex( + Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))), + cl::sycl::range<1>(localRange)), + StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK); + + device().deallocate_temp(temp_pointer); + } + } + +#ifndef EIGEN_SYCL_DISABLE_GEMV + template + void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat, + StorageIndex NC, StorageIndex C) const { + const StorageIndex nonContractDim = NC; + EIGEN_CONSTEXPR StorageIndex NCFactor = 1; + EIGEN_CONSTEXPR StorageIndex CFactor = 1; + EIGEN_CONSTEXPR StorageIndex NCWindow = 16; + typedef Eigen::TensorSycl::internal::TVPanelSize + Properties; + const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC); + const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC); + const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC); + const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC); + const StorageIndex globalRange = + (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC)); + const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC; + const StorageIndex scratchSize = + (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (cNumGroups > 1) { + typedef Eigen::TensorSycl::internal::GeneralVectorTensor + ContractKernelName; + CoeffReturnType *temp_pointer = + static_cast(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + + device().template binary_kernel_launcher( + vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C); + + typedef Eigen::internal::SumReducer Op; + typedef TensorSycl::internal::SecondStepPartialReduction + ReductionKernel; + + device().template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)), + cl::sycl::range<1>(localRange)), + StorageIndex(1), Op(), nonContractDim, cNumGroups); + + device().deallocate_temp(temp_pointer); + } else { + typedef Eigen::TensorSycl::internal::GeneralVectorTensor + ContractKernelName; + device().template binary_kernel_launcher( + vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C); + } + } +#endif + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + template + EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + StorageIndex K) const { + EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) & + (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)), + "The Local thread size must be a power of 2 for the reduction " + "operation"); + EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; + + // Here we force the code not to be more than 2-step reduction: Our empirical research shows that if each thread + // reduces at least 512 elementss individually, we get better performance. + const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? local_range : 1); + const StorageIndex global_range = num_work_group * local_range; + + typedef Eigen::TensorSycl::internal::GeneralScalarContraction< + CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false> + ContractKernelName; + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); + if (num_work_group > 1) { + CoeffReturnType *temp_pointer = + static_cast(device().allocate_temp(num_work_group * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + device().template binary_kernel_launcher(lhs, rhs, tmp_global_accessor, + thread_range, local_range, K); + typedef Eigen::internal::SumReducer Op; + typedef TensorSycl::internal::SecondStepFullReducer + GenericRKernel; + device().template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op()); + + device().deallocate_temp(temp_pointer); + } else { + device().template binary_kernel_launcher(lhs, rhs, buffer, thread_range, + local_range, K); + } + } +#endif + + EIGEN_STRONG_INLINE void cleanup() { + this->m_leftImpl.cleanup(); + this->m_rightImpl.cleanup(); + + if (this->m_result) { + this->m_device.deallocate_temp(this->m_result); + this->m_result = NULL; + } + } + // The placeholder accessors must bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + this->m_leftImpl.bind(cgh); + this->m_rightImpl.bind(cgh); + this->m_result.bind(cgh); + } +}; +} // namespace Eigen +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index ee16cde9b..21be6ea42 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -15,57 +15,16 @@ namespace Eigen { -#ifdef EIGEN_USE_SIMPLE_THREAD_POOL -namespace internal { - -template -struct packLhsArg { - LhsScalar* blockA; - const LhsMapper& lhs; - const Index m_start; - const Index k_start; - const Index mc; - const Index kc; -}; - -template -struct packRhsAndKernelArg { - const MaxSizeVector* blockAs; - RhsScalar* blockB; - const RhsMapper& rhs; - OutputMapper& output; - const Index m; - const Index k; - const Index n; - const Index mc; - const Index kc; - const Index nc; - const Index num_threads; - const Index num_blockAs; - const Index max_m; - const Index k_block_idx; - const Index m_block_idx; - const Index n_block_idx; - const Index m_blocks; - const Index n_blocks; - MaxSizeVector* kernel_notifications; - const MaxSizeVector* lhs_notifications; - const bool need_to_pack; -}; - -} // end namespace internal -#endif // EIGEN_USE_SIMPLE_THREAD_POOL - -template -struct TensorEvaluator, ThreadPoolDevice> : - public TensorContractionEvaluatorBase, ThreadPoolDevice> > { +template +struct TensorEvaluator, ThreadPoolDevice> : + public TensorContractionEvaluatorBase, ThreadPoolDevice> > { typedef ThreadPoolDevice Device; - typedef TensorEvaluator, Device> Self; + typedef TensorEvaluator, Device> Self; typedef TensorContractionEvaluatorBase Base; - typedef TensorContractionOp XprType; + typedef TensorContractionOp XprType; typedef typename internal::remove_const::type Scalar; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; @@ -112,40 +71,35 @@ struct TensorEvaluator + template void evalProduct(Scalar* buffer) const { - typedef - typename internal::remove_const::type - LhsScalar; - typedef - typename internal::remove_const::type - RhsScalar; - typedef typename internal::gebp_traits Traits; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - typedef internal::TensorContractionInputMapper< - LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t, - contract_t, internal::packet_traits::size, - lhs_inner_dim_contiguous, false, Unaligned> - LhsMapper; - typedef internal::TensorContractionInputMapper< - RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t, - contract_t, internal::packet_traits::size, - rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> - RhsMapper; - typedef internal::blas_data_mapper OutputMapper; - typedef internal::gemm_pack_lhs - LhsPacker; - typedef internal::gemm_pack_rhs< - RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> - RhsPacker; - typedef internal::gebp_kernel - GebpKernel; + evalProductImpl(buffer, NoCallback()); + } + + template + void evalProductAsync(Scalar* buffer, EvalToCallback done) const { + evalProductImpl(buffer, std::move(done)); + } + + template + void evalProductImpl(Scalar* buffer, DoneCallback done) const { + // This function computes a lot of heuristics in multiple steps, and it + // also has multiple exit points. To keep it sane, readable and all in one + // place, sync/async execution decision is made at runtime at the very end. + // + // (1) In sync mode we allocate Context on the stack, submit computations + // to the device thread pool, and block on a barrier until it is + // completed. + // + // (2) In async mode we allocate Context on the heap, and after all tasks + // are finished, we call provided the done callback, and delete a + // context from the heap. + // + // (*) EvalParallelContext & EvalShardedByInnerDimContext owns all the state + // and temporary buffers, requried for executing the tensor contraction. + // They are responsible for cleaning it up after contraction is done. + static const bool IsEvalInSyncMode = + std::is_same::value; const Index m = this->m_i_size; const Index n = this->m_j_size; @@ -181,14 +135,14 @@ struct TensorEvaluator blocking(k, m, n, 2); bm = blocking.mc(); bn = blocking.nc(); bk = blocking.kc(); } else { - internal::TensorContractionBlocking blocking(k, m, n, 2); bm = blocking.mc(); @@ -204,35 +158,45 @@ struct TensorEvaluator::numThreads( static_cast(n) * m, cost, this->m_device.numThreads()); + int num_threads_by_k = numThreadsInnerDim(m, n, k); + if (shardByInnerDim(m, n, k, num_threads, num_threads_by_k)) { + // We are in the scenario where it is more effective to shard by the + // inner dimension. + if (IsEvalInSyncMode) { + EvalShardedByInnerDimContext ctx( + this, num_threads_by_k, buffer, m, n, k, std::move(done)); + ctx.template run(); + } else { + auto* ctx = new EvalShardedByInnerDimContext( + this, num_threads_by_k, buffer, m, n, k, std::move(done)); + ctx->template runAsync(); + } + + return; + } // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost // model is not tuned. Remove this when the cost model is tuned. if (n == 1) num_threads = 1; if (num_threads == 1) { - // The single-threaded algorithm should be faster in this case. - if (n == 1) - this->template evalGemv(buffer); - else - this->template evalGemm(buffer); + TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, + Unaligned, (buffer)); + if (!IsEvalInSyncMode) done(); return; } // Now that we know number of threads, recalculate sharding and blocking. shard_by_col = shardByCol(m, n, num_threads); if (shard_by_col) { - internal::TensorContractionBlocking blocking(k, m, n, num_threads); bm = blocking.mc(); bn = blocking.nc(); bk = blocking.kc(); } else { - internal::TensorContractionBlocking blocking(k, m, n, num_threads); bm = blocking.mc(); @@ -264,6 +228,26 @@ struct TensorEvaluatorm_device.numThreadsInPool(); + + // With small number of threads we want to make sure that we do not reduce + // parallelism too much. With large number of threads we trade maximum + // parallelism for better memory locality. + const float oversharding_factor = + num_worker_threads <= 4 ? 8.0 : + num_worker_threads <= 8 ? 4.0 : + num_worker_threads <= 16 ? 2.0 : + num_worker_threads <= 32 ? 1.0 : + num_worker_threads <= 64 ? 0.8 : /* num_worker_threads > 64 */ 0.6; + + const bool parallelize_by_sharding_dim_only = + sharding_dim_tasks >= oversharding_factor * num_worker_threads; + // Last by not least, decide whether we want to issue both lhs and rhs // packing in parallel; or issue lhs packing first, and then issue rhs // packing when lhs packing completes (for !shard_by_col lhs and rhs are @@ -279,40 +263,139 @@ struct TensorEvaluatorm_leftImpl, this->m_left_nocontract_strides, - this->m_i_strides, this->m_left_contracting_strides, - this->m_k_strides); + } else { +#define CONTEXT_ARGS \ + (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \ + nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only, \ + std::move(done)) + TENSOR_CONTRACTION_ASYNC_DISPATCH(EvalParallelContext, DoneCallback, + Alignment, CONTEXT_ARGS, run()); +#undef CONTEXT_ARGS + } + } - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, - this->m_j_strides, this->m_right_contracting_strides, - this->m_k_strides); + // ------------------------------------------------------------------------ // - Context(this->m_device, num_threads, lhs, rhs, buffer, m, n, - k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, - shard_by_col, parallel_pack) - .run(); - } + // Dummy struct to represent an empty DoneCallback. + + struct NoCallback { + void operator()() { + eigen_assert(false && "NoCallback should never be called"); + } + }; + + // ------------------------------------------------------------------------ // - // Context coordinates a single parallel gemm operation. - template - class Context { + template + class EvalParallelNotification; + + // Synchronous evaluation notification that blocks caller thread in Wait(). + template + class EvalParallelNotification { + public: + EvalParallelNotification(Context*, NoCallback) {} + void Notify() { done_.Notify(); } + void Wait() { done_.Wait(); } + private: + Eigen::Notification done_; + }; + + // Asynchronous evaluation notification that does not block in Wait(). + template + class EvalParallelNotification { + public: + EvalParallelNotification(Context* ctx, DoneCallback done) + : ctx_(ctx), done_(std::move(done)) {} + + void Notify() { + // Make a copy of done callback, because it will be destructed when we + // will delete context in the next line (EvalParallelNotification is a + // data member of EvalParallelContext class). + DoneCallback done_copy = std::move(done_); + + // Delete parallel evaluation context. + delete ctx_; + + // Now safely call the done callback. + done_copy(); + } + + void Wait() {} + + private: + Context* ctx_; + DoneCallback done_; + }; + + // Context orchestrates sync/async parallel contraction evaluation. When it is + // executed in asynchronous mode, it owns all the shared state that might be + // accessible by block packing and kernel tasks. + + template + class EvalParallelContext { public: - Context(const Device& device, int num_threads, LhsMapper& lhs, - RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm, - Index bn, Index bk, Index nm, Index nn, Index nk, Index gm, - Index gn, Index nm0, Index nn0, bool shard_by_col, - bool parallel_pack) - : device_(device), - lhs_(lhs), - rhs_(rhs), + typedef internal::TensorContractionInputMapper< + LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t, + contract_t, internal::packet_traits::size, + lhs_inner_dim_contiguous, false, Unaligned> + LhsMapper; + typedef internal::TensorContractionInputMapper< + RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t, + contract_t, internal::packet_traits::size, + rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> + RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + typedef internal::TensorContractionKernel< + Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper> + TensorContractionKernel; + + typedef typename TensorContractionKernel::LhsBlock LhsBlock; + typedef typename TensorContractionKernel::RhsBlock RhsBlock; + typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle; + + EvalParallelContext(const Self* self, int num_threads, Scalar* buffer, + Index tm, Index tn, Index tk, Index bm, Index bn, + Index bk, Index nm, Index nn, Index nk, Index gm, + Index gn, Index nm0, Index nn0, bool shard_by_col, + bool parallel_pack, + bool parallelize_by_sharding_dim_only, + DoneCallback done) + : created_by_thread_id_(std::this_thread::get_id()), + done_(this, std::move(done)), + device_(self->m_device), + lhs_(self->m_leftImpl, self->m_left_nocontract_strides, + self->m_i_strides, self->m_left_contracting_strides, + self->m_k_strides), + rhs_(self->m_rightImpl, self->m_right_nocontract_strides, + self->m_j_strides, self->m_right_contracting_strides, + self->m_k_strides), buffer_(buffer), output_(buffer, tm), + output_kernel_(self->m_output_kernel), + tensor_contraction_params_(self->m_tensor_contraction_params), num_threads_(num_threads), shard_by_col_(shard_by_col), parallel_pack_(parallel_pack), + parallelize_by_sharding_dim_only_(parallelize_by_sharding_dim_only), m_(tm), n_(tn), k_(tk), @@ -325,13 +408,29 @@ struct TensorEvaluator(bm_ * bk_ * sizeof(LhsScalar), align) * align; - size_t rhs_size = - divup(bn_ * bk_ * sizeof(RhsScalar), align) * align; - packed_mem_ = static_cast(internal::aligned_malloc( - (nm0_ * lhs_size + nn0_ * rhs_size) * std::min(nk_, P - 1))); - char* mem = static_cast(packed_mem_); - for (Index x = 0; x < numext::mini(nk_, P - 1); x++) { - packed_lhs_[x].resize(nm0_); - for (Index m = 0; m < nm0_; m++) { - packed_lhs_[x][m] = reinterpret_cast(mem); - mem += lhs_size; - } - packed_rhs_[x].resize(nn0_); - for (Index n = 0; n < nn0_; n++) { - packed_rhs_[x][n] = reinterpret_cast(mem); - mem += rhs_size; + packed_mem_ = kernel_.allocateSlices( // + device_, // + /*num_lhs=*/nm0_, // + /*num_rhs=*/nn0_, // + /*num_slices=*/std::min(nk_, P - 1), // + packed_lhs_, packed_rhs_); + + if (parallelize_by_sharding_dim_only_) { + const int num_worker_threads = device_.numThreadsInPool(); + + if (shard_by_col) { + can_use_thread_local_packed_ = new std::atomic[nn_]; + for (int i = 0; i < nn_; ++i) + can_use_thread_local_packed_[i].store(true, + std::memory_order_relaxed); + + Index num_blocks = num_worker_threads * gn_; + thread_local_pre_alocated_mem_ = kernel_.allocateSlices( // + device_, // + /*num_lhs=*/0, // + /*num_rhs=*/num_blocks, // + /*num_slices=*/1, // + /*lhs_blocks=*/nullptr, &rhs_thread_local_pre_allocated_); + + } else { + can_use_thread_local_packed_ = new std::atomic[nm_]; + for (int i = 0; i < nm_; ++i) + can_use_thread_local_packed_[i].store(true, + std::memory_order_relaxed); + + Index num_blocks = num_worker_threads * gm_; + thread_local_pre_alocated_mem_ = kernel_.allocateSlices( // + device_, // + /*num_lhs=*/num_blocks, // + /*num_rhs=*/0, // + /*num_slices=*/1, &lhs_thread_local_pre_allocated_, // + /*rhs_blocks=*/nullptr); } } } - ~Context() { + ~EvalParallelContext() { for (Index x = 0; x < P; x++) { for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m]; delete[] state_kernel_[x]; } - internal::aligned_free(packed_mem_); + kernel_.deallocate(device_, packed_mem_); + if (parallelize_by_sharding_dim_only_) { + kernel_.deallocate(device_, thread_local_pre_alocated_mem_); + delete[] can_use_thread_local_packed_; + } } void run() { // Kick off packing of the first slice. signal_switch(0, 1); + // Wait for overall completion. - // TODO(dvyukov): this wait can lead to deadlock. - // If nthreads contractions are concurrently submitted from worker - // threads, this wait will block all worker threads and the system will - // deadlock. + // + // If parallel evaluation is executed in async mode, this is a no-op, and + // Wait() will return immediately. In synchronous mode it will block the + // caller thread until it will receive notification from last task. + // + // In async mode, last task when completed will call done callback from + // the same thread, and will delete this context. + // + // TODO(dvyukov): This wait can lead to deadlock if contraction is + // evaluated in synchronous mode. If nthreads contractions are + // concurrently submitted from worker threads, this wait will block all + // worker threads and the system will deadlock. done_.Wait(); } private: - Notification done_; + std::thread::id created_by_thread_id_; + + // This notification is specialized on the type of DoneCallback and can be + // blocking or non-blocking. + EvalParallelNotification done_; + const Device& device_; - LhsMapper& lhs_; - RhsMapper& rhs_; + LhsMapper lhs_; + RhsMapper rhs_; Scalar* const buffer_; OutputMapper output_; + OutputKernelType output_kernel_; + TensorContractionParams tensor_contraction_params_; const int num_threads_; const bool shard_by_col_; const bool parallel_pack_; + const bool parallelize_by_sharding_dim_only_; // Matrix sizes. const Index m_; const Index n_; @@ -423,6 +562,8 @@ struct TensorEvaluator packed_lhs_[P - 1]; - std::vector packed_rhs_[P - 1]; + + // Handle to the allocated temporary storage for Lhs/Rhs blocks. + BlockMemHandle packed_mem_; + std::vector packed_lhs_[P - 1]; + std::vector packed_rhs_[P - 1]; + + // If we choose to parallelize only by the sharding dimension, each thread + // will have it's own "thead local" (not a c++ thread local storage) memory + // for packed_lhs or packed_rhs (shard_by_col = false of true). This memory + // can't be passed to a kernel that might execute on a different thread. + // + // In practice when we are ready to pack memory for the sharding dimension + // (rhs if shard_by_col==true) of the K-th slice, all kernels for K-1 slice + // already computed (99% of the time), and we can pack data into the thread + // local storage, and guarantee that all the kernels will be executed + // immediately in the same thread. This significantly increases L1 cache hit + // ratio and reduces pressure on the memory bus. + // + // It's still possible that kernel for the K-th slice will be ready before + // completion of the K-1 kernel, so we have to allocate "global" packed_lhs_ + // and packed_rhs_ to allow kernels to be executed later on a thread + // different from the thread that was used for packing. + + // Handle for pre-allocated thread local memory buffers. + BlockMemHandle thread_local_pre_alocated_mem_; + + // Only one of these will be initialized depending on shard_by_col value + // (the size will be `num_worker_threads * num_grains_in_the_sharding_dim`). + std::vector lhs_thread_local_pre_allocated_; + std::vector rhs_thread_local_pre_allocated_; + + // How many thread local blocks were already allocated. + std::atomic num_thread_local_allocations_; + const int thread_local_capacity; + + // We will use pre-allocated Lhs/Rhs blocks defined above, if the number of + // unique threads in a system is below or equal to the number of threads in + // a thread pool. We will fallback on dynamic memory allocation after that. + + // ThreadLocalBlocks is a container for Lhs or Rhs thread local buffers. Its + // size is equal to the grain size in Lhs/Rhs sharding dimension. + template + class ThreadLocalBlocks { + public: + ThreadLocalBlocks() = default; + + ThreadLocalBlocks(BlockType* base, size_t grain_size) + : is_pre_allocated_(true), + thread_local_pre_allocated_base_(base), + grain_size_(grain_size) {} + + ThreadLocalBlocks(BlockMemHandle mem_handle, + std::vector blocks) + : is_pre_allocated_(false), + mem_handle_(std::move(mem_handle)), + blocks_(std::move(blocks)) {} + + BlockType& block(int grain_index) { + eigen_assert(grain_index >= 0); + eigen_assert(static_cast(grain_index) < size()); + return is_pre_allocated_ ? thread_local_pre_allocated_base_[grain_index] + : blocks_[grain_index]; + } + + void Release(EvalParallelContext& ctx) const { + if (!is_pre_allocated_) { + ctx.kernel_.deallocate(ctx.device_, mem_handle_); + } + } + + size_t size() const { + return is_pre_allocated_ ? grain_size_ : blocks_.size(); + } + + private: + bool is_pre_allocated_; + + // Reuse pre-allocated thread local buffers. + BlockType* thread_local_pre_allocated_base_ = nullptr; + size_t grain_size_ = 0; + + // These will be initialized only if `is_pre_allocated == false`. + BlockMemHandle mem_handle_{}; + std::vector blocks_; + }; + + // ThreadLocalBlocksInitialize callable does custom thread local blocks + // initialization, and will reuse pre-allocated buffers if possible, or will + // dynamically allocate new memory. + // + // Lhs/Rhs blocks might be of the same type, so we have to pass explicitly + // for what side do we plan to do block allocation. + template + class ThreadLocalBlocksInitialize { + static constexpr bool kIsLhs = + !is_rhs && std::is_same::value; + static const bool kIsRhs = + is_rhs && std::is_same::value; + static_assert(kIsLhs || kIsRhs, "Unkown block type"); + + using Blocks = ThreadLocalBlocks; + + public: + ThreadLocalBlocksInitialize(EvalParallelContext& ctx) + : ctx_(ctx), + num_worker_threads_(ctx_.device_.numThreadsInPool()) {} + + void operator()(Blocks& blocks) { + const int n = ctx_.num_thread_local_allocations_.fetch_add( + 1, std::memory_order_relaxed); + + if (n >= num_worker_threads_) { + ThreadLocalBlocksAllocator::allocate(ctx_, blocks); + } else { + ThreadLocalBlocksAllocator::reuse(ctx_, n, blocks); + } + } + + private: + // NOTE(ezhulenev): Without 'if constexpr' we have to put calls to + // TensorContractionKernel::allocateSlices into template specializations. + // Also explicit specializations are not allowed at class scope in C++03, + // EvalCtx type parameter is just a workaround for that limitation. + template + struct ThreadLocalBlocksAllocator; + + template + struct ThreadLocalBlocksAllocator { + static void allocate(EvalCtx& ctx, Blocks& blocks) { + std::vector rhs_blocks; + BlockMemHandle mem_handle = ctx.kernel_.allocateSlices( + ctx.device_, + /*num_lhs=*/0, + /*num_rhs=*/ctx.gn_, + /*num_slices=*/1, + /*lhs_blocks=*/nullptr, /*rhs_blocks=*/&rhs_blocks); + + blocks = ThreadLocalBlocks(std::move(mem_handle), + std::move(rhs_blocks)); + } + + static void reuse(EvalCtx& ctx, int index, Blocks& blocks) { + RhsBlock* ptr = &ctx.rhs_thread_local_pre_allocated_[ctx.gn_ * index]; + blocks = ThreadLocalBlocks(ptr, ctx.gn_); + } + }; + + template + struct ThreadLocalBlocksAllocator { + static void allocate(EvalCtx& ctx, Blocks& blocks) { + std::vector lhs_blocks; + BlockMemHandle mem_handle = ctx.kernel_.allocateSlices( + ctx.device_, + /*num_lhs=*/ctx.gm_, + /*num_rhs=*/0, + /*num_slices=*/1, + /*lhs_blocks=*/&lhs_blocks, /*rhs_blocks=*/nullptr); + + blocks = ThreadLocalBlocks(std::move(mem_handle), + std::move(lhs_blocks)); + } + + static void reuse(EvalCtx& ctx, int index, Blocks& blocks) { + LhsBlock* ptr = &ctx.lhs_thread_local_pre_allocated_[ctx.gm_ * index]; + blocks = ThreadLocalBlocks(ptr, ctx.gm_); + } + }; + + EvalParallelContext& ctx_; + const int num_worker_threads_; + }; + + template + class ThreadLocalBlocksRelease { + public: + using Blocks = ThreadLocalBlocks; + ThreadLocalBlocksRelease(EvalParallelContext& ctx) : ctx_(ctx) {} + void operator()(Blocks& blocks) { blocks.Release(ctx_); } + + private: + EvalParallelContext& ctx_; + }; + + // ThreadLocalBlocks initialization callables. + using ThreadLocalLhsInit = + ThreadLocalBlocksInitialize; + using ThreadLocalRhsInit = + ThreadLocalBlocksInitialize; + + // ThreadLocalBlocks release callables. + using ThreadLocalLhsRelease = ThreadLocalBlocksRelease; + using ThreadLocalRhsRelease = ThreadLocalBlocksRelease; + + // Thread local containers for Lhs/Rhs block packs. In practice only one of + // them will be used, depending on the shard_by_col value. + Eigen::ThreadLocal, ThreadLocalLhsInit, + ThreadLocalLhsRelease> + lhs_thread_local_blocks_; + Eigen::ThreadLocal, ThreadLocalRhsInit, + ThreadLocalRhsRelease> + rhs_thread_local_blocks_; + + // After a particular shard for Kth slice missed thread local execution + // opportunity (K-1 slice didn't complete kernels execution), we can no + // longer schedule K+1 and following slices in thread local mode, because + // there is no more guarantee that previous kernels were executed + // sequentially in the same thread (size is nn_ or nm_). + std::atomic* can_use_thread_local_packed_; + std::atomic** state_kernel_[P]; // state_switch_ is frequently modified by worker threads, while other // fields are read-only after constructor. Let's move it to a separate cache @@ -470,69 +817,168 @@ struct TensorEvaluator state_packing_ready_[P]; std::atomic state_switch_[P]; + LhsBlock& packed_lhs(Index m, Index k, Index m1, bool use_thread_local) { + if (use_thread_local) { + eigen_assert(!shard_by_col_); + ThreadLocalBlocks& blocks = lhs_thread_local_blocks_.local(); + + Index grain_index = m1 - m * gm_; + return blocks.block(internal::convert_index(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index? + } else { + return packed_lhs_[k % (P - 1)][m1]; + } + } + + RhsBlock& packed_rhs(Index n, Index k, Index n1, bool use_thread_local) { + if (use_thread_local) { + eigen_assert(shard_by_col_); + ThreadLocalBlocks& blocks = rhs_thread_local_blocks_.local(); + + Index grain_index = n1 - n * gn_; + return blocks.block(internal::convert_index(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index? + } else { + return packed_rhs_[k % (P - 1)][n1]; + } + } + + // In following two methods (pack_lhs and pack_rhs), if we know for sure + // that we'll be able to immediately call a kernel with packed data, and do + // not submit it to the thread pool, we can use thread local memory for + // packed data. + // + // We can only reliably check it if we are running all kernels in sync mode + // (parallelize only by sharding dim). If kernel for m==0 (n==0) is ready to + // run, it's guaranteed that all kernels with larger values of m (n) are + // also ready, because we execute them in the same order for all K slices. + void pack_lhs(Index m, Index k) { + bool use_thread_local = false; + + if (parallelize_by_sharding_dim_only_ && !shard_by_col_ && + can_use_thread_local_packed_[m].load(std::memory_order_relaxed)) { + if (state_kernel_[k % P][m][0].load(std::memory_order_relaxed) == 1) { + use_thread_local = true; + } else { + // If we can't guarantee that all kernels in `k` slice will be + // executed sequentially in current thread, it's no longer safe to use + // thread local memory in following slices along the k dimensions. + eigen_assert(k > 0); + can_use_thread_local_packed_[m].store(false, + std::memory_order_relaxed); + } + } + const Index mend = m * gm_ + gm(m); for (Index m1 = m * gm_; m1 < mend; m1++) - LhsPacker()(packed_lhs_[k % (P - 1)][m1], - lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); + kernel_.packLhs(&packed_lhs(m, k, m1, use_thread_local), + lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); if (!parallel_pack_ && shard_by_col_) { + assert(!use_thread_local); signal_packing(k); } else { signal_switch(k + 1); - for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0); + for (Index n = nn_ - 1; n >= 0; n--) { + bool sync = parallelize_by_sharding_dim_only_ || n == 0; + signal_kernel(m, n, k, sync, use_thread_local); + } } } void pack_rhs(Index n, Index k) { + bool use_thread_local = false; + + if (parallelize_by_sharding_dim_only_ && shard_by_col_ && + can_use_thread_local_packed_[n].load(std::memory_order_relaxed)) { + if (state_kernel_[k % P][0][n].load(std::memory_order_relaxed) == 1) { + use_thread_local = true; + } else { + // If we can't guarantee that all kernels in `k` slice will be + // executed sequentially in current thread, it's no longer safe to use + // thread local memory in followig slices along the k dimensions. + eigen_assert(k > 0); + can_use_thread_local_packed_[n].store(false, + std::memory_order_relaxed); + } + } + const Index nend = n * gn_ + gn(n); for (Index n1 = n * gn_; n1 < nend; n1++) { - if (k == 0) { - // Zero the output memory in parallel. - // On 10000x2x10000 mm zeroing can easily take half of time. - // Zero (bn x m) row. Safe to do here because all kernels that will - // write to this memory depend on completion of this task. - // Note: don't call device_.memset() here. device_.memset() blocks on - // thread pool worker thread, which can lead to underutilization and - // deadlocks. + if (!TensorContractionKernel::HasBeta && k == 0) { + // Zero the output memory in parallel, only if contraction kernel does + // not support `beta`. Otherwise we will pass beta 0.0 to the first + // call to the `TensorContractionKernel::invoke()`. + // + // On 10000x2x10000 mm zeroing can easily take half of time. Zero (bn + // x m) row. Safe to do here because all kernels that will write to + // this memory depend on completion of this task. Note: don't call + // device_.memset() here. device_.memset() blocks on thread pool + // worker thread, which can lead to underutilization and deadlocks. memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar)); } - RhsPacker()(packed_rhs_[k % (P - 1)][n1], - rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1)); + kernel_.packRhs(&packed_rhs(n, k, n1, use_thread_local), + rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1)); } if (parallel_pack_ || shard_by_col_) { signal_switch(k + 1); - for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0); + for (Index m = nm_ - 1; m >= 0; m--) { + bool sync = parallelize_by_sharding_dim_only_ || m == 0; + signal_kernel(m, n, k, sync, use_thread_local); + } } else { + assert(!use_thread_local); signal_packing(k); } } - void kernel(Index m, Index n, Index k) { + void kernel(Index m, Index n, Index k, bool use_thread_local) { // Note: order of iteration matters here. Iteration over m is innermost - // because we want to reuse the same packed rhs in consequetive tasks + // because we want to reuse the same packed rhs in consecutive tasks // (rhs fits into L2$ while lhs only into L3$). const Index nend = n * gn_ + gn(n); const Index mend = m * gm_ + gm(m); + + // NOTE: output = alpha * LHS * RHS + beta * output. + const Scalar alpha = Scalar(1); + const Scalar beta = + (TensorContractionKernel::HasBeta && k == 0) ? Scalar(0) : Scalar(1); + if (shard_by_col_) { for (Index n1 = n * gn_; n1 < nend; n1++) { - for (Index m1 = m * gm_; m1 < mend; m1++) - GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), - packed_lhs_[k % (P - 1)][m1], - packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), - Scalar(1), -1, -1, 0, 0); + for (Index m1 = m * gm_; m1 < mend; m1++) { + const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_); + kernel_.invoke( + output_mapper, + packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local), + packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1), + bk(k), bn(n1), alpha, beta); + + // We are done with the last task for the [m1, n1] block. + if (k + 1 == nk_) { + output_kernel_(output_mapper, tensor_contraction_params_, + m1 * bm_, n1 * bn_, bm(m1), bn(n1)); + } + } } } else { for (Index m1 = m * gm_; m1 < mend; m1++) for (Index n1 = n * gn_; n1 < nend; n1++) { - GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), - packed_lhs_[k % (P - 1)][m1], - packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), - Scalar(1), -1, -1, 0, 0); + const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_); + kernel_.invoke( + output_mapper, + packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local), + packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1), + bk(k), bn(n1), alpha, beta); + + // We are done with the last task for the [m1, n1] block. + if (k + 1 == nk_) { + output_kernel_(output_mapper, tensor_contraction_params_, + m1 * bm_, n1 * bn_, bm(m1), bn(n1)); + } } } - signal_kernel(m, n, k + 1, false); + signal_kernel(m, n, k + 1, /*sync=*/false, /*use_thread_local=*/false); signal_switch(k + 2); } @@ -545,16 +991,23 @@ struct TensorEvaluator* state = &state_kernel_[k % P][m][n]; Index s = state->load(); eigen_assert(s > 0); - if (s != 1 && state->fetch_sub(1) != 1) return; + if (s != 1 && state->fetch_sub(1) != 1) { + eigen_assert(!use_thread_local); + return; + } state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed); - if (sync) - kernel(m, n, k); - else - device_.enqueueNoNotification([=]() { kernel(m, n, k); }); + if (sync) { + kernel(m, n, k, use_thread_local); + } else { + eigen_assert(!use_thread_local); + device_.enqueueNoNotification( + [=]() { kernel(m, n, k, use_thread_local); }); + } } void signal_switch(Index k, Index v = 1) { @@ -604,11 +1057,32 @@ struct TensorEvaluator 1) { + Index mid = (start + end) / 2; + device_.enqueueNoNotification( + [=]() { enqueue_packing_helper(mid, end, k, rhs); }); + end = mid; + } + + // Decide if we want to run first packing task (start == 0) in + // async mode if we parallelize only by sharding dim: + // (1) pack_lhs and pack_rhs call signal_switch before completing + // all calls to signal_kernel, which in sync mode might lead + // to the execution of the first kernel of the k+1 slice, before + // completing a call to the last kernel of the k slice. + // (2) all pack tasks for sharded dim must be executed in a thread + // pool to get pre-allocated thead local buffers. + bool pack_async = + (start == 0) && + (parallelize_by_sharding_dim_only_&& shard_by_col_ == rhs) && + (k > 0 || std::this_thread::get_id() == created_by_thread_id_); + + if (pack_async) { + device_.enqueueNoNotification( + [=]() { enqueue_packing_helper(start, end, k, rhs); }); + } else { + enqueue_packing_helper(start, end, k, rhs); + } } } @@ -620,10 +1094,364 @@ struct TensorEvaluator + using SyncEvalParallelContext = + EvalParallelContext; + + // ------------------------------------------------------------------------ // + + // EvalShardedByInnerDimContext orchestrates sync/async contraction + // evaluation, when we shard by inner dimension. When it is executed in + // asynchronous mode, it owns all the shared state that might be accessible by + // block processing tasks. + + template + struct EvalShardedByInnerDimContext { + EvalShardedByInnerDimContext(const Self* self, int num_threads, + Scalar* result_buffer, + Index m_size, Index n_size, Index k_size, + DoneCallback done_callback) + : evaluator(self), + m_lhs_inner_dim_contiguous(evaluator->m_lhs_inner_dim_contiguous), + m_rhs_inner_dim_contiguous(evaluator->m_rhs_inner_dim_contiguous), + m_rhs_inner_dim_reordered(evaluator->m_rhs_inner_dim_reordered), + result(result_buffer), + m(m_size), + n(n_size), + k(k_size), + done(std::move(done_callback)), + buffer_size_bytes(m * n * sizeof(Scalar)), + block_size(blockSize(k, num_threads)), + num_blocks(divup(k, block_size)), + num_pending_blocks(internal::convert_index(num_blocks)), + l0_ranges(divup(num_blocks, l0_size)), + l0_state(l0_ranges), + block_buffers(num_blocks) { + // Keep count of pending gemm tasks for each l0 range. + for (int i = 0; i < l0_ranges; ++i) { + const Index num_pending_tasks = actualRangeSize(l0_ranges, l0_size, i); + l0_state.emplace_back(internal::convert_index(num_pending_tasks)); + } + + // Allocate temporary buffers for each block. + for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) { + Scalar* buf = block_idx == 0 + ? result + : static_cast(evaluator->m_device.allocate( + buffer_size_bytes)); + block_buffers.emplace_back(buf); + } + } + + ~EvalShardedByInnerDimContext() { + for (Index i = 1; i < num_blocks; ++i) { + evaluator->m_device.deallocate(block_buffers[i]); + } + } + + template + void run() { + Barrier barrier(internal::convert_index(num_blocks)); + eval(barrier, 0, num_blocks); + barrier.Wait(); + + // Aggregate partial sums from l0 ranges. + aggregateL0Blocks(); + + // Apply output kernel. + applyOutputKernel(); + } + + template + void runAsync() { + evalAsync(0, num_blocks); + } + + private: + // The underlying GEMM kernel assumes that k is a multiple of + // the packet size and subtle breakage occurs if this is violated. + static const Index packet_size = internal::packet_traits::size; + + const Self* evaluator; // TensorContraction evaluator + + // These fields required fromTENSOR_CONTRACTION_DISPATCH macro. + bool m_lhs_inner_dim_contiguous; + bool m_rhs_inner_dim_contiguous; + bool m_rhs_inner_dim_reordered; + + Scalar* result; + + Index m; + Index n; + Index k; + + DoneCallback done; + + // ----------------------------------------------------------------------// + // Algorithm parameters. + + // We will compute partial results into the buffers of this size. + Index buffer_size_bytes; + + Index block_size; + Index num_blocks; + + // Keep track of pending tasks when evaluate in async mode. + std::atomic num_pending_blocks; + + // We compute partial gemm results in parallel, and to get the final result + // we need to add them all together. For the large number of threads (>= 48) + // this adds a very expensive sequential step at the end. + // + // We split the [0, num_blocks) into small ranges, and when a task for the + // block finishes its partial gemm computation, it checks if it was the last + // gemm in the range, and if so, it will add all blocks of the range. + // + // After all tasks done, we need to add only these pre-aggregated blocks. + + // For now we use just a single level of ranges to compute pre-aggregated + // partial sums, but in general we can use more layers to compute tree + // aggregation in parallel and reduce the size of the sequential step. + // + // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make + // sense only if number of threads >= ~128? + static const Index l0_size = 4; + Index l0_ranges; + + // Keep count of pending gemm tasks for each l0 range. + MaxSizeVector> l0_state; // [0, l0_ranges) + + // Buffers allocated for each temporary block computation. + MaxSizeVector block_buffers; // [0, num_blocks) + + template + void processBlock(Index block_idx, Index begin, Index end) { + Scalar* buf = block_buffers[block_idx]; + + TENSOR_CONTRACTION_DISPATCH( + evaluator->template evalGemmPartialWithoutOutputKernel, Alignment, + (buf, begin, end, + /*num_threads=*/internal::convert_index(num_blocks))); + + // Check if it was the last task in l0 range. + const Index l0_index = block_idx / l0_size; + const int v = l0_state[l0_index].fetch_sub(1); + eigen_assert(v >= 1); + + // If we processed the last block of the range, we can aggregate all + // partial results into the first block of the range. + if (v == 1) { + const Index rng_size = actualRangeSize(l0_ranges, l0_size, l0_index); + const Index dst_block_idx = l0_index * l0_size; + + if (rng_size == l0_size) { + addAllToBuffer( + m * n, + /*src_buf0=*/block_buffers[dst_block_idx + 1], + /*src_buf1=*/block_buffers[dst_block_idx + 2], + /*src_buf2=*/block_buffers[dst_block_idx + 3], + /*dst_buf= */ block_buffers[dst_block_idx]); + } else { + // Aggregate blocks of potentially incomplete last range. + for (int i = 1; i < rng_size; ++i) { + addToBuffer(m * n, + /*src_buf=*/block_buffers[dst_block_idx + i], + /*dst_buf=*/block_buffers[dst_block_idx]); + } + } + } + } + + // Aggregate partial sums from l0 ranges. + template + void aggregateL0Blocks() const { + Index l0_index = 1; + + for (; l0_index + 2 < l0_ranges; l0_index += 3) { + addAllToBuffer( + m * n, + /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size], + /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size], + /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size], + /*dst_buf= */ block_buffers[0]); + } + + for (; l0_index < l0_ranges; ++l0_index) { + addToBuffer(m * n, block_buffers[l0_index * l0_size], + block_buffers[0]); + } + } + + void applyOutputKernel() const { + typedef internal::blas_data_mapper OutputMapper; + evaluator->m_output_kernel( + OutputMapper(result, m), evaluator->m_tensor_contraction_params, + static_cast(0), static_cast(0), m, n); + } + + // Compute block size with accounting for potentially incomplete last block. + Index actualBlockSize(Index block_idx) const { + return block_idx + 1 < num_blocks + ? block_size + : k + block_size - block_size * num_blocks; + }; + + // Compute range size with accounting for potentially incomplete last range. + Index actualRangeSize(Index num_ranges, Index range_size, + Index range_idx) const { + eigen_assert(range_idx < num_ranges); + return range_idx + 1 < num_ranges + ? range_size + : num_blocks + range_size - range_size * num_ranges; + }; + + template + EIGEN_STRONG_INLINE static void addToBuffer(size_t n, const Scalar* src_buf, + Scalar* tgt_buf) { + const int output_packet_size = + internal::unpacket_traits::size; + size_t i = 0; + const size_t num_packets = n / output_packet_size; + for (; i < output_packet_size * num_packets; i += output_packet_size) { + const PacketReturnType src_val = + internal::pload(src_buf + i); + const PacketReturnType tgt_val = + internal::ploadt(tgt_buf + i); + const PacketReturnType sum = internal::padd(src_val, tgt_val); + internal::pstoret(tgt_buf + i, + sum); + } + for (; i < n; ++i) { + tgt_buf[i] += src_buf[i]; + } + } + + template + EIGEN_STRONG_INLINE static void addAllToBuffer(size_t n, + const Scalar* src_buf0, + const Scalar* src_buf1, + const Scalar* src_buf2, + Scalar* dst_buf) { + using ::Eigen::internal::padd; + using ::Eigen::internal::pload; + using ::Eigen::internal::ploadt; + using ::Eigen::internal::pstoret; + + const int output_packet_size = + internal::unpacket_traits::size; + + size_t i = 0; + const size_t num_packets = n / output_packet_size; + for (; i < output_packet_size * num_packets; i += output_packet_size) { + const auto src_val0 = pload(src_buf0 + i); + const auto src_val1 = pload(src_buf1 + i); + const auto src_val2 = pload(src_buf2 + i); + + const auto dst_val = ploadt(dst_buf + i); + const auto sum = + padd(padd(dst_val, src_val0), padd(src_val1, src_val2)); + + pstoret(dst_buf + i, sum); + } + for (; i < n; ++i) { + dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i]; + } + } + + template + void eval(Barrier& barrier, Index start_block_idx, Index end_block_idx) { + while (end_block_idx - start_block_idx > 1) { + Index mid_block_idx = (start_block_idx + end_block_idx) / 2; + evaluator->m_device.enqueueNoNotification( + [this, &barrier, mid_block_idx, end_block_idx]() { + eval(barrier, mid_block_idx, end_block_idx); + }); + end_block_idx = mid_block_idx; + } + + Index block_idx = start_block_idx; + Index block_start = block_idx * block_size; + Index block_end = block_start + actualBlockSize(block_idx); + + processBlock(block_idx, block_start, block_end); + barrier.Notify(); + } + + template + void evalAsync(Index start_block_idx, Index end_block_idx) { + while (end_block_idx - start_block_idx > 1) { + Index mid_block_idx = (start_block_idx + end_block_idx) / 2; + evaluator->m_device.enqueueNoNotification( + [this, mid_block_idx, end_block_idx]() { + evalAsync(mid_block_idx, end_block_idx); + }); + end_block_idx = mid_block_idx; + } + + Index block_idx = start_block_idx; + + Index block_start = block_idx * block_size; + Index block_end = block_start + actualBlockSize(block_idx); + + processBlock(block_idx, block_start, block_end); + + int v = num_pending_blocks.fetch_sub(1); + eigen_assert(v >= 1); + + if (v == 1) { + // Aggregate partial sums from l0 ranges. + aggregateL0Blocks(); + + // Apply output kernel. + applyOutputKernel(); + + // NOTE: If we call `done` callback before deleting this (context), + // it might deallocate Self* pointer captured by context, and we'll + // fail in destructor trying to deallocate temporary buffers. + + // Move done call back from context before it will be destructed. + DoneCallback done_copy = std::move(done); + + // We are confident that we are the last one who touches context. + delete this; + + // Now safely call the done callback. + done_copy(); + } + } + + // Cost model doesn't capture well the cost associated with constructing + // tensor contraction mappers and computing loop bounds in gemm_pack_lhs + // and gemm_pack_rhs, so we specify minimum desired block size. + static Index blockSize(Index k, int num_threads) { + const auto round_up = [=](Index index) -> Index { + const Index kmultiple = packet_size <= 8 ? 8 : packet_size; + return divup(index, kmultiple) * kmultiple; + }; + + const Index target_block_size = round_up(divup(k, num_threads)); + const Index desired_min_block_size = 12 * packet_size; + + return numext::mini( + k, numext::maxi(desired_min_block_size, target_block_size)); + } + + EvalShardedByInnerDimContext(const EvalShardedByInnerDimContext&) = delete; + void operator=(const EvalShardedByInnerDimContext&) = delete; }; + // ------------------------------------------------------------------------ // + + // Below are the function used by evalProductImpl heuristics, trying to select + // optimcal parameters for parallelization algorithm. + // Decide whether we want to shard m x n contraction by columns or by rows. static bool shardByCol(Index m, Index n, Index num_threads) { // Note: we are comparing both n and m against Traits::nr, it is not @@ -727,304 +1555,15 @@ struct TensorEvaluator - void evalProduct(Scalar* buffer) const { - if (this->m_j_size == 1) { - this->template evalGemv(buffer); - return; - } - - evalGemm(buffer); - } - - template - void evalGemm(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - - const int lhs_packet_size = internal::unpacket_traits::size; - const int rhs_packet_size = internal::unpacket_traits::size; - - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - - typedef internal::blas_data_mapper OutputMapper; - - // TODO: packing could be faster sometimes if we supported row major tensor mappers - typedef internal::gemm_pack_lhs LhsPacker; - typedef internal::gemm_pack_rhs RhsPacker; - - // TODO: replace false, false with conjugate values? - typedef internal::gebp_kernel GebpKernel; - - typedef internal::packLhsArg packLArg; - typedef internal::packRhsAndKernelArg packRKArg; - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - - // compute block sizes (which depend on number of threads) - const Index num_threads = this->m_device.numThreads(); - internal::TensorContractionBlocking blocking(k, m, n, num_threads); - Index mc = blocking.mc(); - Index nc = blocking.nc(); - Index kc = blocking.kc(); - eigen_assert(mc <= m); - eigen_assert(nc <= n); - eigen_assert(kc <= k); - -#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) - const Index k_blocks = CEIL_DIV(k, kc); - const Index n_blocks = CEIL_DIV(n, nc); - const Index m_blocks = CEIL_DIV(m, mc); - const Index sizeA = mc * kc; - const Index sizeB = kc * nc; - - /* cout << "m: " << m << " n: " << n << " k: " << k << endl; - cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl; - cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl; - cout << "num threads: " << num_threads << endl; - */ - - // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB - // aren't 16 byte aligned segfaults will happen due to SIMD instructions - // note: You can get away with allocating just a single blockA and offsets and meet the - // the alignment requirements with the assumption that - // (Traits::mr * sizeof(ResScalar)) % 16 == 0 - const Index numBlockAs = numext::mini(num_threads, m_blocks); - MaxSizeVector blockAs(num_threads); - for (int i = 0; i < num_threads; i++) { - blockAs.push_back(static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar)))); - } - - // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread - // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful. - // Other options: (1) reuse memory when a thread finishes. con: tricky - // (2) allocate block B memory in each thread. con: overhead - MaxSizeVector blockBs(n_blocks); - for (int i = 0; i < n_blocks; i++) { - blockBs.push_back(static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar)))); - } - - // lhs_notifications starts with all null Notifications - MaxSizeVector lhs_notifications(num_threads, nullptr); - - // this should really be numBlockAs * n_blocks; - const Index num_kernel_notifications = num_threads * n_blocks; - MaxSizeVector kernel_notifications(num_kernel_notifications, - nullptr); - - for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { - const Index k_start = k_block_idx * kc; - // make sure we don't overshoot right edge of left matrix - const Index actual_kc = numext::mini(k_start + kc, k) - k_start; - - for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) { - const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs); - - for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) { - const Index m_start = mt_block_idx * mc; - const Index actual_mc = numext::mini(m_start + mc, m) - m_start; - eigen_assert(actual_mc > 0); - - Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads; - - for (int i = 0; i < n_blocks; ++i) { - Index notification_id = (blockAId * n_blocks + i); - // Wait for any current kernels using this slot to complete - // before using it. - if (kernel_notifications[notification_id]) { - wait_until_ready(kernel_notifications[notification_id]); - delete kernel_notifications[notification_id]; - } - kernel_notifications[notification_id] = new Notification(); - } - const packLArg arg = { - blockAs[blockAId], // blockA - lhs, // lhs - m_start, // m - k_start, // k - actual_mc, // mc - actual_kc, // kc - }; - - // Delete any existing notification since we may be - // replacing it. The algorithm should ensure that there are - // no existing waiters on this notification. - delete lhs_notifications[blockAId]; - lhs_notifications[blockAId] = - this->m_device.enqueue(&Self::packLhs, arg); - } - - // now start kernels. - const Index m_base_start = m_block_idx * mc; - const bool need_to_pack = m_block_idx == 0; - - for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) { - const Index n_start = n_block_idx * nc; - const Index actual_nc = numext::mini(n_start + nc, n) - n_start; - - // first make sure the previous kernels are all done before overwriting rhs. Also wait if - // we're going to start new k. In both cases need_to_pack is true. - if (need_to_pack) { - for (Index i = num_blocks; i < num_threads; ++i) { - Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads; - Index future_id = (blockAId * n_blocks + n_block_idx); - wait_until_ready(kernel_notifications[future_id]); - } - } - - packRKArg arg = { - &blockAs, // blockA - blockBs[n_block_idx], // blockB - rhs, // rhs - output, // output - m_base_start, // m - k_start, // k - n_start, // n - mc, // mc - actual_kc, // kc - actual_nc, // nc - num_threads, - numBlockAs, - m, - k_block_idx, - m_block_idx, - n_block_idx, // n_block_idx - m_blocks, // m_blocks - n_blocks, // n_blocks - &kernel_notifications, // kernel notifications - &lhs_notifications, // lhs notifications - need_to_pack, // need_to_pack - }; - - // We asynchronously kick off this function, which ends up - // notifying the appropriate kernel_notifications objects, - // which this thread waits on before exiting. - this->m_device.enqueueNoNotification(&Self::packRhsAndKernel, arg); - } - } - } - - // Make sure all the kernels are done. - for (size_t i = 0; i < kernel_notifications.size(); ++i) { - wait_until_ready(kernel_notifications[i]); - delete kernel_notifications[i]; - } - - // No need to wait for lhs notifications since they should have - // already been waited on. Just clean them up. - for (size_t i = 0; i < lhs_notifications.size(); ++i) { - delete lhs_notifications[i]; - } - - // deallocate all of the memory for both A and B's - for (size_t i = 0; i < blockAs.size(); i++) { - this->m_device.deallocate(blockAs[i]); - } - for (size_t i = 0; i < blockBs.size(); i++) { - this->m_device.deallocate(blockBs[i]); - } - -#undef CEIL_DIV - } - - /* - * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing - * the LHS block, check that all of the kernels that worked on the same - * mt_block_idx in the previous m_block are done. - */ - template - static void packLhs(const packLArg arg) { - // perform actual packing - LhsPacker pack_lhs; - pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc); - } - - /* - * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that - * all kernels in the previous block are done. - * Then for each LHS future, we wait on the future and then call GEBP - * on the area packed by the future (which starts at - * blockA + future_idx * mt * kc) on the LHS and with the full packed - * RHS block. - * The output of this GEBP is written to output(m + i * mt, n). - */ - template - static void packRhsAndKernel(packRKArg arg) { - if (arg.need_to_pack) { - RhsPacker pack_rhs; - pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc); - } - - GebpKernel gebp; - for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) { - const Index m_base_start = arg.m + arg.mc*mt_block_idx; - if (m_base_start < arg.max_m) { - Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads; - wait_until_ready((*arg.lhs_notifications)[blockAId]); - const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start; - gebp(arg.output.getSubMapper(m_base_start, arg.n), - (*arg.blockAs)[blockAId], arg.blockB, - actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0); - - // Notify that the kernel is done. - const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx; - (*arg.kernel_notifications)[set_idx]->Notify(); - } - } - } -#endif // EIGEN_USE_SIMPLE_THREAD_POOL - TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk, bool shard_by_col, bool prepacked) const { const int packed_size = std::min(PacketType::size, PacketType::size); const int output_packet_size = internal::unpacket_traits::size; const double kd = static_cast(bk); - // Peak VFMA bandwidth is 0.5. However if we have not enough data for - // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined - // experimentally. - double computeBandwidth = bk == 1 ? 4.0 : - (shard_by_col ? bn : bm) < Traits::nr || - (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5; -#ifndef EIGEN_VECTORIZE_FMA - // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors. - // However for MULPS/ADDPS we have dependent sequence of 2 such instructions, - // so overall bandwidth is 1.0. - if (computeBandwidth == 0.5) computeBandwidth = 1.0; -#endif + double compute_bandwidth = computeBandwidth(false, bm, bn, bk); // Computations. - TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size); + TensorOpCost cost = TensorOpCost(0, 0, kd * compute_bandwidth, true, packed_size); // Output stores. cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); if (prepacked) { @@ -1044,6 +1583,94 @@ struct TensorEvaluator l3CacheSize() / num_threads_by_k || // need more buffer space + // than L3 cache or... + k / num_threads_by_k < 2 * Traits::nr) { // k per thread is tiny. + shard_by_k = false; + } else if (numext::maxi(m, n) / num_threads < + Traits::nr || // both other dimensions are tiny or... + // k per thread is not small and... + (k / num_threads_by_k > 8 * Traits::nr && + // one of the outer dimensions is tiny or sharding by k offers + // more parallelism. + (numext::mini(m, n) < 2 * Traits::nr || + num_threads_by_k > num_threads))) { + shard_by_k = true; + } + return shard_by_k; + } + + TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const { + // Compute cost. + const int output_packet_size = internal::unpacket_traits::size; + TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n, true, output_packet_size); + // Output stores. + cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); + TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * m; + TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * n; + // Since the inner gemm kernel is always sharded by column, the lhs + // load cost is negligible. + lhsCost.dropMemoryCost(); + return cost + lhsCost + rhsCost; + } + + int numThreadsInnerDim(Index m, Index n, Index k) const { + const int output_packet_size = internal::unpacket_traits::size; + TensorOpCost cost = contractionCostPerInnerDim(m, n, k); + double total_parallel_cost = + TensorCostModel::totalCost(k, cost); + // Cost of reduction step accumulating the m*n per-thread buffers into the + // result. + double reduction_cost = TensorCostModel::totalCost( + m * n, TensorOpCost(2, 1, 1, true, output_packet_size)); + int num_threads = 1; + double min_cost = total_parallel_cost; + double kPerThreadOverHead = 3000; + double kFixedOverHead = 100000; + for (int nt = 2; nt <= this->m_device.numThreads(); nt += 2) { + double sequential_cost = + kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead); + double parallel_cost = total_parallel_cost / nt + sequential_cost; + if (parallel_cost < min_cost) { + num_threads = nt; + min_cost = parallel_cost; + } + } + return num_threads; + } + + double computeBandwidth(bool shard_by_col, Index bm, Index bn, + Index bk) const { + // Peak VFMA bandwidth is 0.5. However if we have not enough data for + // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined + // experimentally. + double computeBandwidth = + bk == 1 ? 4.0 + : (shard_by_col ? bn : bm) < Traits::nr || + (shard_by_col ? bm : bn) < Traits::mr + ? 2.0 + : 0.5; +#ifndef EIGEN_VECTORIZE_FMA + // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors. + // However for MULPS/ADDPS we have dependent sequence of 2 such + // instructions, + // so overall bandwidth is 1.0. + if (computeBandwidth == 0.5) computeBandwidth = 1.0; +#endif + return computeBandwidth; + } + }; } // end namespace Eigen diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 860a6949a..09d2da9a8 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -32,6 +32,7 @@ struct traits > static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; enum { Flags = 0 }; + typedef typename TypeConversion::PointerType>::type PointerType; }; template @@ -50,7 +51,10 @@ struct nested, 1, typename eval -struct PacketConverter { +struct PacketConverter; + +template +struct PacketConverter { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {} @@ -108,7 +112,33 @@ struct PacketConverter { }; template -struct PacketConverter { +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + + SrcPacket src1 = m_impl.template packet(index); + SrcPacket src2 = m_impl.template packet(index + 1 * SrcPacketSize); + SrcPacket src3 = m_impl.template packet(index + 2 * SrcPacketSize); + SrcPacket src4 = m_impl.template packet(index + 3 * SrcPacketSize); + SrcPacket src5 = m_impl.template packet(index + 4 * SrcPacketSize); + SrcPacket src6 = m_impl.template packet(index + 5 * SrcPacketSize); + SrcPacket src7 = m_impl.template packet(index + 6 * SrcPacketSize); + SrcPacket src8 = m_impl.template packet(index + 7 * SrcPacketSize); + TgtPacket result = internal::pcast(src1, src2, src3, src4, src5, src6, src7, src8); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + +template +struct PacketConverter { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {} @@ -128,6 +158,7 @@ struct PacketConverter { typedef typename internal::unpacket_traits::type TgtType; internal::scalar_cast_op converter; EIGEN_ALIGN_MAX typename internal::unpacket_traits::type values[TgtPacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < TgtPacketSize; ++i) { values[i] = converter(m_impl.coeff(index+i)); } @@ -163,19 +194,114 @@ class TensorConversionOp : public TensorBase struct ConversionSubExprEval { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) { +template struct ConversionSubExprEval { + static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) { impl.evalSubExprsIfNeeded(NULL); return true; } }; -template struct ConversionSubExprEval { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) { +template struct ConversionSubExprEval { + static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) { return impl.evalSubExprsIfNeeded(data); } }; +#ifdef EIGEN_USE_THREADS +template +struct ConversionSubExprEvalAsync { + static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType, EvalSubExprsCallback done) { + impl.evalSubExprsIfNeededAsync(nullptr, std::move(done)); + } +}; + +template +struct ConversionSubExprEvalAsync { + static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType data, EvalSubExprsCallback done) { + impl.evalSubExprsIfNeededAsync(data, std::move(done)); + } +}; +#endif + +namespace internal { + +template +struct CoeffConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator& impl, Index index) { + internal::scalar_cast_op converter; + return converter(impl.coeff(index)); + } +}; + +template +struct CoeffConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator& impl, Index index) { + return impl.coeff(index); + } +}; + +template +struct PacketConv { + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TargetType; + + static const int PacketSize = internal::unpacket_traits::size; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { + internal::scalar_cast_op converter; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = converter(impl.coeff(index+i)); + } + TargetPacket rslt = internal::pload(values); + return rslt; + } +}; + +template +struct PacketConv { + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TargetType; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { + const int SrcCoeffRatio = internal::type_casting_traits::SrcCoeffRatio; + const int TgtCoeffRatio = internal::type_casting_traits::TgtCoeffRatio; + PacketConverter, SrcPacket, TargetPacket, + SrcCoeffRatio, TgtCoeffRatio> converter(impl); + return converter.template packet(index); + } +}; + +template +struct PacketConv { + typedef typename internal::unpacket_traits::type TargetType; + static const int PacketSize = internal::unpacket_traits::size; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) values[i] = impl.coeff(index+i); + return internal::pload(values); + } +}; + +template +struct PacketConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { + return impl.template packet(index); + } +}; + +} // namespace internal // Eval as rvalue template @@ -189,44 +315,98 @@ struct TensorEvaluator, Device> typedef typename internal::remove_all::Scalar>::type SrcType; typedef typename PacketType::type PacketReturnType; typedef typename PacketType::type PacketSourceType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; + static const bool IsSameType = internal::is_same::value; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = false, - PacketAccess = true, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = false, + PacketAccess = + #ifndef EIGEN_USE_SYCL + true, + #else + TensorEvaluator::PacketAccess & + internal::type_casting_traits::VectorizedCast, + #endif + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + static const int NumDims = internal::array_size::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + ArgTensorBlock; + + struct TensorConversionOpBlockFactory { + template + struct XprType { + typedef TensorConversionOp type; + }; + + template + typename XprType::type expr(const ArgXprType& expr) const { + return typename XprType::type(expr); + } }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + typedef internal::TensorUnaryExprBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { - return ConversionSubExprEval::value, TensorEvaluator, Scalar>::run(m_impl, data); + return ConversionSubExprEval, EvaluatorPointerType>::run(m_impl, data); + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType data, EvalSubExprsCallback done) { + ConversionSubExprEvalAsync, + EvaluatorPointerType, + EvalSubExprsCallback>::run(m_impl, data, std::move(done)); } +#endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - internal::scalar_cast_op converter; - return converter(m_impl.coeff(index)); + return internal::CoeffConv::run(m_impl,index); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - const bool Vectorizable = TensorEvaluator::PacketAccess & - internal::type_casting_traits::VectorizedCast; - return PacketConv::run(m_impl, index); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType + packet(Index index) const { + // If we are not going to do the cast, we just need to check that base + // TensorEvaluator has packet access. Otherwise we also need to make sure, + // that we have an implementation of vectorized cast. + const bool Vectorizable = + IsSameType + ? TensorEvaluator::PacketAccess + : int(TensorEvaluator::PacketAccess) & + int(internal::type_casting_traits::VectorizedCast); + + return internal::PacketConv::run(m_impl, index); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost @@ -244,33 +424,30 @@ struct TensorEvaluator, Device> } } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return m_impl.getResourceRequirements(); + } - protected: - template - struct PacketConv { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator& impl, Index index) { - internal::scalar_cast_op converter; - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = converter(impl.coeff(index+i)); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - }; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + return TensorBlock(m_impl.block(desc, scratch), + TensorConversionOpBlockFactory()); + } - template - struct PacketConv { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator& impl, Index index) { - const int SrcCoeffRatio = internal::type_casting_traits::SrcCoeffRatio; - const int TgtCoeffRatio = internal::type_casting_traits::TgtCoeffRatio; - PacketConverter, PacketSourceType, PacketReturnType, - SrcCoeffRatio, TgtCoeffRatio> converter(impl); - return converter.template packet(index); - } - }; + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + /// required by sycl in order to extract the sycl accessor + const TensorEvaluator& impl() const { return m_impl; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + protected: TensorEvaluator m_impl; }; diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index abdf742c6..b20f80ba2 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -54,8 +54,8 @@ class IndexMapper { } } - array cudaInputDimensions; - array cudaOutputDimensions; + array gpuInputDimensions; + array gpuOutputDimensions; array tmp = dimensions; array ordering; const size_t offset = static_cast(Layout) == static_cast(ColMajor) @@ -65,8 +65,8 @@ class IndexMapper { const Index index = i + offset; ordering[index] = indices[i]; tmp[indices[i]] = -1; - cudaInputDimensions[index] = input_dims[indices[i]]; - cudaOutputDimensions[index] = dimensions[indices[i]]; + gpuInputDimensions[index] = input_dims[indices[i]]; + gpuOutputDimensions[index] = dimensions[indices[i]]; } int written = static_cast(Layout) == static_cast(ColMajor) @@ -75,8 +75,8 @@ class IndexMapper { for (int i = 0; i < NumDims; ++i) { if (tmp[i] >= 0) { ordering[written] = i; - cudaInputDimensions[written] = input_dims[i]; - cudaOutputDimensions[written] = dimensions[i]; + gpuInputDimensions[written] = input_dims[i]; + gpuOutputDimensions[written] = dimensions[i]; ++written; } } @@ -89,37 +89,37 @@ class IndexMapper { if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = 0; i < NumDims; ++i) { if (i > NumKernelDims) { - m_cudaInputStrides[i] = - m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1]; - m_cudaOutputStrides[i] = - m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1]; + m_gpuInputStrides[i] = + m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1]; + m_gpuOutputStrides[i] = + m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1]; } else { - m_cudaInputStrides[i] = 1; - m_cudaOutputStrides[i] = 1; + m_gpuInputStrides[i] = 1; + m_gpuOutputStrides[i] = 1; } } } else { for (int i = NumDims - 1; i >= 0; --i) { - if (i + 1 < offset) { - m_cudaInputStrides[i] = - m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1]; - m_cudaOutputStrides[i] = - m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1]; + if (static_cast(i + 1) < offset) { + m_gpuInputStrides[i] = + m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1]; + m_gpuOutputStrides[i] = + m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1]; } else { - m_cudaInputStrides[i] = 1; - m_cudaOutputStrides[i] = 1; + m_gpuInputStrides[i] = 1; + m_gpuOutputStrides[i] = 1; } } } } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const { Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_cudaInputStrides[d]; + const Index idx = p / m_gpuInputStrides[d]; inputIndex += idx * m_inputStrides[d]; - p -= idx * m_cudaInputStrides[d]; + p -= idx * m_gpuInputStrides[d]; } inputIndex += p * m_inputStrides[NumKernelDims]; } else { @@ -128,22 +128,22 @@ class IndexMapper { limit = NumDims - NumKernelDims - 1; } for (int d = 0; d < limit; ++d) { - const Index idx = p / m_cudaInputStrides[d]; + const Index idx = p / m_gpuInputStrides[d]; inputIndex += idx * m_inputStrides[d]; - p -= idx * m_cudaInputStrides[d]; + p -= idx * m_gpuInputStrides[d]; } inputIndex += p * m_inputStrides[limit]; } return inputIndex; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const { Index outputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_cudaOutputStrides[d]; + const Index idx = p / m_gpuOutputStrides[d]; outputIndex += idx * m_outputStrides[d]; - p -= idx * m_cudaOutputStrides[d]; + p -= idx * m_gpuOutputStrides[d]; } outputIndex += p * m_outputStrides[NumKernelDims]; } else { @@ -152,44 +152,44 @@ class IndexMapper { limit = NumDims - NumKernelDims - 1; } for (int d = 0; d < limit; ++d) { - const Index idx = p / m_cudaOutputStrides[d]; + const Index idx = p / m_gpuOutputStrides[d]; outputIndex += idx * m_outputStrides[d]; - p -= idx * m_cudaOutputStrides[d]; + p -= idx * m_gpuOutputStrides[d]; } outputIndex += p * m_outputStrides[limit]; } return outputIndex; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; return i * m_inputStrides[offset]; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; return i * m_outputStrides[offset]; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; @@ -197,7 +197,7 @@ class IndexMapper { k * m_inputStrides[offset + 2]; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; @@ -209,8 +209,8 @@ class IndexMapper { static const int NumDims = internal::array_size::value; array m_inputStrides; array m_outputStrides; - array m_cudaInputStrides; - array m_cudaOutputStrides; + array m_gpuInputStrides; + array m_gpuOutputStrides; }; @@ -231,6 +231,8 @@ struct traits > typedef typename remove_reference::type _RhsNested; static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; + typedef typename conditional::val, + typename traits::PointerType, typename traits::PointerType>::type PointerType; enum { Flags = 0 @@ -300,17 +302,25 @@ struct TensorEvaluator::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + IsAligned = int(TensorEvaluator::IsAligned) & int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & int(TensorEvaluator::PacketAccess), + BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) { EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -374,12 +384,12 @@ struct TensorEvaluator EvalTo; EvalTo evalToTmp(local, m_kernelArg); - const bool PacketAccess = internal::IsVectorizable::value; - internal::TensorExecutor::run(evalToTmp, m_device); + const bool Vectorize = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); m_kernel = local; m_local_kernel = true; @@ -544,14 +554,14 @@ struct TensorEvaluator struct GetKernelSize { @@ -568,13 +578,17 @@ struct GetKernelSize { template -__global__ void EigenConvolutionKernel1D( +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D( InputEvaluator eval, const internal::IndexMapper indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, float* buffer) { +#if defined(EIGEN_HIPCC) + HIP_DYNAMIC_SHARED(float, s) +#else extern __shared__ float s[]; +#endif const int first_x = blockIdx.x * maxX; const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; @@ -586,18 +600,18 @@ __global__ void EigenConvolutionKernel1D( for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { // Load inputs to shared memory - const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); const int plane_kernel_offset = threadIdx.y * num_x_input; #pragma unroll for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x); + const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x); s[i + plane_kernel_offset] = eval.coeff(tensor_index); } __syncthreads(); // Compute the convolution - const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); #pragma unroll for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { @@ -607,7 +621,7 @@ __global__ void EigenConvolutionKernel1D( for (int k = 0; k < GetKernelSize()(kernelSize); ++k) { result += s[k + kernel_offset] * kernel[k]; } - const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x); + const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x); buffer[tensor_index] = result; } __syncthreads(); @@ -616,14 +630,18 @@ __global__ void EigenConvolutionKernel1D( template -__global__ void EigenConvolutionKernel2D( +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D( InputEvaluator eval, const internal::IndexMapper indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) { +#if defined(EIGEN_HIPCC) + HIP_DYNAMIC_SHARED(float, s) +#else extern __shared__ float s[]; +#endif const int first_x = blockIdx.x * maxX; const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; @@ -640,7 +658,7 @@ __global__ void EigenConvolutionKernel2D( for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { - const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); const int plane_kernel_offset = threadIdx.z * num_y_input; // Load inputs to shared memory @@ -649,7 +667,7 @@ __global__ void EigenConvolutionKernel2D( const int input_offset = num_x_input * (j + plane_kernel_offset); #pragma unroll for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y); + const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y); s[i + input_offset] = eval.coeff(tensor_index); } } @@ -657,7 +675,7 @@ __global__ void EigenConvolutionKernel2D( __syncthreads(); // Convolution - const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); #pragma unroll for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { @@ -673,7 +691,7 @@ __global__ void EigenConvolutionKernel2D( result += s[k + input_offset] * kernel[k + kernel_offset]; } } - const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y); + const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y); buffer[tensor_index] = result; } } @@ -683,7 +701,7 @@ __global__ void EigenConvolutionKernel2D( }; template -__global__ void EigenConvolutionKernel3D( +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D( InputEvaluator eval, const internal::IndexMapper indexMapper, @@ -691,7 +709,11 @@ __global__ void EigenConvolutionKernel3D( const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t kernelSizeZ, float* buffer) { +#if defined(EIGEN_HIPCC) + HIP_DYNAMIC_SHARED(float, s) +#else extern __shared__ float s[]; +#endif // Load inputs to shared memory const int first_x = blockIdx.x * maxX; @@ -708,13 +730,13 @@ __global__ void EigenConvolutionKernel3D( for (int p = 0; p < numPlanes; ++p) { - const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); const int plane_kernel_offset = 0; for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); + const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); } } @@ -726,7 +748,7 @@ __global__ void EigenConvolutionKernel3D( const int num_z_output = last_z - first_z + 1; const int num_y_output = last_y - first_y + 1; const int num_x_output = last_x - first_x + 1; - const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { @@ -739,7 +761,7 @@ __global__ void EigenConvolutionKernel3D( } } } - const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); + const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); buffer[tensor_index] = result; } } @@ -764,13 +786,19 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) - : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType& op, const GpuDevice& device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) { EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -852,9 +880,9 @@ struct TensorEvaluator::Dimensions InputDims; const int maxSharedMem = m_device.sharedMemPerBlock(); - const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock(); - const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock; - const int numMultiProcessors = m_device.getNumCudaMultiProcessors(); + const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock(); + const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock; + const int numMultiProcessors = m_device.getNumGpuMultiProcessors(); const int warpSize = 32; switch (NumKernelDims) { @@ -889,7 +917,7 @@ struct TensorEvaluator, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); break; } case 7: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); break; } default: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); } } break; @@ -946,7 +974,7 @@ struct TensorEvaluator(1024/(block_size.x*block_size.y), maxP); const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); - assert(shared_mem <= maxSharedMem); + gpu_assert(shared_mem <= maxSharedMem); const int num_x_blocks = ceil(numX, maxX); const int num_y_blocks = ceil(numY, maxY); @@ -967,11 +995,11 @@ struct TensorEvaluator, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); break; } default: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); break; } } @@ -980,18 +1008,18 @@ struct TensorEvaluator, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); break; } default: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); break; } } break; } default: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); break; } } @@ -1026,7 +1054,7 @@ struct TensorEvaluator indices(m_indices[idxX], m_indices[idxY], @@ -1037,7 +1065,7 @@ struct TensorEvaluator indexMapper( m_inputImpl.dimensions(), kernel_dims, indices); - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); break; } diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h new file mode 100644 index 000000000..033318fdc --- /dev/null +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -0,0 +1,544 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Copyright (C) 2016 Benoit Steiner + +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. + * + * + */ + +enum class convolution_type { CONV1D, CONV2D, CONV3D }; +template +struct EigenConvolutionKernel; +template +struct EigenConvolutionKernel { + typedef cl::sycl::accessor + Local_accessor; + Local_accessor local_acc; + Evaluator device_evaluator; + Kernel_accessor kernel_filter; + Buffer_accessor buffer_acc; + internal::IndexMapper indexMapper; + const size_t kernelSize; + const cl::sycl::range<2> input_range; + EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, + Buffer_accessor buffer_acc_, + internal::IndexMapper indexMapper_, + const size_t kernelSize_, const cl::sycl::range<2> input_range_) + : local_acc(local_acc_), + device_evaluator(device_evaluator_), + kernel_filter(kernel_filter_), + buffer_acc(buffer_acc_), + indexMapper(indexMapper_), + kernelSize(kernelSize_), + input_range(input_range_) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) { + return (boolean_check[0] && boolean_check[1]); + } + void operator()(cl::sycl::nd_item<2> itemID) { + auto buffer_ptr = buffer_acc.get_pointer(); + auto kernel_ptr = kernel_filter.get_pointer(); + // the required row to be calculated for the for each plane in shered memory + const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1); + const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input; + const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0]; + const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1)); + /// fill the shared memory + for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) { + const size_t local_index = i + plane_kernel_offset; + const size_t tensor_index = + plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset); + + local_acc[local_index] = + (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1]) + ? device_evaluator.coeff(tensor_index) + : CoeffReturnType(0); + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // calculate the convolution // output start x + const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]); + if (boundary_check(itemID.get_global_id() < input_range)) { + CoeffReturnType result = static_cast(0); + const size_t index = plane_kernel_offset + itemID.get_local_id(0); + for (size_t k = 0; k < kernelSize; ++k) { + result += (local_acc[k + index] * kernel_ptr[k]); + } + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start); + buffer_ptr[tensor_index] = result; + } + } +}; + +template +struct EigenConvolutionKernel { + typedef cl::sycl::accessor + Local_accessor; + Local_accessor local_acc; + Evaluator device_evaluator; + Kernel_accessor kernel_filter; + Buffer_accessor buffer_acc; + internal::IndexMapper indexMapper; + const cl::sycl::range<2> kernel_size; + const cl::sycl::range<3> input_range; + EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, + Buffer_accessor buffer_acc_, + internal::IndexMapper indexMapper_, + const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_) + : local_acc(local_acc_), + device_evaluator(device_evaluator_), + kernel_filter(kernel_filter_), + buffer_acc(buffer_acc_), + indexMapper(indexMapper_), + kernel_size(kernel_size_), + input_range(input_range_) {} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) { + return (boolean_check[0] && boolean_check[1] && boolean_check[2]); + } + + void operator()(cl::sycl::nd_item<3> itemID) { + auto buffer_ptr = buffer_acc.get_pointer(); + auto kernel_ptr = kernel_filter.get_pointer(); + // the required row to be calculated for the for each plane in shered memory + const auto num_input = cl::sycl::range<2>{ + (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)}; + + const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2)); + const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1]; + + const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0], + itemID.get_group(1) * itemID.get_local_range()[1]}; + + // fill the local memory + bool in_range_dim2 = itemID.get_global_id(2) < input_range[2]; + for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) { + const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset); + bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1)); + for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) { + const size_t local_index = i + local_input_offset; + const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset( + i + input_offset[0], j + input_offset[1]); + local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) && + in_range_dim1 && in_range_dim2) + ? device_evaluator.coeff(tensor_index) + : CoeffReturnType(0); + } + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // output offset start for each thread + const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0], + itemID.get_group(1) * itemID.get_local_range()[1]}; + + if (boundary_check(itemID.get_global_id() < input_range)) { + CoeffReturnType result = static_cast(0); + + for (size_t j = 0; j < kernel_size[1]; j++) { + size_t kernel_offset = kernel_size[0] * j; + const size_t index = + (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0); + for (size_t i = 0; i < kernel_size[0]; i++) { + result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]); + } + } + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0], + itemID.get_local_id(1) + output_offset[1]); + + buffer_ptr[tensor_index] = result; + } + } +}; + +template +struct EigenConvolutionKernel { + typedef cl::sycl::accessor + Local_accessor; + Local_accessor local_acc; + Evaluator device_evaluator; + Kernel_accessor kernel_filter; + Buffer_accessor buffer_acc; + internal::IndexMapper indexMapper; + const cl::sycl::range<3> kernel_size; + const cl::sycl::range<3> input_range; + const size_t numP; + + EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, + Buffer_accessor buffer_acc_, + internal::IndexMapper indexMapper_, + const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_, + const size_t numP_) + : local_acc(local_acc_), + device_evaluator(device_evaluator_), + kernel_filter(kernel_filter_), + buffer_acc(buffer_acc_), + indexMapper(indexMapper_), + kernel_size(kernel_size_), + input_range(input_range_), + numP(numP_) {} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) { + return (boolean_check[0] && boolean_check[1] && boolean_check[2]); + } + void operator()(cl::sycl::nd_item<3> itemID) { + auto buffer_ptr = buffer_acc.get_pointer(); + auto kernel_ptr = kernel_filter.get_pointer(); + const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1}; + + const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()}; + + const auto output_offset = + cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()}; + + for (size_t p = 0; p < numP; p++) { + /// fill the shared memory + const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) { + size_t local_index_dim2 = num_input[0] * num_input[1] * k; + bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1)); + for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) { + bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1)); + size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2; + for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) { + bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1)); + const size_t local_index = local_index_dim1 + i; + const size_t tensor_index = + plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset( + i + input_offset[0], j + input_offset[1], k + input_offset[2]); + local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0); + } + } + } + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // calculate the convolution + + if (boundary_check(itemID.get_global_id() < input_range)) { + CoeffReturnType result = static_cast(0); + for (size_t k = 0; k < kernel_size[2]; k++) { + for (size_t j = 0; j < kernel_size[1]; j++) { + for (size_t i = 0; i < kernel_size[0]; i++) { + const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k); + const size_t local_index = + ((i + itemID.get_local_id(0)) + + num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2)))); + + result += (local_acc[local_index] * kernel_ptr[kernel_index]); + } + } + } + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]); + buffer_ptr[tensor_index] = result; + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + } +}; + +template +struct TensorEvaluator, Eigen::SyclDevice> { + typedef TensorConvolutionOp XprType; + + static const int NumDims = + internal::array_size::Dimensions>::value; + static const int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename TensorEvaluator::Dimensions KernelDimensions; + typedef const Eigen::SyclDevice Device; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory KernelStorage; + + enum { + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device) + : m_inputImpl(op.inputExpression(), device), + m_kernelArg(op.kernelExpression()), + m_kernelImpl(op.kernelExpression(), device), + m_indices(op.indices()), + m_buf(NULL), + m_kernel(NULL), + m_local_kernel(false), + m_device(device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions &input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions &kernel_dims = + m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (EvaluatorPointerType)m_device.get( + (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate_temp(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate_temp(m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + /// used by sycl in order to build the sycl buffer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; } + /// used by sycl in order to build the sycl buffer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + typename KernelStorage::Type in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz)); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(m_device.get(local), m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + m_kernel = local; + m_local_kernel = true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const { + typedef TensorEvaluator InputEvaluator; + typedef typename InputEvaluator::Dimensions InputDims; + switch (NumKernelDims) { + case 1: { + const size_t numX = dimensions()[m_indices[0]]; + const size_t numP = dimensions().TotalSize() / numX; + const auto input_dim = std::array{numX, numP}; + auto global_range = cl::sycl::range<2>{}; + auto local_range = cl::sycl::range<2>{}; + const size_t kernel_size = m_kernelImpl.dimensions().TotalSize(); + + m_device.parallel_for_setup(input_dim, global_range, local_range); + const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]); + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + const array indices{{m_indices[0]}}; + const array kernel_dims{{m_kernelImpl.dimensions()[0]}}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + typedef EigenConvolutionKernel + ConvKernel; + + m_device.template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1])); + break; + } + + case 2: { + auto kernel_index = std::array{static_cast(Layout) == static_cast(ColMajor) ? 0 : 1, + static_cast(Layout) == static_cast(ColMajor) ? 1 : 0}; + auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]], + (size_t)m_kernelImpl.dimensions()[kernel_index[1]]}; + const size_t numX = dimensions()[m_indices[kernel_index[0]]]; + const size_t numY = dimensions()[m_indices[kernel_index[1]]]; + const size_t numP = dimensions().TotalSize() / (numX * numY); + auto input_dim = std::array{numX, numY, numP}; + + auto global_range = cl::sycl::range<3>{}; + auto local_range = cl::sycl::range<3>{}; + + m_device.parallel_for_setup(input_dim, global_range, local_range); + + const size_t local_memory_size = + (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2]; + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + const array indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}}; + const array kernel_dims{ + {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + typedef EigenConvolutionKernel + ConvKernel; + m_device.template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]}); + break; + } + + case 3: { + auto kernel_index = std::array{static_cast(Layout) == static_cast(ColMajor) ? 0 : 2, + static_cast(Layout) == static_cast(ColMajor) ? 1 : 1, + static_cast(Layout) == static_cast(ColMajor) ? 2 : 0}; + + auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]], + (size_t)m_kernelImpl.dimensions()[kernel_index[1]], + (size_t)m_kernelImpl.dimensions()[kernel_index[2]]}; + + const size_t numX = dimensions()[m_indices[kernel_index[0]]]; + const size_t numY = dimensions()[m_indices[kernel_index[1]]]; + const size_t numZ = dimensions()[m_indices[kernel_index[2]]]; + auto input_dim = std::array{numX, numY, numZ}; + const size_t numP = dimensions().TotalSize() / (numX * numY * numZ); + + const array indices{ + {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}}; + const array kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]], + m_kernelImpl.dimensions()[kernel_index[1]], + m_kernelImpl.dimensions()[kernel_index[2]]}}; + + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + auto global_range = cl::sycl::range<3>{}; + auto local_range = cl::sycl::range<3>{}; + + m_device.parallel_for_setup(input_dim, global_range, local_range); + auto local_memory_range = (local_range + kernel_size - 1); + const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2]; + + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + typedef EigenConvolutionKernel + ConvKernel; + m_device.template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP); + break; + } + + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), + THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_buf != NULL); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const { + eigen_assert(m_buf != NULL); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt(m_buf + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost + // model. + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize)); + } + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_kernelImpl.bind(cgh); + m_inputImpl.bind(cgh); + m_buf.bind(cgh); + m_kernel.bind(cgh); + } + + private: + // No assignment (copies are needed by the kernels) + TensorEvaluator &operator=(const TensorEvaluator &); + TensorEvaluator m_inputImpl; + KernelArgType m_kernelArg; + TensorEvaluator m_kernelImpl; + Indices m_indices; + Dimensions m_dimensions; + EvaluatorPointerType m_buf; + typename KernelStorage::Type m_kernel; + bool m_local_kernel; + const Eigen::SyclDevice EIGEN_DEVICE_REF m_device; +}; // namespace Eigen + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h index 83c449cf1..195267ce8 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -174,8 +174,11 @@ class TensorCostModel { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads( double output_size, const TensorOpCost& cost_per_coeff, int max_threads) { double cost = totalCost(output_size, cost_per_coeff); - int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; - return numext::mini(max_threads, numext::maxi(1, threads)); + double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; + // Make sure we don't invoke undefined behavior when we convert to an int. + threads = numext::mini(threads, GenericNumTraits::highest()); + return numext::mini(max_threads, + numext::maxi(1, static_cast(threads))); } // taskSize assesses parallel task size. @@ -186,14 +189,13 @@ class TensorCostModel { return totalCost(output_size, cost_per_coeff) / kTaskSize; } - private: static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost( double output_size, const TensorOpCost& cost_per_coeff) { // Cost of memory fetches from L2 cache. 64 is typical cache line size. // 11 is L2 cache latency on Haswell. // We don't know whether data is in L1, L2 or L3. But we are most interested // in single-threaded computational time around 100us-10ms (smaller time - // is too small for parallelization, larger time is not intersting + // is too small for parallelization, larger time is not interesting // either because we are probably using all available threads already). // And for the target time range, L2 seems to be what matters. Data set // fitting into L1 is too small to take noticeable time. Data set fitting diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index e020d076f..95a8a84ee 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -30,12 +30,13 @@ struct traits > typedef typename remove_reference::type _Nested; static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; + typedef typename traits::PointerType PointerType; }; template struct eval, Eigen::Dense> { - typedef const TensorCustomUnaryOp& type; + typedef const TensorCustomUnaryOpEIGEN_DEVICE_REF type; }; template @@ -86,18 +87,26 @@ struct TensorEvaluator, Devi typedef typename internal::remove_const::type Scalar; typedef typename internal::remove_const::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, - PacketAccess = (internal::packet_traits::size > 1), + PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) : m_op(op), m_device(device), m_result(NULL) { m_dimensions = op.func().dimensions(op.expression()); @@ -105,21 +114,21 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { if (data) { evalTo(data); return false; } else { - m_result = static_cast( - m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + m_result = static_cast(m_device.get( (CoeffReturnType*) + m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)))); evalTo(m_result); return true; } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - if (m_result != NULL) { - m_device.deallocate(m_result); + EIGEN_STRONG_INLINE void cleanup() { + if (m_result) { + m_device.deallocate_temp(m_result); m_result = NULL; } } @@ -138,19 +147,25 @@ struct TensorEvaluator, Devi return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_result.bind(cgh); + } +#endif protected: - EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { - TensorMap > result( - data, m_dimensions); + void evalTo(EvaluatorPointerType data) { + TensorMap > result(m_device.get(data), m_dimensions); m_op.func().eval(m_op.expression(), result, m_device); } Dimensions m_dimensions; const ArgType m_op; - const Device& m_device; - CoeffReturnType* m_result; + const Device EIGEN_DEVICE_REF m_device; + EvaluatorPointerType m_result; }; @@ -180,6 +195,8 @@ struct traits > typedef typename remove_reference::type _RhsNested; static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; + typedef typename conditional::val, + typename traits::PointerType, typename traits::PointerType>::type PointerType; }; template @@ -242,18 +259,27 @@ struct TensorEvaluator::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; + + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, - PacketAccess = (internal::packet_traits::size > 1), + PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_op(op), m_device(device), m_result(NULL) { m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression()); @@ -261,20 +287,21 @@ struct TensorEvaluator(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + m_result = static_cast(m_device.get( (CoeffReturnType*) + m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType)))); evalTo(m_result); return true; } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { if (m_result != NULL) { - m_device.deallocate(m_result); + m_device.deallocate_temp(m_result); m_result = NULL; } } @@ -293,18 +320,25 @@ struct TensorEvaluator > result(data, m_dimensions); + void evalTo(EvaluatorPointerType data) { + TensorMap > result(m_device.get(data), m_dimensions); m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device); } Dimensions m_dimensions; const XprType m_op; - const Device& m_device; - CoeffReturnType* m_result; + const Device EIGEN_DEVICE_REF m_device; + EvaluatorPointerType m_result; }; diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h index 29e50a3b2..96fa46c86 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -28,6 +28,8 @@ template class TensorDevice { public: TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorDevice) + template EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { typedef TensorAssignOp Assign; @@ -63,6 +65,73 @@ template class TensorDevice { ExpressionType& m_expression; }; +/** \class TensorAsyncDevice + * \ingroup CXX11_Tensor_Module + * + * \brief Pseudo expression providing an operator = that will evaluate its + * argument asynchronously on the specified device. Currently only + * ThreadPoolDevice implements proper asynchronous execution, while the default + * and GPU devices just run the expression synchronously and call m_done() on + * completion.. + * + * Example: + * auto done = []() { ... expression evaluation done ... }; + * C.device(thread_pool_device, std::move(done)) = A + B; + */ + +template +class TensorAsyncDevice { + public: + TensorAsyncDevice(const DeviceType& device, ExpressionType& expression, + DoneCallback done) + : m_device(device), m_expression(expression), m_done(std::move(done)) {} + + template + EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + typedef internal::TensorExecutor Executor; + + Assign assign(m_expression, other); + Executor::run(assign, m_device); + m_done(); + + return *this; + } + + protected: + const DeviceType& m_device; + ExpressionType& m_expression; + DoneCallback m_done; +}; + + +#ifdef EIGEN_USE_THREADS +template +class TensorAsyncDevice { + public: + TensorAsyncDevice(const ThreadPoolDevice& device, ExpressionType& expression, + DoneCallback done) + : m_device(device), m_expression(expression), m_done(std::move(done)) {} + + template + EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + typedef internal::TensorAsyncExecutor Executor; + + // WARNING: After assignment 'm_done' callback will be in undefined state. + Assign assign(m_expression, other); + Executor::runAsync(assign, m_device, std::move(m_done)); + + return *this; + } + + protected: + const ThreadPoolDevice& m_device; + ExpressionType& m_expression; + DoneCallback m_done; +}; +#endif + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 4f5767bc7..f77923933 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -1,337 +1,6 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H) -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H - -namespace Eigen { - -static const int kCudaScratchSize = 1024; - -// This defines an interface that GPUDevice can take to use -// CUDA streams underneath. -class StreamInterface { - public: - virtual ~StreamInterface() {} - - virtual const cudaStream_t& stream() const = 0; - virtual const cudaDeviceProp& deviceProperties() const = 0; - - // Allocate memory on the actual device where the computation will run - virtual void* allocate(size_t num_bytes) const = 0; - virtual void deallocate(void* buffer) const = 0; - - // Return a scratchpad buffer of size 1k - virtual void* scratchpad() const = 0; - - // Return a semaphore. The semaphore is initially initialized to 0, and - // each kernel using it is responsible for resetting to 0 upon completion - // to maintain the invariant that the semaphore is always equal to 0 upon - // each kernel start. - virtual unsigned int* semaphore() const = 0; -}; - -static cudaDeviceProp* m_deviceProperties; -static bool m_devicePropInitialized = false; - -static void initializeDeviceProp() { - if (!m_devicePropInitialized) { - // Attempts to ensure proper behavior in the case of multiple threads - // calling this function simultaneously. This would be trivial to - // implement if we could use std::mutex, but unfortunately mutex don't - // compile with nvcc, so we resort to atomics and thread fences instead. - // Note that if the caller uses a compiler that doesn't support c++11 we - // can't ensure that the initialization is thread safe. -#if __cplusplus >= 201103L - static std::atomic first(true); - if (first.exchange(false)) { -#else - static bool first = true; - if (first) { - first = false; -#endif - // We're the first thread to reach this point. - int num_devices; - cudaError_t status = cudaGetDeviceCount(&num_devices); - if (status != cudaSuccess) { - std::cerr << "Failed to get the number of CUDA devices: " - << cudaGetErrorString(status) - << std::endl; - assert(status == cudaSuccess); - } - m_deviceProperties = new cudaDeviceProp[num_devices]; - for (int i = 0; i < num_devices; ++i) { - status = cudaGetDeviceProperties(&m_deviceProperties[i], i); - if (status != cudaSuccess) { - std::cerr << "Failed to initialize CUDA device #" - << i - << ": " - << cudaGetErrorString(status) - << std::endl; - assert(status == cudaSuccess); - } - } - -#if __cplusplus >= 201103L - std::atomic_thread_fence(std::memory_order_release); -#endif - m_devicePropInitialized = true; - } else { - // Wait for the other thread to inititialize the properties. - while (!m_devicePropInitialized) { -#if __cplusplus >= 201103L - std::atomic_thread_fence(std::memory_order_acquire); -#endif - sleep(1); - } - } - } -} - -static const cudaStream_t default_stream = cudaStreamDefault; - -class CudaStreamDevice : public StreamInterface { - public: - // Use the default stream on the current device - CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { - cudaGetDevice(&device_); - initializeDeviceProp(); - } - // Use the default stream on the specified device - CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { - initializeDeviceProp(); - } - // Use the specified stream. Note that it's the - // caller responsibility to ensure that the stream can run on - // the specified device. If no device is specified the code - // assumes that the stream is associated to the current gpu device. - CudaStreamDevice(const cudaStream_t* stream, int device = -1) - : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { - if (device < 0) { - cudaGetDevice(&device_); - } else { - int num_devices; - cudaError_t err = cudaGetDeviceCount(&num_devices); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - assert(device < num_devices); - device_ = device; - } - initializeDeviceProp(); - } - - virtual ~CudaStreamDevice() { - if (scratch_) { - deallocate(scratch_); - } - } - - const cudaStream_t& stream() const { return *stream_; } - const cudaDeviceProp& deviceProperties() const { - return m_deviceProperties[device_]; - } - virtual void* allocate(size_t num_bytes) const { - cudaError_t err = cudaSetDevice(device_); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - void* result; - err = cudaMalloc(&result, num_bytes); - assert(err == cudaSuccess); - assert(result != NULL); - return result; - } - virtual void deallocate(void* buffer) const { - cudaError_t err = cudaSetDevice(device_); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - assert(buffer != NULL); - err = cudaFree(buffer); - assert(err == cudaSuccess); - } - - virtual void* scratchpad() const { - if (scratch_ == NULL) { - scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int)); - } - return scratch_; - } - - virtual unsigned int* semaphore() const { - if (semaphore_ == NULL) { - char* scratch = static_cast(scratchpad()) + kCudaScratchSize; - semaphore_ = reinterpret_cast(scratch); - cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - } - return semaphore_; - } - - private: - const cudaStream_t* stream_; - int device_; - mutable void* scratch_; - mutable unsigned int* semaphore_; -}; - -struct GpuDevice { - // The StreamInterface is not owned: the caller is - // responsible for its initialization and eventual destruction. - explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { - eigen_assert(stream); - } - explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { - eigen_assert(stream); - } - // TODO(bsteiner): This is an internal API, we should not expose it. - EIGEN_STRONG_INLINE const cudaStream_t& stream() const { - return stream_->stream(); - } - - EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - return stream_->allocate(num_bytes); - } - - EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - stream_->deallocate(buffer); - } - - EIGEN_STRONG_INLINE void* scratchpad() const { - return stream_->scratchpad(); - } - - EIGEN_STRONG_INLINE unsigned int* semaphore() const { - return stream_->semaphore(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { -#ifndef __CUDA_ARCH__ - cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, - stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { - cudaError_t err = - cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - } - - EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { - cudaError_t err = - cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { -#ifndef __CUDA_ARCH__ - cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE size_t numThreads() const { - // FIXME - return 32; - } - - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { - // FIXME - return 48*1024; - } - - EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { - // We won't try to take advantage of the l2 cache for the time being, and - // there is no l3 cache on cuda devices. - return firstLevelCacheSize(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { -#if defined(__CUDACC__) && !defined(__CUDA_ARCH__) - cudaError_t err = cudaStreamSynchronize(stream_->stream()); - if (err != cudaSuccess) { - std::cerr << "Error detected in CUDA stream: " - << cudaGetErrorString(err) - << std::endl; - assert(err == cudaSuccess); - } -#else - assert(false && "The default device should be used instead to generate kernel code"); +#if defined(__clang__) || defined(__GNUC__) +#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorDeviceGpu.h file" #endif - } - - EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { - return stream_->deviceProperties().multiProcessorCount; - } - EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { - return stream_->deviceProperties().maxThreadsPerBlock; - } - EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { - return stream_->deviceProperties().maxThreadsPerMultiProcessor; - } - EIGEN_STRONG_INLINE int sharedMemPerBlock() const { - return stream_->deviceProperties().sharedMemPerBlock; - } - EIGEN_STRONG_INLINE int majorDeviceVersion() const { - return stream_->deviceProperties().major; - } - EIGEN_STRONG_INLINE int minorDeviceVersion() const { - return stream_->deviceProperties().minor; - } - - EIGEN_STRONG_INLINE int maxBlocks() const { - return max_blocks_; - } - - // This function checks if the CUDA runtime recorded an error for the - // underlying stream device. - inline bool ok() const { -#ifdef __CUDACC__ - cudaError_t error = cudaStreamQuery(stream_->stream()); - return (error == cudaSuccess) || (error == cudaErrorNotReady); -#else - return false; -#endif - } - - private: - const StreamInterface* stream_; - int max_blocks_; -}; - -#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ - (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ - assert(cudaGetLastError() == cudaSuccess); - - -// FIXME: Should be device and kernel specific. -#ifdef __CUDACC__ -static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { -#ifndef __CUDA_ARCH__ - cudaError_t status = cudaDeviceSetSharedMemConfig(config); - EIGEN_UNUSED_VARIABLE(status) - assert(status == cudaSuccess); -#else - EIGEN_UNUSED_VARIABLE(config) -#endif -} -#endif - -} // end namespace Eigen -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H +#include "TensorDeviceGpu.h" diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index 9d141395b..46b9d3ab2 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -20,6 +20,12 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { internal::aligned_free(buffer); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { + return allocate(num_bytes); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { + deallocate(buffer); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { ::memcpy(dst, src, n); @@ -33,11 +39,18 @@ struct DefaultDevice { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { ::memset(buffer, c, n); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { -#ifndef __CUDA_ARCH__ +#if !defined(EIGEN_GPU_COMPILE_PHASE) // Running on the host CPU return 1; +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return 64; #else // Running on a CUDA device return 32; @@ -45,9 +58,12 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { -#ifndef __CUDA_ARCH__ +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) // Running on the host CPU return l1CacheSize(); +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return 48*1024; // FIXME : update this number for HIP #else // Running on a CUDA device, return the amount of shared memory available. return 48*1024; @@ -55,9 +71,12 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { -#ifndef __CUDA_ARCH__ +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) // Running single threaded on the host CPU return l3CacheSize(); +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return firstLevelCacheSize(); // FIXME : update this number for HIP #else // Running on a CUDA device return firstLevelCacheSize(); @@ -65,13 +84,17 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { -#ifndef __CUDA_ARCH__ +#if !defined(EIGEN_GPU_COMPILE_PHASE) // Running single threaded on the host CPU // Should return an enum that encodes the ISA supported by the CPU return 1; +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + // return 1 as major for HIP + return 1; #else // Running on a CUDA device - return __CUDA_ARCH__ / 100; + return EIGEN_CUDA_ARCH / 100; #endif } }; diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h new file mode 100644 index 000000000..ec2e3cb14 --- /dev/null +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h @@ -0,0 +1,389 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H + +// This header file container defines fo gpu* macros which will resolve to +// their equivalent hip* or cuda* versions depending on the compiler in use +// A separate header (included at the end of this file) will undefine all +#include "TensorGpuHipCudaDefines.h" + +namespace Eigen { + +static const int kGpuScratchSize = 1024; + +// This defines an interface that GPUDevice can take to use +// HIP / CUDA streams underneath. +class StreamInterface { + public: + virtual ~StreamInterface() {} + + virtual const gpuStream_t& stream() const = 0; + virtual const gpuDeviceProp_t& deviceProperties() const = 0; + + // Allocate memory on the actual device where the computation will run + virtual void* allocate(size_t num_bytes) const = 0; + virtual void deallocate(void* buffer) const = 0; + + // Return a scratchpad buffer of size 1k + virtual void* scratchpad() const = 0; + + // Return a semaphore. The semaphore is initially initialized to 0, and + // each kernel using it is responsible for resetting to 0 upon completion + // to maintain the invariant that the semaphore is always equal to 0 upon + // each kernel start. + virtual unsigned int* semaphore() const = 0; +}; + +class GpuDeviceProperties { + public: + GpuDeviceProperties() : + initialized_(false), first_(true), device_properties_(nullptr) {} + + ~GpuDeviceProperties() { + if (device_properties_) { + delete[] device_properties_; + } + } + + EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const { + return device_properties_[device]; + } + + EIGEN_STRONG_INLINE bool isInitialized() const { + return initialized_; + } + + void initialize() { + if (!initialized_) { + // Attempts to ensure proper behavior in the case of multiple threads + // calling this function simultaneously. This would be trivial to + // implement if we could use std::mutex, but unfortunately mutex don't + // compile with nvcc, so we resort to atomics and thread fences instead. + // Note that if the caller uses a compiler that doesn't support c++11 we + // can't ensure that the initialization is thread safe. + if (first_.exchange(false)) { + // We're the first thread to reach this point. + int num_devices; + gpuError_t status = gpuGetDeviceCount(&num_devices); + if (status != gpuSuccess) { + std::cerr << "Failed to get the number of GPU devices: " + << gpuGetErrorString(status) + << std::endl; + gpu_assert(status == gpuSuccess); + } + device_properties_ = new gpuDeviceProp_t[num_devices]; + for (int i = 0; i < num_devices; ++i) { + status = gpuGetDeviceProperties(&device_properties_[i], i); + if (status != gpuSuccess) { + std::cerr << "Failed to initialize GPU device #" + << i + << ": " + << gpuGetErrorString(status) + << std::endl; + gpu_assert(status == gpuSuccess); + } + } + + std::atomic_thread_fence(std::memory_order_release); + initialized_ = true; + } else { + // Wait for the other thread to inititialize the properties. + while (!initialized_) { + std::atomic_thread_fence(std::memory_order_acquire); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + } + } + } + + private: + volatile bool initialized_; + std::atomic first_; + gpuDeviceProp_t* device_properties_; +}; + +EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() { + static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties(); + if (!deviceProperties->isInitialized()) { + deviceProperties->initialize(); + } + return *deviceProperties; +} + +EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) { + return GetGpuDeviceProperties().get(device); +} + +static const gpuStream_t default_stream = gpuStreamDefault; + +class GpuStreamDevice : public StreamInterface { + public: + // Use the default stream on the current device + GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { + gpuGetDevice(&device_); + } + // Use the default stream on the specified device + GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {} + // Use the specified stream. Note that it's the + // caller responsibility to ensure that the stream can run on + // the specified device. If no device is specified the code + // assumes that the stream is associated to the current gpu device. + GpuStreamDevice(const gpuStream_t* stream, int device = -1) + : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { + if (device < 0) { + gpuGetDevice(&device_); + } else { + int num_devices; + gpuError_t err = gpuGetDeviceCount(&num_devices); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + gpu_assert(device < num_devices); + device_ = device; + } + } + + virtual ~GpuStreamDevice() { + if (scratch_) { + deallocate(scratch_); + } + } + + const gpuStream_t& stream() const { return *stream_; } + const gpuDeviceProp_t& deviceProperties() const { + return GetGpuDeviceProperties(device_); + } + virtual void* allocate(size_t num_bytes) const { + gpuError_t err = gpuSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + void* result; + err = gpuMalloc(&result, num_bytes); + gpu_assert(err == gpuSuccess); + gpu_assert(result != NULL); + return result; + } + virtual void deallocate(void* buffer) const { + gpuError_t err = gpuSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + gpu_assert(buffer != NULL); + err = gpuFree(buffer); + gpu_assert(err == gpuSuccess); + } + + virtual void* scratchpad() const { + if (scratch_ == NULL) { + scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + virtual unsigned int* semaphore() const { + if (semaphore_ == NULL) { + char* scratch = static_cast(scratchpad()) + kGpuScratchSize; + semaphore_ = reinterpret_cast(scratch); + gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + } + return semaphore_; + } + + private: + const gpuStream_t* stream_; + int device_; + mutable void* scratch_; + mutable unsigned int* semaphore_; +}; + +struct GpuDevice { + // The StreamInterface is not owned: the caller is + // responsible for its initialization and eventual destruction. + explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { + eigen_assert(stream); + } + explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { + eigen_assert(stream); + } + // TODO(bsteiner): This is an internal API, we should not expose it. + EIGEN_STRONG_INLINE const gpuStream_t& stream() const { + return stream_->stream(); + } + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return stream_->allocate(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + stream_->deallocate(buffer); + } + + EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { + return stream_->allocate(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { + stream_->deallocate(buffer); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; + } + + EIGEN_STRONG_INLINE void* scratchpad() const { + return stream_->scratchpad(); + } + + EIGEN_STRONG_INLINE unsigned int* semaphore() const { + return stream_->semaphore(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice, + stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); +#else + EIGEN_UNUSED_VARIABLE(dst); + EIGEN_UNUSED_VARIABLE(src); + EIGEN_UNUSED_VARIABLE(n); + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + gpuError_t err = + gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + } + + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + gpuError_t err = + gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME + return 32; + } + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + // FIXME + return 48*1024; + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on hip/cuda devices. + return firstLevelCacheSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t err = gpuStreamSynchronize(stream_->stream()); + if (err != gpuSuccess) { + std::cerr << "Error detected in GPU stream: " + << gpuGetErrorString(err) + << std::endl; + gpu_assert(err == gpuSuccess); + } +#else + gpu_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const { + return stream_->deviceProperties().multiProcessorCount; + } + EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const { + return stream_->deviceProperties().maxThreadsPerBlock; + } + EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const { + return stream_->deviceProperties().maxThreadsPerMultiProcessor; + } + EIGEN_STRONG_INLINE int sharedMemPerBlock() const { + return stream_->deviceProperties().sharedMemPerBlock; + } + EIGEN_STRONG_INLINE int majorDeviceVersion() const { + return stream_->deviceProperties().major; + } + EIGEN_STRONG_INLINE int minorDeviceVersion() const { + return stream_->deviceProperties().minor; + } + + EIGEN_STRONG_INLINE int maxBlocks() const { + return max_blocks_; + } + + // This function checks if the GPU runtime recorded an error for the + // underlying stream device. + inline bool ok() const { +#ifdef EIGEN_GPUCC + gpuError_t error = gpuStreamQuery(stream_->stream()); + return (error == gpuSuccess) || (error == gpuErrorNotReady); +#else + return false; +#endif + } + + private: + const StreamInterface* stream_; + int max_blocks_; +}; + +#if defined(EIGEN_HIPCC) + +#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \ + gpu_assert(hipGetLastError() == hipSuccess); + +#else + +#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ + gpu_assert(cudaGetLastError() == cudaSuccess); + +#endif + +// FIXME: Should be device and kernel specific. +#ifdef EIGEN_GPUCC +static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t status = gpuDeviceSetSharedMemConfig(config); + EIGEN_UNUSED_VARIABLE(status) + gpu_assert(status == gpuSuccess); +#else + EIGEN_UNUSED_VARIABLE(config) +#endif +} +#endif + +} // end namespace Eigen + +// undefine all the gpu* macros we defined at the beginning of the file +#include "TensorGpuHipCudaUndefines.h" + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 7c039890e..df591c21d 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -14,109 +14,1035 @@ #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H +#include namespace Eigen { -struct SyclDevice { - /// class members - /// sycl queue - mutable cl::sycl::queue m_queue; - /// std::map is the container used to make sure that we create only one buffer - /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. - /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. - mutable std::map> buffer_map; - /// creating device by using selector - template SyclDevice(dev_Selector s) - : -#ifdef EIGEN_EXCEPTIONS - m_queue(cl::sycl::queue(s, [=](cl::sycl::exception_list l) { - for (const auto& e : l) { - try { - std::rethrow_exception(e); - } catch (cl::sycl::exception e) { - std::cout << e.what() << std::endl; + +namespace TensorSycl { +namespace internal { + +/// Cache all the device information needed +struct SyclDeviceInfo { + SyclDeviceInfo(cl::sycl::queue queue) + : local_mem_type( + queue.get_device() + .template get_info()), + max_work_item_sizes( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_work_item_sizes>()), + max_mem_alloc_size( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_mem_alloc_size>()), + max_compute_units(queue.get_device() + .template get_info< + cl::sycl::info::device::max_compute_units>()), + max_work_group_size( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_work_group_size>()), + local_mem_size( + queue.get_device() + .template get_info()), + platform_name(queue.get_device() + .get_platform() + .template get_info()), + device_name(queue.get_device() + .template get_info()), + device_vendor( + queue.get_device() + .template get_info()) {} + + cl::sycl::info::local_mem_type local_mem_type; + cl::sycl::id<3> max_work_item_sizes; + unsigned long max_mem_alloc_size; + unsigned long max_compute_units; + unsigned long max_work_group_size; + size_t local_mem_size; + std::string platform_name; + std::string device_name; + std::string device_vendor; +}; + +} // end namespace internal +} // end namespace TensorSycl + +typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t; +// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and +// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently +// TensorFlow via the Eigen SYCL Backend. +EIGEN_STRONG_INLINE auto get_sycl_supported_devices() + -> decltype(cl::sycl::device::get_devices()) { +#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR + return {cl::sycl::device(cl::sycl::default_selector())}; +#else + std::vector supported_devices; + auto platform_list = cl::sycl::platform::get_platforms(); + for (const auto &platform : platform_list) { + auto device_list = platform.get_devices(); + auto platform_name = + platform.template get_info(); + std::transform(platform_name.begin(), platform_name.end(), + platform_name.begin(), ::tolower); + for (const auto &device : device_list) { + auto vendor = device.template get_info(); + std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower); + bool unsupported_condition = + (device.is_cpu() && platform_name.find("amd") != std::string::npos && + vendor.find("apu") == std::string::npos) || + (platform_name.find("experimental") != std::string::npos) || + device.is_host(); + if (!unsupported_condition) { + supported_devices.push_back(device); + } + } + } + return supported_devices; +#endif +} + +class QueueInterface { + public: + /// Creating device by using cl::sycl::selector or cl::sycl::device. + template + explicit QueueInterface( + const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler, + unsigned num_threads = std::thread::hardware_concurrency()) + : m_queue(dev_or_sel, handler), +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + m_prog(m_queue.get_context(), get_sycl_supported_devices()), +#endif + m_thread_pool(num_threads), + m_device_info(m_queue) { +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + m_prog.build_with_kernel_type(); + auto f = [&](cl::sycl::handler &cgh) { + cgh.single_task(m_prog.get_kernel(), + [=]() {}) + }; + EIGEN_SYCL_TRY_CATCH(m_queue.submit(f)); +#endif + } + + template + explicit QueueInterface( + const DeviceOrSelector &dev_or_sel, + unsigned num_threads = std::thread::hardware_concurrency()) + : QueueInterface(dev_or_sel, + [this](cl::sycl::exception_list l) { + this->exception_caught_ = this->sycl_async_handler(l); + }, + num_threads) {} + +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; } +#endif + + /// Attach an existing buffer to the pointer map, Eigen will not reuse it + EIGEN_STRONG_INLINE void *attach_buffer( + cl::sycl::buffer &buf) const { + std::lock_guard lock(pmapper_mutex_); + return static_cast(pMapper.add_pointer(buf)); + } + + /// Detach previously attached buffer + EIGEN_STRONG_INLINE void detach_buffer(void *p) const { + std::lock_guard lock(pmapper_mutex_); + TensorSycl::internal::SYCLfree(p, pMapper); + } + + /// Allocating device pointer. This pointer is actually an 8 bytes host + /// pointer used as key to access the sycl device buffer. The reason is that + /// we cannot use device buffer as a pointer as a m_data in Eigen leafNode + /// expressions. So we create a key pointer to be used in Eigen expression + /// construction. When we convert the Eigen construction into the sycl + /// construction we use this pointer as a key in our buffer_map and we make + /// sure that we dedicate only one buffer only for this pointer. The device + /// pointer would be deleted by calling deallocate function. + EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { +#if EIGEN_MAX_ALIGN_BYTES > 0 + size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES; + if (align > 0) { + num_bytes += EIGEN_MAX_ALIGN_BYTES - align; + } +#endif + std::lock_guard lock(pmapper_mutex_); + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); + } + + EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { +#if EIGEN_MAX_ALIGN_BYTES > 0 + size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES; + if (align > 0) { + num_bytes += EIGEN_MAX_ALIGN_BYTES - align; + } +#endif + std::lock_guard lock(pmapper_mutex_); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + if (scratch_buffers.empty()) { + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); + ; + } else { + for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) { + auto buff = pMapper.get_buffer(*it); + if (buff.get_size() >= num_bytes) { + auto ptr = *it; + scratch_buffers.erase(it); + return ptr; + } else { + ++it; } + } + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); } - })) #else - m_queue(cl::sycl::queue(s)) + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); #endif - {} - // destructor - ~SyclDevice() { deallocate_all(); } + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, data_t> + get(data_t *data) const { + return get_range_accessor(data); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get( + TensorSycl::internal::RangeAccess + data) const { + return static_cast(data.get_virtual_pointer()); + } - template void deallocate(T *p) const { - auto it = buffer_map.find(p); - if (it != buffer_map.end()) { - buffer_map.erase(it); - internal::aligned_free(p); + EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { + std::lock_guard lock(pmapper_mutex_); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.insert(p); +#else + TensorSycl::internal::SYCLfree(p, pMapper); +#endif + } + template + EIGEN_STRONG_INLINE void deallocate_temp( + const TensorSycl::internal::RangeAccess &p) const { + deallocate_temp(p.get_virtual_pointer()); + } + + /// This is used to deallocate the device pointer. p is used as a key inside + /// the map to find the device buffer and delete it. + EIGEN_STRONG_INLINE void deallocate(void *p) const { + std::lock_guard lock(pmapper_mutex_); + TensorSycl::internal::SYCLfree(p, pMapper); + } + + EIGEN_STRONG_INLINE void deallocate_all() const { + std::lock_guard lock(pmapper_mutex_); + TensorSycl::internal::SYCLfreeAll(pMapper); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.clear(); +#endif + } + + /// The memcpyHostToDevice is used to copy the data from host to device + /// The destination pointer could be deleted before the copy happend which is + /// why a callback function is needed. By default if none is provided, the + /// function is blocking. + EIGEN_STRONG_INLINE void memcpyHostToDevice( + void *dst, const void *src, size_t n, + std::function callback) const { + static const auto write_mode = cl::sycl::access::mode::discard_write; + static const auto global_access = cl::sycl::access::target::global_buffer; + typedef cl::sycl::accessor + write_accessor; + if (n == 0) { + if (callback) callback(); + return; } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + write_accessor dst_acc = get_range_accessor(cgh, dst, n); + buffer_scalar_t const *ptr = static_cast(src); + auto non_deleter = [](buffer_scalar_t const *) {}; + std::shared_ptr s_ptr(ptr, non_deleter); + cgh.copy(s_ptr, dst_acc); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + synchronize_and_callback(e, callback); } - void deallocate_all() const { - std::map>::iterator it=buffer_map.begin(); - while (it!=buffer_map.end()) { - auto p=it->first; - buffer_map.erase(it); - internal::aligned_free(const_cast(p)); - it=buffer_map.begin(); + + /// The memcpyDeviceToHost is used to copy the data from device to host. + /// The source pointer could be deleted before the copy happend which is + /// why a callback function is needed. By default if none is provided, the + /// function is blocking. + EIGEN_STRONG_INLINE void memcpyDeviceToHost( + void *dst, const void *src, size_t n, + std::function callback) const { + static const auto read_mode = cl::sycl::access::mode::read; + static const auto global_access = cl::sycl::access::target::global_buffer; + typedef cl::sycl::accessor + read_accessor; + if (n == 0) { + if (callback) callback(); + return; } - buffer_map.clear(); + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + read_accessor src_acc = get_range_accessor(cgh, src, n); + buffer_scalar_t *ptr = static_cast(dst); + auto non_deleter = [](buffer_scalar_t *) {}; + std::shared_ptr s_ptr(ptr, non_deleter); + cgh.copy(src_acc, s_ptr); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + synchronize_and_callback(e, callback); } - /// creation of sycl accessor for a buffer. This function first tries to find - /// the buffer in the buffer_map. If found it gets the accessor from it, if not, - ///the function then adds an entry by creating a sycl buffer for that particular pointer. - template inline cl::sycl::accessor - get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const T * ptr) const { - return (get_sycl_buffer(num_bytes, ptr)->template get_access(cgh)); + /// The memcpy function. + /// No callback is required here as both arguments are on the device + /// and SYCL can handle the dependency. + EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { + static const auto read_mode = cl::sycl::access::mode::read; + static const auto write_mode = cl::sycl::access::mode::discard_write; + if (n == 0) { + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + auto src_acc = get_range_accessor(cgh, src, n); + auto dst_acc = get_range_accessor(cgh, dst, n); + cgh.copy(src_acc, dst_acc); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + async_synchronize(e); } - template inline std::pair>::iterator,bool> add_sycl_buffer(const T *ptr, size_t num_bytes) const { - using Type = cl::sycl::buffer; - std::pair>::iterator,bool> ret = buffer_map.insert(std::pair>(ptr, std::shared_ptr(new Type(cl::sycl::range<1>(num_bytes)), - [](void *dataMem) { delete static_cast(dataMem); }))); - (static_cast(buffer_map.at(ptr).get()))->set_final_data(nullptr); - return ret; + /// the memset function. + /// No callback is required here as both arguments are on the device + /// and SYCL can handle the dependency. + EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { + static const auto write_mode = cl::sycl::access::mode::discard_write; + if (n == 0) { + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + auto dst_acc = get_range_accessor(cgh, data, n); + // The cast to uint8_t is here to match the behaviour of the standard + // memset. The cast to buffer_scalar_t is needed to match the type of the + // accessor (in case buffer_scalar_t is not uint8_t) + cgh.fill(dst_acc, static_cast(static_cast(c))); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + async_synchronize(e); } - template inline cl::sycl::buffer* get_sycl_buffer(size_t num_bytes,const T * ptr) const { - return static_cast*>(add_sycl_buffer(ptr, num_bytes).first->second.get()); + /// Get a range accessor to the virtual pointer's device memory. This range + /// accessor will allow access to the memory from the pointer to the end of + /// the buffer. + /// + /// NOTE: Inside a kernel the range accessor will always be indexed from the + /// start of the buffer, so the offset in the accessor is only used by + /// methods like handler::copy and will not be available inside a kernel. + template + EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess + get_range_accessor(const void *ptr) const { + static const auto global_access = cl::sycl::access::target::global_buffer; + static const auto is_place_holder = cl::sycl::access::placeholder::true_t; + typedef TensorSycl::internal::RangeAccess ret_type; + typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t; + + std::lock_guard lock(pmapper_mutex_); + + auto original_buffer = pMapper.get_buffer(ptr); + const ptrdiff_t offset = pMapper.get_offset(ptr); + const ptrdiff_t typed_offset = offset / sizeof(T); + eigen_assert(typed_offset >= 0); + const auto typed_size = original_buffer.get_size() / sizeof(T); + auto buffer = original_buffer.template reinterpret< + typename Eigen::internal::remove_const::type>( + cl::sycl::range<1>(typed_size)); + const ptrdiff_t size = buffer.get_count() - typed_offset; + eigen_assert(size >= 0); + typedef cl::sycl::accessor::type, + 1, AcMd, global_access, is_place_holder> + placeholder_accessor_t; + const auto start_ptr = static_cast(ptr) - offset; + return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size), + cl::sycl::id<1>(typed_offset)), + static_cast(typed_offset), + reinterpret_cast(start_ptr)); } - /// allocating memory on the cpu - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t) const { - return internal::aligned_malloc(8); + /// Get a range accessor to the virtual pointer's device memory with a + /// specified size. + template + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_range_accessor(cl::sycl::handler &cgh, const void *ptr, + const Index n_bytes) const { + static const auto global_access = cl::sycl::access::target::global_buffer; + eigen_assert(n_bytes >= 0); + std::lock_guard lock(pmapper_mutex_); + auto buffer = pMapper.get_buffer(ptr); + const ptrdiff_t offset = pMapper.get_offset(ptr); + eigen_assert(offset >= 0); + eigen_assert(offset + n_bytes <= buffer.get_size()); + return buffer.template get_access( + cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset)); } - // some runtime conditions that can be applied here - bool isDeviceSuitable() const { return true; } + /// Creation of sycl accessor for a buffer. This function first tries to find + /// the buffer in the buffer_map. If found it gets the accessor from it, if + /// not, the function then adds an entry by creating a sycl buffer for that + /// particular pointer. + template + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const { + std::lock_guard lock(pmapper_mutex_); + return pMapper.get_buffer(ptr) + .template get_access( + cgh); + } + + EIGEN_STRONG_INLINE cl::sycl::buffer get_sycl_buffer( + const void *ptr) const { + std::lock_guard lock(pmapper_mutex_); + return pMapper.get_buffer(ptr); + } + + EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { + std::lock_guard lock(pmapper_mutex_); + return pMapper.get_offset(ptr); + } + + template + EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs, + const Rhs &rhs, OutPtr outptr, + Range thread_range, + Index scratchSize, + T... var) const { + auto kernel_functor = [=](cl::sycl::handler &cgh) { + // binding the placeholder accessors to a commandgroup handler + lhs.bind(cgh); + rhs.bind(cgh); + outptr.bind(cgh); + typedef cl::sycl::accessor + LocalAccessor; + + LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); + cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + program().template get_kernel(), +#endif + thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...)); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); + async_synchronize(e); + } + + template + EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr, + OutPtr &outptr, + Range thread_range, + Index scratchSize, + T... var) const { + auto kernel_functor = [=](cl::sycl::handler &cgh) { + // binding the placeholder accessors to a commandgroup handler + inptr.bind(cgh); + outptr.bind(cgh); + typedef cl::sycl::accessor + LocalAccessor; + + LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); + cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + program().template get_kernel(), +#endif + thread_range, sycl_kernel(scratch, inptr, outptr, var...)); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); + async_synchronize(e); + } + + template + EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr, + Range thread_range, + Index scratchSize, + T... var) const { + auto kernel_functor = [=](cl::sycl::handler &cgh) { + // binding the placeholder accessors to a commandgroup handler + inptr.bind(cgh); + typedef cl::sycl::accessor + LocalAccessor; + + LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); + cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + program().template get_kernel(), +#endif + thread_range, sycl_kernel(scratch, inptr, var...)); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); + async_synchronize(e); + } + + + EIGEN_STRONG_INLINE void synchronize() const { +#ifdef EIGEN_EXCEPTIONS + m_queue.wait_and_throw(); +#else + m_queue.wait(); +#endif + } + + + EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const { + set_latest_event(e); +#ifndef EIGEN_SYCL_ASYNC_EXECUTION + synchronize(); +#endif + } + + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, + Index &rng, Index &GRange) const { + tileSize = static_cast(getNearestPowerOfTwoWorkGroupSize()); + tileSize = std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(tileSize)); + rng = n; + if (rng == 0) rng = static_cast(1); + GRange = rng; + if (tileSize > GRange) + tileSize = GRange; + else if (GRange > tileSize) { + Index xMode = static_cast(GRange % tileSize); + if (xMode != 0) GRange += static_cast(tileSize - xMode); + } + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup( + const std::array &input_dim, cl::sycl::range<2> &global_range, + cl::sycl::range<2> &local_range) const { + std::array input_range = input_dim; + Index max_workgroup_Size = + static_cast(getNearestPowerOfTwoWorkGroupSize()); + max_workgroup_Size = + std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(max_workgroup_Size)); + Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); + local_range[1] = + static_cast(std::pow(2, static_cast(pow_of_2 / 2))); + input_range[1] = input_dim[1]; + if (input_range[1] == 0) input_range[1] = static_cast(1); + global_range[1] = input_range[1]; + if (local_range[1] > global_range[1]) + local_range[1] = global_range[1]; + else if (global_range[1] > local_range[1]) { + Index xMode = static_cast(global_range[1] % local_range[1]); + if (xMode != 0) + global_range[1] += static_cast(local_range[1] - xMode); + } + local_range[0] = static_cast(max_workgroup_Size / local_range[1]); + input_range[0] = input_dim[0]; + if (input_range[0] == 0) input_range[0] = static_cast(1); + global_range[0] = input_range[0]; + if (local_range[0] > global_range[0]) + local_range[0] = global_range[0]; + else if (global_range[0] > local_range[0]) { + Index xMode = static_cast(global_range[0] % local_range[0]); + if (xMode != 0) + global_range[0] += static_cast(local_range[0] - xMode); + } + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup( + const std::array &input_dim, cl::sycl::range<3> &global_range, + cl::sycl::range<3> &local_range) const { + std::array input_range = input_dim; + Index max_workgroup_Size = + static_cast(getNearestPowerOfTwoWorkGroupSize()); + max_workgroup_Size = + std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(max_workgroup_Size)); + Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); + local_range[2] = + static_cast(std::pow(2, static_cast(pow_of_2 / 3))); + input_range[2] = input_dim[2]; + if (input_range[2] == 0) input_range[1] = static_cast(1); + global_range[2] = input_range[2]; + if (local_range[2] > global_range[2]) + local_range[2] = global_range[2]; + else if (global_range[2] > local_range[2]) { + Index xMode = static_cast(global_range[2] % local_range[2]); + if (xMode != 0) + global_range[2] += static_cast(local_range[2] - xMode); + } + pow_of_2 = static_cast( + std::log2(static_cast(max_workgroup_Size / local_range[2]))); + local_range[1] = + static_cast(std::pow(2, static_cast(pow_of_2 / 2))); + input_range[1] = input_dim[1]; + if (input_range[1] == 0) input_range[1] = static_cast(1); + global_range[1] = input_range[1]; + if (local_range[1] > global_range[1]) + local_range[1] = global_range[1]; + else if (global_range[1] > local_range[1]) { + Index xMode = static_cast(global_range[1] % local_range[1]); + if (xMode != 0) + global_range[1] += static_cast(local_range[1] - xMode); + } + local_range[0] = static_cast(max_workgroup_Size / + (local_range[1] * local_range[2])); + input_range[0] = input_dim[0]; + if (input_range[0] == 0) input_range[0] = static_cast(1); + global_range[0] = input_range[0]; + if (local_range[0] > global_range[0]) + local_range[0] = global_range[0]; + else if (global_range[0] > local_range[0]) { + Index xMode = static_cast(global_range[0] % local_range[0]); + if (xMode != 0) + global_range[0] += static_cast(local_range[0] - xMode); + } + } + + EIGEN_STRONG_INLINE bool has_local_memory() const { +#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) + return false; +#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) + return true; +#else + return m_device_info.local_mem_type == + cl::sycl::info::local_mem_type::local; +#endif + } + + EIGEN_STRONG_INLINE unsigned long max_buffer_size() const { + return m_device_info.max_mem_alloc_size; + } + + EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { + return m_device_info.max_compute_units; + } + + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { + return m_device_info.max_work_group_size; + } + + EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { + return m_device_info.max_work_item_sizes; + } + + /// No need for sycl it should act the same as CPU version + EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } + + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { + // OpenCL doesnot have such concept + return 2; + } + + EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { + return m_device_info.local_mem_size; + } + + // This function returns the nearest power of 2 Work-group size which is <= + // maximum device workgroup size. + EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const { + return getPowerOfTwo(m_device_info.max_work_group_size, false); + } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { - ::memcpy(dst, src, n); + EIGEN_STRONG_INLINE std::string getPlatformName() const { + return m_device_info.platform_name; } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const { - auto host_acc= (static_cast*>(add_sycl_buffer(dst, n).first->second.get()))-> template get_access(); - memcpy(host_acc.get_pointer(), src, n); + EIGEN_STRONG_INLINE std::string getDeviceName() const { + return m_device_info.device_name; } - /// whith the current implementation of sycl, the data is copied twice from device to host. This will be fixed soon. - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const { - auto it = buffer_map.find(src); - if (it != buffer_map.end()) { - auto host_acc= (static_cast*>(it->second.get()))-> template get_access(); - memcpy(dst,host_acc.get_pointer(), n); - } else{ - eigen_assert("no device memory found. The memory might be destroyed before creation"); + + EIGEN_STRONG_INLINE std::string getDeviceVendor() const { + return m_device_info.device_vendor; + } + + // This function returns the nearest power of 2 + // if roundup is true returns result>=wgsize + // else it return result <= wgsize + EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const { + if (roundUp) --wGSize; + wGSize |= (wGSize >> 1); + wGSize |= (wGSize >> 2); + wGSize |= (wGSize >> 4); + wGSize |= (wGSize >> 8); + wGSize |= (wGSize >> 16); +#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64 + wGSize |= (wGSize >> 32); +#endif + return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize); + } + + EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; } + + // This function checks if the runtime recorded an error for the + // underlying stream device. + EIGEN_STRONG_INLINE bool ok() const { + if (!exception_caught_) { + synchronize(); } + return !exception_caught_; + } + + EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const { +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + std::lock_guard lock(event_mutex_); + return latest_events_[std::this_thread::get_id()]; +#else + eigen_assert(false); + return cl::sycl::event(); +#endif + } + + // destructor + ~QueueInterface() { + pMapper.clear(); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.clear(); +#endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, size_t n) const { - ::memset(buffer, c, n); + protected: + EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const { +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + std::lock_guard lock(event_mutex_); + latest_events_[std::this_thread::get_id()] = e; +#else + EIGEN_UNUSED_VARIABLE(e); +#endif + } + + void synchronize_and_callback(cl::sycl::event e, + const std::function &callback) const { + set_latest_event(e); + if (callback) { + auto callback_ = [=]() { +#ifdef EIGEN_EXCEPTIONS + cl::sycl::event(e).wait_and_throw(); +#else + cl::sycl::event(e).wait(); +#endif + callback(); + }; + m_thread_pool.Schedule(std::move(callback_)); + } else { +#ifdef EIGEN_EXCEPTIONS + m_queue.wait_and_throw(); +#else + m_queue.wait(); +#endif + } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { - return 1; + + bool sycl_async_handler(cl::sycl::exception_list exceptions) const { + bool exception_caught = false; + for (const auto &e : exceptions) { + if (e) { + exception_caught = true; + EIGEN_THROW_X(e); + } + } + return exception_caught; } + + /// class members: + bool exception_caught_ = false; + + mutable std::mutex pmapper_mutex_; + +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + mutable std::mutex event_mutex_; + mutable std::unordered_map latest_events_; +#endif + + /// std::map is the container used to make sure that we create only one buffer + /// per pointer. The lifespan of the buffer now depends on the lifespan of + /// SyclDevice. If a non-read-only pointer is needed to be accessed on the + /// host we should manually deallocate it. + mutable TensorSycl::internal::PointerMapper pMapper; +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + mutable std::unordered_set scratch_buffers; +#endif + /// sycl queue + mutable cl::sycl::queue m_queue; +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + mutable cl::sycl::program m_prog; +#endif + + /// The thread pool is used to wait on events and call callbacks + /// asynchronously + mutable Eigen::ThreadPool m_thread_pool; + + const TensorSycl::internal::SyclDeviceInfo m_device_info; }; +struct SyclDeviceBase { + /// QueueInterface is not owned. it is the caller's responsibility to destroy + /// it + const QueueInterface *m_queue_stream; + explicit SyclDeviceBase(const QueueInterface *queue_stream) + : m_queue_stream(queue_stream) {} + EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const { + return m_queue_stream; + } +}; + +// Here is a sycl device struct which accept the sycl queue interface +// as an input +struct SyclDevice : public SyclDeviceBase { + explicit SyclDevice(const QueueInterface *queue_stream) + : SyclDeviceBase(queue_stream) {} + + // this is the accessor used to construct the evaluator + template + EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess + get_range_accessor(const void *ptr) const { + return queue_stream()->template get_range_accessor(ptr); + } + + // get sycl accessor + template + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const { + return queue_stream()->template get_sycl_accessor(cgh, ptr); + } + + /// Accessing the created sycl device buffer for the device pointer + EIGEN_STRONG_INLINE cl::sycl::buffer get_sycl_buffer( + const void *ptr) const { + return queue_stream()->get_sycl_buffer(ptr); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, + Index &rng, Index &GRange) const { + queue_stream()->parallel_for_setup(n, tileSize, rng, GRange); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup( + const std::array &input_dim, cl::sycl::range<2> &global_range, + cl::sycl::range<2> &local_range) const { + queue_stream()->parallel_for_setup(input_dim, global_range, local_range); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup( + const std::array &input_dim, cl::sycl::range<3> &global_range, + cl::sycl::range<3> &local_range) const { + queue_stream()->parallel_for_setup(input_dim, global_range, local_range); + } + + /// allocate device memory + EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { + return queue_stream()->allocate(num_bytes); + } + + EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { + return queue_stream()->allocate_temp(num_bytes); + } + + /// deallocate device memory + EIGEN_STRONG_INLINE void deallocate(void *p) const { + queue_stream()->deallocate(p); + } + + EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const { + queue_stream()->deallocate_temp(buffer); + } + template + EIGEN_STRONG_INLINE void deallocate_temp( + const TensorSycl::internal::RangeAccess &buffer) const { + queue_stream()->deallocate_temp(buffer); + } + EIGEN_STRONG_INLINE void deallocate_all() const { + queue_stream()->deallocate_all(); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, data_t> + get(data_t *data) const { + return queue_stream()->get(data); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get( + TensorSycl::internal::RangeAccess + data) const { + return queue_stream()->get(data); + } + + /// attach existing buffer + EIGEN_STRONG_INLINE void *attach_buffer( + cl::sycl::buffer &buf) const { + return queue_stream()->attach_buffer(buf); + } + /// detach buffer + EIGEN_STRONG_INLINE void detach_buffer(void *p) const { + queue_stream()->detach_buffer(p); + } + EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { + return queue_stream()->get_offset(ptr); + } + + // some runtime conditions that can be applied here + EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; } + + /// memcpyHostToDevice + template + EIGEN_STRONG_INLINE void memcpyHostToDevice( + Index *dst, const Index *src, size_t n, + std::function callback = {}) const { + queue_stream()->memcpyHostToDevice(dst, src, n, callback); + } + /// memcpyDeviceToHost + template + EIGEN_STRONG_INLINE void memcpyDeviceToHost( + void *dst, const Index *src, size_t n, + std::function callback = {}) const { + queue_stream()->memcpyDeviceToHost(dst, src, n, callback); + } + /// the memcpy function + template + EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { + queue_stream()->memcpy(dst, src, n); + } + /// the memset function + EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { + queue_stream()->memset(data, c, n); + } + /// returning the sycl queue + EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { + return queue_stream()->sycl_queue(); + } +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + EIGEN_STRONG_INLINE cl::sycl::program &program() const { + return queue_stream()->program(); + } +#endif + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on sycl devices. + return firstLevelCacheSize(); + } + EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { + return queue_stream()->getNumSyclMultiProcessors(); + } + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { + return queue_stream()->maxSyclThreadsPerBlock(); + } + EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { + return queue_stream()->maxWorkItemSizes(); + } + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { + // OpenCL doesnot have such concept + return queue_stream()->maxSyclThreadsPerMultiProcessor(); + } + EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { + return queue_stream()->sharedMemPerBlock(); + } + EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const { + return queue_stream()->getNearestPowerOfTwoWorkGroupSize(); + } + + EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const { + return queue_stream()->getPowerOfTwo(val, roundUp); + } + /// No need for sycl it should act the same as CPU version + EIGEN_STRONG_INLINE int majorDeviceVersion() const { + return queue_stream()->majorDeviceVersion(); + } + + EIGEN_STRONG_INLINE void synchronize() const { + queue_stream()->synchronize(); + } + EIGEN_STRONG_INLINE void async_synchronize( + cl::sycl::event e = cl::sycl::event()) const { + queue_stream()->async_synchronize(e); + } + EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const { + return queue_stream()->get_latest_event(); + } + + // This function checks if the runtime recorded an error for the + // underlying stream device. + EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); } + + EIGEN_STRONG_INLINE bool has_local_memory() const { + return queue_stream()->has_local_memory(); + } + EIGEN_STRONG_INLINE long max_buffer_size() const { + return queue_stream()->max_buffer_size(); + } + EIGEN_STRONG_INLINE std::string getPlatformName() const { + return queue_stream()->getPlatformName(); + } + EIGEN_STRONG_INLINE std::string getDeviceName() const { + return queue_stream()->getDeviceName(); + } + EIGEN_STRONG_INLINE std::string getDeviceVendor() const { + return queue_stream()->getDeviceVendor(); + } + template + EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const { + queue_stream()->template binary_kernel_launcher( + var...); + } + template + EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const { + queue_stream()->template unary_kernel_launcher( + var...); + } + + template + EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const { + queue_stream()->template nullary_kernel_launcher( + var...); + } +}; } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index 069680a11..e524b535a 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -12,67 +12,6 @@ namespace Eigen { -// Use the SimpleThreadPool by default. We'll switch to the new non blocking -// thread pool later. -#ifndef EIGEN_USE_SIMPLE_THREAD_POOL -template using ThreadPoolTempl = NonBlockingThreadPoolTempl; -typedef NonBlockingThreadPool ThreadPool; -#else -template using ThreadPoolTempl = SimpleThreadPoolTempl; -typedef SimpleThreadPool ThreadPool; -#endif - - -// Barrier is an object that allows one or more threads to wait until -// Notify has been called a specified number of times. -class Barrier { - public: - Barrier(unsigned int count) : state_(count << 1), notified_(false) { - eigen_assert(((count << 1) >> 1) == count); - } - ~Barrier() { - eigen_assert((state_>>1) == 0); - } - - void Notify() { - unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2; - if (v != 1) { - eigen_assert(((v + 2) & ~1) != 0); - return; // either count has not dropped to 0, or waiter is not waiting - } - std::unique_lock l(mu_); - eigen_assert(!notified_); - notified_ = true; - cv_.notify_all(); - } - - void Wait() { - unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel); - if ((v >> 1) == 0) return; - std::unique_lock l(mu_); - while (!notified_) { - cv_.wait(l); - } - } - - private: - std::mutex mu_; - std::condition_variable cv_; - std::atomic state_; // low bit is waiter flag - bool notified_; -}; - - -// Notification is an object that allows a user to to wait for another -// thread to signal a notification that an event has occurred. -// -// Multiple threads can wait on the same Notification object, -// but only one caller must call Notify() on the object. -struct Notification : Barrier { - Notification() : Barrier(1) {}; -}; - - // Runs an arbitrary function and then calls Notify() on the passed in // Notification. template struct FunctionWrapperWithNotification @@ -102,22 +41,75 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) { } } +// An abstract interface to a device specific memory allocator. +class Allocator { + public: + virtual ~Allocator() {} + virtual void* allocate(size_t num_bytes) const = 0; + virtual void deallocate(void* buffer) const = 0; +}; // Build a thread pool device on top the an existing pool of threads. struct ThreadPoolDevice { // The ownership of the thread pool remains with the caller. - ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { } + ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores, Allocator* allocator = nullptr) + : pool_(pool), num_threads_(num_cores), allocator_(allocator) { } EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - return internal::aligned_malloc(num_bytes); + return allocator_ ? allocator_->allocate(num_bytes) + : internal::aligned_malloc(num_bytes); } EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - internal::aligned_free(buffer); + if (allocator_) { + allocator_->deallocate(buffer); + } else { + internal::aligned_free(buffer); + } + } + + EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { + return allocate(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { + deallocate(buffer); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; } EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifdef __ANDROID__ ::memcpy(dst, src, n); +#else + // TODO(rmlarsen): Align blocks on cache lines. + // We have observed that going beyond 4 threads usually just wastes + // CPU cycles due to the threads competing for memory bandwidth, so we + // statically schedule at most 4 block copies here. + const size_t kMinBlockSize = 32768; + const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4); + if (n <= kMinBlockSize || num_threads < 2) { + ::memcpy(dst, src, n); + } else { + const char* src_ptr = static_cast(src); + char* dst_ptr = static_cast(dst); + const size_t blocksize = (n + (num_threads - 1)) / num_threads; + Barrier barrier(static_cast(num_threads - 1)); + // Launch the last 3 blocks on worker threads. + for (size_t i = 1; i < num_threads; ++i) { + enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] { + ::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize, + numext::mini(blocksize, n - (i * blocksize))); + }); + } + // Launch the first block on the main thread. + ::memcpy(dst_ptr, src_ptr, blocksize); + barrier.Wait(); + } +#endif } EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { memcpy(dst, src, n); @@ -134,6 +126,12 @@ struct ThreadPoolDevice { return num_threads_; } + // Number of theads available in the underlying thread pool. This number can + // be different from the value returned by numThreads(). + EIGEN_STRONG_INLINE int numThreadsInPool() const { + return pool_->NumThreads(); + } + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return l1CacheSize(); } @@ -149,23 +147,31 @@ struct ThreadPoolDevice { } template - EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const { + EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, + Args&&... args) const { Notification* n = new Notification(); - pool_->Schedule(std::bind(&FunctionWrapperWithNotification::run, n, f, args...)); + pool_->Schedule( + std::bind(&FunctionWrapperWithNotification::run, n, + std::move(f), args...)); return n; } template - EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, - Function&& f, + EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f, Args&&... args) const { - pool_->Schedule(std::bind( - &FunctionWrapperWithBarrier::run, b, f, args...)); + pool_->Schedule( + std::bind(&FunctionWrapperWithBarrier::run, b, + std::move(f), args...)); } template - EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const { - pool_->Schedule(std::bind(f, args...)); + EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, + Args&&... args) const { + if (sizeof...(args) > 0) { + pool_->Schedule(std::bind(std::move(f), args...)); + } else { + pool_->Schedule(std::move(f)); + } } // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if @@ -174,45 +180,193 @@ struct ThreadPoolDevice { return pool_->CurrentThreadId(); } - // parallelFor executes f with [0, n) arguments in parallel and waits for - // completion. F accepts a half-open interval [first, last). - // Block size is choosen based on the iteration cost and resulting parallel + // WARNING: This function is synchronous and will block the calling thread. + // + // Synchronous parallelFor executes f with [0, n) arguments in parallel and + // waits for completion. F accepts a half-open interval [first, last). Block + // size is chosen based on the iteration cost and resulting parallel // efficiency. If block_align is not nullptr, it is called to round up the // block size. void parallelFor(Index n, const TensorOpCost& cost, std::function block_align, std::function f) const { - typedef TensorCostModel CostModel; + if (EIGEN_PREDICT_FALSE(n <= 0)){ + return; + // Compute small problems directly in the caller thread. + } else if (n == 1 || numThreads() == 1 || + CostModel::numThreads(n, cost, static_cast(numThreads())) == 1) { + f(0, n); + return; + } + + // Compute block size and total count of blocks. + ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align); + + // Recursively divide size into halves until we reach block_size. + // Division code rounds mid to block_size, so we are guaranteed to get + // block_count leaves that do actual computations. + Barrier barrier(static_cast(block.count)); + std::function handleRange; + handleRange = [=, &handleRange, &barrier, &f](Index firstIdx, + Index lastIdx) { + while (lastIdx - firstIdx > block.size) { + // Split into halves and schedule the second half on a different thread. + const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size; + pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); }); + lastIdx = midIdx; + } + // Single block or less, execute directly. + f(firstIdx, lastIdx); + barrier.Notify(); + }; + + if (block.count <= numThreads()) { + // Avoid a thread hop by running the root of the tree and one block on the + // main thread. + handleRange(0, n); + } else { + // Execute the root in the thread pool to avoid running work on more than + // numThreads() threads. + pool_->Schedule([=, &handleRange]() { handleRange(0, n); }); + } + + barrier.Wait(); + } + + // Convenience wrapper for parallelFor that does not align blocks. + void parallelFor(Index n, const TensorOpCost& cost, + std::function f) const { + parallelFor(n, cost, nullptr, std::move(f)); + } + + // WARNING: This function is asynchronous and will not block the calling thread. + // + // Asynchronous parallelFor executes f with [0, n) arguments in parallel + // without waiting for completion. When the last block finished, it will call + // 'done' callback. F accepts a half-open interval [first, last). Block size + // is chosen based on the iteration cost and resulting parallel efficiency. If + // block_align is not nullptr, it is called to round up the block size. + void parallelForAsync(Index n, const TensorOpCost& cost, + std::function block_align, + std::function f, + std::function done) const { + // Compute small problems directly in the caller thread. if (n <= 1 || numThreads() == 1 || CostModel::numThreads(n, cost, static_cast(numThreads())) == 1) { f(0, n); + done(); return; } - // Calculate block size based on (1) the iteration cost and (2) parallel - // efficiency. We want blocks to be not too small to mitigate - // parallelization overheads; not too large to mitigate tail - // effect and potential load imbalance and we also want number - // of blocks to be evenly dividable across threads. + // Compute block size and total count of blocks. + ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align); + + ParallelForAsyncContext* const ctx = + new ParallelForAsyncContext(block.count, std::move(f), std::move(done)); + + // Recursively divide size into halves until we reach block_size. + // Division code rounds mid to block_size, so we are guaranteed to get + // block_count leaves that do actual computations. + ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) { + while (lastIdx - firstIdx > block.size) { + // Split into halves and schedule the second half on a different thread. + const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size; + pool_->Schedule( + [ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); }); + lastIdx = midIdx; + } + + // Single block or less, execute directly. + ctx->f(firstIdx, lastIdx); + + // Delete async context if it was the last block. + if (ctx->count.fetch_sub(1) == 1) delete ctx; + }; + + if (block.count <= numThreads()) { + // Avoid a thread hop by running the root of the tree and one block on the + // main thread. + ctx->handle_range(0, n); + } else { + // Execute the root in the thread pool to avoid running work on more than + // numThreads() threads. + pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); }); + } + } + + // Convenience wrapper for parallelForAsync that does not align blocks. + void parallelForAsync(Index n, const TensorOpCost& cost, + std::function f, + std::function done) const { + parallelForAsync(n, cost, nullptr, std::move(f), std::move(done)); + } + + // Thread pool accessor. + ThreadPoolInterface* getPool() const { return pool_; } + + // Allocator accessor. + Allocator* allocator() const { return allocator_; } + + private: + typedef TensorCostModel CostModel; + + // For parallelForAsync we must keep passed in closures on the heap, and + // delete them only after `done` callback finished. + struct ParallelForAsyncContext { + ParallelForAsyncContext(Index block_count, + std::function block_f, + std::function done_callback) + : count(block_count), + f(std::move(block_f)), + done(std::move(done_callback)) {} + ~ParallelForAsyncContext() { done(); } + + std::atomic count; + std::function f; + std::function done; + + std::function handle_range; + }; + + struct ParallelForBlock { + Index size; // block size + Index count; // number of blocks + }; + + // Calculates block size based on (1) the iteration cost and (2) parallel + // efficiency. We want blocks to be not too small to mitigate parallelization + // overheads; not too large to mitigate tail effect and potential load + // imbalance and we also want number of blocks to be evenly dividable across + // threads. + ParallelForBlock CalculateParallelForBlock( + const Index n, const TensorOpCost& cost, + std::function block_align) const { + const double block_size_f = 1.0 / CostModel::taskSize(1, cost); + const Index max_oversharding_factor = 4; + Index block_size = numext::mini( + n, numext::maxi( + divup(n, max_oversharding_factor * numThreads()), + block_size_f)); + const Index max_block_size = numext::mini(n, 2 * block_size); - double block_size_f = 1.0 / CostModel::taskSize(1, cost); - Index block_size = numext::mini(n, numext::maxi(1, block_size_f)); - const Index max_block_size = - numext::mini(n, numext::maxi(1, 2 * block_size_f)); if (block_align) { Index new_block_size = block_align(block_size); eigen_assert(new_block_size >= block_size); block_size = numext::mini(n, new_block_size); } + Index block_count = divup(n, block_size); + // Calculate parallel efficiency as fraction of total CPU time used for // computations: double max_efficiency = static_cast(block_count) / (divup(block_count, numThreads()) * numThreads()); + // Now try to increase block size up to max_block_size as long as it // doesn't decrease parallel efficiency. - for (Index prev_block_count = block_count; prev_block_count > 1;) { + for (Index prev_block_count = block_count; + max_efficiency < 1.0 && prev_block_count > 1;) { // This is the next block size that divides size into a smaller number // of blocks than the current block_size. Index coarser_block_size = divup(n, prev_block_count - 1); @@ -241,36 +395,12 @@ struct ThreadPoolDevice { } } - // Recursively divide size into halves until we reach block_size. - // Division code rounds mid to block_size, so we are guaranteed to get - // block_count leaves that do actual computations. - Barrier barrier(static_cast(block_count)); - std::function handleRange; - handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) { - if (last - first <= block_size) { - // Single block or less, execute directly. - f(first, last); - barrier.Notify(); - return; - } - // Split into halves and submit to the pool. - Index mid = first + divup((last - first) / 2, block_size) * block_size; - pool_->Schedule([=, &handleRange]() { handleRange(mid, last); }); - pool_->Schedule([=, &handleRange]() { handleRange(first, mid); }); - }; - handleRange(0, n); - barrier.Wait(); - } - - // Convenience wrapper for parallelFor that does not align blocks. - void parallelFor(Index n, const TensorOpCost& cost, - std::function f) const { - parallelFor(n, cost, nullptr, std::move(f)); + return {block_size, block_count}; } - private: ThreadPoolInterface* pool_; int num_threads_; + Allocator* allocator_; }; diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index b24cdebf1..f0f1e832a 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -32,16 +32,16 @@ namespace Eigen { // Boilerplate code namespace internal { -template struct dget { - static const std::size_t value = get::value; +template struct dget { + static const std::ptrdiff_t value = get::value; }; -template +template struct fixed_size_tensor_index_linearization_helper { template EIGEN_DEVICE_FUNC - static inline Index run(array const& indices, + static EIGEN_STRONG_INLINE Index run(array const& indices, const Dimensions& dimensions) { return array_get(indices) + @@ -50,21 +50,21 @@ struct fixed_size_tensor_index_linearization_helper } }; -template +template struct fixed_size_tensor_index_linearization_helper { template EIGEN_DEVICE_FUNC - static inline Index run(array const&, const Dimensions&) + static EIGEN_STRONG_INLINE Index run(array const&, const Dimensions&) { return 0; } }; -template +template struct fixed_size_tensor_index_extraction_helper { template EIGEN_DEVICE_FUNC - static inline Index run(const Index index, + static EIGEN_STRONG_INLINE Index run(const Index index, const Dimensions& dimensions) { const Index mult = (index == n-1) ? 1 : 0; @@ -77,7 +77,7 @@ template struct fixed_size_tensor_index_extraction_helper { template EIGEN_DEVICE_FUNC - static inline Index run(const Index, + static EIGEN_STRONG_INLINE Index run(const Index, const Dimensions&) { return 0; @@ -90,9 +90,11 @@ struct fixed_size_tensor_index_extraction_helper // Fixed size #ifndef EIGEN_EMULATE_CXX11_META_H template -struct Sizes : internal::numeric_list { +struct Sizes { typedef internal::numeric_list Base; + const Base t = Base(); static const std::ptrdiff_t total_size = internal::arg_prod(Indices...); + static const ptrdiff_t count = Base::count; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const { return Base::count; @@ -119,17 +121,17 @@ struct Sizes : internal::numeric_list { return *this; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const { - return internal::fixed_size_tensor_index_extraction_helper::run(index, *this); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::ptrdiff_t index) const { + return internal::fixed_size_tensor_index_extraction_helper::run(index, t); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - size_t IndexOfColMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); + ptrdiff_t IndexOfColMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, t); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - size_t IndexOfRowMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); + ptrdiff_t IndexOfRowMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, t); } }; @@ -142,25 +144,25 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes +template struct non_zero_size { - typedef internal::type2val type; + typedef internal::type2val type; }; template <> struct non_zero_size<0> { typedef internal::null_type type; }; -template struct Sizes { +template struct Sizes { typedef typename internal::make_type_list::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type >::type Base; - static const size_t count = Base::count; - static const std::size_t total_size = internal::arg_prod::value; + static const std::ptrdiff_t count = Base::count; + static const std::ptrdiff_t total_size = internal::arg_prod::value; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t rank() const { return count; } - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t TotalSize() { return internal::arg_prod::value; } @@ -176,7 +178,7 @@ template Sizes(DenseIndex... /*indices*/) { } - explicit Sizes(std::initializer_list) { + explicit Sizes(std::initializer_list) { // todo: add assertion } #else @@ -192,7 +194,7 @@ template ::value; @@ -206,23 +208,23 @@ template ::value; default: eigen_assert(false && "index overflow"); - return static_cast(-1); + return static_cast(-1); } } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - size_t IndexOfColMajor(const array& indices) const { + ptrdiff_t IndexOfColMajor(const array& indices) const { return internal::fixed_size_tensor_index_linearization_helper::run(indices, *reinterpret_cast(this)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - size_t IndexOfRowMajor(const array& indices) const { + ptrdiff_t IndexOfRowMajor(const array& indices) const { return internal::fixed_size_tensor_index_linearization_helper::run(indices, *reinterpret_cast(this)); } }; namespace internal { -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes&) { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes&) { return Sizes::total_size; } } @@ -231,7 +233,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes +template struct tensor_index_linearization_helper { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -243,7 +245,7 @@ struct tensor_index_linearization_helper } }; -template +template struct tensor_index_linearization_helper { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -262,7 +264,7 @@ struct DSizes : array { typedef array Base; static const int count = NumDims; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumDims; } @@ -282,6 +284,57 @@ struct DSizes : array { (*this)[0] = i0; } + EIGEN_DEVICE_FUNC DSizes(const DimensionList& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } + + // Enable DSizes index type promotion only if we are promoting to the + // larger type, e.g. allow to promote dimensions of type int to long. + template + EIGEN_DEVICE_FUNC + explicit DSizes(const array& other, + // Default template parameters require c++11. + typename internal::enable_if< + internal::is_same< + DenseIndex, + typename internal::promote_index_type< + DenseIndex, + OtherIndex + >::type + >::value, void*>::type = 0) { + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = static_cast(other[i]); + } + } + +#ifdef EIGEN_HAS_INDEX_LIST + template + EIGEN_DEVICE_FUNC + explicit DSizes(const Eigen::IndexList& dimensions) { + for (int i = 0; i < dimensions.count; ++i) { + (*this)[i] = dimensions[i]; + } + } +#endif + +#ifndef EIGEN_EMULATE_CXX11_META_H + template + EIGEN_DEVICE_FUNC DSizes(const Sizes& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } +#else + template + EIGEN_DEVICE_FUNC DSizes(const Sizes& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } +#endif + #if EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) { @@ -330,12 +383,21 @@ struct DSizes : array { } }; - - +template +std::ostream& operator<<(std::ostream& os, + const DSizes& dims) { + os << "["; + for (int i = 0; i < NumDims; ++i) { + if (i > 0) os << ", "; + os << dims[i]; + } + os << "]"; + return os; +} // Boilerplate namespace internal { -template +template struct tensor_vsize_index_linearization_helper { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -347,7 +409,7 @@ struct tensor_vsize_index_linearization_helper } }; -template +template struct tensor_vsize_index_linearization_helper { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -362,10 +424,10 @@ struct tensor_vsize_index_linearization_helper namespace internal { template struct array_size > { - static const size_t value = NumDims; + static const ptrdiff_t value = NumDims; }; template struct array_size > { - static const size_t value = NumDims; + static const ptrdiff_t value = NumDims; }; #ifndef EIGEN_EMULATE_CXX11_META_H template struct array_size > { @@ -375,42 +437,42 @@ template struct array_size::count; }; template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes&) { - return get >::value; + return get >::value; } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) { eigen_assert(false && "should never be called"); return -1; } #else -template struct array_size > { - static const size_t value = Sizes::count; +template struct array_size > { + static const ptrdiff_t value = Sizes::count; }; -template struct array_size > { - static const size_t value = Sizes::count; +template struct array_size > { + static const ptrdiff_t value = Sizes::count; }; -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes&) { +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes&) { return get::Base>::value; } #endif -template +template struct sizes_match_below_dim { - static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) { return false; } }; -template +template struct sizes_match_below_dim { - static EIGEN_DEVICE_FUNC inline bool run(Dims1& dims1, Dims2& dims2) { - return (array_get(dims1) == array_get(dims2)) & + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) { + return (array_get(dims1) == array_get(dims2)) && sizes_match_below_dim::run(dims1, dims2); } }; template struct sizes_match_below_dim { - static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) { return true; } }; @@ -419,7 +481,7 @@ struct sizes_match_below_dim { template -EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) { +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) { return internal::sizes_match_below_dim::value, internal::array_size::value>::run(dims1, dims2); } diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 06987132b..a48d035f5 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -32,6 +32,7 @@ struct traits > typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename MakePointer_::Type PointerType; enum { Flags = 0 @@ -41,6 +42,8 @@ struct traits > // Intermediate typedef to workaround MSVC issue. typedef MakePointer_ MakePointerT; typedef typename MakePointerT::Type Type; + + }; }; @@ -73,6 +76,8 @@ class TensorEvalToOp : public TensorBase, typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; + static const int NumDims = Eigen::internal::traits::NumDimensions; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr) : m_xpr(expr), m_buffer(buffer) {} @@ -98,38 +103,60 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef typename internal::remove_const::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - + static const int PacketSize = PacketType::size; + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = true + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = true, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = true }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_device(device), - m_buffer(op.buffer()), m_op(op), m_expression(op.expression()) - { } + static const int NumDims = internal::traits::NumDimensions; - // Used for accessor extraction in SYCL Managed TensorMap: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const { - return m_op; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + ArgTensorBlock; + + typedef internal::TensorBlockAssignment< + CoeffReturnType, NumDims, typename ArgTensorBlock::XprType, Index> + TensorBlockAssignment; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){} + + + EIGEN_STRONG_INLINE ~TensorEvaluator() { } - typedef typename internal::traits >::template MakePointer::Type DevicePointer; + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(DevicePointer scalar) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) { EIGEN_UNUSED_VARIABLE(scalar); eigen_assert(scalar == NULL); return m_impl.evalSubExprsIfNeeded(m_buffer); } +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType scalar, EvalSubExprsCallback done) { + EIGEN_UNUSED_VARIABLE(scalar); + eigen_assert(scalar == NULL); + m_impl.evalSubExprsIfNeededAsync(m_buffer, std::move(done)); + } +#endif + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { m_buffer[i] = m_impl.coeff(i); } @@ -137,7 +164,34 @@ struct TensorEvaluator, Device> internal::pstoret(m_buffer + i, m_impl.template packet::IsAligned ? Aligned : Unaligned>(i)); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return m_impl.getResourceRequirements(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock( + TensorBlockDesc& desc, TensorBlockScratch& scratch) { + // Add `m_buffer` as destination buffer to the block descriptor. + desc.template AddDestinationBuffer( + /*dst_base=*/m_buffer + desc.offset(), + /*dst_strides=*/internal::strides(m_impl.dimensions())); + + ArgTensorBlock block = + m_impl.block(desc, scratch, /*root_of_expr_ast=*/true); + + // If block was evaluated into a destination buffer, there is no need to do + // an assignment. + if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { + TensorBlockAssignment::Run( + TensorBlockAssignment::target( + desc.dimensions(), internal::strides(m_impl.dimensions()), + m_buffer, desc.offset()), + block.expr()); + } + block.cleanup(); + } + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -159,19 +213,20 @@ struct TensorEvaluator, Device> TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC DevicePointer data() const { return m_buffer; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; } ArgType expression() const { return m_expression; } + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + m_buffer.bind(cgh); + } + #endif - /// required by sycl in order to extract the accessor - const TensorEvaluator& impl() const { return m_impl; } - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} private: TensorEvaluator m_impl; - const Device& m_device; - DevicePointer m_buffer; - const XprType& m_op; + EvaluatorPointerType m_buffer; const ArgType m_expression; }; diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 834ce07df..3aff7fa01 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -32,44 +32,72 @@ struct TensorEvaluator typedef typename Derived::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef typename Derived::Dimensions Dimensions; + typedef Derived XprType; + static const int PacketSize = PacketType::size; + typedef typename internal::traits::template MakePointer::Type TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; // NumDimensions is -1 for variable dim tensors static const int NumCoords = internal::traits::NumDimensions > 0 ? internal::traits::NumDimensions : 0; enum { - IsAligned = Derived::IsAligned, - PacketAccess = (internal::unpacket_traits::size > 1), - Layout = Derived::Layout, - CoordAccess = NumCoords > 0, - RawAccess = true + IsAligned = Derived::IsAligned, + PacketAccess = (PacketType::size > 1), + BlockAccess = internal::is_arithmetic::type>::value, + PreferBlockAccess = false, + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, + RawAccess = true }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(const_cast::template MakePointer::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m) + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + : m_data(device.get((const_cast(m.data())))), + m_dims(m.dimensions()), + m_device(device) { } - // Used for accessor extraction in SYCL Managed TensorMap: - const Derived& derived() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) { - if (dest) { - m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize()); + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) { + if (!NumTraits::type>::RequireInitialization && dest) { + m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); return false; } return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType dest, EvalSubExprsCallback done) { + // TODO(ezhulenev): ThreadPoolDevice memcpy is blockign operation. + done(evalSubExprsIfNeeded(dest)); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_data); + eigen_assert(m_data != NULL); return m_data[index]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - eigen_assert(m_data); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { + eigen_assert(m_data != NULL); return m_data[index]; } @@ -79,6 +107,18 @@ struct TensorEvaluator return internal::ploadt(m_data + index); } + // Return a packet starting at `index` where `umask` specifies which elements + // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for + // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding + // float element will be loaded, otherwise 0 will be loaded. + // Function has been templatized to enable Sfinae. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if::masked_load_available, PacketReturnTypeT>::type + partialPacket(Index index, typename internal::unpacket_traits::mask_t umask) const + { + return internal::ploadu(m_data + index, umask); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { @@ -86,7 +126,7 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { - eigen_assert(m_data); + eigen_assert(m_data != NULL); if (static_cast(Layout) == static_cast(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; } else { @@ -94,8 +134,9 @@ struct TensorEvaluator } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& coords) { - eigen_assert(m_data); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& + coeffRef(const array& coords) { + eigen_assert(m_data != NULL); if (static_cast(Layout) == static_cast(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; } else { @@ -105,19 +146,50 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits::size); + PacketType::size); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); } - EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + assert(m_data != NULL); + return TensorBlock::materialize(m_data, m_dims, desc, scratch); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { + assert(m_data != NULL); + + typedef typename TensorBlock::XprType TensorBlockExpr; + typedef internal::TensorBlockAssignment + TensorBlockAssign; - /// required by sycl in order to construct sycl buffer from raw pointer - const Device& device() const{return m_device;} + TensorBlockAssign::Run( + TensorBlockAssign::target(desc.dimensions(), + internal::strides(m_dims), m_data, + desc.offset()), + block.expr()); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } +#endif protected: - typename internal::traits::template MakePointer::Type m_data; + EvaluatorPointerType m_data; Dimensions m_dims; - const Device& m_device; - const Derived& m_impl; + const Device EIGEN_DEVICE_REF m_device; }; namespace { @@ -126,7 +198,7 @@ T loadConstant(const T* address) { return *address; } // Use the texture cache on CUDA devices whenever possible -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float loadConstant(const float* address) { return __ldg(address); @@ -140,6 +212,13 @@ Eigen::half loadConstant(const Eigen::half* address) { return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x))); } #endif +#ifdef EIGEN_USE_SYCL +// overload of load constant should be implemented here based on range access +template +T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess &address) { + return *address; +} +#endif } @@ -152,40 +231,64 @@ struct TensorEvaluator typedef typename Derived::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef typename Derived::Dimensions Dimensions; + typedef const Derived XprType; + typedef typename internal::traits::template MakePointer::Type TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + typedef typename internal::remove_const::type ScalarNoConst; // NumDimensions is -1 for variable dim tensors static const int NumCoords = internal::traits::NumDimensions > 0 ? internal::traits::NumDimensions : 0; + static const int PacketSize = PacketType::size; enum { - IsAligned = Derived::IsAligned, - PacketAccess = (internal::unpacket_traits::size > 1), - Layout = Derived::Layout, - CoordAccess = NumCoords > 0, - RawAccess = true + IsAligned = Derived::IsAligned, + PacketAccess = (PacketType::size > 1), + BlockAccess = internal::is_arithmetic::value, + PreferBlockAccess = false, + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, + RawAccess = true }; - // Used for accessor extraction in SYCL Managed TensorMap: - const Derived& derived() const { return m_impl; } + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(m.data()), m_dims(m.dimensions()), m_device(device), m_impl(m) + EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { if (!NumTraits::type>::RequireInitialization && data) { - m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar)); + m_device.memcpy((void*)(m_device.get(data)),m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); return false; } return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType dest, EvalSubExprsCallback done) { + // TODO(ezhulenev): ThreadPoolDevice memcpy is a blockign operation. + done(evalSubExprsIfNeeded(dest)); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_data); + eigen_assert(m_data != NULL); return loadConstant(m_data+index); } @@ -195,8 +298,20 @@ struct TensorEvaluator return internal::ploadt_ro(m_data + index); } + // Return a packet starting at `index` where `umask` specifies which elements + // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for + // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding + // float element will be loaded, otherwise 0 will be loaded. + // Function has been templatized to enable Sfinae. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if::masked_load_available, PacketReturnTypeT>::type + partialPacket(Index index, typename internal::unpacket_traits::mask_t umask) const + { + return internal::ploadu(m_data + index, umask); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { - eigen_assert(m_data); + eigen_assert(m_data != NULL); const Index index = (static_cast(Layout) == static_cast(ColMajor)) ? m_dims.IndexOfColMajor(coords) : m_dims.IndexOfRowMajor(coords); return loadConstant(m_data+index); @@ -204,19 +319,32 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits::size); + PacketType::size); } - EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); + } - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + assert(m_data != NULL); + return TensorBlock::materialize(m_data, m_dims, desc, scratch); + } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } +#endif protected: - typename internal::traits::template MakePointer::Type m_data; + EvaluatorPointerType m_data; Dimensions m_dims; - const Device& m_device; - const Derived& m_impl; + const Device EIGEN_DEVICE_REF m_device; }; @@ -229,15 +357,6 @@ struct TensorEvaluator, Device> { typedef TensorCwiseNullaryOp XprType; - enum { - IsAligned = true, - PacketAccess = internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() { } @@ -246,13 +365,42 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = true, + PacketAccess = internal::functor_traits::PacketAccess + #ifdef EIGEN_USE_SYCL + && (PacketType::size >1) + #endif + , + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + done(true); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { @@ -268,16 +416,17 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits::size); + PacketType::size); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } - - /// required by sycl in order to extract the accessor - const TensorEvaluator& impl() const { return m_argImpl; } - /// required by sycl in order to extract the accessor - NullaryOp functor() const { return m_functor; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_argImpl.bind(cgh); + } +#endif private: const NullaryOp m_functor; @@ -295,32 +444,60 @@ struct TensorEvaluator, Device> typedef TensorCwiseUnaryOp XprType; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(internal::functor_traits::PacketAccess), + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), + TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), + m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const::type ScalarNoConst; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + static const int NumDims = internal::array_size::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + ArgTensorBlock; + + typedef internal::TensorCwiseUnaryBlock + TensorBlock; + //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_argImpl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_argImpl.cleanup(); } @@ -341,15 +518,31 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + static const double functor_cost = internal::functor_traits::Cost; + return m_argImpl.getResourceRequirements().addCostPerCoeff( + {0, 0, functor_cost / PacketSize}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + return TensorBlock(m_argImpl.block(desc, scratch), m_functor); + } - /// required by sycl in order to extract the accessor - const TensorEvaluator & impl() const { return m_argImpl; } - /// added for sycl in order to construct the buffer from sycl device - UnaryOp functor() const { return m_functor; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const{ + m_argImpl.bind(cgh); + } +#endif private: + const Device EIGEN_DEVICE_REF m_device; const UnaryOp m_functor; TensorEvaluator m_argImpl; }; @@ -363,16 +556,23 @@ struct TensorEvaluator XprType; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = int(TensorEvaluator::IsAligned) & + int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(TensorEvaluator::PacketAccess) & + int(internal::functor_traits::PacketAccess), + BlockAccess = int(TensorEvaluator::BlockAccess) & + int(TensorEvaluator::BlockAccess), + PreferBlockAccess = int(TensorEvaluator::PreferBlockAccess) | + int(TensorEvaluator::PreferBlockAccess), + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), + TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), + m_functor(op.functor()), m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { @@ -384,8 +584,27 @@ struct TensorEvaluator::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static const int NumDims = internal::array_size< + typename TensorEvaluator::Dimensions>::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + LeftTensorBlock; + typedef typename TensorEvaluator::TensorBlock + RightTensorBlock; + + typedef internal::TensorCwiseBinaryBlock + TensorBlock; + //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -393,12 +612,25 @@ struct TensorEvaluator + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + // TODO(ezhulenev): Evaluate two expression in parallel? + m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { + m_rightImpl.evalSubExprsIfNeededAsync(nullptr, + [done](bool) { done(true); }); + }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); } @@ -421,15 +653,34 @@ struct TensorEvaluator& left_impl() const { return m_leftImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& right_impl() const { return m_rightImpl; } - /// required by sycl in order to extract the accessor - BinaryOp functor() const { return m_functor; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + static const double functor_cost = internal::functor_traits::Cost; + return internal::TensorBlockResourceRequirements::merge( + m_leftImpl.getResourceRequirements(), + m_rightImpl.getResourceRequirements()) + .addCostPerCoeff({0, 0, functor_cost / PacketSize}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + desc.DropDestinationBuffer(); + return TensorBlock(m_leftImpl.block(desc, scratch), + m_rightImpl.block(desc, scratch), m_functor); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_leftImpl.bind(cgh); + m_rightImpl.bind(cgh); + } + #endif private: + const Device EIGEN_DEVICE_REF m_device; const BinaryOp m_functor; TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; @@ -444,14 +695,20 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + PacketAccess = TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess && + internal::functor_traits::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + TensorEvaluator(const XprType& op, const Device& device) : m_functor(op.functor()), m_arg1Impl(op.arg1Expression(), device), m_arg2Impl(op.arg2Expression(), device), @@ -479,8 +736,14 @@ struct TensorEvaluator::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -488,13 +751,13 @@ struct TensorEvaluator & arg1Impl() const { return m_arg1Impl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& arg2Impl() const { return m_arg2Impl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& arg3Impl() const { return m_arg3Impl; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_arg1Impl.bind(cgh); + m_arg2Impl.bind(cgh); + m_arg3Impl.bind(cgh); + } +#endif private: const TernaryOp m_functor; @@ -547,15 +812,23 @@ struct TensorEvaluator typedef typename XprType::Scalar Scalar; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & - internal::packet_traits::HasBlend, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & + TensorEvaluator::PacketAccess & + PacketType::HasBlend, + BlockAccess = TensorEvaluator::BlockAccess && + TensorEvaluator::BlockAccess && + TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + TensorEvaluator(const XprType& op, const Device& device) : m_condImpl(op.ifExpression(), device), m_thenImpl(op.thenExpression(), device), m_elseImpl(op.elseExpression(), device) @@ -569,8 +842,42 @@ struct TensorEvaluator typedef typename XprType::Index Index; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static const int NumDims = internal::array_size::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + IfArgTensorBlock; + typedef typename TensorEvaluator::TensorBlock + ThenArgTensorBlock; + typedef typename TensorEvaluator::TensorBlock + ElseArgTensorBlock; + + struct TensorSelectOpBlockFactory { + template + struct XprType { + typedef TensorSelectOp type; + }; + + template + typename XprType::type expr( + const IfArgXprType& if_expr, const ThenArgXprType& then_expr, const ElseArgXprType& else_expr) const { + return typename XprType::type(if_expr, then_expr, else_expr); + } + }; + + typedef internal::TensorTernaryExprBlock + TensorBlock; + //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -578,13 +885,26 @@ struct TensorEvaluator return m_condImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_condImpl.evalSubExprsIfNeeded(NULL); m_thenImpl.evalSubExprsIfNeeded(NULL); m_elseImpl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_condImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) { + m_thenImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) { + m_elseImpl.evalSubExprsIfNeeded(nullptr, [done](bool) { done(true); }); + }); + }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_condImpl.cleanup(); m_thenImpl.cleanup(); m_elseImpl.cleanup(); @@ -597,13 +917,15 @@ struct TensorEvaluator template EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - internal::Selector select; - for (Index i = 0; i < PacketSize; ++i) { - select.select[i] = m_condImpl.coeff(index+i); - } - return internal::pblend(select, - m_thenImpl.template packet(index), - m_elseImpl.template packet(index)); + internal::Selector select; + EIGEN_UNROLL_LOOP + for (Index i = 0; i < PacketSize; ++i) { + select.select[i] = m_condImpl.coeff(index+i); + } + return internal::pblend(select, + m_thenImpl.template packet(index), + m_elseImpl.template packet(index)); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost @@ -613,14 +935,42 @@ struct TensorEvaluator .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator & cond_impl() const { return m_condImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& then_impl() const { return m_thenImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& else_impl() const { return m_elseImpl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + auto then_req = m_thenImpl.getResourceRequirements(); + auto else_req = m_elseImpl.getResourceRequirements(); + + auto merged_req = + internal::TensorBlockResourceRequirements::merge(then_req, else_req); + merged_req.cost_per_coeff = + then_req.cost_per_coeff.cwiseMax(else_req.cost_per_coeff); + + return internal::TensorBlockResourceRequirements::merge( + m_condImpl.getResourceRequirements(), merged_req); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + // It's unsafe to pass destination buffer to underlying expressions, because + // output might be aliased with one of the inputs. + desc.DropDestinationBuffer(); + + return TensorBlock( + m_condImpl.block(desc, scratch), m_thenImpl.block(desc, scratch), + m_elseImpl.block(desc, scratch), TensorSelectOpBlockFactory()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_condImpl.bind(cgh); + m_thenImpl.bind(cgh); + m_elseImpl.bind(cgh); + } +#endif private: TensorEvaluator m_condImpl; TensorEvaluator m_thenImpl; diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index f01d77c0a..c52fb77dc 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -12,31 +12,94 @@ namespace Eigen { -/** \class TensorExecutor - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor executor class. - * - * This class is responsible for launch the evaluation of the expression on - * the specified computing device. - */ +/** + * \class TensorExecutor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor executor class. + * + * This class is responsible for launch the evaluation of the expression on + * the specified computing device. + * + * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and + * instructions) + * @tparam Tiling can use block based tensor evaluation + * (see TensorBlock.h) + */ namespace internal { -// Default strategy: the expression is evaluated with a single cpu thread. -template -class TensorExecutor -{ +/** + * Evaluating TensorBroadcastingOp via coefficient of packet path is extremely + * expensive. If expression has at least one broadcast op in it, and it supports + * block based evaluation, we always prefer it, even for the small tensors. For + * all other tileable ops, block evaluation overhead for small tensors (fits + * into L1) is too large, and we fallback on vectorized evaluation. + */ + +// TODO(ezhulenev): Add specializations for all other types of Tensor ops. + +template +struct ExpressionHasTensorBroadcastingOp { + enum { value = false }; +}; + +template +struct ExpressionHasTensorBroadcastingOp< + const TensorAssignOp > { + enum { value = ExpressionHasTensorBroadcastingOp::value }; +}; + +template +struct ExpressionHasTensorBroadcastingOp< + const TensorCwiseUnaryOp > { + enum { value = ExpressionHasTensorBroadcastingOp::value }; +}; + +template +struct ExpressionHasTensorBroadcastingOp< + const TensorCwiseBinaryOp > { + enum { + value = ExpressionHasTensorBroadcastingOp::value || + ExpressionHasTensorBroadcastingOp::value + }; +}; + +template +struct ExpressionHasTensorBroadcastingOp< + const TensorBroadcastingOp > { + enum { value = true }; +}; + +// -------------------------------------------------------------------------- // + +/** + * Default strategy: the expression is evaluated sequentially with a single cpu + * thread, without vectorization and block evaluation. + */ +template +class TensorExecutor { public: - typedef typename Expression::Index Index; + typedef typename Expression::Index StorageIndex; + + // Including `unsupported/Eigen/CXX11/Tensor` in different translation units + // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR + // violation. If this template is instantiated with a non-default device, it + // means that this header file was included without defining + // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`. + static_assert(std::is_same::value, + "Default executor instantiated with non-default device. " + "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or " + "EIGEN_USE_SYCL before including Eigen headers."); + EIGEN_DEVICE_FUNC - static inline void run(const Expression& expr, const Device& device = Device()) - { + static EIGEN_STRONG_INLINE void run(const Expression& expr, + const Device& device = Device()) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const Index size = array_prod(evaluator.dimensions()); - for (Index i = 0; i < size; ++i) { + if (needs_assign) { + const StorageIndex size = array_prod(evaluator.dimensions()); + for (StorageIndex i = 0; i < size; ++i) { evaluator.evalScalar(i); } } @@ -44,35 +107,48 @@ class TensorExecutor } }; - -template -class TensorExecutor -{ +/** + * Default async execution strategy is not implemented. Currently it's only + * available for ThreadPoolDevice (see definition below). + */ +template +class TensorAsyncExecutor {}; + +/** + * Process all the data with a single cpu thread, using vectorized instructions. + */ +template +class TensorExecutor { public: - typedef typename Expression::Index Index; + typedef typename Expression::Index StorageIndex; + EIGEN_DEVICE_FUNC - static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) - { + static EIGEN_STRONG_INLINE void run( + const Expression& expr, const DefaultDevice& device = DefaultDevice()) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const Index size = array_prod(evaluator.dimensions()); - const int PacketSize = unpacket_traits::PacketReturnType>::size; - // Give the compiler a strong hint to unroll the loop. But don't insist - // on unrolling, because if the function is expensive the compiler should not + if (needs_assign) { + const StorageIndex size = array_prod(evaluator.dimensions()); + const int PacketSize = unpacket_traits::PacketReturnType>::size; + + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not // unroll the loop at the expense of inlining. - const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; - for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { - for (Index j = 0; j < 4; j++) { + const StorageIndex UnrolledSize = + (size / (4 * PacketSize)) * 4 * PacketSize; + for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) { + for (StorageIndex j = 0; j < 4; j++) { evaluator.evalPacket(i + j * PacketSize); } } - const Index VectorizedSize = (size / PacketSize) * PacketSize; - for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { + const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize; + for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) { evaluator.evalPacket(i); } - for (Index i = VectorizedSize; i < size; ++i) { + for (StorageIndex i = VectorizedSize; i < size; ++i) { evaluator.evalScalar(i); } } @@ -80,55 +156,162 @@ class TensorExecutor } }; +/** + * Process all the data with a single cpu thread, using blocks of data. By + * sizing a block to fit L1 cache we get better cache performance. + */ +template +class TensorExecutor { + public: + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; + + typedef TensorEvaluator Evaluator; + typedef typename traits::Index StorageIndex; + + static const int NumDims = traits::NumDimensions; + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE void run(const Expression& expr, + const DefaultDevice& device = DefaultDevice()) { + typedef TensorBlockMapper + TensorBlockMapper; + + typedef internal::TensorBlockDescriptor + TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator + TensorBlockScratch; + + Evaluator evaluator(expr, device); + + // TODO(ezhulenev): Do not use tiling for small tensors? + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + // Query expression tree for desired block size/shape. + const TensorBlockResourceRequirements requirements = + evaluator.getResourceRequirements(); -// Multicore strategy: the index space is partitioned and each partition is executed on a single core + const TensorBlockMapper block_mapper( + typename TensorBlockDesc::Dimensions(evaluator.dimensions()), + requirements); + + // Share scratch memory allocator between all blocks. + TensorBlockScratch scratch(device); + + const StorageIndex total_block_count = block_mapper.blockCount(); + for (StorageIndex i = 0; i < total_block_count; ++i) { + TensorBlockDesc desc = block_mapper.blockDescriptor(i); + evaluator.evalBlock(desc, scratch); + scratch.reset(); + } + } + evaluator.cleanup(); + } +}; + +/** + * Multicore strategy: the index space is partitioned and each partition is + * executed on a single core. + * + * (1) TensorExecutor will submit work to the ThreadPoolDevice managed thread + * pool, and will block the caller thread until all tasks are finished. + * + * (2) TensorAsyncExecutor is a non-blocking version, that will submit work to + * the ThreadPoolDevice managed thread pool, and will return immediately. + * It will call 'done' callback after all tasks are finished. + */ #ifdef EIGEN_USE_THREADS -template + +template +struct TensorExecutorTilingContext { + TensorExecutorTilingContext() = default; + TensorExecutorTilingContext(const TensorBlockMapper& b_mapper, + const TensorOpCost& b_cost, size_t b_aligned_size) + : block_mapper(b_mapper), + cost(b_cost), + aligned_blocksize(b_aligned_size) {} + + TensorBlockMapper block_mapper; // navigate through blocks + TensorOpCost cost; // cost of computing a single block + size_t aligned_blocksize; // block size after memory alignment +}; + +// Computes a block evaluation parameters, and allocates temporary memory buffer +// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below. +template +TensorExecutorTilingContext GetTensorExecutorTilingContext( + const Evaluator& evaluator) { + // Query expression tree for desired block size/shape. + TensorBlockResourceRequirements requirements = + evaluator.getResourceRequirements(); + + // Update target block size based on cost model. + double taskSize = TensorCostModel::taskSize( + 1, requirements.cost_per_coeff); + requirements.size = static_cast(1.0 / taskSize); + + TensorBlockMapper block_mapper( + typename TensorBlockMapper::Dimensions(evaluator.dimensions()), + requirements); + + size_t block_size = block_mapper.blockTotalSize(); + const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); + const size_t aligned_blocksize = + align * + divup(block_size * sizeof(typename Evaluator::Scalar), align); + + return {block_mapper, requirements.cost_per_coeff * block_size, + aligned_blocksize}; +} + +template struct EvalRange { - static void run(Evaluator* evaluator_in, const Index first, const Index last) { + static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, + const StorageIndex lastIdx) { Evaluator evaluator = *evaluator_in; - eigen_assert(last >= first); - for (Index i = first; i < last; ++i) { + eigen_assert(lastIdx >= firstIdx); + for (StorageIndex i = firstIdx; i < lastIdx; ++i) { evaluator.evalScalar(i); } } - static Index alignBlockSize(Index size) { - return size; - } + static StorageIndex alignBlockSize(StorageIndex size) { return size; } }; -template -struct EvalRange { - static const int PacketSize = unpacket_traits::size; +template +struct EvalRange { + static const int PacketSize = + unpacket_traits::size; - static void run(Evaluator* evaluator_in, const Index first, const Index last) { + static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, + const StorageIndex lastIdx) { Evaluator evaluator = *evaluator_in; - eigen_assert(last >= first); - Index i = first; - if (last - first >= PacketSize) { - eigen_assert(first % PacketSize == 0); - Index last_chunk_offset = last - 4 * PacketSize; - // Give the compiler a strong hint to unroll the loop. But don't insist - // on unrolling, because if the function is expensive the compiler should not + eigen_assert(lastIdx >= firstIdx); + StorageIndex i = firstIdx; + if (lastIdx - firstIdx >= PacketSize) { + eigen_assert(firstIdx % PacketSize == 0); + StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize; + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not // unroll the loop at the expense of inlining. - for (; i <= last_chunk_offset; i += 4*PacketSize) { - for (Index j = 0; j < 4; j++) { + for (; i <= last_chunk_offset; i += 4 * PacketSize) { + for (StorageIndex j = 0; j < 4; j++) { evaluator.evalPacket(i + j * PacketSize); } } - last_chunk_offset = last - PacketSize; + last_chunk_offset = lastIdx - PacketSize; for (; i <= last_chunk_offset; i += PacketSize) { evaluator.evalPacket(i); } } - for (; i < last; ++i) { + for (; i < lastIdx; ++i) { evaluator.evalScalar(i); } } - static Index alignBlockSize(Index size) { + static StorageIndex alignBlockSize(StorageIndex size) { // Align block size to packet size and account for unrolling in run above. if (size >= 16 * PacketSize) { return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); @@ -138,144 +321,376 @@ struct EvalRange { } }; -template -class TensorExecutor { +template +class TensorExecutor { public: - typedef typename Expression::Index Index; - static inline void run(const Expression& expr, const ThreadPoolDevice& device) - { + typedef typename Expression::Index StorageIndex; + + static EIGEN_STRONG_INLINE void run(const Expression& expr, + const ThreadPoolDevice& device) { typedef TensorEvaluator Evaluator; + typedef EvalRange EvalRange; + Evaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const Index size = array_prod(evaluator.dimensions()); -#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL) + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + const StorageIndex size = array_prod(evaluator.dimensions()); device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), - EvalRange::alignBlockSize, - [&evaluator](Index first, Index last) { - EvalRange::run(&evaluator, first, last); + EvalRange::alignBlockSize, + [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) { + EvalRange::run(&evaluator, firstIdx, lastIdx); }); -#else - size_t num_threads = device.numThreads(); - if (num_threads > 1) { - num_threads = TensorCostModel::numThreads( - size, evaluator.costPerCoeff(Vectorizable), num_threads); - } - if (num_threads == 1) { - EvalRange::run(&evaluator, 0, size); - } else { - const Index PacketSize = Vectorizable ? unpacket_traits::size : 1; - Index blocksz = std::ceil(static_cast(size)/num_threads) + PacketSize - 1; - const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize))); - const Index numblocks = size / blocksize; - - Barrier barrier(numblocks); - for (int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier( - &barrier, &EvalRange::run, - &evaluator, i * blocksize, (i + 1) * blocksize); - } - if (numblocks * blocksize < size) { - EvalRange::run( - &evaluator, numblocks * blocksize, size); + } + evaluator.cleanup(); + } +}; + +template +class TensorExecutor { + public: + typedef typename traits::Index IndexType; + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; + + static const int NumDims = traits::NumDimensions; + + typedef TensorEvaluator Evaluator; + typedef TensorBlockMapper BlockMapper; + typedef TensorExecutorTilingContext TilingContext; + + typedef internal::TensorBlockDescriptor + TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator + TensorBlockScratch; + + static EIGEN_STRONG_INLINE void run(const Expression& expr, + const ThreadPoolDevice& device) { + Evaluator evaluator(expr, device); + + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + const TilingContext tiling = + internal::GetTensorExecutorTilingContext(evaluator); + + auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx, + IndexType lastBlockIdx) { + TensorBlockScratch scratch(device); + + for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; + ++block_idx) { + TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx); + evaluator.evalBlock(desc, scratch); + scratch.reset(); } - barrier.Wait(); + }; + + // Evaluate small expressions directly as a single block. + if (tiling.block_mapper.blockCount() == 1) { + TensorBlockScratch scratch(device); + TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions()); + evaluator.evalBlock(desc, scratch); + } else { + device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost, + eval_block); } -#endif // defined(!EIGEN_USE_SIMPLE_THREAD_POOL) } evaluator.cleanup(); } }; -#endif // EIGEN_USE_THREADS +template +class TensorAsyncExecutor { + public: + typedef typename Expression::Index StorageIndex; + typedef TensorEvaluator Evaluator; + + static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, + const ThreadPoolDevice& device, + DoneCallback done) { + TensorAsyncExecutorContext* const ctx = + new TensorAsyncExecutorContext(expr, device, std::move(done)); + + const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void { + if (!need_assign) { + delete ctx; + return; + } + + typedef EvalRange EvalRange; + const StorageIndex size = array_prod(ctx->evaluator.dimensions()); + device.parallelForAsync( + size, ctx->evaluator.costPerCoeff(Vectorizable), + EvalRange::alignBlockSize, + [ctx](StorageIndex firstIdx, StorageIndex lastIdx) { + EvalRange::run(&ctx->evaluator, firstIdx, lastIdx); + }, + [ctx]() { delete ctx; }); + }; + + ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); + } + + private: + struct TensorAsyncExecutorContext { + TensorAsyncExecutorContext(const Expression& expr, + const ThreadPoolDevice& thread_pool, + DoneCallback done) + : evaluator(expr, thread_pool), on_done(std::move(done)) {} + + ~TensorAsyncExecutorContext() { + evaluator.cleanup(); + on_done(); + } + + Evaluator evaluator; + + private: + DoneCallback on_done; + }; +}; + +template +class TensorAsyncExecutor { + public: + typedef typename traits::Index IndexType; + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; + + static const int NumDims = traits::NumDimensions; + + typedef TensorEvaluator Evaluator; + typedef TensorBlockMapper BlockMapper; + typedef TensorExecutorTilingContext TilingContext; + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator + TensorBlockScratch; + + static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, + const ThreadPoolDevice& device, + DoneCallback done) { + + TensorAsyncExecutorContext* const ctx = + new TensorAsyncExecutorContext(expr, device, std::move(done)); + + const auto on_eval_subexprs = [ctx](bool need_assign) -> void { + if (!need_assign) { + delete ctx; + return; + } + + ctx->tiling = internal::GetTensorExecutorTilingContext< + Evaluator, BlockMapper, Vectorizable>(ctx->evaluator); + + auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) { + TensorBlockScratch scratch(ctx->device); + + for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; + ++block_idx) { + TensorBlockDesc desc = + ctx->tiling.block_mapper.blockDescriptor(block_idx); + ctx->evaluator.evalBlock(desc, scratch); + scratch.reset(); + } + }; + + // Evaluate small expressions directly as a single block. + if (ctx->tiling.block_mapper.blockCount() == 1) { + TensorBlockScratch scratch(ctx->device); + TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions()); + ctx->evaluator.evalBlock(desc, scratch); + delete ctx; + } else { + ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(), + ctx->tiling.cost, eval_block, + [ctx]() { delete ctx; }); + } + }; + + ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); + } + + private: + struct TensorAsyncExecutorContext { + TensorAsyncExecutorContext(const Expression& expr, + const ThreadPoolDevice& thread_pool, + DoneCallback done) + : device(thread_pool), + evaluator(expr, thread_pool), + on_done(std::move(done)) {} + + ~TensorAsyncExecutorContext() { + evaluator.cleanup(); + on_done(); + } + + const ThreadPoolDevice& device; + Evaluator evaluator; + TilingContext tiling; + + private: + DoneCallback on_done; + }; +}; + +#endif // EIGEN_USE_THREADS // GPU: the evaluation of the expression is offloaded to a GPU. #if defined(EIGEN_USE_GPU) -template -class TensorExecutor { +template +class TensorExecutor { public: - typedef typename Expression::Index Index; + typedef typename Expression::Index StorageIndex; static void run(const Expression& expr, const GpuDevice& device); }; - -#if defined(__CUDACC__) -template +#if defined(EIGEN_GPUCC) +template struct EigenMetaKernelEval { - static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, Index first, Index last, Index step_size) { - for (Index i = first; i < last; i += step_size) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) { + for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) { eval.evalScalar(i); } } }; -template -struct EigenMetaKernelEval { - static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, Index first, Index last, Index step_size) { - const Index PacketSize = unpacket_traits::size; - const Index vectorized_size = (last / PacketSize) * PacketSize; - const Index vectorized_step_size = step_size * PacketSize; +template +struct EigenMetaKernelEval { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) { + const StorageIndex PacketSize = unpacket_traits::size; + const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize; + const StorageIndex vectorized_step_size = step_size * PacketSize; // Use the vector path - for (Index i = first * PacketSize; i < vectorized_size; + for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size; i += vectorized_step_size) { eval.evalPacket(i); } - for (Index i = vectorized_size + first; i < last; i += step_size) { + for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) { eval.evalScalar(i); } } }; -template +template __global__ void __launch_bounds__(1024) -EigenMetaKernel(Evaluator eval, Index size) { +EigenMetaKernel(Evaluator eval, StorageIndex size) { - const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; - const Index step_size = blockDim.x * gridDim.x; + const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x; + const StorageIndex step_size = blockDim.x * gridDim.x; const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; - EigenMetaKernelEval::run(eval, first_index, size, step_size); + EigenMetaKernelEval::run(eval, first_index, size, step_size); } /*static*/ -template -inline void TensorExecutor::run( +template +EIGEN_STRONG_INLINE void TensorExecutor::run( const Expression& expr, const GpuDevice& device) { TensorEvaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); if (needs_assign) { - const int block_size = device.maxCudaThreadsPerBlock(); - const int max_blocks = device.getNumCudaMultiProcessors() * - device.maxCudaThreadsPerMultiProcessor() / block_size; - const Index size = array_prod(evaluator.dimensions()); + + const int block_size = device.maxGpuThreadsPerBlock(); + const int max_blocks = device.getNumGpuMultiProcessors() * + device.maxGpuThreadsPerMultiProcessor() / block_size; + const StorageIndex size = array_prod(evaluator.dimensions()); // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. const int num_blocks = numext::maxi(numext::mini(max_blocks, divup(size, block_size)), 1); - LAUNCH_CUDA_KERNEL( - (EigenMetaKernel, Index>), + LAUNCH_GPU_KERNEL( + (EigenMetaKernel, StorageIndex>), num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); } -#endif // __CUDACC__ +#endif // EIGEN_GPUCC #endif // EIGEN_USE_GPU // SYCL Executor policy #ifdef EIGEN_USE_SYCL -template -class TensorExecutor { -public: - static inline void run(const Expression &expr, const SyclDevice &device) { - // call TensorSYCL module - TensorSycl::run(expr, device); +template +struct ExecExprFunctorKernel { + typedef typename Evaluator::Index Index; + Evaluator evaluator; + const Index range; + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel( + const Scratch, Evaluator evaluator_, const Index range_) + : evaluator(evaluator_), range(range_) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()( + cl::sycl::nd_item<1> itemID) { + compute(itemID); + } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if::type + compute(const cl::sycl::nd_item<1>& itemID) { + Index gId = static_cast(itemID.get_global_linear_id()); + Index total_threads = itemID.get_global_range(0); + + for (Index i = gId; i < range; i += total_threads) { + evaluator.evalScalar(i); + } + } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if::type + compute(const cl::sycl::nd_item<1>& itemID) { + const Index vectorizedRange = + (range / Evaluator::PacketSize) * Evaluator::PacketSize; + Index gId = static_cast(itemID.get_global_linear_id()); + const Index step = Evaluator::PacketSize * itemID.get_global_range(0); + const Index start = Evaluator::PacketSize * gId; + for (Index i = start; i < vectorizedRange; i += step) { + evaluator.evalPacket(i); + } + gId += vectorizedRange; + for (Index i = gId; i < range; i += itemID.get_global_range(0)) { + evaluator.evalScalar(i); + } + } +}; + +template +class TensorExecutor { + public: + typedef typename Expression::Index Index; + static EIGEN_STRONG_INLINE void run(const Expression& expr, + const Eigen::SyclDevice& dev) { + typedef Eigen::TensorEvaluator Evaluator; + Evaluator evaluator(expr, dev); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + Index range, GRange, tileSize; + Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions()); + total_size = (total_size == 0) ? 1 : total_size; + const int PacketSize = + Eigen::PacketType::size; + Index vectorizable_threads = static_cast(total_size / PacketSize); + dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange); + range = total_size; + + dev.template nullary_kernel_launcher< + typename Evaluator::CoeffReturnType, + ExecExprFunctorKernel >( + evaluator, + cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), + cl::sycl::range<1>(tileSize)), + Index(1), range); + } + evaluator.cleanup(); } }; diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 85dfc7a69..c9bccfc66 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -38,7 +38,7 @@ struct traits > typedef typename remove_reference::type _XprTypeNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - + typedef typename XprTraits::PointerType PointerType; enum { Flags = 0 }; @@ -89,6 +89,10 @@ struct traits > typedef typename remove_reference::type _XprTypeNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename TypeConversion::type + PointerType; }; template @@ -161,7 +165,12 @@ struct traits > typedef typename remove_reference::type _RhsNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - + typedef typename TypeConversion::val, + typename traits::PointerType, + typename traits::PointerType>::type + >::type + PointerType; enum { Flags = 0 }; @@ -238,7 +247,12 @@ struct traits::type _Arg3Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - + typedef typename TypeConversion::val, + typename traits::PointerType, + typename traits::PointerType>::type + >::type + PointerType; enum { Flags = 0 }; @@ -314,6 +328,9 @@ struct traits > typedef typename ElseXprType::Nested ElseNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename conditional::val, + typename traits::PointerType, + typename traits::PointerType>::type PointerType; }; template diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 08eb5595a..4a1a0687c 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -10,10 +10,6 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H #define EIGEN_CXX11_TENSOR_TENSOR_FFT_H -// This code requires the ability to initialize arrays of constant -// values directly inside a class. -#if __cplusplus >= 201103L || EIGEN_COMP_MSVC >= 1900 - namespace Eigen { /** \class TensorFFT @@ -71,6 +67,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename traits::PointerType PointerType; }; template @@ -130,17 +127,24 @@ struct TensorEvaluator, D typedef OutputScalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = internal::unpacket_traits::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, PacketAccess = true, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); for (int i = 0; i < NumDims; ++i) { eigen_assert(input_dims[i] > 0); @@ -165,19 +169,19 @@ struct TensorEvaluator, D return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_impl.evalSubExprsIfNeeded(NULL); if (data) { evalToBuf(data); return false; } else { - m_data = (CoeffReturnType*)m_device.allocate(sizeof(CoeffReturnType) * m_size); + m_data = (EvaluatorPointerType)m_device.get((CoeffReturnType*)(m_device.allocate_temp(sizeof(CoeffReturnType) * m_size))); evalToBuf(m_data); return true; } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { if (m_data) { m_device.deallocate(m_data); m_data = NULL; @@ -200,11 +204,16 @@ struct TensorEvaluator, D return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } - + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } +#endif private: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(OutputScalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) { const bool write_to_out = internal::is_same::value; ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size); @@ -230,20 +239,32 @@ struct TensorEvaluator, D // t_n = exp(sqrt(-1) * pi * n^2 / line_len) // for n = 0, 1,..., line_len-1. // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2 - pos_j_base_powered[0] = ComplexScalar(1, 0); - if (line_len > 1) { - const RealScalar pi_over_len(EIGEN_PI / line_len); - const ComplexScalar pos_j_base = ComplexScalar( - std::cos(pi_over_len), std::sin(pi_over_len)); - pos_j_base_powered[1] = pos_j_base; - if (line_len > 2) { - const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; - for (int j = 2; j < line_len + 1; ++j) { - pos_j_base_powered[j] = pos_j_base_powered[j - 1] * - pos_j_base_powered[j - 1] / - pos_j_base_powered[j - 2] * pos_j_base_sq; - } - } + + // The recurrence is correct in exact arithmetic, but causes + // numerical issues for large transforms, especially in + // single-precision floating point. + // + // pos_j_base_powered[0] = ComplexScalar(1, 0); + // if (line_len > 1) { + // const ComplexScalar pos_j_base = ComplexScalar( + // numext::cos(M_PI / line_len), numext::sin(M_PI / line_len)); + // pos_j_base_powered[1] = pos_j_base; + // if (line_len > 2) { + // const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; + // for (int i = 2; i < line_len + 1; ++i) { + // pos_j_base_powered[i] = pos_j_base_powered[i - 1] * + // pos_j_base_powered[i - 1] / + // pos_j_base_powered[i - 2] * + // pos_j_base_sq; + // } + // } + // } + // TODO(rmlarsen): Find a way to use Eigen's vectorized sin + // and cosine functions here. + for (int j = 0; j < line_len + 1; ++j) { + double arg = ((EIGEN_PI * j) * j) / line_len; + std::complex tmp(numext::cos(arg), numext::sin(arg)); + pos_j_base_powered[j] = static_cast(tmp); } } @@ -253,7 +274,7 @@ struct TensorEvaluator, D // get data into line_buf const Index stride = m_strides[dim]; if (stride == 1) { - memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); + m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); } else { Index offset = base_offset; for (int j = 0; j < line_len; ++j, offset += stride) { @@ -261,7 +282,7 @@ struct TensorEvaluator, D } } - // processs the line + // process the line if (is_power_of_two) { processDataLineCooleyTukey(line_buf, line_len, log_len); } @@ -271,7 +292,7 @@ struct TensorEvaluator, D // write back if (FFTDir == FFT_FORWARD && stride == 1) { - memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); + m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); } else { Index offset = base_offset; const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0); @@ -562,12 +583,12 @@ struct TensorEvaluator, D protected: Index m_size; - const FFT& m_fft; + const FFT EIGEN_DEVICE_REF m_fft; Dimensions m_dimensions; array m_strides; TensorEvaluator m_impl; - CoeffReturnType* m_data; - const Device& m_device; + EvaluatorPointerType m_data; + const Device EIGEN_DEVICE_REF m_device; // This will support a maximum FFT size of 2^32 for each dimension // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2; @@ -645,7 +666,4 @@ struct TensorEvaluator, D } // end namespace Eigen -#endif // EIGEN_HAS_CONSTEXPR - - #endif // EIGEN_CXX11_TENSOR_TENSOR_FFT_H diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index fcee5f60d..ca39bb855 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -20,7 +20,7 @@ namespace Eigen { * The fixed sized equivalent of * Eigen::Tensor t(3, 5, 7); * is - * Eigen::TensorFixedSize> t; + * Eigen::TensorFixedSize> t; */ template @@ -40,11 +40,18 @@ class TensorFixedSize : public TensorBase0), + PacketAccess = (internal::packet_traits::size > 1), + BlockAccess = false, + PreferBlockAccess = false, Layout = Options_ & RowMajor ? RowMajor : ColMajor, CoordAccess = true, RawAccess = true }; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + typedef Dimensions_ Dimensions; static const std::size_t NumIndices = Dimensions::count; @@ -333,27 +340,10 @@ class TensorFixedSize : public TensorBase::run(assign, DefaultDevice()); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorFixedSize& operator=(const TensorFixedSize& other) - { - // FIXME: check that the dimensions of other match the dimensions of *this. - // Unfortunately this isn't possible yet when the rhs is an expression. - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other) - { - // FIXME: check that the dimensions of other match the dimensions of *this. - // Unfortunately this isn't possible yet when the rhs is an expression. - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } + // FIXME: check that the dimensions of other match the dimensions of *this. + // Unfortunately this isn't possible yet when the rhs is an expression. + EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(TensorFixedSize) + protected: EIGEN_DEVICE_FUNC diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index bbd5eb374..e800dedc6 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -19,15 +19,9 @@ namespace Eigen { * * */ -/// template class MakePointer_ is added to convert the host pointer to the device pointer. -/// It is added due to the fact that for our device compiler T* is not allowed. -/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer T. -/// This is done through our MakePointer_ class. By default the Type in the MakePointer_ is T* . -/// Therefore, by adding the default value, we managed to convert the type and it does not break any -/// existing code as its default value is T*. namespace internal { -template class MakePointer_> -struct traits > +template +struct traits > { // Type promotion to handle the case where the types of the lhs and the rhs are different. typedef typename XprType::Scalar Scalar; @@ -38,35 +32,31 @@ struct traits > typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; enum { Flags = 0 }; - template struct MakePointer { - // Intermediate typedef to workaround MSVC issue. - typedef MakePointer_ MakePointerT; - typedef typename MakePointerT::Type Type; - }; }; -template class MakePointer_> -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> { - typedef const TensorForcedEvalOp& type; + typedef const TensorForcedEvalOp& type; }; -template class MakePointer_> -struct nested, 1, typename eval >::type> +template +struct nested, 1, typename eval >::type> { - typedef TensorForcedEvalOp type; + typedef TensorForcedEvalOp type; }; } // end namespace internal -template class MakePointer_> -class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> +template +class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -87,49 +77,113 @@ class TensorForcedEvalOp : public TensorBase +struct non_integral_type_placement_new{ + template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index numValues, StorageType m_buffer) { + // Initialize non-trivially constructible types. + if (!internal::is_arithmetic::value) { + for (Index i = 0; i < numValues; ++i) new (m_buffer + i) CoeffReturnType(); + } +} +}; + +// SYCL does not support non-integral types +// having new (m_buffer + i) CoeffReturnType() causes the following compiler error for SYCL Devices +// no matching function for call to 'operator new' +template +struct non_integral_type_placement_new { + template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index, StorageType) { +} +}; +} // end namespace internal -template class MakePointer_> -struct TensorEvaluator, Device> +template +struct TensorEvaluator, Device> { - typedef TensorForcedEvalOp XprType; + typedef const typename internal::remove_all::type ArgType; + typedef TensorForcedEvalOp XprType; typedef typename ArgType::Scalar Scalar; typedef typename TensorEvaluator::Dimensions Dimensions; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = true, - PacketAccess = (PacketSize > 1), - Layout = TensorEvaluator::Layout, - RawAccess = true + IsAligned = true, + PacketAccess = (PacketType::size > 1), + BlockAccess = internal::is_arithmetic::value, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + RawAccess = true }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - /// op_ is used for sycl - : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) + static const int NumDims = internal::traits::NumDimensions; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_op(op.expression()), + m_device(device), m_buffer(NULL) { } EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { const Index numValues = internal::array_prod(m_impl.dimensions()); - m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); - // Should initialize the memory in case we're dealing with non POD types. - if (NumTraits::RequireInitialization) { - for (Index i = 0; i < numValues; ++i) { - new(m_buffer+i) CoeffReturnType(); - } - } + m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType))); + + internal::non_integral_type_placement_new()(numValues, m_buffer); + typedef TensorEvalToOp< const typename internal::remove_const::type > EvalTo; - EvalTo evalToTmp(m_buffer, m_op); - const bool PacketAccess = internal::IsVectorizable::value; - internal::TensorExecutor::type, PacketAccess>::run(evalToTmp, m_device); + EvalTo evalToTmp(m_device.get(m_buffer), m_op); + + internal::TensorExecutor< + const EvalTo, typename internal::remove_const::type, + /*Vectorizable=*/internal::IsVectorizable::value, + /*Tiling=*/internal::IsTileable::value>:: + run(evalToTmp, m_device); + return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_device.deallocate(m_buffer); + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + const Index numValues = internal::array_prod(m_impl.dimensions()); + m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp( + numValues * sizeof(CoeffReturnType))); + typedef TensorEvalToOp::type> + EvalTo; + EvalTo evalToTmp(m_device.get(m_buffer), m_op); + + auto on_done = std::bind([](EvalSubExprsCallback done_) { done_(true); }, + std::move(done)); + internal::TensorAsyncExecutor< + const EvalTo, typename internal::remove_const::type, + decltype(on_done), + /*Vectorizable=*/internal::IsVectorizable::value, + /*Tiling=*/internal::IsTileable::value>:: + runAsync(evalToTmp, m_device, std::move(on_done)); + } +#endif + + EIGEN_STRONG_INLINE void cleanup() { + m_device.deallocate_temp(m_buffer); m_buffer = NULL; } @@ -144,21 +198,37 @@ struct TensorEvaluator, Device> return internal::ploadt(m_buffer + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + assert(m_buffer != NULL); + return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC typename MakePointer::Type data() const { return m_buffer; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EvaluatorPointerType data() const { return m_buffer; } - /// required by sycl in order to extract the sycl accessor - const TensorEvaluator& impl() { return m_impl; } - /// used by sycl in order to build the sycl buffer - const Device& device() const{return m_device;} +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_buffer.bind(cgh); + m_impl.bind(cgh); + } +#endif private: TensorEvaluator m_impl; const ArgType m_op; - const Device& m_device; - typename MakePointer::Type m_buffer; + const Device EIGEN_DEVICE_REF m_device; + EvaluatorPointerType m_buffer; }; diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 52b803d7f..246ebe44e 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -12,7 +12,7 @@ namespace Eigen { -// MakePointer class is used as a container of the adress space of the pointer +// MakePointer class is used as a container of the address space of the pointer // on the host and on the device. From the host side it generates the T* pointer // and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to // T* m_data on the host. It is always called on the device. @@ -20,8 +20,35 @@ namespace Eigen { // map_allocator. template struct MakePointer { typedef T* Type; + typedef const T* ConstType; }; +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* constCast(const T* data) { + return const_cast(data); +} + +// The StorageMemory class is a container of the device specific pointer +// used for refering to a Pointer on TensorEvaluator class. While the TensorExpression +// is a device-agnostic type and need MakePointer class for type conversion, +// the TensorEvaluator class can be specialized for a device, hence it is possible +// to construct different types of temproray storage memory in TensorEvaluator +// for different devices by specializing the following StorageMemory class. +template struct StorageMemory: MakePointer {}; + +namespace internal{ +template struct Pointer_type_promotion { + static const bool val=false; +}; +template struct Pointer_type_promotion { + static const bool val = true; +}; +template struct TypeConversion { + typedef A* type; +}; +} + + template class MakePointer_ = MakePointer> class TensorMap; template class Tensor; template class TensorFixedSize; @@ -37,7 +64,7 @@ template class Ma template class TensorIndexTupleOp; template class TensorTupleReducerOp; template class TensorConcatenationOp; -template class TensorContractionOp; +template class TensorContractionOp; template class TensorConversionOp; template class TensorConvolutionOp; template class TensorFFTOp; @@ -58,21 +85,50 @@ template class TensorInflationOp; template class TensorGeneratorOp; template class TensorAssignOp; template class TensorScanOp; +template class TensorTraceOp; template class TensorCustomUnaryOp; template class TensorCustomBinaryOp; template class MakePointer_ = MakePointer> class TensorEvalToOp; -template class MakePointer_ = MakePointer> class TensorForcedEvalOp; +template class TensorForcedEvalOp; template class TensorDevice; +template class TensorAsyncDevice; template struct TensorEvaluator; +struct NoOpOutputKernel; + struct DefaultDevice; struct ThreadPoolDevice; struct GpuDevice; struct SyclDevice; +#ifdef EIGEN_USE_SYCL + +template struct MakeSYCLPointer { + typedef Eigen::TensorSycl::internal::RangeAccess Type; +}; + +template +EIGEN_STRONG_INLINE const Eigen::TensorSycl::internal::RangeAccess& +constCast(const Eigen::TensorSycl::internal::RangeAccess& data) { + return data; +} + +template +struct StorageMemory : MakeSYCLPointer {}; +template +struct StorageMemory : StorageMemory {}; + +namespace TensorSycl { +namespace internal{ +template class GenericNondeterministicReducer; +} +} +#endif + + enum FFTResultType { RealPart = 0, ImagPart = 1, @@ -98,10 +154,36 @@ struct IsVectorizable { TensorEvaluator::IsAligned; }; +// Tiled evaluation strategy. +enum TiledEvaluation { + Off = 0, // tiled evaluation is not supported + On = 1, // still work in progress (see TensorBlock.h) +}; + +template +struct IsTileable { + // Check that block evaluation is supported and it's a preferred option (at + // least one sub-expression has much faster block evaluation, e.g. + // broadcasting). + static const bool BlockAccess = + TensorEvaluator::BlockAccess && + TensorEvaluator::PreferBlockAccess; + + static const TiledEvaluation value = + BlockAccess ? TiledEvaluation::On : TiledEvaluation::Off; +}; + template ::value> + bool Vectorizable = IsVectorizable::value, + TiledEvaluation Tiling = IsTileable::value> class TensorExecutor; +template ::value, + TiledEvaluation Tiling = IsTileable::value> +class TensorAsyncExecutor; + + } // end namespace internal } // end namespace Eigen diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index d73f6dc68..d96303224 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -20,7 +20,7 @@ namespace internal { template struct scalar_mod_op { EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {} - EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a % m_divisor; } const Scalar m_divisor; }; template @@ -33,8 +33,8 @@ struct functor_traits > */ template struct scalar_mod2_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op); - EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; } + EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; } }; template struct functor_traits > @@ -42,7 +42,7 @@ struct functor_traits > template struct scalar_fmod_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op); + EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const { return numext::fmod(a, b); @@ -54,50 +54,19 @@ struct functor_traits > { PacketAccess = false }; }; - -/** \internal - * \brief Template functor to compute the sigmoid of a scalar - * \sa class CwiseUnaryOp, ArrayBase::sigmoid() - */ -template -struct scalar_sigmoid_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { - const T one = T(1); - return one / (one + numext::exp(-x)); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Packet packetOp(const Packet& x) const { - const Packet one = pset1(T(1)); - return pdiv(one, padd(one, pexp(pnegate(x)))); - } -}; - -template -struct functor_traits > { - enum { - Cost = NumTraits::AddCost * 2 + NumTraits::MulCost * 6, - PacketAccess = packet_traits::HasAdd && packet_traits::HasDiv && - packet_traits::HasNegate && packet_traits::HasExp - }; -}; - - template struct reducer_traits { enum { Cost = 1, - PacketAccess = false + PacketAccess = false, + IsStateful = false, + IsExactlyAssociative = true }; }; // Standard reduction functors template struct SumReducer { - static const bool PacketAccess = packet_traits::HasAdd; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { internal::scalar_sum_op sum_op; *accum = sum_op(*accum, t); @@ -133,16 +102,14 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasAdd + PacketAccess = PacketType::HasAdd, + IsStateful = false, + IsExactlyAssociative = NumTraits::IsInteger }; }; - template struct MeanReducer { - static const bool PacketAccess = packet_traits::HasAdd && !NumTraits::IsInteger; - static const bool IsStateful = true; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MeanReducer() : scalarCount_(0), packetCount_(0) { } @@ -166,16 +133,20 @@ template struct MeanReducer return pset1(initialize()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - return accum / scalarCount_; + internal::scalar_quotient_op quotient_op; + return quotient_op(accum, T(scalarCount_)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { - return pdiv(vaccum, pset1(packetCount_)); + return pdiv(vaccum, pset1(T(packetCount_))); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { internal::scalar_sum_op sum_op; - return sum_op(saccum, predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits::size); + internal::scalar_quotient_op quotient_op; + return quotient_op( + sum_op(saccum, predux(vaccum)), + T(scalarCount_ + packetCount_ * unpacket_traits::size)); } protected: @@ -187,7 +158,10 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasAdd + PacketAccess = PacketType::HasAdd && + PacketType::HasDiv && !NumTraits::IsInteger, + IsStateful = true, + IsExactlyAssociative = NumTraits::IsInteger }; }; @@ -218,20 +192,19 @@ struct MinMaxBottomValue { }; -template struct MaxReducer +template struct MaxReducer { - static const bool PacketAccess = packet_traits::HasMax; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - if (t > *accum) { *accum = t; } + scalar_max_op op; + *accum = op(t, *accum); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { - (*accum) = pmax(*accum, p); + scalar_max_op op; + (*accum) = op.packetOp(*accum, p); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return MinMaxBottomValue::IsInteger>::bottom_value(); + return MinMaxBottomValue::IsInteger>::bottom_value(); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { @@ -246,33 +219,34 @@ template struct MaxReducer } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - return numext::maxi(saccum, predux_max(vaccum)); + scalar_max_op op; + return op(saccum, op.predux(vaccum)); } }; -template -struct reducer_traits, Device> { +template + struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasMax + PacketAccess = PacketType::HasMax, + IsStateful = false, + IsExactlyAssociative = (NaNPropagation!=PropagateFast) }; }; - -template struct MinReducer +template struct MinReducer { - static const bool PacketAccess = packet_traits::HasMin; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - if (t < *accum) { *accum = t; } + scalar_min_op op; + *accum = op(t, *accum); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { - (*accum) = pmin(*accum, p); + scalar_min_op op; + (*accum) = op.packetOp(*accum, p); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return MinMaxBottomValue::IsInteger>::bottom_value(); + return MinMaxBottomValue::IsInteger>::bottom_value(); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { @@ -287,24 +261,23 @@ template struct MinReducer } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - return numext::mini(saccum, predux_min(vaccum)); + scalar_min_op op; + return op(saccum, op.predux(vaccum)); } }; -template -struct reducer_traits, Device> { +template + struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasMin + PacketAccess = PacketType::HasMin, + IsStateful = false, + IsExactlyAssociative = (NaNPropagation!=PropagateFast) }; }; - template struct ProdReducer { - static const bool PacketAccess = packet_traits::HasMul; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { internal::scalar_product_op prod_op; (*accum) = prod_op(*accum, t); @@ -313,7 +286,6 @@ template struct ProdReducer EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { (*accum) = pmul(*accum, p); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { internal::scalar_cast_op conv; return conv(1); @@ -340,16 +312,15 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::MulCost, - PacketAccess = PacketType::HasMul + PacketAccess = PacketType::HasMul, + IsStateful = false, + IsExactlyAssociative = true }; }; struct AndReducer { - static const bool PacketAccess = false; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { *accum = *accum && t; } @@ -365,15 +336,14 @@ template struct reducer_traits { enum { Cost = 1, - PacketAccess = false + PacketAccess = false, + IsStateful = false, + IsExactlyAssociative = true }; }; struct OrReducer { - static const bool PacketAccess = false; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { *accum = *accum || t; } @@ -389,19 +359,22 @@ template struct reducer_traits { enum { Cost = 1, - PacketAccess = false + PacketAccess = false, + IsStateful = false, + IsExactlyAssociative = true }; }; - -// Argmin/Argmax reducers +// Argmin/Argmax reducers. Returns the first occurrence if multiple locations +// contain the same min/max value. template struct ArgMaxTupleReducer { - static const bool PacketAccess = false; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - if (t.second > accum->second) { *accum = t; } + if (t.second < accum->second) { + return; + } else if (t.second > accum->second || accum->first > t.first ) { + *accum = t; + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { return T(0, NumTraits::lowest()); @@ -415,18 +388,21 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = false + PacketAccess = false, + IsStateful = false, + IsExactlyAssociative = true }; }; template struct ArgMinTupleReducer { - static const bool PacketAccess = false; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const { - if (t.second < accum->second) { *accum = t; } + if (t.second > accum->second) { + return; + } else if (t.second < accum->second || accum->first > t.first) { + *accum = t; + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { return T(0, NumTraits::highest()); @@ -440,7 +416,9 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = false + PacketAccess = false, + IsStateful = false, + IsExactlyAssociative = true }; }; @@ -454,6 +432,7 @@ class GaussianGenerator { const array& std_devs) : m_means(means) { + EIGEN_UNROLL_LOOP for (size_t i = 0; i < NumDims; ++i) { m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2; } @@ -461,6 +440,7 @@ class GaussianGenerator { EIGEN_DEVICE_FUNC T operator()(const array& coordinates) const { T tmp = T(0); + EIGEN_UNROLL_LOOP for (size_t i = 0; i < NumDims; ++i) { T offset = coordinates[i] - m_means[i]; tmp += offset * offset / m_two_sigmas[i]; @@ -483,6 +463,25 @@ struct functor_traits > { }; }; +template +struct scalar_clamp_op { + EIGEN_DEVICE_FUNC inline scalar_clamp_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar + operator()(const Scalar& x) const { + return numext::mini(numext::maxi(x, m_min), m_max); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet + packetOp(const Packet& x) const { + return internal::pmin(internal::pmax(x, pset1(m_min)), pset1(m_max)); + } + const Scalar m_min; + const Scalar m_max; +}; +template +struct functor_traits > +{ enum { Cost = 2 * NumTraits::AddCost, PacketAccess = (packet_traits::HasMin && packet_traits::HasMax)}; }; + } // end namespace internal } // end namespace Eigen diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index eb1d4934e..174bf0683 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -12,7 +12,7 @@ namespace Eigen { -/** \class TensorGenerator +/** \class TensorGeneratorOp * \ingroup CXX11_Tensor_Module * * \brief Tensor generator class. @@ -31,6 +31,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -87,40 +88,58 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = false, - PacketAccess = (internal::unpacket_traits::size > 1), - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = true, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_generator(op.generator()) + typedef internal::TensorIntDivisor IndexDivisor; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), m_generator(op.generator()) { - TensorEvaluator impl(op.expression(), device); - m_dimensions = impl.dimensions(); + TensorEvaluator argImpl(op.expression(), device); + m_dimensions = argImpl.dimensions(); if (static_cast(Layout) == static_cast(ColMajor)) { m_strides[0] = 1; + EIGEN_UNROLL_LOOP for (int i = 1; i < NumDims; ++i) { m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); } } else { m_strides[NumDims - 1] = 1; + EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i >= 0; --i) { m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); } } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const @@ -133,7 +152,7 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits::size; + const int packetSize = PacketType::size; EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < dimensions().TotalSize()); @@ -145,6 +164,97 @@ struct TensorEvaluator, Device> return rslt; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.firstLevelCacheSize(); + // TODO(ezhulenev): Generator should have a cost. + return internal::TensorBlockResourceRequirements::skewed( + target_size); + } + + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + static const bool is_col_major = + static_cast(Layout) == static_cast(ColMajor); + + // Compute spatial coordinates for the first block element. + array coords; + extract_coordinates(desc.offset(), coords); + array initial_coords = coords; + + // Offset in the output block buffer. + Index offset = 0; + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + for (int i = 0; i < NumDims; ++i) { + const int dim = is_col_major ? i : NumDims - 1 - i; + it[i].size = desc.dimension(dim); + it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride); + it[i].span = it[i].stride * (it[i].size - 1); + it[i].count = 0; + } + eigen_assert(it[0].stride == 1); + + // Prepare storage for the materialized generator result. + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); + + CoeffReturnType* block_buffer = block_storage.data(); + + static const int packet_size = PacketType::size; + + static const int inner_dim = is_col_major ? 0 : NumDims - 1; + const Index inner_dim_size = it[0].size; + const Index inner_dim_vectorized = inner_dim_size - packet_size; + + while (it[NumDims - 1].count < it[NumDims - 1].size) { + Index i = 0; + // Generate data for the vectorized part of the inner-most dimension. + for (; i <= inner_dim_vectorized; i += packet_size) { + for (Index j = 0; j < packet_size; ++j) { + array j_coords = coords; // Break loop dependence. + j_coords[inner_dim] += j; + *(block_buffer + offset + i + j) = m_generator(j_coords); + } + coords[inner_dim] += packet_size; + } + // Finalize non-vectorized part of the inner-most dimension. + for (; i < inner_dim_size; ++i) { + *(block_buffer + offset + i) = m_generator(coords); + coords[inner_dim]++; + } + coords[inner_dim] = initial_coords[inner_dim]; + + // For the 1d tensor we need to generate only one inner-most dimension. + if (NumDims == 1) break; + + // Update offset. + for (i = 1; i < NumDims; ++i) { + if (++it[i].count < it[i].size) { + offset += it[i].stride; + coords[is_col_major ? i : NumDims - 1 - i]++; + break; + } + if (i != NumDims - 1) it[i].count = 0; + coords[is_col_major ? i : NumDims - 1 - i] = + initial_coords[is_col_major ? i : NumDims - 1 - i]; + offset -= it[i].span; + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { // TODO(rmlarsen): This is just a placeholder. Define interface to make @@ -153,21 +263,26 @@ struct TensorEvaluator, Device> TensorOpCost::MulCost()); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler&) const {} +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_coordinates(Index index, array& coords) const { if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_strides[i]; + const Index idx = index / m_fast_strides[i]; index -= idx * m_strides[i]; coords[i] = idx; } coords[0] = index; } else { for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_strides[i]; + const Index idx = index / m_fast_strides[i]; index -= idx * m_strides[i]; coords[i] = idx; } @@ -175,8 +290,10 @@ struct TensorEvaluator, Device> } } + const Device EIGEN_DEVICE_REF m_device; Dimensions m_dimensions; array m_strides; + array m_fast_strides; Generator m_generator; }; diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h new file mode 100644 index 000000000..cb53ce298 --- /dev/null +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h @@ -0,0 +1,99 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2018 Deven Desai +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) +#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H + +// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design +// There is code in the Tensorflow codebase that will define EIGEN_USE_GPU, but +// for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler +// When compiling such files, gcc will end up trying to pick up the CUDA headers by +// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU) +// This will obviously not work when trying to compile tensorflow on a system with no CUDA +// To work around this issue for HIP systems (and leave the default behaviour intact), the +// HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and +// "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is +// defined. In continuation of that requirement, the guard here needs to be EIGEN_USE_HIP as well + +#if defined(EIGEN_USE_HIP) + +#define gpuStream_t hipStream_t +#define gpuDeviceProp_t hipDeviceProp_t +#define gpuError_t hipError_t +#define gpuSuccess hipSuccess +#define gpuErrorNotReady hipErrorNotReady +#define gpuGetDeviceCount hipGetDeviceCount +#define gpuGetLastError hipGetLastError +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorName hipGetErrorName +#define gpuGetErrorString hipGetErrorString +#define gpuGetDeviceProperties hipGetDeviceProperties +#define gpuStreamDefault hipStreamDefault +#define gpuGetDevice hipGetDevice +#define gpuSetDevice hipSetDevice +#define gpuMalloc hipMalloc +#define gpuFree hipFree +#define gpuMemsetAsync hipMemsetAsync +#define gpuMemcpyAsync hipMemcpyAsync +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuStreamQuery hipStreamQuery +#define gpuSharedMemConfig hipSharedMemConfig +#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig +#define gpuStreamSynchronize hipStreamSynchronize +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuMemcpy hipMemcpy + +#else + +#define gpuStream_t cudaStream_t +#define gpuDeviceProp_t cudaDeviceProp +#define gpuError_t cudaError_t +#define gpuSuccess cudaSuccess +#define gpuErrorNotReady cudaErrorNotReady +#define gpuGetDeviceCount cudaGetDeviceCount +#define gpuGetLastError cudaGetLastError +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorName cudaGetErrorName +#define gpuGetErrorString cudaGetErrorString +#define gpuGetDeviceProperties cudaGetDeviceProperties +#define gpuStreamDefault cudaStreamDefault +#define gpuGetDevice cudaGetDevice +#define gpuSetDevice cudaSetDevice +#define gpuMalloc cudaMalloc +#define gpuFree cudaFree +#define gpuMemsetAsync cudaMemsetAsync +#define gpuMemcpyAsync cudaMemcpyAsync +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuStreamQuery cudaStreamQuery +#define gpuSharedMemConfig cudaSharedMemConfig +#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig +#define gpuStreamSynchronize cudaStreamSynchronize +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuMemcpy cudaMemcpy + +#endif + +// gpu_assert can be overridden +#ifndef gpu_assert + +#if defined(EIGEN_HIP_DEVICE_COMPILE) +// HIPCC do not support the use of assert on the GPU side. +#define gpu_assert(COND) +#else +#define gpu_assert(COND) assert(COND) +#endif + +#endif // gpu_assert + +#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h new file mode 100644 index 000000000..1d142f2ee --- /dev/null +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h @@ -0,0 +1,44 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2018 Deven Desai +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) + +#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES + +#undef gpuStream_t +#undef gpuDeviceProp_t +#undef gpuError_t +#undef gpuSuccess +#undef gpuErrorNotReady +#undef gpuGetDeviceCount +#undef gpuGetErrorString +#undef gpuGetDeviceProperties +#undef gpuStreamDefault +#undef gpuGetDevice +#undef gpuSetDevice +#undef gpuMalloc +#undef gpuFree +#undef gpuMemsetAsync +#undef gpuMemcpyAsync +#undef gpuMemcpyDeviceToDevice +#undef gpuMemcpyDeviceToHost +#undef gpuMemcpyHostToDevice +#undef gpuStreamQuery +#undef gpuSharedMemConfig +#undef gpuDeviceSetSharedMemConfig +#undef gpuStreamSynchronize +#undef gpuDeviceSynchronize +#undef gpuMemcpy + +#endif // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES + +#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H + +#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 566856ed2..dd51850b7 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -27,6 +27,7 @@ namespace Eigen { * patch_cols, and 1 for all the additional dimensions. */ namespace internal { + template struct traits > : public traits { @@ -38,6 +39,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions + 1; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -52,6 +54,66 @@ struct nested, 1, typename eval type; }; +template +struct ImagePatchCopyOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Self& self, const Index num_coeff_to_copy, const Index dst_index, + Scalar* dst_data, const Index src_index) { + const Impl& impl = self.impl(); + for (Index i = 0; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template +struct ImagePatchCopyOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + typedef typename packet_traits::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Self& self, const Index num_coeff_to_copy, const Index dst_index, + Scalar* dst_data, const Index src_index) { + const Impl& impl = self.impl(); + const Index packet_size = internal::unpacket_traits::size; + const Index vectorized_size = + (num_coeff_to_copy / packet_size) * packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + Packet p = impl.template packet(src_index + i); + internal::pstoret(dst_data + dst_index + i, p); + } + for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template +struct ImagePatchPaddingOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename packet_traits::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Index num_coeff_to_pad, const Scalar padding_value, + const Index dst_index, Scalar* dst_data) { + const Index packet_size = internal::unpacket_traits::size; + const Packet padded_packet = internal::pset1(padding_value); + const Index vectorized_size = + (num_coeff_to_pad / packet_size) * packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + internal::pstoret(dst_data + dst_index + i, + padded_packet); + } + for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) { + dst_data[dst_index + i] = padding_value; + } + } +}; + } // end namespace internal template @@ -70,12 +132,12 @@ class TensorImagePatchOp : public TensorBase, Device> typedef TensorEvaluator Impl; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) + : m_device(device), m_impl(op.expression(), device) { EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -238,9 +309,15 @@ struct TensorEvaluator, Device> // Calculate the padding m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2; m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2; + // The padding size calculation for PADDING_SAME has been updated to + // be consistent with how TensorFlow extracts its paddings. + m_rowPaddingTop = numext::maxi(0, m_rowPaddingTop); + m_colPaddingLeft = numext::maxi(0, m_colPaddingLeft); break; default: eigen_assert(false && "unexpected padding"); + m_outputCols=0; // silence the uninitialised warning; + m_outputRows=0; //// silence the uninitialised warning; } } eigen_assert(m_outputRows > 0); @@ -312,12 +389,20 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -418,20 +503,27 @@ struct TensorEvaluator, Device> return packetWithPossibleZero(index); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - const TensorEvaluator& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } - Index rowPaddingTop() const { return m_rowPaddingTop; } - Index colPaddingLeft() const { return m_colPaddingLeft; } - Index outputRows() const { return m_outputRows; } - Index outputCols() const { return m_outputCols; } - Index userRowStride() const { return m_row_strides; } - Index userColStride() const { return m_col_strides; } - Index userInRowStride() const { return m_in_row_strides; } - Index userInColStride() const { return m_in_col_strides; } - Index rowInflateStride() const { return m_row_inflate_strides; } - Index colInflateStride() const { return m_col_inflate_strides; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { @@ -449,6 +541,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } @@ -500,6 +593,7 @@ struct TensorEvaluator, Device> Scalar m_paddingValue; + const Device EIGEN_DEVICE_REF m_device; TensorEvaluator m_impl; }; diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 3209fecd3..2d8c7b903 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -37,36 +37,36 @@ namespace Eigen { * \sa Tensor */ -template +template struct type2index { - static const DenseIndex value = n; - EIGEN_DEVICE_FUNC constexpr operator DenseIndex() const { return n; } - EIGEN_DEVICE_FUNC void set(DenseIndex val) { + static const Index value = n; + EIGEN_DEVICE_FUNC constexpr operator Index() const { return n; } + EIGEN_DEVICE_FUNC void set(Index val) { eigen_assert(val == n); } }; // This can be used with IndexPairList to get compile-time constant pairs, // such as IndexPairList, type2indexpair<3,4>>(). -template +template struct type2indexpair { - static const DenseIndex first = f; - static const DenseIndex second = s; + static const Index first = f; + static const Index second = s; - constexpr EIGEN_DEVICE_FUNC operator IndexPair() const { - return IndexPair(f, s); + constexpr EIGEN_DEVICE_FUNC operator IndexPair() const { + return IndexPair(f, s); } - EIGEN_DEVICE_FUNC void set(const IndexPair& val) { + EIGEN_DEVICE_FUNC void set(const IndexPair& val) { eigen_assert(val.first == f); eigen_assert(val.second == s); } }; -template struct NumTraits > +template struct NumTraits > { - typedef DenseIndex Real; + typedef Index Real; enum { IsComplex = 0, RequireInitialization = false, @@ -75,28 +75,28 @@ template struct NumTraits > MulCost = 1 }; - EIGEN_DEVICE_FUNC static inline Real epsilon() { return 0; } - EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return 0; } - EIGEN_DEVICE_FUNC static inline Real highest() { return n; } - EIGEN_DEVICE_FUNC static inline Real lowest() { return n; } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real epsilon() { return 0; } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real dummy_precision() { return 0; } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real highest() { return n; } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real lowest() { return n; } }; namespace internal { template -EIGEN_DEVICE_FUNC void update_value(T& val, DenseIndex new_val) { - val = new_val; +EIGEN_DEVICE_FUNC void update_value(T& val, Index new_val) { + val = internal::convert_index(new_val); } -template -EIGEN_DEVICE_FUNC void update_value(type2index& val, DenseIndex new_val) { +template +EIGEN_DEVICE_FUNC void update_value(type2index& val, Index new_val) { val.set(new_val); } template -EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair new_val) { +EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair new_val) { val = new_val; } -template -EIGEN_DEVICE_FUNC void update_value(type2indexpair& val, IndexPair new_val) { +template +EIGEN_DEVICE_FUNC void update_value(type2indexpair& val, IndexPair new_val) { val.set(new_val); } @@ -106,36 +106,36 @@ struct is_compile_time_constant { static constexpr bool value = false; }; -template +template struct is_compile_time_constant > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant& > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant& > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant& > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant& > { static constexpr bool value = true; }; @@ -228,15 +228,15 @@ template -template +template struct tuple_coeff { template - EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex i, const IndexTuple& t) { + EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index i, const IndexTuple& t) { // return array_get(t) * (i == Idx) + tuple_coeff::get(i, t) * (i != Idx); return (i == Idx ? array_get(t) : tuple_coeff::get(i, t)); } template - EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple& t, const ValueT& value) { + EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple& t, const ValueT& value) { if (i == Idx) { update_value(array_get(t), value); } else { @@ -245,7 +245,7 @@ struct tuple_coeff { } template - EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple& t) { + EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple& t) { return ((i == Idx) & is_compile_time_constant::ValType>::value) || tuple_coeff::value_known_statically(i, t); } @@ -268,18 +268,18 @@ struct tuple_coeff { template struct tuple_coeff<0, ValueT> { template - EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex /*i*/, const IndexTuple& t) { + EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index /*i*/, const IndexTuple& t) { // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr return array_get<0>(t)/* * (i == 0)*/; } template - EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple& t, const ValueT value) { + EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple& t, const ValueT value) { eigen_assert (i == 0); update_value(array_get<0>(t), value); } template - EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple&) { - return is_compile_time_constant::ValType>::value & (i == 0); + EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple&) { + return is_compile_time_constant::ValType>::value && (i == 0); } template @@ -298,32 +298,43 @@ struct tuple_coeff<0, ValueT> { template struct IndexList : internal::IndexTuple { - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const { - return internal::tuple_coeff >::value-1, DenseIndex>::get(i, *this); + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index operator[] (const Index i) const { + return internal::tuple_coeff >::value-1, Index>::get(i, *this); } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex get(const DenseIndex i) const { - return internal::tuple_coeff >::value-1, DenseIndex>::get(i, *this); + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index get(const Index i) const { + return internal::tuple_coeff >::value-1, Index>::get(i, *this); } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) { - return internal::tuple_coeff >::value-1, DenseIndex>::set(i, *this, value); + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const Index value) { + return internal::tuple_coeff >::value-1, Index>::set(i, *this, value); } EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple& other) : internal::IndexTuple(other) { } EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple(first, other...) { } EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple() { } - EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const { - return internal::tuple_coeff >::value-1, DenseIndex>::value_known_statically(i, *this); + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const { + return internal::tuple_coeff >::value-1, Index>::value_known_statically(i, *this); } EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const { - return internal::tuple_coeff >::value-1, DenseIndex>::values_up_to_known_statically(*this); + return internal::tuple_coeff >::value-1, Index>::values_up_to_known_statically(*this); } EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const { - return internal::tuple_coeff >::value-1, DenseIndex>::values_up_to_statically_known_to_increase(*this); + return internal::tuple_coeff >::value-1, Index>::values_up_to_statically_known_to_increase(*this); } }; +template +std::ostream& operator<<(std::ostream& os, + const IndexList& dims) { + os << "["; + for (size_t i = 0; i < 1 + sizeof...(OtherTypes); ++i) { + if (i > 0) os << ", "; + os << dims[i]; + } + os << "]"; + return os; +} template constexpr IndexList make_index_list(FirstType val1, OtherTypes... other_vals) { @@ -333,26 +344,28 @@ constexpr IndexList make_index_list(FirstType val1, Ot template struct IndexPairList : internal::IndexTuple { - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair operator[] (const DenseIndex i) const { - return internal::tuple_coeff >::value-1, IndexPair>::get(i, *this); + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair operator[] (const Index i) const { + return internal::tuple_coeff >::value-1, IndexPair>::get(i, *this); } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const IndexPair value) { - return internal::tuple_coeff>::value-1, IndexPair >::set(i, *this, value); + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const IndexPair value) { + return internal::tuple_coeff>::value-1, IndexPair >::set(i, *this, value); } EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple& other) : internal::IndexTuple(other) { } EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple() { } - EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const { - return internal::tuple_coeff >::value-1, DenseIndex>::value_known_statically(i, *this); + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const { + return internal::tuple_coeff >::value-1, Index>::value_known_statically(i, *this); } }; namespace internal { -template size_t array_prod(const IndexList& sizes) { - size_t result = 1; - for (int i = 0; i < array_size >::value; ++i) { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index array_prod(const IndexList& sizes) { + Index result = 1; + EIGEN_UNROLL_LOOP + for (size_t i = 0; i < array_size >::value; ++i) { result *= sizes[i]; } return result; @@ -372,30 +385,30 @@ template struct array_size >::value; }; -template EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(IndexList& a) { +template EIGEN_DEVICE_FUNC constexpr Index array_get(IndexList& a) { return IndexTupleExtractor::get_val(a); } -template EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(const IndexList& a) { +template EIGEN_DEVICE_FUNC constexpr Index array_get(const IndexList& a) { return IndexTupleExtractor::get_val(a); } template struct index_known_statically_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index) { return false; } }; template struct index_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) { return IndexList().value_known_statically(i); } }; template struct index_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) { return IndexList().value_known_statically(i); } }; @@ -447,14 +460,14 @@ template template struct index_statically_eq_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } }; template struct index_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexList().value_known_statically(i) & (IndexList().get(i) == value); } @@ -462,7 +475,7 @@ struct index_statically_eq_impl > { template struct index_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexList().value_known_statically(i) & (IndexList().get(i) == value); } @@ -471,14 +484,14 @@ struct index_statically_eq_impl > { template struct index_statically_ne_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } }; template struct index_statically_ne_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexList().value_known_statically(i) & (IndexList().get(i) != value); } @@ -486,7 +499,7 @@ struct index_statically_ne_impl > { template struct index_statically_ne_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexList().value_known_statically(i) & (IndexList().get(i) != value); } @@ -495,14 +508,14 @@ struct index_statically_ne_impl > { template struct index_statically_gt_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } }; template struct index_statically_gt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexList().value_known_statically(i) & (IndexList().get(i) > value); } @@ -510,7 +523,7 @@ struct index_statically_gt_impl > { template struct index_statically_gt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexList().value_known_statically(i) & (IndexList().get(i) > value); } @@ -520,14 +533,14 @@ struct index_statically_gt_impl > { template struct index_statically_lt_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } }; template struct index_statically_lt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexList().value_known_statically(i) & (IndexList().get(i) < value); } @@ -535,7 +548,7 @@ struct index_statically_lt_impl > { template struct index_statically_lt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexList().value_known_statically(i) & (IndexList().get(i) < value); } @@ -545,14 +558,14 @@ struct index_statically_lt_impl > { template struct index_pair_first_statically_eq_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } }; template struct index_pair_first_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexPairList().value_known_statically(i) & (IndexPairList().operator[](i).first == value); } @@ -560,7 +573,7 @@ struct index_pair_first_statically_eq_impl struct index_pair_first_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexPairList().value_known_statically(i) & (IndexPairList().operator[](i).first == value); } @@ -570,14 +583,14 @@ struct index_pair_first_statically_eq_impl struct index_pair_second_statically_eq_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } }; template struct index_pair_second_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexPairList().value_known_statically(i) & (IndexPairList().operator[](i).second == value); } @@ -585,7 +598,7 @@ struct index_pair_second_statically_eq_impl struct index_pair_second_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexPairList().value_known_statically(i) & (IndexPairList().operator[](i).second == value); } @@ -602,7 +615,7 @@ namespace internal { template struct index_known_statically_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const Index) { return false; } }; @@ -623,42 +636,42 @@ struct indices_statically_known_to_increase_impl { template struct index_statically_eq_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { return false; } }; template struct index_statically_ne_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { return false; } }; template struct index_statically_gt_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { return false; } }; template struct index_statically_lt_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { return false; } }; template struct index_pair_first_statically_eq_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { return false; } }; template struct index_pair_second_statically_eq_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { return false; } }; @@ -674,7 +687,7 @@ struct index_pair_second_statically_eq_impl { namespace Eigen { namespace internal { template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(DenseIndex i) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(Index i) { return index_known_statically_impl::run(i); } @@ -689,32 +702,32 @@ static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increa } template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(DenseIndex i, DenseIndex value) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(Index i, Index value) { return index_statically_eq_impl::run(i, value); } template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(DenseIndex i, DenseIndex value) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(Index i, Index value) { return index_statically_ne_impl::run(i, value); } template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(DenseIndex i, DenseIndex value) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(Index i, Index value) { return index_statically_gt_impl::run(i, value); } template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(DenseIndex i, DenseIndex value) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(Index i, Index value) { return index_statically_lt_impl::run(i, value); } template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(DenseIndex i, DenseIndex value) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(Index i, Index value) { return index_pair_first_statically_eq_impl::run(i, value); } template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(DenseIndex i, DenseIndex value) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(Index i, Index value) { return index_pair_second_statically_eq_impl::run(i, value); } diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index f391fb9ee..c5cb61af5 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -31,6 +31,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -84,18 +85,25 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_strides(op.strides()) { m_dimensions = m_impl.dimensions(); @@ -129,11 +137,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -144,6 +152,7 @@ struct TensorEvaluator, Device> eigen_assert(index < dimensions().TotalSize()); *inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (idx != idx / m_fastStrides[i] * m_strides[i]) { @@ -158,6 +167,7 @@ struct TensorEvaluator, Device> *inputIndex += index / m_strides[0]; return true; } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; if (idx != idx / m_fastStrides[i] * m_strides[i]) { @@ -193,6 +203,7 @@ struct TensorEvaluator, Device> eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } @@ -213,7 +224,14 @@ struct TensorEvaluator, Device> compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: Dimensions m_dimensions; diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h index 33edc49e3..26a3818f3 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h @@ -32,7 +32,7 @@ struct Initializer { Eigen::array::Index, traits::NumDimensions>* indices, const InitList& vals) { int i = 0; - for (auto v : vals) { + for (const auto& v : vals) { (*indices)[traits::NumDimensions - N] = i++; Initializer::run(tensor, indices, v); } @@ -48,7 +48,7 @@ struct Initializer { const InitList& vals) { int i = 0; // There is likely a faster way to do that than iterating. - for (auto v : vals) { + for (const auto& v : vals) { (*indices)[traits::NumDimensions - 1] = i++; tensor.coeffRef(*indices) = v; } diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index ede3939c2..6d5cce4aa 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -21,7 +21,7 @@ namespace Eigen { * \brief Fast integer division by a constant. * * See the paper from Granlund and Montgomery for explanation. - * (at http://dx.doi.org/10.1145/773473.178249) + * (at https://doi.org/10.1145/773473.178249) * * \sa Tensor */ @@ -35,8 +35,10 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename internal::enable_if::type count_leading_zeros(const T val) { -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_GPU_COMPILE_PHASE return __clz(val); +#elif defined(SYCL_DEVICE_ONLY) + return cl::sycl::clz(val); #elif EIGEN_COMP_MSVC unsigned long index; _BitScanReverse(&index, val); @@ -51,8 +53,10 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename internal::enable_if::type count_leading_zeros(const T val) { -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_GPU_COMPILE_PHASE return __clzll(val); +#elif defined(SYCL_DEVICE_ONLY) + return static_cast(cl::sycl::clz(val)); #elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64 unsigned long index; _BitScanReverse64(&index, val); @@ -86,8 +90,10 @@ namespace { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_GPU_COMPILE_PHASE) return __umulhi(a, b); +#elif defined(SYCL_DEVICE_ONLY) + return cl::sycl::mul_hi(a, static_cast(b)); #else return (static_cast(a) * b) >> 32; #endif @@ -95,9 +101,11 @@ namespace { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_GPU_COMPILE_PHASE) return __umul64hi(a, b); -#elif defined(__SIZEOF_INT128__) +#elif defined(SYCL_DEVICE_ONLY) + return cl::sycl::mul_hi(a, static_cast(b)); +#elif EIGEN_HAS_BUILTIN_INT128 __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b); return static_cast(v >> 64); #else @@ -116,7 +124,7 @@ namespace { template struct DividerHelper<64, T> { static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { -#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) +#if EIGEN_HAS_BUILTIN_INT128 && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) return static_cast((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); #else const uint64_t shift = 1ULL << log_div; @@ -159,7 +167,7 @@ struct TensorIntDivisor { shift2 = log_div > 1 ? log_div-1 : 0; } - // Must have 0 <= numerator. On platforms that dont support the __uint128_t + // Must have 0 <= numerator. On platforms that don't support the __uint128_t // type numerator should also be less than 2^32-1. EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { eigen_assert(static_cast::type>(numerator) < NumTraits::highest()/2); @@ -195,8 +203,10 @@ class TensorIntDivisor { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_GPU_COMPILE_PHASE return (__umulhi(magic, n) >> shift); +#elif defined(SYCL_DEVICE_ONLY) + return (cl::sycl::mul_hi(magic, static_cast(n)) >> shift); #else uint64_t v = static_cast(magic) * static_cast(n); return (static_cast(v >> 32) >> shift); diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index cd0109ef4..80106c1a0 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -46,6 +46,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = traits::NumDimensions; static const int Layout = (traits::Layout == ColMajor) ? RowMajor : ColMajor; + typedef typename XprTraits::PointerType PointerType; }; template @@ -68,39 +69,22 @@ template class TensorLayoutSwapOp : public TensorBase, WriteAccessors> { public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; + typedef TensorBase, WriteAccessors> Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr) - : m_xpr(expr) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr) + : m_xpr(expr) {} EIGEN_DEVICE_FUNC const typename internal::remove_all::type& expression() const { return m_xpr; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const TensorLayoutSwapOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorLayoutSwapOp) protected: typename XprType::Nested m_xpr; }; @@ -118,12 +102,18 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator::RawAccess }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { for(int i = 0; i < NumDims; ++i) { @@ -131,16 +121,25 @@ struct TensorEvaluator, Device> } } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -159,7 +158,9 @@ struct TensorEvaluator, Device> return m_impl.costPerCoeff(vectorized); } - EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); } + EIGEN_DEVICE_FUNC typename Storage::Type data() const { + return constCast(m_impl.data()); + } const TensorEvaluator& impl() const { return m_impl; } @@ -180,11 +181,17 @@ template enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false // to be implemented }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h index ee0078bbc..73ff3d2db 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -27,7 +27,7 @@ */ // SFINAE requires variadic templates -#ifndef __CUDACC__ +#if !defined(EIGEN_GPUCC) #if EIGEN_HAS_VARIADIC_TEMPLATES // SFINAE doesn't work for gcc <= 4.7 #ifdef EIGEN_COMP_GNUC @@ -43,12 +43,56 @@ #define EIGEN_SFINAE_ENABLE_IF( __condition__ ) \ typename internal::enable_if< ( __condition__ ) , int >::type = 0 +// Define a macro to use a reference on the host but a value on the device +#if defined(SYCL_DEVICE_ONLY) + #define EIGEN_DEVICE_REF +#else + #define EIGEN_DEVICE_REF & +#endif + +// Define a macro for catching SYCL exceptions if exceptions are enabled +#define EIGEN_SYCL_TRY_CATCH(X) \ + do { \ + EIGEN_TRY {X;} \ + EIGEN_CATCH(const cl::sycl::exception& e) { \ + EIGEN_THROW_X(std::runtime_error("SYCL exception at " + \ + std::string(__FILE__) + ":" + \ + std::to_string(__LINE__) + "\n" + \ + e.what())); \ + } \ + } while (false) -#if EIGEN_HAS_CONSTEXPR -#define EIGEN_CONSTEXPR constexpr +// Define a macro if local memory flags are unset or one of them is set +// Setting both flags is the same as unsetting them +#if (!defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)) || \ + (defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)) + #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1 + #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1 +#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) + #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1 +#elif !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) + #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1 +#endif + +#if EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) + #define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + using Base::operator =; \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \ + template \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) { Base::operator=(other); return *this; } #else -#define EIGEN_CONSTEXPR + #define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) #endif +/** \internal + * \brief Macro to manually inherit assignment operators. + * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined. + * This also inherits template operator=(const OtherDerived&) assignments. + * With C++11 or later this also default-implements the copy-constructor + */ +#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(Derived) \ + EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived) #endif diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index a8e55757e..6834c97e4 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -12,37 +12,57 @@ namespace Eigen { +// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_) + /** \class TensorMap * \ingroup CXX11_Tensor_Module * * \brief A tensor expression mapping an existing array of data. * */ -/// template class MakePointer_ is added to convert the host pointer to the device pointer. -/// It is added due to the fact that for our device compiler T* is not allowed. -/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer T. -/// This is done through our MakePointer_ class. By default the Type in the MakePointer_ is T* . +/// `template class MakePointer_` is added to convert the host pointer to the device pointer. +/// It is added due to the fact that for our device compiler `T*` is not allowed. +/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer `T`. +/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_` is `T*` . /// Therefore, by adding the default value, we managed to convert the type and it does not break any -/// existing code as its default value is T*. +/// existing code as its default value is `T*`. template class MakePointer_> class TensorMap : public TensorBase > { public: typedef TensorMap Self; - typedef typename PlainObjectType::Base Base; - typedef typename Eigen::internal::nested::type Nested; - typedef typename internal::traits::StorageKind StorageKind; + typedef TensorBase > Base; + #ifdef EIGEN_USE_SYCL + typedef typename Eigen::internal::remove_reference::type>::type Nested; + #else + typedef typename Eigen::internal::nested::type Nested; + #endif + typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::Index Index; typedef typename internal::traits::Scalar Scalar; typedef typename NumTraits::Real RealScalar; - typedef typename Base::CoeffReturnType CoeffReturnType; + typedef typename PlainObjectType::Base::CoeffReturnType CoeffReturnType; - /* typedef typename internal::conditional< - bool(internal::is_lvalue::value), - Scalar *, - const Scalar *>::type - PointerType;*/ typedef typename MakePointer_::Type PointerType; - typedef PointerType PointerArgType; + typedef typename MakePointer_::ConstType PointerConstType; + + // WARN: PointerType still can be a pointer to const (const Scalar*), for + // example in TensorMap> expression. This type of + // expression should be illegal, but adding this restriction is not possible + // in practice (see https://bitbucket.org/eigen/eigen/pull-requests/488). + typedef typename internal::conditional< + bool(internal::is_lvalue::value), + PointerType, // use simple pointer in lvalue expressions + PointerConstType // use const pointer in rvalue expressions + >::type StoragePointerType; + + // If TensorMap was constructed over rvalue expression (e.g. const Tensor), + // we should return a reference to const from operator() (and others), even + // if TensorMap itself is not const. + typedef typename internal::conditional< + bool(internal::is_lvalue::value), + Scalar&, + const Scalar& + >::type StorageRefType; static const int Options = Options_; @@ -57,47 +77,47 @@ template class MakePoin }; EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr) : m_data(dataPtr), m_dimensions() { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr) : m_data(dataPtr), m_dimensions() { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } #if EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } #else EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const array& dimensions) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const array& dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const Dimensions& dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } @@ -114,12 +134,12 @@ template class MakePoin EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE PointerType data() { return m_data; } + EIGEN_STRONG_INLINE StoragePointerType data() { return m_data; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const PointerType data() const { return m_data; } + EIGEN_STRONG_INLINE StoragePointerType data() const { return m_data; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const + EIGEN_STRONG_INLINE StorageRefType operator()(const array& indices) const { // eigen_assert(checkIndexRange(indices)); if (PlainObjectType::Options&RowMajor) { @@ -132,14 +152,14 @@ template class MakePoin } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()() const + EIGEN_STRONG_INLINE StorageRefType operator()() const { EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) return m_data[0]; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index index) const { eigen_internal_assert(index >= 0 && index < size()); return m_data[index]; @@ -147,9 +167,10 @@ template class MakePoin #if EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(internal::all((Eigen::NumTraits::highest() >= otherIndices)...)); if (PlainObjectType::Options&RowMajor) { const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); return m_data[index]; @@ -160,7 +181,7 @@ template class MakePoin } #else EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) const { if (PlainObjectType::Options&RowMajor) { const Index index = i1 + i0 * m_dimensions[1]; @@ -171,7 +192,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) const { if (PlainObjectType::Options&RowMajor) { const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); @@ -182,7 +203,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) const { if (PlainObjectType::Options&RowMajor) { const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); @@ -193,7 +214,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const { if (PlainObjectType::Options&RowMajor) { const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); @@ -206,7 +227,7 @@ template class MakePoin #endif EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) + EIGEN_STRONG_INLINE StorageRefType operator()(const array& indices) { // eigen_assert(checkIndexRange(indices)); if (PlainObjectType::Options&RowMajor) { @@ -219,14 +240,14 @@ template class MakePoin } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()() + EIGEN_STRONG_INLINE StorageRefType operator()() { EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) return m_data[0]; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index index) + EIGEN_STRONG_INLINE StorageRefType operator()(Index index) { eigen_internal_assert(index >= 0 && index < size()); return m_data[index]; @@ -234,9 +255,10 @@ template class MakePoin #if EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) + EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) { static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + eigen_assert(internal::all((Eigen::NumTraits::highest() >= otherIndices)...)); const std::size_t NumDims = sizeof...(otherIndices) + 2; if (PlainObjectType::Options&RowMajor) { const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); @@ -248,7 +270,7 @@ template class MakePoin } #else EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) { if (PlainObjectType::Options&RowMajor) { const Index index = i1 + i0 * m_dimensions[1]; @@ -259,7 +281,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) { if (PlainObjectType::Options&RowMajor) { const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); @@ -270,7 +292,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) { if (PlainObjectType::Options&RowMajor) { const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); @@ -281,7 +303,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4) { if (PlainObjectType::Options&RowMajor) { const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); @@ -293,26 +315,10 @@ template class MakePoin } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const Self& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Self& operator=(const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorMap) private: - typename MakePointer_::Type m_data; + StoragePointerType m_data; Dimensions m_dimensions; }; diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index 615559d44..a6181d35e 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -52,11 +52,13 @@ struct PacketType : internal::packet_traits { }; // For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16) -template <> +#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) + +typedef ulonglong2 Packet4h2; +template<> struct PacketType { - typedef half2 type; - static const int size = 2; + typedef Packet4h2 type; + static const int size = 8; enum { HasAdd = 1, HasSub = 1, @@ -75,6 +77,7 @@ struct PacketType { HasSqrt = 1, HasRsqrt = 1, HasExp = 1, + HasExpm1 = 0, HasLog = 1, HasLog1p = 0, HasLog10 = 0, @@ -84,9 +87,57 @@ struct PacketType { #endif #if defined(EIGEN_USE_SYCL) -template - struct PacketType { - typedef T type; + +namespace TensorSycl { +namespace internal { + +template struct PlusOp { + static constexpr Index Value = A + B; +}; + +template struct DivOp { + static constexpr Index Value = A / B; +}; + +template class StepOp> +struct static_for { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator op) { + op(start); + static_for::Value, end, step, + StepOp>::loop(op); + } +}; +template class StepOp> +struct static_for { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator) {} +}; + +template +struct Vectorise { + static const int PacketSize = 1; + typedef OutScalar PacketReturnType; +}; + +template +struct Vectorise { + static const int PacketSize = Eigen::PacketType::size; + typedef typename Eigen::PacketType::type PacketReturnType; +}; + +static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index roundUp(Index x, Index y) { + return ((((x) + (y)-1) / (y)) * (y)); +} + +} // namespace internal +} // namespace TensorSycl + +template <> + struct PacketType { + typedef half type; static const int size = 1; enum { HasAdd = 0, @@ -103,8 +154,58 @@ template HasBlend = 0 }; }; -#endif +template +struct PacketType : internal::default_packet_traits { + typedef Scalar type; + typedef Scalar half; + enum { + Vectorizable = 0, + size = 1, + AlignedOnScalar = 0, + HasHalfPacket = 0 + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0 + }; + +}; + +template +struct PacketType : PacketType{}; + +#ifndef EIGEN_DONT_VECTORIZE_SYCL +#define PACKET_TYPE(CVQual, Type, val, lengths, DEV)\ +template<> struct PacketType : internal::sycl_packet_traits \ +{\ + typedef typename internal::packet_traits::type type;\ + typedef typename internal::packet_traits::half half;\ +}; + + +PACKET_TYPE(const, float, 1, 4, SyclDevice) +PACKET_TYPE(, float, 1, 4, SyclDevice) +PACKET_TYPE(const, float, 1, 4, const SyclDevice) +PACKET_TYPE(, float, 1, 4, const SyclDevice) +PACKET_TYPE(const, double, 0, 2, SyclDevice) +PACKET_TYPE(, double, 0, 2, SyclDevice) +PACKET_TYPE(const, double, 0, 2, const SyclDevice) +PACKET_TYPE(, double, 0, 2, const SyclDevice) +#undef PACKET_TYPE + +template<> struct PacketType: PacketType{}; +template<> struct PacketType: PacketType{}; +#endif +#endif // Tuple mimics std::pair but works on e.g. nvcc. template struct Tuple { @@ -121,14 +222,6 @@ template struct Tuple { EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tuple(const U& f, const V& s) : first(f), second(s) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Tuple& operator= (const Tuple& rhs) { - if (&rhs == this) return *this; - first = rhs.first; - second = rhs.second; - return *this; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(Tuple& rhs) { using numext::swap; @@ -168,12 +261,12 @@ template struct IndexPair { #ifdef EIGEN_HAS_SFINAE namespace internal { - template + template EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array customIndices2Array(IndexType& idx, numeric_list) { return { idx[Is]... }; } - template + template EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array customIndices2Array(IndexType&, numeric_list) { return array(); diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index d34f1e328..b3f00f77a 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -31,12 +31,13 @@ struct traits > : public traits::type _Nested; static const int NumDimensions = array_size::value; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template struct eval, Eigen::Dense> { - typedef const TensorReshapingOp& type; + typedef const TensorReshapingOpEIGEN_DEVICE_REF type; }; template @@ -53,6 +54,7 @@ template class TensorReshapingOp : public TensorBase, WriteAccessors> { public: + typedef TensorBase, WriteAccessors> Base; typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename internal::remove_const::type CoeffReturnType; typedef typename Eigen::internal::nested::type Nested; @@ -69,24 +71,7 @@ class TensorReshapingOp : public TensorBase::type& expression() const { return m_xpr; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const TensorReshapingOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReshapingOp) protected: typename XprType::Nested m_xpr; @@ -101,15 +86,63 @@ struct TensorEvaluator, Device> typedef TensorReshapingOp XprType; typedef NewDimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory::type, Device> ConstCastStorage; + + static const int NumOutputDims = internal::array_size::value; + static const int NumInputDims = internal::array_size::Dimensions>::value; + + enum ReshapingKind { + // We do not use layout information to determine reshaping kind. + // Depending on the layout `N` can be inner or outer dimension. + OneByN = 0, // expr.reshape(1, N) + NByOne = 1, // expr.reshape(N, 1) + Runtime = 2 // Reshape dimensions are dynamic (specified at runtime). + }; + + // clang-format off + static const ReshapingKind kind = +#if defined(EIGEN_HAS_INDEX_LIST) + (NumOutputDims == 2 && internal::index_statically_eq(/*index=*/0, /*value=*/1)) ? OneByN + : (NumOutputDims == 2 && internal::index_statically_eq(/*index=*/1, /*value=*/1)) ? NByOne + : Runtime; +#else + Runtime; +#endif + // clang-format on + enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + // For trivial reshapes with raw access to underlying data we will provide + // zero overhead block access. + // TODO(ezhulenev): Consider adding block access without raw access? + BlockAccess = TensorEvaluator::RawAccess && + NumInputDims > 0 && NumOutputDims > 0, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef + typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dimensions(op.dimensions()) { // The total size of the reshaped tensor must be equal to the total size @@ -117,17 +150,20 @@ struct TensorEvaluator, Device> eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions())); } - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType data, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(data, std::move(done)); + } +#endif + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -146,10 +182,53 @@ struct TensorEvaluator, Device> return m_impl.costPerCoeff(vectorized); } - EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast(m_impl.data()); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); + } + + // required in block(OutputTensorBlock* output_block) const + // For C++03 compatibility this must be defined outside the method + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + eigen_assert(m_impl.data() != NULL); + eigen_assert((kind == Runtime) || + (kind == OneByN && desc.dimensions()[0] == 1) || + (kind == NByOne && desc.dimensions()[1] == 1)); + + if (kind == OneByN || kind == NByOne) { + // We can guarantee at compile time that block is just a contiguous slice + // of the underlying expression memory buffer. + return TensorBlock(internal::TensorBlockKind::kView, + m_impl.data() + desc.offset(), desc.dimensions()); + } else { + // This will do additional runtime checks, and in the end it might be also + // a view, or it might be a block materialized in the temporary buffer. + return TensorBlock::materialize(m_impl.data(), m_dimensions, desc, + scratch); + } + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { + return constCast(m_impl.data()); + } EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } + #endif protected: TensorEvaluator m_impl; NewDimensions m_dimensions; @@ -167,14 +246,16 @@ template typedef NewDimensions Dimensions; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::RawAccess, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -183,15 +264,38 @@ template typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor + TensorBlockDesc; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { return this->m_impl.coeffRef(index); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { this->m_impl.template writePacket(index, x); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { + assert(this->m_impl.data() != NULL); + + typedef typename TensorBlock::XprType TensorBlockExpr; + typedef internal::TensorBlockAssignment< + Scalar, TensorEvaluator::NumOutputDims, TensorBlockExpr, Index> + TensorBlockAssign; + + TensorBlockAssign::Run( + TensorBlockAssign::target(desc.dimensions(), + internal::strides(this->dimensions()), + this->m_impl.data(), desc.offset()), + block.expr()); + } }; @@ -214,12 +318,13 @@ struct traits > : public traits::type _Nested; static const int NumDimensions = array_size::value; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template struct eval, Eigen::Dense> { - typedef const TensorSlicingOp& type; + typedef const TensorSlicingOpEIGEN_DEVICE_REF type; }; template @@ -236,6 +341,7 @@ template class TensorSlicingOp : public TensorBase > { public: + typedef TensorBase > Base; typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename Eigen::internal::nested::type Nested; @@ -254,25 +360,7 @@ class TensorSlicingOp : public TensorBase::type& expression() const { return m_xpr; } - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const TensorSlicingOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorSlicingOp) protected: typename XprType::Nested m_xpr; @@ -283,9 +371,12 @@ class TensorSlicingOp : public TensorBase struct MemcpyTriggerForSlicing { +template struct MemcpyTriggerForSlicing { EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { } - EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > threshold_; } + EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const { + const bool prefer_block_evaluation = BlockAccess && total > 32*1024; + return !prefer_block_evaluation && contiguous > threshold_; + } private: Index threshold_; @@ -294,11 +385,21 @@ template struct MemcpyTriggerForSlicing { // It is very expensive to start the memcpy kernel on GPU: we therefore only // use it for large copies. #ifdef EIGEN_USE_GPU -template struct MemcpyTriggerForSlicing { +template struct MemcpyTriggerForSlicing { EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { } - EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; } + EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; } }; #endif + +// It is very expensive to start the memcpy kernel on GPU: we therefore only +// use it for large copies. +#ifdef EIGEN_USE_SYCL +template struct MemcpyTriggerForSlicing { + EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { } + EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; } +}; +#endif + } // Eval as rvalue @@ -308,23 +409,56 @@ struct TensorEvaluator, Devi typedef TensorSlicingOp XprType; static const int NumDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Sizes Dimensions; + typedef StorageMemory Storage; + typedef StorageMemory::type, Device> ConstCastStorage; + typedef typename Storage::Type EvaluatorPointerType; + enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets and sizes. - IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess && + // FIXME: Temporary workaround for bug in slicing of bool tensors. + !internal::is_same::type, bool>::value, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + // Tensor slicing does not change the block type. + typedef typename TensorEvaluator::TensorBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) { - for (std::size_t i = 0; i < internal::array_size::value; ++i) { - eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); + m_is_identity = true; + for (int i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_impl.dimensions()[i] >= + op.sizes()[i] + op.startIndices()[i]); + if (m_impl.dimensions()[i] != op.sizes()[i] || + op.startIndices()[i] != 0) { + m_is_identity = false; + } } + // No strides for scalars. + if (NumDims == 0) return; + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); const Sizes& output_dims = op.sizes(); if (static_cast(Layout) == static_cast(ColMajor)) { @@ -337,7 +471,7 @@ struct TensorEvaluator, Devi m_outputStrides[0] = 1; for (int i = 1; i < NumDims; ++i) { m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); } } else { m_inputStrides[NumDims-1] = 1; @@ -349,23 +483,17 @@ struct TensorEvaluator, Devi m_outputStrides[NumDims-1] = 1; for (int i = NumDims - 2; i >= 0; --i) { m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); } } } - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef Sizes Dimensions; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_impl.evalSubExprsIfNeeded(NULL); - if (!NumTraits::type>::RequireInitialization && data && m_impl.data()) { + if (!NumTraits::type>::RequireInitialization + && data && m_impl.data()) { Index contiguous_values = 1; if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = 0; i < NumDims; ++i) { @@ -383,12 +511,12 @@ struct TensorEvaluator, Devi } } // Use memcpy if it's going to be faster than using the regular evaluation. - const MemcpyTriggerForSlicing trigger(m_device); - if (trigger(contiguous_values)) { - Scalar* src = (Scalar*)m_impl.data(); - for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { + const MemcpyTriggerForSlicing trigger(m_device); + if (trigger(internal::array_prod(dimensions()), contiguous_values)) { + EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data(); + for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { Index offset = srcCoeff(i); - m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar)); + m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src+offset), contiguous_values * sizeof(Scalar)); } return false; } @@ -396,25 +524,42 @@ struct TensorEvaluator, Devi return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType /*data*/, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_impl.coeff(srcCoeff(index)); + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits::size; + const int packetSize = PacketType::size; EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < internal::array_prod(dimensions())); + if (m_is_identity) { + return m_impl.template packet(index); + } + Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / m_fastOutputStrides[i]; const Index idx1 = indices[1] / m_fastOutputStrides[i]; @@ -426,6 +571,7 @@ struct TensorEvaluator, Devi inputIndices[0] += (indices[0] + m_offsets[0]); inputIndices[1] += (indices[1] + m_offsets[0]); } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx0 = indices[0] / m_fastOutputStrides[i]; const Index idx1 = indices[1] / m_fastOutputStrides[i]; @@ -445,6 +591,7 @@ struct TensorEvaluator, Devi EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; values[0] = m_impl.coeff(inputIndices[0]); values[packetSize-1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP for (int i = 1; i < packetSize-1; ++i) { values[i] = coeff(index+i); } @@ -454,12 +601,28 @@ struct TensorEvaluator, Devi } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + internal::TensorBlockResourceRequirements::skewed(target_size), + m_impl.getResourceRequirements()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset())); + TensorBlock block = m_impl.block(arg_desc, scratch); + if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); + return block; + } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { - Scalar* result = m_impl.data(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { + typename Storage::Type result = constCast(m_impl.data()); if (result) { Index offset = 0; if (static_cast(Layout) == static_cast(ColMajor)) { @@ -493,12 +656,19 @@ struct TensorEvaluator, Devi } return NULL; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; @@ -506,6 +676,7 @@ struct TensorEvaluator, Devi } inputIndex += (index + m_offsets[0]); } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; @@ -520,8 +691,9 @@ struct TensorEvaluator, Devi array, NumDims> m_fastOutputStrides; array m_inputStrides; TensorEvaluator m_impl; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; Dimensions m_dimensions; + bool m_is_identity; const StartIndices m_offsets; }; @@ -535,36 +707,55 @@ struct TensorEvaluator, Device> typedef TensorSlicingOp XprType; static const int NumDims = internal::array_size::value; - enum { - IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef Sizes Dimensions; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess + }; + + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { - return this->m_impl.coeffRef(this->srcCoeff(index)); + if (this->m_is_identity) { + return this->m_impl.coeffRef(index); + } else { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - const int packetSize = internal::unpacket_traits::size; + if (this->m_is_identity) { + this->m_impl.template writePacket(index, x); + return; + } + + const int packetSize = PacketType::size; Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; @@ -576,6 +767,7 @@ struct TensorEvaluator, Device> inputIndices[0] += (indices[0] + this->m_offsets[0]); inputIndices[1] += (indices[1] + this->m_offsets[0]); } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; @@ -595,14 +787,20 @@ struct TensorEvaluator, Device> internal::pstore(values, x); this->m_impl.coeffRef(inputIndices[0]) = values[0]; this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + EIGEN_UNROLL_LOOP for (int i = 1; i < packetSize-1; ++i) { this->coeffRef(index+i) = values[i]; } } } -}; - + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { + TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset())); + this->m_impl.writeBlock(arg_desc, block); + } +}; namespace internal { template @@ -616,12 +814,13 @@ struct traits::type _Nested; static const int NumDimensions = array_size::value; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template struct eval, Eigen::Dense> { - typedef const TensorStridingSlicingOp& type; + typedef const TensorStridingSlicingOpEIGEN_DEVICE_REF type; }; template @@ -637,6 +836,7 @@ template > { public: + typedef TensorBase > Base; typedef typename internal::traits::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename internal::nested::type Nested; @@ -660,26 +860,7 @@ class TensorStridingSlicingOp : public TensorBase::type& expression() const { return m_xpr; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const TensorStridingSlicingOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run( - assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run( - assign, DefaultDevice()); - return *this; - } + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingSlicingOp) protected: typename XprType::Nested m_xpr; @@ -694,6 +875,13 @@ struct TensorEvaluator XprType; static const int NumDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef Strides Dimensions; enum { // Alignment can't be guaranteed at compile time since it depends on the @@ -701,43 +889,58 @@ struct TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_device(device), m_strides(op.strides()) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), + m_device(device), + m_strides(op.strides()) { // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero - DSizes startIndicesClamped, stopIndicesClamped; - for (size_t i = 0; i < internal::array_size::value; ++i) { + DSizes startIndicesClamped, stopIndicesClamped; + for (ptrdiff_t i = 0; i < internal::array_size::value; ++i) { eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); - if(m_strides[i]>0){ - startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); - stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); - }else{ - /* implies m_strides[i]<0 by assert */ - startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); - stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); + if (m_strides[i] > 0) { + startIndicesClamped[i] = + clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); + stopIndicesClamped[i] = + clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); + } else { + /* implies m_strides[i] < 0 by assert */ + startIndicesClamped[i] = + clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); + stopIndicesClamped[i] = + clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); } m_startIndices[i] = startIndicesClamped[i]; } - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + typedef typename TensorEvaluator::Dimensions InputDimensions; + const InputDimensions& input_dims = m_impl.dimensions(); - // check for degenerate intervals and compute output tensor shape - bool degenerate = false;; - for(int i = 0; i < NumDims; i++){ + // compute output tensor shape + m_is_identity = true; + for (int i = 0; i < NumDims; i++) { Index interval = stopIndicesClamped[i] - startIndicesClamped[i]; - if(interval == 0 || ((interval<0) != (m_strides[i]<0))){ + if (interval == 0 || ((interval < 0) != (m_strides[i] < 0))) { m_dimensions[i] = 0; - degenerate = true; - }else{ - m_dimensions[i] = interval / m_strides[i] - + (interval % m_strides[i] != 0 ? 1 : 0); + } else { + m_dimensions[i] = + (interval / m_strides[i]) + (interval % m_strides[i] != 0 ? 1 : 0); eigen_assert(m_dimensions[i] >= 0); } + if (m_strides[i] != 1 || interval != m_impl.dimensions()[i]) { + m_is_identity = false; + } } + Strides output_dims = m_dimensions; if (static_cast(Layout) == static_cast(ColMajor)) { @@ -754,8 +957,7 @@ struct TensorEvaluator(degenerate ? 1 : m_outputStrides[i]); + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); } } else { m_inputStrides[NumDims-1] = m_strides[NumDims-1]; @@ -770,58 +972,58 @@ struct TensorEvaluator= 0; --i) { m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; - // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash - m_fastOutputStrides[i] = internal::TensorIntDivisor(degenerate ? 1 : m_outputStrides[i]); + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); } } - m_block_total_size_max = numext::maxi(static_cast(1), - device.lastLevelCacheSize() / - sizeof(Scalar)); } - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::remove_const::type ScalarNonConst; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef Strides Dimensions; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_impl.coeff(srcCoeff(index)); + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { return NULL; } - +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i >= 0; --i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += idx * m_inputStrides[i] + m_offsets[i]; index -= idx * m_outputStrides[i]; } } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims; ++i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += idx * m_inputStrides[i] + m_offsets[i]; @@ -831,20 +1033,24 @@ struct TensorEvaluator m_outputStrides; array, NumDims> m_fastOutputStrides; array m_inputStrides; + bool m_is_identity; TensorEvaluator m_impl; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; DSizes m_startIndices; // clamped startIndices DSizes m_dimensions; DSizes m_offsets; // offset in a flattened shape const Strides m_strides; - std::size_t m_block_total_size_max; }; // Eval as lvalue @@ -860,25 +1066,33 @@ struct TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = TensorEvaluator::CoordAccess, RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; - typedef typename internal::remove_const::type ScalarNonConst; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef Strides Dimensions; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { - return this->m_impl.coeffRef(this->srcCoeff(index)); + if (this->m_is_identity) { + return this->m_impl.coeffRef(index); + } else { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } } }; diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 647bcf108..ee44382cf 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -31,6 +31,7 @@ struct traits > : public traits::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -90,18 +91,33 @@ struct TensorEvaluator, Device typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = true, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = true, - RawAccess = false + IsAligned = true, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::RawAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = true, + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()) + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device) { // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector @@ -135,11 +151,20 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -148,6 +173,7 @@ struct TensorEvaluator, Device eigen_assert(index < dimensions().TotalSize()); Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (isPaddingAtIndexForDim(idx, i)) { @@ -161,6 +187,7 @@ struct TensorEvaluator, Device } inputIndex += (index - m_padding[0].first); } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i+1]; if (isPaddingAtIndexForDim(idx, i)) { @@ -189,18 +216,298 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { TensorOpCost cost = m_impl.costPerCoeff(vectorized); if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims; ++i) updateCostPerDimension(cost, i, i == 0); } else { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i >= 0; --i) updateCostPerDimension(cost, i, i == NumDims - 1); } return cost; } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + internal::TensorBlockResourceRequirements::skewed(target_size), + m_impl.getResourceRequirements()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + // If one of the dimensions is zero, return empty block view. + if (desc.size() == 0) { + return TensorBlock(internal::TensorBlockKind::kView, NULL, + desc.dimensions()); + } + + static const bool IsColMajor = Layout == static_cast(ColMajor); + const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1; + + Index offset = desc.offset(); + + // Compute offsets in the output tensor corresponding to the desc.offset(). + DSizes output_offsets; + for (int i = NumDims - 1; i > 0; --i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + const int stride_dim = IsColMajor ? dim : dim + 1; + output_offsets[dim] = offset / m_outputStrides[stride_dim]; + offset -= output_offsets[dim] * m_outputStrides[stride_dim]; + } + output_offsets[inner_dim_idx] = offset; + + // Offsets in the input corresponding to output offsets. + DSizes input_offsets = output_offsets; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + input_offsets[dim] = input_offsets[dim] - m_padding[dim].first; + } + + // Compute offset in the input buffer (at this point it might be illegal and + // point outside of the input buffer, because we don't check for negative + // offsets, it will be autocorrected in the block iteration loop below). + Index input_offset = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + input_offset += input_offsets[dim] * m_inputStrides[dim]; + } + + // Destination buffer and scratch buffer both indexed from 0 and have the + // same dimensions as the requested block (for destination buffer this + // property is guaranteed by `desc.destination()`). + Index output_offset = 0; + const DSizes output_strides = + internal::strides(desc.dimensions()); + + // NOTE(ezhulenev): We initialize bock iteration state for `NumDims - 1` + // dimensions, skipping innermost dimension. In theory it should be possible + // to squeeze matching innermost dimensions, however in practice that did + // not show any improvements in benchmarks. Also in practice first outer + // dimension usually has padding, and will prevent squeezing. + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + for (int i = 0; i < NumDims - 1; ++i) { + const int dim = IsColMajor ? i + 1 : NumDims - i - 2; + it[i].count = 0; + it[i].size = desc.dimension(dim); + + it[i].input_stride = m_inputStrides[dim]; + it[i].input_span = it[i].input_stride * (it[i].size - 1); + + it[i].output_stride = output_strides[dim]; + it[i].output_span = it[i].output_stride * (it[i].size - 1); + } + + const Index input_inner_dim_size = + static_cast(m_impl.dimensions()[inner_dim_idx]); + + // Total output size. + const Index output_size = desc.size(); + + // We will fill inner dimension of this size in the output. It might be + // larger than the inner dimension in the input, so we might have to pad + // before/after we copy values from the input inner dimension. + const Index output_inner_dim_size = desc.dimension(inner_dim_idx); + + // How many values to fill with padding BEFORE reading from the input inner + // dimension. + const Index output_inner_pad_before_size = + input_offsets[inner_dim_idx] < 0 + ? numext::mini(numext::abs(input_offsets[inner_dim_idx]), + output_inner_dim_size) + : 0; + + // How many values we can actually copy from the input inner dimension. + const Index output_inner_copy_size = numext::mini( + // Want to copy from input. + (output_inner_dim_size - output_inner_pad_before_size), + // Can copy from input. + numext::maxi(input_inner_dim_size - (input_offsets[inner_dim_idx] + + output_inner_pad_before_size), + Index(0))); + + eigen_assert(output_inner_copy_size >= 0); + + // How many values to fill with padding AFTER reading from the input inner + // dimension. + const Index output_inner_pad_after_size = + (output_inner_dim_size - output_inner_copy_size - + output_inner_pad_before_size); + + // Sanity check, sum of all sizes must be equal to the output size. + eigen_assert(output_inner_dim_size == + (output_inner_pad_before_size + output_inner_copy_size + + output_inner_pad_after_size)); + + // Keep track of current coordinates and padding in the output. + DSizes output_coord = output_offsets; + DSizes output_padded; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); + } + + typedef internal::StridedLinearBufferCopy LinCopy; + + // Prepare storage for the materialized padding result. + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); + + // TODO(ezhulenev): Squeeze multiple non-padded inner dimensions into a + // single logical inner dimension. + + // When possible we squeeze writes for the innermost (only if non-padded) + // dimension with the first padded dimension. This allows to reduce the + // number of calls to LinCopy and better utilize vector instructions. + const bool squeeze_writes = + NumDims > 1 && + // inner dimension is not padded + (input_inner_dim_size == m_dimensions[inner_dim_idx]) && + // and equal to the block inner dimension + (input_inner_dim_size == output_inner_dim_size); + + const int squeeze_dim = IsColMajor ? inner_dim_idx + 1 : inner_dim_idx - 1; + + // Maximum coordinate on a squeeze dimension that we can write to. + const Index squeeze_max_coord = + squeeze_writes ? numext::mini( + // max non-padded element in the input + static_cast(m_dimensions[squeeze_dim] - + m_padding[squeeze_dim].second), + // max element in the output buffer + static_cast(output_offsets[squeeze_dim] + + desc.dimension(squeeze_dim))) + : static_cast(0); + + // Iterate copying data from `m_impl.data()` to the output buffer. + for (Index size = 0; size < output_size;) { + // Detect if we are in the padded region (exclude innermost dimension). + bool is_padded = false; + for (int j = 1; j < NumDims; ++j) { + const int dim = IsColMajor ? j : NumDims - j - 1; + is_padded = output_padded[dim]; + if (is_padded) break; + } + + if (is_padded) { + // Fill single innermost dimension with padding value. + size += output_inner_dim_size; + + LinCopy::template Run( + typename LinCopy::Dst(output_offset, 1, block_storage.data()), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_dim_size); + + + } else if (squeeze_writes) { + // Squeeze multiple reads from innermost dimensions. + const Index squeeze_num = squeeze_max_coord - output_coord[squeeze_dim]; + size += output_inner_dim_size * squeeze_num; + + // Copy `squeeze_num` inner dimensions from input to output. + LinCopy::template Run( + typename LinCopy::Dst(output_offset, 1, block_storage.data()), + typename LinCopy::Src(input_offset, 1, m_impl.data()), + output_inner_dim_size * squeeze_num); + + // Update iteration state for only `squeeze_num - 1` processed inner + // dimensions, because we have another iteration state update at the end + // of the loop that will update iteration state for the last inner + // processed dimension. + it[0].count += (squeeze_num - 1); + input_offset += it[0].input_stride * (squeeze_num - 1); + output_offset += it[0].output_stride * (squeeze_num - 1); + output_coord[squeeze_dim] += (squeeze_num - 1); + + } else { + // Single read from innermost dimension. + size += output_inner_dim_size; + + { // Fill with padding before copying from input inner dimension. + const Index out = output_offset; + + LinCopy::template Run( + typename LinCopy::Dst(out, 1, block_storage.data()), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_pad_before_size); + } + + { // Copy data from input inner dimension. + const Index out = output_offset + output_inner_pad_before_size; + const Index in = input_offset + output_inner_pad_before_size; + + eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL); + + LinCopy::template Run( + typename LinCopy::Dst(out, 1, block_storage.data()), + typename LinCopy::Src(in, 1, m_impl.data()), + output_inner_copy_size); + } + + { // Fill with padding after copying from input inner dimension. + const Index out = output_offset + output_inner_pad_before_size + + output_inner_copy_size; + + LinCopy::template Run( + typename LinCopy::Dst(out, 1, block_storage.data()), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_pad_after_size); + } + } + + for (int j = 0; j < NumDims - 1; ++j) { + const int dim = IsColMajor ? j + 1 : NumDims - j - 2; + + if (++it[j].count < it[j].size) { + input_offset += it[j].input_stride; + output_offset += it[j].output_stride; + output_coord[dim] += 1; + output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); + break; + } + it[j].count = 0; + input_offset -= it[j].input_span; + output_offset -= it[j].output_span; + output_coord[dim] -= it[j].size - 1; + output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif private: + struct BlockIteratorState { + BlockIteratorState() + : count(0), + size(0), + input_stride(0), + input_span(0), + output_stride(0), + output_span(0) {} + + Index count; + Index size; + Index input_stride; + Index input_span; + Index output_stride; + Index output_span; + }; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim( Index index, int dim_index) const { #if defined(EIGEN_HAS_INDEX_LIST) @@ -262,22 +569,23 @@ struct TensorEvaluator, Device const Index initialIndex = index; Index inputIndex = 0; + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { - const Index first = index; - const Index last = index + PacketSize - 1; + const Index firstIdx = index; + const Index lastIdx = index + PacketSize - 1; const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; const Index lastPaddedRight = m_outputStrides[i+1]; - if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { + else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. const Index idx = index / m_outputStrides[i]; inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; @@ -289,21 +597,21 @@ struct TensorEvaluator, Device } } - const Index last = index + PacketSize - 1; - const Index first = index; + const Index lastIdx = index + PacketSize - 1; + const Index firstIdx = index; const Index lastPaddedLeft = m_padding[0].first; const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); const Index lastPaddedRight = m_outputStrides[1]; - if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if (!isRightPaddingCompileTimeZero(0) && first >= firstPaddedRight && last < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { + else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. inputIndex += (index - m_padding[0].first); return m_impl.template packet(inputIndex); @@ -319,23 +627,23 @@ struct TensorEvaluator, Device const Index initialIndex = index; Index inputIndex = 0; - + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { - const Index first = index; - const Index last = index + PacketSize - 1; + const Index firstIdx = index; + const Index lastIdx = index + PacketSize - 1; const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1]; const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; const Index lastPaddedRight = m_outputStrides[i]; - if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { + else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. const Index idx = index / m_outputStrides[i+1]; inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; @@ -347,21 +655,21 @@ struct TensorEvaluator, Device } } - const Index last = index + PacketSize - 1; - const Index first = index; + const Index lastIdx = index + PacketSize - 1; + const Index firstIdx = index; const Index lastPaddedLeft = m_padding[NumDims-1].first; const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); const Index lastPaddedRight = m_outputStrides[NumDims-1]; - if (!isLeftPaddingCompileTimeZero(NumDims-1) && last < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(NumDims-1) && lastIdx < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if (!isRightPaddingCompileTimeZero(NumDims-1) && first >= firstPaddedRight && last < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(NumDims-1) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { + else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. inputIndex += (index - m_padding[NumDims-1].first); return m_impl.template packet(inputIndex); @@ -373,6 +681,7 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } @@ -387,6 +696,8 @@ struct TensorEvaluator, Device PaddingDimensions m_padding; Scalar m_paddingValue; + + const Device EIGEN_DEVICE_REF m_device; }; diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 886a254f6..413d25dd4 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -31,6 +31,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions + 1; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -87,18 +88,26 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { Index num_patches = 1; @@ -143,12 +152,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -161,6 +170,7 @@ struct TensorEvaluator, Device> Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index]; Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i > 0; --i) { const Index patchIdx = patchIndex / m_patchStrides[i]; patchIndex -= patchIdx * m_patchStrides[i]; @@ -169,6 +179,7 @@ struct TensorEvaluator, Device> inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; } } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 2; ++i) { const Index patchIdx = patchIndex / m_patchStrides[i]; patchIndex -= patchIdx * m_patchStrides[i]; @@ -196,6 +207,7 @@ struct TensorEvaluator, Device> Index inputIndices[2] = {0, 0}; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i > 0; --i) { const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], patchIndices[1] / m_patchStrides[i]}; @@ -211,6 +223,7 @@ struct TensorEvaluator, Device> inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; } } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 2; ++i) { const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], patchIndices[1] / m_patchStrides[i]}; @@ -237,6 +250,7 @@ struct TensorEvaluator, Device> EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; values[0] = m_impl.coeff(inputIndices[0]); values[PacketSize-1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize-1; ++i) { values[i] = coeff(index+i); } @@ -253,7 +267,14 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: Dimensions m_dimensions; @@ -262,6 +283,7 @@ struct TensorEvaluator, Device> array m_patchStrides; TensorEvaluator m_impl; + }; } // end namespace Eigen diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index 1655a813e..37c1d1c3d 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2016 Benoit Steiner +// Copyright (C) 2018 Mehdi Goli Codeplay Software Ltd. // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -16,50 +17,23 @@ namespace internal { namespace { EIGEN_DEVICE_FUNC uint64_t get_random_seed() { -#ifdef __CUDA_ARCH__ +#if defined(EIGEN_GPU_COMPILE_PHASE) // We don't support 3d kernels since we currently only use 1 and // 2d kernels. - assert(threadIdx.z == 0); - return clock64() + - blockIdx.x * blockDim.x + threadIdx.x + - gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); - -#elif defined _WIN32 - // Use the current time as a baseline. - SYSTEMTIME st; - GetSystemTime(&st); - int time = st.wSecond + 1000 * st.wMilliseconds; - // Mix in a random number to make sure that we get different seeds if - // we try to generate seeds faster than the clock resolution. - // We need 2 random values since the generator only generate 16 bits at - // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx) - int rnd1 = ::rand(); - int rnd2 = ::rand(); - uint64_t rnd = (rnd1 | rnd2 << 16) ^ time; - return rnd; - -#elif defined __APPLE__ - // Same approach as for win32, except that the random number generator - // is better (// https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random). - uint64_t rnd = ::random() ^ mach_absolute_time(); - return rnd; - + gpu_assert(threadIdx.z == 0); + return blockIdx.x * blockDim.x + threadIdx.x + + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); #else - // Augment the current time with pseudo random number generation - // to ensure that we get different seeds if we try to generate seeds - // faster than the clock resolution. - timespec ts; - clock_gettime(CLOCK_REALTIME, &ts); - uint64_t rnd = ::random() ^ ts.tv_nsec; - return rnd; + // Rely on Eigen's random implementation. + return random(); #endif } -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) { +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) { // TODO: Unify with the implementation in the non blocking thread pool. uint64_t current = *state; // Update the internal state - *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; + *state = current * 6364136223846793005ULL + (stream << 1 | 1); // Generate the random output (using the PCG-XSH-RS scheme) return static_cast((current ^ (current >> 22)) >> (22 + (current >> 61))); } @@ -73,34 +47,42 @@ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -T RandomToTypeUniform(uint64_t* state) { - unsigned rnd = PCG_XSH_RS_generator(state); +T RandomToTypeUniform(uint64_t* state, uint64_t stream) { + unsigned rnd = PCG_XSH_RS_generator(state, stream); return static_cast(rnd); } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -Eigen::half RandomToTypeUniform(uint64_t* state) { - Eigen::half result; - // Generate 10 random bits for the mantissa - unsigned rnd = PCG_XSH_RS_generator(state); - result.x = static_cast(rnd & 0x3ffu); - // Set the exponent - result.x |= (static_cast(15) << 10); +Eigen::half RandomToTypeUniform(uint64_t* state, uint64_t stream) { + // Generate 10 random bits for the mantissa, merge with exponent. + unsigned rnd = PCG_XSH_RS_generator(state, stream); + const uint16_t half_bits = static_cast(rnd & 0x3ffu) | (static_cast(15) << 10); + Eigen::half result = Eigen::numext::bit_cast(half_bits); // Return the final result return result - Eigen::half(1.0f); } +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Eigen::bfloat16 RandomToTypeUniform(uint64_t* state, uint64_t stream) { + + // Generate 7 random bits for the mantissa, merge with exponent. + unsigned rnd = PCG_XSH_RS_generator(state, stream); + const uint16_t half_bits = static_cast(rnd & 0x7fu) | (static_cast(127) << 7); + Eigen::bfloat16 result = Eigen::numext::bit_cast(half_bits); + // Return the final result + return result - Eigen::bfloat16(1.0f); +} template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -float RandomToTypeUniform(uint64_t* state) { +float RandomToTypeUniform(uint64_t* state, uint64_t stream) { typedef union { uint32_t raw; float fp; } internal; internal result; // Generate 23 random bits for the mantissa mantissa - const unsigned rnd = PCG_XSH_RS_generator(state); + const unsigned rnd = PCG_XSH_RS_generator(state, stream); result.raw = rnd & 0x7fffffu; // Set the exponent result.raw |= (static_cast(127) << 23); @@ -109,7 +91,7 @@ float RandomToTypeUniform(uint64_t* state) { } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -double RandomToTypeUniform(uint64_t* state) { +double RandomToTypeUniform(uint64_t* state, uint64_t stream) { typedef union { uint64_t raw; double dp; @@ -118,9 +100,9 @@ double RandomToTypeUniform(uint64_t* state) { result.raw = 0; // Generate 52 random bits for the mantissa // First generate the upper 20 bits - unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu; + unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu; // The generate the lower 32 bits - unsigned rnd2 = PCG_XSH_RS_generator(state); + unsigned rnd2 = PCG_XSH_RS_generator(state, stream); result.raw = (static_cast(rnd1) << 32) | rnd2; // Set the exponent result.raw |= (static_cast(1023) << 52); @@ -129,14 +111,14 @@ double RandomToTypeUniform(uint64_t* state) { } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeUniform >(uint64_t* state) { - return std::complex(RandomToTypeUniform(state), - RandomToTypeUniform(state)); +std::complex RandomToTypeUniform >(uint64_t* state, uint64_t stream) { + return std::complex(RandomToTypeUniform(state, stream), + RandomToTypeUniform(state, stream)); } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeUniform >(uint64_t* state) { - return std::complex(RandomToTypeUniform(state), - RandomToTypeUniform(state)); +std::complex RandomToTypeUniform >(uint64_t* state, uint64_t stream) { + return std::complex(RandomToTypeUniform(state, stream), + RandomToTypeUniform(state, stream)); } template class UniformRandomGenerator { @@ -147,17 +129,42 @@ template class UniformRandomGenerator { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( uint64_t seed = 0) { m_state = PCG_XSH_RS_state(seed); + #ifdef EIGEN_USE_SYCL + // In SYCL it is not possible to build PCG_XSH_RS_state in one step. + // Therefor, we need two step to initializate the m_state. + // IN SYCL, the constructor of the functor is s called on the CPU + // and we get the clock seed here from the CPU. However, This seed is + //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function. + // and only available on the Operator() function (which is called on the GPU). + // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread + // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds + // the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction + // similar to CUDA Therefore, the thread Id injection is not available at this stage. + //However when the operator() is called the thread ID will be avilable. So inside the opeator, + // we add the thrreadID, BlockId,... (which is equivalent of i) + //to the seed and construct the unique m_state per thead similar to cuda. + m_exec_once =false; + #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( const UniformRandomGenerator& other) { m_state = other.m_state; + #ifdef EIGEN_USE_SYCL + m_exec_once =other.m_exec_once; + #endif } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const { - uint64_t local_state = m_state + i; - T result = RandomToTypeUniform(&local_state); - m_state = local_state; + #ifdef EIGEN_USE_SYCL + if(!m_exec_once) { + // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread + // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side + m_state += (i * 6364136223846793005ULL); + m_exec_once =true; + } + #endif + T result = RandomToTypeUniform(&m_state, i); return result; } @@ -165,16 +172,25 @@ template class UniformRandomGenerator { Packet packetOp(Index i) const { const int packetSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX T values[packetSize]; - uint64_t local_state = m_state + i; + #ifdef EIGEN_USE_SYCL + if(!m_exec_once) { + // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread + m_state += (i * 6364136223846793005ULL); + m_exec_once =true; + } + #endif + EIGEN_UNROLL_LOOP for (int j = 0; j < packetSize; ++j) { - values[j] = RandomToTypeUniform(&local_state); + values[j] = RandomToTypeUniform(&m_state, i); } - m_state = local_state; return internal::pload(values); } private: mutable uint64_t m_state; + #ifdef EIGEN_USE_SYCL + mutable bool m_exec_once; + #endif }; template @@ -190,14 +206,14 @@ struct functor_traits > { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -T RandomToTypeNormal(uint64_t* state) { +T RandomToTypeNormal(uint64_t* state, uint64_t stream) { // Use the ratio of uniform method to generate numbers following a normal // distribution. See for example Numerical Recipes chapter 7.3.9 for the // details. T u, v, q; do { - u = RandomToTypeUniform(state); - v = T(1.7156) * (RandomToTypeUniform(state) - T(0.5)); + u = RandomToTypeUniform(state, stream); + v = T(1.7156) * (RandomToTypeUniform(state, stream) - T(0.5)); const T x = u - T(0.449871); const T y = numext::abs(v) + T(0.386595); q = x*x + y * (T(0.196)*y - T(0.25472)*x); @@ -208,14 +224,14 @@ T RandomToTypeNormal(uint64_t* state) { } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeNormal >(uint64_t* state) { - return std::complex(RandomToTypeNormal(state), - RandomToTypeNormal(state)); +std::complex RandomToTypeNormal >(uint64_t* state, uint64_t stream) { + return std::complex(RandomToTypeNormal(state, stream), + RandomToTypeNormal(state, stream)); } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeNormal >(uint64_t* state) { - return std::complex(RandomToTypeNormal(state), - RandomToTypeNormal(state)); +std::complex RandomToTypeNormal >(uint64_t* state, uint64_t stream) { + return std::complex(RandomToTypeNormal(state, stream), + RandomToTypeNormal(state, stream)); } @@ -226,17 +242,38 @@ template class NormalRandomGenerator { // Uses the given "seed" if non-zero, otherwise uses a random seed. EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) { m_state = PCG_XSH_RS_state(seed); + #ifdef EIGEN_USE_SYCL + // In SYCL it is not possible to build PCG_XSH_RS_state in one step. + // Therefor, we need two steps to initializate the m_state. + // IN SYCL, the constructor of the functor is s called on the CPU + // and we get the clock seed here from the CPU. However, This seed is + //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function. + // and only available on the Operator() function (which is called on the GPU). + // Therefore, the thread Id injection is not available at this stage. However when the operator() + //is called the thread ID will be avilable. So inside the opeator, + // we add the thrreadID, BlockId,... (which is equivalent of i) + //to the seed and construct the unique m_state per thead similar to cuda. + m_exec_once =false; + #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator( const NormalRandomGenerator& other) { m_state = other.m_state; +#ifdef EIGEN_USE_SYCL + m_exec_once=other.m_exec_once; +#endif } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const { - uint64_t local_state = m_state + i; - T result = RandomToTypeNormal(&local_state); - m_state = local_state; + #ifdef EIGEN_USE_SYCL + if(!m_exec_once) { + // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread + m_state += (i * 6364136223846793005ULL); + m_exec_once =true; + } + #endif + T result = RandomToTypeNormal(&m_state, i); return result; } @@ -244,16 +281,25 @@ template class NormalRandomGenerator { Packet packetOp(Index i) const { const int packetSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX T values[packetSize]; - uint64_t local_state = m_state + i; + #ifdef EIGEN_USE_SYCL + if(!m_exec_once) { + // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread + m_state += (i * 6364136223846793005ULL); + m_exec_once =true; + } + #endif + EIGEN_UNROLL_LOOP for (int j = 0; j < packetSize; ++j) { - values[j] = RandomToTypeNormal(&local_state); + values[j] = RandomToTypeNormal(&m_state, i); } - m_state = local_state; return internal::pload(values); } private: mutable uint64_t m_state; + #ifdef EIGEN_USE_SYCL + mutable bool m_exec_once; + #endif }; diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 41d0d0022..583f46256 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -11,8 +11,20 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H +// clang is incompatible with the CUDA syntax wrt making a kernel a class friend, +// so we'll use a macro to make clang happy. +#ifndef KERNEL_FRIEND +#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__)) +#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 +#else +#define KERNEL_FRIEND friend +#endif +#endif + + namespace Eigen { + /** \class TensorReduction * \ingroup CXX11_Tensor_Module * @@ -32,6 +44,7 @@ namespace internal { typedef typename XprType::Nested Nested; static const int NumDimensions = XprTraits::NumDimensions - array_size::value; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; template struct MakePointer { // Intermediate typedef to workaround MSVC issue. @@ -152,7 +165,9 @@ struct GenericDimReducer<-1, Self, Op> { } }; -template +template struct InnerMostDimReducer { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { typename Self::CoeffReturnType accum = reducer.initialize(); @@ -164,23 +179,100 @@ struct InnerMostDimReducer { }; template -struct InnerMostDimReducer { +struct InnerMostDimReducer { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { - const int packetSize = internal::unpacket_traits::size; + const typename Self::Index packetSize = internal::unpacket_traits::size; const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; - typename Self::PacketReturnType p = reducer.template initializePacket(); + typename Self::PacketReturnType paccum = reducer.template initializePacket(); for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) { - reducer.reducePacket(self.m_impl.template packet(firstIndex + j), &p); + reducer.reducePacket(self.m_impl.template packet(firstIndex + j), &paccum); } typename Self::CoeffReturnType accum = reducer.initialize(); for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) { reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); } - return reducer.finalizeBoth(accum, p); + return reducer.finalizeBoth(accum, paccum); } }; -template +#if !defined(EIGEN_HIPCC) +static const int kLeafSize = 1024; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType + reduce(const Self& self, typename Self::Index firstIndex, + typename Self::Index numValuesToReduce, Op& reducer) { + typename Self::CoeffReturnType accum = reducer.initialize(); + if (numValuesToReduce > kLeafSize) { + const typename Self::Index half = numValuesToReduce / 2; + reducer.reduce(reduce(self, firstIndex, half, reducer), &accum); + reducer.reduce( + reduce(self, firstIndex + half, numValuesToReduce - half, reducer), + &accum); + } else { + for (typename Self::Index j = 0; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + } + return reducer.finalize(accum); + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType + reduce(const Self& self, typename Self::Index firstIndex, + typename Self::Index numValuesToReduce, Op& reducer) { + const typename Self::Index packetSize = + internal::unpacket_traits::size; + typename Self::CoeffReturnType accum = reducer.initialize(); + if (numValuesToReduce > packetSize * kLeafSize) { + // Make sure the split point is aligned on a packet boundary. + const typename Self::Index split = + packetSize * + divup(firstIndex + divup(numValuesToReduce, typename Self::Index(2)), + packetSize); + const typename Self::Index num_left = + numext::mini(split - firstIndex, numValuesToReduce); + reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum); + if (num_left < numValuesToReduce) { + reducer.reduce( + reduce(self, split, numValuesToReduce - num_left, reducer), &accum); + } + return reducer.finalize(accum); + } else { + const typename Self::Index UnrollSize = + (numValuesToReduce / (2*packetSize)) * 2*packetSize; + const typename Self::Index VectorizedSize = + (numValuesToReduce / packetSize) * packetSize; + typename Self::PacketReturnType paccum = + reducer.template initializePacket(); + typename Self::PacketReturnType paccum2 = + reducer.template initializePacket(); + for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) { + reducer.reducePacket( + self.m_impl.template packet(firstIndex + j), &paccum); + reducer.reducePacket( + self.m_impl.template packet(firstIndex + j + packetSize), + &paccum2); + } + for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) { + reducer.reducePacket(self.m_impl.template packet( + firstIndex + j), &paccum); + } + reducer.reducePacket(paccum2, &paccum); + for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; + ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalizeBoth(accum, paccum); + } + } +}; +#endif + +template struct InnerMostDimPreserver { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) { eigen_assert(false && "should never be called"); @@ -215,11 +307,11 @@ struct InnerMostDimPreserver<-1, Self, Op, true> { }; // Default full reducer -template +template struct FullReducer { static const bool HasOptimizedImplementation = false; - static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) { + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::EvaluatorPointerType output) { const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); } @@ -229,7 +321,7 @@ struct FullReducer { #ifdef EIGEN_USE_THREADS // Multithreaded full reducers template + bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)> struct FullReducerShard { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer, @@ -242,8 +334,8 @@ struct FullReducerShard { // Multithreaded full reducer template struct FullReducer { - static const bool HasOptimizedImplementation = !Op::IsStateful; - static const int PacketSize = + static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful; + static const Index PacketSize = unpacket_traits::size; // launch one reducer per thread and accumulate the result. @@ -320,29 +412,58 @@ struct OuterReducer { } }; +#ifdef EIGEN_USE_SYCL +// Default Generic reducer +template +struct GenericReducer { + static const bool HasOptimizedImplementation = false; -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) -template -__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); + return true; + } +}; +#endif + +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*); -#ifdef EIGEN_HAS_CUDA_FP16 -template -__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); -template -__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*); -template -__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*); +#if defined(EIGEN_HAS_GPU_FP16) +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits::type*); +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits::type*); +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); #endif -template -__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); -template -__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); #endif +/** + * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op. + * This allows the reduction to have a different type for the accumulator than the input data type. + * If this is the case, the functor needs to have two reduce method: one for reducing an element of the input + * with the accumulator and the other for reducing two accumulators. + * Such a reducer can be useful for instance when the accumulator is a boolean or a bitset that checks for + * some properties of the input. + */ +template +struct ReductionReturnType { +#if defined(EIGEN_USE_SYCL) + typedef typename remove_const().initialize())>::type type; +#else + typedef typename remove_const::type type; +#endif +}; + } // end namespace internal @@ -376,11 +497,15 @@ class TensorReductionOp : public TensorBase +struct TensorReductionEvaluatorBase; // Eval as rvalue template class MakePointer_, typename Device> -struct TensorEvaluator, Device> +struct TensorReductionEvaluatorBase, Device> { + typedef internal::reducer_traits ReducerTraits; + typedef Dims ReducedDims; typedef TensorReductionOp XprType; typedef typename XprType::Index Index; typedef ArgType ChildType; @@ -390,26 +515,42 @@ struct TensorEvaluator, static const int NumOutputDims = NumInputDims - NumReducedDims; typedef typename internal::conditional, DSizes >::type Dimensions; typedef typename XprType::Scalar Scalar; - typedef TensorEvaluator, Device> Self; + typedef TensorReductionEvaluatorBase, Device> Self; static const bool InputPacketAccess = TensorEvaluator::PacketAccess; - typedef typename internal::remove_const::type CoeffReturnType; + typedef typename internal::ReductionReturnType::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const Index PacketSize = PacketType::size; + + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + // Subset of strides of the input tensor for the non-reduced dimensions. + // Indexed by output dimensions. + static const int NumPreservedStrides = max_n_1::size; enum { IsAligned = false, - PacketAccess = Self::InputPacketAccess && Op::PacketAccess, + PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess, + BlockAccess = false, + PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; static const bool RunningFullReduction = (NumOutputDims==0); - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims()) + EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) { EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), @@ -434,11 +575,13 @@ struct TensorEvaluator, m_outputStrides[0] = 1; for (int i = 1; i < NumOutputDims; ++i) { m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } else { - m_outputStrides.back() = 1; + m_outputStrides[NumOutputDims - 1] = 1; for (int i = NumOutputDims - 2; i >= 0; --i) { m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } } @@ -466,6 +609,7 @@ struct TensorEvaluator, ++reduceIndex; } else { m_preservedStrides[outputIndex] = input_strides[i]; + m_output_to_input_dim_map[outputIndex] = i; ++outputIndex; } } @@ -475,13 +619,19 @@ struct TensorEvaluator, if (NumOutputDims == 0) { m_preservedStrides[0] = internal::array_prod(input_dims); } + + m_numValuesToReduce = + NumOutputDims == 0 + ? internal::array_prod(input_dims) + : (static_cast(Layout) == static_cast(ColMajor)) + ? m_preservedStrides[0] + : m_preservedStrides[NumOutputDims - 1]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_::Type data) { - m_impl.evalSubExprsIfNeeded(NULL); - + EIGEN_STRONG_INLINE + bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) { // Use the FullReducer if possible. if ((RunningFullReduction && RunningOnSycl) ||(RunningFullReduction && internal::FullReducer::HasOptimizedImplementation && @@ -489,7 +639,7 @@ struct TensorEvaluator, !RunningOnGPU))) { bool need_assign = false; if (!data) { - m_result = static_cast(m_device.allocate(sizeof(CoeffReturnType))); + m_result = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType)))); data = m_result; need_assign = true; } @@ -497,20 +647,9 @@ struct TensorEvaluator, internal::FullReducer::run(*this, reducer, m_device, data); return need_assign; } - else if(RunningOnSycl){ - const Index num_values_to_reduce = internal::array_prod(m_reducedDims); - const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); - if (!data) { - data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); - m_result = data; - } - Op reducer(m_reducer); - internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); - return (m_result != NULL); - } // Attempt to use an optimized reduction. - else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) { + else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) { bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { if (static_cast(Layout) == static_cast(ColMajor)) { @@ -524,8 +663,8 @@ struct TensorEvaluator, const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); if (!data) { - if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) { - data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) { + data = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); m_result = data; } else { @@ -533,9 +672,10 @@ struct TensorEvaluator, } } Op reducer(m_reducer); + // For SYCL this if always return false if (internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { if (m_result) { - m_device.deallocate(m_result); + m_device.deallocate_temp(m_result); m_result = NULL; } return true; @@ -557,8 +697,8 @@ struct TensorEvaluator, const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); if (!data) { - if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) { - data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) || (RunningOnSycl)) { + data = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); m_result = data; } else { @@ -566,9 +706,10 @@ struct TensorEvaluator, } } Op reducer(m_reducer); + // For SYCL this if always return false if (internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { if (m_result) { - m_device.deallocate(m_result); + m_device.deallocate_temp(m_result); m_result = NULL; } return true; @@ -576,21 +717,54 @@ struct TensorEvaluator, return (m_result != NULL); } } + #if defined(EIGEN_USE_SYCL) + // If there is no Optimised version for SYCL, the reduction expression + // must break into two subexpression and use the SYCL generic Reducer on the device. + if(RunningOnSycl) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + data = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); + m_result = data; + } + Op reducer(m_reducer); + internal::GenericReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + return (m_result != NULL); + } + #endif } return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE + void + evalSubExprsIfNeededAsync(EvaluatorPointerType data, + EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) { + done(evalSubExprsIfNeededCommon(data)); + }); + } +#endif + + EIGEN_STRONG_INLINE + bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + m_impl.evalSubExprsIfNeeded(NULL); + return evalSubExprsIfNeededCommon(data); + } + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); if (m_result) { - m_device.deallocate(m_result); + m_device.deallocate_temp(m_result); m_result = NULL; } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) { + if (( RunningFullReduction || RunningOnGPU) && m_result ) { return *(m_result + index); } Op reducer(m_reducer); @@ -662,37 +836,52 @@ struct TensorEvaluator, } } - EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return m_result; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& impl() const { return m_impl; } - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} - /// added for sycl in order to re-construct the reduction eval on the device for the sub-kernel - const Dims& xprDims() const {return m_xpr_dims;} - + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } + EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC const Device& device() const { return m_device; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + m_result.bind(cgh); + } +#endif private: template friend struct internal::GenericDimReducer; - template friend struct internal::InnerMostDimReducer; + template friend struct internal::InnerMostDimReducer; template friend struct internal::InnerMostDimPreserver; template friend struct internal::FullReducer; #ifdef EIGEN_USE_THREADS template friend struct internal::FullReducerShard; #endif -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - template friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); -#ifdef EIGEN_HAS_CUDA_FP16 - template friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); - template friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); - template friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) + template KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*); +#if defined(EIGEN_HAS_GPU_FP16) + template KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits::type*); + template KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits::type*); + template KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); #endif - template friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); - template friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); #endif +#if defined(EIGEN_USE_SYCL) + template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::GenericNondeterministicReducer; + // SYCL need the Generic reducer for the case the recution algorithm is neither inner, outer, and full reducer + template friend struct internal::GenericReducer; +#endif + + template friend struct internal::InnerReducer; + struct BlockIteratorState { + Index input_dim; + Index output_size; + Index output_count; + }; + // Returns the Index in the input tensor of the first value that needs to be // used to compute the reduction at output index "index". EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { @@ -741,10 +930,12 @@ struct TensorEvaluator, Dimensions m_dimensions; // Precomputed strides for the output tensor. array m_outputStrides; - // Subset of strides of the input tensor for the non-reduced dimensions. - // Indexed by output dimensions. - static const int NumPreservedStrides = max_n_1::size; + array, NumOutputDims> m_fastOutputStrides; array m_preservedStrides; + // Map from output to input dimension index. + array m_output_to_input_dim_map; + // How many values go into each reduction + Index m_numValuesToReduce; // Subset of strides of the input tensor for the reduced dimensions. // Indexed by reduced dimensions. @@ -760,7 +951,7 @@ struct TensorEvaluator, Op m_reducer; // For full reductions -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) static const bool RunningOnGPU = internal::is_same::value; static const bool RunningOnSycl = false; #elif defined(EIGEN_USE_SYCL) @@ -770,10 +961,36 @@ static const bool RunningOnGPU = false; static const bool RunningOnGPU = false; static const bool RunningOnSycl = false; #endif - typename MakePointer_::Type m_result; + EvaluatorPointerType m_result; - const Device& m_device; - const Dims& m_xpr_dims; + const Device EIGEN_DEVICE_REF m_device; +}; + +template class MakePointer_, typename Device> +struct TensorEvaluator, Device> +: public TensorReductionEvaluatorBase, Device> { + typedef TensorReductionEvaluatorBase, Device> Base; + EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){} +}; + + +template class MakePointer_> +struct TensorEvaluator, Eigen::SyclDevice> +: public TensorReductionEvaluatorBase, Eigen::SyclDevice> { + + typedef TensorReductionEvaluatorBase, Eigen::SyclDevice> Base; + EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){} + // The coeff function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel + //Therefore the coeff function should be overridden by for SYCL kernel + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const { + return *(this->data() + index); + } + // The packet function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel + //Therefore the packet function should be overridden by for SYCL kernel + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const { + return internal::pload(this->data() + index); + } }; } // end namespace Eigen diff --git a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 65638b6a8..68780cd3c 100644 --- a/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/TMB/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -1,750 +1,6 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H -#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H - -namespace Eigen { -namespace internal { - - -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) -// Full reducers for GPU, don't vectorize for now - -// Reducer function that enables multiple cuda thread to safely accumulate at the same -// output address. It basically reads the current value of the output variable, and -// attempts to update it with the new value. If in the meantime another cuda thread -// updated the content of the output address it will try again. -template -__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { -#if __CUDA_ARCH__ >= 300 - if (sizeof(T) == 4) - { - unsigned int oldval = *reinterpret_cast(output); - unsigned int newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - unsigned int readback; - while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { - oldval = readback; - newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - } - } - else if (sizeof(T) == 8) { - unsigned long long oldval = *reinterpret_cast(output); - unsigned long long newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - unsigned long long readback; - while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) { - oldval = readback; - newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - } - } - else { - assert(0 && "Wordsize not supported"); - } -#else - assert(0 && "Shouldn't be called on unsupported device"); -#endif -} - -// We extend atomicExch to support extra data types -template -__device__ inline Type atomicExchCustom(Type* address, Type val) { - return atomicExch(address, val); -} - -template <> -__device__ inline double atomicExchCustom(double* address, double val) { - unsigned long long int* address_as_ull = reinterpret_cast(address); - return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); -} - -#ifdef EIGEN_HAS_CUDA_FP16 -template