From 52807002c99563b65b087c7929cd0f5f8bf82176 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Sun, 30 Dec 2018 16:39:11 -0700 Subject: [PATCH 01/95] Format tweaks. --- .../standard/pipeline/advance_p_pipeline.cc | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline.cc b/src/species_advance/standard/pipeline/advance_p_pipeline.cc index a222d76f..e275b116 100644 --- a/src/species_advance/standard/pipeline/advance_p_pipeline.cc +++ b/src/species_advance/standard/pipeline/advance_p_pipeline.cc @@ -252,9 +252,13 @@ advance_p_pipeline( species_t * RESTRICT sp, int rank; - if ( !sp || !aa || !ia || sp->g != aa->g || sp->g != ia->g ) + if ( ! sp || + ! aa || + ! ia || + sp->g != aa->g || + sp->g != ia->g ) { - ERROR( ( "Bad args" ) ); + ERROR( ( "Bad args." ) ); } args->p0 = sp->p; @@ -264,10 +268,10 @@ advance_p_pipeline( species_t * RESTRICT sp, args->seg = seg; args->g = sp->g; - args->qdt_2mc = (sp->q*sp->g->dt)/(2*sp->m*sp->g->cvac); - args->cdt_dx = sp->g->cvac*sp->g->dt*sp->g->rdx; - args->cdt_dy = sp->g->cvac*sp->g->dt*sp->g->rdy; - args->cdt_dz = sp->g->cvac*sp->g->dt*sp->g->rdz; + args->qdt_2mc = ( sp->q * sp->g->dt ) / ( 2 * sp->m * sp->g->cvac ); + args->cdt_dx = sp->g->cvac * sp->g->dt * sp->g->rdx; + args->cdt_dy = sp->g->cvac * sp->g->dt * sp->g->rdy; + args->cdt_dz = sp->g->cvac * sp->g->dt * sp->g->rdz; args->qsp = sp->q; args->np = sp->np; @@ -300,7 +304,7 @@ advance_p_pipeline( species_t * RESTRICT sp, { if ( args->seg[rank].n_ignored ) { - WARNING( ( "Pipeline %i ran out of storage for %i movers", + WARNING( ( "Pipeline %i ran out of storage for %i movers.", rank, args->seg[rank].n_ignored ) ); } From 49a2f0ff3ff580bb8d8ad3168def1c14274618b8 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 7 Jan 2019 13:28:39 -0700 Subject: [PATCH 02/95] Fix what appears to be a memory error that dates back to the v407 version. 
--- .../standard/pipeline/advance_p_pipeline.cc | 2 ++ .../standard/pipeline/advance_p_pipeline_v16.cc | 7 +++++-- .../standard/pipeline/advance_p_pipeline_v4.cc | 7 +++++-- .../standard/pipeline/advance_p_pipeline_v8.cc | 7 +++++-- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline.cc b/src/species_advance/standard/pipeline/advance_p_pipeline.cc index e275b116..8dde6f27 100644 --- a/src/species_advance/standard/pipeline/advance_p_pipeline.cc +++ b/src/species_advance/standard/pipeline/advance_p_pipeline.cc @@ -78,8 +78,10 @@ advance_p_pipeline_scalar( advance_p_pipeline_args_t * args, // The host gets the first accumulator array. if ( pipeline_rank != n_pipeline ) + { a0 += ( 1 + pipeline_rank ) * POW2_CEIL( (args->nx+2)*(args->ny+2)*(args->nz+2), 2 ); + } // Process particles for this pipeline. diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline_v16.cc b/src/species_advance/standard/pipeline/advance_p_pipeline_v16.cc index bc152588..ef6f8b1a 100644 --- a/src/species_advance/standard/pipeline/advance_p_pipeline_v16.cc +++ b/src/species_advance/standard/pipeline/advance_p_pipeline_v16.cc @@ -101,8 +101,11 @@ advance_p_pipeline_v16( advance_p_pipeline_args_t * args, // Determine which accumulator array to use. // The host gets the first accumulator array. - a0 += ( 1 + pipeline_rank ) * - POW2_CEIL( (args->nx+2)*(args->ny+2)*(args->nz+2), 2 ); + if ( pipeline_rank != n_pipeline ) + { + a0 += ( 1 + pipeline_rank ) * + POW2_CEIL( (args->nx+2)*(args->ny+2)*(args->nz+2), 2 ); + } // Process the particle blocks for this pipeline. 
diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline_v4.cc b/src/species_advance/standard/pipeline/advance_p_pipeline_v4.cc index 4e23770e..19d82ade 100644 --- a/src/species_advance/standard/pipeline/advance_p_pipeline_v4.cc +++ b/src/species_advance/standard/pipeline/advance_p_pipeline_v4.cc @@ -76,8 +76,11 @@ advance_p_pipeline_v4( advance_p_pipeline_args_t * args, // Determine which accumulator array to use. // The host gets the first accumulator array. - a0 += ( 1 + pipeline_rank ) * - POW2_CEIL( (args->nx+2)*(args->ny+2)*(args->nz+2), 2 ); + if ( pipeline_rank != n_pipeline ) + { + a0 += ( 1 + pipeline_rank ) * + POW2_CEIL( (args->nx+2)*(args->ny+2)*(args->nz+2), 2 ); + } // Process the particle blocks for this pipeline. diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline_v8.cc b/src/species_advance/standard/pipeline/advance_p_pipeline_v8.cc index 84ed3916..0890d554 100644 --- a/src/species_advance/standard/pipeline/advance_p_pipeline_v8.cc +++ b/src/species_advance/standard/pipeline/advance_p_pipeline_v8.cc @@ -80,8 +80,11 @@ advance_p_pipeline_v8( advance_p_pipeline_args_t * args, // Determine which accumulator array to use. // The host gets the first accumulator array. - a0 += ( 1 + pipeline_rank ) * - POW2_CEIL( (args->nx+2)*(args->ny+2)*(args->nz+2), 2 ); + if ( pipeline_rank != n_pipeline ) + { + a0 += ( 1 + pipeline_rank ) * + POW2_CEIL( (args->nx+2)*(args->ny+2)*(args->nz+2), 2 ); + } // Process the particle blocks for this pipeline. From d1227049ad99c663034c61f9d2e160565d37307f Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Thu, 31 Jan 2019 01:34:43 -0700 Subject: [PATCH 03/95] Add configurable and documented build scripts for building VPIC on LANL ATS-1 and CTS-1 machines. Document how to use these two scripts. 
--- README.md | 22 +- arch/lanl-ats1 | 967 +++++++++++++++++++++++++++++++++++++++++++++++++ arch/lanl-cts1 | 829 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1815 insertions(+), 3 deletions(-) create mode 100755 arch/lanl-ats1 create mode 100755 arch/lanl-cts1 diff --git a/README.md b/README.md index c36de28b..d20a36b8 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ the top-level source directory: cd build ``` -The `./arch` directory also contains various cmake scripts (including specific build options) which can help with building +The `./arch` directory also contains various cmake scripts (including specific build options) which can help with building. They can be invoked using something like: @@ -97,9 +97,25 @@ After configuration, simply type: make ``` -Advanced users may chose to instead invoke `cmake` directly and hand select options +Two scripts in the `./arch` directory are of particular note: lanl-ats1 and lanl-cts1. These scripts provide a default way to build VPIC +on LANL ATS-1 clusters such as Trinity and Trinitite and LANL CTS-1 clusters. The LANL ATS-1 clusters are the first generation +of DOE Advanced Technology Systems and consist of a partition of dual socket Intel Haswell nodes and a partition of single socket +Intel Knights Landing nodes. The LANL CTS-1 clusters are the first generation of DOE Commodity Technology Systems and consist of +dual socket Intel Broadwell nodes running the TOSS 3.3 operating system. The lanl-ats1 and lanl-cts1 scripts are heavily +documented and can be configured to provide a large variety of custom builds for their respective platform types. These +scripts could also serve as a good starting point for development of a build script for other machine types. Because these +scripts also configure the users build environment via the use of module commands, the scripts run both the cmake and make +commands. 
-GCC users should ensure the `-fno-strict-aliasing` compiler flag is set (as shown in `./arch/generic-gcc-sse`) +From the user-created build directory, these scripts can be invoked as follows: + +```bash + ../arch/lanl-ats1 +``` + +Advanced users may choose to instead invoke `cmake` directly and hand select options. + +GCC users should ensure the `-fno-strict-aliasing` compiler flag is set (as shown in `./arch/generic-gcc-sse`). # Building an example input deck diff --git a/arch/lanl-ats1 b/arch/lanl-ats1 new file mode 100755 index 00000000..981377ae --- /dev/null +++ b/arch/lanl-ats1 @@ -0,0 +1,967 @@ +#! /usr/bin/env bash +#------------------------------------------------------------------------------# +# This script supports building VPIC on ATS-1 machines at Los Alamos National +# Laboratory (LANL). These machines run the Cray Linux Environment Operating +# System and have two compute partitions, a Haswell partition and a Knights +# Landing (KNL) partition. Both processor types are Intel processors. These +# machines provide three compiler choices: Intel, GNU and Cray compilers. Two +# MPI implementations are provided: Cray Mpich and Open MPI. +# +# Normal users should not need to change this script if building VPIC to run +# on the KNL nodes of ATS-1 machines and are happy with defaults. +# +# If normal users desire to build VPIC to run on the Haswell nodes of ATS-1 +# machines, they will need to change this script in two places: first in the +# section where a node type is chosen and second in the section where the type +# of vector intrinsics used is chosen. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Get the path to the project from which this script was called. +#------------------------------------------------------------------------------# + +src_dir="${0%/*}/.." 
+ +#------------------------------------------------------------------------------# +# Configure the type of build that we want to perform. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Choose a compiler. +#------------------------------------------------------------------------------# +# One of the compiler choices in this section must be chosen. Valid options +# are the following. +# +# INT: Intel compilers +# GNU: GNU compilers +# CCE: Cray compilers +# +# Note that selecting CCE for the Cray compilers currently does not work. The +# main reason why you might want to compile with the Cray compilers is to use +# some of the Cray specific tools like Reveal or a small set of features in +# the CrayPat profiling software. This is not a common use case for users. +#------------------------------------------------------------------------------# + +VCOM="INT" +#VCOM="GNU" +#VCOM="CCE" + +#------------------------------------------------------------------------------# +# Choose a processor node type. +#------------------------------------------------------------------------------# +# One of the node types must be chosen. Valid options are the following. +# +# KNL: Knights Landing nodes +# HSW: Haswell nodes +# +# If HSW, for Haswell, is chosen, you must also change the section on vector +# intrinsics support below to turn off support for V16_AVX512. Normally, you +# would also turn on support for V8_AVX2. See the documentation on the vector +# intrinsics section below for more details. +#------------------------------------------------------------------------------# + +KNL="yes" +#HSW="yes" + +#------------------------------------------------------------------------------# +# Choose an MPI implementation. +#------------------------------------------------------------------------------# +# One of the MPI library choices must be chosen. 
Valid options are the +# following. +# +# CMPI: Cray Mpich, the Cray supported MPI library +# OMPI: Open MPI +#------------------------------------------------------------------------------# + +VMPI="CMPI" +#VMPI="OMPI" + +#------------------------------------------------------------------------------# +# Choose a thread model. +#------------------------------------------------------------------------------# +# One of the two available thread models must be chosen. Valid options are the +# following. +# +# PTH: Pthreads +# OMP: OpenMP +#------------------------------------------------------------------------------# + +VTHR="PTH" +#VTHR="OMP" + +#------------------------------------------------------------------------------# +# Choose format of status update output. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_MORE_DIGITS=OFF, the output has two significant figures. +# +# If SET_MORE_DIGITS=ON, the output has four significant figures. +#------------------------------------------------------------------------------# + +SET_MORE_DIGITS="OFF" +#SET_MORE_DIGITS="ON" + +#------------------------------------------------------------------------------# +# Choose type of vector intrinsics support. +#------------------------------------------------------------------------------# +# Note the following constraints. +# +# Each of the nine variables in this section must have a configured value. +# This is because the corresponding "USE" cmake variable is set on the cmake +# command line below to allow any possible combinations to be configured using +# a single cmake command. +# +# If all values are configured as OFF, the scalar implementations of VPIC +# functions which are not vectorized will be used. +# +# It is possible to have a vector version configured as ON for each of the +# three vector widths i.e. V4, V8 and V16. 
In that scenario, if a given VPIC +# function has a V16 implementation, that will be used. If there is not a V16 +# implementation but there is a V8 implementation, that will be used. If there +# is not a V16 or V8 implementation but there is a V4 implementation, that +# will be used. Finally, for functions that have no vector implementations, +# the scalar version will be used. +# +# Currently, it is recommended to always configure the appropriate V4 version +# as on if using vector versions because there are key functions that only +# have a V4 version because the current algorithm does not generalize to +# longer vector lengths. An example is the move_p function. Since the V4 +# versions are generally more performant than the scalar versions, it makes +# sense to use them even when using the longer vector length implementations +# for other VPIC functions. +# +# In summary, when using vector versions on a machine with 256 bit SIMD, the +# V4 and V8 implementations should be configured as ON. When using a machine +# with 512 bit SIMD, V4 and V16 implementations should be configured as ON. +# +# First, we turn all of the vector options OFF. Then, we turn on the ones we +# want. +#------------------------------------------------------------------------------# + +SET_V4_PORTABLE="OFF" +SET_V4_SSE="OFF" +SET_V4_AVX="OFF" +SET_V4_AVX2="OFF" +SET_V8_PORTABLE="OFF" +SET_V8_AVX="OFF" +SET_V8_AVX2="OFF" +SET_V16_PORTABLE="OFF" +SET_V16_AVX512="OFF" + +#SET_V4_PORTABLE="ON" +#SET_V4_SSE="ON" +#SET_V4_AVX="ON" +SET_V4_AVX2="ON" +#SET_V8_PORTABLE="ON" +#SET_V8_AVX="ON" +#SET_V8_AVX2="ON" +#SET_V16_PORTABLE="ON" +SET_V16_AVX512="ON" + +#------------------------------------------------------------------------------# +# Choose a particle sort implementation. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are the +# following. 
+# +# LSORT: legacy, thread serial sort +# TSORT: thread parallel sort +# +# The LSORT particle sort implementation is the thread serial particle sort +# implementation from the legacy v407 version of VPIC. This implementation +# supports both in-place and out-of-place sorting of the particles. It is very +# competitive with the thread parallel sort implementation for a small number +# of threads per MPI rank, i.e. 4 or less, especially on KNL because sorting +# the particles in-place allows the fraction of particles stored in High +# Bandwidth Memory (HBM) to remain stored in HBM. Also, the memory footprint +# of VPIC is reduced by the memory of a particle array which can be significant +# for particle dominated problems. +# +# The TSORT particle sort implementation is a thread parallel implementation. +# Currently, it can only perform out-of-place sorting of the particles. It will +# be more performant than the LSORT implementation when using many threads per +# MPI rank but uses more memory because of the out-of-place sort. +#------------------------------------------------------------------------------# + +VSORT="LSORT" +#VSORT="TSORT" + +#------------------------------------------------------------------------------# +# Choose type of library to build. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is to build a static library, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_SHARED_LIBS="OFF" +#SET_SHARED_LIBS="ON" + +#------------------------------------------------------------------------------# +# Choose integrated test support. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is not to build the integrated tests, i.e. OFF. 
+#------------------------------------------------------------------------------# + +SET_INTEGRATED_TESTS="OFF" +#SET_INTEGRATED_TESTS="ON" + +#------------------------------------------------------------------------------# +# Choose unit test support. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is not to build the unit tests, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_UNIT_TESTS="OFF" +#SET_UNIT_TESTS="ON" + +#------------------------------------------------------------------------------# +# Choose the CMake build type. +#------------------------------------------------------------------------------# +# One of the available options must be chosen. Valid options depend on build +# types available in the CMake version but include at least the following. +# +# Release: In general, the default for CMake. +# None: Tells CMake not to use any pre-defined build type and gives VPIC build +# system total control of CMake variables defined on cmake command line. +#------------------------------------------------------------------------------# + +SET_BUILD_TYPE="Release" +#SET_BUILD_TYPE="None" + +#------------------------------------------------------------------------------# +# Choose number of parallel make processes for build. +#------------------------------------------------------------------------------# +# If NJ variable is not defined, "make" will perform a parallel build using +# maximum number of processors on the compilation machine. +# +# If using VERBOSE = 1 and NJ > 1, verbose output will be garbled by many +# processes writing to STDOUT at the same time and will be difficult to +# interpret. +# +# When using VERBOSE = 1, use of NJ = 1 is recommended. +# +# The default is to use a modest number of processes in the parallel build. 
+# +# Comment out default below to use all processors on compilation machine. +#------------------------------------------------------------------------------# + +NJ=8 +#NJ=1 + +#------------------------------------------------------------------------------# +# Choose verbosity of "make" output. +#------------------------------------------------------------------------------# +# Setting VERBOSE = 1 causes "make" to output commands it is executing. +# +# This information is useful if debugging a failed build. +# +# Setting VERBOSE = 0 or leaving VERBOSE undefined results in a quiet build. +# +# The default is a quiet build. +#------------------------------------------------------------------------------# + +SET_VERBOSE=0 +#SET_VERBOSE=1 + +#------------------------------------------------------------------------------# +# Choose versions of modules to use if default is not desired. +#------------------------------------------------------------------------------# +# No choice is required in this section. +# +# Some possible alternative module versions are provided below. Change as +# needed or desired. +# +# This section may need to be updated periodically as the module environment +# evolves because of updates to operating system and programming environment. +#------------------------------------------------------------------------------# + +#VERSION_CMAKE=3.12.1 + +#VERSION_INTEL=19.0.1 +#VERSION_INTEL_VTUNE_AMPLIFIER=2019.1.0 +#VERSION_INTEL_VECTOR_ADVISOR=2019.1.0 +#VERSION_INTEL_INSPECTOR=2019.1.0 +#VERSION_INTEL_TRACE_ANALYZER=2019.1.022 + +#VERSION_GNU=7.3.0 + +#VERSION_CCE=9.0.0.21672 +#VERSION_CRAY_MPICH=7.7.4.4 +#VERSION_CRAY_PERF_TOOLS=7.0.4 + +#VERSION_OPEN_MPI=3.1.2 + +#VERSION_FORGE=18.3 + +#------------------------------------------------------------------------------# +# Unless the user wants to modify options to the compiler, no changes should +# be needed below this point. 
+# +# If the user desires to configure compiler options, proceed to the section +# below for the chosen compiler. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Configure default compiler names to use Cray wrapper scripts. +#------------------------------------------------------------------------------# + +VPIC_COMPILER_C="cc" +VPIC_COMPILER_CXX="CC" + +if [ "$VMPI" = "OMPI" ] +then + VPIC_COMPILER_C="mpicc" + VPIC_COMPILER_CXX="mpicxx" +fi + +#------------------------------------------------------------------------------# +# Configure options for the Intel compilers. +#------------------------------------------------------------------------------# + +if [ "$VCOM" = "INT" ] +then + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O3" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O3" could be + # "-O2" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O3" + + #--------------------------------------------------------------------------# + # Use of "-inline-forceinline" overrides default heuristics of compiler + # and forces inlining of functions marked with inline keyword if compiler + # is able to inline. 
For VPIC, this option has mainly been used when using + # a portable implementation to force inlining by compiler and also when + # use of "-Winline" option identifies functions not being inlined that are + # marked with inline keyword. + # + # Use of "-qoverride-limits" causes certain internal compiler limits to be + # ignored that are used to limit memory usage and excessive compile times + # by the compiler. + # + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops to + # be vectorized always, regardless of computation work volume. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -inline-forceinline" + #FLAGS_CXX_COMPILER+=" -vec-threshold0" + FLAGS_CXX_COMPILER+=" -qoverride-limits" + + #--------------------------------------------------------------------------# + # Use of "-no-ansi-alias" informs compiler that VPIC does not obey ANSI + # aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -no-ansi-alias" + + #--------------------------------------------------------------------------# + # Use of "-Winline" causes compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + # + # Use of "-craype-verbose" causes Cray compiler wrapper script to print + # command it is forwarding to actual compiler for invocation. This is very + # useful for producing a build log to make sure compiler is being invoked + # with expected options. 
+ #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Winline" + FLAGS_CXX_COMPILER+=" -craype-verbose" + + #--------------------------------------------------------------------------# + # Use of "-qopt-report=5" specifies level of detail in compiler reports. + # This is the maximum level of detail. + # + # Use of "-qopt-report-phase=all" causes all phases of compilation process + # to provide output for compiler reports. Compiler reports are useful for + # understanding how compiler is optimizing various parts of VPIC. + # + # Use of "-diag-disable 10397" disables printing of diagnostic message + # that compiler reports are being generated. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -qopt-report=5" + FLAGS_CXX_COMPILER+=" -qopt-report-phase=all" + FLAGS_CXX_COMPILER+=" -diag-disable 10397" + + #--------------------------------------------------------------------------# + # Use of "-Wl,--export-dynamic" removes following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Wl,--export-dynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. 
+ #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use of "-qopt-zmm-usage=high" causes the compiler to generate zmm code, + # i.e. AVX-512 code, without any restrictions. Extensive use of AVX-512 + # code causes the CPU core to down clock or throttle to avoid overheating. + # The default is for the compiler to use some internal limits on how much + # AVX-512 instructions are used. This is relevant on ATS-1 systems only + # for KNL processors. + #--------------------------------------------------------------------------# + + if [ "$KNL" = "yes" ] + then + FLAGS_CXX_COMPILER+=" -qopt-zmm-usage=high" + fi + + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O3" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O3" could be + # "-O2" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O3" + + #--------------------------------------------------------------------------# + # Use of "-inline-forceinline" overrides default heuristics of compiler + # and forces inlining of functions marked with inline keyword if compiler + # is able to inline. For VPIC, this option has mainly been used when using + # a portable implementation to force inlining by compiler and also when + # use of "-Winline" option identifies functions not being inlined that are + # marked with inline keyword. 
+ # + # Use of "-qoverride-limits" causes certain internal compiler limits to be + # ignored that are used to limit memory usage and excessive compile times + # by the compiler. + # + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops to + # be vectorized always, regardless of computation work volume. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -inline-forceinline" + #FLAGS_C_COMPILER+=" -vec-threshold0" + FLAGS_C_COMPILER+=" -qoverride-limits" + + #--------------------------------------------------------------------------# + # Use of "-no-ansi-alias" informs compiler that VPIC does not obey ANSI + # aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -no-ansi-alias" + + #--------------------------------------------------------------------------# + # Use of "-Winline" causes compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + # + # Use of "-craype-verbose" causes Cray compiler wrapper script to print + # command it is forwarding to actual compiler for invocation. This is very + # useful for producing a build log to make sure compiler is being invoked + # with expected options. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Winline" + FLAGS_C_COMPILER+=" -craype-verbose" + + #--------------------------------------------------------------------------# + # Use of "-qopt-report=5" specifies level of detail in compiler reports. + # This is the maximum level of detail. + # + # Use of "-qopt-report-phase=all" causes all phases of compilation process + # to provide output for compiler reports. 
Compiler reports are useful for + # understanding how compiler is optimizing various parts of VPIC. + # + # Use of "-diag-disable 10397" disables printing of diagnostic message + # that compiler reports are being generated. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -qopt-report=5" + FLAGS_C_COMPILER+=" -qopt-report-phase=all" + FLAGS_C_COMPILER+=" -diag-disable 10397" + + #--------------------------------------------------------------------------# + # Use of "-Wl,--export-dynamic" removes following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Wl,--export-dynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use of "-qopt-zmm-usage=high" causes the compiler to generate zmm code, + # i.e. AVX-512 code, without any restrictions. Extensive use of AVX-512 + # code causes the CPU core to down clock or throttle to avoid overheating. + # The default is for the compiler to use some internal limits on how much + # AVX-512 instructions are used. This is relevant on ATS-1 systems only + # for KNL processors. 
+ #--------------------------------------------------------------------------# + + if [ "$KNL" = "yes" ] + then + FLAGS_C_COMPILER+=" -qopt-zmm-usage=high" + fi +fi + +#------------------------------------------------------------------------------# +# Configure options for the GNU compilers. +#------------------------------------------------------------------------------# + +if [ "$VCOM" = "GNU" ] +then + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O2" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O2" could be + # "-O3" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O2" + + #--------------------------------------------------------------------------# + # Use of "-ffast-math" causes compiler to relax various IEEE or ISO rules + # and specifications for math functions which can result in faster code. + # + # Use of "-fno-unsafe-math-optimizations" turns off some unsafe math + # optimizations that got turned on by use of "-ffast-math" option. Some + # comments in VPIC source code indicate need for this with older compilers. + # This should be checked some time to see if it is still a relevant issue. 
+ #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -ffast-math" + FLAGS_CXX_COMPILER+=" -fno-unsafe-math-optimizations" + + #--------------------------------------------------------------------------# + # Use of "-fomit-frame-pointer" prevents keeping the frame pointer in a + # register for functions that do not need one. This can make an extra + # register available in many functions and reduce number of overall + # instructions. Some profiling should be done to measure the benefit of + # using this option. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -fomit-frame-pointer" + + #--------------------------------------------------------------------------# + # Use of "-fno-strict-aliasing" informs compiler that VPIC does not obey + # ANSI aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -fno-strict-aliasing" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Winline" + + #--------------------------------------------------------------------------# + # Use of "-rdynamic" removes the following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + # + # From g++ man page: Pass the flag -export-dynamic to the ELF linker, on + # targets that support it. 
This instructs the linker to add all symbols, + # not only used ones, to the dynamic symbol table. This option is needed + # for some uses of "dlopen" or to allow obtaining backtraces from within + # a program. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -rdynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use of "-march=knl" or "-march=haswell" causes g++ to generate code + # specific to and optimized for the specific architecture of either KNL + # or Haswell. It appears that the Cray wrappers already do this correctly + # for KNL but it seems they may not for Haswell. + #--------------------------------------------------------------------------# + + if [ "$KNL" = "yes" ] + then + FLAGS_CXX_COMPILER+=" -march=knl" + else + FLAGS_CXX_COMPILER+=" -march=haswell" + fi + + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O2" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. 
Reasonable alternatives to "-O2" could be + # "-O3" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O2" + + #--------------------------------------------------------------------------# + # Use of "-ffast-math" causes compiler to relax various IEEE or ISO rules + # and specifications for math functions which can result in faster code. + # + # Use of "-fno-unsafe-math-optimizations" turns off some unsafe math + # optimizations that got turned on by use of "-ffast-math" option. Some + # comments in VPIC source code indicate need for this with older compilers. + # This should be checked some time to see if it is still a relevant issue. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -ffast-math" + FLAGS_C_COMPILER+=" -fno-unsafe-math-optimizations" + + #--------------------------------------------------------------------------# + # Use of "-fomit-frame-pointer" prevents keeping the frame pointer in a + # register for functions that do not need one. This can make an extra + # register available in many functions and reduce number of overall + # instructions. Some profiling should be done to measure the benefit of + # using this option. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -fomit-frame-pointer" + + #--------------------------------------------------------------------------# + # Use of "-fno-strict-aliasing" informs compiler that VPIC does not obey + # ANSI aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -fno-strict-aliasing" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. 
Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Winline" + + #--------------------------------------------------------------------------# + # Use of "-rdynamic" removes the following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + # + # From gcc man page: Pass the flag -export-dynamic to the ELF linker, on + # targets that support it. This instructs the linker to add all symbols, + # not only used ones, to the dynamic symbol table. This option is needed + # for some uses of "dlopen" or to allow obtaining backtraces from within + # a program. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -rdynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use of "-march=knl" or "-march=haswell" causes gcc to generate code + # specific to and optimized for the specific architecture of either KNL + # or Haswell. It appears that the Cray wrappers already do this correctly + # for KNL but it seems they may not for Haswell. 
+ #--------------------------------------------------------------------------# + + if [ "$KNL" = "yes" ] + then + FLAGS_C_COMPILER+=" -march=knl" + else + FLAGS_C_COMPILER+=" -march=haswell" + fi +fi + +#------------------------------------------------------------------------------# +# Configure options for the Cray compilers. +#------------------------------------------------------------------------------# + +if [ "$VCOM" = "CCE" ] +then + #--------------------------------------------------------------------------# + # + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O2" + #FLAGS_CXX_COMPILER+=" -hlist=ad" + #FLAGS_CXX_COMPILER+=" -hipa5" + FLAGS_CXX_COMPILER+=" -Wl,--export-dynamic" + #FLAGS_CXX_COMPILER+=" -rdynamic" + FLAGS_CXX_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O2" + #FLAGS_C_COMPILER+=" -hlist=ad" + #FLAGS_C_COMPILER+=" -hipa5" + FLAGS_C_COMPILER+=" -Wl,--export-dynamic" + #FLAGS_C_COMPILER+=" -rdynamic" + FLAGS_C_COMPILER+=" -dynamic" +fi + +#------------------------------------------------------------------------------# +# This ends user configuration section. +# +# No changes required below unless VPIC build system has been extended or the +# module system on ATS-1 machines has changed in some fundamental way. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Configure thread model. 
+#------------------------------------------------------------------------------# + +if [ "$VTHR" = "PTH" ] +then + SET_OPENMP="OFF" + SET_PTHREADS="ON" +fi + +if [ "$VTHR" = "OMP" ] +then + SET_OPENMP="ON" + SET_PTHREADS="OFF" +fi + +#------------------------------------------------------------------------------# +# Configure particle sort method. +#------------------------------------------------------------------------------# + +if [ "$VSORT" = "LSORT" ] +then + FLAGS_C_COMPILER+=" -DVPIC_USE_LEGACY_SORT" + + FLAGS_CXX_COMPILER+=" -DVPIC_USE_LEGACY_SORT" +fi + +#------------------------------------------------------------------------------# +# Configure environment using modules. +#------------------------------------------------------------------------------# +# Note that the user may want to modify the module configuration. +# +# Note that module commands used to define the build environment are captured +# in a Bash script named bashrc.modules which is written into the top level +# build directory. This script can be used in run scripts and other scenarios +# where there is a need to reproduce the environment used to build VPIC. +#------------------------------------------------------------------------------# + +echo '#!/bin/bash' >> bashrc.modules +echo "" >> bashrc.modules + +module load friendly-testing +echo "module load friendly-testing" >> bashrc.modules + +module load sandbox +echo "module load sandbox" >> bashrc.modules + +module load cmake +echo "module load cmake" >> bashrc.modules + +if [ ! "x$VERSION_CMAKE" = "x" ] +then + module swap cmake cmake/$VERSION_CMAKE + echo "module swap cmake cmake/$VERSION_CMAKE" >> bashrc.modules +fi + +module unload craype-hugepages2M +echo "module unload craype-hugepages2M" >> bashrc.modules + +if [ "$VCOM" = "INT" ] +then + if [ ! 
"x$VERSION_INTEL" = "x" ] + then + module swap intel intel/$VERSION_INTEL + echo "module swap intel intel/$VERSION_INTEL" >> bashrc.modules + fi +fi + +if [ "$VCOM" = "GNU" ] +then + module swap PrgEnv-intel PrgEnv-gnu + echo "module swap PrgEnv-intel PrgEnv-gnu" >> bashrc.modules + + if [ ! "x$VERSION_GNU" = "x" ] + then + module swap gcc gcc/$VERSION_GNU + echo "module swap gcc gcc/$VERSION_GNU" >> bashrc.modules + fi +fi + +if [ "$VCOM" = "CCE" ] +then + module swap PrgEnv-intel PrgEnv-cray + echo "module swap PrgEnv-intel PrgEnv-cray" >> bashrc.modules + + if [ ! "x$VERSION_CCE" = "x" ] + then + module swap cce cce/$VERSION_CCE + echo "module swap cce cce/$VERSION_CCE" >> bashrc.modules + fi +fi + +if [ "$KNL" = "yes" ] +then + module swap craype-haswell craype-mic-knl + echo "module swap craype-haswell craype-mic-knl" >> bashrc.modules +fi + +if [ "$VMPI" = "CMPI" ] +then + if [ ! "x$VERSION_CRAY_MPICH" = "x" ] + then + module swap cray-mpich cray-mpich/$VERSION_CRAY_MPICH + echo "module swap cray-mpich cray-mpich/$VERSION_CRAY_MPICH" >> bashrc.modules + fi + + export MPI_ROOT=$MPICH_DIR +fi + +if [ "$VMPI" = "OMPI" ] +then + module unload cray-mpich + echo "module unload cray-mpich" >> bashrc.modules + + module unload cray-libsci + echo "module unload cray-libsci" >> bashrc.modules + + module load openmpi + echo "module load openmpi" >> bashrc.modules + + if [ ! "x$VERSION_OPEN_MPI" = "x" ] + then + module swap openmpi openmpi/$VERSION_OPEN_MPI + echo "module swap openmpi openmpi/$VERSION_OPEN_MPI" >> bashrc.modules + fi +fi + +module list +echo "" >> bashrc.modules +echo "module list" >> bashrc.modules + +#------------------------------------------------------------------------------# +# Call cmake command. +#------------------------------------------------------------------------------# +# Notes: +# +# Use of the "-LAH" command line option to cmake causes cmake to output the +# values of all of its variables. 
This is useful information when debugging +# a failed build. +# +# Note that all of the possible VPIC cmake variables relevant to an ATS-1 +# system are set on the command line so that they can all be conditionally +# configured above through user selections. +#------------------------------------------------------------------------------# + +cmake \ + -LAH \ + -DCMAKE_BUILD_TYPE=$SET_BUILD_TYPE \ + -DENABLE_INTEGRATED_TESTS=$SET_INTEGRATED_TESTS \ + -DENABLE_UNIT_TESTS=$SET_UNIT_TESTS \ + -DUSE_V4_PORTABLE=$SET_V4_PORTABLE \ + -DUSE_V4_SSE=$SET_V4_SSE \ + -DUSE_V4_AVX=$SET_V4_AVX \ + -DUSE_V4_AVX2=$SET_V4_AVX2 \ + -DUSE_V8_PORTABLE=$SET_V8_PORTABLE \ + -DUSE_V8_AVX=$SET_V8_AVX \ + -DUSE_V8_AVX2=$SET_V8_AVX2 \ + -DUSE_V16_PORTABLE=$SET_V16_PORTABLE \ + -DUSE_V16_AVX512=$SET_V16_AVX512 \ + -DVPIC_PRINT_MORE_DIGITS=$SET_MORE_DIGITS \ + -DUSE_OPENMP=$SET_OPENMP \ + -DUSE_PTHREADS=$SET_PTHREADS \ + -DBUILD_SHARED_LIBS=$SET_SHARED_LIBS \ + -DCMAKE_C_COMPILER=$VPIC_COMPILER_C \ + -DCMAKE_CXX_COMPILER=$VPIC_COMPILER_CXX \ + -DCMAKE_C_FLAGS="$FLAGS_C_COMPILER" \ + -DCMAKE_CXX_FLAGS="$FLAGS_CXX_COMPILER" \ + $src_dir + +#------------------------------------------------------------------------------# +# Call make command. +#------------------------------------------------------------------------------# +# Notes: +# +# In general, it is necessary to call the "make" command within this script +# because the module environment has been configured within this script. +# +# Setting VERBOSE=1 causes "make" to output the commands it is executing. +# This information is useful if debugging a failed build. +# +# If the NJ variable is not defined, "make" will perform a parallel build +# using maximum number of processors on the compilation machine. If using +# VERBOSE=1, the verbose output will be garbled by many processes writing +# to STDOUT at the same time and will be difficult to interpret. When using +# VERBOSE=1, it can be helpful to also use NJ=1. 
+#------------------------------------------------------------------------------# + +make -j $NJ VERBOSE=$SET_VERBOSE + +#------------------------------------------------------------------------------# +# Done. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# vim: syntax=sh +#------------------------------------------------------------------------------# diff --git a/arch/lanl-cts1 b/arch/lanl-cts1 new file mode 100755 index 00000000..edafd8f9 --- /dev/null +++ b/arch/lanl-cts1 @@ -0,0 +1,829 @@ +#! /usr/bin/env bash +#------------------------------------------------------------------------------# +# This script supports building VPIC on CTS-1 machines at Los Alamos National +# Laboratory (LANL). These machines run the Tri-lab TOSS 3.3 Operating System, +# a customized version of Red Hat Enterprise Linux 7.5. CTS-1 machines have +# dual socket 18 core Broadwell nodes. These machines provide three compiler +# choices: Intel, GNU and PGI. Three MPI implementations are provided: Open +# MPI, Intel MPI and Mvapich. +# +# Normal users should not need to change this script if happy with defaults. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Get the path to the project from which this script was called. +#------------------------------------------------------------------------------# + +src_dir="${0%/*}/.." + +#------------------------------------------------------------------------------# +# Configure the type of build that we want to perform. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Choose a compiler. 
+#------------------------------------------------------------------------------# +# One of the compiler choices in this section must be chosen. Valid options +# are the following. +# +# INT: Intel compilers +# GNU: GNU compilers +# PGI: Portland Group compilers, now part of Nvidia +# +# Note that selecting PGI for Portland Group compilers has not been tested +# and probably does not work. +#------------------------------------------------------------------------------# + +VCOM="INT" +#VCOM="GNU" +#VCOM="PGI" + +#------------------------------------------------------------------------------# +# Choose an MPI implementation. +#------------------------------------------------------------------------------# +# One of the MPI library choices must be chosen. Valid options are the +# following. +# +# OMPI: Open MPI, most commonly used MPI implementation on LANL CTS-1 machines +# IMPI: Intel MPI +# +# Choose Intel MPI if you want to use the Intel Application Performance +# Snapshot performance analysis tool to analyze MPI performance of VPIC or +# other Intel analysis tools which provide analysis of MPI usage. +#------------------------------------------------------------------------------# + +VMPI="OMPI" +#VMPI="IMPI" + +#------------------------------------------------------------------------------# +# Choose a thread model. +#------------------------------------------------------------------------------# +# One of the two available thread models must be chosen. Valid options are the +# following. +# +# PTH: Pthreads +# OMP: OpenMP +#------------------------------------------------------------------------------# + +VTHR="PTH" +#VTHR="OMP" + +#------------------------------------------------------------------------------# +# Choose format of status update output. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. 
+# +# If SET_MORE_DIGITS=OFF, the output has two significant figures. +# +# If SET_MORE_DIGITS=ON, the output has four significant figures. +#------------------------------------------------------------------------------# + +SET_MORE_DIGITS="OFF" +#SET_MORE_DIGITS="ON" + +#------------------------------------------------------------------------------# +# Choose type of vector intrinsics support. +#------------------------------------------------------------------------------# +# Note the following constraints. +# +# Each of the eight variables in this section must have a configured value. +# This is because the corresponding "USE" cmake variable is set on the cmake +# command line below to allow any possible combinations to be configured using +# a single cmake command. +# +# If all values are configured as OFF, the scalar implementations of VPIC +# functions which are not vectorized will be used. +# +# It is possible to have a vector version configured as ON for each of the +# three vector widths i.e. V4, V8 and V16. In that scenario, if a given VPIC +# function has a V16 implementation, that will be used. If there is not a V16 +# implementation but there is a V8 implementation, that will be used. If there +# is not a V16 or V8 implementation but there is a V4 implementation, that +# will be used. Finally, for functions that have no vector implementations, +# the scalar version will be used. +# +# Currently, it is recommended to always configure the appropriate V4 version +# as on if using vector versions because there are key functions that only +# have a V4 version because the current algorithm does not generalize to +# longer vector lengths. An example is the move_p function. Since the V4 +# versions are generally more performant than the scalar versions, it makes +# sense to use them even when using the longer vector length implementations +# for other VPIC functions. 
+# +# In summary, when using vector versions on a machine with 256 bit SIMD, the +# V4 and V8 implementations should be configured as ON. +# +# First, we turn all of the vector options OFF. Then, we turn on the ones we +# want. +#------------------------------------------------------------------------------# + +SET_V4_PORTABLE="OFF" +SET_V4_SSE="OFF" +SET_V4_AVX="OFF" +SET_V4_AVX2="OFF" +SET_V8_PORTABLE="OFF" +SET_V8_AVX="OFF" +SET_V8_AVX2="OFF" +SET_V16_PORTABLE="OFF" + +#SET_V4_PORTABLE="ON" +#SET_V4_SSE="ON" +#SET_V4_AVX="ON" +SET_V4_AVX2="ON" +#SET_V8_PORTABLE="ON" +#SET_V8_AVX="ON" +SET_V8_AVX2="ON" +#SET_V16_PORTABLE="ON" + +#------------------------------------------------------------------------------# +# Choose a particle sort implementation. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are the +# following. +# +# LSORT: legacy, thread serial sort +# TSORT: thread parallel sort +# +# The LSORT particle sort implementation is the thread serial particle sort +# implementation from the legacy v407 version of VPIC. This implementation +# supports both in-place and out-of-place sorting of the particles. It is very +# competitive with the thread parallel sort implementation for a small number +# of threads per MPI rank, i.e. 4 or less. Also, the memory footprint of VPIC +# is reduced by the memory of a particle array which can be significant for +# particle dominated problems. +# +# The TSORT particle sort implementation is a thread parallel implementation. +# Currently, it can only perform out-of-place sorting of the particles. It will +# be more performant than the LSORT implementation when using many threads per +# MPI rank but uses more memory because of the out-of-place sort. 
+#------------------------------------------------------------------------------# + +VSORT="LSORT" +#VSORT="TSORT" + +#------------------------------------------------------------------------------# +# Choose type of library to build. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is to build a static library, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_SHARED_LIBS="OFF" +#SET_SHARED_LIBS="ON" + +#------------------------------------------------------------------------------# +# Choose integrated test support. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is not to build the integrated tests, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_INTEGRATED_TESTS="OFF" +#SET_INTEGRATED_TESTS="ON" + +#------------------------------------------------------------------------------# +# Choose unit test support. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is not to build the unit tests, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_UNIT_TESTS="OFF" +#SET_UNIT_TESTS="ON" + +#------------------------------------------------------------------------------# +# Choose the CMake build type. +#------------------------------------------------------------------------------# +# One of the available options must be chosen. Valid options depend on build +# types available in the CMake version but include at least the following. +# +# Release: In general, the default for CMake. 
+# None: Tells CMake not to use any pre-defined build type and gives VPIC build +# system total control of CMake variables defined on cmake command line. +#------------------------------------------------------------------------------# + +SET_BUILD_TYPE="Release" +#SET_BUILD_TYPE="None" + +#------------------------------------------------------------------------------# +# Choose number of parallel make processes for build. +#------------------------------------------------------------------------------# +# If NJ variable is not defined, "make" will perform a parallel build using +# maximum number of processors on the compilation machine. +# +# If using VERBOSE = 1 and NJ > 1, verbose output will be garbled by many +# processes writing to STDOUT at the same time and will be difficult to +# interpret. +# +# When using VERBOSE = 1, use of NJ = 1 is recommended. +# +# The default is to use a modest number of processes in the parallel build. +# +# Comment out default below to use all processors on compilation machine. +#------------------------------------------------------------------------------# + +NJ=8 +#NJ=1 + +#------------------------------------------------------------------------------# +# Choose verbosity of "make" output. +#------------------------------------------------------------------------------# +# Setting VERBOSE = 1 causes "make" to output commands it is executing. +# +# This information is useful if debugging a failed build. +# +# Setting VERBOSE = 0 or leaving VERBOSE undefined results in a quiet build. +# +# The default is a quiet build. +#------------------------------------------------------------------------------# + +SET_VERBOSE=0 +#SET_VERBOSE=1 + +#------------------------------------------------------------------------------# +# Choose versions of modules to use if default is not desired. +#------------------------------------------------------------------------------# +# No choice is required in this section. 
+#
+# Some possible alternative module versions are provided below. Change as
+# needed or desired.
+#
+# This section may need to be updated periodically as the module environment
+# evolves because of updates to operating system and programming environment.
+#------------------------------------------------------------------------------#
+
+#VERSION_CMAKE=3.12.1
+
+#VERSION_INTEL=18.0.3
+#VERSION_INTEL_VTUNE_AMPLIFIER=2019.1.0
+#VERSION_INTEL_VECTOR_ADVISOR=2019.1.0
+#VERSION_INTEL_INSPECTOR=2019.1.0
+#VERSION_INTEL_TRACE_ANALYZER=2019.1.022
+#VERSION_INTEL_MPI=2019.1
+
+#VERSION_GNU=7.3.0
+
+#VERSION_PGI=18.10
+
+#VERSION_OPEN_MPI=3.1.2
+
+#VERSION_FORGE=18.3
+
+#------------------------------------------------------------------------------#
+# Unless the user wants to modify options to the compiler, no changes should
+# be needed below this point.
+#
+# If the user desires to configure compiler options, proceed to the section
+# below for the chosen compiler.
+#------------------------------------------------------------------------------#
+
+#------------------------------------------------------------------------------#
+# Configure default compiler names to use MPI compiler wrapper scripts.
+#------------------------------------------------------------------------------#
+
+VPIC_COMPILER_C="mpicc"
+VPIC_COMPILER_CXX="mpicxx"
+
+if [ "$VMPI" = "IMPI" ]
+then
+    VPIC_COMPILER_C="mpiicc"
+    VPIC_COMPILER_CXX="mpiicpc"
+fi
+
+#------------------------------------------------------------------------------#
+# Configure options for the Intel compilers.
+#------------------------------------------------------------------------------#
+
+if [ "$VCOM" = "INT" ]
+then
+    #--------------------------------------------------------------------------#
+    # Use "-g" to provide debug symbols in the executable. 
In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O3" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O3" could be + # "-O2" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O3" + + #--------------------------------------------------------------------------# + # Use of "-inline-forceinline" overrides default heuristics of compiler + # and forces inlining of functions marked with inline keyword if compiler + # is able to inline. For VPIC, this option has mainly been used when using + # a portable implementation to force inlining by compiler and also when + # use of "-Winline" option identifies functions not being inlined that are + # marked with inline keyword. + # + # Use of "-qoverride-limits" cause certain internal compiler limits to be + # ignored that are used to limit memory usage and excessive compile times + # by the compiler. + # + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops to + # be vectorized always, regardless of computation work volume. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -inline-forceinline" + #FLAGS_CXX_COMPILER+=" -vec-threshold0" + FLAGS_CXX_COMPILER+=" -qoverride-limits" + + #--------------------------------------------------------------------------# + # Use of "-no-ansi-alias" informs compiler that VPIC does not obey ANSI + # aliasing rules which can reduce available optimizations. 
+    #--------------------------------------------------------------------------#
+
+    FLAGS_CXX_COMPILER+=" -no-ansi-alias"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-Winline" causes compiler to emit a warning when a function that
+    # is declared inline is not inlined. Inlining is very important to VPIC
+    # performance and it is useful to know if compiler has not inlined a
+    # function that was assumed to be inlined.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_CXX_COMPILER+=" -Winline"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-qopt-report=5" specifies level of detail in compiler reports.
+    # This is the maximum level of detail.
+    #
+    # Use of "-qopt-report-phase=all" causes all phases of compilation process
+    # to provide output for compiler reports. Compiler reports are useful for
+    # understanding how compiler is optimizing various parts of VPIC.
+    #
+    # Use of "-diag-disable 10397" disables printing of diagnostic message
+    # that compiler reports are being generated.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_CXX_COMPILER+=" -qopt-report=5"
+    FLAGS_CXX_COMPILER+=" -qopt-report-phase=all"
+    FLAGS_CXX_COMPILER+=" -diag-disable 10397"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-Wl,--export-dynamic" removes the following type of VPIC warnings.
+    #
+    # Unable to find a safely writable symbol that corresponds to address
+    # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux").
+    # Writing out the raw address instead and keeping my fingers crossed.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_CXX_COMPILER+=" -Wl,--export-dynamic"
+
+    #--------------------------------------------------------------------------#
+    # Use "-g" to provide debug symbols in the executable. 
In general, use of
+    # "-g" with modern compilers does not degrade performance and provides
+    # information required by many tools such as debugging and performance
+    # analysis tools.
+    #
+    # Use of "-O3" provides fairly aggressive optimization. When using vector
+    # intrinsics versions, most of the optimization is explicit in the
+    # intrinsics implementations. Reasonable alternatives to "-O3" could be
+    # "-O2" or "-Ofast". These alternatives should be benchmarked sometime.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_C_COMPILER="-g -O3"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-inline-forceinline" overrides default heuristics of compiler
+    # and forces inlining of functions marked with inline keyword if compiler
+    # is able to inline. For VPIC, this option has mainly been used when using
+    # a portable implementation to force inlining by compiler and also when
+    # use of "-Winline" option identifies functions not being inlined that are
+    # marked with inline keyword.
+    #
+    # Use of "-qoverride-limits" causes certain internal compiler limits to be
+    # ignored that are used to limit memory usage and excessive compile times
+    # by the compiler.
+    #
+    # Use of "-vec-threshold0" ignores compiler heuristics and causes loops to
+    # be vectorized always, regardless of computation work volume.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_C_COMPILER+=" -inline-forceinline"
+    #FLAGS_C_COMPILER+=" -vec-threshold0"
+    FLAGS_C_COMPILER+=" -qoverride-limits"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-no-ansi-alias" informs compiler that VPIC does not obey ANSI
+    # aliasing rules which can reduce available optimizations. 
+    #--------------------------------------------------------------------------#
+
+    FLAGS_C_COMPILER+=" -no-ansi-alias"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-Winline" causes compiler to emit a warning when a function that
+    # is declared inline is not inlined. Inlining is very important to VPIC
+    # performance and it is useful to know if compiler has not inlined a
+    # function that was assumed to be inlined.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_C_COMPILER+=" -Winline"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-qopt-report=5" specifies level of detail in compiler reports.
+    # This is the maximum level of detail.
+    #
+    # Use of "-qopt-report-phase=all" causes all phases of compilation process
+    # to provide output for compiler reports. Compiler reports are useful for
+    # understanding how compiler is optimizing various parts of VPIC.
+    #
+    # Use of "-diag-disable 10397" disables printing of diagnostic message
+    # that compiler reports are being generated.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_C_COMPILER+=" -qopt-report=5"
+    FLAGS_C_COMPILER+=" -qopt-report-phase=all"
+    FLAGS_C_COMPILER+=" -diag-disable 10397"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-Wl,--export-dynamic" removes the following type of VPIC warnings.
+    #
+    # Unable to find a safely writable symbol that corresponds to address
+    # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux").
+    # Writing out the raw address instead and keeping my fingers crossed.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_C_COMPILER+=" -Wl,--export-dynamic"
+fi
+
+#------------------------------------------------------------------------------#
+# Configure options for the GNU compilers. 
+#------------------------------------------------------------------------------# + +if [ "$VCOM" = "GNU" ] +then + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O2" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O2" could be + # "-O3" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O2" + + #--------------------------------------------------------------------------# + # Use of "-ffast-math" causes compiler to relax various IEEE or ISO rules + # and specifications for math functions which can result in faster code. + # + # Use of "-fno-unsafe-math-optimizations" turns off some unsafe math + # optimizations that got turned on by use of "-ffast-math" option. Some + # comments in VPIC source code indicate need for this with older compilers. + # This should be checked some time to see if it is still a relevant issue. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -ffast-math" + FLAGS_CXX_COMPILER+=" -fno-unsafe-math-optimizations" + + #--------------------------------------------------------------------------# + # Use of "-fomit-frame-pointer" prevents keeping the frame pointer in a + # register for functions that do not need one. This can make an extra + # register available in many functions and reduce number of overall + # instructions. Some profiling should be done to measure the benefit of + # using this option. 
+    #--------------------------------------------------------------------------#
+
+    FLAGS_CXX_COMPILER+=" -fomit-frame-pointer"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-fno-strict-aliasing" informs compiler that VPIC does not obey
+    # ANSI aliasing rules which can reduce available optimizations.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_CXX_COMPILER+=" -fno-strict-aliasing"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-Winline" causes compiler to emit a warning when a function that
+    # is declared inline is not inlined. Inlining is very important to VPIC
+    # performance and it is useful to know if compiler has not inlined a
+    # function that was assumed to be inlined.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_CXX_COMPILER+=" -Winline"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-rdynamic" removes the following type of VPIC warnings.
+    #
+    # Unable to find a safely writable symbol that corresponds to address
+    # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux").
+    # Writing out the raw address instead and keeping my fingers crossed.
+    #
+    # From g++ man page: Pass the flag -export-dynamic to the ELF linker, on
+    # targets that support it. This instructs the linker to add all symbols,
+    # not only used ones, to the dynamic symbol table. This option is needed
+    # for some uses of "dlopen" or to allow obtaining backtraces from within
+    # a program.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_CXX_COMPILER+=" -rdynamic"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-march=broadwell" causes g++ to generate code specific to and
+    # optimized for the architecture of Broadwell. 
+ #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -march=broadwell" + + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O2" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O2" could be + # "-O3" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O2" + + #--------------------------------------------------------------------------# + # Use of "-ffast-math" causes compiler to relax various IEEE or ISO rules + # and specifications for math functions which can result in faster code. + # + # Use of "-fno-unsafe-math-optimizations" turns off some unsafe math + # optimizations that got turned on by use of "-ffast-math" option. Some + # comments in VPIC source code indicate need for this with older compilers. + # This should be checked some time to see if it is still a relevant issue. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -ffast-math" + FLAGS_C_COMPILER+=" -fno-unsafe-math-optimizations" + + #--------------------------------------------------------------------------# + # Use of "-fomit-frame-pointer" prevents keeping the frame pointer in a + # register for functions that do not need one. This can make an extra + # register available in many functions and reduce number of overall + # instructions. Some profiling should be done to measure the benefit of + # using this option. 
+    #--------------------------------------------------------------------------#
+
+    FLAGS_C_COMPILER+=" -fomit-frame-pointer"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-fno-strict-aliasing" informs compiler that VPIC does not obey
+    # ANSI aliasing rules which can reduce available optimizations.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_C_COMPILER+=" -fno-strict-aliasing"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-Winline" causes compiler to emit a warning when a function that
+    # is declared inline is not inlined. Inlining is very important to VPIC
+    # performance and it is useful to know if compiler has not inlined a
+    # function that was assumed to be inlined.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_C_COMPILER+=" -Winline"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-rdynamic" removes the following type of VPIC warnings.
+    #
+    # Unable to find a safely writable symbol that corresponds to address
+    # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux").
+    # Writing out the raw address instead and keeping my fingers crossed.
+    #
+    # From gcc man page: Pass the flag -export-dynamic to the ELF linker, on
+    # targets that support it. This instructs the linker to add all symbols,
+    # not only used ones, to the dynamic symbol table. This option is needed
+    # for some uses of "dlopen" or to allow obtaining backtraces from within
+    # a program.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_C_COMPILER+=" -rdynamic"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-march=broadwell" causes gcc to generate code specific to and
+    # optimized for the architecture of Broadwell. 
+ #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -march=broadwell" +fi + +#------------------------------------------------------------------------------# +# Configure options for the PGI compilers. +#------------------------------------------------------------------------------# + +if [ "$VCOM" = "PGI" ] +then + #--------------------------------------------------------------------------# + # + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O2" + FLAGS_CXX_COMPILER+=" -Wl,--export-dynamic" + + #--------------------------------------------------------------------------# + # + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O2" + FLAGS_C_COMPILER+=" -Wl,--export-dynamic" +fi + +#------------------------------------------------------------------------------# +# This ends user configuration section. +# +# No changes required below unless VPIC build system has been extended or the +# module system on CTS-1 machines has changed in some fundamental way. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Configure thread model. +#------------------------------------------------------------------------------# + +if [ "$VTHR" = "PTH" ] +then + SET_OPENMP="OFF" + SET_PTHREADS="ON" +fi + +if [ "$VTHR" = "OMP" ] +then + SET_OPENMP="ON" + SET_PTHREADS="OFF" +fi + +#------------------------------------------------------------------------------# +# Configure particle sort method. 
+#------------------------------------------------------------------------------# + +if [ "$VSORT" = "LSORT" ] +then + FLAGS_C_COMPILER+=" -DVPIC_USE_LEGACY_SORT" + + FLAGS_CXX_COMPILER+=" -DVPIC_USE_LEGACY_SORT" +fi + +#------------------------------------------------------------------------------# +# Configure environment using modules. +#------------------------------------------------------------------------------# +# Note that the user may want to modify the module configuration. +# +# Note that module commands used to define the build environment are captured +# in a Bash script named bashrc.modules which is written into the top level +# build directory. This script can be used in run scripts and other scenarios +# where there is a need to reproduce the environment used to build VPIC. +#------------------------------------------------------------------------------# + +echo '#!/bin/bash' >> bashrc.modules +echo "" >> bashrc.modules + +module purge +echo "module purge" >> bashrc.modules + +module load friendly-testing +echo "module load friendly-testing" >> bashrc.modules + +module load sandbox +echo "module load sandbox" >> bashrc.modules + +if [ ! "x$VERSION_CMAKE" = "x" ] +then + module load cmake/$VERSION_CMAKE + echo "module load cmake/$VERSION_CMAKE" >> bashrc.modules +else + module load cmake + echo "module load cmake" >> bashrc.modules +fi + +if [ "$VCOM" = "INT" ] +then + if [ ! "x$VERSION_INTEL" = "x" ] + then + module load intel/$VERSION_INTEL + echo "module load intel/$VERSION_INTEL" >> bashrc.modules + else + module load intel + echo "module load intel" >> bashrc.modules + fi +fi + +if [ "$VCOM" = "GNU" ] +then + if [ ! "x$VERSION_GNU" = "x" ] + then + module load gcc/$VERSION_GNU + echo "module load gcc/$VERSION_GNU" >> bashrc.modules + else + module load gcc + echo "module load gcc" >> bashrc.modules + fi +fi + +if [ "$VCOM" = "PGI" ] +then + if [ ! 
"x$VERSION_PGI" = "x" ] + then + module load pgi/$VERSION_PGI + echo "module load pgi/$VERSION_PGI" >> bashrc.modules + else + module load pgi + echo "module load pgi" >> bashrc.modules + fi +fi + +if [ "$VMPI" = "OMPI" ] +then + if [ ! "x$VERSION_OPEN_MPI" = "x" ] + then + module load openmpi/$VERSION_OPEN_MPI + echo "module load openmpi/$VERSION_OPEN_MPI" >> bashrc.modules + else + module load openmpi + echo "module load openmpi" >> bashrc.modules + fi +fi + +if [ "$VMPI" = "IMPI" ] +then + if [ ! "x$VERSION_INTEL_MPI" = "x" ] + then + module load intel-mpi/$VERSION_INTEL_MPI + echo "module load intel-mpi/$VERSION_INTEL_MPI" >> bashrc.modules + else + module load intel-mpi + echo "module load intel-mpi" >> bashrc.modules + fi +fi + +module list +echo "" >> bashrc.modules +echo "module list" >> bashrc.modules + +#------------------------------------------------------------------------------# +# Call cmake command. +#------------------------------------------------------------------------------# +# Notes: +# +# Use of the "-LAH" command line option to cmake causes cmake to output the +# values of all of its variables. This is useful information when debugging +# a failed build. +# +# Note that all of the possible VPIC cmake variables relevant to a CTS-1 +# system are set on the command line so that they can all be conditionally +# configured above through user selections. 
+#------------------------------------------------------------------------------# + +cmake \ + -LAH \ + -DCMAKE_BUILD_TYPE=$SET_BUILD_TYPE \ + -DENABLE_INTEGRATED_TESTS=$SET_INTEGRATED_TESTS \ + -DENABLE_UNIT_TESTS=$SET_UNIT_TESTS \ + -DUSE_V4_PORTABLE=$SET_V4_PORTABLE \ + -DUSE_V4_SSE=$SET_V4_SSE \ + -DUSE_V4_AVX=$SET_V4_AVX \ + -DUSE_V4_AVX2=$SET_V4_AVX2 \ + -DUSE_V8_PORTABLE=$SET_V8_PORTABLE \ + -DUSE_V8_AVX=$SET_V8_AVX \ + -DUSE_V8_AVX2=$SET_V8_AVX2 \ + -DUSE_V16_PORTABLE=$SET_V16_PORTABLE \ + -DVPIC_PRINT_MORE_DIGITS=$SET_MORE_DIGITS \ + -DUSE_OPENMP=$SET_OPENMP \ + -DUSE_PTHREADS=$SET_PTHREADS \ + -DBUILD_SHARED_LIBS=$SET_SHARED_LIBS \ + -DCMAKE_C_COMPILER=$VPIC_COMPILER_C \ + -DCMAKE_CXX_COMPILER=$VPIC_COMPILER_CXX \ + -DCMAKE_C_FLAGS="$FLAGS_C_COMPILER" \ + -DCMAKE_CXX_FLAGS="$FLAGS_CXX_COMPILER" \ + $src_dir + +#------------------------------------------------------------------------------# +# Call make command. +#------------------------------------------------------------------------------# +# Notes: +# +# In general, it is necessary to call the "make" command within this script +# because the module environment has been configured within this script. +# +# Setting VERBOSE=1 causes "make" to output the commands it is executing. +# This information is useful if debugging a failed build. +# +# If the NJ variable is not defined, "make" will perform a parallel build +# using maximum number of processors on the compilation machine. If using +# VERBOSE=1, the verbose output will be garbled by many processes writing +# to STDOUT at the same time and will be difficult to interpret. When using +# VERBOSE=1, it can be helpful to also use NJ=1. +#------------------------------------------------------------------------------# + +make -j $NJ VERBOSE=$SET_VERBOSE + +#------------------------------------------------------------------------------# +# Done. 
+#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# vim: syntax=sh +#------------------------------------------------------------------------------# From c8d0849607394b86aaf95777dae75ab4c37bce31 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Thu, 31 Jan 2019 01:47:35 -0700 Subject: [PATCH 04/95] Additional updates to documentation. --- README.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d20a36b8..b8284968 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ of DOE Advanced Technology Systems and consist of a partition of dual socket Int Intel Knights Landing nodes. The LANL CTS-1 clusters are the first generation of DOE Commodity Technology Systems and consist of dual socket Intel Broadwell nodes running the TOSS 3.3 operating system. The lanl-ats1 and lanl-cts1 scripts are heavily documented and can be configured to provide a large variety of custom builds for their respective platform types. These -scripts could also serve as a good starting point for development of a build script for other machine types. Because these +scripts could also serve as a good starting point for development of a build script for other platform types. Because these scripts also configure the users build environment via the use of module commands, the scripts run both the cmake and make commands. @@ -113,7 +113,14 @@ From the user created build directory, these scripts can be invoked as follows: ../arch/lanl-ats1 ``` -Advanced users may chose to instead invoke `cmake` directly and hand select options. +or + +```bash + ../arch/lanl-cts1 +``` + +Advanced users may choose to instead invoke `cmake` directly and hand select options. Documentation on valid ways +to select these options may be found in the lanl-ats1 and lanl-cts1 build scripts mentioned above. 
GCC users should ensure the `-fno-strict-aliasing` compiler flag is set (as shown in `./arch/generic-gcc-sse`). From aeabefd0205828a82761ce9a095cf05acf8f5b81 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Thu, 31 Jan 2019 02:27:08 -0700 Subject: [PATCH 05/95] Update compiler option documentation to make more accurate. --- arch/lanl-ats1 | 10 ++++++---- arch/lanl-cts1 | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/lanl-ats1 b/arch/lanl-ats1 index 981377ae..b6dea6b9 100755 --- a/arch/lanl-ats1 +++ b/arch/lanl-ats1 @@ -353,8 +353,9 @@ then # ignored that are used to limit memory usage and excessive compile times # by the compiler. # - # Use of "-vec-threshold0" ignores compiler heuristics and causes loops to - # be vectorized always, regardless of computation work volume. + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops + # which can be vectorized to always be vectorized, regardless of the + # amount of computational work in the loop. #--------------------------------------------------------------------------# FLAGS_CXX_COMPILER+=" -inline-forceinline" @@ -459,8 +460,9 @@ then # ignored that are used to limit memory usage and excessive compile times # by the compiler. # - # Use of "-vec-threshold0" ignores compiler heuristics and causes loops to - # be vectorized always, regardless of computation work volume. + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops + # which can be vectorized to always be vectorized, regardless of the + # amount of computational work in the loop. #--------------------------------------------------------------------------# FLAGS_C_COMPILER+=" -inline-forceinline" diff --git a/arch/lanl-cts1 b/arch/lanl-cts1 index edafd8f9..c18d8999 100755 --- a/arch/lanl-cts1 +++ b/arch/lanl-cts1 @@ -326,8 +326,9 @@ then # ignored that are used to limit memory usage and excessive compile times # by the compiler. 
# - # Use of "-vec-threshold0" ignores compiler heuristics and causes loops to - # be vectorized always, regardless of computation work volume. + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops + # which can be vectorized to always be vectorized, regardless of the + # amount of computational work in the loop. #--------------------------------------------------------------------------# FLAGS_CXX_COMPILER+=" -inline-forceinline" @@ -402,8 +403,9 @@ then # ignored that are used to limit memory usage and excessive compile times # by the compiler. # - # Use of "-vec-threshold0" ignores compiler heuristics and causes loops to - # be vectorized always, regardless of computation work volume. + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops + # which can be vectorized to always be vectorized, regardless of the + # amount of computational work in the loop. #--------------------------------------------------------------------------# FLAGS_C_COMPILER+=" -inline-forceinline" From c3768115200ab19d65331e7c5a988d2dc7f4d236 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Thu, 31 Jan 2019 08:12:51 -0700 Subject: [PATCH 06/95] Reorder options a bit. --- arch/lanl-ats1 | 120 ++++++++++++++++++++++++------------------------- arch/lanl-cts1 | 108 ++++++++++++++++++++++---------------------- 2 files changed, 114 insertions(+), 114 deletions(-) diff --git a/arch/lanl-ats1 b/arch/lanl-ats1 index b6dea6b9..7a5cb90e 100755 --- a/arch/lanl-ats1 +++ b/arch/lanl-ats1 @@ -26,26 +26,6 @@ src_dir="${0%/*}/.." # Configure the type of build that we want to perform. #------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -# Choose a compiler. -#------------------------------------------------------------------------------# -# One of the compiler choices in this section must be chosen. Valid options -# are the following. 
-# -# INT: Intel compilers -# GNU: GNU compilers -# CCE: Cray compilers -# -# Note that selecting CCE for the Cray compilers currently does not work. The -# main reason why you might want to compile with the Cray compilers is to use -# some of the Cray specific tools like Reveal or a small set of features in -# the CrayPat profiling software. This is not a common use case for users. -#------------------------------------------------------------------------------# - -VCOM="INT" -#VCOM="GNU" -#VCOM="CCE" - #------------------------------------------------------------------------------# # Choose a processor node type. #------------------------------------------------------------------------------# @@ -63,46 +43,6 @@ VCOM="INT" KNL="yes" #HSW="yes" -#------------------------------------------------------------------------------# -# Choose an MPI implementation. -#------------------------------------------------------------------------------# -# One of the MPI library choices must be chosen. Valid options are the -# following. -# -# CMPI: Cray Mpich, the Cray supported MPI library -# OMPI: Open MPI -#------------------------------------------------------------------------------# - -VMPI="CMPI" -#VMPI="OMPI" - -#------------------------------------------------------------------------------# -# Choose a thread model. -#------------------------------------------------------------------------------# -# One of the two available thread models must be chosen. Valid options are the -# following. -# -# PTH: Pthreads -# OMP: OpenMP -#------------------------------------------------------------------------------# - -VTHR="PTH" -#VTHR="OMP" - -#------------------------------------------------------------------------------# -# Choose format of status update output. -#------------------------------------------------------------------------------# -# One of the two available options must be chosen. Valid options are ON and -# OFF. 
-# -# If SET_MORE_DIGITS=OFF, the output has two significant figures. -# -# If SET_MORE_DIGITS=ON, the output has four significant figures. -#------------------------------------------------------------------------------# - -SET_MORE_DIGITS="OFF" -#SET_MORE_DIGITS="ON" - #------------------------------------------------------------------------------# # Choose type of vector intrinsics support. #------------------------------------------------------------------------------# @@ -160,6 +100,66 @@ SET_V4_AVX2="ON" #SET_V16_PORTABLE="ON" SET_V16_AVX512="ON" +#------------------------------------------------------------------------------# +# Choose a compiler. +#------------------------------------------------------------------------------# +# One of the compiler choices in this section must be chosen. Valid options +# are the following. +# +# INT: Intel compilers +# GNU: GNU compilers +# CCE: Cray compilers +# +# Note that selecting CCE for the Cray compilers currently does not work. The +# main reason why you might want to compile with the Cray compilers is to use +# some of the Cray specific tools like Reveal or a small set of features in +# the CrayPat profiling software. This is not a common use case for users. +#------------------------------------------------------------------------------# + +VCOM="INT" +#VCOM="GNU" +#VCOM="CCE" + +#------------------------------------------------------------------------------# +# Choose an MPI implementation. +#------------------------------------------------------------------------------# +# One of the MPI library choices must be chosen. Valid options are the +# following. +# +# CMPI: Cray Mpich, the Cray supported MPI library +# OMPI: Open MPI +#------------------------------------------------------------------------------# + +VMPI="CMPI" +#VMPI="OMPI" + +#------------------------------------------------------------------------------# +# Choose a thread model. 
+#------------------------------------------------------------------------------# +# One of the two available thread models must be chosen. Valid options are the +# following. +# +# PTH: Pthreads +# OMP: OpenMP +#------------------------------------------------------------------------------# + +VTHR="PTH" +#VTHR="OMP" + +#------------------------------------------------------------------------------# +# Choose format of status update output. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_MORE_DIGITS=OFF, the output has two significant figures. +# +# If SET_MORE_DIGITS=ON, the output has four significant figures. +#------------------------------------------------------------------------------# + +SET_MORE_DIGITS="OFF" +#SET_MORE_DIGITS="ON" + #------------------------------------------------------------------------------# # Choose a particle sort implementation. #------------------------------------------------------------------------------# diff --git a/arch/lanl-cts1 b/arch/lanl-cts1 index c18d8999..a846c0ec 100755 --- a/arch/lanl-cts1 +++ b/arch/lanl-cts1 @@ -20,6 +20,60 @@ src_dir="${0%/*}/.." # Configure the type of build that we want to perform. #------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +# Choose type of vector intrinsics support. +#------------------------------------------------------------------------------# +# Note the following constraints. +# +# Each of the eight variables in this section must have a configured value. +# This is because the corresponding "USE" cmake variable is set on the cmake +# command line below to allow any possible combinations to be configured using +# a single cmake command. 
+# +# If all values are configured as OFF, the scalar implementations of VPIC +# functions which are not vectorized will be used. +# +# It is possible to have a vector version configured as ON for each of the +# three vector widths i.e. V4, V8 and V16. In that scenario, if a given VPIC +# function has a V16 implementation, that will be used. If there is not a V16 +# implementation but there is a V8 implementation, that will be used. If there +# is not a V16 or V8 implementation but there is a V4 implementation, that +# will be used. Finally, for functions that have no vector implementations, +# the scalar version will be used. +# +# Currently, it is recommended to always configure the appropriate V4 version +# as on if using vector versions because there are key functions that only +# have a V4 version because the current algorithm does not generalize to +# longer vector lengths. An example is the move_p function. Since the V4 +# versions are generally more performant than the scalar versions, it makes +# sense to use them even when using the longer vector length implementations +# for other VPIC functions. +# +# In summary, when using vector versions on a machine with 256 bit SIMD, the +# V4 and V8 implementations should be configured as ON. +# +# First, we turn all of the vector options OFF. Then, we turn on the ones we +# want. +#------------------------------------------------------------------------------# + +SET_V4_PORTABLE="OFF" +SET_V4_SSE="OFF" +SET_V4_AVX="OFF" +SET_V4_AVX2="OFF" +SET_V8_PORTABLE="OFF" +SET_V8_AVX="OFF" +SET_V8_AVX2="OFF" +SET_V16_PORTABLE="OFF" + +#SET_V4_PORTABLE="ON" +#SET_V4_SSE="ON" +#SET_V4_AVX="ON" +SET_V4_AVX2="ON" +#SET_V8_PORTABLE="ON" +#SET_V8_AVX="ON" +SET_V8_AVX2="ON" +#SET_V16_PORTABLE="ON" + #------------------------------------------------------------------------------# # Choose a compiler. 
#------------------------------------------------------------------------------# @@ -82,60 +136,6 @@ VTHR="PTH" SET_MORE_DIGITS="OFF" #SET_MORE_DIGITS="ON" -#------------------------------------------------------------------------------# -# Choose type of vector intrinsics support. -#------------------------------------------------------------------------------# -# Note the following constraints. -# -# Each of the eight variables in this section must have a configured value. -# This is because the corresponding "USE" cmake variable is set on the cmake -# command line below to allow any possible combinations to be configured using -# a single cmake command. -# -# If all values are configured as OFF, the scalar implementations of VPIC -# functions which are not vectorized will be used. -# -# It is possible to have a vector version configured as ON for each of the -# three vector widths i.e. V4, V8 and V16. In that scenario, if a given VPIC -# function has a V16 implementation, that will be used. If there is not a V16 -# implementation but there is a V8 implementation, that will be used. If there -# is not a V16 or V8 implementation but there is a V4 implementation, that -# will be used. Finally, for functions that have no vector implementations, -# the scalar version will be used. -# -# Currently, it is recommended to always configure the appropriate V4 version -# as on if using vector versions because there are key functions that only -# have a V4 version because the current algorithm does not generalize to -# longer vector lengths. An example is the move_p function. Since the V4 -# versions are generally more performant than the scalar versions, it makes -# sense to use them even when using the longer vector length implementations -# for other VPIC functions. -# -# In summary, when using vector versions on a machine with 256 bit SIMD, the -# V4 and V8 implementations should be configured as ON. -# -# First, we turn all of the vector options OFF. 
Then, we turn on the ones we -# want. -#------------------------------------------------------------------------------# - -SET_V4_PORTABLE="OFF" -SET_V4_SSE="OFF" -SET_V4_AVX="OFF" -SET_V4_AVX2="OFF" -SET_V8_PORTABLE="OFF" -SET_V8_AVX="OFF" -SET_V8_AVX2="OFF" -SET_V16_PORTABLE="OFF" - -#SET_V4_PORTABLE="ON" -#SET_V4_SSE="ON" -#SET_V4_AVX="ON" -SET_V4_AVX2="ON" -#SET_V8_PORTABLE="ON" -#SET_V8_AVX="ON" -SET_V8_AVX2="ON" -#SET_V16_PORTABLE="ON" - #------------------------------------------------------------------------------# # Choose a particle sort implementation. #------------------------------------------------------------------------------# From c96f8c5bbb8e5df7d6608e9274081f0ec56bc34d Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Thu, 31 Jan 2019 12:02:43 -0700 Subject: [PATCH 07/95] For the lanl-ats1 script, make sure that the Cray programming environment starts out as the Cray default of PrgEnv-intel. This change checks for the case where the user has modified their module environment and swaps it back to the case assumed by the build script. --- arch/lanl-ats1 | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/lanl-ats1 b/arch/lanl-ats1 index 7a5cb90e..d2a4af50 100755 --- a/arch/lanl-ats1 +++ b/arch/lanl-ats1 @@ -798,6 +798,21 @@ then FLAGS_CXX_COMPILER+=" -DVPIC_USE_LEGACY_SORT" fi +#------------------------------------------------------------------------------# +# Make sure the Cray programming environment is configured as the default of +# PrgEnv-intel. +#------------------------------------------------------------------------------# + +if [ "$CRAY_PRGENVGNU" = "loaded" ] +then + module swap PrgEnv-gnu PrgEnv-intel +fi + +if [ "$CRAY_PRGENVCRAY" = "loaded" ] +then + module swap PrgEnv-cray PrgEnv-intel +fi + #------------------------------------------------------------------------------# # Configure environment using modules. 
#------------------------------------------------------------------------------# From bb2baea590785e750c61ed52596f283da4b72d22 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 11 Feb 2019 14:03:34 -0700 Subject: [PATCH 08/95] Fix issues and errors in use of float literals introduced in a previous commit. --- src/species_advance/standard/move_p.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/species_advance/standard/move_p.cc b/src/species_advance/standard/move_p.cc index 085a59ca..dfba3785 100644 --- a/src/species_advance/standard/move_p.cc +++ b/src/species_advance/standard/move_p.cc @@ -240,13 +240,13 @@ move_p( particle_t * ALIGNED(128) p0, s_dir[0] = (s_dispx>0.0f) ? 1.0f : -1.0f; s_dir[1] = (s_dispy>0.0f) ? 1.0f : -1.0f; - s_dir[2] = (s_dispz>0.0) ? 1.0f : -1.0f; + s_dir[2] = (s_dispz>0.0f) ? 1.0f : -1.0f; // Compute the twice the fractional distance to each potential // streak/cell face intersection. - v0 = (s_dispx==0) ? 3.4e38f : (s_dir[0]-s_midx)/s_dispx; - v1 = (s_dispy==0) ? 3.4e38f : (s_dir[1]-s_midy)/s_dispy; - v2 = (s_dispz==0) ? 3.4e38f : (s_dir[2]-s_midz)/s_dispz; + v0 = (s_dispx==0.0f) ? 3.4e38f : (s_dir[0]-s_midx)/s_dispx; + v1 = (s_dispy==0.0f) ? 3.4e38f : (s_dir[1]-s_midy)/s_dispy; + v2 = (s_dispz==0.0f) ? 3.4e38f : (s_dir[2]-s_midz)/s_dispz; // Determine the fractional length and axis of current streak. The // streak ends on either the first face intersected by the @@ -254,10 +254,10 @@ move_p( particle_t * ALIGNED(128) p0, // // axis 0,1 or 2 ... streak ends on a x,y or z-face respectively // axis 3 ... streak ends at end of the particle track - /**/ v3=2.0f, axis=3.0f; - if(v0 Date: Mon, 11 Feb 2019 16:44:24 -0700 Subject: [PATCH 09/95] Add CMake support for configuring a VPIC build with the legacy particle sort implementation. Add build script support for a few more CMake variables that were missing and should be availble to users of the build scripts. 
--- CMakeLists.txt | 12 ++++++++++++ arch/lanl-ats1 | 50 +++++++++++++++++++++++++++++++++++++++++++++++--- arch/lanl-cts1 | 50 +++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 106 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b573cd57..2f9902c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,6 +65,8 @@ option(USE_V16_PORTABLE "Enable V16 Portable" OFF) option(USE_V16_AVX512 "Enable V16 AVX512" OFF) +option(USE_LEGACY_SORT "Enable Legacy Sort Implementation" OFF) + #option(USE_ADVANCE_P_AUTOVEC "Enable Explicit Autovec" OFF) option(VPIC_PRINT_MORE_DIGITS "Print more digits in VPIC timer info" OFF) @@ -109,6 +111,7 @@ endif(DISABLE_DYNAMIC_RESIZING) if(NOT SET_MIN_NUM_PARTICLES STREQUAL "AUTO") add_definitions(-DMIN_NP=${SET_MIN_NUM_PARTICLES}) endif() + #------------------------------------------------------------------------------# # OpenSSL #------------------------------------------------------------------------------# @@ -127,6 +130,15 @@ find_package(Threads REQUIRED) # Act on build options set in project.cmake #------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +# Add options for building with the legacy particle sort implementation. +#------------------------------------------------------------------------------# + +if(USE_LEGACY_SORT) + add_definitions(-DVPIC_USE_LEGACY_SORT) + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DVPIC_USE_LEGACY_SORT") +endif(USE_LEGACY_SORT) + #------------------------------------------------------------------------------# # Add options for building with a threading model. 
#------------------------------------------------------------------------------# diff --git a/arch/lanl-ats1 b/arch/lanl-ats1 index d2a4af50..46a2b34f 100755 --- a/arch/lanl-ats1 +++ b/arch/lanl-ats1 @@ -221,6 +221,48 @@ SET_INTEGRATED_TESTS="OFF" SET_UNIT_TESTS="OFF" #SET_UNIT_TESTS="ON" +#------------------------------------------------------------------------------# +# Choose OpenSSL support for checksums. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_ENABLE_OPENSSL=OFF, use of checksums is turned off. +# +# If SET_ENABLE_OPENSSL=ON, use of checksums is turned on. +#------------------------------------------------------------------------------# + +SET_ENABLE_OPENSSL="OFF" +#SET_ENABLE_OPENSSL="ON" + +#------------------------------------------------------------------------------# +# Choose support for dynamic resizing of particle arrays. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_DISABLE_DYNAMIC_RESIZING=OFF, particle arrays will be resized +# dynamically. +# +# If SET_DISABLE_DYNAMIC_RESIZING=ON, particle arrays will not be resized +# dynamically and the user will be responsible for ensuring that particle +# arrays have enough space to handle the evolution of a non-uniform particle +# distribution. +#------------------------------------------------------------------------------# + +SET_DISABLE_DYNAMIC_RESIZING="OFF" +#SET_DISABLE_DYNAMIC_RESIZING="ON" + +#------------------------------------------------------------------------------# +# Choose the minimum number of particles to dynamically allocate space for. +#------------------------------------------------------------------------------# +# A value must be chosen. 
The default is 128 particles which allocates space +# equal to a 4 KByte page size. +#------------------------------------------------------------------------------# + +SET_PARTICLE_MIN_NUM="128" +#SET_PARTICLE_MIN_NUM="32768" + #------------------------------------------------------------------------------# # Choose the CMake build type. #------------------------------------------------------------------------------# @@ -793,9 +835,7 @@ fi if [ "$VSORT" = "LSORT" ] then - FLAGS_C_COMPILER+=" -DVPIC_USE_LEGACY_SORT" - - FLAGS_CXX_COMPILER+=" -DVPIC_USE_LEGACY_SORT" + SET_LEGACY_SORT="ON" fi #------------------------------------------------------------------------------# @@ -936,6 +976,10 @@ cmake \ -DCMAKE_BUILD_TYPE=$SET_BUILD_TYPE \ -DENABLE_INTEGRATED_TESTS=$SET_INTEGRATED_TESTS \ -DENABLE_UNIT_TESTS=$SET_UNIT_TESTS \ + -DENABLE_OPENSSL=$SET_ENABLE_OPENSSL \ + -DDISABLE_DYNAMIC_RESIZING=$SET_DISABLE_DYNAMIC_RESIZING \ + -DSET_MIN_NUM_PARTICLES=$SET_PARTICLE_MIN_NUM \ + -DUSE_LEGACY_SORT=$SET_LEGACY_SORT \ -DUSE_V4_PORTABLE=$SET_V4_PORTABLE \ -DUSE_V4_SSE=$SET_V4_SSE \ -DUSE_V4_AVX=$SET_V4_AVX \ diff --git a/arch/lanl-cts1 b/arch/lanl-cts1 index a846c0ec..74364294 100755 --- a/arch/lanl-cts1 +++ b/arch/lanl-cts1 @@ -195,6 +195,48 @@ SET_INTEGRATED_TESTS="OFF" SET_UNIT_TESTS="OFF" #SET_UNIT_TESTS="ON" +#------------------------------------------------------------------------------# +# Choose OpenSSL support for checksums. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_ENABLE_OPENSSL=OFF, use of checksums is turned off. +# +# If SET_ENABLE_OPENSSL=ON, use of checksums is turned on. 
+#------------------------------------------------------------------------------# + +SET_ENABLE_OPENSSL="OFF" +#SET_ENABLE_OPENSSL="ON" + +#------------------------------------------------------------------------------# +# Choose support for dynamic resizing of particle arrays. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_DISABLE_DYNAMIC_RESIZING=OFF, particle arrays will be resized +# dynamically. +# +# If SET_DISABLE_DYNAMIC_RESIZING=ON, particle arrays will not be resized +# dynamically and the user will be responsible for ensuring that particle +# arrays have enough space to handle the evolution of a non-uniform particle +# distribution. +#------------------------------------------------------------------------------# + +SET_DISABLE_DYNAMIC_RESIZING="OFF" +#SET_DISABLE_DYNAMIC_RESIZING="ON" + +#------------------------------------------------------------------------------# +# Choose the minimum number of particles to dynamically allocate space for. +#------------------------------------------------------------------------------# +# A value must be chosen. The default is 128 particles which allocates space +# equal to a 4 KByte page size. +#------------------------------------------------------------------------------# + +SET_PARTICLE_MIN_NUM="128" +#SET_PARTICLE_MIN_NUM="32768" + #------------------------------------------------------------------------------# # Choose the CMake build type. 
#------------------------------------------------------------------------------# @@ -664,9 +706,7 @@ fi if [ "$VSORT" = "LSORT" ] then - FLAGS_C_COMPILER+=" -DVPIC_USE_LEGACY_SORT" - - FLAGS_CXX_COMPILER+=" -DVPIC_USE_LEGACY_SORT" + SET_LEGACY_SORT="ON" fi #------------------------------------------------------------------------------# @@ -784,6 +824,10 @@ cmake \ -DCMAKE_BUILD_TYPE=$SET_BUILD_TYPE \ -DENABLE_INTEGRATED_TESTS=$SET_INTEGRATED_TESTS \ -DENABLE_UNIT_TESTS=$SET_UNIT_TESTS \ + -DENABLE_OPENSSL=$SET_ENABLE_OPENSSL \ + -DDISABLE_DYNAMIC_RESIZING=$SET_DISABLE_DYNAMIC_RESIZING \ + -DSET_MIN_NUM_PARTICLES=$SET_PARTICLE_MIN_NUM \ + -DUSE_LEGACY_SORT=$SET_LEGACY_SORT \ -DUSE_V4_PORTABLE=$SET_V4_PORTABLE \ -DUSE_V4_SSE=$SET_V4_SSE \ -DUSE_V4_AVX=$SET_V4_AVX \ From 8515047e87990d170c310f7f8fd0b0587e67dc6b Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 18 Feb 2019 21:19:54 -0700 Subject: [PATCH 10/95] Separate lanl-ats1 script into two separate scripts, one for Haswell nodes and one for KNL nodes. --- README.md | 26 +- arch/lanl-ats1-hsw | 963 ++++++++++++++++++++++++++++++ arch/{lanl-ats1 => lanl-ats1-knl} | 66 +- 3 files changed, 992 insertions(+), 63 deletions(-) create mode 100755 arch/lanl-ats1-hsw rename arch/{lanl-ats1 => lanl-ats1-knl} (95%) diff --git a/README.md b/README.md index 4392ec9b..2383a792 100644 --- a/README.md +++ b/README.md @@ -97,20 +97,26 @@ After configuration, simply type: make ``` -Two scripts in the `./arch` directory are of particular note: lanl-ats1 and lanl-cts1. These scripts provide a default way to build VPIC -on LANL ATS-1 clusters such as Trinity and Trinitite and LANL CTS-1 clusters. The LANL ATS-1 clusters are the first generation -of DOE Advanced Technology Systems and consist of a partition of dual socket Intel Haswell nodes and a partition of single socket -Intel Knights Landing nodes. 
The LANL CTS-1 clusters are the first generation of DOE Commodity Technology Systems and consist of
-dual socket Intel Broadwell nodes running the TOSS 3.3 operating system. The lanl-ats1 and lanl-cts1 scripts are heavily
-documented and can be configured to provide a large variety of custom builds for their respective platform types. These
-scripts could also serve as a good starting point for development of a build script for other platform types. Because these
-scripts also configure the users build environment via the use of module commands, the scripts run both the cmake and make
-commands.
+Three scripts in the `./arch` directory are of particular note: lanl-ats1-hsw, lanl-ats1-knl and lanl-cts1. These scripts
+provide a default way to build VPIC on LANL ATS-1 clusters such as Trinity and Trinitite and LANL CTS-1 clusters. The LANL
+ATS-1 clusters are the first generation of DOE Advanced Technology Systems and consist of a partition of dual socket Intel
+Haswell nodes and a partition of single socket Intel Knights Landing nodes. The LANL CTS-1 clusters are the first generation
+of DOE Commodity Technology Systems and consist of dual socket Intel Broadwell nodes running the TOSS 3.3 operating system.
+The lanl-ats1-hsw, lanl-ats1-knl and lanl-cts1 scripts are heavily documented and can be configured to provide a large
+variety of custom builds for their respective platform types. These scripts could also serve as a good starting point for
+development of a build script for other platform types. Because these scripts also configure the users' build environment
+via the use of module commands, the scripts run both the cmake and make commands. 
From the user created build directory, these scripts can be invoked as follows: ```bash - ../arch/lanl-ats1 + ../arch/lanl-ats1-hsw +``` + +or + +```bash + ../arch/lanl-ats1-knl ``` or diff --git a/arch/lanl-ats1-hsw b/arch/lanl-ats1-hsw new file mode 100755 index 00000000..2b69ade9 --- /dev/null +++ b/arch/lanl-ats1-hsw @@ -0,0 +1,963 @@ +#! /usr/bin/env bash +#------------------------------------------------------------------------------# +# This script supports building VPIC on ATS-1 machines at Los Alamos National +# Laboratory (LANL) for Haswell nodes. These machines run the Cray Linux +# Environment Operating System and have two compute partitions, a Haswell +# partition and a Knights Landing (KNL) partition. Both processor types are +# Intel processors. These machines provide three compiler choices: Intel, GNU +# and Cray compilers. Two MPI implementations are provided: Cray Mpich and Open +# MPI. +# +# Normal users should not need to change this script if building VPIC to run +# on the Haswell nodes of ATS-1 machines and happy with defaults. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Get the path to the project from which this script was called. +#------------------------------------------------------------------------------# + +src_dir="${0%/*}/.." + +#------------------------------------------------------------------------------# +# Configure the type of build that we want to perform. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Choose type of vector intrinsics support. +#------------------------------------------------------------------------------# +# Note the following constraints. +# +# Each of the nine variables in this section must have a configured value. 
+# This is because the corresponding "USE" cmake variable is set on the cmake +# command line below to allow any possible combinations to be configured using +# a single cmake command. +# +# If all values are configured as OFF, the scalar implementations of VPIC +# functions which are not vectorized will be used. +# +# It is possible to have a vector version configured as ON for each of the +# three vector widths i.e. V4, V8 and V16. In that scenario, if a given VPIC +# function has a V16 implementation, that will be used. If there is not a V16 +# implementation but there is a V8 implementation, that will be used. If there +# is not a V16 or V8 implementation but there is a V4 implementation, that +# will be used. Finally, for functions that have no vector implementations, +# the scalar version will be used. +# +# Currently, it is recommended to always configure the appropriate V4 version +# as on if using vector versions because there are key functions that only +# have a V4 version because the current algorithm does not generalize to +# longer vector lengths. An example is the move_p function. Since the V4 +# versions are generally more performant than the scalar versions, it makes +# sense to use them even when using the longer vector length implementations +# for other VPIC functions. +# +# In summary, when using vector versions on a machine with 256 bit SIMD, the +# V4 and V8 implementations should be configured as ON. When using a machine +# with 512 bit SIMD, V4 and V16 implementations should be configured as ON. +# +# First, we turn all of the vector options OFF. Then, we turn on the ones we +# want. 
+#------------------------------------------------------------------------------# + +SET_V4_PORTABLE="OFF" +SET_V4_SSE="OFF" +SET_V4_AVX="OFF" +SET_V4_AVX2="OFF" +SET_V8_PORTABLE="OFF" +SET_V8_AVX="OFF" +SET_V8_AVX2="OFF" +SET_V16_PORTABLE="OFF" +SET_V16_AVX512="OFF" + +#SET_V4_PORTABLE="ON" +#SET_V4_SSE="ON" +#SET_V4_AVX="ON" +SET_V4_AVX2="ON" +#SET_V8_PORTABLE="ON" +#SET_V8_AVX="ON" +SET_V8_AVX2="ON" +#SET_V16_PORTABLE="ON" +#SET_V16_AVX512="ON" + +#------------------------------------------------------------------------------# +# Choose a compiler. +#------------------------------------------------------------------------------# +# One of the compiler choices in this section must be chosen. Valid options +# are the following. +# +# INT: Intel compilers +# GNU: GNU compilers +# CCE: Cray compilers +# +# Note that selecting CCE for the Cray compilers currently does not work. The +# main reason why you might want to compile with the Cray compilers is to use +# some of the Cray specific tools like Reveal or a small set of features in +# the CrayPat profiling software. This is not a common use case for users. +#------------------------------------------------------------------------------# + +VCOM="INT" +#VCOM="GNU" +#VCOM="CCE" + +#------------------------------------------------------------------------------# +# Choose an MPI implementation. +#------------------------------------------------------------------------------# +# One of the MPI library choices must be chosen. Valid options are the +# following. +# +# CMPI: Cray Mpich, the Cray supported MPI library +# OMPI: Open MPI +#------------------------------------------------------------------------------# + +VMPI="CMPI" +#VMPI="OMPI" + +#------------------------------------------------------------------------------# +# Choose a thread model. +#------------------------------------------------------------------------------# +# One of the two available thread models must be chosen. 
Valid options are the +# following. +# +# PTH: Pthreads +# OMP: OpenMP +#------------------------------------------------------------------------------# + +VTHR="PTH" +#VTHR="OMP" + +#------------------------------------------------------------------------------# +# Choose format of status update output. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_MORE_DIGITS=OFF, the output has two significant figures. +# +# If SET_MORE_DIGITS=ON, the output has four significant figures. +#------------------------------------------------------------------------------# + +SET_MORE_DIGITS="OFF" +#SET_MORE_DIGITS="ON" + +#------------------------------------------------------------------------------# +# Choose a particle sort implementation. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are the +# following. +# +# LSORT: legacy, thread serial sort +# TSORT: thread parallel sort +# +# The LSORT particle sort implementation is the thread serial particle sort +# implementation from the legacy v407 version of VPIC. This implementation +# supports both in-place and out-of-place sorting of the particles. It is very +# competitive with the thread parallel sort implementation for a small number +# of threads per MPI rank, i.e. 4 or less, especially on KNL because sorting +# the particles in-place allows the fraction of particles stored in High +# Bandwidth Memory (HBM) to remain stored in HBM. Also, the memory footprint +# of VPIC is reduced by the memory of a particle array which can be significant +# for particle dominated problems. +# +# The TSORT particle sort implementation is a thread parallel implementation. +# Currently, it can only perform out-of-place sorting of the particles. 
It will +# be more performant than the LSORT implementation when using many threads per +# MPI rank but uses more memory because of the out-of-place sort. +#------------------------------------------------------------------------------# + +VSORT="LSORT" +#VSORT="TSORT" + +#------------------------------------------------------------------------------# +# Choose type of library to build. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is to build a static library, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_SHARED_LIBS="OFF" +#SET_SHARED_LIBS="ON" + +#------------------------------------------------------------------------------# +# Choose integrated test support. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is not to build the integrated tests, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_INTEGRATED_TESTS="OFF" +#SET_INTEGRATED_TESTS="ON" + +#------------------------------------------------------------------------------# +# Choose unit test support. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is not to build the unit tests, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_UNIT_TESTS="OFF" +#SET_UNIT_TESTS="ON" + +#------------------------------------------------------------------------------# +# Choose OpenSSL support for checksums. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. 
+# +# If SET_ENABLE_OPENSSL=OFF, use of checksums is turned off. +# +# If SET_ENABLE_OPENSSL=ON, use of checksums is turned on. +#------------------------------------------------------------------------------# + +SET_ENABLE_OPENSSL="OFF" +#SET_ENABLE_OPENSSL="ON" + +#------------------------------------------------------------------------------# +# Choose support for dynamic resizing of particle arrays. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_DISABLE_DYNAMIC_RESIZING=OFF, particle arrays will be resized +# dynamically. +# +# If SET_DISABLE_DYNAMIC_RESIZING=ON, particle arrays will not be resized +# dynamically and the user will be responsible for ensuring that particle +# arrays have enough space to handle the evolution of a non-uniform particle +# distribution. +#------------------------------------------------------------------------------# + +SET_DISABLE_DYNAMIC_RESIZING="OFF" +#SET_DISABLE_DYNAMIC_RESIZING="ON" + +#------------------------------------------------------------------------------# +# Choose the minimum number of particles to dynamically allocate space for. +#------------------------------------------------------------------------------# +# A value must be chosen. The default is 128 particles which allocates space +# equal to a 4 KByte page size. +#------------------------------------------------------------------------------# + +SET_PARTICLE_MIN_NUM="128" +#SET_PARTICLE_MIN_NUM="32768" + +#------------------------------------------------------------------------------# +# Choose the CMake build type. +#------------------------------------------------------------------------------# +# One of the available options must be chosen. Valid options depend on build +# types available in the CMake version but include at least the following. +# +# Release: In general, the default for CMake. 
+# None: Tells CMake not to use any pre-defined build type and gives VPIC build +# system total control of CMake variables defined on cmake command line. +#------------------------------------------------------------------------------# + +SET_BUILD_TYPE="Release" +#SET_BUILD_TYPE="None" + +#------------------------------------------------------------------------------# +# Choose number of parallel make processes for build. +#------------------------------------------------------------------------------# +# If NJ variable is not defined, "make" will perform a parallel build using +# maximum number of processors on the compilation machine. +# +# If using VERBOSE = 1 and NJ > 1, verbose output will be garbled by many +# processes writing to STDOUT at the same time and will be difficult to +# interpret. +# +# When using VERBOSE = 1, use of NJ = 1 is recommended. +# +# The default is to use a modest number of processes in the parallel build. +# +# Comment out default below to use all processors on compilation machine. +#------------------------------------------------------------------------------# + +NJ=8 +#NJ=1 + +#------------------------------------------------------------------------------# +# Choose verbosity of "make" output. +#------------------------------------------------------------------------------# +# Setting VERBOSE = 1 causes "make" to output commands it is executing. +# +# This information is useful if debugging a failed build. +# +# Setting VERBOSE = 0 or leaving VERBOSE undefined results in a quiet build. +# +# The default is a quiet build. +#------------------------------------------------------------------------------# + +SET_VERBOSE=0 +#SET_VERBOSE=1 + +#------------------------------------------------------------------------------# +# Choose versions of modules to use if default is not desired. +#------------------------------------------------------------------------------# +# No choice is required in this section. 
+#
+# Some possible alternative module versions are provided below. Change as
+# needed or desired.
+#
+# This section may need to be updated periodically as the module environment
+# evolves because of updates to operating system and programming environment.
+#------------------------------------------------------------------------------#
+
+#VERSION_CMAKE=3.12.1
+
+#VERSION_INTEL=19.0.1
+#VERSION_INTEL_VTUNE_AMPLIFIER=2019.1.0
+#VERSION_INTEL_VECTOR_ADVISOR=2019.1.0
+#VERSION_INTEL_INSPECTOR=2019.1.0
+#VERSION_INTEL_TRACE_ANALYZER=2019.1.022
+
+#VERSION_GNU=7.3.0
+
+#VERSION_CCE=9.0.0.21672
+#VERSION_CRAY_MPICH=7.7.4.4
+#VERSION_CRAY_PERF_TOOLS=7.0.4
+
+#VERSION_OPEN_MPI=3.1.2
+
+#VERSION_FORGE=18.3
+
+#------------------------------------------------------------------------------#
+# Unless the user wants to modify options to the compiler, no changes should
+# be needed below this point.
+#
+# If the user desires to configure compiler options, proceed to the section
+# below for the chosen compiler.
+#------------------------------------------------------------------------------#
+
+#------------------------------------------------------------------------------#
+# Configure default compiler names to use Cray wrapper scripts.
+#------------------------------------------------------------------------------#
+
+VPIC_COMPILER_C="cc"
+VPIC_COMPILER_CXX="CC"
+
+if [ "$VMPI" = "OMPI" ]
+then
+    VPIC_COMPILER_C="mpicc"
+    VPIC_COMPILER_CXX="mpicxx"
+fi
+
+#------------------------------------------------------------------------------#
+# Configure options for the Intel compilers.
+#------------------------------------------------------------------------------#
+
+if [ "$VCOM" = "INT" ]
+then
+    #--------------------------------------------------------------------------#
+    # Use "-g" to provide debug symbols in the executable. 
In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O3" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O3" could be + # "-O2" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O3" + + #--------------------------------------------------------------------------# + # Use of "-inline-forceinline" overrides default heuristics of compiler + # and forces inlining of functions marked with inline keyword if compiler + # is able to inline. For VPIC, this option has mainly been used when using + # a portable implementation to force inlining by compiler and also when + # use of "-Winline" option identifies functions not being inlined that are + # marked with inline keyword. + # + # Use of "-qoverride-limits" cause certain internal compiler limits to be + # ignored that are used to limit memory usage and excessive compile times + # by the compiler. + # + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops + # which can be vectorized to always be vectorized, regardless of the + # amount of computational work in the loop. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -inline-forceinline" + #FLAGS_CXX_COMPILER+=" -vec-threshold0" + FLAGS_CXX_COMPILER+=" -qoverride-limits" + + #--------------------------------------------------------------------------# + # Use of "-no-ansi-alias" informs compiler that VPIC does not obey ANSI + # aliasing rules which can reduce available optimizations. 
+ #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -no-ansi-alias" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + # + # Use of "-craype-verbose" causes Cray compiler wrapper script to print + # command it is forwarding to actual compiler for invocation. This is very + # useful for producing a build log to make sure compiler is being invoked + # with expected options. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Winline" + FLAGS_CXX_COMPILER+=" -craype-verbose" + + #--------------------------------------------------------------------------# + # Use of "-qopt-report=5" specifies level of detail in compiler reports. + # This is the maximum level of detail. + # + # Use of "-qopt-report-phase=all" causes all phases of compilation process + # to provide output for compiler reports. Compiler reports are useful for + # understanding how compiler is optimizing various parts of VPIC. + # + # Use of "-diag-disable 10397" disables printing of diagnostic message + # that compiler reports are being generated. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -qopt-report=5" + FLAGS_CXX_COMPILER+=" -qopt-report-phase=all" + FLAGS_CXX_COMPILER+=" -diag-disable 10397" + + #--------------------------------------------------------------------------# + # Use of "-Wl,--export-dynamic" removes following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). 
+ # Writing out the raw address instead and keeping my fingers crossed. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Wl,--export-dynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O3" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O3" could be + # "-O2" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O3" + + #--------------------------------------------------------------------------# + # Use of "-inline-forceinline" overrides default heuristics of compiler + # and forces inlining of functions marked with inline keyword if compiler + # is able to inline. For VPIC, this option has mainly been used when using + # a portable implementation to force inlining by compiler and also when + # use of "-Winline" option identifies functions not being inlined that are + # marked with inline keyword. 
+ # + # Use of "-qoverride-limits" cause certain internal compiler limits to be + # ignored that are used to limit memory usage and excessive compile times + # by the compiler. + # + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops + # which can be vectorized to always be vectorized, regardless of the + # amount of computational work in the loop. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -inline-forceinline" + #FLAGS_C_COMPILER+=" -vec-threshold0" + FLAGS_C_COMPILER+=" -qoverride-limits" + + #--------------------------------------------------------------------------# + # Use of "-no-ansi-alias" informs compiler that VPIC does not obey ANSI + # aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -no-ansi-alias" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + # + # Use of "-craype-verbose" causes Cray compiler wrapper script to print + # command it is forwarding to actual compiler for invocation. This is very + # useful for producing a build log to make sure compiler is being invoked + # with expected options. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Winline" + FLAGS_C_COMPILER+=" -craype-verbose" + + #--------------------------------------------------------------------------# + # Use of "-qopt-report=5" specifies level of detail in compiler reports. + # This is the maximum level of detail. + # + # Use of "-qopt-report-phase=all" causes all phases of compilation process + # to provide output for compiler reports. 
Compiler reports are useful for + # understanding how compiler is optimizing various parts of VPIC. + # + # Use of "-diag-disable 10397" disables printing of diagnostic message + # that compiler reports are being generated. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -qopt-report=5" + FLAGS_C_COMPILER+=" -qopt-report-phase=all" + FLAGS_C_COMPILER+=" -diag-disable 10397" + + #--------------------------------------------------------------------------# + # Use of "-Wl,--export-dynamic" removes following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Wl,--export-dynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -dynamic" +fi + +#------------------------------------------------------------------------------# +# Configure options for the GNU compilers. +#------------------------------------------------------------------------------# + +if [ "$VCOM" = "GNU" ] +then + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. 
In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O2" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O2" could be + # "-O3" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O2" + + #--------------------------------------------------------------------------# + # Use of "-ffast-math" causes compiler to relax various IEEE or ISO rules + # and specifications for math functions which can result in faster code. + # + # Use of "-fno-unsafe-math-optimizations" turns off some unsafe math + # optimizations that got turned on by use of "-ffast-math" option. Some + # comments in VPIC source code indicate need for this with older compilers. + # This should be checked some time to see if it is still a relevant issue. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -ffast-math" + FLAGS_CXX_COMPILER+=" -fno-unsafe-math-optimizations" + + #--------------------------------------------------------------------------# + # Use of "-fomit-frame-pointer" prevents keeping the frame pointer in a + # register for functions that do not need one. This can make an extra + # register available in many functions and reduce number of overall + # instructions. Some profiling should be done to measure the benefit of + # using this option. 
+ #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -fomit-frame-pointer" + + #--------------------------------------------------------------------------# + # Use of "-fno-strict-aliasing" informs compiler that VPIC does not obey + # ANSI aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -fno-strict-aliasing" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Winline" + + #--------------------------------------------------------------------------# + # Use of "-rdynamic" removes the following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + # + # From g++ man page: Pass the flag -export-dynamic to the ELF linker, on + # targets that support it. This instructs the linker to add all symbols, + # not only used ones, to the dynamic symbol table. This option is needed + # for some uses of "dlopen" or to allow obtaining backtraces from within + # a program. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -rdynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. 
The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use of "-march=knl" or "-march=haswell" causes g++ to generate code + # specific to and optimized for the specific architecture of either KNL + # or Haswell. It appears that the Cray wrappers already do this correctly + # for KNL but it seems they may not for Haswell. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -march=haswell" + + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O2" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O2" could be + # "-O3" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O2" + + #--------------------------------------------------------------------------# + # Use of "-ffast-math" causes compiler to relax various IEEE or ISO rules + # and specifications for math functions which can result in faster code. + # + # Use of "-fno-unsafe-math-optimizations" turns off some unsafe math + # optimizations that got turned on by use of "-ffast-math" option. 
Some + # comments in VPIC source code indicate need for this with older compilers. + # This should be checked some time to see if it is still a relevant issue. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -ffast-math" + FLAGS_C_COMPILER+=" -fno-unsafe-math-optimizations" + + #--------------------------------------------------------------------------# + # Use of "-fomit-frame-pointer" prevents keeping the frame pointer in a + # register for functions that do not need one. This can make an extra + # register available in many functions and reduce number of overall + # instructions. Some profiling should be done to measure the benefit of + # using this option. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -fomit-frame-pointer" + + #--------------------------------------------------------------------------# + # Use of "-fno-strict-aliasing" informs compiler that VPIC does not obey + # ANSI aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -fno-strict-aliasing" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Winline" + + #--------------------------------------------------------------------------# + # Use of "-rdynamic" removes the following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). 
+ # Writing out the raw address instead and keeping my fingers crossed. + # + # From gcc man page: Pass the flag -export-dynamic to the ELF linker, on + # targets that support it. This instructs the linker to add all symbols, + # not only used ones, to the dynamic symbol table. This option is needed + # for some uses of "dlopen" or to allow obtaining backtraces from within + # a program. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -rdynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use of "-march=knl" or "-march=haswell" causes gcc to generate code + # specific to and optimized for the specific architecture of either KNL + # or Haswell. It appears that the Cray wrappers already do this correctly + # for KNL but it seems they may not for Haswell. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -march=haswell" +fi + +#------------------------------------------------------------------------------# +# Configure options for the Cray compilers. 
+#------------------------------------------------------------------------------# + +if [ "$VCOM" = "CCE" ] +then + #--------------------------------------------------------------------------# + # + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O2" + #FLAGS_CXX_COMPILER+=" -hlist=ad" + #FLAGS_CXX_COMPILER+=" -hipa5" + FLAGS_CXX_COMPILER+=" -Wl,--export-dynamic" + #FLAGS_CXX_COMPILER+=" -rdynamic" + FLAGS_CXX_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O2" + #FLAGS_C_COMPILER+=" -hlist=ad" + #FLAGS_C_COMPILER+=" -hipa5" + FLAGS_C_COMPILER+=" -Wl,--export-dynamic" + #FLAGS_C_COMPILER+=" -rdynamic" + FLAGS_C_COMPILER+=" -dynamic" +fi + +#------------------------------------------------------------------------------# +# This ends user configuration section. +# +# No changes required below unless VPIC build system has been extended or the +# module system on ATS-1 machines has changed in some fundamental way. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Configure thread model. +#------------------------------------------------------------------------------# + +if [ "$VTHR" = "PTH" ] +then + SET_OPENMP="OFF" + SET_PTHREADS="ON" +fi + +if [ "$VTHR" = "OMP" ] +then + SET_OPENMP="ON" + SET_PTHREADS="OFF" +fi + +#------------------------------------------------------------------------------# +# Configure particle sort method. 
+#------------------------------------------------------------------------------# + +if [ "$VSORT" = "LSORT" ] +then + SET_LEGACY_SORT="ON" +fi + +#------------------------------------------------------------------------------# +# Make sure the Cray programming environment is configured as the default of +# PrgEnv-intel. +#------------------------------------------------------------------------------# + +if [ "$CRAY_PRGENVGNU" = "loaded" ] +then + module swap PrgEnv-gnu PrgEnv-intel +fi + +if [ "$CRAY_PRGENVCRAY" = "loaded" ] +then + module swap PrgEnv-cray PrgEnv-intel +fi + +#------------------------------------------------------------------------------# +# Configure environment using modules. +#------------------------------------------------------------------------------# +# Note that the user may want to modify the module configuration. +# +# Note that module commands used to define the build environment are captured +# in a Bash script named bashrc.modules which is written into the top level +# build directory. This script can be used in run scripts and other scenarios +# where there is a need to reproduce the environment used to build VPIC. +#------------------------------------------------------------------------------# + +echo '#!/bin/bash' >> bashrc.modules +echo "" >> bashrc.modules + +module load friendly-testing +echo "module load friendly-testing" >> bashrc.modules + +module load sandbox +echo "module load sandbox" >> bashrc.modules + +module load cmake +echo "module load cmake" >> bashrc.modules + +if [ ! "x$VERSION_CMAKE" = "x" ] +then + module swap cmake cmake/$VERSION_CMAKE + echo "module swap cmake cmake/$VERSION_CMAKE" >> bashrc.modules +fi + +module unload craype-hugepages2M +echo "module unload craype-hugepages2M" >> bashrc.modules + +if [ "$VCOM" = "INT" ] +then + if [ ! 
"x$VERSION_INTEL" = "x" ] + then + module swap intel intel/$VERSION_INTEL + echo "module swap intel intel/$VERSION_INTEL" >> bashrc.modules + fi +fi + +if [ "$VCOM" = "GNU" ] +then + module swap PrgEnv-intel PrgEnv-gnu + echo "module swap PrgEnv-intel PrgEnv-gnu" >> bashrc.modules + + if [ ! "x$VERSION_GNU" = "x" ] + then + module swap gcc gcc/$VERSION_GNU + echo "module swap gcc gcc/$VERSION_GNU" >> bashrc.modules + fi +fi + +if [ "$VCOM" = "CCE" ] +then + module swap PrgEnv-intel PrgEnv-cray + echo "module swap PrgEnv-intel PrgEnv-cray" >> bashrc.modules + + if [ ! "x$VERSION_CCE" = "x" ] + then + module swap cce cce/$VERSION_CCE + echo "module swap cce cce/$VERSION_CCE" >> bashrc.modules + fi +fi + +if [ "$VMPI" = "CMPI" ] +then + if [ ! "x$VERSION_CRAY_MPICH" = "x" ] + then + module swap cray-mpich cray-mpich/$VERSION_CRAY_MPICH + echo "module swap cray-mpich cray-mpich/$VERSION_CRAY_MPICH" >> bashrc.modules + fi + + export MPI_ROOT=$MPICH_DIR +fi + +if [ "$VMPI" = "OMPI" ] +then + module unload cray-mpich + echo "module unload cray-mpich" >> bashrc.modules + + module unload cray-libsci + echo "module unload cray-libsci" >> bashrc.modules + + module load openmpi + echo "module load openmpi" >> bashrc.modules + + if [ ! "x$VERSION_OPEN_MPI" = "x" ] + then + module swap openmpi openmpi/$VERSION_OPEN_MPI + echo "module swap openmpi openmpi/$VERSION_OPEN_MPI" >> bashrc.modules + fi +fi + +module list +echo "" >> bashrc.modules +echo "module list" >> bashrc.modules + +#------------------------------------------------------------------------------# +# Call cmake command. +#------------------------------------------------------------------------------# +# Notes: +# +# Use of the "-LAH" command line option to cmake causes cmake to output the +# values of all of its variables. This is useful information when debugging +# a failed build. 
+# +# Note that all of the possible VPIC cmake variables relevant to an ATS-1 +# system are set on the command line so that they can all be conditionally +# configured above through user selections. +#------------------------------------------------------------------------------# + +cmake \ + -LAH \ + -DCMAKE_BUILD_TYPE=$SET_BUILD_TYPE \ + -DENABLE_INTEGRATED_TESTS=$SET_INTEGRATED_TESTS \ + -DENABLE_UNIT_TESTS=$SET_UNIT_TESTS \ + -DENABLE_OPENSSL=$SET_ENABLE_OPENSSL \ + -DDISABLE_DYNAMIC_RESIZING=$SET_DISABLE_DYNAMIC_RESIZING \ + -DSET_MIN_NUM_PARTICLES=$SET_PARTICLE_MIN_NUM \ + -DUSE_LEGACY_SORT=$SET_LEGACY_SORT \ + -DUSE_V4_PORTABLE=$SET_V4_PORTABLE \ + -DUSE_V4_SSE=$SET_V4_SSE \ + -DUSE_V4_AVX=$SET_V4_AVX \ + -DUSE_V4_AVX2=$SET_V4_AVX2 \ + -DUSE_V8_PORTABLE=$SET_V8_PORTABLE \ + -DUSE_V8_AVX=$SET_V8_AVX \ + -DUSE_V8_AVX2=$SET_V8_AVX2 \ + -DUSE_V16_PORTABLE=$SET_V16_PORTABLE \ + -DUSE_V16_AVX512=$SET_V16_AVX512 \ + -DVPIC_PRINT_MORE_DIGITS=$SET_MORE_DIGITS \ + -DUSE_OPENMP=$SET_OPENMP \ + -DUSE_PTHREADS=$SET_PTHREADS \ + -DBUILD_SHARED_LIBS=$SET_SHARED_LIBS \ + -DCMAKE_C_COMPILER=$VPIC_COMPILER_C \ + -DCMAKE_CXX_COMPILER=$VPIC_COMPILER_CXX \ + -DCMAKE_C_FLAGS="$FLAGS_C_COMPILER" \ + -DCMAKE_CXX_FLAGS="$FLAGS_CXX_COMPILER" \ + $src_dir + +#------------------------------------------------------------------------------# +# Call make command. +#------------------------------------------------------------------------------# +# Notes: +# +# In general, it is necessary to call the "make" command within this script +# because the module environment has been configured within this script. +# +# Setting VERBOSE=1 causes "make" to output the commands it is executing. +# This information is useful if debugging a failed build. +# +# If the NJ variable is not defined, "make" will perform a parallel build +# using maximum number of processors on the compilation machine. 
If using +# VERBOSE=1, the verbose output will be garbled by many processes writing +# to STDOUT at the same time and will be difficult to interpret. When using +# VERBOSE=1, it can be helpful to also use NJ=1. +#------------------------------------------------------------------------------# + +make -j $NJ VERBOSE=$SET_VERBOSE + +#------------------------------------------------------------------------------# +# Done. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# vim: syntax=sh +#------------------------------------------------------------------------------# diff --git a/arch/lanl-ats1 b/arch/lanl-ats1-knl similarity index 95% rename from arch/lanl-ats1 rename to arch/lanl-ats1-knl index 46a2b34f..907dfb1b 100755 --- a/arch/lanl-ats1 +++ b/arch/lanl-ats1-knl @@ -1,19 +1,15 @@ #! /usr/bin/env bash #------------------------------------------------------------------------------# # This script supports building VPIC on ATS-1 machines at Los Alamos National -# Laboratory (LANL). These machines run the Cray Linux Environment Operating -# System and have two compute partitions, a Haswell partition and a Knights -# Landing (KNL) partition. Both processor types are Intel processors. These -# machines provide three compiler choices: Intel, GNU and Cray compilers. Two -# MPI implementations are provided: Cray Mpich and Open MPI. +# Laboratory (LANL) for Knights Landing nodes. These machines run the Cray +# Linux Environment Operating System and have two compute partitions, a Haswell +# partition and a Knights Landing (KNL) partition. Both processor types are +# Intel processors. These machines provide three compiler choices: Intel, GNU +# and Cray compilers. Two MPI implementations are provided: Cray Mpich and Open +# MPI. 
# # Normal users should not need to change this script if building VPIC to run # on the KNL nodes of ATS-1 machines and happy with defaults. -# -# If normal users desire to build VPIC to run on the Haswell nodes of ATS-1 -# machines, they will need to change this script in two places: first in the -# section where a node type is chosen and second in the section where the type -# of vector intrinsics used are chosen. #------------------------------------------------------------------------------# #------------------------------------------------------------------------------# @@ -26,23 +22,6 @@ src_dir="${0%/*}/.." # Configure the type of build that we want to perform. #------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -# Choose a processor node type. -#------------------------------------------------------------------------------# -# One of the node types must be chosen. Valid options are the following. -# -# KNL: Knights Landing nodes -# HSW: Haswell nodes -# -# If HSW, for Haswell, is chosen, you must also change the section on vector -# intrinsics support below to turn off support for V16_AVX512. Normally, you -# would also turn on support for V8_AVX2. See the documentation on the vector -# intrinsics section below for more details. -#------------------------------------------------------------------------------# - -KNL="yes" -#HSW="yes" - #------------------------------------------------------------------------------# # Choose type of vector intrinsics support. #------------------------------------------------------------------------------# @@ -471,10 +450,7 @@ then # for KNL processors. 
#--------------------------------------------------------------------------# - if [ "$KNL" = "yes" ] - then - FLAGS_CXX_COMPILER+=" -qopt-zmm-usage=high" - fi + FLAGS_CXX_COMPILER+=" -qopt-zmm-usage=high" #--------------------------------------------------------------------------# # Use "-g" to provide debug symbols in the executable. In general, use of @@ -578,10 +554,7 @@ then # for KNL processors. #--------------------------------------------------------------------------# - if [ "$KNL" = "yes" ] - then - FLAGS_C_COMPILER+=" -qopt-zmm-usage=high" - fi + FLAGS_C_COMPILER+=" -qopt-zmm-usage=high" fi #------------------------------------------------------------------------------# @@ -676,12 +649,7 @@ then # for KNL but it seems they may not for Haswell. #--------------------------------------------------------------------------# - if [ "$KNL" = "yes" ] - then - FLAGS_CXX_COMPILER+=" -march=knl" - else - FLAGS_CXX_COMPILER+=" -march=haswell" - fi + FLAGS_CXX_COMPILER+=" -march=knl" #--------------------------------------------------------------------------# # Use "-g" to provide debug symbols in the executable. In general, use of @@ -769,12 +737,7 @@ then # for KNL but it seems they may not for Haswell. #--------------------------------------------------------------------------# - if [ "$KNL" = "yes" ] - then - FLAGS_C_COMPILER+=" -march=knl" - else - FLAGS_C_COMPILER+=" -march=haswell" - fi + FLAGS_C_COMPILER+=" -march=knl" fi #------------------------------------------------------------------------------# @@ -885,6 +848,9 @@ fi module unload craype-hugepages2M echo "module unload craype-hugepages2M" >> bashrc.modules +module swap craype-haswell craype-mic-knl +echo "module swap craype-haswell craype-mic-knl" >> bashrc.modules + if [ "$VCOM" = "INT" ] then if [ ! 
"x$VERSION_INTEL" = "x" ] @@ -918,12 +884,6 @@ then fi fi -if [ "$KNL" = "yes" ] -then - module swap craype-haswell craype-mic-knl - echo "module swap craype-haswell craype-mic-knl" >> bashrc.modules -fi - if [ "$VMPI" = "CMPI" ] then if [ ! "x$VERSION_CRAY_MPICH" = "x" ] From 3782d9ae5cb710d100cc3c8c441cc0341b30678c Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Tue, 19 Feb 2019 22:58:54 -0700 Subject: [PATCH 11/95] Reorder build configuration options to something that seems more sensible. --- arch/lanl-ats1-hsw | 92 ++++++++++++++++++++++---------------------- arch/lanl-ats1-knl | 92 ++++++++++++++++++++++---------------------- arch/lanl-cts1 | 96 +++++++++++++++++++++++----------------------- 3 files changed, 140 insertions(+), 140 deletions(-) diff --git a/arch/lanl-ats1-hsw b/arch/lanl-ats1-hsw index 2b69ade9..56a18692 100755 --- a/arch/lanl-ats1-hsw +++ b/arch/lanl-ats1-hsw @@ -22,6 +22,52 @@ src_dir="${0%/*}/.." # Configure the type of build that we want to perform. #------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +# Choose a compiler. +#------------------------------------------------------------------------------# +# One of the compiler choices in this section must be chosen. Valid options +# are the following. +# +# INT: Intel compilers +# GNU: GNU compilers +# CCE: Cray compilers +# +# Note that selecting CCE for the Cray compilers currently does not work. The +# main reason why you might want to compile with the Cray compilers is to use +# some of the Cray specific tools like Reveal or a small set of features in +# the CrayPat profiling software. This is not a common use case for users. +#------------------------------------------------------------------------------# + +VCOM="INT" +#VCOM="GNU" +#VCOM="CCE" + +#------------------------------------------------------------------------------# +# Choose an MPI implementation. 
+#------------------------------------------------------------------------------# +# One of the MPI library choices must be chosen. Valid options are the +# following. +# +# CMPI: Cray Mpich, the Cray supported MPI library +# OMPI: Open MPI +#------------------------------------------------------------------------------# + +VMPI="CMPI" +#VMPI="OMPI" + +#------------------------------------------------------------------------------# +# Choose a thread model. +#------------------------------------------------------------------------------# +# One of the two available thread models must be chosen. Valid options are the +# following. +# +# PTH: Pthreads +# OMP: OpenMP +#------------------------------------------------------------------------------# + +VTHR="PTH" +#VTHR="OMP" + #------------------------------------------------------------------------------# # Choose type of vector intrinsics support. #------------------------------------------------------------------------------# @@ -79,52 +125,6 @@ SET_V8_AVX2="ON" #SET_V16_PORTABLE="ON" #SET_V16_AVX512="ON" -#------------------------------------------------------------------------------# -# Choose a compiler. -#------------------------------------------------------------------------------# -# One of the compiler choices in this section must be chosen. Valid options -# are the following. -# -# INT: Intel compilers -# GNU: GNU compilers -# CCE: Cray compilers -# -# Note that selecting CCE for the Cray compilers currently does not work. The -# main reason why you might want to compile with the Cray compilers is to use -# some of the Cray specific tools like Reveal or a small set of features in -# the CrayPat profiling software. This is not a common use case for users. -#------------------------------------------------------------------------------# - -VCOM="INT" -#VCOM="GNU" -#VCOM="CCE" - -#------------------------------------------------------------------------------# -# Choose an MPI implementation. 
-#------------------------------------------------------------------------------# -# One of the MPI library choices must be chosen. Valid options are the -# following. -# -# CMPI: Cray Mpich, the Cray supported MPI library -# OMPI: Open MPI -#------------------------------------------------------------------------------# - -VMPI="CMPI" -#VMPI="OMPI" - -#------------------------------------------------------------------------------# -# Choose a thread model. -#------------------------------------------------------------------------------# -# One of the two available thread models must be chosen. Valid options are the -# following. -# -# PTH: Pthreads -# OMP: OpenMP -#------------------------------------------------------------------------------# - -VTHR="PTH" -#VTHR="OMP" - #------------------------------------------------------------------------------# # Choose format of status update output. #------------------------------------------------------------------------------# diff --git a/arch/lanl-ats1-knl b/arch/lanl-ats1-knl index 907dfb1b..68c2e12a 100755 --- a/arch/lanl-ats1-knl +++ b/arch/lanl-ats1-knl @@ -22,6 +22,52 @@ src_dir="${0%/*}/.." # Configure the type of build that we want to perform. #------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +# Choose a compiler. +#------------------------------------------------------------------------------# +# One of the compiler choices in this section must be chosen. Valid options +# are the following. +# +# INT: Intel compilers +# GNU: GNU compilers +# CCE: Cray compilers +# +# Note that selecting CCE for the Cray compilers currently does not work. The +# main reason why you might want to compile with the Cray compilers is to use +# some of the Cray specific tools like Reveal or a small set of features in +# the CrayPat profiling software. This is not a common use case for users. 
+#------------------------------------------------------------------------------# + +VCOM="INT" +#VCOM="GNU" +#VCOM="CCE" + +#------------------------------------------------------------------------------# +# Choose an MPI implementation. +#------------------------------------------------------------------------------# +# One of the MPI library choices must be chosen. Valid options are the +# following. +# +# CMPI: Cray Mpich, the Cray supported MPI library +# OMPI: Open MPI +#------------------------------------------------------------------------------# + +VMPI="CMPI" +#VMPI="OMPI" + +#------------------------------------------------------------------------------# +# Choose a thread model. +#------------------------------------------------------------------------------# +# One of the two available thread models must be chosen. Valid options are the +# following. +# +# PTH: Pthreads +# OMP: OpenMP +#------------------------------------------------------------------------------# + +VTHR="PTH" +#VTHR="OMP" + #------------------------------------------------------------------------------# # Choose type of vector intrinsics support. #------------------------------------------------------------------------------# @@ -79,52 +125,6 @@ SET_V4_AVX2="ON" #SET_V16_PORTABLE="ON" SET_V16_AVX512="ON" -#------------------------------------------------------------------------------# -# Choose a compiler. -#------------------------------------------------------------------------------# -# One of the compiler choices in this section must be chosen. Valid options -# are the following. -# -# INT: Intel compilers -# GNU: GNU compilers -# CCE: Cray compilers -# -# Note that selecting CCE for the Cray compilers currently does not work. The -# main reason why you might want to compile with the Cray compilers is to use -# some of the Cray specific tools like Reveal or a small set of features in -# the CrayPat profiling software. This is not a common use case for users. 
-#------------------------------------------------------------------------------# - -VCOM="INT" -#VCOM="GNU" -#VCOM="CCE" - -#------------------------------------------------------------------------------# -# Choose an MPI implementation. -#------------------------------------------------------------------------------# -# One of the MPI library choices must be chosen. Valid options are the -# following. -# -# CMPI: Cray Mpich, the Cray supported MPI library -# OMPI: Open MPI -#------------------------------------------------------------------------------# - -VMPI="CMPI" -#VMPI="OMPI" - -#------------------------------------------------------------------------------# -# Choose a thread model. -#------------------------------------------------------------------------------# -# One of the two available thread models must be chosen. Valid options are the -# following. -# -# PTH: Pthreads -# OMP: OpenMP -#------------------------------------------------------------------------------# - -VTHR="PTH" -#VTHR="OMP" - #------------------------------------------------------------------------------# # Choose format of status update output. #------------------------------------------------------------------------------# diff --git a/arch/lanl-cts1 b/arch/lanl-cts1 index 74364294..6e17d194 100755 --- a/arch/lanl-cts1 +++ b/arch/lanl-cts1 @@ -20,6 +20,54 @@ src_dir="${0%/*}/.." # Configure the type of build that we want to perform. #------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +# Choose a compiler. +#------------------------------------------------------------------------------# +# One of the compiler choices in this section must be chosen. Valid options +# are the following. 
+# +# INT: Intel compilers +# GNU: GNU compilers +# PGI: Portland Group compilers, now part of Nvidia +# +# Note that selecting PGI for Portland Group compilers has not been tested +# and probably does not work. +#------------------------------------------------------------------------------# + +VCOM="INT" +#VCOM="GNU" +#VCOM="PGI" + +#------------------------------------------------------------------------------# +# Choose an MPI implementation. +#------------------------------------------------------------------------------# +# One of the MPI library choices must be chosen. Valid options are the +# following. +# +# OMPI: Open MPI, most commonly used MPI implementation on LANL CTS-1 machines +# IMPI: Intel MPI +# +# Choose Intel MPI if you want to use the Intel Application Performance +# Snapshot performance analysis tool to analyze MPI performance of VPIC or +# other Intel analysis tools which provide analysis of MPI usage. +#------------------------------------------------------------------------------# + +VMPI="OMPI" +#VMPI="IMPI" + +#------------------------------------------------------------------------------# +# Choose a thread model. +#------------------------------------------------------------------------------# +# One of the two available thread models must be chosen. Valid options are the +# following. +# +# PTH: Pthreads +# OMP: OpenMP +#------------------------------------------------------------------------------# + +VTHR="PTH" +#VTHR="OMP" + #------------------------------------------------------------------------------# # Choose type of vector intrinsics support. #------------------------------------------------------------------------------# @@ -74,54 +122,6 @@ SET_V4_AVX2="ON" SET_V8_AVX2="ON" #SET_V16_PORTABLE="ON" -#------------------------------------------------------------------------------# -# Choose a compiler. 
-#------------------------------------------------------------------------------# -# One of the compiler choices in this section must be chosen. Valid options -# are the following. -# -# INT: Intel compilers -# GNU: GNU compilers -# PGI: Portland Group compilers, now part of Nvidia -# -# Note that selecting PGI for Portland Group compilers has not been tested -# and probably does not work. -#------------------------------------------------------------------------------# - -VCOM="INT" -#VCOM="GNU" -#VCOM="PGI" - -#------------------------------------------------------------------------------# -# Choose an MPI implementation. -#------------------------------------------------------------------------------# -# One of the MPI library choices must be chosen. Valid options are the -# following. -# -# OMPI: Open MPI, most commonly used MPI implementation on LANL CTS-1 machines -# IMPI: Intel MPI -# -# Choose Intel MPI if you want to use the Intel Application Performance -# Snapshot performance analysis tool to analyze MPI performance of VPIC or -# other Intel analysis tools which provide analysis of MPI usage. -#------------------------------------------------------------------------------# - -VMPI="OMPI" -#VMPI="IMPI" - -#------------------------------------------------------------------------------# -# Choose a thread model. -#------------------------------------------------------------------------------# -# One of the two available thread models must be chosen. Valid options are the -# following. -# -# PTH: Pthreads -# OMP: OpenMP -#------------------------------------------------------------------------------# - -VTHR="PTH" -#VTHR="OMP" - #------------------------------------------------------------------------------# # Choose format of status update output. 
#------------------------------------------------------------------------------# From 7e396dc38d4c5e8339b471ba09d7b41349cc9021 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Thu, 28 Feb 2019 16:35:31 -0700 Subject: [PATCH 12/95] Add more documentation about various available CMake configuration variables. --- README.md | 84 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 68 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 2383a792..e3e634a7 100644 --- a/README.md +++ b/README.md @@ -206,32 +206,84 @@ Currently, the following options are exposed at compile time for the users consi ## Threading Model - - `USE_PTHREADS` (default `ON`): Use Pthreads for the threading model (default enabled) - - `USE_OPENMP`: Use OpenMP for the threading model + - `USE_PTHREADS`: Use Pthreads for threading model, (default `ON`) + - `USE_OPENMP`: Use OpenMP for threading model ## Vectorization - - `USE_V4_SSE`: Enable 4 wide (128-bit) SSE - - `USE_V4_AVX`: Enable 4 wide (128-bit) AVX - - `USE_V4_AVX2`: Enable 4 wide (128-bit) AVX2 - - `USE_V4_ALTIVEC`: Enable 4 wide (128-bit) Altivec - - `USE_V4_PORTABLE`: Enable 4 wide (128-bit) portable implementation +The following CMake variables are used to control the vector implementation that +VPIC uses for each SIMD width. Currently, there is support for 128 bit, 256 bit +and 512 bit SIMD widths. The default is for each of these CMake variables to be +disabled which means that an unvectorized reference implementation of functions +will be used. 
- - `USE_V8_AVX`: Enable 8 wide (256-bit) AVX - - `USE_V8_AVX2`: Enable 8 wide (256-bit) AVX2 - - `USE_V8_PORTABLE`: Enable 8 wide (256-bit) portable implementation + - `USE_V4_SSE`: Enable 4 wide (128-bit) SSE + - `USE_V4_AVX`: Enable 4 wide (128-bit) AVX + - `USE_V4_AVX2`: Enable 4 wide (128-bit) AVX2 + - `USE_V4_ALTIVEC`: Enable 4 wide (128-bit) Altivec + - `USE_V4_PORTABLE`: Enable 4 wide (128-bit) portable implementation - - `USE_V16_AVX512`: Enable 16 wide (512-bit) AVX512 - - `USE_V16_PORTABLE`: Enable 16 wide (512-bit) portable implementation + - `USE_V8_AVX`: Enable 8 wide (256-bit) AVX + - `USE_V8_AVX2`: Enable 8 wide (256-bit) AVX2 + - `USE_V8_PORTABLE`: Enable 8 wide (256-bit) portable implementation -If no combination of these are selected, the "reference" (read: unvectorized) -version of the pusher will be used + - `USE_V16_AVX512`: Enable 16 wide (512-bit) AVX512 + - `USE_V16_PORTABLE`: Enable 16 wide (512-bit) portable implementation -See example decks for how these are used together in combination. +Several functions in VPIC have vector implementations for each of the three SIMD +widths. Some only have a single implementation. An example of the latter is +move_p which only has a reference implementation and a V4 implementation. + +It is possible to have a single CMake vector variable configured as ON for each +of the three supported SIMD vector widths. It is recommended to always have a +CMake variable configured as ON for the 128 bit SIMD vector width so that move_p +will be vectorized. In addition, it is recommended to configure as ON the CMake +variable that is associated with the native SIMD vector width of the processor +that VPIC is targeting. If a CMake variable is configured as ON for each of the +three available SIMD vector widths, then for a given function in VPIC, the +implementation which supports the largest SIMD vector length will be chosen. If +a V16 implementation exists, it will be chosen. 
If a V16 implementation does not +exist but V8 and V4 implementations exist, the V8 implementation will be chosen. +If V16 and V8 implementations do not exist but a V4 implementation does, it will +be chosen. If no SIMD vector implementation exists, the unvectorized reference +implementation will be chosen. + +In summary, when using vector versions on a machine with 256 bit SIMD, the +V4 and V8 implementations should be configured as ON. When using a machine +with 512 bit SIMD, V4 and V16 implementations should be configured as ON. +When choosing a vector implementation for a given SIMD vector length, the +implementation that is closest to the SIMD instruction set for the targeted +processor should be chosen. The portable versions are most commonly used for +debugging the implementation of new intrinsics versions. However, the portable +versions are generally more performant than the unvectorized reference +implemenation. So, one might consider using the V4_PORTABLE version on ARM +processors until a V4_NEON implementation becomes available. ## Output - - `VPIC_PRINT_MORE_DIGITS`: Enable more digits in the debug timing implementation + - `VPIC_PRINT_MORE_DIGITS`: Enable more digits in timing output of status reports + +## Particle sorting implementation + +The CMake variable below allows building VPIC to use the legacy, thread serial +implementation of the particle sort algorithm. + + - `USE_LEGACY_SORT`: Use legacy thread serial particle sort, (default `OFF`) + +The legacy particle sort implementation is the thread serial particle sort +implementation from the legacy v407 version of VPIC. This implementation +supports both in-place and out-of-place sorting of the particles. It is very +competitive with the thread parallel sort implementation for a small number +of threads per MPI rank, i.e. 4 or less, especially on KNL because sorting +the particles in-place allows the fraction of particles stored in High +Bandwidth Memory (HBM) to remain stored in HBM. 
Also, the memory footprint +of VPIC is reduced by the memory of a particle array which can be significant +for particle dominated problems. + +The default particle sort implementation is a thread parallel implementation. +Currently, it can only perform out-of-place sorting of the particles. It will +be more performant than the legacy implementation when using many threads per +MPI rank but uses more memory because of the out-of-place sort. # Workflow From 61ad4721d59fbc380523f11eee510560723c9e65 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 11 Mar 2019 15:46:25 -0600 Subject: [PATCH 13/95] Switch all C files to C++ files unless it is explicit that they need to be C files. --- src/boundary/{absorb_tally.c => absorb_tally.cc} | 0 src/boundary/{boundary.c => boundary.cc} | 0 src/boundary/{link.c => link.cc} | 0 src/boundary/{maxwellian_reflux.c => maxwellian_reflux.cc} | 0 src/collision/{binary.c => binary.cc} | 0 src/collision/{collision.c => collision.cc} | 0 src/collision/{hard_sphere.c => hard_sphere.cc} | 0 src/collision/{langevin.c => langevin.cc} | 0 src/collision/{large_angle_coulomb.c => large_angle_coulomb.cc} | 0 src/collision/pipeline/{binary_pipeline.c => binary_pipeline.cc} | 0 .../pipeline/{langevin_pipeline.c => langevin_pipeline.cc} | 0 src/collision/pipeline/{unary_pipeline.c => unary_pipeline.cc} | 0 src/collision/{unary.c => unary.cc} | 0 src/emitter/{child_langmuir.c => child_langmuir.cc} | 0 src/emitter/{emitter.c => emitter.cc} | 0 src/field_advance/{field_advance.c => field_advance.cc} | 0 src/field_advance/standard/{clean_div_e.c => clean_div_e.cc} | 0 .../standard/{compute_div_e_err.c => compute_div_e_err.cc} | 0 src/field_advance/standard/{compute_rhob.c => compute_rhob.cc} | 0 .../{compute_rms_div_b_err.c => compute_rms_div_b_err.cc} | 0 .../{compute_rms_div_e_err.c => compute_rms_div_e_err.cc} | 0 src/field_advance/standard/{energy_f.c => energy_f.cc} | 0 src/field_advance/standard/{local.c => local.cc} | 0 
.../pipeline/{clean_div_e_pipeline.c => clean_div_e_pipeline.cc} | 0 ...compute_div_e_err_pipeline.c => compute_div_e_err_pipeline.cc} | 0 .../{compute_rhob_pipeline.c => compute_rhob_pipeline.cc} | 0 ...rms_div_b_err_pipeline.c => compute_rms_div_b_err_pipeline.cc} | 0 ...rms_div_e_err_pipeline.c => compute_rms_div_e_err_pipeline.cc} | 0 .../pipeline/{energy_f_pipeline.c => energy_f_pipeline.cc} | 0 ...cuum_clean_div_e_pipeline.c => vacuum_clean_div_e_pipeline.cc} | 0 ..._div_e_err_pipeline.c => vacuum_compute_div_e_err_pipeline.cc} | 0 ...um_compute_rhob_pipeline.c => vacuum_compute_rhob_pipeline.cc} | 0 .../{vacuum_energy_f_pipeline.c => vacuum_energy_f_pipeline.cc} | 0 src/field_advance/standard/{remote.c => remote.cc} | 0 src/field_advance/standard/{sfa.c => sfa.cc} | 0 .../standard/{vacuum_clean_div_e.c => vacuum_clean_div_e.cc} | 0 .../{vacuum_compute_div_e_err.c => vacuum_compute_div_e_err.cc} | 0 .../standard/{vacuum_compute_rhob.c => vacuum_compute_rhob.cc} | 0 .../standard/{vacuum_energy_f.c => vacuum_energy_f.cc} | 0 src/grid/{grid_comm.c => grid_comm.cc} | 0 src/grid/{grid_structors.c => grid_structors.cc} | 0 src/grid/{ops.c => ops.cc} | 0 src/grid/{partition.c => partition.cc} | 0 src/material/{material.c => material.cc} | 0 src/sf_interface/{accumulator_array.c => accumulator_array.cc} | 0 src/sf_interface/{clear_accumulators.c => clear_accumulators.cc} | 0 src/sf_interface/{hydro_array.c => hydro_array.cc} | 0 ...ear_accumulators_pipeline.c => clear_accumulators_pipeline.cc} | 0 src/species_advance/{species_advance.c => species_advance.cc} | 0 src/species_advance/standard/{hydro_p.c => hydro_p.cc} | 0 .../standard/pipeline/{sort_p_pipeline.c => sort_p_pipeline.cc} | 0 src/species_advance/standard/{sort_p.c => sort_p.cc} | 0 src/util/{boot.c => boot.cc} | 0 src/util/checkpt/{checkpt.c => checkpt.cc} | 0 src/util/pipelines/{pipelines_helper.c => pipelines_helper.cc} | 0 src/util/pipelines/{pipelines_serial.c => pipelines_serial.cc} | 0 
src/util/pipelines/{pipelines_thread.c => pipelines_thread.cc} | 0 src/util/profile/{profile.c => profile.cc} | 0 src/util/rng/{drandn_table.c => drandn_table.cc} | 0 src/util/rng/{frandn_table.c => frandn_table.cc} | 0 src/util/rng/{rng.c => rng.cc} | 0 src/util/rng/{rng_pool.c => rng_pool.cc} | 0 src/util/{util_base.c => util_base.cc} | 0 63 files changed, 0 insertions(+), 0 deletions(-) rename src/boundary/{absorb_tally.c => absorb_tally.cc} (100%) rename src/boundary/{boundary.c => boundary.cc} (100%) rename src/boundary/{link.c => link.cc} (100%) rename src/boundary/{maxwellian_reflux.c => maxwellian_reflux.cc} (100%) rename src/collision/{binary.c => binary.cc} (100%) rename src/collision/{collision.c => collision.cc} (100%) rename src/collision/{hard_sphere.c => hard_sphere.cc} (100%) rename src/collision/{langevin.c => langevin.cc} (100%) rename src/collision/{large_angle_coulomb.c => large_angle_coulomb.cc} (100%) rename src/collision/pipeline/{binary_pipeline.c => binary_pipeline.cc} (100%) rename src/collision/pipeline/{langevin_pipeline.c => langevin_pipeline.cc} (100%) rename src/collision/pipeline/{unary_pipeline.c => unary_pipeline.cc} (100%) rename src/collision/{unary.c => unary.cc} (100%) rename src/emitter/{child_langmuir.c => child_langmuir.cc} (100%) rename src/emitter/{emitter.c => emitter.cc} (100%) rename src/field_advance/{field_advance.c => field_advance.cc} (100%) rename src/field_advance/standard/{clean_div_e.c => clean_div_e.cc} (100%) rename src/field_advance/standard/{compute_div_e_err.c => compute_div_e_err.cc} (100%) rename src/field_advance/standard/{compute_rhob.c => compute_rhob.cc} (100%) rename src/field_advance/standard/{compute_rms_div_b_err.c => compute_rms_div_b_err.cc} (100%) rename src/field_advance/standard/{compute_rms_div_e_err.c => compute_rms_div_e_err.cc} (100%) rename src/field_advance/standard/{energy_f.c => energy_f.cc} (100%) rename src/field_advance/standard/{local.c => local.cc} (100%) rename 
src/field_advance/standard/pipeline/{clean_div_e_pipeline.c => clean_div_e_pipeline.cc} (100%) rename src/field_advance/standard/pipeline/{compute_div_e_err_pipeline.c => compute_div_e_err_pipeline.cc} (100%) rename src/field_advance/standard/pipeline/{compute_rhob_pipeline.c => compute_rhob_pipeline.cc} (100%) rename src/field_advance/standard/pipeline/{compute_rms_div_b_err_pipeline.c => compute_rms_div_b_err_pipeline.cc} (100%) rename src/field_advance/standard/pipeline/{compute_rms_div_e_err_pipeline.c => compute_rms_div_e_err_pipeline.cc} (100%) rename src/field_advance/standard/pipeline/{energy_f_pipeline.c => energy_f_pipeline.cc} (100%) rename src/field_advance/standard/pipeline/{vacuum_clean_div_e_pipeline.c => vacuum_clean_div_e_pipeline.cc} (100%) rename src/field_advance/standard/pipeline/{vacuum_compute_div_e_err_pipeline.c => vacuum_compute_div_e_err_pipeline.cc} (100%) rename src/field_advance/standard/pipeline/{vacuum_compute_rhob_pipeline.c => vacuum_compute_rhob_pipeline.cc} (100%) rename src/field_advance/standard/pipeline/{vacuum_energy_f_pipeline.c => vacuum_energy_f_pipeline.cc} (100%) rename src/field_advance/standard/{remote.c => remote.cc} (100%) rename src/field_advance/standard/{sfa.c => sfa.cc} (100%) rename src/field_advance/standard/{vacuum_clean_div_e.c => vacuum_clean_div_e.cc} (100%) rename src/field_advance/standard/{vacuum_compute_div_e_err.c => vacuum_compute_div_e_err.cc} (100%) rename src/field_advance/standard/{vacuum_compute_rhob.c => vacuum_compute_rhob.cc} (100%) rename src/field_advance/standard/{vacuum_energy_f.c => vacuum_energy_f.cc} (100%) rename src/grid/{grid_comm.c => grid_comm.cc} (100%) rename src/grid/{grid_structors.c => grid_structors.cc} (100%) rename src/grid/{ops.c => ops.cc} (100%) rename src/grid/{partition.c => partition.cc} (100%) rename src/material/{material.c => material.cc} (100%) rename src/sf_interface/{accumulator_array.c => accumulator_array.cc} (100%) rename 
src/sf_interface/{clear_accumulators.c => clear_accumulators.cc} (100%) rename src/sf_interface/{hydro_array.c => hydro_array.cc} (100%) rename src/sf_interface/pipeline/{clear_accumulators_pipeline.c => clear_accumulators_pipeline.cc} (100%) rename src/species_advance/{species_advance.c => species_advance.cc} (100%) rename src/species_advance/standard/{hydro_p.c => hydro_p.cc} (100%) rename src/species_advance/standard/pipeline/{sort_p_pipeline.c => sort_p_pipeline.cc} (100%) rename src/species_advance/standard/{sort_p.c => sort_p.cc} (100%) rename src/util/{boot.c => boot.cc} (100%) rename src/util/checkpt/{checkpt.c => checkpt.cc} (100%) rename src/util/pipelines/{pipelines_helper.c => pipelines_helper.cc} (100%) rename src/util/pipelines/{pipelines_serial.c => pipelines_serial.cc} (100%) rename src/util/pipelines/{pipelines_thread.c => pipelines_thread.cc} (100%) rename src/util/profile/{profile.c => profile.cc} (100%) rename src/util/rng/{drandn_table.c => drandn_table.cc} (100%) rename src/util/rng/{frandn_table.c => frandn_table.cc} (100%) rename src/util/rng/{rng.c => rng.cc} (100%) rename src/util/rng/{rng_pool.c => rng_pool.cc} (100%) rename src/util/{util_base.c => util_base.cc} (100%) diff --git a/src/boundary/absorb_tally.c b/src/boundary/absorb_tally.cc similarity index 100% rename from src/boundary/absorb_tally.c rename to src/boundary/absorb_tally.cc diff --git a/src/boundary/boundary.c b/src/boundary/boundary.cc similarity index 100% rename from src/boundary/boundary.c rename to src/boundary/boundary.cc diff --git a/src/boundary/link.c b/src/boundary/link.cc similarity index 100% rename from src/boundary/link.c rename to src/boundary/link.cc diff --git a/src/boundary/maxwellian_reflux.c b/src/boundary/maxwellian_reflux.cc similarity index 100% rename from src/boundary/maxwellian_reflux.c rename to src/boundary/maxwellian_reflux.cc diff --git a/src/collision/binary.c b/src/collision/binary.cc similarity index 100% rename from src/collision/binary.c 
rename to src/collision/binary.cc diff --git a/src/collision/collision.c b/src/collision/collision.cc similarity index 100% rename from src/collision/collision.c rename to src/collision/collision.cc diff --git a/src/collision/hard_sphere.c b/src/collision/hard_sphere.cc similarity index 100% rename from src/collision/hard_sphere.c rename to src/collision/hard_sphere.cc diff --git a/src/collision/langevin.c b/src/collision/langevin.cc similarity index 100% rename from src/collision/langevin.c rename to src/collision/langevin.cc diff --git a/src/collision/large_angle_coulomb.c b/src/collision/large_angle_coulomb.cc similarity index 100% rename from src/collision/large_angle_coulomb.c rename to src/collision/large_angle_coulomb.cc diff --git a/src/collision/pipeline/binary_pipeline.c b/src/collision/pipeline/binary_pipeline.cc similarity index 100% rename from src/collision/pipeline/binary_pipeline.c rename to src/collision/pipeline/binary_pipeline.cc diff --git a/src/collision/pipeline/langevin_pipeline.c b/src/collision/pipeline/langevin_pipeline.cc similarity index 100% rename from src/collision/pipeline/langevin_pipeline.c rename to src/collision/pipeline/langevin_pipeline.cc diff --git a/src/collision/pipeline/unary_pipeline.c b/src/collision/pipeline/unary_pipeline.cc similarity index 100% rename from src/collision/pipeline/unary_pipeline.c rename to src/collision/pipeline/unary_pipeline.cc diff --git a/src/collision/unary.c b/src/collision/unary.cc similarity index 100% rename from src/collision/unary.c rename to src/collision/unary.cc diff --git a/src/emitter/child_langmuir.c b/src/emitter/child_langmuir.cc similarity index 100% rename from src/emitter/child_langmuir.c rename to src/emitter/child_langmuir.cc diff --git a/src/emitter/emitter.c b/src/emitter/emitter.cc similarity index 100% rename from src/emitter/emitter.c rename to src/emitter/emitter.cc diff --git a/src/field_advance/field_advance.c b/src/field_advance/field_advance.cc similarity index 100% 
rename from src/field_advance/field_advance.c rename to src/field_advance/field_advance.cc diff --git a/src/field_advance/standard/clean_div_e.c b/src/field_advance/standard/clean_div_e.cc similarity index 100% rename from src/field_advance/standard/clean_div_e.c rename to src/field_advance/standard/clean_div_e.cc diff --git a/src/field_advance/standard/compute_div_e_err.c b/src/field_advance/standard/compute_div_e_err.cc similarity index 100% rename from src/field_advance/standard/compute_div_e_err.c rename to src/field_advance/standard/compute_div_e_err.cc diff --git a/src/field_advance/standard/compute_rhob.c b/src/field_advance/standard/compute_rhob.cc similarity index 100% rename from src/field_advance/standard/compute_rhob.c rename to src/field_advance/standard/compute_rhob.cc diff --git a/src/field_advance/standard/compute_rms_div_b_err.c b/src/field_advance/standard/compute_rms_div_b_err.cc similarity index 100% rename from src/field_advance/standard/compute_rms_div_b_err.c rename to src/field_advance/standard/compute_rms_div_b_err.cc diff --git a/src/field_advance/standard/compute_rms_div_e_err.c b/src/field_advance/standard/compute_rms_div_e_err.cc similarity index 100% rename from src/field_advance/standard/compute_rms_div_e_err.c rename to src/field_advance/standard/compute_rms_div_e_err.cc diff --git a/src/field_advance/standard/energy_f.c b/src/field_advance/standard/energy_f.cc similarity index 100% rename from src/field_advance/standard/energy_f.c rename to src/field_advance/standard/energy_f.cc diff --git a/src/field_advance/standard/local.c b/src/field_advance/standard/local.cc similarity index 100% rename from src/field_advance/standard/local.c rename to src/field_advance/standard/local.cc diff --git a/src/field_advance/standard/pipeline/clean_div_e_pipeline.c b/src/field_advance/standard/pipeline/clean_div_e_pipeline.cc similarity index 100% rename from src/field_advance/standard/pipeline/clean_div_e_pipeline.c rename to 
src/field_advance/standard/pipeline/clean_div_e_pipeline.cc diff --git a/src/field_advance/standard/pipeline/compute_div_e_err_pipeline.c b/src/field_advance/standard/pipeline/compute_div_e_err_pipeline.cc similarity index 100% rename from src/field_advance/standard/pipeline/compute_div_e_err_pipeline.c rename to src/field_advance/standard/pipeline/compute_div_e_err_pipeline.cc diff --git a/src/field_advance/standard/pipeline/compute_rhob_pipeline.c b/src/field_advance/standard/pipeline/compute_rhob_pipeline.cc similarity index 100% rename from src/field_advance/standard/pipeline/compute_rhob_pipeline.c rename to src/field_advance/standard/pipeline/compute_rhob_pipeline.cc diff --git a/src/field_advance/standard/pipeline/compute_rms_div_b_err_pipeline.c b/src/field_advance/standard/pipeline/compute_rms_div_b_err_pipeline.cc similarity index 100% rename from src/field_advance/standard/pipeline/compute_rms_div_b_err_pipeline.c rename to src/field_advance/standard/pipeline/compute_rms_div_b_err_pipeline.cc diff --git a/src/field_advance/standard/pipeline/compute_rms_div_e_err_pipeline.c b/src/field_advance/standard/pipeline/compute_rms_div_e_err_pipeline.cc similarity index 100% rename from src/field_advance/standard/pipeline/compute_rms_div_e_err_pipeline.c rename to src/field_advance/standard/pipeline/compute_rms_div_e_err_pipeline.cc diff --git a/src/field_advance/standard/pipeline/energy_f_pipeline.c b/src/field_advance/standard/pipeline/energy_f_pipeline.cc similarity index 100% rename from src/field_advance/standard/pipeline/energy_f_pipeline.c rename to src/field_advance/standard/pipeline/energy_f_pipeline.cc diff --git a/src/field_advance/standard/pipeline/vacuum_clean_div_e_pipeline.c b/src/field_advance/standard/pipeline/vacuum_clean_div_e_pipeline.cc similarity index 100% rename from src/field_advance/standard/pipeline/vacuum_clean_div_e_pipeline.c rename to src/field_advance/standard/pipeline/vacuum_clean_div_e_pipeline.cc diff --git 
a/src/field_advance/standard/pipeline/vacuum_compute_div_e_err_pipeline.c b/src/field_advance/standard/pipeline/vacuum_compute_div_e_err_pipeline.cc similarity index 100% rename from src/field_advance/standard/pipeline/vacuum_compute_div_e_err_pipeline.c rename to src/field_advance/standard/pipeline/vacuum_compute_div_e_err_pipeline.cc diff --git a/src/field_advance/standard/pipeline/vacuum_compute_rhob_pipeline.c b/src/field_advance/standard/pipeline/vacuum_compute_rhob_pipeline.cc similarity index 100% rename from src/field_advance/standard/pipeline/vacuum_compute_rhob_pipeline.c rename to src/field_advance/standard/pipeline/vacuum_compute_rhob_pipeline.cc diff --git a/src/field_advance/standard/pipeline/vacuum_energy_f_pipeline.c b/src/field_advance/standard/pipeline/vacuum_energy_f_pipeline.cc similarity index 100% rename from src/field_advance/standard/pipeline/vacuum_energy_f_pipeline.c rename to src/field_advance/standard/pipeline/vacuum_energy_f_pipeline.cc diff --git a/src/field_advance/standard/remote.c b/src/field_advance/standard/remote.cc similarity index 100% rename from src/field_advance/standard/remote.c rename to src/field_advance/standard/remote.cc diff --git a/src/field_advance/standard/sfa.c b/src/field_advance/standard/sfa.cc similarity index 100% rename from src/field_advance/standard/sfa.c rename to src/field_advance/standard/sfa.cc diff --git a/src/field_advance/standard/vacuum_clean_div_e.c b/src/field_advance/standard/vacuum_clean_div_e.cc similarity index 100% rename from src/field_advance/standard/vacuum_clean_div_e.c rename to src/field_advance/standard/vacuum_clean_div_e.cc diff --git a/src/field_advance/standard/vacuum_compute_div_e_err.c b/src/field_advance/standard/vacuum_compute_div_e_err.cc similarity index 100% rename from src/field_advance/standard/vacuum_compute_div_e_err.c rename to src/field_advance/standard/vacuum_compute_div_e_err.cc diff --git a/src/field_advance/standard/vacuum_compute_rhob.c 
b/src/field_advance/standard/vacuum_compute_rhob.cc similarity index 100% rename from src/field_advance/standard/vacuum_compute_rhob.c rename to src/field_advance/standard/vacuum_compute_rhob.cc diff --git a/src/field_advance/standard/vacuum_energy_f.c b/src/field_advance/standard/vacuum_energy_f.cc similarity index 100% rename from src/field_advance/standard/vacuum_energy_f.c rename to src/field_advance/standard/vacuum_energy_f.cc diff --git a/src/grid/grid_comm.c b/src/grid/grid_comm.cc similarity index 100% rename from src/grid/grid_comm.c rename to src/grid/grid_comm.cc diff --git a/src/grid/grid_structors.c b/src/grid/grid_structors.cc similarity index 100% rename from src/grid/grid_structors.c rename to src/grid/grid_structors.cc diff --git a/src/grid/ops.c b/src/grid/ops.cc similarity index 100% rename from src/grid/ops.c rename to src/grid/ops.cc diff --git a/src/grid/partition.c b/src/grid/partition.cc similarity index 100% rename from src/grid/partition.c rename to src/grid/partition.cc diff --git a/src/material/material.c b/src/material/material.cc similarity index 100% rename from src/material/material.c rename to src/material/material.cc diff --git a/src/sf_interface/accumulator_array.c b/src/sf_interface/accumulator_array.cc similarity index 100% rename from src/sf_interface/accumulator_array.c rename to src/sf_interface/accumulator_array.cc diff --git a/src/sf_interface/clear_accumulators.c b/src/sf_interface/clear_accumulators.cc similarity index 100% rename from src/sf_interface/clear_accumulators.c rename to src/sf_interface/clear_accumulators.cc diff --git a/src/sf_interface/hydro_array.c b/src/sf_interface/hydro_array.cc similarity index 100% rename from src/sf_interface/hydro_array.c rename to src/sf_interface/hydro_array.cc diff --git a/src/sf_interface/pipeline/clear_accumulators_pipeline.c b/src/sf_interface/pipeline/clear_accumulators_pipeline.cc similarity index 100% rename from src/sf_interface/pipeline/clear_accumulators_pipeline.c 
rename to src/sf_interface/pipeline/clear_accumulators_pipeline.cc diff --git a/src/species_advance/species_advance.c b/src/species_advance/species_advance.cc similarity index 100% rename from src/species_advance/species_advance.c rename to src/species_advance/species_advance.cc diff --git a/src/species_advance/standard/hydro_p.c b/src/species_advance/standard/hydro_p.cc similarity index 100% rename from src/species_advance/standard/hydro_p.c rename to src/species_advance/standard/hydro_p.cc diff --git a/src/species_advance/standard/pipeline/sort_p_pipeline.c b/src/species_advance/standard/pipeline/sort_p_pipeline.cc similarity index 100% rename from src/species_advance/standard/pipeline/sort_p_pipeline.c rename to src/species_advance/standard/pipeline/sort_p_pipeline.cc diff --git a/src/species_advance/standard/sort_p.c b/src/species_advance/standard/sort_p.cc similarity index 100% rename from src/species_advance/standard/sort_p.c rename to src/species_advance/standard/sort_p.cc diff --git a/src/util/boot.c b/src/util/boot.cc similarity index 100% rename from src/util/boot.c rename to src/util/boot.cc diff --git a/src/util/checkpt/checkpt.c b/src/util/checkpt/checkpt.cc similarity index 100% rename from src/util/checkpt/checkpt.c rename to src/util/checkpt/checkpt.cc diff --git a/src/util/pipelines/pipelines_helper.c b/src/util/pipelines/pipelines_helper.cc similarity index 100% rename from src/util/pipelines/pipelines_helper.c rename to src/util/pipelines/pipelines_helper.cc diff --git a/src/util/pipelines/pipelines_serial.c b/src/util/pipelines/pipelines_serial.cc similarity index 100% rename from src/util/pipelines/pipelines_serial.c rename to src/util/pipelines/pipelines_serial.cc diff --git a/src/util/pipelines/pipelines_thread.c b/src/util/pipelines/pipelines_thread.cc similarity index 100% rename from src/util/pipelines/pipelines_thread.c rename to src/util/pipelines/pipelines_thread.cc diff --git a/src/util/profile/profile.c b/src/util/profile/profile.cc 
similarity index 100% rename from src/util/profile/profile.c rename to src/util/profile/profile.cc diff --git a/src/util/rng/drandn_table.c b/src/util/rng/drandn_table.cc similarity index 100% rename from src/util/rng/drandn_table.c rename to src/util/rng/drandn_table.cc diff --git a/src/util/rng/frandn_table.c b/src/util/rng/frandn_table.cc similarity index 100% rename from src/util/rng/frandn_table.c rename to src/util/rng/frandn_table.cc diff --git a/src/util/rng/rng.c b/src/util/rng/rng.cc similarity index 100% rename from src/util/rng/rng.c rename to src/util/rng/rng.cc diff --git a/src/util/rng/rng_pool.c b/src/util/rng/rng_pool.cc similarity index 100% rename from src/util/rng/rng_pool.c rename to src/util/rng/rng_pool.cc diff --git a/src/util/util_base.c b/src/util/util_base.cc similarity index 100% rename from src/util/util_base.c rename to src/util/util_base.cc From 3a03a8810eb465c64739f9e6e0743168bc71ee33 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Tue, 9 Jul 2019 21:26:28 -0600 Subject: [PATCH 14/95] Format tweaks. 
--- deck/main.cc | 2 +- src/boundary/boundary_p.cc | 726 ++++++++++++------ src/boundary/link.cc | 3 +- src/collision/collision_private.h | 3 +- src/collision/hard_sphere.cc | 101 +-- src/collision/large_angle_coulomb.cc | 107 +-- src/emitter/child_langmuir.cc | 1 - src/species_advance/species_advance.cc | 27 +- src/species_advance/standard/hydro_p.cc | 3 +- src/species_advance/standard/move_p.cc | 107 ++- .../standard/pipeline/advance_p_pipeline.cc | 3 +- .../pipeline/advance_p_pipeline_v16.cc | 121 +-- .../standard/pipeline/center_p_pipeline.cc | 5 +- .../standard/pipeline/spa_private.h | 4 - .../standard/pipeline/uncenter_p_pipeline.cc | 8 +- src/species_advance/standard/rho_p.cc | 8 +- src/species_advance/standard/sort_p.cc | 4 +- src/util/profile/profile.h | 1 + src/util/v16/v16_avx512.h | 18 +- src/util/v4/v4_altivec.h | 42 +- src/util/v4/v4_avx.h | 39 +- src/util/v4/v4_avx2.h | 16 +- src/util/v4/v4_sse.h | 39 +- src/vpic/dump.cc | 3 +- src/vpic/initialize.cc | 1 - src/vpic/misc.cc | 21 +- src/vpic/vpic.h | 6 +- 27 files changed, 898 insertions(+), 521 deletions(-) diff --git a/deck/main.cc b/deck/main.cc index 001baff4..f9f7fb1b 100644 --- a/deck/main.cc +++ b/deck/main.cc @@ -103,7 +103,7 @@ int main(int argc, char** argv) // Do any post init/restore simulation modifications - // Detec if the "modify" option is passed, which allows users to change + // Detect if the "modify" option is passed, which allows users to change // options (such as quota, num_step, etc) when restoring fbase = strip_cmdline_string( &argc, &argv, "--modify", NULL ); if( fbase ) diff --git a/src/boundary/boundary_p.cc b/src/boundary/boundary_p.cc index 25d87b41..a50d6657 100644 --- a/src/boundary/boundary_p.cc +++ b/src/boundary/boundary_p.cc @@ -1,9 +1,11 @@ #define IN_boundary + #include "boundary_private.h" -// If this is defined particle and mover buffers will not resize dynamically -// (This is the common case for the users) -//#define DISABLE_DYNAMIC_RESIZING +// If this is 
defined particle and mover buffers will not resize dynamically. +// This is the common case for the users. + +#define DISABLE_DYNAMIC_RESIZING // FIXME: ARCHITECTURAL FLAW! CUSTOM BCS AND SHARED FACES CANNOT // COEXIST ON THE SAME FACE! THIS MEANS THAT CUSTOM BOUNDARYS MUST @@ -25,6 +27,10 @@ using namespace v4; #endif +#ifdef V8_ACCELERATION +using namespace v8; +#endif + #ifndef MIN_NP #define MIN_NP 128 // Default to 4kb (~1 page worth of memory) //#define MIN_NP 32768 // 32768 particles is 1 MiB of memory. @@ -33,13 +39,15 @@ using namespace v4; enum { MAX_PBC = 32, MAX_SP = 32 }; +// This is the AoS implementation. + void boundary_p( particle_bc_t * RESTRICT pbc_list, species_t * RESTRICT sp_list, field_array_t * RESTRICT fa, - accumulator_array_t * RESTRICT aa ) { - - // Gives the local mp port associated with a local face + accumulator_array_t * RESTRICT aa ) +{ + // Gives the local mp port associated with a local face. static const int f2b[6] = { BOUNDARY(-1, 0, 0), BOUNDARY( 0,-1, 0), BOUNDARY( 0, 0,-1), @@ -47,7 +55,7 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, BOUNDARY( 0, 1, 0), BOUNDARY( 0, 0, 1) }; - // Gives the remote mp port associated with a local face + // Gives the remote mp port associated with a local face. static const int f2rb[6] = { BOUNDARY( 1, 0, 0), BOUNDARY( 0, 1, 0), BOUNDARY( 0, 0, 1), @@ -55,122 +63,178 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, BOUNDARY( 0,-1, 0), BOUNDARY( 0, 0,-1) }; - // Gives the axis associated with a local face - static const int axis[6] = { 0, 1, 2, 0, 1, 2 }; + // Gives the axis associated with a local face. + static const int axis[6] = { 0, 1, 2, 0, 1, 2 }; - // Gives the location of sending face on the receiver + // Gives the location of sending face on the receiver. static const float dir[6] = { 1, 1, 1, -1, -1, -1 }; - // Temporary store for local particle injectors - // FIXME: Ugly static usage + // Temporary store for local particle injectors. + // FIXME: Ugly static usage. 
static particle_injector_t * RESTRICT ALIGNED(16) ci = NULL; + static int max_ci = 0; int n_send[6], n_recv[6], n_ci; species_t * sp; + int face; - // Check input args + // Check input args. - if( !sp_list ) return; // Nothing to do if no species - if( !fa || !aa || sp_list->g!=aa->g || fa->g!=aa->g ) - ERROR(( "Bad args" )); + if ( ! sp_list ) + { + return; // Nothing to do if no species. + } - // Unpack the particle boundary conditions + if ( ! fa || + ! aa || + sp_list->g != aa->g || + fa->g != aa->g ) + { + ERROR( ( "Bad args." ) ); + } + + // Unpack the particle boundary conditions. particle_bc_func_t pbc_interact[MAX_PBC]; + void * pbc_params[MAX_PBC]; + const int nb = num_particle_bc( pbc_list ); - if( nb>MAX_PBC ) ERROR(( "Update this to support more particle boundary conditions" )); - for( particle_bc_t * pbc=pbc_list; pbc; pbc=pbc->next ) { - pbc_interact[-pbc->id-3] = pbc->interact; - pbc_params[ -pbc->id-3] = pbc->params; - } - // Unpack fields + if ( nb > MAX_PBC ) + { + ERROR( ( "Update this to support more particle boundary conditions." ) ); + } + + for( particle_bc_t * pbc = pbc_list; pbc; pbc = pbc->next ) + { + pbc_interact[ -pbc->id - 3 ] = pbc->interact; + pbc_params [ -pbc->id - 3 ] = pbc->params; + } + + // Unpack fields. field_t * RESTRICT ALIGNED(128) f = fa->f; grid_t * RESTRICT g = fa->g; - // Unpack accumulator + // Unpack accumulator. accumulator_t * RESTRICT ALIGNED(128) a0 = aa->a; - // Unpack the grid + // Unpack the grid. 
const int64_t * RESTRICT ALIGNED(128) neighbor = g->neighbor; /**/ mp_t * RESTRICT mp = g->mp; + const int64_t rangel = g->rangel; const int64_t rangeh = g->rangeh; const int64_t rangem = g->range[world_size]; + /*const*/ int bc[6], shared[6]; /*const*/ int64_t range[6]; - for( face=0; face<6; face++ ) { - bc[face] = g->bc[f2b[face]]; - shared[face] = (bc[face]>=0) && (bc[face]range[bc[face]]; - } - // Begin receiving the particle counts + for( face = 0; face < 6; face++ ) + { + bc [ face ] = g->bc[ f2b[ face ] ]; + + shared[ face ] = ( bc[ face ] >= 0 ) && + ( bc[ face ] < world_size ) && + ( bc[ face ] != world_rank ); - for( face=0; face<6; face++ ) - if( shared[face] ) { - mp_size_recv_buffer( mp, f2b[face], sizeof(int) ); - mp_begin_recv( mp, f2b[face], sizeof(int), bc[face], f2rb[face] ); + if ( shared[ face ] ) + { + range[ face ] = g->range[ bc[ face ] ]; } + } - // Load the particle send and local injection buffers + // Begin receiving the particle counts. + + for( face = 0; face < 6; face++ ) + { + if ( shared[ face ] ) + { + mp_size_recv_buffer( mp, + f2b[ face ], + sizeof( int ) ); + + mp_begin_recv( mp, + f2b[ face ], + sizeof( int ), + bc[ face ], + f2rb[ face ] ); + } + } - do { + // Load the particle send and local injection buffers. + do + { particle_injector_t * RESTRICT ALIGNED(16) pi_send[6]; - // Presize the send and injection buffers + // Presize the send and injection buffers. // // Each buffer is large enough to hold one injector corresponding // to every mover in use (worst case, but plausible scenario in // beam simulations, is one buffer gets all the movers). // // FIXME: We could be several times more efficient in our particle - // injector buffer sizing here. Namely, we could create on local - // injector buffer of nm is size. All injection for all - // boundaries would be done here. The local buffer would then be - // counted to determine the size of each send buffer. 
The local - // buffer would then move all injectors into the approate send - // buffers (leaving only the local injectors). This would require - // some extra data motion though. (But would give a more robust - // implementation against variations in MP implementation.) + // injector buffer sizing here. Namely, we could create one local + // injector buffer of nm in size. All injection for all boundaries + // would be done here. The local buffer would then be counted to + // determine the size of each send buffer. The local buffer would + // then move all injectors into the appropriate send buffers, leaving + // only the local injectors. This would require some extra data + // motion though, but would give a more robust implementation against + // variations in MP implementation. // // FIXME: This presizing assumes that custom boundary conditions // inject at most one particle per incident particle. Currently, // the invocation of pbc_interact[*] insures that assumption will - // be satisfied (if the handlers conform that it). We should be - // more flexible though in the future (especially given above the - // above overalloc). + // be satisfied, if the handlers conform that it. We should be + // more flexible though in the future, especially given the above + // overalloc. 
- int nm = 0; LIST_FOR_EACH( sp, sp_list ) nm += sp->nm; + int nm = 0; - for( face=0; face<6; face++ ) - if( shared[face] ) { - mp_size_send_buffer( mp, f2b[face], 16+nm*sizeof(particle_injector_t) ); - pi_send[face] = (particle_injector_t *)(((char *)mp_send_buffer(mp,f2b[face]))+16); - n_send[face] = 0; + LIST_FOR_EACH( sp, sp_list ) nm += sp->nm; + + for( face = 0; face < 6; face++ ) + { + if ( shared[ face ] ) + { + mp_size_send_buffer( mp, + f2b[ face ], + 16 + nm * sizeof( particle_injector_t ) ); + + pi_send[ face ] = (particle_injector_t *) ( ( (char *) mp_send_buffer( mp, + f2b[ face ] ) + ) + 16 ); + + n_send[ face ] = 0; } + } - if( max_ciq; const int32_t sp_id = sp->id; @@ -187,49 +251,70 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, // Note that particle movers for each species are processed in // reverse order. This allows us to backfill holes in the // particle list created by boundary conditions and/or - // communication. This assumes particle on the mover list are + // communication. This assumes particles on the mover list are // monotonically increasing. That is: pm[n].i > pm[n-1].i for // n=1...nm-1. advance_p and inject_particle create movers with // property if all aged particle injection occurs after - // advance_p and before this + // advance_p and before this. - for( ; nm; pm--, nm-- ) { - i = pm->i; - voxel = p0[i].i; - face = voxel & 7; + for( ; nm; pm--, nm-- ) + { + i = pm->i; + voxel = p0[i].i; + face = voxel & 7; voxel >>= 3; p0[i].i = voxel; - nn = neighbor[ 6*voxel + face ]; + nn = neighbor[ 6 * voxel + face ]; - // Absorb + // Absorb. - if( nn==absorb_particles ) { + if ( nn == absorb_particles ) + { // Ideally, we would batch all rhob accumulations together - // for efficiency - accumulate_rhob( f, p0+i, g, sp_q ); + // for efficiency. + accumulate_rhob( f, p0 + i, g, sp_q ); + goto backfill; } - // Send to a neighboring node + // Send to a neighboring node. 
+ + if ( ( ( nn >= 0 ) & ( nn < rangel ) ) | + ( ( nn > rangeh ) & ( nn <= rangem ) ) ) + { + pi = &pi_send[ face ] [ n_send[ face ]++ ]; + + #ifdef V4_ACCELERATION - if( ((nn>=0) & (nn< rangel)) | ((nn>rangeh) & (nn<=rangem)) ) { - pi = &pi_send[face][n_send[face]++]; -# ifdef V4_ACCELERATION copy_4x1( &pi->dx, &p0[i].dx ); copy_4x1( &pi->ux, &p0[i].ux ); copy_4x1( &pi->dispx, &pm->dispx ); -# else - pi->dx=p0[i].dx; pi->dy=p0[i].dy; pi->dz=p0[i].dz; - pi->ux=p0[i].ux; pi->uy=p0[i].uy; pi->uz=p0[i].uz; pi->w=p0[i].w; - pi->dispx = pm->dispx; pi->dispy = pm->dispy; pi->dispz = pm->dispz; -# endif - (&pi->dx)[axis[face]] = dir[face]; - pi->i = nn - range[face]; - pi->sp_id = sp_id; + + #else + + pi->dx = p0[i].dx; + pi->dy = p0[i].dy; + pi->dz = p0[i].dz; + + pi->ux = p0[i].ux; + pi->uy = p0[i].uy; + pi->uz = p0[i].uz; + pi->w = p0[i].w; + + pi->dispx = pm->dispx; + pi->dispy = pm->dispy; + pi->dispz = pm->dispz; + + #endif + + ( &pi->dx )[ axis[ face ] ] = dir[ face ]; + pi->i = nn - range[ face ]; + pi->sp_id = sp_id; + goto backfill; } - // User-defined handling + // User-defined handling. // After a particle interacts with a boundary it is removed // from the local particle list. Thus, if a boundary handler @@ -248,27 +333,45 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, // nothing to rhob. nn = -nn - 3; // Assumes reflective/absorbing are -1, -2 - if( (nn>=0) & (nn= 0 ) & + ( nn < nb ) ) + { + n_ci += pbc_interact[ nn ]( pbc_params[ nn ], + sp, + p0 + i, + pm, + ci + n_ci, + 1, + face ); + goto backfill; } - // Uh-oh: We fell through + // Uh-oh: We fell through. - WARNING(( "Unknown boundary interaction ... dropping particle " - "(species=%s)", sp->name )); + WARNING( ( "Unknown boundary interaction ... 
dropping particle " + "(species=%s)", + sp->name ) ); backfill: np--; -# ifdef V4_ACCELERATION + + #if defined(V8_ACCELERATION) + + copy_8x1( &p0[i].dx, &p0[np].dx ); + + #elif defined(V4_ACCELERATION) + copy_4x1( &p0[i].dx, &p0[np].dx ); copy_4x1( &p0[i].ux, &p0[np].ux ); -# else + + #else + p0[i] = p0[np]; -# endif + #endif } sp->np = np; @@ -289,230 +392,399 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, // equilvanet of a MPI_Getcount to determine how much data you // actually received. - for( face=0; face<6; face++ ) - if( shared[face] ) { - *((int *)mp_send_buffer( mp, f2b[face] )) = n_send[face]; - mp_begin_send( mp, f2b[face], sizeof(int), bc[face], f2b[face] ); + for( face = 0; face < 6; face++ ) + { + if ( shared[ face ] ) + { + *( (int *) mp_send_buffer( mp, + f2b[ face ] ) ) = n_send[ face ]; + + mp_begin_send( mp, + f2b[ face ], + sizeof( int ), + bc[ face ], + f2b[ face ] ); } + } - for( face=0; face<6; face++ ) - if( shared[face] ) { - mp_end_recv( mp, f2b[face] ); - n_recv[face] = *((int *)mp_recv_buffer( mp, f2b[face] )); - mp_size_recv_buffer( mp, f2b[face], - 16+n_recv[face]*sizeof(particle_injector_t) ); - mp_begin_recv( mp, f2b[face], 16+n_recv[face]*sizeof(particle_injector_t), - bc[face], f2rb[face] ); + for( face = 0; face < 6; face++ ) + { + if ( shared[ face ] ) + { + mp_end_recv( mp, + f2b[ face ] ); + + n_recv[ face ] = *( (int *) mp_recv_buffer( mp, + f2b[ face ] ) ); + + mp_size_recv_buffer( mp, + f2b[ face ], + 16 + n_recv[ face ] * sizeof( particle_injector_t ) ); + + mp_begin_recv( mp, + f2b[ face ], + 16 + n_recv[ face ] * sizeof( particle_injector_t ), + bc[ face ], + f2rb[ face ] ); } + } + + for( face = 0; face < 6; face++ ) + { + if ( shared[ face ] ) + { + mp_end_send( mp, + f2b[ face ] ); - for( face=0; face<6; face++ ) - if( shared[face] ) { - mp_end_send( mp, f2b[face] ); // FIXME: ASSUMES MP WON'T MUCK WITH REST OF SEND BUFFER. 
IF WE // DID MORE EFFICIENT MOVER ALLOCATION ABOVE, THIS WOULD BE - // ROBUSTED AGAINST MP IMPLEMENTATION VAGARIES - mp_begin_send( mp, f2b[face], 16+n_send[face]*sizeof(particle_injector_t), - bc[face], f2b[face] ); + // ROBUSTED AGAINST MP IMPLEMENTATION VAGARIES. + + mp_begin_send( mp, + f2b[ face ], + 16 + n_send[ face ] * sizeof( particle_injector_t ), + bc[ face ], + f2b[ face ] ); } + } -# ifndef DISABLE_DYNAMIC_RESIZING - // Resize particle storage to accomodate worst case inject + #ifndef DISABLE_DYNAMIC_RESIZING + // Resize particle storage to accomodate worst case inject. - do { + do + { int n, nm; // Resize each species's particle and mover storage to be large - // enough to guarantee successful injection. (If we broke down + // enough to guarantee successful injection. If we broke down // the n_recv[face] by species before sending it, we could be - // tighter on memory footprint here.) + // tighter on memory footprint here. int max_inj = n_ci; - for( face=0; face<6; face++ ) - if( shared[face] ) max_inj += n_recv[face]; - LIST_FOR_EACH( sp, sp_list ) { + for( face = 0; face < 6; face++ ) + { + if ( shared[ face ] ) + { + max_inj += n_recv[ face ]; + } + } + + LIST_FOR_EACH( sp, sp_list ) + { particle_mover_t * new_pm; - particle_t * new_p; + particle_t * new_p; n = sp->np + max_inj; - if( n>sp->max_np ) { - n += 0.3125*n; // Increase by 31.25% (~<"silver - /**/ // ratio") to minimize resizes (max - /**/ // rate that avoids excessive heap - /**/ // fragmentation) - //float resize_ratio = (float)n/sp->max_np; - WARNING(( "Resizing local %s particle storage from %i to %i", - sp->name, sp->max_np, n )); + + if ( n > sp->max_np ) + { + n += 0.3125 * n; // Increase by 31.25% (~<"silver + /**/ // ratio") to minimize resizes (max + /**/ // rate that avoids excessive heap + /**/ // fragmentation) + + // float resize_ratio = (float)n/sp->max_np; + + WARNING( ( "Resizing local %s particle storage from %i to %i", + sp->name, + sp->max_np, + n ) ); + 
MALLOC_ALIGNED( new_p, n, 128 ); + COPY( new_p, sp->p, sp->np ); + FREE_ALIGNED( sp->p ); - sp->p = new_p, sp->max_np = n; - /*nm = sp->max_nm * resize_ratio; + sp->p = new_p; + sp->max_np = n; + + /* + nm = sp->max_nm * resize_ratio; WARNING(( "Resizing local %s mover storage from %i to %i", sp->name, sp->max_nm, nm )); MALLOC_ALIGNED( new_pm, nm, 128 ); COPY( new_pm, sp->pm, sp->nm ); FREE_ALIGNED( sp->pm ); sp->pm = new_pm; - sp->max_nm = nm;*/ + sp->max_nm = nm; + */ } - else if(sp->max_np > MIN_NP && n < sp->max_np>>1) + + else if( sp->max_np > MIN_NP && + n < sp->max_np >> 1 ) { - n += 0.125*n; // Overallocate by less since this rank is decreasing - if (nmax_np; - WARNING(( "Resizing (shrinking) local %s particle storage from " - "%i to %i", sp->name, sp->max_np, n)); + n += 0.125 * n; // Overallocate by less since this rank is decreasing + + if ( n < MIN_NP ) + { + n = MIN_NP; + } + + // float resize_ratio = (float)n/sp->max_np; + + WARNING( ( "Resizing (shrinking) local %s particle storage from " + "%i to %i", + sp->name, + sp->max_np, + n ) ); + MALLOC_ALIGNED( new_p, n, 128 ); + COPY( new_p, sp->p, sp->np ); + FREE_ALIGNED( sp->p ); - sp->p = new_p, sp->max_np = n; - /*nm = sp->max_nm * resize_ratio; + sp->p = new_p; + sp->max_np = n; + + /* + nm = sp->max_nm * resize_ratio; + WARNING(( "Resizing (shrinking) local %s mover storage from " "%i to %i", sp->name, sp->max_nm, nm)); MALLOC_ALIGNED( new_pm, nm, 128 ); COPY( new_pm, sp->pm, sp->nm ); FREE_ALIGNED( sp->pm ); - sp->pm = new_pm, sp->max_nm = nm;*/ + sp->pm = new_pm, sp->max_nm = nm; + */ } // Feasibly, a vacuum-filled rank may receive a shock and need more movers - // than available from MIN_NP + // than available from MIN_NP. + nm = sp->nm + max_inj; - if( nm>sp->max_nm ) { - nm += 0.3125*nm; // See note above - //float resize_ratio = (float)nm/sp->max_nm; - WARNING(( "This happened. 
Resizing local %s mover storage from " - "%i to %i based on not enough movers", - sp->name, sp->max_nm, nm )); + + if ( nm > sp->max_nm ) + { + nm += 0.3125 * nm; // See note above + + // float resize_ratio = (float)nm/sp->max_nm; + + WARNING( ( "This happened. Resizing local %s mover storage from " + "%i to %i based on not enough movers", + sp->name, + sp->max_nm, + nm ) ); + MALLOC_ALIGNED( new_pm, nm, 128 ); + COPY( new_pm, sp->pm, sp->nm ); + FREE_ALIGNED( sp->pm ); - sp->pm = new_pm; + + sp->pm = new_pm; sp->max_nm = nm; - /*n = sp->max_np * resize_ratio; + /* + n = sp->max_np * resize_ratio; WARNING(( "Resizing local %s particle storage from %i to %i", sp->name, sp->max_np, n )); MALLOC_ALIGNED( new_p, n, 128 ); COPY( new_p, sp->p, sp->np ); FREE_ALIGNED( sp->p ); - sp->p = new_p, sp->max_np = n;*/ + sp->p = new_p, sp->max_np = n; + */ } } } while(0); -# endif + #endif - do { + do + { + // Unpack the species list for random acesss. - // Unpack the species list for random acesss + particle_t * RESTRICT ALIGNED(32) sp_p [ MAX_SP ]; + particle_mover_t * RESTRICT ALIGNED(32) sp_pm[ MAX_SP ]; - particle_t * RESTRICT ALIGNED(32) sp_p[ MAX_SP]; - particle_mover_t * RESTRICT ALIGNED(32) sp_pm[MAX_SP]; - float sp_q[MAX_SP]; - int sp_np[MAX_SP]; - int sp_nm[MAX_SP]; + float sp_q [ MAX_SP ]; + int sp_np[ MAX_SP ]; + int sp_nm[ MAX_SP ]; -# ifdef DISABLE_DYNAMIC_RESIZING + #ifdef DISABLE_DYNAMIC_RESIZING int sp_max_np[64], n_dropped_particles[64]; - int sp_max_nm[64], n_dropped_movers[64]; -# endif + int sp_max_nm[64], n_dropped_movers [64]; + #endif - if( num_species( sp_list ) > MAX_SP ) - ERROR(( "Update this to support more species" )); - LIST_FOR_EACH( sp, sp_list ) { - sp_p[ sp->id ] = sp->p; + if ( num_species( sp_list ) > MAX_SP ) + { + ERROR( ( "Update this to support more species." 
) ); + } + + LIST_FOR_EACH( sp, sp_list ) + { + sp_p [ sp->id ] = sp->p; sp_pm[ sp->id ] = sp->pm; - sp_q[ sp->id ] = sp->q; + sp_q [ sp->id ] = sp->q; sp_np[ sp->id ] = sp->np; sp_nm[ sp->id ] = sp->nm; -# ifdef DISABLE_DYNAMIC_RESIZING - sp_max_np[sp->id]=sp->max_np; n_dropped_particles[sp->id]=0; - sp_max_nm[sp->id]=sp->max_nm; n_dropped_movers[sp->id]=0; -# endif + + #ifdef DISABLE_DYNAMIC_RESIZING + sp_max_np[ sp->id ] = sp->max_np; + sp_max_nm[ sp->id ] = sp->max_nm; + + n_dropped_particles[ sp->id ] = 0; + n_dropped_movers [ sp->id ] = 0; + #endif } // Inject particles. We do custom local injection first to // increase message overlap opportunities. face = 5; - do { + + do + { /**/ particle_t * RESTRICT ALIGNED(32) p; /**/ particle_mover_t * RESTRICT ALIGNED(16) pm; const particle_injector_t * RESTRICT ALIGNED(16) pi; + int np, nm, n, id; - face++; if( face==7 ) face = 0; - if( face==6 ) pi = ci, n = n_ci; - else if( shared[face] ) { - mp_end_recv( mp, f2b[face] ); + face++; + + if ( face == 7 ) + { + face = 0; + } + + if ( face == 6 ) + { + pi = ci; + n = n_ci; + } + + else if ( shared[ face ] ) + { + mp_end_recv( mp, + f2b[ face ] ); + pi = (const particle_injector_t *) - (((char *)mp_recv_buffer(mp,f2b[face]))+16); - n = n_recv[face]; - } else continue; + ( ( (char *) mp_recv_buffer( mp, + f2b[ face ] ) ) + 16 ); + + n = n_recv[ face ]; + } + + else + { + continue; + } // Reverse order injection is done to reduce thrashing of the - // particle list (particles are removed reverse order so the + // particle list. Particles are removed in reverse order so the // overall impact of removal + injection is to keep injected - // particles in order). + // particles in order. // - // WARNING: THIS TRUSTS THAT THE INJECTORS (INCLUDING THOSE - // RECEIVED FROM OTHER NODES) HAVE VALID PARTICLE IDS. + // WARNING: THIS TRUSTS THAT THE INJECTORS, INCLUDING THOSE + // RECEIVED FROM OTHER NODES, HAVE VALID PARTICLE IDS. 
+ + pi += n - 1; - pi += n-1; - for( ; n; pi--, n-- ) { + for( ; n; pi--, n-- ) + { id = pi->sp_id; - p = sp_p[id]; np = sp_np[id]; - pm = sp_pm[id]; nm = sp_nm[id]; - -# ifdef DISABLE_DYNAMIC_RESIZING - if( np>=sp_max_np[id] ) { n_dropped_particles[id]++; continue; } -# endif -# ifdef V4_ACCELERATION - copy_4x1( &p[np].dx, &pi->dx ); - copy_4x1( &p[np].ux, &pi->ux ); -# else - p[np].dx=pi->dx; p[np].dy=pi->dy; p[np].dz=pi->dz; p[np].i=pi->i; - p[np].ux=pi->ux; p[np].uy=pi->uy; p[np].uz=pi->uz; p[np].w=pi->w; -# endif - sp_np[id] = np+1; - -# ifdef DISABLE_DYNAMIC_RESIZING - if( nm>=sp_max_nm[id] ) { n_dropped_movers[id]++; continue; } -# endif -# ifdef V4_ACCELERATION + + p = sp_p [id]; + np = sp_np[id]; + + pm = sp_pm[id]; + nm = sp_nm[id]; + + #ifdef DISABLE_DYNAMIC_RESIZING + if ( np >= sp_max_np[ id ] ) + { + n_dropped_particles[ id ]++; + + continue; + } + #endif + + #ifdef V4_ACCELERATION + + copy_4x1( &p[np].dx, &pi->dx ); + copy_4x1( &p[np].ux, &pi->ux ); + + #else + + p[np].dx = pi->dx; + p[np].dy = pi->dy; + p[np].dz = pi->dz; + p[np].i = pi->i; + + p[np].ux = pi->ux; + p[np].uy = pi->uy; + p[np].uz = pi->uz; + p[np].w = pi->w; + + #endif + + sp_np[id] = np + 1; + + #ifdef DISABLE_DYNAMIC_RESIZING + if ( nm >= sp_max_nm[ id ] ) + { + n_dropped_movers[ id ]++; + + continue; + } + #endif + + #ifdef V4_ACCELERATION + copy_4x1( &pm[nm].dispx, &pi->dispx ); + pm[nm].i = np; -# else - pm[nm].dispx=pi->dispx; pm[nm].dispy=pi->dispy; pm[nm].dispz=pi->dispz; - pm[nm].i=np; -# endif - sp_nm[id] = nm + move_p( p, pm+nm, a0, g, sp_q[id] ); + + #else + + pm[nm].dispx = pi->dispx; + pm[nm].dispy = pi->dispy; + pm[nm].dispz = pi->dispz; + pm[nm].i = np; + + #endif + + sp_nm[id] = nm + move_p( p, pm + nm, a0, g, sp_q[id] ); } - } while(face!=5); - - LIST_FOR_EACH( sp, sp_list ) { -# ifdef DISABLE_DYNAMIC_RESIZING - if( n_dropped_particles[sp->id] ) - WARNING(( "Dropped %i particles from species \"%s\". 
Use a larger " - "local particle allocation in your simulation setup for " - "this species on this node.", - n_dropped_particles[sp->id], sp->name )); - if( n_dropped_movers[sp->id] ) - WARNING(( "%i particles were not completed moved to their final " - "location this timestep for species \"%s\". Use a larger " - "local particle mover buffer in your simulation setup " - "for this species on this node.", - n_dropped_movers[sp->id], sp->name )); -# endif - sp->np=sp_np[sp->id]; - sp->nm=sp_nm[sp->id]; + } while( face != 5 ); + + LIST_FOR_EACH( sp, sp_list ) + { + #ifdef DISABLE_DYNAMIC_RESIZING + if ( n_dropped_particles[ sp->id ] ) + { + WARNING( ( "Dropped %i particles from species \"%s\". Use a larger " + "local particle allocation in your simulation setup for " + "this species on this node.", + n_dropped_particles[ sp->id ], + sp->name ) ); + } + + if ( n_dropped_movers[ sp->id ] ) + { + WARNING( ( "%i particles were not completed moved to their final " + "location this timestep for species \"%s\". 
Use a larger " + "local particle mover buffer in your simulation setup " + "for this species on this node.", + n_dropped_movers[ sp->id ], + sp->name ) ); + } + #endif + + sp->np = sp_np[ sp->id ]; + sp->nm = sp_nm[ sp->id ]; } } while(0); - for( face=0; face<6; face++ ) - if( shared[face] ) mp_end_send(mp,f2b[face]); + for( face = 0; face < 6; face++ ) + { + if ( shared[ face ] ) + { + mp_end_send( mp, + f2b[ face ] ); + } + } } diff --git a/src/boundary/link.cc b/src/boundary/link.cc index 6743cfeb..0e12cec4 100644 --- a/src/boundary/link.cc +++ b/src/boundary/link.cc @@ -24,7 +24,8 @@ link_boundary( link_boundary_t * lb, species_t * sp, particle_injector_t * pi, rng_t * rng, - int face ) { + int face ) +{ static FILE *fp = NULL; int ix, iy, iz; double x, y, z; diff --git a/src/collision/collision_private.h b/src/collision/collision_private.h index 52d6c8e4..2bbef1b6 100644 --- a/src/collision/collision_private.h +++ b/src/collision/collision_private.h @@ -44,7 +44,8 @@ END_C_DECLS /////////////////////////////////////////////////////////////////////////////// // Langevin pipeline interface -typedef struct langevin_pipeline_args { +typedef struct langevin_pipeline_args +{ MEM_PTR( particle_t, 128 ) p; MEM_PTR( rng_t, 128 ) rng[ MAX_PIPELINE ]; float decay; diff --git a/src/collision/hard_sphere.cc b/src/collision/hard_sphere.cc index 966d75c2..922ebfb5 100644 --- a/src/collision/hard_sphere.cc +++ b/src/collision/hard_sphere.cc @@ -2,7 +2,8 @@ /* Private interface *********************************************************/ -typedef struct hard_sphere { +typedef struct hard_sphere +{ float twomu_mi, twomu_mj, Kc; float udx, udy, udz, ut; float ut2, alpha_Kt2ut4, beta_Kt2ut2, gamma_Kt2; @@ -99,7 +100,8 @@ typedef struct hard_sphere { float hard_sphere_fluid_rate_constant( const hard_sphere_t * RESTRICT hs, const species_t * RESTRICT spi, - const particle_t * RESTRICT pi ) { + const particle_t * RESTRICT pi ) +{ static const float gamma = (3.*M_PI-8.)/(24.-6*M_PI); 
float urx = pi->ux - hs->udx; float ury = pi->uy - hs->udy; @@ -116,7 +118,8 @@ hard_sphere_rate_constant( const hard_sphere_t * RESTRICT hs, const species_t * RESTRICT spi, const species_t * RESTRICT spj, const particle_t * RESTRICT pi, - const particle_t * RESTRICT pj ) { + const particle_t * RESTRICT pj ) +{ float urx = pi->ux - pj->ux; float ury = pi->uy - pj->uy; float urz = pi->uz - pj->uz; @@ -236,45 +239,46 @@ hard_sphere_rate_constant( const hard_sphere_t * RESTRICT hs, #define CMOV(a,b) if(t0=1 ); \ - \ - /* There are lots of ways to formulate T vector formation */ \ - /* This has no branches (but uses L1 heavily) */ \ - \ +#define COMPUTE_MOMENTUM_TRANSFER(urx,ury,urz,ax,ay,az,rng) \ + do { \ + float bcs_R, bsn_R, b2_R2, ur, tx, ty, tz, t0, t1, t2, stack[3]; \ + int d0, d1, d2; \ + \ + do { \ + bcs_R = 2*frand_c0(rng) - 1; \ + bsn_R = 2*frand_c0(rng) - 1; \ + b2_R2 = bcs_R*bcs_R + bsn_R*bsn_R; \ + } while( b2_R2>=1 ); \ + \ + /* There are lots of ways to formulate T vector formation */ \ + /* This has no branches (but uses L1 heavily) */ \ + \ t0 = urx*urx; d0=0; d1=1; d2=2; t1=t0; ur = t0; \ t0 = ury*ury; CMOV(d0,1); CMOV(d1,2); CMOV(d2,0); CMOV(t1,t0); ur += t0; \ t0 = urz*urz; CMOV(d0,2); CMOV(d1,0); CMOV(d2,1); ur += t0; \ - ur = sqrtf( ur ); \ - \ - stack[0] = urx; \ - stack[1] = ury; \ - stack[2] = urz; \ - t1 = stack[d1]; \ - t2 = stack[d2]; \ - t0 = 1 / sqrtf( t1*t1 + t2*t2 + FLT_MIN ); \ - stack[d0] = 0; \ - stack[d1] = t0*t2; \ - stack[d2] = -t0*t1; \ - tx = stack[0]; \ - ty = stack[1]; \ - tz = stack[2]; \ - \ - t0 = 1 - b2_R2; \ - t2 = sqrtf( t0 ); \ - t1 = t2*bcs_R*ur; \ - t2 *= bsn_R; \ - \ - ax = (t0*urx - t1*tx) - t2*( ury*tz - urz*ty ); \ - ay = (t0*ury - t1*ty) - t2*( urz*tx - urx*tz ); \ - az = (t0*urz - t1*tz) - t2*( urx*ty - ury*tx ); \ + ur = sqrtf( ur ); \ + \ + stack[0] = urx; \ + stack[1] = ury; \ + stack[2] = urz; \ + t1 = stack[d1]; \ + t2 = stack[d2]; \ + t0 = 1 / sqrtf( t1*t1 + t2*t2 + FLT_MIN ); \ + stack[d0] = 0; \ + 
stack[d1] = t0*t2; \ + stack[d2] = -t0*t1; \ + tx = stack[0]; \ + ty = stack[1]; \ + tz = stack[2]; \ + \ + t0 = 1 - b2_R2; \ + t2 = sqrtf( t0 ); \ + t1 = t2*bcs_R*ur; \ + t2 *= bsn_R; \ + \ + ax = (t0*urx - t1*tx) - t2*( ury*tz - urz*ty ); \ + ay = (t0*ury - t1*ty) - t2*( urz*tx - urx*tz ); \ + az = (t0*urz - t1*tz) - t2*( urx*ty - ury*tx ); \ } while(0) /* It would be nice to preserve redundant rate constant @@ -284,7 +288,8 @@ void hard_sphere_fluid_collision( const hard_sphere_t * RESTRICT hs, const species_t * RESTRICT spi, /**/ particle_t * RESTRICT pi, - /**/ rng_t * RESTRICT rng ) { + /**/ rng_t * RESTRICT rng ) +{ float urx, ury, urz, ax, ay, az, w; urx = pi->ux - hs->udx; @@ -313,7 +318,8 @@ hard_sphere_collision( const hard_sphere_t * RESTRICT hs, /**/ particle_t * RESTRICT pi, /**/ particle_t * RESTRICT pj, /**/ rng_t * RESTRICT rng, - const int type ) { + const int type ) +{ float urx, ury, urz, ax, ay, az, w; urx = pi->ux - pj->ux; @@ -340,12 +346,14 @@ hard_sphere_collision( const hard_sphere_t * RESTRICT hs, #undef CMOV void -checkpt_hard_sphere( const hard_sphere_t * hs ) { +checkpt_hard_sphere( const hard_sphere_t * hs ) +{ CHECKPT( hs, 1 ); } hard_sphere_t * -restore_hard_sphere( void ) { +restore_hard_sphere( void ) +{ hard_sphere_t * hs; RESTORE( hs ); return hs; @@ -365,7 +373,8 @@ hard_sphere_fluid( const char * RESTRICT name, /* Model name */ species_t * RESTRICT sp, /* Species */ const float rsp, /* Species p. radius (LENGTH) */ rng_pool_t * RESTRICT rp, /* Entropy pool */ - const int interval ) { /* How often to apply this */ + const int interval ) /* How often to apply this */ +{ hard_sphere_t * hs; if( n0<0 || kT0<0 || m0<=0 || r0<0 || @@ -401,7 +410,8 @@ hard_sphere( const char * RESTRICT name, /* Model name */ const float rj, /* Species-j p. 
radius (LENGTH) */ rng_pool_t * RESTRICT rp, /* Entropy pool */ const double sample, /* Sampling density */ - const int interval ) { /* How often to apply this */ + const int interval ) /* How often to apply this */ +{ hard_sphere_t * hs; if( !spi || spi->m<=0 || ri<0 || @@ -419,4 +429,3 @@ hard_sphere( const char * RESTRICT name, /* Model name */ (binary_collision_func_t) hard_sphere_collision, hs, spi, spj, rp, sample, interval ); } - diff --git a/src/collision/large_angle_coulomb.cc b/src/collision/large_angle_coulomb.cc index 901ff072..bdae3beb 100644 --- a/src/collision/large_angle_coulomb.cc +++ b/src/collision/large_angle_coulomb.cc @@ -2,7 +2,8 @@ /* Private interface *********************************************************/ -typedef struct large_angle_coulomb { +typedef struct large_angle_coulomb +{ float cc, twomu_mi, twomu_mj, Kc; float udx, udy, udz, ut; float ut2, alpha_Kt2ut4, beta_Kt2ut2, gamma_Kt2; @@ -14,7 +15,8 @@ float large_angle_coulomb_fluid_rate_constant( const large_angle_coulomb_t * RESTRICT lac, const species_t * RESTRICT spi, - const particle_t * RESTRICT pi ) { + const particle_t * RESTRICT pi ) +{ static const float gamma = (3.*M_PI-8.)/(24.-6*M_PI); float urx = pi->ux - lac->udx; float ury = pi->uy - lac->udy; @@ -30,7 +32,8 @@ large_angle_coulomb_rate_constant( const species_t * RESTRICT spi, const species_t * RESTRICT spj, const particle_t * RESTRICT pi, - const particle_t * RESTRICT pj ) { + const particle_t * RESTRICT pj ) +{ float urx = pi->ux - pj->ux; float ury = pi->uy - pj->uy; float urz = pi->uz - pj->uz; @@ -79,48 +82,49 @@ large_angle_coulomb_rate_constant( #define CMOV(a,b) if(t0=1 ); \ - \ - /* There are lots of ways to formulate T vector formation */ \ - /* This has no branches (but uses L1 heavily) */ \ - \ +#define COMPUTE_MOMENTUM_TRANSFER(urx,ury,urz,ax,ay,az,rng) \ + do { \ + float bcs_bmax, bsn_bmax, b2_bmax2, ur2, ur, tx, ty, tz; \ + float t0, t1, t2, stack[3]; \ + int d0, d1, d2; \ + \ + do { \ + bcs_bmax = 
2*frand_c0(rng) - 1; \ + bsn_bmax = 2*frand_c0(rng) - 1; \ + b2_bmax2 = bcs_bmax*bcs_bmax + bsn_bmax*bsn_bmax; \ + } while( b2_bmax2>=1 ); \ + \ + /* There are lots of ways to formulate T vector formation */ \ + /* This has no branches (but uses L1 heavily) */ \ + \ t0 = urx*urx; d0=0; d1=1; d2=2; t1=t0; ur2 = t0; \ t0 = ury*ury; CMOV(d0,1); CMOV(d1,2); CMOV(d2,0); CMOV(t1,t0); ur2 += t0; \ t0 = urz*urz; CMOV(d0,2); CMOV(d1,0); CMOV(d2,1); ur2 += t0; \ - ur = sqrtf( ur2 ); \ - \ - stack[0] = urx; \ - stack[1] = ury; \ - stack[2] = urz; \ - t1 = stack[d1]; \ - t2 = stack[d2]; \ - t0 = 1 / sqrtf( t1*t1 + t2*t2 + FLT_MIN ); \ - stack[d0] = 0; \ - stack[d1] = t0*t2; \ - stack[d2] = -t0*t1; \ - tx = stack[0]; \ - ty = stack[1]; \ - tz = stack[2]; \ - \ - t2 = lac->cc; /* 4 pi eps0 mu c^2 bmax / (qi qj) */ \ - t1 = t2 * ur2; /* B (bmax / b) */ \ - t0 = 1/(1+(t1*t1)*b2_bmax2); /* 1 / ( B^2 + 1 ) */ \ - t2 = t0*t1; /* (B / ( B^2 + 1 ))(bmax / b) */ \ - t1 = t2*bcs_bmax*ur; /* (B / (B^2+1)) cos phi |ur0| */ \ - t2 = t2*bsn_bmax; /* (B / (B^2+1)) sin phi */ \ - \ - ax = (t0*urx - t1*tx) - t2*( ury*tz - urz*ty ); \ - ay = (t0*ury - t1*ty) - t2*( urz*tx - urx*tz ); \ - az = (t0*urz - t1*tz) - t2*( urx*ty - ury*tx ); \ + ur = sqrtf( ur2 ); \ + \ + stack[0] = urx; \ + stack[1] = ury; \ + stack[2] = urz; \ + t1 = stack[d1]; \ + t2 = stack[d2]; \ + t0 = 1 / sqrtf( t1*t1 + t2*t2 + FLT_MIN ); \ + stack[d0] = 0; \ + stack[d1] = t0*t2; \ + stack[d2] = -t0*t1; \ + tx = stack[0]; \ + ty = stack[1]; \ + tz = stack[2]; \ + \ + t2 = lac->cc; /* 4 pi eps0 mu c^2 bmax / (qi qj) */ \ + t1 = t2 * ur2; /* B (bmax / b) */ \ + t0 = 1/(1+(t1*t1)*b2_bmax2); /* 1 / ( B^2 + 1 ) */ \ + t2 = t0*t1; /* (B / ( B^2 + 1 ))(bmax / b) */ \ + t1 = t2*bcs_bmax*ur; /* (B / (B^2+1)) cos phi |ur0| */ \ + t2 = t2*bsn_bmax; /* (B / (B^2+1)) sin phi */ \ + \ + ax = (t0*urx - t1*tx) - t2*( ury*tz - urz*ty ); \ + ay = (t0*ury - t1*ty) - t2*( urz*tx - urx*tz ); \ + az = (t0*urz - t1*tz) - t2*( urx*ty - ury*tx ); \ } 
while(0) /* It would be nice to preserve redundant rate constant @@ -131,7 +135,8 @@ large_angle_coulomb_fluid_collision( const large_angle_coulomb_t * RESTRICT lac, const species_t * RESTRICT spi, /**/ particle_t * RESTRICT pi, - /**/ rng_t * RESTRICT rng ) { + /**/ rng_t * RESTRICT rng ) +{ float urx, ury, urz, ax, ay, az, w; urx = pi->ux - lac->udx; @@ -161,7 +166,8 @@ large_angle_coulomb_collision( /**/ particle_t * RESTRICT pi, /**/ particle_t * RESTRICT pj, /**/ rng_t * RESTRICT rng, - const int type ) { + const int type ) +{ float urx, ury, urz, ax, ay, az, w; urx = pi->ux - pj->ux; @@ -188,12 +194,14 @@ large_angle_coulomb_collision( #undef CMOV void -checkpt_large_angle_coulomb( const large_angle_coulomb_t * lac ) { +checkpt_large_angle_coulomb( const large_angle_coulomb_t * lac ) +{ CHECKPT( lac, 1 ); } large_angle_coulomb_t * -restore_large_angle_coulomb( void ) { +restore_large_angle_coulomb( void ) +{ large_angle_coulomb_t * lac; RESTORE( lac ); return lac; @@ -214,7 +222,8 @@ large_angle_coulomb_fluid( species_t * RESTRICT sp, /* Species */ const float bmax, /* Impact parameter cutoff */ rng_pool_t * RESTRICT rp, /* Entropy pool */ - const int interval ) { /* How often to apply this */ + const int interval ) /* How often to apply this */ +{ large_angle_coulomb_t * lac; if( n0<0 || kT0<0 || !q0 || m0<=0 || !sp || !sp->q || sp->m<=0 || bmax<0 ) @@ -255,7 +264,8 @@ large_angle_coulomb( const char * RESTRICT name, /* Model name */ const float bmax, /* Impact parameter cutoff */ rng_pool_t * RESTRICT rp, /* Entropy pool */ const double sample, /* Sampling density */ - const int interval ) { /* How often to apply this */ + const int interval ) /* How often to apply this */ +{ large_angle_coulomb_t * lac; if( !spi || !spi->q || spi->m<=0 || @@ -277,4 +287,3 @@ large_angle_coulomb( const char * RESTRICT name, /* Model name */ (binary_collision_func_t) large_angle_coulomb_collision, lac, spi, spj, rp, sample, interval ); } - diff --git 
a/src/emitter/child_langmuir.cc b/src/emitter/child_langmuir.cc index dd74178a..cbeb5c20 100644 --- a/src/emitter/child_langmuir.cc +++ b/src/emitter/child_langmuir.cc @@ -208,4 +208,3 @@ child_langmuir( /**/ species_t * RESTRICT sp, (restore_func_t)restore_child_langmuir, NULL ); } - diff --git a/src/species_advance/species_advance.cc b/src/species_advance/species_advance.cc index 33ffd435..0e85a646 100644 --- a/src/species_advance/species_advance.cc +++ b/src/species_advance/species_advance.cc @@ -13,7 +13,8 @@ /* Private interface *********************************************************/ void -checkpt_species( const species_t * sp ) { +checkpt_species( const species_t * sp ) +{ CHECKPT( sp, 1 ); CHECKPT_STR( sp->name ); checkpt_data( sp->p, @@ -28,7 +29,8 @@ checkpt_species( const species_t * sp ) { } species_t * -restore_species( void ) { +restore_species( void ) +{ species_t * sp; RESTORE( sp ); RESTORE_STR( sp->name ); @@ -41,7 +43,8 @@ restore_species( void ) { } void -delete_species( species_t * sp ) { +delete_species( species_t * sp ) +{ UNREGISTER_OBJECT( sp ); FREE_ALIGNED( sp->partition ); FREE_ALIGNED( sp->pm ); @@ -53,12 +56,14 @@ delete_species( species_t * sp ) { /* Public interface **********************************************************/ int -num_species( const species_t * sp_list ) { +num_species( const species_t * sp_list ) +{ return sp_list ? 
sp_list->id+1 : 0; } void -delete_species_list( species_t * sp_list ) { +delete_species_list( species_t * sp_list ) +{ species_t * sp; while( sp_list ) { sp = sp_list; @@ -69,7 +74,8 @@ delete_species_list( species_t * sp_list ) { species_t * find_species_id( species_id id, - species_t * sp_list ) { + species_t * sp_list ) +{ species_t * sp; LIST_FIND_FIRST( sp, sp_list, sp->id==id ); return sp; @@ -77,7 +83,8 @@ find_species_id( species_id id, species_t * find_species_name( const char * name, - species_t * sp_list ) { + species_t * sp_list ) +{ species_t * sp; if( !name ) return NULL; LIST_FIND_FIRST( sp, sp_list, strcmp( sp->name, name )==0 ); @@ -86,7 +93,8 @@ find_species_name( const char * name, species_t * append_species( species_t * sp, - species_t ** sp_list ) { + species_t ** sp_list ) +{ if( !sp || !sp_list ) ERROR(( "Bad args" )); if( sp->next ) ERROR(( "Species \"%s\" already in a list", sp->name )); if( find_species_name( sp->name, *sp_list ) ) @@ -107,7 +115,8 @@ species( const char * name, size_t max_local_nm, int sort_interval, int sort_out_of_place, - grid_t * g ) { + grid_t * g ) +{ species_t * sp; int len = name ? 
strlen(name) : 0; diff --git a/src/species_advance/standard/hydro_p.cc b/src/species_advance/standard/hydro_p.cc index f85a79a2..f81f989f 100644 --- a/src/species_advance/standard/hydro_p.cc +++ b/src/species_advance/standard/hydro_p.cc @@ -26,7 +26,8 @@ void accumulate_hydro_p( hydro_array_t * RESTRICT ha, const species_t * RESTRICT sp, - const interpolator_array_t * RESTRICT ia ) { + const interpolator_array_t * RESTRICT ia ) +{ /**/ hydro_t * RESTRICT ALIGNED(128) h; const particle_t * RESTRICT ALIGNED(128) p; const interpolator_t * RESTRICT ALIGNED(128) f; diff --git a/src/species_advance/standard/move_p.cc b/src/species_advance/standard/move_p.cc index dfba3785..507d1807 100644 --- a/src/species_advance/standard/move_p.cc +++ b/src/species_advance/standard/move_p.cc @@ -12,7 +12,7 @@ // position is updated to the point where the particle interacted and // m->dispx, m->dispy, m->dispz contains the remaining particle // displacement. The displacements are the physical displacments -// normalized current cell size. +// normalized by the current cell size. // // Because move_p is frequently called, it does not check its input // arguments. 
Higher level routines are responsible for insuring valid @@ -32,10 +32,10 @@ move_p( particle_t * RESTRICT ALIGNED(128) p, particle_mover_t * RESTRICT ALIGNED(16) pm, accumulator_t * RESTRICT ALIGNED(128) a, const grid_t * g, - const float qsp ) { - - /*const*/ v4float one( 1.f ); - /*const*/ v4float tiny( 1e-37f ); + const float qsp ) +{ + /*const*/ v4float one( 1.0f ); + /*const*/ v4float tiny( 1.0e-37f ); /*const*/ v4int sign_bits( 1<<31 ); v4float dr, r, u, q, q3; @@ -217,19 +217,21 @@ move_p( particle_t * ALIGNED(128) p0, particle_mover_t * ALIGNED(16) pm, accumulator_t * ALIGNED(128) a0, const grid_t * g, - const float qsp ) { + const float qsp ) +{ float s_midx, s_midy, s_midz; float s_dispx, s_dispy, s_dispz; float s_dir[3]; float v0, v1, v2, v3, v4, v5, q; int axis, face; int64_t neighbor; - float *a; + float * a; particle_t * ALIGNED(32) p = p0 + pm->i; - q = qsp*p->w; + q = qsp * p->w; - for(;;) { + for( ;; ) + { s_midx = p->dx; s_midy = p->dy; s_midz = p->dz; @@ -238,15 +240,15 @@ move_p( particle_t * ALIGNED(128) p0, s_dispy = pm->dispy; s_dispz = pm->dispz; - s_dir[0] = (s_dispx>0.0f) ? 1.0f : -1.0f; - s_dir[1] = (s_dispy>0.0f) ? 1.0f : -1.0f; - s_dir[2] = (s_dispz>0.0f) ? 1.0f : -1.0f; + s_dir[0] = ( s_dispx > 0.0f ) ? 1.0f : -1.0f; + s_dir[1] = ( s_dispy > 0.0f ) ? 1.0f : -1.0f; + s_dir[2] = ( s_dispz > 0.0f ) ? 1.0f : -1.0f; - // Compute the twice the fractional distance to each potential + // Compute twice the fractional distance to each potential // streak/cell face intersection. - v0 = (s_dispx==0.0f) ? 3.4e38f : (s_dir[0]-s_midx)/s_dispx; - v1 = (s_dispy==0.0f) ? 3.4e38f : (s_dir[1]-s_midy)/s_dispy; - v2 = (s_dispz==0.0f) ? 3.4e38f : (s_dir[2]-s_midz)/s_dispz; + v0 = ( s_dispx == 0.0f ) ? 3.4e38f : ( s_dir[0] - s_midx ) / s_dispx; + v1 = ( s_dispy == 0.0f ) ? 3.4e38f : ( s_dir[1] - s_midy ) / s_dispy; + v2 = ( s_dispz == 0.0f ) ? 3.4e38f : ( s_dir[2] - s_midz ) / s_dispz; // Determine the fractional length and axis of current streak. 
The // streak ends on either the first face intersected by the @@ -254,16 +256,17 @@ move_p( particle_t * ALIGNED(128) p0, // // axis 0,1 or 2 ... streak ends on a x,y or z-face respectively // axis 3 ... streak ends at end of the particle track - /**/ v3=2.0f, axis=3; - if(v0i); -# define accumulate_j(X,Y,Z) \ + v5 = q * s_dispx * s_dispy * s_dispz * ( 1.0 / 3.0 ); + + a = (float *) ( a0 + p->i ); + + #define accumulate_j(X,Y,Z) \ v4 = q*s_disp##X; /* v2 = q ux */ \ v1 = v4*s_mid##Y; /* v1 = q ux dy */ \ v0 = v4-v1; /* v0 = q ux (1-dy) */ \ @@ -292,24 +297,31 @@ move_p( particle_t * ALIGNED(128) p0, a[1] += v1; \ a[2] += v2; \ a[3] += v3 + accumulate_j(x,y,z); a += 4; accumulate_j(y,z,x); a += 4; accumulate_j(z,x,y); -# undef accumulate_j - // Compute the remaining particle displacment + #undef accumulate_j + + // Compute the remaining particle displacment. pm->dispx -= s_dispx; pm->dispy -= s_dispy; pm->dispz -= s_dispz; - // Compute the new particle offset - p->dx += s_dispx+s_dispx; - p->dy += s_dispy+s_dispy; - p->dz += s_dispz+s_dispz; + // Compute the new particle offset. + p->dx += s_dispx + s_dispx; + p->dy += s_dispy + s_dispy; + p->dz += s_dispz + s_dispz; - // If an end streak, return success (should be ~50% of the time) + // If an end streak, return success (should be ~50% of the time). This + // is the case where the particle moves to a voxel located within the + // same MPI domain. - if( axis==3 ) break; + if ( axis == 3 ) + { + break; + } // Determine if the particle crossed into a local cell or if it // hit a boundary and convert the coordinate system accordingly. @@ -319,25 +331,38 @@ move_p( particle_t * ALIGNED(128) p0, // +/-1 _exactly_ for the particle. v0 = s_dir[axis]; - (&(p->dx))[axis] = v0; // Avoid roundoff fiascos--put the particle - // _exactly_ on the boundary. 
- face = axis; if( v0>0 ) face += 3; - neighbor = g->neighbor[ 6*p->i + face ]; - if( UNLIKELY( neighbor==reflect_particles ) ) { + ( &( p->dx ) )[axis] = v0; // Avoid roundoff fiascos--put the particle + // _exactly_ on the boundary. + + face = axis; + + if ( v0 > 0 ) + { + face += 3; + } + + neighbor = g->neighbor[ 6 * p->i + face ]; + + if ( UNLIKELY( neighbor == reflect_particles ) ) + { // Hit a reflecting boundary condition. Reflect the particle // momentum and remaining displacement and keep moving the // particle. - (&(p->ux ))[axis] = -(&(p->ux ))[axis]; - (&(pm->dispx))[axis] = -(&(pm->dispx))[axis]; + ( &( p->ux ) )[axis] = - ( &( p->ux ) )[axis]; + ( &( pm->dispx ) )[axis] = - ( &( pm->dispx ) )[axis]; + continue; } - if( UNLIKELY( neighborrangel || neighbor>g->rangeh ) ) { + if ( UNLIKELY( neighbor < g->rangel || + neighbor > g->rangeh ) ) + { // Cannot handle the boundary condition here. Save the updated // particle position, face it hit and update the remaining // displacement in the particle mover. - p->i = 8*p->i + face; + p->i = 8 * p->i + face; + return 1; // Return "mover still in use" } @@ -346,7 +371,7 @@ move_p( particle_t * ALIGNED(128) p0, p->i = neighbor - g->rangel; // Compute local index of neighbor /**/ // Note: neighbor - g->rangel < 2^31 / 6 - (&(p->dx))[axis] = -v0; // Convert coordinate system + ( &( p->dx ) )[axis] = - v0; // Convert coordinate system } return 0; // Return "mover not in use" diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline.cc b/src/species_advance/standard/pipeline/advance_p_pipeline.cc index 8dde6f27..3cdc4d10 100644 --- a/src/species_advance/standard/pipeline/advance_p_pipeline.cc +++ b/src/species_advance/standard/pipeline/advance_p_pipeline.cc @@ -13,7 +13,8 @@ //----------------------------------------------------------------------------// // Reference implementation for an advance_p pipeline function which does not -// make use of explicit calls to vector intrinsic functions. 
+// make use of explicit calls to vector intrinsic functions. This is the AoS +// version. //----------------------------------------------------------------------------// void diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline_v16.cc b/src/species_advance/standard/pipeline/advance_p_pipeline_v16.cc index ef6f8b1a..49ea867e 100644 --- a/src/species_advance/standard/pipeline/advance_p_pipeline_v16.cc +++ b/src/species_advance/standard/pipeline/advance_p_pipeline_v16.cc @@ -254,7 +254,7 @@ advance_p_pipeline_v16( advance_p_pipeline_args_t * args, dy = v01; dz = v02; - v13 = q*ux*uy*uz*one_third; // Charge conservation correction + v15 = q*ux*uy*uz*one_third; // Charge conservation correction //-------------------------------------------------------------------------- // Set current density accumulation pointers. @@ -279,67 +279,82 @@ advance_p_pipeline_v16( advance_p_pipeline_args_t * args, //-------------------------------------------------------------------------- // Accumulate current density. //-------------------------------------------------------------------------- - // Accumulate Jx for 16 particles into the v0-v3 vectors. - v12 = q*ux; // v12 = q ux - v01 = v12*dy; // v01 = q ux dy - v00 = v12-v01; // v00 = q ux (1-dy) - v01 += v12; // v01 = q ux (1+dy) - v12 = one+dz; // v12 = 1+dz - v02 = v00*v12; // v02 = q ux (1-dy)(1+dz) - v03 = v01*v12; // v03 = q ux (1+dy)(1+dz) - v12 = one-dz; // v12 = 1-dz - v00 *= v12; // v00 = q ux (1-dy)(1-dz) - v01 *= v12; // v01 = q ux (1+dy)(1-dz) - v00 += v13; // v00 = q ux [ (1-dy)(1-dz) + uy*uz/3 ] - v01 -= v13; // v01 = q ux [ (1+dy)(1-dz) - uy*uz/3 ] - v02 -= v13; // v02 = q ux [ (1-dy)(1+dz) - uy*uz/3 ] - v03 += v13; // v03 = q ux [ (1+dy)(1+dz) + uy*uz/3 ] - - // Accumulate Jy for 16 particles into the v4-v7 vectors. 
- v12 = q*uy; // v12 = q uy - v05 = v12*dz; // v05 = q uy dz - v04 = v12-v05; // v04 = q uy (1-dz) - v05 += v12; // v05 = q uy (1+dz) - v12 = one+dx; // v12 = 1+dx - v06 = v04*v12; // v06 = q uy (1-dz)(1+dx) - v07 = v05*v12; // v07 = q uy (1+dz)(1+dx) - v12 = one-dx; // v12 = 1-dx - v04 *= v12; // v04 = q uy (1-dz)(1-dx) - v05 *= v12; // v05 = q uy (1+dz)(1-dx) - v04 += v13; // v04 = q uy [ (1-dz)(1-dx) + ux*uz/3 ] - v05 -= v13; // v05 = q uy [ (1+dz)(1-dx) - ux*uz/3 ] - v06 -= v13; // v06 = q uy [ (1-dz)(1+dx) - ux*uz/3 ] - v07 += v13; // v07 = q uy [ (1+dz)(1+dx) + ux*uz/3 ] - - // Accumulate Jz for 16 particles into the v8-v11 vectors. - v12 = q*uz; // v12 = q uz - v09 = v12*dx; // v09 = q uz dx - v08 = v12-v09; // v08 = q uz (1-dx) - v09 += v12; // v09 = q uz (1+dx) - v12 = one+dy; // v12 = 1+dy - v10 = v08*v12; // v10 = q uz (1-dx)(1+dy) - v11 = v09*v12; // v11 = q uz (1+dx)(1+dy) - v12 = one-dy; // v12 = 1-dy - v08 *= v12; // v08 = q uz (1-dx)(1-dy) - v09 *= v12; // v09 = q uz (1+dx)(1-dy) - v08 += v13; // v08 = q uz [ (1-dx)(1-dy) + ux*uy/3 ] - v09 -= v13; // v09 = q uz [ (1+dx)(1-dy) - ux*uy/3 ] - v10 -= v13; // v10 = q uz [ (1-dx)(1+dy) - ux*uy/3 ] - v11 += v13; // v11 = q uz [ (1+dx)(1+dy) + ux*uy/3 ] - - // Zero the v12-v15 vectors prior to transposing the data. + // Accumulate Jx for 16 particles into the v0 - v3 vectors. 
+ + v12 = q * ux; // v12 = q ux + v01 = v12 * dy; // v01 = q ux dy + v00 = v12 - v01; // v00 = q ux (1-dy) + v01 += v12; // v01 = q ux (1+dy) + + v13 = one + dz; // v13 = 1+dz + v02 = v00 * v13; // v02 = q ux (1-dy)(1+dz) + v03 = v01 * v13; // v03 = q ux (1+dy)(1+dz) + + v14 = one - dz; // v14 = 1-dz + v00 *= v14; // v00 = q ux (1-dy)(1-dz) + v01 *= v14; // v01 = q ux (1+dy)(1-dz) + + v00 += v15; // v00 = q ux [ (1-dy)(1-dz) + uy*uz/3 ] + v01 -= v15; // v01 = q ux [ (1+dy)(1-dz) - uy*uz/3 ] + v02 -= v15; // v02 = q ux [ (1-dy)(1+dz) - uy*uz/3 ] + v03 += v15; // v03 = q ux [ (1+dy)(1+dz) + uy*uz/3 ] + + // Accumulate Jy for 16 particles into the v4 - v7 vectors. + + v12 = q * uy; // v12 = q uy + v05 = v12 * dz; // v05 = q uy dz + v04 = v12 - v05; // v04 = q uy (1-dz) + v05 += v12; // v05 = q uy (1+dz) + + v13 = one + dx; // v13 = 1+dx + v06 = v04 * v13; // v06 = q uy (1-dz)(1+dx) + v07 = v05 * v13; // v07 = q uy (1+dz)(1+dx) + + v14 = one - dx; // v14 = 1-dx + v04 *= v14; // v04 = q uy (1-dz)(1-dx) + v05 *= v14; // v05 = q uy (1+dz)(1-dx) + + v04 += v15; // v04 = q uy [ (1-dz)(1-dx) + ux*uz/3 ] + v05 -= v15; // v05 = q uy [ (1+dz)(1-dx) - ux*uz/3 ] + v06 -= v15; // v06 = q uy [ (1-dz)(1+dx) - ux*uz/3 ] + v07 += v15; // v07 = q uy [ (1+dz)(1+dx) + ux*uz/3 ] + + // Accumulate Jz for 16 particles into the v8 - v11 vectors. 
+ + v12 = q * uz; // v12 = q uz + v09 = v12 * dx; // v09 = q uz dx + v08 = v12 - v09; // v08 = q uz (1-dx) + v09 += v12; // v09 = q uz (1+dx) + + v13 = one + dy; // v13 = 1+dy + v10 = v08 * v13; // v10 = q uz (1-dx)(1+dy) + v11 = v09 * v13; // v11 = q uz (1+dx)(1+dy) + + v14 = one - dy; // v14 = 1-dy + v08 *= v14; // v08 = q uz (1-dx)(1-dy) + v09 *= v14; // v09 = q uz (1+dx)(1-dy) + + v08 += v15; // v08 = q uz [ (1-dx)(1-dy) + ux*uy/3 ] + v09 -= v15; // v09 = q uz [ (1+dx)(1-dy) - ux*uy/3 ] + v10 -= v15; // v10 = q uz [ (1-dx)(1+dy) - ux*uy/3 ] + v11 += v15; // v11 = q uz [ (1+dx)(1+dy) + ux*uy/3 ] + + // Zero the v12 - v15 vectors prior to transposing the data. + v12 = 0.0; v13 = 0.0; v14 = 0.0; v15 = 0.0; - // Transpose the data in vectors v0-v15 so it can be added into the + // Transpose the data in vectors v0 - v15 so it can be added into the // accumulator arrays using vector operations. + transpose( v00, v01, v02, v03, v04, v05, v06, v07, v08, v09, v10, v11, v12, v13, v14, v15 ); // Add the contributions to Jx, Jy and Jz from 16 particles into the // accumulator arrays for Jx, Jy and Jz. + increment_16x1( vp00, v00 ); increment_16x1( vp01, v01 ); increment_16x1( vp02, v02 ); @@ -362,7 +377,7 @@ advance_p_pipeline_v16( advance_p_pipeline_args_t * args, // particles. 
//-------------------------------------------------------------------------- -# define MOVE_OUTBND(N) \ + #define MOVE_OUTBND(N) \ if ( outbnd(N) ) /* Unlikely */ \ { \ local_pm->dispx = ux(N); \ @@ -399,7 +414,7 @@ advance_p_pipeline_v16( advance_p_pipeline_args_t * args, MOVE_OUTBND(14); MOVE_OUTBND(15); -# undef MOVE_OUTBND + #undef MOVE_OUTBND } args->seg[pipeline_rank].pm = pm; diff --git a/src/species_advance/standard/pipeline/center_p_pipeline.cc b/src/species_advance/standard/pipeline/center_p_pipeline.cc index 64b8ea32..bec2cdcd 100644 --- a/src/species_advance/standard/pipeline/center_p_pipeline.cc +++ b/src/species_advance/standard/pipeline/center_p_pipeline.cc @@ -112,7 +112,7 @@ center_p_pipeline( species_t * RESTRICT sp, !ia || sp->g != ia->g ) { - ERROR( ( "Bad args" ) ); + ERROR( ( "Bad args." ) ); } // Have the pipelines do the bulk of particles in blocks and have the @@ -120,9 +120,10 @@ center_p_pipeline( species_t * RESTRICT sp, args->p0 = sp->p; args->f0 = ia->i; - args->qdt_2mc = (sp->q*sp->g->dt)/(2*sp->m*sp->g->cvac); + args->qdt_2mc = ( sp->q * sp->g->dt ) / ( 2 * sp->m * sp->g->cvac ); args->np = sp->np; EXEC_PIPELINES( center_p, args, 0 ); + WAIT_PIPELINES(); } diff --git a/src/species_advance/standard/pipeline/spa_private.h b/src/species_advance/standard/pipeline/spa_private.h index ed95881e..8f56a622 100644 --- a/src/species_advance/standard/pipeline/spa_private.h +++ b/src/species_advance/standard/pipeline/spa_private.h @@ -43,7 +43,6 @@ typedef struct advance_p_pipeline_args int nz; // z-mesh resolution PAD_STRUCT( 6*SIZEOF_MEM_PTR + 5*sizeof(float) + 5*sizeof(int) ) - } advance_p_pipeline_args_t; // PROTOTYPE_PIPELINE( advance_p, advance_p_pipeline_args_t ); @@ -79,7 +78,6 @@ typedef struct center_p_pipeline_args int np; // Number of particles PAD_STRUCT( 2*SIZEOF_MEM_PTR + sizeof(float) + sizeof(int) ) - } center_p_pipeline_args_t; // PROTOTYPE_PIPELINE( center_p, center_p_pipeline_args_t ); @@ -138,7 +136,6 @@ typedef struct 
energy_p_pipeline_args int np; // Number of particles PAD_STRUCT( 3*SIZEOF_MEM_PTR + 2*sizeof(float) + sizeof(int) ) - } energy_p_pipeline_args_t; // PROTOTYPE_PIPELINE( energy_p, energy_p_pipeline_args_t ); @@ -206,7 +203,6 @@ typedef struct sort_p_pipeline_args int n_voxel; // Number of voxels total (including ghosts) PAD_STRUCT( 5*SIZEOF_MEM_PTR + 5*sizeof(int) ) - } sort_p_pipeline_args_t; // PROTOTYPE_PIPELINE( coarse_count, sort_p_pipeline_args_t ); diff --git a/src/species_advance/standard/pipeline/uncenter_p_pipeline.cc b/src/species_advance/standard/pipeline/uncenter_p_pipeline.cc index f3b6d442..1b6a1cc3 100644 --- a/src/species_advance/standard/pipeline/uncenter_p_pipeline.cc +++ b/src/species_advance/standard/pipeline/uncenter_p_pipeline.cc @@ -108,11 +108,11 @@ uncenter_p_pipeline( species_t * RESTRICT sp, { DECLARE_ALIGNED_ARRAY( center_p_pipeline_args_t, 128, args, 1 ); - if ( !sp || - !ia || + if ( ! sp || + ! ia || sp->g != ia->g ) { - ERROR( ( "Bad args" ) ); + ERROR( ( "Bad args." ) ); } // Have the pipelines do the bulk of particles in blocks and have the @@ -120,7 +120,7 @@ uncenter_p_pipeline( species_t * RESTRICT sp, args->p0 = sp->p; args->f0 = ia->i; - args->qdt_2mc = (sp->q*sp->g->dt)/(2*sp->m*sp->g->cvac); + args->qdt_2mc = ( sp->q * sp->g->dt ) / ( 2 * sp->m * sp->g->cvac ); args->np = sp->np; EXEC_PIPELINES( uncenter_p, args, 0 ); diff --git a/src/species_advance/standard/rho_p.cc b/src/species_advance/standard/rho_p.cc index 629badc1..776594cd 100644 --- a/src/species_advance/standard/rho_p.cc +++ b/src/species_advance/standard/rho_p.cc @@ -17,11 +17,12 @@ // interpolation is used. rhof is known at the nodes at the same time // as particle positions. No effort is made to fix up edges of the // computational domain; see note in synchronize_rhob about why this -// is done this way. All particles on the list must be inbounds. +// is done this way. All particles on the list must be in bounds. 
void accumulate_rho_p( /**/ field_array_t * RESTRICT fa, - const species_t * RESTRICT sp ) { + const species_t * RESTRICT sp ) +{ if( !fa || !sp || fa->g!=sp->g ) ERROR(( "Bad args" )); /**/ field_t * RESTRICT ALIGNED(128) f = fa->f; @@ -126,7 +127,8 @@ void accumulate_rhob( field_t * RESTRICT ALIGNED(128) f, const particle_t * RESTRICT ALIGNED(32) p, const grid_t * RESTRICT g, - const float qsp ) { + const float qsp ) +{ # if 1 // See note in rhof for why this variant is used. diff --git a/src/species_advance/standard/sort_p.cc b/src/species_advance/standard/sort_p.cc index 3464011a..935d6fa9 100644 --- a/src/species_advance/standard/sort_p.cc +++ b/src/species_advance/standard/sort_p.cc @@ -153,9 +153,9 @@ sort_p( species_t * sp ) void sort_p( species_t * sp ) { - if ( !sp ) + if ( ! sp ) { - ERROR( ( "Bad args" ) ); + ERROR( ( "Bad args." ) ); } // Conditionally execute this when more abstractions are available. diff --git a/src/util/profile/profile.h b/src/util/profile/profile.h index 3175611f..f26de1e3 100644 --- a/src/util/profile/profile.h +++ b/src/util/profile/profile.h @@ -34,6 +34,7 @@ _( load_interpolator ) \ _( compute_curl_b ) \ _( compute_rhob ) \ + _( center_p ) \ _( uncenter_p ) \ _( user_initialization ) \ _( user_particle_collisions ) \ diff --git a/src/util/v16/v16_avx512.h b/src/util/v16/v16_avx512.h index b9331831..69d0922d 100644 --- a/src/util/v16/v16_avx512.h +++ b/src/util/v16/v16_avx512.h @@ -523,31 +523,33 @@ namespace v16 // v16 memory manipulation functions + // Portable version. inline void load_16x1( const void * ALIGNED(64) p, v16 &a ) { for( int j = 0; j < 16; j++ ) - a.i[j] = ((const int * ALIGNED(64))p)[j]; + a.i[j] = ( ( const int * ALIGNED(64) ) p )[j]; } + // Portable version. 
inline void store_16x1( const v16 &a, void * ALIGNED(64) p ) { for( int j = 0; j < 16; j++ ) - ((int * ALIGNED(64))p)[j] = a.i[j]; + ( ( int * ALIGNED(64) ) p )[j] = a.i[j]; } inline void stream_16x1( const v16 &a, void * ALIGNED(64) p ) { for( int j = 0; j < 16; j++ ) - ((int * ALIGNED(64))p)[j] = a.i[j]; + ( ( int * ALIGNED(64) ) p )[j] = a.i[j]; } inline void clear_16x1( void * ALIGNED(64) p ) { for( int j = 0; j < 16; j++ ) - ((int * ALIGNED(64))p)[j] = 0; + ( ( int * ALIGNED(64) ) p )[j] = 0; } // FIXME: Ordering semantics @@ -555,7 +557,7 @@ namespace v16 const void * ALIGNED(64) src ) { for( int j = 0; j < 16; j++ ) - ((int * ALIGNED(64))dst)[j] = ((const int * ALIGNED(64))src)[j]; + ( ( int * ALIGNED(64) ) dst )[j] = ( ( const int * ALIGNED(64) ) src )[j]; } inline void swap_16x1( void * ALIGNED(64) a, @@ -565,9 +567,9 @@ namespace v16 for( int j = 0; j < 16; j++ ) { - t = ((int * ALIGNED(64))a)[j]; - ((int * ALIGNED(64))a)[j] = ((int * ALIGNED(64))b)[j]; - ((int * ALIGNED(64))b)[j] = t; + t = ( ( int * ALIGNED(64) ) a )[j]; + ( ( int * ALIGNED(64) ) a )[j] = ( ( int * ALIGNED(64) ) b )[j]; + ( ( int * ALIGNED(64) ) b )[j] = t; } } diff --git a/src/util/v4/v4_altivec.h b/src/util/v4/v4_altivec.h index 2c52d963..6ff3f58c 100644 --- a/src/util/v4/v4_altivec.h +++ b/src/util/v4/v4_altivec.h @@ -232,33 +232,45 @@ namespace v4 { // v4 memory manipulation functions - inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) { - a.v = vec_ld( 0, (const float *)p ); + inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) + { + a.v = vec_ld( 0, ( const float * ) p ); } - inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) { - vec_st( a.v, 0, (float *)p ); + inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) + { + vec_st( a.v, 0, ( float * ) p ); } - inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) { - vec_stl( a.v, 0, (float *)p ); + inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) + { + vec_stl( a.v, 0, ( float * ) p 
); } // FIXME: Ordering semantics - inline void clear_4x1( void * ALIGNED(16) d ) { - vec_st( _zero, 0, (float *)d ); + inline void clear_4x1( void * ALIGNED(16) d ) + { + vec_st( _zero, 0, ( float * ) d ); } // FIXME: Ordering semantics - inline void copy_4x1( void * ALIGNED(16) d, const void * ALIGNED(16) s ) { - vec_st( vec_ld( 0, (const float *)s ), 0, (float *)d ); + inline void copy_4x1( void * ALIGNED(16) d, + const void * ALIGNED(16) s ) + { + vec_st( vec_ld( 0, ( const float * ) s ), 0, ( float * ) d ); } - inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) { - _v4_float va = vec_ld( 0, (float *)a ); - _v4_float vb = vec_ld( 0, (float *)b ); - vec_st( vb, 0, (float *)a ); - vec_st( va, 0, (float *)b ); + inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) + { + _v4_float va = vec_ld( 0, ( float * ) a ); + _v4_float vb = vec_ld( 0, ( float * ) b ); + + vec_st( vb, 0, ( float * ) a ); + vec_st( va, 0, ( float * ) b ); } // v4 transposed memory manipulation functions diff --git a/src/util/v4/v4_avx.h b/src/util/v4/v4_avx.h index f2b47552..3c48096e 100644 --- a/src/util/v4/v4_avx.h +++ b/src/util/v4/v4_avx.h @@ -178,32 +178,43 @@ namespace v4 { // v4 memory manipulation functions - inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) { - a.v = _mm_load_ps((float *)p); + inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) + { + a.v = _mm_load_ps( ( float * ) p ); } - inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) { - _mm_store_ps((float *)p,a.v); + inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) + { + _mm_store_ps( ( float * ) p, a.v ); } - inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) { - _mm_stream_ps((float *)p,a.v); + inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) + { + _mm_stream_ps( ( float * ) p, a.v ); } - inline void clear_4x1( void * ALIGNED(16) p ) { - _mm_store_ps( (float *)p, _mm_setzero_ps() ); + inline void clear_4x1( void * ALIGNED(16) p ) + { + 
_mm_store_ps( ( float * ) p, _mm_setzero_ps() ); } inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) { - _mm_store_ps( (float *)dst, _mm_load_ps( (const float *)src ) ); + const void * ALIGNED(16) src ) + { + _mm_store_ps( ( float * ) dst, _mm_load_ps( ( const float * ) src ) ); } /* FIXME: MAKE ROBUST AGAINST ALIASING ISSUES */ - inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) { - __m128 t = _mm_load_ps((float *)a); - _mm_store_ps( (float *)a, _mm_load_ps( (float *)b ) ); - _mm_store_ps( (float *)b, t ); + inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) + { + __m128 t = _mm_load_ps( ( float * ) a ); + + _mm_store_ps( ( float * ) a, _mm_load_ps( ( float * ) b ) ); + _mm_store_ps( ( float * ) b, t ); } // v4 transposed memory manipulation functions diff --git a/src/util/v4/v4_avx2.h b/src/util/v4/v4_avx2.h index abb7814f..a7b7b783 100644 --- a/src/util/v4/v4_avx2.h +++ b/src/util/v4/v4_avx2.h @@ -204,39 +204,39 @@ namespace v4 inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) { - a.v = _mm_load_ps( (float *)p ); + a.v = _mm_load_ps( ( float * ) p ); } inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) { - _mm_store_ps( (float *)p, a.v ); + _mm_store_ps( ( float * ) p, a.v ); } inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) { - _mm_stream_ps( (float *)p, a.v ); + _mm_stream_ps( ( float * ) p, a.v ); } inline void clear_4x1( void * ALIGNED(16) p ) { - _mm_store_ps( (float *)p, _mm_setzero_ps() ); + _mm_store_ps( ( float * ) p, _mm_setzero_ps() ); } inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) src ) { - _mm_store_ps( (float *)dst, _mm_load_ps( (const float *)src ) ); + _mm_store_ps( ( float * ) dst, _mm_load_ps( ( const float * ) src ) ); } /* FIXME: MAKE ROBUST AGAINST ALIASING ISSUES */ inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) { - __m128 t = _mm_load_ps((float *)a); + __m128 t = _mm_load_ps( ( float * ) a ); - 
_mm_store_ps( (float *)a, _mm_load_ps( (float *)b ) ); - _mm_store_ps( (float *)b, t ); + _mm_store_ps( ( float * ) a, _mm_load_ps( ( float * ) b ) ); + _mm_store_ps( ( float * ) b, t ); } // v4 transposed memory manipulation functions diff --git a/src/util/v4/v4_sse.h b/src/util/v4/v4_sse.h index fe82058f..b2ed5dcb 100644 --- a/src/util/v4/v4_sse.h +++ b/src/util/v4/v4_sse.h @@ -178,32 +178,43 @@ namespace v4 { // v4 memory manipulation functions - inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) { - a.v = _mm_load_ps((float *)p); + inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) + { + a.v = _mm_load_ps( ( float * ) p ); } - inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) { - _mm_store_ps((float *)p,a.v); + inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) + { + _mm_store_ps( ( float * ) p, a.v ); } - inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) { - _mm_stream_ps((float *)p,a.v); + inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) + { + _mm_stream_ps( ( float * ) p, a.v ); } - inline void clear_4x1( void * ALIGNED(16) p ) { - _mm_store_ps( (float *)p, _mm_setzero_ps() ); + inline void clear_4x1( void * ALIGNED(16) p ) + { + _mm_store_ps( ( float * ) p, _mm_setzero_ps() ); } inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) { - _mm_store_ps( (float *)dst, _mm_load_ps( (const float *)src ) ); + const void * ALIGNED(16) src ) + { + _mm_store_ps( ( float * ) dst, _mm_load_ps( ( const float * ) src ) ); } /* FIXME: MAKE ROBUST AGAINST ALIASING ISSUES */ - inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) { - __m128 t = _mm_load_ps((float *)a); - _mm_store_ps( (float *)a, _mm_load_ps( (float *)b ) ); - _mm_store_ps( (float *)b, t ); + inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) + { + __m128 t = _mm_load_ps( ( float * ) a ); + + _mm_store_ps( ( float * ) a, _mm_load_ps( ( float * ) b ) ); + _mm_store_ps( ( float * ) b, t ); } // v4 
transposed memory manipulation functions diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 31d689fb..62505147 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -259,7 +259,8 @@ vpic_simulation::dump_hydro( const char *sp_name, void vpic_simulation::dump_particles( const char *sp_name, const char *fbase, - int ftag ) { + int ftag ) +{ species_t *sp; char fname[256]; FileIO fileIO; diff --git a/src/vpic/initialize.cc b/src/vpic/initialize.cc index d49bbf57..8cc28da0 100644 --- a/src/vpic/initialize.cc +++ b/src/vpic/initialize.cc @@ -68,4 +68,3 @@ vpic_simulation::finalize( void ) { barrier(); update_profile( rank()==0 ); } - diff --git a/src/vpic/misc.cc b/src/vpic/misc.cc index 4cc3f0a5..28bb9e27 100644 --- a/src/vpic/misc.cc +++ b/src/vpic/misc.cc @@ -1,12 +1,9 @@ -/* - * Written by: - * Kevin J. Bowers, Ph.D. - * Plasma Physics Group (X-1) - * Applied Physics Division - * Los Alamos National Lab - * March/April 2004 - Original version - * - */ +// Written by: +// Kevin J. Bowers, Ph.D. +// Plasma Physics Group (X-1) +// Applied Physics Division +// Los Alamos National Lab +// March/April 2004 - Original version #include "vpic.h" @@ -17,7 +14,8 @@ vpic_simulation::inject_particle( species_t * sp, double x, double y, double z, double ux, double uy, double uz, double w, double age, - int update_rhob ) { + int update_rhob ) +{ int ix, iy, iz; // Check input parameters @@ -96,9 +94,8 @@ vpic_simulation::inject_particle( species_t * sp, pm->i = sp->np-1; sp->nm += move_p( sp->p, pm, accumulator_array->a, grid, sp->q ); } - } - + // Add capability to modify certain fields "on the fly" so that one // can, e.g., extend a run, change a quota, or modify a dump interval // without having to rerun from the start. 
diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index b309bd55..2c34d11f 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -531,7 +531,8 @@ class vpic_simulation { inline void inject_particle_raw( species_t * RESTRICT sp, float dx, float dy, float dz, int32_t i, - float ux, float uy, float uz, float w ) { + float ux, float uy, float uz, float w ) + { particle_t * RESTRICT p = sp->p + (sp->np++); p->dx = dx; p->dy = dy; p->dz = dz; p->i = i; p->ux = ux; p->uy = uy; p->uz = uz; p->w = w; @@ -544,7 +545,8 @@ class vpic_simulation { float dx, float dy, float dz, int32_t i, float ux, float uy, float uz, float w, float dispx, float dispy, float dispz, - int update_rhob ) { + int update_rhob ) + { particle_t * RESTRICT p = sp->p + (sp->np++); particle_mover_t * RESTRICT pm = sp->pm + sp->nm; p->dx = dx; p->dy = dy; p->dz = dz; p->i = i; From 58a6873837e4298a4cede42c074cc93eeb9d671c Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Tue, 9 Jul 2019 22:22:29 -0600 Subject: [PATCH 15/95] Initial beginning of ARM Neon intrinsics support. --- src/util/v4/v4.h | 15 + src/util/v4/v4_neon.h | 1184 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1199 insertions(+) create mode 100644 src/util/v4/v4_neon.h diff --git a/src/util/v4/v4.h b/src/util/v4/v4.h index 3cf5183c..0b8cc4c1 100644 --- a/src/util/v4/v4.h +++ b/src/util/v4/v4.h @@ -1,20 +1,35 @@ #ifndef _v4_h_ #define _v4_h_ + /* FIXME: STYLE */ #define IN_v4_h + /* FIXME: SHOULDN'T THIS INCLUDE UTIL_BASE.H? 
*/ + #ifdef __cplusplus + # if defined USE_V4_ALTIVEC # include "v4_altivec.h" + # elif defined USE_V4_PORTABLE # include "v4_portable.h" + # elif defined USE_V4_SSE # include "v4_sse.h" + # elif defined USE_V4_AVX # include "v4_avx.h" + # elif defined USE_V4_AVX2 # include "v4_avx2.h" + +# elif defined USE_V4_NEON +# include "v4_neon.h" + # endif + #endif + #undef IN_v4_h + #endif // _v4_h_ diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h new file mode 100644 index 00000000..af16614a --- /dev/null +++ b/src/util/v4/v4_neon.h @@ -0,0 +1,1184 @@ +#ifndef _v4_neon_h_ +#define _v4_neon_h_ + +#ifndef IN_v4_h +#error "Do not include v4_neon.h directly; use v4.h" +#endif + +#include +#include + +#define V4_ACCELERATION +#define V4_NEON_ACCELERATION + +#ifndef ALIGNED +#define ALIGNED(n) +#endif + +// This does not work with gcc 5.3.1 and the -fopenmp-simd +// flag. Does not seem to work with -fopenmp either. Not +// sure why. It does work with the Intel compiler. Need +// to try later versions of gcc. 
+// #define ALWAYS_VECTORIZE _Pragma( "omp simd" ) + +// #define ALWAYS_VECTORIZE _Pragma( "simd" ) + +#define ALWAYS_VECTORIZE \ + _Pragma( "simd" ) \ + _Pragma( "vector aligned" ) + +#define ALWAYS_INLINE __attribute__((always_inline)) + +namespace v4 +{ + class v4; + class v4int; + class v4float; + + //////////////// + // v4 base class + + class v4 + { + friend class v4int; + friend class v4float; + + // v4 miscellaneous friends + + friend inline int any( const v4 &a ) ALWAYS_INLINE; + friend inline int all( const v4 &a ) ALWAYS_INLINE; + + template + friend inline v4 splat( const v4 &a ) ALWAYS_INLINE; + + template + friend inline v4 shuffle( const v4 &a ) ALWAYS_INLINE; + + friend inline void swap( v4 &a, v4 &b ) ALWAYS_INLINE; + friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) ALWAYS_INLINE; + + // v4int miscellaneous friends + + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + + // v4 memory manipulation friends + + friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; + friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_4x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) ALWAYS_INLINE; + friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + // v4 transposed memory manipulation friends + + friend inline void load_4x1_tr( const void *a0, const void *a1, + const void *a2, const void *a3, + v4 &a ) ALWAYS_INLINE; + + friend inline void load_4x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + v4 &a, v4 &b ) 
ALWAYS_INLINE; + + friend inline void load_4x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; + + friend inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; + + friend inline void store_4x1_tr( const v4 &a, + void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; + + friend inline void store_4x2_tr( const v4 &a, const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) ALWAYS_INLINE; + + friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) ALWAYS_INLINE; + + friend inline void store_4x4_tr( const v4 &a, const v4 &b, + const v4 &c, const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) ALWAYS_INLINE; + + protected: + + union + { + int i[4]; + float f[4]; + }; + + public: + + v4() {} // Default constructor + + v4( const v4 &a ) // Copy constructor + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + i[j] = a.i[j]; + } + + ~v4() {} // Default destructor + }; + + // v4 miscellaneous functions + + inline int any( const v4 &a ) + { + return a.i[0] || a.i[1] || a.i[2] || a.i[3]; + } + + inline int all( const v4 &a ) + { + return a.i[0] && a.i[1] && a.i[2] && a.i[3]; + } + + template + inline v4 splat( const v4 & a ) + { + v4 b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = a.i[n]; + + return b; + } + + template + inline v4 shuffle( const v4 & a ) + { + v4 b; + + b.i[0] = a.i[i0]; + b.i[1] = a.i[i1]; + b.i[2] = a.i[i2]; + b.i[3] = a.i[i3]; + + return b; + } + +# define sw(x,y) x^=y, y^=x, x^=y + + inline void swap( v4 &a, v4 &b ) + { + ALWAYS_VECTORIZE + for( int j = 0; j 
< 4; j++ ) + sw( a.i[j], b.i[j] ); + } + + inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) + { + sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); + sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); + sw( a2.i[3],a3.i[2] ); + } + +# undef sw + + // v4 memory manipulation functions + + inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + a.i[j] = ((const int * ALIGNED(16))p)[j]; + } + + inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + ((int * ALIGNED(16))p)[j] = a.i[j]; + } + + inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + ((int * ALIGNED(16))p)[j] = a.i[j]; + } + + inline void clear_4x1( void * ALIGNED(16) p ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + ((int * ALIGNED(16))p)[j] = 0; + } + + // FIXME: Ordering semantics + inline void copy_4x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + ((int * ALIGNED(16))dst)[j] = ((const int * ALIGNED(16))src)[j]; + } + + inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) + { + int t; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + { + t = ((int * ALIGNED(16))a)[j]; + ((int * ALIGNED(16))a)[j] = ((int * ALIGNED(16))b)[j]; + ((int * ALIGNED(16))b)[j] = t; + } + } + + // v4 transposed memory manipulation functions + + inline void load_4x1_tr( const void *a0, const void *a1, + const void *a2, const void *a3, + v4 &a ) + { + a.i[0] = ((const int *)a0)[0]; + a.i[1] = ((const int *)a1)[0]; + a.i[2] = ((const int *)a2)[0]; + a.i[3] = ((const int *)a3)[0]; + } + + inline void load_4x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + v4 &a, v4 &b ) + { + a.i[0] = ((const int * ALIGNED(8))a0)[0]; + b.i[0] = ((const int * ALIGNED(8))a0)[1]; 
+ + a.i[1] = ((const int * ALIGNED(8))a1)[0]; + b.i[1] = ((const int * ALIGNED(8))a1)[1]; + + a.i[2] = ((const int * ALIGNED(8))a2)[0]; + b.i[2] = ((const int * ALIGNED(8))a2)[1]; + + a.i[3] = ((const int * ALIGNED(8))a3)[0]; + b.i[3] = ((const int * ALIGNED(8))a3)[1]; + } + + inline void load_4x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + } + + inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c, v4 &d ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + d.i[0] = ((const int * ALIGNED(16))a0)[3]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + d.i[1] = ((const int * ALIGNED(16))a1)[3]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + d.i[2] = ((const int * ALIGNED(16))a2)[3]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + d.i[3] = ((const int * ALIGNED(16))a3)[3]; + } + + inline void store_4x1_tr( const v4 &a, + void *a0, void *a1, + void *a2, 
void *a3 ) + { + ((int *)a0)[0] = a.i[0]; + ((int *)a1)[0] = a.i[1]; + ((int *)a2)[0] = a.i[2]; + ((int *)a3)[0] = a.i[3]; + } + + inline void store_4x2_tr( const v4 &a, const v4 &b, + void * ALIGNED(8) a0, void * ALIGNED(8) a1, + void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) + { + ((int * ALIGNED(8))a0)[0] = a.i[0]; + ((int * ALIGNED(8))a0)[1] = b.i[0]; + + ((int * ALIGNED(8))a1)[0] = a.i[1]; + ((int * ALIGNED(8))a1)[1] = b.i[1]; + + ((int * ALIGNED(8))a2)[0] = a.i[2]; + ((int * ALIGNED(8))a2)[1] = b.i[2]; + + ((int * ALIGNED(8))a3)[0] = a.i[3]; + ((int * ALIGNED(8))a3)[1] = b.i[3]; + } + + inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * ALIGNED(16))a0)[1] = b.i[0]; + ((int * ALIGNED(16))a0)[2] = c.i[0]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; + ((int * ALIGNED(16))a1)[2] = c.i[1]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; + ((int * ALIGNED(16))a2)[2] = c.i[2]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; + ((int * ALIGNED(16))a3)[2] = c.i[3]; + } + + inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * ALIGNED(16))a0)[1] = b.i[0]; + ((int * ALIGNED(16))a0)[2] = c.i[0]; + ((int * ALIGNED(16))a0)[3] = d.i[0]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; + ((int * ALIGNED(16))a1)[2] = c.i[1]; + ((int * ALIGNED(16))a1)[3] = d.i[1]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; + ((int * ALIGNED(16))a2)[2] = c.i[2]; + ((int * ALIGNED(16))a2)[3] = d.i[2]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; + ((int * 
ALIGNED(16))a3)[2] = c.i[3]; + ((int * ALIGNED(16))a3)[3] = d.i[3]; + } + + ////////////// + // v4int class + + class v4int : public v4 + { + // v4int prefix unary operator friends + + friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator -( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator ~( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator !( const v4int & a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v4int prefix increment / decrement operator friends + + friend inline v4int operator ++( v4int & a ) ALWAYS_INLINE; + friend inline v4int operator --( v4int & a ) ALWAYS_INLINE; + + // v4int postfix increment / decrement operator friends + + friend inline v4int operator ++( v4int & a, int ) ALWAYS_INLINE; + friend inline v4int operator --( v4int & a, int ) ALWAYS_INLINE; + + // v4int binary operator friends + + friend inline v4int operator +( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator -( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator *( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator /( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator %( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ^( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator &( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator |( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator <<( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >>( const v4int &a, const v4int &b ) ALWAYS_INLINE; + + // v4int logical operator friends + + friend inline v4int operator <( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int 
operator ==( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4int &a, const v4int &b ) ALWAYS_INLINE; + + // v4int miscellaneous friends + + friend inline v4int abs( const v4int &a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + // FIXME: cswap, notcswap! + friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) ALWAYS_INLINE; + + // v4float unary operator friends + + friend inline v4int operator !( const v4float & a ) ALWAYS_INLINE; + + // v4float logical operator friends + + friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + + // v4float miscellaneous friends + + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + + public: + + // v4int 
constructors / destructors + + v4int() {} // Default constructor + + v4int( const v4int &a ) // Copy constructor + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + i[j] = a.i[j]; + } + + v4int( const v4 &a ) // Init from mixed + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + i[j] = a.i[j]; + } + + v4int( int a ) // Init from scalar + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + i[j] = a; + } + + v4int( int i0, int i1, int i2, int i3 ) // Init from scalars + { + i[0] = i0; + i[1] = i1; + i[2] = i2; + i[3] = i3; + } + + ~v4int() {} // Destructor + + // v4int assignment operators + +# define ASSIGN(op) \ + inline v4int &operator op( const v4int &b ) \ + { \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + i[j] op b.i[j]; \ + return *this; \ + } + + ASSIGN( =) + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + ASSIGN(%=) + ASSIGN(^=) + ASSIGN(&=) + ASSIGN(|=) + ASSIGN(<<=) + ASSIGN(>>=) + +# undef ASSIGN + + // v4int member access operator + + inline int &operator []( int n ) + { + return i[n]; + } + + inline int operator ()( int n ) + { + return i[n]; + } + }; + + // v4int prefix unary operators + +# define PREFIX_UNARY(op) \ + inline v4int operator op( const v4int & a ) \ + { \ + v4int b; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } + + PREFIX_UNARY(+) + PREFIX_UNARY(-) + + inline v4int operator !( const v4int & a ) + { + v4int b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = - ( !a.i[j] ); + + return b; + } + + PREFIX_UNARY(~) + +# undef PREFIX_UNARY + + // v4int prefix increment / decrement + +# define PREFIX_INCDEC(op) \ + inline v4int operator op( v4int & a ) \ + { \ + v4int b; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } + + PREFIX_INCDEC(++) + PREFIX_INCDEC(--) + +# undef PREFIX_INCDEC + + // v4int postfix increment / decrement + +# define POSTFIX_INCDEC(op) \ + inline v4int operator op( v4int & a, int 
) \ + { \ + v4int b; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + b.i[j] = ( a.i[j] op ); \ + return b; \ + } + + POSTFIX_INCDEC(++) + POSTFIX_INCDEC(--) + +# undef POSTFIX_INCDEC + + // v4int binary operators + +# define BINARY(op) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ + v4int c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + c.i[j] = a.i[j] op b.i[j]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + BINARY(%) + BINARY(^) + BINARY(&) + BINARY(|) + BINARY(<<) + BINARY(>>) + +# undef BINARY + + // v4int logical operators + +# define LOGICAL(op) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ + v4int c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + c.i[j] = - ( a.i[j] op b.i[j] ); \ + return c; \ + } + + LOGICAL(<) + LOGICAL(>) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v4int miscellaneous functions + + inline v4int abs( const v4int &a ) + { + v4int b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = ( a.i[j] >= 0 ) ? 
a.i[j] : -a.i[j]; + + return b; + } + + inline v4 czero( const v4int &c, const v4 &a ) + { + v4 b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = a.i[j] & ~c.i[j]; + + return b; + } + + inline v4 notczero( const v4int &c, const v4 &a ) + { + v4 b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = a.i[j] & c.i[j]; + + return b; + } + + inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) + { + v4 m; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); + + return m; + } + + //////////////// + // v4float class + + class v4float : public v4 + { + // v4float prefix unary operator friends + + friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator -( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator ~( const v4float &a ) ALWAYS_INLINE; + friend inline v4int operator !( const v4float &a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v4float prefix increment / decrement operator friends + + friend inline v4float operator ++( v4float &a ) ALWAYS_INLINE; + friend inline v4float operator --( v4float &a ) ALWAYS_INLINE; + + // v4float postfix increment / decrement operator friends + + friend inline v4float operator ++( v4float &a, int ) ALWAYS_INLINE; + friend inline v4float operator --( v4float &a, int ) ALWAYS_INLINE; + + // v4float binary operator friends + + friend inline v4float operator +( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator -( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator *( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator /( const v4float &a, const v4float &b ) ALWAYS_INLINE; + + // v4float logical operator friends + + friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >( const 
v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + + // v4float math library friends + +# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE +# define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + const v4float &b ) ALWAYS_INLINE + + CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); + CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); + CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); + CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); + CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); + + CMATH_FR2(copysign); + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v4float miscellaneous friends + + friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rsqrt ( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rcp ( const v4float &a ) ALWAYS_INLINE; + friend inline v4float fma ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fms ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float 
toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; + + public: + + // v4float constructors / destructors + + v4float() {} // Default constructor + + v4float( const v4float &a ) // Copy constructor + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + f[j] = a.f[j]; + } + + v4float( const v4 &a ) // Init from mixed + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + f[j] = a.f[j]; + } + + v4float( float a ) // Init from scalar + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + f[j] = a; + } + + v4float( float f0, float f1, float f2, float f3 ) // Init from scalars + { + f[0] = f0; + f[1] = f1; + f[2] = f2; + f[3] = f3; + } + + ~v4float() {} // Destructor + + // v4float assignment operators + +# define ASSIGN(op) \ + inline v4float &operator op( const v4float &b ) \ + { \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + f[j] op b.f[j]; \ + return *this; \ + } + + ASSIGN(=) + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + +# undef ASSIGN + + // v4float member access operator + + inline float &operator []( int n ) + { + return f[n]; + } + + inline float operator ()( int n ) + { + return f[n]; + } + }; + + // v4float prefix unary operators + + inline v4float operator +( const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = +a.f[j]; + + return b; + } + + inline v4float operator -( const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = -a.f[j]; + + return b; + } + + inline v4int operator !( const v4float &a ) + { + v4int b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = a.i[j] ? 
0 : -1; + + return b; + } + + // v4float prefix increment / decrement operators + + inline v4float operator ++( v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = ++a.f[j]; + + return b; + } + + inline v4float operator --( v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = --a.f[j]; + + return b; + } + + // v4float postfix increment / decrement operators + + inline v4float operator ++( v4float &a, int ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = a.f[j]++; + + return b; + } + + inline v4float operator --( v4float &a, int ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = a.f[j]--; + + return b; + } + + // v4float binary operators + +# define BINARY(op) \ + inline v4float operator op( const v4float &a, const v4float &b ) \ + { \ + v4float c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + c.f[j] = a.f[j] op b.f[j]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + +# undef BINARY + + // v4float logical operators + +# define LOGICAL(op) \ + inline v4int operator op( const v4float &a, const v4float &b ) \ + { \ + v4int c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + c.i[j] = - ( a.f[j] op b.f[j] ); \ + return c; \ + } + + LOGICAL(< ) + LOGICAL(> ) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v4float math library functions + +# define CMATH_FR1(fn) \ + inline v4float fn( const v4float &a ) \ + { \ + v4float b; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + b.f[j] = ::fn( a.f[j] ); \ + return b; \ + } + +# define CMATH_FR2(fn) \ + inline v4float fn( const v4float &a, const v4float &b ) \ + { \ + v4float c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + c.f[j] = ::fn( a.f[j], b.f[j] ); \ + return c; \ + } + + CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) + CMATH_FR1(ceil) 
CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) + CMATH_FR1(fabs) CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) + CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) + CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) + + inline v4float copysign( const v4float &a, const v4float &b ) + { + v4float c; + float t; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + { + t = ::fabs( a.f[j] ); + if( b.f[j] < 0 ) t = -t; + c.f[j] = t; + } + + return c; + } + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v4float miscellaneous functions + + inline v4float rsqrt_approx( const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = ::sqrt( 1.0f / a.f[j] ); + + return b; + } + + inline v4float rsqrt( const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = ::sqrt( 1.0f / a.f[j] ); + + return b; + } + + inline v4float rcp_approx( const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = 1.0f / a.f[j]; + + return b; + } + + inline v4float rcp( const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = 1.0f / a.f[j]; + + return b; + } + + inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) + { + v4float d; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + d.f[j] = a.f[j] * b.f[j] + c.f[j]; + + return d; + } + + inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) + { + v4float d; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + d.f[j] = a.f[j] * b.f[j] - c.f[j]; + + return d; + } + + inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) + { + v4float d; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + d.f[j] = c.f[j] - a.f[j] * b.f[j]; + + return d; + } + + inline v4float clear_bits( const v4int &m, const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = ( ~m.i[j] ) & a.i[j]; + + return b; + } 
+ + inline v4float set_bits( const v4int &m, const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = m.i[j] | a.i[j]; + + return b; + } + + inline v4float toggle_bits( const v4int &m, const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = m.i[j] ^ a.i[j]; + + return b; + } + + inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + p[j] += a.f[j]; + } + + inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + p[j] -= a.f[j]; + } + + inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + p[j] *= a.f[j]; + } + + inline void trilinear( v4float & wl, v4float & wh ) + { + float x = wl.f[0], y = wl.f[1], z = wl.f[2]; + + wl.f[0] = ( ( 1.0f - x ) * ( 1.0f - y ) ) * ( 1.0f - z ); + wl.f[1] = ( ( 1.0f + x ) * ( 1.0f - y ) ) * ( 1.0f - z ); + wl.f[2] = ( ( 1.0f - x ) * ( 1.0f + y ) ) * ( 1.0f - z ); + wl.f[3] = ( ( 1.0f + x ) * ( 1.0f + y ) ) * ( 1.0f - z ); + + wh.f[0] = ( ( 1.0f - x ) * ( 1.0f - y ) ) * ( 1.0f + z ); + wh.f[1] = ( ( 1.0f + x ) * ( 1.0f - y ) ) * ( 1.0f + z ); + wh.f[2] = ( ( 1.0f - x ) * ( 1.0f + y ) ) * ( 1.0f + z ); + wh.f[3] = ( ( 1.0f + x ) * ( 1.0f + y ) ) * ( 1.0f + z ); + } + +} // namespace v4 + +#endif // _v4_neon_h_ From 304922e7ad8a8e4a9684aa4b9d53adaa7667248d Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Sun, 14 Jul 2019 18:58:43 -0600 Subject: [PATCH 16/95] Add CMake support for ARM NEON intrinsics. 
--- CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2f9902c5..797eb136 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,6 +47,8 @@ option(USE_PTHREADS "Use Pthreads" ON) option(USE_V4_ALTIVEC "Enable V4 Altivec" OFF) +option(USE_V4_NEON "Enable V4 NEON" OFF) + option(USE_V4_PORTABLE "Enable V4 Portable" OFF) option(USE_V4_SSE "Enable V4 SSE" OFF) @@ -201,6 +203,12 @@ if(USE_V4_ALTIVEC) set(USE_V4 True) endif(USE_V4_ALTIVEC) +if(USE_V4_NEON) + add_definitions(-DUSE_V4_NEON) + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DUSE_V4_NEON") + set(USE_V4 True) +endif(USE_V4_NEON) + #------------------------------------------------------------------------------# # Add options for building with v8 simd vector support. #------------------------------------------------------------------------------# From c68b58acb924312cac89372f1adf0d2b132b8649 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Sun, 14 Jul 2019 21:01:33 -0600 Subject: [PATCH 17/95] Add NEON support for v4float binary operators. 
--- src/util/v4/v4_neon.h | 87 ++++++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 35 deletions(-) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index af16614a..1dce1cb3 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -135,7 +135,7 @@ namespace v4 { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + i[j] = a.i[j]; } ~v4() {} // Default destructor @@ -199,7 +199,7 @@ namespace v4 // v4 memory manipulation functions inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) + v4 &a ) { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) @@ -207,7 +207,7 @@ namespace v4 } inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) @@ -215,7 +215,7 @@ namespace v4 } inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) @@ -239,7 +239,7 @@ namespace v4 } inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) + void * ALIGNED(16) b ) { int t; @@ -256,7 +256,7 @@ namespace v4 inline void load_4x1_tr( const void *a0, const void *a1, const void *a2, const void *a3, - v4 &a ) + v4 &a ) { a.i[0] = ((const int *)a0)[0]; a.i[1] = ((const int *)a1)[0]; @@ -335,7 +335,7 @@ namespace v4 inline void store_4x1_tr( const v4 &a, void *a0, void *a1, - void *a2, void *a3 ) + void *a2, void *a3 ) { ((int *)a0)[0] = a.i[0]; ((int *)a1)[0] = a.i[1]; @@ -492,21 +492,21 @@ namespace v4 { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + i[j] = a.i[j]; } v4int( const v4 &a ) // Init from mixed { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + i[j] = a.i[j]; } v4int( int a ) // Init from scalar { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - i[j] = a; + i[j] = a; } v4int( int i0, int i1, int i2, int i3 ) // Init from scalars @@ -521,9 +521,9 @@ namespace v4 // v4int assignment operators -# define ASSIGN(op) \ +# define ASSIGN(op) 
\ inline v4int &operator op( const v4int &b ) \ - { \ + { \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ i[j] op b.i[j]; \ @@ -561,7 +561,7 @@ namespace v4 # define PREFIX_UNARY(op) \ inline v4int operator op( const v4int & a ) \ - { \ + { \ v4int b; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -591,7 +591,7 @@ namespace v4 # define PREFIX_INCDEC(op) \ inline v4int operator op( v4int & a ) \ - { \ + { \ v4int b; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -608,7 +608,7 @@ namespace v4 # define POSTFIX_INCDEC(op) \ inline v4int operator op( v4int & a, int ) \ - { \ + { \ v4int b; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -625,7 +625,7 @@ namespace v4 # define BINARY(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ + { \ v4int c; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -650,7 +650,7 @@ namespace v4 # define LOGICAL(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ + { \ v4int c; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -800,21 +800,21 @@ namespace v4 { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - f[j] = a.f[j]; + f[j] = a.f[j]; } v4float( const v4 &a ) // Init from mixed { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - f[j] = a.f[j]; + f[j] = a.f[j]; } v4float( float a ) // Init from scalar { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - f[j] = a; + f[j] = a; } v4float( float f0, float f1, float f2, float f3 ) // Init from scalars @@ -831,10 +831,10 @@ namespace v4 # define ASSIGN(op) \ inline v4float &operator op( const v4float &b ) \ - { \ + { \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ - f[j] op b.f[j]; \ + f[j] op b.f[j]; \ return *this; \ } @@ -944,28 +944,43 @@ namespace v4 // v4float binary operators -# define BINARY(op) \ + #define BINARY(op,intrin) \ inline v4float operator op( const v4float &a, const v4float &b ) \ - { \ + { \ v4float c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - c.f[j] = a.f[j] op b.f[j]; \ + c.v = 
intrin( a.v, b.v ); \ return c; \ } - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) + BINARY( +, vaddq_f32 ) + BINARY( -, vsubq_f32 ) + BINARY( *, vmulq_f32 ) + BINARY( /, vdivq_f32 ) -# undef BINARY + #undef BINARY + + // #define BINARY(op) \ + // inline v4float operator op( const v4float &a, const v4float &b ) \ + // { \ + // v4float c; \ + // ALWAYS_VECTORIZE \ + // for( int j = 0; j < 4; j++ ) \ + // c.f[j] = a.f[j] op b.f[j]; \ + // return c; \ + // } + + // BINARY(+) + // BINARY(-) + // BINARY(*) + // BINARY(/) + + // #undef BINARY // v4float logical operators # define LOGICAL(op) \ inline v4int operator op( const v4float &a, const v4float &b ) \ - { \ + { \ v4int c; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -988,7 +1003,7 @@ namespace v4 # define CMATH_FR1(fn) \ inline v4float fn( const v4float &a ) \ - { \ + { \ v4float b; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -998,7 +1013,7 @@ namespace v4 # define CMATH_FR2(fn) \ inline v4float fn( const v4float &a, const v4float &b ) \ - { \ + { \ v4float c; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -1081,6 +1096,8 @@ namespace v4 { v4float d; + // d.v = _mm_fmadd_ps( a.v, b.v, c.v ); + ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) d.f[j] = a.f[j] * b.f[j] + c.f[j]; From ab4dd1044993a7496afd1a2ec3e031035004c366 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Sun, 14 Jul 2019 21:06:12 -0600 Subject: [PATCH 18/95] Add float32x4_t type to the v4_neon union. --- src/util/v4/v4_neon.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 1dce1cb3..bd340b55 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -125,6 +125,7 @@ namespace v4 { int i[4]; float f[4]; + float32x4_t v; }; public: From 66ad58a91cbc3dc55bfdfd29705fd54994c5a61a Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Sun, 14 Jul 2019 21:33:15 -0600 Subject: [PATCH 19/95] Add ARM NEON intrinsics support for fma and fms. 
--- src/util/v4/v4_neon.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index bd340b55..57886e87 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -1097,11 +1097,11 @@ namespace v4 { v4float d; - // d.v = _mm_fmadd_ps( a.v, b.v, c.v ); + d.v = vfmaq_f32( a.v, b.v, c.v ); - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - d.f[j] = a.f[j] * b.f[j] + c.f[j]; + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // d.f[j] = a.f[j] * b.f[j] + c.f[j]; return d; } @@ -1110,9 +1110,11 @@ namespace v4 { v4float d; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - d.f[j] = a.f[j] * b.f[j] - c.f[j]; + d.v = vfmsq_f32( a.v, b.v, c.v ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // d.f[j] = a.f[j] * b.f[j] - c.f[j]; return d; } From f891774fdd3609c2bf381d23329af4c164244b85 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 15 Jul 2019 11:05:08 -0600 Subject: [PATCH 20/95] Remove some tabs. Format tweaks. Add NEON intrinsics support for contructors. 
--- src/util/v4/v4_avx2.h | 112 +++++++++++++++++------------------ src/util/v4/v4_neon.h | 84 ++++++++++++++++++-------- src/util/v4/v4_portable.h | 38 ++++++------ src/util/v4/v4_portable_v0.h | 38 ++++++------ src/util/v4/v4_portable_v1.h | 52 ++++++++-------- 5 files changed, 179 insertions(+), 145 deletions(-) diff --git a/src/util/v4/v4_avx2.h b/src/util/v4/v4_avx2.h index a7b7b783..104121db 100644 --- a/src/util/v4/v4_avx2.h +++ b/src/util/v4/v4_avx2.h @@ -130,7 +130,7 @@ namespace v4 v4( const v4 &a ) // Copy constructor { - v=a.v; + v = a.v; } ~v4() {} // Default destructor @@ -202,19 +202,19 @@ namespace v4 // v4 memory manipulation functions inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) + v4 &a ) { a.v = _mm_load_ps( ( float * ) p ); } inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { _mm_store_ps( ( float * ) p, a.v ); } inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { _mm_stream_ps( ( float * ) p, a.v ); } @@ -243,7 +243,7 @@ namespace v4 inline void load_4x1_tr( const void *a0, const void *a1, const void *a2, const void *a3, - v4 &a ) + v4 &a ) { a.v = _mm_setr_ps( ((const float *)a0)[0], ((const float *)a1)[0], @@ -397,7 +397,7 @@ namespace v4 inline void store_4x1_tr( const v4 &a, void *a0, void *a1, - void *a2, void *a3 ) + void *a2, void *a3 ) { ((float *)a0)[0] = a.f[0]; ((float *)a1)[0] = a.f[1]; @@ -446,7 +446,7 @@ namespace v4 // FIXME: IS THIS FASTER THAN THE OLD WAY (HAD MORE STORE INSTR) inline void store_4x4_tr( const v4 &a, const v4 &b, - const v4 &c, const v4 &d, + const v4 &c, const v4 &d, void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) { @@ -564,8 +564,8 @@ namespace v4 { union { - int i; - float f; + int i; + float f; } u; u.i = a; @@ -576,8 +576,8 @@ namespace v4 { union { - int i; - float f; + int i; + float f; } u0, u1, u2, u3; u0.i = i0; @@ -592,9 +592,9 @@ namespace v4 // v4int assignment 
operators -# define ASSIGN(op) \ +# define ASSIGN(op) \ inline v4int &operator op( const v4int &b ) \ - { \ + { \ i[0] op b.i[0]; \ i[1] op b.i[1]; \ i[2] op b.i[2]; \ @@ -658,7 +658,7 @@ namespace v4 # define PREFIX_UNARY(op) \ inline v4int operator op( const v4int & a ) \ - { \ + { \ v4int b; \ b.i[0] = ( op a.i[0] ); \ b.i[1] = ( op a.i[1] ); \ @@ -712,7 +712,7 @@ namespace v4 # define PREFIX_INCDEC(op) \ inline v4int operator op( v4int & a ) \ - { \ + { \ v4int b; \ b.i[0] = ( op a.i[0] ); \ b.i[1] = ( op a.i[1] ); \ @@ -730,7 +730,7 @@ namespace v4 # define POSTFIX_INCDEC(op) \ inline v4int operator op( v4int & a, int ) \ - { \ + { \ v4int b; \ b.i[0] = ( a.i[0] op ); \ b.i[1] = ( a.i[1] op ); \ @@ -748,7 +748,7 @@ namespace v4 # define BINARY(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ + { \ v4int c; \ c.i[0] = a.i[0] op b.i[0]; \ c.i[1] = a.i[1] op b.i[1]; \ @@ -799,7 +799,7 @@ namespace v4 # define LOGICAL(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ + { \ v4int c; \ c.i[0] = - ( a.i[0] op b.i[0] ); \ c.i[1] = - ( a.i[1] op b.i[1] ); \ @@ -857,7 +857,7 @@ namespace v4 v4 tf; tf.v = _mm_or_ps( _mm_andnot_ps( c_v, f.v ), - _mm_and_ps( c_v, t.v ) ); + _mm_and_ps( c_v, t.v ) ); return tf; } @@ -967,11 +967,11 @@ namespace v4 // v4float assignment operators -# define ASSIGN(op,intrin) \ +# define ASSIGN(op,intrin) \ inline v4float &operator op( const v4float &b ) \ - { \ + { \ v = intrin( v, b.v ); \ - return *this; \ + return *this; \ } inline v4float &operator =( const v4float &b ) @@ -1086,7 +1086,7 @@ namespace v4 # define BINARY(op,intrin) \ inline v4float operator op( const v4float &a, const v4float &b ) \ - { \ + { \ v4float c; \ c.v = intrin( a.v, b.v ); \ return c; \ @@ -1103,7 +1103,7 @@ namespace v4 # define LOGICAL(op,intrin) \ inline v4int operator op( const v4float &a, const v4float &b ) \ - { \ + { \ v4int c; \ c.v = intrin( a.v, b.v ); \ return c; \ @@ -1123,7 +1123,7 @@ namespace v4 __m128 
vzero = _mm_setzero_ps(); c.v = _mm_and_ps( _mm_cmpneq_ps( a.v, vzero ), - _mm_cmpneq_ps( b.v, vzero ) ); + _mm_cmpneq_ps( b.v, vzero ) ); return c; } @@ -1135,7 +1135,7 @@ namespace v4 __m128 vzero = _mm_setzero_ps(); c.v = _mm_or_ps( _mm_cmpneq_ps( a.v, vzero ), - _mm_cmpneq_ps( b.v, vzero ) ); + _mm_cmpneq_ps( b.v, vzero ) ); return c; } @@ -1146,7 +1146,7 @@ namespace v4 # define CMATH_FR1(fn) \ inline v4float fn( const v4float &a ) \ - { \ + { \ v4float b; \ b.f[0] = ::fn( a.f[0] ); \ b.f[1] = ::fn( a.f[1] ); \ @@ -1157,7 +1157,7 @@ namespace v4 # define CMATH_FR2(fn) \ inline v4float fn( const v4float &a, const v4float &b ) \ - { \ + { \ v4float c; \ c.f[0] = ::fn( a.f[0], b.f[0] ); \ c.f[1] = ::fn( a.f[1], b.f[1] ); \ @@ -1197,7 +1197,7 @@ namespace v4 __m128 t = _mm_set1_ps( -0.0f ); c.v = _mm_or_ps( _mm_and_ps( t, b.v ), - _mm_andnot_ps( t, a.v ) ); + _mm_andnot_ps( t, a.v ) ); return c; } @@ -1228,15 +1228,15 @@ namespace v4 // Note: It is quicker to just call div_ps and sqrt_ps if more // refinement desired! b.v = _mm_add_ps( b_v, _mm_mul_ps( _mm_set1_ps( 0.5f ), - _mm_sub_ps( b_v, - _mm_mul_ps( a_v, - _mm_mul_ps( b_v, - _mm_mul_ps( b_v, b_v ) - ) - ) - ) - ) - ); + _mm_sub_ps( b_v, + _mm_mul_ps( a_v, + _mm_mul_ps( b_v, + _mm_mul_ps( b_v, b_v ) + ) + ) + ) + ) + ); return b; } @@ -1255,11 +1255,11 @@ namespace v4 // refinement desired! b.v = _mm_fmadd_ps( _mm_set1_ps( 0.5f ), - _mm_fnmadd_ps( a_v, - _mm_mul_ps( b_v, - _mm_mul_ps( b_v, b_v ) ), - b_v ), - b_v ); + _mm_fnmadd_ps( a_v, + _mm_mul_ps( b_v, + _mm_mul_ps( b_v, b_v ) ), + b_v ), + b_v ); return b; } @@ -1277,11 +1277,11 @@ namespace v4 // refinement desired! 
b.v = _mm_fmadd_ps( _mm_set1_ps( 0.5f ), - _mm_fnmadd_ps( a.v, - _mm_mul_ps( b_v, - _mm_mul_ps( b_v, b_v ) ), - b_v ), - b_v ); + _mm_fnmadd_ps( a.v, + _mm_mul_ps( b_v, + _mm_mul_ps( b_v, b_v ) ), + b_v ), + b_v ); return b; } @@ -1305,10 +1305,10 @@ namespace v4 b_v = _mm_rcp_ps( a_v ); b.v = _mm_sub_ps( _mm_add_ps( b_v, b_v ), - _mm_mul_ps( a_v, - _mm_mul_ps( b_v, b_v ) - ) - ); + _mm_mul_ps( a_v, + _mm_mul_ps( b_v, b_v ) + ) + ); return b; } @@ -1324,8 +1324,8 @@ namespace v4 b_v = _mm_rcp_ps( a_v ); b.v = _mm_fnmadd_ps( a_v, - _mm_mul_ps( b_v, b_v ), - _mm_add_ps( b_v, b_v ) ); + _mm_mul_ps( b_v, b_v ), + _mm_add_ps( b_v, b_v ) ); return b; } @@ -1340,8 +1340,8 @@ namespace v4 b_v = _mm_rcp_ps( a.v ); b.v = _mm_fnmadd_ps( a.v, - _mm_mul_ps( b_v, b_v ), - _mm_add_ps( b_v, b_v ) ); + _mm_mul_ps( b_v, b_v ), + _mm_add_ps( b_v, b_v ) ); return b; } diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 57886e87..44aa4648 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -134,9 +134,11 @@ namespace v4 v4( const v4 &a ) // Copy constructor { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + v = a.v; + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // i[j] = a.i[j]; } ~v4() {} // Default destructor @@ -179,7 +181,7 @@ namespace v4 return b; } -# define sw(x,y) x^=y, y^=x, x^=y + #define sw(x,y) x^=y, y^=x, x^=y inline void swap( v4 &a, v4 &b ) { @@ -195,7 +197,7 @@ namespace v4 sw( a2.i[3],a3.i[2] ); } -# undef sw + #undef sw // v4 memory manipulation functions @@ -491,27 +493,53 @@ namespace v4 v4int( const v4int &a ) // Copy constructor { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + v = a.v; + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // i[j] = a.i[j]; } v4int( const v4 &a ) // Init from mixed { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + v = a.v; + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // i[j] = a.i[j]; } v4int( int a ) // 
Init from scalar { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - i[j] = a; + union + { + int i; + float f; + } u; + + u.i = a; + v = vdupq_n_f32( u.f ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // i[j] = a; } v4int( int i0, int i1, int i2, int i3 ) // Init from scalars { + // union + // { + // int i; + // float f; + // } u0, u1, u2, u3; + + // u0.i = i0; + // u1.i = i1; + // u2.i = i2; + // u3.i = i3; + + // v = _mm_setr_ps( u0.f, u1.f, u2.f, u3.f ); + i[0] = i0; i[1] = i1; i[2] = i2; @@ -799,23 +827,29 @@ namespace v4 v4float( const v4float &a ) // Copy constructor { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - f[j] = a.f[j]; + v = a.v; + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // f[j] = a.f[j]; } v4float( const v4 &a ) // Init from mixed { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - f[j] = a.f[j]; + v = a.v; + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // f[j] = a.f[j]; } v4float( float a ) // Init from scalar { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - f[j] = a; + v = vdupq_n_f32( a ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // f[j] = a; } v4float( float f0, float f1, float f2, float f3 ) // Init from scalars @@ -830,7 +864,7 @@ namespace v4 // v4float assignment operators -# define ASSIGN(op) \ + #define ASSIGN(op) \ inline v4float &operator op( const v4float &b ) \ { \ ALWAYS_VECTORIZE \ @@ -845,7 +879,7 @@ namespace v4 ASSIGN(*=) ASSIGN(/=) -# undef ASSIGN + #undef ASSIGN // v4float member access operator diff --git a/src/util/v4/v4_portable.h b/src/util/v4/v4_portable.h index 9f199697..6dbb790b 100644 --- a/src/util/v4/v4_portable.h +++ b/src/util/v4/v4_portable.h @@ -189,7 +189,7 @@ namespace v4 // v4 memory manipulation functions inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) + v4 &a ) { a.i[0] = ((const int * ALIGNED(16))p)[0]; a.i[1] = ((const int * ALIGNED(16))p)[1]; @@ -198,7 +198,7 @@ namespace v4 } inline void store_4x1( const v4 &a, - 
void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { ((int * ALIGNED(16))p)[0] = a.i[0]; ((int * ALIGNED(16))p)[1] = a.i[1]; @@ -207,7 +207,7 @@ namespace v4 } inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { ((int * ALIGNED(16))p)[0] = a.i[0]; ((int * ALIGNED(16))p)[1] = a.i[1]; @@ -234,7 +234,7 @@ namespace v4 } inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) + void * ALIGNED(16) b ) { int t; @@ -259,7 +259,7 @@ namespace v4 inline void load_4x1_tr( const void *a0, const void *a1, const void *a2, const void *a3, - v4 &a ) + v4 &a ) { a.i[0] = ((const int *)a0)[0]; a.i[1] = ((const int *)a1)[0]; @@ -338,7 +338,7 @@ namespace v4 inline void store_4x1_tr( const v4 &a, void *a0, void *a1, - void *a2, void *a3 ) + void *a2, void *a3 ) { ((int *)a0)[0] = a.i[0]; ((int *)a1)[0] = a.i[1]; @@ -527,9 +527,9 @@ namespace v4 // v4int assignment operators -# define ASSIGN(op) \ +# define ASSIGN(op) \ inline v4int &operator op( const v4int &b ) \ - { \ + { \ i[0] op b.i[0]; \ i[1] op b.i[1]; \ i[2] op b.i[2]; \ @@ -568,7 +568,7 @@ namespace v4 # define PREFIX_UNARY(op) \ inline v4int operator op( const v4int & a ) \ - { \ + { \ v4int b; \ b.i[0] = ( op a.i[0] ); \ b.i[1] = ( op a.i[1] ); \ @@ -600,7 +600,7 @@ namespace v4 # define PREFIX_INCDEC(op) \ inline v4int operator op( v4int & a ) \ - { \ + { \ v4int b; \ b.i[0] = ( op a.i[0] ); \ b.i[1] = ( op a.i[1] ); \ @@ -618,7 +618,7 @@ namespace v4 # define POSTFIX_INCDEC(op) \ inline v4int operator op( v4int & a, int ) \ - { \ + { \ v4int b; \ b.i[0] = ( a.i[0] op ); \ b.i[1] = ( a.i[1] op ); \ @@ -636,7 +636,7 @@ namespace v4 # define BINARY(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ + { \ v4int c; \ c.i[0] = a.i[0] op b.i[0]; \ c.i[1] = a.i[1] op b.i[1]; \ @@ -662,7 +662,7 @@ namespace v4 # define LOGICAL(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ + { \ v4int c; \ c.i[0] = -(a.i[0] op b.i[0]); \ c.i[1] = 
-(a.i[1] op b.i[1]); \ @@ -851,8 +851,8 @@ namespace v4 # define ASSIGN(op) \ inline v4float &operator op( const v4float &b ) \ - { \ - f[0] op b.f[0]; \ + { \ + f[0] op b.f[0]; \ f[1] op b.f[1]; \ f[2] op b.f[2]; \ f[3] op b.f[3]; \ @@ -974,7 +974,7 @@ namespace v4 # define BINARY(op) \ inline v4float operator op( const v4float &a, const v4float &b ) \ - { \ + { \ v4float c; \ c.f[0] = a.f[0] op b.f[0]; \ c.f[1] = a.f[1] op b.f[1]; \ @@ -994,7 +994,7 @@ namespace v4 # define LOGICAL(op) \ inline v4int operator op( const v4float &a, const v4float &b ) \ - { \ + { \ v4int c; \ c.i[0] = - ( a.f[0] op b.f[0] ); \ c.i[1] = - ( a.f[1] op b.f[1] ); \ @@ -1018,7 +1018,7 @@ namespace v4 # define CMATH_FR1(fn) \ inline v4float fn( const v4float &a ) \ - { \ + { \ v4float b; \ b.f[0] = ::fn( a.f[0] ); \ b.f[1] = ::fn( a.f[1] ); \ @@ -1029,7 +1029,7 @@ namespace v4 # define CMATH_FR2(fn) \ inline v4float fn( const v4float &a, const v4float &b ) \ - { \ + { \ v4float c; \ c.f[0] = ::fn( a.f[0], b.f[0] ); \ c.f[1] = ::fn( a.f[1], b.f[1] ); \ diff --git a/src/util/v4/v4_portable_v0.h b/src/util/v4/v4_portable_v0.h index 6b2555e8..6a89939e 100644 --- a/src/util/v4/v4_portable_v0.h +++ b/src/util/v4/v4_portable_v0.h @@ -189,7 +189,7 @@ namespace v4 // v4 memory manipulation functions inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) + v4 &a ) { a.i[0] = ((const int * ALIGNED(16))p)[0]; a.i[1] = ((const int * ALIGNED(16))p)[1]; @@ -198,7 +198,7 @@ namespace v4 } inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { ((int * ALIGNED(16))p)[0] = a.i[0]; ((int * ALIGNED(16))p)[1] = a.i[1]; @@ -207,7 +207,7 @@ namespace v4 } inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { ((int * ALIGNED(16))p)[0] = a.i[0]; ((int * ALIGNED(16))p)[1] = a.i[1]; @@ -234,7 +234,7 @@ namespace v4 } inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) + void * ALIGNED(16) b ) { int t; @@ -259,7 +259,7 @@ 
namespace v4 inline void load_4x1_tr( const void *a0, const void *a1, const void *a2, const void *a3, - v4 &a ) + v4 &a ) { a.i[0] = ((const int *)a0)[0]; a.i[1] = ((const int *)a1)[0]; @@ -338,7 +338,7 @@ namespace v4 inline void store_4x1_tr( const v4 &a, void *a0, void *a1, - void *a2, void *a3 ) + void *a2, void *a3 ) { ((int *)a0)[0] = a.i[0]; ((int *)a1)[0] = a.i[1]; @@ -527,9 +527,9 @@ namespace v4 // v4int assignment operators -# define ASSIGN(op) \ +# define ASSIGN(op) \ inline v4int &operator op( const v4int &b ) \ - { \ + { \ i[0] op b.i[0]; \ i[1] op b.i[1]; \ i[2] op b.i[2]; \ @@ -568,7 +568,7 @@ namespace v4 # define PREFIX_UNARY(op) \ inline v4int operator op( const v4int & a ) \ - { \ + { \ v4int b; \ b.i[0] = ( op a.i[0] ); \ b.i[1] = ( op a.i[1] ); \ @@ -600,7 +600,7 @@ namespace v4 # define PREFIX_INCDEC(op) \ inline v4int operator op( v4int & a ) \ - { \ + { \ v4int b; \ b.i[0] = ( op a.i[0] ); \ b.i[1] = ( op a.i[1] ); \ @@ -618,7 +618,7 @@ namespace v4 # define POSTFIX_INCDEC(op) \ inline v4int operator op( v4int & a, int ) \ - { \ + { \ v4int b; \ b.i[0] = ( a.i[0] op ); \ b.i[1] = ( a.i[1] op ); \ @@ -636,7 +636,7 @@ namespace v4 # define BINARY(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ + { \ v4int c; \ c.i[0] = a.i[0] op b.i[0]; \ c.i[1] = a.i[1] op b.i[1]; \ @@ -662,7 +662,7 @@ namespace v4 # define LOGICAL(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ + { \ v4int c; \ c.i[0] = - ( a.i[0] op b.i[0] ); \ c.i[1] = - ( a.i[1] op b.i[1] ); \ @@ -851,8 +851,8 @@ namespace v4 # define ASSIGN(op) \ inline v4float &operator op( const v4float &b ) \ - { \ - f[0] op b.f[0]; \ + { \ + f[0] op b.f[0]; \ f[1] op b.f[1]; \ f[2] op b.f[2]; \ f[3] op b.f[3]; \ @@ -974,7 +974,7 @@ namespace v4 # define BINARY(op) \ inline v4float operator op( const v4float &a, const v4float &b ) \ - { \ + { \ v4float c; \ c.f[0] = a.f[0] op b.f[0]; \ c.f[1] = a.f[1] op b.f[1]; \ @@ -994,7 +994,7 @@ namespace v4 # 
define LOGICAL(op) \ inline v4int operator op( const v4float &a, const v4float &b ) \ - { \ + { \ v4int c; \ c.i[0] = - ( a.f[0] op b.f[0] ); \ c.i[1] = - ( a.f[1] op b.f[1] ); \ @@ -1018,7 +1018,7 @@ namespace v4 # define CMATH_FR1(fn) \ inline v4float fn( const v4float &a ) \ - { \ + { \ v4float b; \ b.f[0] = ::fn( a.f[0] ); \ b.f[1] = ::fn( a.f[1] ); \ @@ -1029,7 +1029,7 @@ namespace v4 # define CMATH_FR2(fn) \ inline v4float fn( const v4float &a, const v4float &b ) \ - { \ + { \ v4float c; \ c.f[0] = ::fn( a.f[0], b.f[0] ); \ c.f[1] = ::fn( a.f[1], b.f[1] ); \ diff --git a/src/util/v4/v4_portable_v1.h b/src/util/v4/v4_portable_v1.h index 4d3c4b20..d67bf4b8 100644 --- a/src/util/v4/v4_portable_v1.h +++ b/src/util/v4/v4_portable_v1.h @@ -134,7 +134,7 @@ namespace v4 { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + i[j] = a.i[j]; } ~v4() {} // Default destructor @@ -198,7 +198,7 @@ namespace v4 // v4 memory manipulation functions inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) + v4 &a ) { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) @@ -206,7 +206,7 @@ namespace v4 } inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) @@ -214,7 +214,7 @@ namespace v4 } inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) @@ -238,7 +238,7 @@ namespace v4 } inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) + void * ALIGNED(16) b ) { int t; @@ -255,7 +255,7 @@ namespace v4 inline void load_4x1_tr( const void *a0, const void *a1, const void *a2, const void *a3, - v4 &a ) + v4 &a ) { a.i[0] = ((const int *)a0)[0]; a.i[1] = ((const int *)a1)[0]; @@ -334,7 +334,7 @@ namespace v4 inline void store_4x1_tr( const v4 &a, void *a0, void *a1, - void *a2, void *a3 ) + void *a2, void *a3 ) { ((int *)a0)[0] = a.i[0]; ((int *)a1)[0] = a.i[1]; @@ -491,21 +491,21 @@ namespace v4 { 
ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + i[j] = a.i[j]; } v4int( const v4 &a ) // Init from mixed { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + i[j] = a.i[j]; } v4int( int a ) // Init from scalar { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - i[j] = a; + i[j] = a; } v4int( int i0, int i1, int i2, int i3 ) // Init from scalars @@ -520,9 +520,9 @@ namespace v4 // v4int assignment operators -# define ASSIGN(op) \ +# define ASSIGN(op) \ inline v4int &operator op( const v4int &b ) \ - { \ + { \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ i[j] op b.i[j]; \ @@ -560,7 +560,7 @@ namespace v4 # define PREFIX_UNARY(op) \ inline v4int operator op( const v4int & a ) \ - { \ + { \ v4int b; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -590,7 +590,7 @@ namespace v4 # define PREFIX_INCDEC(op) \ inline v4int operator op( v4int & a ) \ - { \ + { \ v4int b; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -607,7 +607,7 @@ namespace v4 # define POSTFIX_INCDEC(op) \ inline v4int operator op( v4int & a, int ) \ - { \ + { \ v4int b; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -624,7 +624,7 @@ namespace v4 # define BINARY(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ + { \ v4int c; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -649,7 +649,7 @@ namespace v4 # define LOGICAL(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ + { \ v4int c; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -799,21 +799,21 @@ namespace v4 { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - f[j] = a.f[j]; + f[j] = a.f[j]; } v4float( const v4 &a ) // Init from mixed { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - f[j] = a.f[j]; + f[j] = a.f[j]; } v4float( float a ) // Init from scalar { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - f[j] = a; + f[j] = a; } v4float( float f0, float f1, float f2, float f3 ) // Init from scalars @@ -830,10 +830,10 @@ namespace v4 # define 
ASSIGN(op) \ inline v4float &operator op( const v4float &b ) \ - { \ + { \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ - f[j] op b.f[j]; \ + f[j] op b.f[j]; \ return *this; \ } @@ -945,7 +945,7 @@ namespace v4 # define BINARY(op) \ inline v4float operator op( const v4float &a, const v4float &b ) \ - { \ + { \ v4float c; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -964,7 +964,7 @@ namespace v4 # define LOGICAL(op) \ inline v4int operator op( const v4float &a, const v4float &b ) \ - { \ + { \ v4int c; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -987,7 +987,7 @@ namespace v4 # define CMATH_FR1(fn) \ inline v4float fn( const v4float &a ) \ - { \ + { \ v4float b; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -997,7 +997,7 @@ namespace v4 # define CMATH_FR2(fn) \ inline v4float fn( const v4float &a, const v4float &b ) \ - { \ + { \ v4float c; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ From 2ddef89abb86a58e8738ac467654b8e70aab1a61 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 15 Jul 2019 12:34:39 -0600 Subject: [PATCH 21/95] Add NEON intrinsics support for load_4x4_tr and store_4x4_tr. 
--- src/util/v4/v4_neon.h | 285 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 247 insertions(+), 38 deletions(-) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 44aa4648..f1a691a5 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -273,6 +273,19 @@ namespace v4 const void * ALIGNED(8) a3, v4 &a, v4 &b ) { + // __m128 a_v, b_v, t; + + // b_v = _mm_setzero_ps(); + + // t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a0 ), (__m64 *)a1 ); + // b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a2 ), (__m64 *)a3 ); + + // a_v = _mm_shuffle_ps( t, b_v, 0x88 ); + // b_v = _mm_shuffle_ps( t, b_v, 0xdd ); + + // a.v = a_v; + // b.v = b_v; + a.i[0] = ((const int * ALIGNED(8))a0)[0]; b.i[0] = ((const int * ALIGNED(8))a0)[1]; @@ -315,25 +328,123 @@ namespace v4 const void * ALIGNED(16) a3, v4 &a, v4 &b, v4 &c, v4 &d ) { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - d.i[0] = ((const int * ALIGNED(16))a0)[3]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - d.i[1] = ((const int * ALIGNED(16))a1)[3]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - d.i[2] = ((const int * ALIGNED(16))a2)[3]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - d.i[3] = ((const int * ALIGNED(16))a3)[3]; + //----------------------------------------------------------------- + float32x4_t a_v, b_v, c_v, d_v, t, u; + //----------------------------------------------------------------- + // __m128 a_v, b_v, c_v, d_v, t, u; + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + a_v = vld1q_f32( (const float *) a0 ); + b_v = 
vld1q_f32( (const float *) a1 ); + c_v = vld1q_f32( (const float *) a2 ); + d_v = vld1q_f32( (const float *) a3 ); + //----------------------------------------------------------------- + // a_v = _mm_load_ps( (const float *) a0 ); + // b_v = _mm_load_ps( (const float *) a1 ); + // c_v = _mm_load_ps( (const float *) a2 ); + // d_v = _mm_load_ps( (const float *) a3 ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + float32x2_t a_vh = vget_high_f32( a_v ); + float32x2_t b_vh = vget_high_f32( b_v ); + + float32x2x2_t res_ab_h = vzip_f32( a_vh, b_vh ); + + t = vcombine_f32( res_ab_h.val[0], res_ab_h.val[1] ); + //----------------------------------------------------------------- + // t = _mm_unpackhi_ps( a_v, b_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + float32x2_t c_vh = vget_high_f32( c_v ); + float32x2_t d_vh = vget_high_f32( d_v ); + + float32x2x2_t res_cd_h = vzip_f32( c_vh, d_vh ); + + u = vcombine_f32( res_cd_h.val[0], res_cd_h.val[1] ); + //----------------------------------------------------------------- + // u = _mm_unpackhi_ps( c_v, d_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + float32x2_t a_vl = vget_low_f32( a_v ); + float32x2_t b_vl = vget_low_f32( b_v ); + + float32x2x2_t res_ab_l = vzip_f32( a_vl, b_vl ); + + a_v = vcombine_f32( res_ab_l.val[0], res_ab_l.val[1] ); + //----------------------------------------------------------------- + // a_v = _mm_unpacklo_ps( a_v, b_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + float32x2_t c_vl = vget_low_f32( c_v ); + float32x2_t d_vl = vget_low_f32( d_v ); + + float32x2x2_t res_cd_l = vzip_f32( c_vl, d_vl ); + + c_v 
= vcombine_f32( res_cd_l.val[0], res_cd_l.val[1] ); + //----------------------------------------------------------------- + // c_v = _mm_unpacklo_ps( c_v, d_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + a.v[0] = a_v[0]; + a.v[1] = a_v[1]; + a.v[2] = c_v[0]; + a.v[3] = c_v[1]; + //----------------------------------------------------------------- + // a.v = _mm_movelh_ps( a_v, c_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + c.v[0] = t[0]; + c.v[1] = t[1]; + c.v[2] = u[0]; + c.v[3] = u[1]; + //----------------------------------------------------------------- + // c.v = _mm_movelh_ps( t, u ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + b.v[0] = a_v[2]; + b.v[1] = a_v[3]; + b.v[2] = c_v[2]; + b.v[3] = c_v[3]; + //----------------------------------------------------------------- + // b.v = _mm_movehl_ps( c_v, a_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + d.v[0] = t[2]; + d.v[1] = t[3]; + d.v[2] = u[2]; + d.v[3] = u[3]; + //----------------------------------------------------------------- + // d.v = _mm_movehl_ps( u, t ); + //----------------------------------------------------------------- + + // a.i[0] = ((const int * ALIGNED(16))a0)[0]; + // b.i[0] = ((const int * ALIGNED(16))a0)[1]; + // c.i[0] = ((const int * ALIGNED(16))a0)[2]; + // d.i[0] = ((const int * ALIGNED(16))a0)[3]; + + // a.i[1] = ((const int * ALIGNED(16))a1)[0]; + // b.i[1] = ((const int * ALIGNED(16))a1)[1]; + // c.i[1] = ((const int * ALIGNED(16))a1)[2]; + // d.i[1] = ((const int * ALIGNED(16))a1)[3]; + + // a.i[2] = ((const int * ALIGNED(16))a2)[0]; + // b.i[2] = ((const int * ALIGNED(16))a2)[1]; + 
// c.i[2] = ((const int * ALIGNED(16))a2)[2]; + // d.i[2] = ((const int * ALIGNED(16))a2)[3]; + + // a.i[3] = ((const int * ALIGNED(16))a3)[0]; + // b.i[3] = ((const int * ALIGNED(16))a3)[1]; + // c.i[3] = ((const int * ALIGNED(16))a3)[2]; + // d.i[3] = ((const int * ALIGNED(16))a3)[3]; } inline void store_4x1_tr( const v4 &a, @@ -388,25 +499,123 @@ namespace v4 void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; - ((int * ALIGNED(16))a0)[3] = d.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; - ((int * ALIGNED(16))a1)[3] = d.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; - ((int * ALIGNED(16))a2)[3] = d.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; - ((int * ALIGNED(16))a3)[3] = d.i[3]; + //----------------------------------------------------------------- + float32x4_t a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; + //----------------------------------------------------------------- + // __m128 a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + float32x2_t a_vh = vget_high_f32( a_v ); + float32x2_t b_vh = vget_high_f32( b_v ); + + float32x2x2_t res_ab_h = vzip_f32( a_vh, b_vh ); + + t = vcombine_f32( res_ab_h.val[0], res_ab_h.val[1] ); + //----------------------------------------------------------------- + // t = _mm_unpackhi_ps( a_v, b_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + float32x2_t a_vl = vget_low_f32( a_v ); + 
float32x2_t b_vl = vget_low_f32( b_v ); + + float32x2x2_t res_ab_l = vzip_f32( a_vl, b_vl ); + + a_v = vcombine_f32( res_ab_l.val[0], res_ab_l.val[1] ); + //----------------------------------------------------------------- + // a_v = _mm_unpacklo_ps( a_v, b_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + float32x2_t c_vh = vget_high_f32( c_v ); + float32x2_t d_vh = vget_high_f32( d_v ); + + float32x2x2_t res_cd_h = vzip_f32( c_vh, d_vh ); + + u = vcombine_f32( res_cd_h.val[0], res_cd_h.val[1] ); + //----------------------------------------------------------------- + // u = _mm_unpackhi_ps( c_v, d_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + float32x2_t c_vl = vget_low_f32( c_v ); + float32x2_t d_vl = vget_low_f32( d_v ); + + float32x2x2_t res_cd_l = vzip_f32( c_vl, d_vl ); + + c_v = vcombine_f32( res_cd_l.val[0], res_cd_l.val[1] ); + //----------------------------------------------------------------- + // c_v = _mm_unpacklo_ps( c_v, d_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + b_v[0] = a_v[2]; + b_v[1] = a_v[3]; + b_v[2] = c_v[2]; + b_v[3] = c_v[3]; + //----------------------------------------------------------------- + // b_v = _mm_movehl_ps( c_v, a_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + a_v[0] = a_v[0]; + a_v[1] = a_v[1]; + a_v[2] = c_v[0]; + a_v[3] = c_v[1]; + //----------------------------------------------------------------- + // a_v = _mm_movelh_ps( a_v, c_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + c_v[0] = t[0]; + c_v[1] = t[1]; + c_v[2] = 
u[0]; + c_v[3] = u[1]; + //----------------------------------------------------------------- + // c_v = _mm_movelh_ps( t, u ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + d_v[0] = t[2]; + d_v[1] = t[3]; + d_v[2] = u[2]; + d_v[3] = u[3]; + //----------------------------------------------------------------- + // d_v = _mm_movehl_ps( u, t ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + vst1q_f32( (float *) a0, a_v ); + vst1q_f32( (float *) a1, b_v ); + vst1q_f32( (float *) a2, c_v ); + vst1q_f32( (float *) a3, d_v ); + //----------------------------------------------------------------- + // _mm_store_ps( (float *) a0, a_v ); + // _mm_store_ps( (float *) a1, b_v ); + // _mm_store_ps( (float *) a2, c_v ); + // _mm_store_ps( (float *) a3, d_v ); + //----------------------------------------------------------------- + + // ((int * ALIGNED(16))a0)[0] = a.i[0]; + // ((int * ALIGNED(16))a0)[1] = b.i[0]; + // ((int * ALIGNED(16))a0)[2] = c.i[0]; + // ((int * ALIGNED(16))a0)[3] = d.i[0]; + + // ((int * ALIGNED(16))a1)[0] = a.i[1]; + // ((int * ALIGNED(16))a1)[1] = b.i[1]; + // ((int * ALIGNED(16))a1)[2] = c.i[1]; + // ((int * ALIGNED(16))a1)[3] = d.i[1]; + + // ((int * ALIGNED(16))a2)[0] = a.i[2]; + // ((int * ALIGNED(16))a2)[1] = b.i[2]; + // ((int * ALIGNED(16))a2)[2] = c.i[2]; + // ((int * ALIGNED(16))a2)[3] = d.i[2]; + + // ((int * ALIGNED(16))a3)[0] = a.i[3]; + // ((int * ALIGNED(16))a3)[1] = b.i[3]; + // ((int * ALIGNED(16))a3)[2] = c.i[3]; + // ((int * ALIGNED(16))a3)[3] = d.i[3]; } ////////////// From 2a2fedde10e73d5fffabb5173a2b46746f36308a Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 15 Jul 2019 14:07:06 -0600 Subject: [PATCH 22/95] Add NEON intrinsics support for v4float assignment operators. 
--- src/util/v4/v4_avx2.h | 8 ++++---- src/util/v4/v4_neon.h | 43 ++++++++++++++++++++++++++++++++----------- 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/src/util/v4/v4_avx2.h b/src/util/v4/v4_avx2.h index 104121db..1fdde94a 100644 --- a/src/util/v4/v4_avx2.h +++ b/src/util/v4/v4_avx2.h @@ -592,7 +592,7 @@ namespace v4 // v4int assignment operators -# define ASSIGN(op) \ + #define ASSIGN(op) \ inline v4int &operator op( const v4int &b ) \ { \ i[0] op b.i[0]; \ @@ -639,7 +639,7 @@ namespace v4 ASSIGN(<<=) ASSIGN(>>=) -# undef ASSIGN + #undef ASSIGN // v4int member access operator @@ -967,7 +967,7 @@ namespace v4 // v4float assignment operators -# define ASSIGN(op,intrin) \ + #define ASSIGN(op,intrin) \ inline v4float &operator op( const v4float &b ) \ { \ v = intrin( v, b.v ); \ @@ -986,7 +986,7 @@ namespace v4 ASSIGN( *=, _mm_mul_ps ) ASSIGN( /=, _mm_div_ps ) -# undef ASSIGN + #undef ASSIGN // v4float member access operator diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index f1a691a5..2ec33500 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -759,7 +759,7 @@ namespace v4 // v4int assignment operators -# define ASSIGN(op) \ + #define ASSIGN(op) \ inline v4int &operator op( const v4int &b ) \ { \ ALWAYS_VECTORIZE \ @@ -780,7 +780,7 @@ namespace v4 ASSIGN(<<=) ASSIGN(>>=) -# undef ASSIGN + #undef ASSIGN // v4int member access operator @@ -1073,23 +1073,44 @@ namespace v4 // v4float assignment operators - #define ASSIGN(op) \ + #define ASSIGN(op,intrin) \ inline v4float &operator op( const v4float &b ) \ { \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - f[j] op b.f[j]; \ + v = intrin( v, b.v ); \ return *this; \ } - ASSIGN(=) - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) + inline v4float &operator =( const v4float &b ) + { + v = b.v; + + return *this; + } + + ASSIGN( +=, vaddq_f32 ) + ASSIGN( -=, vsubq_f32 ) + ASSIGN( *=, vmulq_f32 ) + ASSIGN( /=, vdivq_f32 ) #undef ASSIGN + // #define ASSIGN(op) \ + // 
inline v4float &operator op( const v4float &b ) \ + // { \ + // ALWAYS_VECTORIZE \ + // for( int j = 0; j < 4; j++ ) \ + // f[j] op b.f[j]; \ + // return *this; \ + // } + + // ASSIGN(=) + // ASSIGN(+=) + // ASSIGN(-=) + // ASSIGN(*=) + // ASSIGN(/=) + + // #undef ASSIGN + // v4float member access operator inline float &operator []( int n ) From 29c7a566202f5e5262d724ff7760826d68c9f3c4 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 15 Jul 2019 15:38:59 -0600 Subject: [PATCH 23/95] Add NEON intrinsics support for rsqrt and rcp functions. --- src/util/v4/v4_avx2.h | 4 +- src/util/v4/v4_neon.h | 91 +++++++++++++++++++++++++++++-------------- 2 files changed, 63 insertions(+), 32 deletions(-) diff --git a/src/util/v4/v4_avx2.h b/src/util/v4/v4_avx2.h index 1fdde94a..c8132a3c 100644 --- a/src/util/v4/v4_avx2.h +++ b/src/util/v4/v4_avx2.h @@ -1202,8 +1202,8 @@ namespace v4 return c; } -# undef CMATH_FR1 -# undef CMATH_FR2 + #undef CMATH_FR1 + #undef CMATH_FR2 // v4float miscelleanous functions diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 2ec33500..4c6efd3c 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -797,7 +797,7 @@ namespace v4 // v4int prefix unary operators -# define PREFIX_UNARY(op) \ + #define PREFIX_UNARY(op) \ inline v4int operator op( const v4int & a ) \ { \ v4int b; \ @@ -823,11 +823,11 @@ namespace v4 PREFIX_UNARY(~) -# undef PREFIX_UNARY + #undef PREFIX_UNARY // v4int prefix increment / decrement -# define PREFIX_INCDEC(op) \ + #define PREFIX_INCDEC(op) \ inline v4int operator op( v4int & a ) \ { \ v4int b; \ @@ -840,11 +840,11 @@ namespace v4 PREFIX_INCDEC(++) PREFIX_INCDEC(--) -# undef PREFIX_INCDEC + #undef PREFIX_INCDEC // v4int postfix increment / decrement -# define POSTFIX_INCDEC(op) \ + #define POSTFIX_INCDEC(op) \ inline v4int operator op( v4int & a, int ) \ { \ v4int b; \ @@ -857,11 +857,11 @@ namespace v4 POSTFIX_INCDEC(++) POSTFIX_INCDEC(--) -# undef POSTFIX_INCDEC + #undef POSTFIX_INCDEC // v4int 
binary operators -# define BINARY(op) \ + #define BINARY(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ { \ v4int c; \ @@ -882,11 +882,11 @@ namespace v4 BINARY(<<) BINARY(>>) -# undef BINARY + #undef BINARY // v4int logical operators -# define LOGICAL(op) \ + #define LOGICAL(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ { \ v4int c; \ @@ -905,7 +905,7 @@ namespace v4 LOGICAL(&&) LOGICAL(||) -# undef LOGICAL + #undef LOGICAL // v4int miscellaneous functions @@ -996,8 +996,8 @@ namespace v4 // v4float math library friends -# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE + #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ const v4float &b ) ALWAYS_INLINE CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); @@ -1008,8 +1008,8 @@ namespace v4 CMATH_FR2(copysign); -# undef CMATH_FR1 -# undef CMATH_FR2 + #undef CMATH_FR1 + #undef CMATH_FR2 // v4float miscellaneous friends @@ -1266,7 +1266,7 @@ namespace v4 // v4float math library functions -# define CMATH_FR1(fn) \ + #define CMATH_FR1(fn) \ inline v4float fn( const v4float &a ) \ { \ v4float b; \ @@ -1276,7 +1276,7 @@ namespace v4 return b; \ } -# define CMATH_FR2(fn) \ + #define CMATH_FR2(fn) \ inline v4float fn( const v4float &a, const v4float &b ) \ { \ v4float c; \ @@ -1308,8 +1308,8 @@ namespace v4 return c; } -# undef CMATH_FR1 -# undef CMATH_FR2 + #undef CMATH_FR1 + #undef CMATH_FR2 // v4float miscellaneous functions @@ -1317,9 +1317,11 @@ namespace v4 { v4float b; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = ::sqrt( 1.0f / a.f[j] ); + b.v = vrsqrteq_f32( a.v ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.f[j] = ::sqrt( 1.0f / a.f[j] ); return b; } @@ -1328,9 +1330,26 @@ namespace v4 { v4float b; - ALWAYS_VECTORIZE - 
for( int j = 0; j < 4; j++ ) - b.f[j] = ::sqrt( 1.0f / a.f[j] ); + float32x4_t a_v = a.v, b_v; + + b_v = vrsqrteq_f32( a_v ); + + // Note: It is quicker to just call div_ps and sqrt_ps if more + // refinement desired! + b.v = vaddq_f32( b_v, vmulq_f32( vdupq_n_f32( 0.5f ), + vsubq_f32( b_v, + vmulq_f32( a_v, + vmulq_f32( b_v, + vmulq_f32( b_v, b_v ) + ) + ) + ) + ) + ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.f[j] = ::sqrt( 1.0f / a.f[j] ); return b; } @@ -1339,9 +1358,11 @@ namespace v4 { v4float b; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = 1.0f / a.f[j]; + b.v = vrecpeq_f32( a.v ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.f[j] = 1.0f / a.f[j]; return b; } @@ -1350,9 +1371,19 @@ namespace v4 { v4float b; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = 1.0f / a.f[j]; + float32x4_t a_v = a.v, b_v; + + b_v = vrecpeq_f32( a_v ); + + b.v = vsubq_f32( vaddq_f32( b_v, b_v ), + vmulq_f32( a_v, + vmulq_f32( b_v, b_v ) + ) + ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.f[j] = 1.0f / a.f[j]; return b; } From a482375d202992045250f884ced3194ff5e5f09d Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 15 Jul 2019 17:31:49 -0600 Subject: [PATCH 24/95] Add NEON intrinsics support for transpose function. 
--- src/util/v4/v4_neon.h | 88 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 3 deletions(-) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 4c6efd3c..964510c9 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -192,9 +192,91 @@ namespace v4 inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) { - sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); - sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); - sw( a2.i[3],a3.i[2] ); + fload32x4_t a0_v, a2_v, t, u; + + //----------------------------------------------------------------- + float32x2_t a0_vh = vget_high_f32( a0.v ); + float32x2_t a1_vh = vget_high_f32( a1.v ); + + float32x2x2_t res_a0a1_h = vzip_f32( a0_vh, a1_vh ); + + t = vcombine_f32( res_a0a1_h.val[0], res_a0a1_h.val[1] ); + //----------------------------------------------------------------- + // t = _mm_unpackhi_ps( a0.v, a1.v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + float32x2_t a0_vl = vget_low_f32( a0.v ); + float32x2_t a1_vl = vget_low_f32( a1.v ); + + float32x2x2_t res_a0a1_l = vzip_f32( a0_vl, a1_vl ); + + a0_v = vcombine_f32( res_a0a1_l.val[0], res_a0a1_l.val[1] ); + //----------------------------------------------------------------- + // a0_v = _mm_unpacklo_ps( a0.v, a1.v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + float32x2_t a2_vh = vget_high_f32( a2.v ); + float32x2_t a3_vh = vget_high_f32( a3.v ); + + float32x2x2_t res_a2a3_h = vzip_f32( a2_vh, a3_vh ); + + u = vcombine_f32( res_a2a3_h.val[0], res_a2a3_h.val[1] ); + //----------------------------------------------------------------- + // u = _mm_unpackhi_ps( a2.v, a3.v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + 
float32x2_t a2_vl = vget_low_f32( a2.v ); + float32x2_t a3_vl = vget_low_f32( a3.v ); + + float32x2x2_t res_a2a3_l = vzip_f32( a2_vl, a3_vl ); + + a2_v = vcombine_f32( res_a2a3_l.val[0], res_a2a3_l.val[1] ); + //----------------------------------------------------------------- + // a2_v = _mm_unpacklo_ps( a2.v, a3.v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + a0.v[0] = a0_v[0]; + a0.v[1] = a0_v[1]; + a0.v[2] = a2_v[0]; + a0.v[3] = a2_v[1]; + //----------------------------------------------------------------- + // a0.v = _mm_movelh_ps( a0_v, a2_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + a1.v[0] = a0_v[2]; + a1.v[1] = a0_v[3]; + a1.v[2] = a2_v[2]; + a1.v[3] = a2_v[3]; + //----------------------------------------------------------------- + // a1.v = _mm_movehl_ps( a2_v, a0_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + a2.v[0] = t[0]; + a2.v[1] = t[1]; + a2.v[2] = u[0]; + a2.v[3] = u[1]; + //----------------------------------------------------------------- + // a2.v = _mm_movelh_ps( t, u ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + a3.v[0] = t[2]; + a3.v[1] = t[3]; + a3.v[2] = u[2]; + a3.v[3] = u[3]; + //----------------------------------------------------------------- + // a3.v = _mm_movehl_ps( u, t ); + //----------------------------------------------------------------- + + // sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); + // sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); + // sw( a2.i[3],a3.i[2] ); } #undef sw From 7353705152f52f28e8c888393e091f6c13dbebdd Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 15 Jul 2019 17:39:16 -0600 
Subject: [PATCH 25/95] Fix a typo. --- src/util/v4/v4_neon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 964510c9..5f3a48ac 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -192,7 +192,7 @@ namespace v4 inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) { - fload32x4_t a0_v, a2_v, t, u; + float32x4_t a0_v, a2_v, t, u; //----------------------------------------------------------------- float32x2_t a0_vh = vget_high_f32( a0.v ); From 4a80b0170363386b2bf37fa94f4487415b33646c Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 15 Jul 2019 21:02:58 -0600 Subject: [PATCH 26/95] Add support for benchmarking center_p and uncenter_p. --- src/vpic/initialize.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/vpic/initialize.cc b/src/vpic/initialize.cc index 8cc28da0..4aa7f0e7 100644 --- a/src/vpic/initialize.cc +++ b/src/vpic/initialize.cc @@ -51,6 +51,12 @@ vpic_simulation::initialize( int argc, if( rank()==0 ) MESSAGE(( "Uncentering particles" )); TIC load_interpolator_array( interpolator_array, field_array ); TOC( load_interpolator, 1 ); } + LIST_FOR_EACH( sp, species_list ) TIC sort_p( sp ); TOC( sort_p, 1 ); + for( int iwdn = 0; iwdn < 1000; iwdn++ ) + { + LIST_FOR_EACH( sp, species_list ) TIC uncenter_p( sp, interpolator_array ); TOC( uncenter_p, 1 ); + LIST_FOR_EACH( sp, species_list ) TIC center_p( sp, interpolator_array ); TOC( center_p, 1 ); + } LIST_FOR_EACH( sp, species_list ) TIC uncenter_p( sp, interpolator_array ); TOC( uncenter_p, 1 ); if( rank()==0 ) MESSAGE(( "Performing initial diagnostics" )); From 0c16a2024b7d382b91718461ab3ff74625718240 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Tue, 16 Jul 2019 11:18:01 -0600 Subject: [PATCH 27/95] Change number of iterations for uncenter/center loop. 
--- src/vpic/initialize.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vpic/initialize.cc b/src/vpic/initialize.cc index 4aa7f0e7..4961559b 100644 --- a/src/vpic/initialize.cc +++ b/src/vpic/initialize.cc @@ -52,7 +52,7 @@ vpic_simulation::initialize( int argc, TIC load_interpolator_array( interpolator_array, field_array ); TOC( load_interpolator, 1 ); } LIST_FOR_EACH( sp, species_list ) TIC sort_p( sp ); TOC( sort_p, 1 ); - for( int iwdn = 0; iwdn < 1000; iwdn++ ) + for( int iwdn = 0; iwdn < 100; iwdn++ ) { LIST_FOR_EACH( sp, species_list ) TIC uncenter_p( sp, interpolator_array ); TOC( uncenter_p, 1 ); LIST_FOR_EACH( sp, species_list ) TIC center_p( sp, interpolator_array ); TOC( center_p, 1 ); From ec22f2c32a76e2cd028d503e26032dd0b8c1a530 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Tue, 16 Jul 2019 21:00:19 -0600 Subject: [PATCH 28/95] Add NEON intrinsic support for v4float logical operators to v4_neon.h. --- src/util/v4/v4_avx2.h | 2 +- src/util/v4/v4_neon.h | 102 +++++++++++++++++++++++++++++++++++------- 2 files changed, 88 insertions(+), 16 deletions(-) diff --git a/src/util/v4/v4_avx2.h b/src/util/v4/v4_avx2.h index c8132a3c..4800c98e 100644 --- a/src/util/v4/v4_avx2.h +++ b/src/util/v4/v4_avx2.h @@ -1101,7 +1101,7 @@ namespace v4 // v4float logical operators -# define LOGICAL(op,intrin) \ + #define LOGICAL(op,intrin) \ inline v4int operator op( const v4float &a, const v4float &b ) \ { \ v4int c; \ diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 5f3a48ac..763519f9 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -123,8 +123,10 @@ namespace v4 union { - int i[4]; - float f[4]; + int i[4]; + float f[4]; + int32x4_t vsi; + uint32x4_t vui; float32x4_t v; }; @@ -1325,26 +1327,96 @@ namespace v4 // v4float logical operators -# define LOGICAL(op) \ + #define LOGICAL(op,intrin) \ inline v4int operator op( const v4float &a, const v4float &b ) \ { \ v4int c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 
4; j++ ) \ - c.i[j] = - ( a.f[j] op b.f[j] ); \ + c.v = intrin( a.v, b.v ); \ return c; \ } - LOGICAL(< ) - LOGICAL(> ) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) + LOGICAL( <, vcltq_f32 ) + LOGICAL( >, vcgtq_f32 ) + LOGICAL( ==, vceqq_f32 ) + LOGICAL( <=, vcleq_f32 ) + LOGICAL( >=, vcgeq_f32 ) + // LOGICAL( !=, _mm_cmpneq_ps ) + + inline v4int operator !=( const v4float &a, const v4float &b ) + { + v4int c; + + // r.neon_u32 = vmvnq_u32(vceqq_f32(a.neon_f32, b.neon_f32)); + // return type looks wrong here. try adding uint32x4_t vi to + // the union. may need to do a cast. + + c.vui = vmvnq_u32( vceqq_f32( a.v, b.v ) ); + + return c; + } + + inline v4int operator &&( const v4float &a, const v4float &b ) + { + v4int c; + + float32x4_t vzero = vdupq_n_f32(0.0f); + + // __m128 vzero = _mm_setzero_ps(); + + // Is there a better way to do this than the SSE way? + c.vsi = vandq_s32( vmvnq_u32( vceqq_f32( a.v, + vzero ) ), + vmvnq_u32( vceqq_f32( b.v, + vzero ) ) ); + + // c.v = _mm_and_ps( _mm_cmpneq_ps( a.v, vzero ), + // _mm_cmpneq_ps( b.v, vzero ) ); + + return c; + } + + inline v4int operator ||( const v4float &a, const v4float &b ) + { + v4int c; + + float32x4_t vzero = vdupq_n_f32(0.0f); + + // __m128 vzero = _mm_setzero_ps(); + + // Is there a better way to do this than the SSE way? 
+ c.vsi = vorrq_s32( vmvnq_u32( vceqq_f32( a.v, + vzero ) ), + vmvnq_u32( vceqq_f32( b.v, + vzero ) ) ); + + // c.v = _mm_or_ps( _mm_cmpneq_ps( a.v, vzero ), + // _mm_cmpneq_ps( b.v, vzero ) ); + + return c; + } + + #undef LOGICAL + + // #define LOGICAL(op) \ + // inline v4int operator op( const v4float &a, const v4float &b ) \ + // { \ + // v4int c; \ + // ALWAYS_VECTORIZE \ + // for( int j = 0; j < 4; j++ ) \ + // c.i[j] = - ( a.f[j] op b.f[j] ); \ + // return c; \ + // } + + // LOGICAL(< ) + // LOGICAL(> ) + // LOGICAL(==) + // LOGICAL(!=) + // LOGICAL(<=) + // LOGICAL(>=) + // LOGICAL(&&) + // LOGICAL(||) -# undef LOGICAL + // #undef LOGICAL // v4float math library functions From 6be57c295d5519329d54a8692d8c605ff7170672 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Wed, 17 Jul 2019 09:31:12 -0600 Subject: [PATCH 29/95] Try an implementation of load/store transpose operations using vld4q_f32 and vst4q_f32. --- src/util/v4/v4_neon.h | 142 ++++++++++++++++++++++++++++++++---------- 1 file changed, 108 insertions(+), 34 deletions(-) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 763519f9..1fd3d609 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -345,10 +345,14 @@ namespace v4 const void *a2, const void *a3, v4 &a ) { - a.i[0] = ((const int *)a0)[0]; - a.i[1] = ((const int *)a1)[0]; - a.i[2] = ((const int *)a2)[0]; - a.i[3] = ((const int *)a3)[0]; + float32x4x4_t mat = vld4q_f32( (const float *) a0 ); + + a.v = mat.val[0]; + + // a.i[0] = ((const int *)a0)[0]; + // a.i[1] = ((const int *)a1)[0]; + // a.i[2] = ((const int *)a2)[0]; + // a.i[3] = ((const int *)a3)[0]; } inline void load_4x2_tr( const void * ALIGNED(8) a0, @@ -357,55 +361,88 @@ namespace v4 const void * ALIGNED(8) a3, v4 &a, v4 &b ) { - // __m128 a_v, b_v, t; + float32x4x4_t mat = vld4q_f32( (const float *) a0 ); - // b_v = _mm_setzero_ps(); + a.v = mat.val[0]; + b.v = mat.val[1]; - // t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a0 ), (__m64 *)a1 ); - // b_v = 
_mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a2 ), (__m64 *)a3 ); + // a.i[0] = ((const int * ALIGNED(8))a0)[0]; + // b.i[0] = ((const int * ALIGNED(8))a0)[1]; - // a_v = _mm_shuffle_ps( t, b_v, 0x88 ); - // b_v = _mm_shuffle_ps( t, b_v, 0xdd ); + // a.i[1] = ((const int * ALIGNED(8))a1)[0]; + // b.i[1] = ((const int * ALIGNED(8))a1)[1]; - // a.v = a_v; - // b.v = b_v; + // a.i[2] = ((const int * ALIGNED(8))a2)[0]; + // b.i[2] = ((const int * ALIGNED(8))a2)[1]; - a.i[0] = ((const int * ALIGNED(8))a0)[0]; - b.i[0] = ((const int * ALIGNED(8))a0)[1]; + // a.i[3] = ((const int * ALIGNED(8))a3)[0]; + // b.i[3] = ((const int * ALIGNED(8))a3)[1]; + } - a.i[1] = ((const int * ALIGNED(8))a1)[0]; - b.i[1] = ((const int * ALIGNED(8))a1)[1]; + inline void load_4x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c ) + { + float32x4x4_t mat = vld4q_f32( (const float *) a0 ); - a.i[2] = ((const int * ALIGNED(8))a2)[0]; - b.i[2] = ((const int * ALIGNED(8))a2)[1]; + a.v = mat.val[0]; + b.v = mat.val[1]; + c.v = mat.val[2]; - a.i[3] = ((const int * ALIGNED(8))a3)[0]; - b.i[3] = ((const int * ALIGNED(8))a3)[1]; + // a.i[0] = ((const int * ALIGNED(16))a0)[0]; + // b.i[0] = ((const int * ALIGNED(16))a0)[1]; + // c.i[0] = ((const int * ALIGNED(16))a0)[2]; + + // a.i[1] = ((const int * ALIGNED(16))a1)[0]; + // b.i[1] = ((const int * ALIGNED(16))a1)[1]; + // c.i[1] = ((const int * ALIGNED(16))a1)[2]; + + // a.i[2] = ((const int * ALIGNED(16))a2)[0]; + // b.i[2] = ((const int * ALIGNED(16))a2)[1]; + // c.i[2] = ((const int * ALIGNED(16))a2)[2]; + + // a.i[3] = ((const int * ALIGNED(16))a3)[0]; + // b.i[3] = ((const int * ALIGNED(16))a3)[1]; + // c.i[3] = ((const int * ALIGNED(16))a3)[2]; } - inline void load_4x3_tr( const void * ALIGNED(16) a0, + inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 
&b, v4 &c ) + v4 &a, v4 &b, v4 &c, v4 &d ) { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; + float32x4x4_t mat = vld4q_f32( (const float *) a0 ); + + a.v = mat.val[0]; + b.v = mat.val[1]; + c.v = mat.val[2]; + d.v = mat.val[3]; - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; + // a.i[0] = ((const int * ALIGNED(16))a0)[0]; + // b.i[0] = ((const int * ALIGNED(16))a0)[1]; + // c.i[0] = ((const int * ALIGNED(16))a0)[2]; + // d.i[0] = ((const int * ALIGNED(16))a0)[3]; - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; + // a.i[1] = ((const int * ALIGNED(16))a1)[0]; + // b.i[1] = ((const int * ALIGNED(16))a1)[1]; + // c.i[1] = ((const int * ALIGNED(16))a1)[2]; + // d.i[1] = ((const int * ALIGNED(16))a1)[3]; - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; + // a.i[2] = ((const int * ALIGNED(16))a2)[0]; + // b.i[2] = ((const int * ALIGNED(16))a2)[1]; + // c.i[2] = ((const int * ALIGNED(16))a2)[2]; + // d.i[2] = ((const int * ALIGNED(16))a2)[3]; + + // a.i[3] = ((const int * ALIGNED(16))a3)[0]; + // b.i[3] = ((const int * ALIGNED(16))a3)[1]; + // c.i[3] = ((const int * ALIGNED(16))a3)[2]; + // d.i[3] = ((const int * ALIGNED(16))a3)[3]; } + #if 0 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, @@ -530,6 +567,7 @@ namespace v4 // c.i[3] = ((const int * ALIGNED(16))a3)[2]; // d.i[3] = ((const int * ALIGNED(16))a3)[3]; } + #endif inline void store_4x1_tr( const v4 &a, void *a0, void *a1, @@ -579,6 +617,41 @@ namespace v4 ((int * ALIGNED(16))a3)[2] = c.i[3]; } + inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, + void * ALIGNED(16) a0, void * 
ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + { + float32x4x4_t mat; + + mat.val[0] = a.v; + mat.val[1] = b.v; + mat.val[2] = c.v; + mat.val[3] = d.v; + + vst4q_f32( (const float *) a0, mat ); + + // ((int * ALIGNED(16))a0)[0] = a.i[0]; + // ((int * ALIGNED(16))a0)[1] = b.i[0]; + // ((int * ALIGNED(16))a0)[2] = c.i[0]; + // ((int * ALIGNED(16))a0)[3] = d.i[0]; + + // ((int * ALIGNED(16))a1)[0] = a.i[1]; + // ((int * ALIGNED(16))a1)[1] = b.i[1]; + // ((int * ALIGNED(16))a1)[2] = c.i[1]; + // ((int * ALIGNED(16))a1)[3] = d.i[1]; + + // ((int * ALIGNED(16))a2)[0] = a.i[2]; + // ((int * ALIGNED(16))a2)[1] = b.i[2]; + // ((int * ALIGNED(16))a2)[2] = c.i[2]; + // ((int * ALIGNED(16))a2)[3] = d.i[2]; + + // ((int * ALIGNED(16))a3)[0] = a.i[3]; + // ((int * ALIGNED(16))a3)[1] = b.i[3]; + // ((int * ALIGNED(16))a3)[2] = c.i[3]; + // ((int * ALIGNED(16))a3)[3] = d.i[3]; + } + + #if 0 inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) @@ -701,6 +774,7 @@ namespace v4 // ((int * ALIGNED(16))a3)[2] = c.i[3]; // ((int * ALIGNED(16))a3)[3] = d.i[3]; } + #endif ////////////// // v4int class From e7fd3e09bed07c7d477bddf0157cfe358b767b12 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Thu, 18 Jul 2019 16:26:03 -0600 Subject: [PATCH 30/95] Comment out new implementations for load_4x4_tr and store_4x4_tr since they will not work. 
--- src/util/v4/v4_neon.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 1fd3d609..bdd30925 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -408,6 +408,7 @@ namespace v4 // c.i[3] = ((const int * ALIGNED(16))a3)[2]; } + #if 0 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, @@ -441,8 +442,9 @@ namespace v4 // c.i[3] = ((const int * ALIGNED(16))a3)[2]; // d.i[3] = ((const int * ALIGNED(16))a3)[3]; } + #endif - #if 0 + #if 1 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, @@ -617,6 +619,7 @@ namespace v4 ((int * ALIGNED(16))a3)[2] = c.i[3]; } + #if 0 inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) @@ -650,8 +653,9 @@ namespace v4 // ((int * ALIGNED(16))a3)[2] = c.i[3]; // ((int * ALIGNED(16))a3)[3] = d.i[3]; } + #endif - #if 0 + #if 1 inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) From d9b7adb51a0460c5d413d0406b06239d06959f16 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 22 Jul 2019 15:11:50 -0600 Subject: [PATCH 31/95] More work on v4_neon support. 
--- src/util/v4/v4_avx2.h | 182 ++++++++------ src/util/v4/v4_neon.h | 553 +++++++++++++++++++++++++++++++----------- 2 files changed, 518 insertions(+), 217 deletions(-) diff --git a/src/util/v4/v4_avx2.h b/src/util/v4/v4_avx2.h index 4800c98e..023ba95a 100644 --- a/src/util/v4/v4_avx2.h +++ b/src/util/v4/v4_avx2.h @@ -29,7 +29,7 @@ namespace v4 constexpr static int value = i0 + i1*4 + i2*16 + i3*64; }; -# define PERM(i0,i1,i2,i3) ((i0) + (i1)*4 + (i2)*16 + (i3)*64) + #define PERM(i0,i1,i2,i3) ((i0) + (i1)*4 + (i2)*16 + (i3)*64) //////////////// // v4 base class @@ -151,8 +151,8 @@ namespace v4 template inline v4 splat( const v4 & a ) { - __m128 a_v = a.v; v4 b; + __m128 a_v = a.v; b.v = _mm_shuffle_ps( a_v, a_v, ( n*permute<1,1,1,1>::value ) ); @@ -162,8 +162,8 @@ namespace v4 template inline v4 shuffle( const v4 & a ) { - __m128 a_v = a.v; v4 b; + __m128 a_v = a.v; b.v = _mm_shuffle_ps( a_v, a_v, ( permute::value ) ); @@ -231,7 +231,8 @@ namespace v4 } /* FIXME: MAKE ROBUST AGAINST ALIASING ISSUES */ - inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) + inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) { __m128 t = _mm_load_ps( ( float * ) a ); @@ -241,21 +242,24 @@ namespace v4 // v4 transposed memory manipulation functions - inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, + inline void load_4x1_tr( const void *a0, + const void *a1, + const void *a2, + const void *a3, v4 &a ) { - a.v = _mm_setr_ps( ((const float *)a0)[0], - ((const float *)a1)[0], - ((const float *)a2)[0], - ((const float *)a3)[0] ); + a.v = _mm_setr_ps( ( (const float *) a0 )[0], + ( (const float *) a1 )[0], + ( (const float *) a2 )[0], + ( (const float *) a3 )[0] ); } inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, - v4 &a, v4 &b ) + v4 &a, + v4 &b ) { __m128 a_v, b_v, t; @@ -275,7 +279,9 @@ namespace v4 const void * 
ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) + v4 &a, + v4 &b, + v4 &c ) { __m128 a_v, b_v, c_v, t, u; @@ -298,12 +304,16 @@ namespace v4 c.v = c_v; } -#if 0 + #if 0 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) { + v4 &a, + v4 &b, + v4 &c, + v4 &d ) + { __m128 a_v, b_v, c_v, d_v, t, u; a_v = _mm_load_ps( (const float *)a0 ); b_v = _mm_load_ps( (const float *)a1 ); @@ -319,14 +329,18 @@ namespace v4 d_v = _mm_movehl_ps( u, t ); a.v = a_v; b.v = b_v; c.v = c_v; d.v = d_v; } -#endif + #endif -#if 0 + #if 0 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) { + v4 &a, + v4 &b, + v4 &c, + v4 &d ) + { __m128 a_v, b_v, c_v, d_v, t, u; a_v = _mm_load_ps( (const float *)a0 ); @@ -344,14 +358,18 @@ namespace v4 c.v = _mm_movelh_ps( t, u ); d.v = _mm_movehl_ps( u, t ); } -#endif + #endif -#if 0 + #if 0 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) { + v4 &a, + v4 &b, + v4 &c, + v4 &d ) + { __m128 a_v, b_v, c_v, d_v, t, u; a_v = _mm_load_ps( (const float *)a0 ); @@ -369,13 +387,16 @@ namespace v4 d.v = _mm_movehl_ps( u, t ); c.v = _mm_movelh_ps( t, u ); } -#endif + #endif inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) + v4 &a, + v4 &b, + v4 &c, + v4 &d ) { __m128 a_v, b_v, c_v, d_v, t, u; @@ -396,18 +417,23 @@ namespace v4 } inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, - void *a2, void *a3 ) + void *a0, + void *a1, + void *a2, + void *a3 ) { - ((float *)a0)[0] = a.f[0]; - ((float *)a1)[0] = a.f[1]; - ((float *)a2)[0] = 
a.f[2]; - ((float *)a3)[0] = a.f[3]; - } - - inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, void * ALIGNED(8) a1, - void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) + ( (float *) a0 )[0] = a.f[0]; + ( (float *) a1 )[0] = a.f[1]; + ( (float *) a2 )[0] = a.f[2]; + ( (float *) a3 )[0] = a.f[3]; + } + + inline void store_4x2_tr( const v4 &a, + const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) { __m128 a_v = a.v, b_v = b.v, t; @@ -422,9 +448,13 @@ namespace v4 _mm_storeh_pi( (__m64 *)a3, t ); // a3 b3 -> a3 } - inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x3_tr( const v4 &a, + const v4 &b, + const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { __m128 a_v = a.v, b_v = b.v, t; @@ -445,10 +475,14 @@ namespace v4 } // FIXME: IS THIS FASTER THAN THE OLD WAY (HAD MORE STORE INSTR) - inline void store_4x4_tr( const v4 &a, const v4 &b, - const v4 &c, const v4 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { __m128 a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; @@ -602,6 +636,14 @@ namespace v4 return *this; \ } + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + ASSIGN(%=) + ASSIGN(<<=) + ASSIGN(>>=) + inline v4int &operator =( const v4int &b ) { v = b.v; @@ -609,12 +651,6 @@ namespace v4 return *this; } - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) - ASSIGN(%=) - inline v4int &operator ^=( const v4int &b ) { v = _mm_xor_ps( v, b.v ); @@ -636,9 +672,6 @@ namespace v4 return *this; } - ASSIGN(<<=) - ASSIGN(>>=) - #undef ASSIGN // v4int member access 
operator @@ -656,7 +689,7 @@ namespace v4 // v4int prefix unary operators -# define PREFIX_UNARY(op) \ + #define PREFIX_UNARY(op) \ inline v4int operator op( const v4int & a ) \ { \ v4int b; \ @@ -706,11 +739,11 @@ namespace v4 return b; } -# undef PREFIX_UNARY + #undef PREFIX_UNARY // v4int prefix increment / decrement -# define PREFIX_INCDEC(op) \ + #define PREFIX_INCDEC(op) \ inline v4int operator op( v4int & a ) \ { \ v4int b; \ @@ -724,11 +757,11 @@ namespace v4 PREFIX_INCDEC(++) PREFIX_INCDEC(--) -# undef PREFIX_INCDEC + #undef PREFIX_INCDEC // v4int postfix increment / decrement -# define POSTFIX_INCDEC(op) \ + #define POSTFIX_INCDEC(op) \ inline v4int operator op( v4int & a, int ) \ { \ v4int b; \ @@ -742,11 +775,11 @@ namespace v4 POSTFIX_INCDEC(++) POSTFIX_INCDEC(--) -# undef POSTFIX_INCDEC + #undef POSTFIX_INCDEC // v4int binary operators -# define BINARY(op) \ + #define BINARY(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ { \ v4int c; \ @@ -762,6 +795,8 @@ namespace v4 BINARY(*) BINARY(/) BINARY(%) + BINARY(<<) + BINARY(>>) inline v4int operator ^( const v4int &a, const v4int &b ) { @@ -790,14 +825,11 @@ namespace v4 return c; } - BINARY(<<) - BINARY(>>) - -# undef BINARY + #undef BINARY // v4int logical operators -# define LOGICAL(op) \ + #define LOGICAL(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ { \ v4int c; \ @@ -817,7 +849,7 @@ namespace v4 LOGICAL(&&) LOGICAL(||) -# undef LOGICAL + #undef LOGICAL // v4int miscellaneous functions @@ -905,9 +937,9 @@ namespace v4 // v4float math library friends -# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE + #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE + #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + const v4float &b ) ALWAYS_INLINE CMATH_FR1(acos); CMATH_FR1(asin); 
CMATH_FR1(atan); CMATH_FR2(atan2); CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); @@ -917,8 +949,8 @@ namespace v4 CMATH_FR2(copysign); -# undef CMATH_FR1 -# undef CMATH_FR2 + #undef CMATH_FR1 + #undef CMATH_FR2 // v4float miscellaneous friends @@ -974,6 +1006,11 @@ namespace v4 return *this; \ } + ASSIGN( +=, _mm_add_ps ) + ASSIGN( -=, _mm_sub_ps ) + ASSIGN( *=, _mm_mul_ps ) + ASSIGN( /=, _mm_div_ps ) + inline v4float &operator =( const v4float &b ) { v = b.v; @@ -981,11 +1018,6 @@ namespace v4 return *this; } - ASSIGN( +=, _mm_add_ps ) - ASSIGN( -=, _mm_sub_ps ) - ASSIGN( *=, _mm_mul_ps ) - ASSIGN( /=, _mm_div_ps ) - #undef ASSIGN // v4float member access operator @@ -1140,11 +1172,11 @@ namespace v4 return c; } -# undef LOGICAL + #undef LOGICAL // v4float math library functions -# define CMATH_FR1(fn) \ + #define CMATH_FR1(fn) \ inline v4float fn( const v4float &a ) \ { \ v4float b; \ @@ -1155,7 +1187,7 @@ namespace v4 return b; \ } -# define CMATH_FR2(fn) \ + #define CMATH_FR2(fn) \ inline v4float fn( const v4float &a, const v4float &b ) \ { \ v4float c; \ diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index bdd30925..1734f62a 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -162,6 +162,10 @@ namespace v4 inline v4 splat( const v4 & a ) { v4 b; + // __m128 a_v = a.v; + + // b.v = _mm_shuffle_ps( a_v, a_v, ( n*permute<1,1,1,1>::value ) ); + ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) @@ -174,6 +178,9 @@ namespace v4 inline v4 shuffle( const v4 & a ) { v4 b; + // __m128 a_v = a.v; + + // b.v = _mm_shuffle_ps( a_v, a_v, ( permute::value ) ); b.i[0] = a.i[i0]; b.i[1] = a.i[i1]; @@ -187,6 +194,12 @@ namespace v4 inline void swap( v4 &a, v4 &b ) { + // __m128 a_v = a.v; + + // a.v = b.v; + + // b.v = a_v; + ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) sw( a.i[j], b.i[j] ); @@ -288,22 +301,32 @@ namespace v4 inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - 
a.i[j] = ((const int * ALIGNED(16))p)[j]; + a.v = vld1q_f32( ( float * ) p ); + + // a.v = _mm_load_ps( ( float * ) p ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // a.i[j] = ((const int * ALIGNED(16))p)[j]; } inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - ((int * ALIGNED(16))p)[j] = a.i[j]; + vst1q_f32( ( float * ) p, a.v ); + + // _mm_store_ps( ( float * ) p, a.v ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // ((int * ALIGNED(16))p)[j] = a.i[j]; } inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) { + // _mm_stream_ps( ( float * ) p, a.v ); + ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) ((int * ALIGNED(16))p)[j] = a.i[j]; @@ -311,101 +334,165 @@ namespace v4 inline void clear_4x1( void * ALIGNED(16) p ) { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - ((int * ALIGNED(16))p)[j] = 0; + vst1q_f32( ( float * ) p, vdupq_n_f32( 0.0f ) ); + + // _mm_store_ps( ( float * ) p, _mm_setzero_ps() ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // ((int * ALIGNED(16))p)[j] = 0; } // FIXME: Ordering semantics inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) src ) { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - ((int * ALIGNED(16))dst)[j] = ((const int * ALIGNED(16))src)[j]; + vst1q_f32( ( float * ) dst, vld1q_f32( ( const float * ) src ) ); + + // _mm_store_ps( ( float * ) dst, _mm_load_ps( ( const float * ) src ) ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // ((int * ALIGNED(16))dst)[j] = ((const int * ALIGNED(16))src)[j]; } inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) { - int t; + float32x4_t t = vld1q_f32( ( float * ) a ); - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - { - t = ((int * ALIGNED(16))a)[j]; - ((int * ALIGNED(16))a)[j] = ((int * ALIGNED(16))b)[j]; - ((int * ALIGNED(16))b)[j] = t; - } + vst1q_f32( ( float * ) a, vld1q_f32( ( float * ) b ) ); + vst1q_f32( ( float 
* ) b, t ); + + // __m128 t = _mm_load_ps( ( float * ) a ); + + // _mm_store_ps( ( float * ) a, _mm_load_ps( ( float * ) b ) ); + // _mm_store_ps( ( float * ) b, t ); + + // int t; + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // { + // t = ((int * ALIGNED(16))a)[j]; + // ((int * ALIGNED(16))a)[j] = ((int * ALIGNED(16))b)[j]; + // ((int * ALIGNED(16))b)[j] = t; + // } } // v4 transposed memory manipulation functions - inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, + inline void load_4x1_tr( const void *a0, + const void *a1, + const void *a2, + const void *a3, v4 &a ) { - float32x4x4_t mat = vld4q_f32( (const float *) a0 ); + // a.v = _mm_setr_ps( ( (const float *) a0 )[0], + // ( (const float *) a1 )[0], + // ( (const float *) a2 )[0], + // ( (const float *) a3 )[0] ); - a.v = mat.val[0]; + // Not correct. + // float32x4x4_t mat = vld4q_f32( (const float *) a0 ); - // a.i[0] = ((const int *)a0)[0]; - // a.i[1] = ((const int *)a1)[0]; - // a.i[2] = ((const int *)a2)[0]; - // a.i[3] = ((const int *)a3)[0]; + // a.v = mat.val[0]; + + a.i[0] = ((const int *)a0)[0]; + a.i[1] = ((const int *)a1)[0]; + a.i[2] = ((const int *)a2)[0]; + a.i[3] = ((const int *)a3)[0]; } inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, - v4 &a, v4 &b ) + v4 &a, + v4 &b ) { - float32x4x4_t mat = vld4q_f32( (const float *) a0 ); + // __m128 a_v, b_v, t; - a.v = mat.val[0]; - b.v = mat.val[1]; + // b_v = _mm_setzero_ps(); + + // t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a0 ), (__m64 *)a1 ); + // b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a2 ), (__m64 *)a3 ); + + // a_v = _mm_shuffle_ps( t, b_v, 0x88 ); + // b_v = _mm_shuffle_ps( t, b_v, 0xdd ); + + // a.v = a_v; + // b.v = b_v; + + // Not correct. 
+ // float32x4x4_t mat = vld4q_f32( (const float *) a0 ); + + // a.v = mat.val[0]; + // b.v = mat.val[1]; - // a.i[0] = ((const int * ALIGNED(8))a0)[0]; - // b.i[0] = ((const int * ALIGNED(8))a0)[1]; + a.i[0] = ((const int * ALIGNED(8))a0)[0]; + b.i[0] = ((const int * ALIGNED(8))a0)[1]; - // a.i[1] = ((const int * ALIGNED(8))a1)[0]; - // b.i[1] = ((const int * ALIGNED(8))a1)[1]; + a.i[1] = ((const int * ALIGNED(8))a1)[0]; + b.i[1] = ((const int * ALIGNED(8))a1)[1]; - // a.i[2] = ((const int * ALIGNED(8))a2)[0]; - // b.i[2] = ((const int * ALIGNED(8))a2)[1]; + a.i[2] = ((const int * ALIGNED(8))a2)[0]; + b.i[2] = ((const int * ALIGNED(8))a2)[1]; - // a.i[3] = ((const int * ALIGNED(8))a3)[0]; - // b.i[3] = ((const int * ALIGNED(8))a3)[1]; + a.i[3] = ((const int * ALIGNED(8))a3)[0]; + b.i[3] = ((const int * ALIGNED(8))a3)[1]; } inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) + v4 &a, + v4 &b, + v4 &c ) { - float32x4x4_t mat = vld4q_f32( (const float *) a0 ); + // __m128 a_v, b_v, c_v, t, u; - a.v = mat.val[0]; - b.v = mat.val[1]; - c.v = mat.val[2]; + // t = _mm_load_ps( (const float *)a0 ); + // b_v = _mm_load_ps( (const float *)a1 ); + // c_v = _mm_load_ps( (const float *)a2 ); + // u = _mm_load_ps( (const float *)a3 ); - // a.i[0] = ((const int * ALIGNED(16))a0)[0]; - // b.i[0] = ((const int * ALIGNED(16))a0)[1]; - // c.i[0] = ((const int * ALIGNED(16))a0)[2]; + // a_v = _mm_unpacklo_ps( t, b_v ); + // b_v = _mm_unpackhi_ps( t, b_v ); + // t = _mm_unpacklo_ps( c_v, u ); + // u = _mm_unpackhi_ps( c_v, u ); - // a.i[1] = ((const int * ALIGNED(16))a1)[0]; - // b.i[1] = ((const int * ALIGNED(16))a1)[1]; - // c.i[1] = ((const int * ALIGNED(16))a1)[2]; + // c_v = _mm_movelh_ps( b_v, u ); + // b_v = _mm_movehl_ps( t, a_v ); + // a_v = _mm_movelh_ps( a_v, t ); - // a.i[2] = ((const int * ALIGNED(16))a2)[0]; - // b.i[2] = ((const int * ALIGNED(16))a2)[1]; - // 
c.i[2] = ((const int * ALIGNED(16))a2)[2]; + // a.v = a_v; + // b.v = b_v; + // c.v = c_v; - // a.i[3] = ((const int * ALIGNED(16))a3)[0]; - // b.i[3] = ((const int * ALIGNED(16))a3)[1]; - // c.i[3] = ((const int * ALIGNED(16))a3)[2]; + // Not correct. + // float32x4x4_t mat = vld4q_f32( (const float *) a0 ); + + // a.v = mat.val[0]; + // b.v = mat.val[1]; + // c.v = mat.val[2]; + + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; } #if 0 @@ -413,8 +500,12 @@ namespace v4 const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) + v4 &a, + v4 &b, + v4 &c, + v4 &d ) { + // Not correct. 
float32x4x4_t mat = vld4q_f32( (const float *) a0 ); a.v = mat.val[0]; @@ -449,7 +540,10 @@ namespace v4 const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) + v4 &a, + v4 &b, + v4 &c, + v4 &d ) { //----------------------------------------------------------------- float32x4_t a_v, b_v, c_v, d_v, t, u; @@ -572,58 +666,102 @@ namespace v4 #endif inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, - void *a2, void *a3 ) + void *a0, + void *a1, + void *a2, + void *a3 ) { - ((int *)a0)[0] = a.i[0]; - ((int *)a1)[0] = a.i[1]; - ((int *)a2)[0] = a.i[2]; - ((int *)a3)[0] = a.i[3]; + ( (int *) a0 )[0] = a.i[0]; + ( (int *) a1 )[0] = a.i[1]; + ( (int *) a2 )[0] = a.i[2]; + ( (int *) a3 )[0] = a.i[3]; } - inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, void * ALIGNED(8) a1, - void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) + inline void store_4x2_tr( const v4 &a, + const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) { - ((int * ALIGNED(8))a0)[0] = a.i[0]; - ((int * ALIGNED(8))a0)[1] = b.i[0]; + // __m128 a_v = a.v, b_v = b.v, t; + + // t = _mm_unpacklo_ps( a_v, b_v ); // a0 b0 a1 b1 -> t - ((int * ALIGNED(8))a1)[0] = a.i[1]; - ((int * ALIGNED(8))a1)[1] = b.i[1]; + // _mm_storel_pi( (__m64 *)a0, t ); // a0 b0 -> a0 + // _mm_storeh_pi( (__m64 *)a1, t ); // a1 b1 -> a1 - ((int * ALIGNED(8))a2)[0] = a.i[2]; - ((int * ALIGNED(8))a2)[1] = b.i[2]; + // t = _mm_unpackhi_ps( a_v, b_v ); // a2 b2 a3 b3 -> t - ((int * ALIGNED(8))a3)[0] = a.i[3]; - ((int * ALIGNED(8))a3)[1] = b.i[3]; + // _mm_storel_pi( (__m64 *)a2, t ); // a2 b2 -> a2 + // _mm_storeh_pi( (__m64 *)a3, t ); // a3 b3 -> a3 + + ( ( int * ALIGNED(8) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(8) ) a0 )[1] = b.i[0]; + + ( ( int * ALIGNED(8) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(8) ) a1 )[1] = b.i[1]; + + ( ( int * ALIGNED(8) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(8) ) a2 
)[1] = b.i[2]; + + ( ( int * ALIGNED(8) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(8) ) a3 )[1] = b.i[3]; } - inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x3_tr( const v4 &a, + const v4 &b, + const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; + // __m128 a_v = a.v, b_v = b.v, t; + + // t = _mm_unpacklo_ps( a_v, b_v ); // a0 b0 a1 b1 -> t + + // _mm_storel_pi( (__m64 *)a0, t ); // a0 b0 -> a0 + // _mm_storeh_pi( (__m64 *)a1, t ); // a1 b1 -> a1 - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; + // t = _mm_unpackhi_ps( a_v, b_v ); // a2 b2 a3 b3 -> t - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; + // _mm_storel_pi( (__m64 *)a2, t ); // a2 b2 -> a2 + // _mm_storeh_pi( (__m64 *)a3, t ); // a3 b3 -> a3 - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; + // ((float *)a0)[2] = c.f[0]; + // ((float *)a1)[2] = c.f[1]; + // ((float *)a2)[2] = c.f[2]; + // ((float *)a3)[2] = c.f[3]; + + ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; + ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; + + ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; + ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; + + ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; + ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; + + ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; + ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; } #if 0 - inline void store_4x4_tr( const v4 &a, const 
v4 &b, const v4 &c, const v4 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { + // Not correct. float32x4x4_t mat; mat.val[0] = a.v; @@ -656,9 +794,14 @@ namespace v4 #endif #if 1 - inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { //----------------------------------------------------------------- float32x4_t a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; @@ -930,17 +1073,45 @@ namespace v4 return *this; \ } - ASSIGN( =) ASSIGN(+=) ASSIGN(-=) ASSIGN(*=) ASSIGN(/=) ASSIGN(%=) - ASSIGN(^=) - ASSIGN(&=) - ASSIGN(|=) ASSIGN(<<=) ASSIGN(>>=) + // ASSIGN( =) + // ASSIGN(^=) + // ASSIGN(&=) + // ASSIGN(|=) + + inline v4int &operator =( const v4int &b ) + { + v = b.v; + + return *this; + } + + inline v4int &operator ^=( const v4int &b ) + { + vsi = veorq_s32( vsi, b.vsi ); + + return *this; + } + + inline v4int &operator &=( const v4int &b ) + { + vsi = vandq_s32( vsi, b.vsi ); + + return *this; + } + + inline v4int &operator |=( const v4int &b ) + { + vsi = vorrq_s32( vsi, b.vsi ); + + return *this; + } #undef ASSIGN @@ -1038,11 +1209,38 @@ namespace v4 BINARY(*) BINARY(/) BINARY(%) - BINARY(^) - BINARY(&) - BINARY(|) BINARY(<<) BINARY(>>) + // BINARY(^) + // BINARY(&) + // BINARY(|) + + inline v4int operator ^( const v4int &a, const v4int &b ) + { + v4int c; + + c.vsi = veorq_s32( a.vsi, b.vsi ); + + return c; + } + + inline v4int operator &( const v4int &a, const v4int &b ) + { + v4int c; + + c.vsi = vandq_s32( a.vsi, 
b.vsi ); + + return c; + } + + inline v4int operator |( const v4int &a, const v4int &b ) + { + v4int c; + + c.vsi = vorrq_s32( a.vsi, b.vsi ); + + return c; + } #undef BINARY @@ -1086,9 +1284,13 @@ namespace v4 { v4 b; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = a.i[j] & ~c.i[j]; + b.vsi = vbicq_s32( c.vsi, a.vsi ); + + // b.v = _mm_andnot_ps( c.v, a.v ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.i[j] = a.i[j] & ~c.i[j]; return b; } @@ -1097,9 +1299,13 @@ namespace v4 { v4 b; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = a.i[j] & c.i[j]; + b.vsi = vandq_s32( c.vsi, a.vsi ); + + // b.v = _mm_and_ps( c.v, a.v ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.i[j] = a.i[j] & c.i[j]; return b; } @@ -1108,9 +1314,18 @@ namespace v4 { v4 m; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); + m.vsi = vorrq_s32( vbicq_s32( c.vsi, f.vsi ), + vandq_s32( c.vsi, t.vsi ) ); + + // __m128 c_v = c.v; + // v4 tf; + + // tf.v = _mm_or_ps( _mm_andnot_ps( c_v, f.v ), + // _mm_and_ps( c_v, t.v ) ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); return m; } @@ -1225,6 +1440,8 @@ namespace v4 v4float( float f0, float f1, float f2, float f3 ) // Init from scalars { + // v = _mm_setr_ps( f0, f1, f2, f3 ); + f[0] = f0; f[1] = f1; f[2] = f2; @@ -1242,6 +1459,11 @@ namespace v4 return *this; \ } + ASSIGN( +=, vaddq_f32 ) + ASSIGN( -=, vsubq_f32 ) + ASSIGN( *=, vmulq_f32 ) + ASSIGN( /=, vdivq_f32 ) + inline v4float &operator =( const v4float &b ) { v = b.v; @@ -1249,11 +1471,6 @@ namespace v4 return *this; } - ASSIGN( +=, vaddq_f32 ) - ASSIGN( -=, vsubq_f32 ) - ASSIGN( *=, vmulq_f32 ) - ASSIGN( /=, vdivq_f32 ) - #undef ASSIGN // #define ASSIGN(op) \ @@ -1292,9 +1509,11 @@ namespace v4 { v4float b; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = +a.f[j]; + b.v = a.v; + + // 
ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.f[j] = +a.f[j]; return b; } @@ -1303,6 +1522,8 @@ namespace v4 { v4float b; + // b.v = _mm_sub_ps( _mm_setzero_ps(), a.v ); + ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) b.f[j] = -a.f[j]; @@ -1314,6 +1535,8 @@ namespace v4 { v4int b; + // b.v = _mm_cmpeq_ps( _mm_setzero_ps(), a.v ); + ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) b.i[j] = a.i[j] ? 0 : -1; @@ -1327,6 +1550,11 @@ namespace v4 { v4float b; + // __m128 t = _mm_add_ps( a.v, _mm_set1_ps( 1 ) ); + + // a.v = t; + // b.v = t; + ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) b.f[j] = ++a.f[j]; @@ -1338,6 +1566,11 @@ namespace v4 { v4float b; + // __m128 t = _mm_sub_ps( a.v, _mm_set1_ps( 1 ) ); + + // a.v = t; + // b.v = t; + ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) b.f[j] = --a.f[j]; @@ -1351,6 +1584,11 @@ namespace v4 { v4float b; + // __m128 a_v = a.v; + + // a.v = _mm_add_ps( a_v, _mm_set1_ps( 1 ) ); + // b.v = a_v; + ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) b.f[j] = a.f[j]++; @@ -1362,6 +1600,11 @@ namespace v4 { v4float b; + // __m128 a_v = a.v; + + // a.v = _mm_sub_ps( a_v, _mm_set1_ps( 1 ) ); + // b.v = a_v; + ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) b.f[j] = a.f[j]--; @@ -1650,9 +1893,11 @@ namespace v4 { v4float d; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - d.f[j] = c.f[j] - a.f[j] * b.f[j]; + d.v = vsubq_f32( vdupq_n_f32( 0.0f ), vfmsq_f32( a.v, b.v, c.v ) ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // d.f[j] = c.f[j] - a.f[j] * b.f[j]; return d; } @@ -1661,9 +1906,13 @@ namespace v4 { v4float b; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = ( ~m.i[j] ) & a.i[j]; + b.vsi = vbicq_s32( m.vsi, a.vsi ); + + // b.v = _mm_andnot_ps( m.v, a.v ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.i[j] = ( ~m.i[j] ) & a.i[j]; return b; } @@ -1672,9 +1921,13 @@ namespace v4 { v4float b; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = m.i[j] | a.i[j]; + b.vsi = 
vorrq_s32( m.vsi, a.vsi ); + + // b.v = _mm_or_ps( m.v, a.v ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.i[j] = m.i[j] | a.i[j]; return b; } @@ -1683,32 +1936,48 @@ namespace v4 { v4float b; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = m.i[j] ^ a.i[j]; + b.vsi = veorq_s32( m.vsi, a.vsi ); + + // b.v = _mm_xor_ps( m.v, a.v ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.i[j] = m.i[j] ^ a.i[j]; return b; } inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - p[j] += a.f[j]; + vst1q_f32( p, vaddq_f32( vld1q_f32( p ), a.v ) ); + + // _mm_store_ps( p, _mm_add_ps( _mm_load_ps( p ), a.v ) ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // p[j] += a.f[j]; } inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - p[j] -= a.f[j]; + vst1q_f32( p, vsubq_f32( vld1q_f32( p ), a.v ) ); + + // _mm_store_ps( p, _mm_sub_ps( _mm_load_ps( p ), a.v ) ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // p[j] -= a.f[j]; } inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - p[j] *= a.f[j]; + vst1q_f32( p, vmulq_f32( vld1q_f32( p ), a.v ) ); + + // _mm_store_ps( p, _mm_mul_ps( _mm_load_ps( p ), a.v ) ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // p[j] *= a.f[j]; } inline void trilinear( v4float & wl, v4float & wh ) From b62ab7a96997754474148c07c0e49c783c113dfc Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 22 Jul 2019 17:27:25 -0600 Subject: [PATCH 32/95] Add another NEON implementation of transpose. 
--- src/util/v4/v4_neon.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 1734f62a..6272d322 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -205,6 +205,24 @@ namespace v4 sw( a.i[j], b.i[j] ); } + inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) + { + float32x4_t r, s, t, u; + + r = vtrn1q_f32( a0.v, a1.v ); + s = vtrn2q_f32( a0.v, a1.v ); + + t = vtrn1q_f32( a2.v, a3.v ); + u = vtrn2q_f32( a2.v, a3.v ); + + a0.v = vtrn1q_f64( r, t ); + a2.v = vtrn2q_f64( r, t ); + + a1.v = vtrn1q_f64( s, u ); + a3.v = vtrn2q_f64( s, u ); + } + + #if 0 inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) { float32x4_t a0_v, a2_v, t, u; @@ -293,6 +311,17 @@ namespace v4 // sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); // sw( a2.i[3],a3.i[2] ); } + #endif + + #if 0 + // Portable version. + inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) + { + sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); + sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); + sw( a2.i[3],a3.i[2] ); + } + #endif #undef sw From 70191a65e580b9e5200ce7d1fa7e407dbd68bad3 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Tue, 23 Jul 2019 09:43:50 -0600 Subject: [PATCH 33/95] Add new NEON implementations for load and store transpose functions. 
--- src/util/v4/v4_neon.h | 193 +++++++++++++++++++++++++++++++++--------- 1 file changed, 151 insertions(+), 42 deletions(-) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 6272d322..86fe7446 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -205,6 +205,7 @@ namespace v4 sw( a.i[j], b.i[j] ); } + #if 1 inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) { float32x4_t r, s, t, u; @@ -221,6 +222,7 @@ namespace v4 a1.v = vtrn1q_f64( s, u ); a3.v = vtrn2q_f64( s, u ); } + #endif #if 0 inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) @@ -306,10 +308,6 @@ namespace v4 //----------------------------------------------------------------- // a3.v = _mm_movehl_ps( u, t ); //----------------------------------------------------------------- - - // sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); - // sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); - // sw( a2.i[3],a3.i[2] ); } #endif @@ -433,6 +431,7 @@ namespace v4 a.i[3] = ((const int *)a3)[0]; } + #if 1 inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, @@ -440,37 +439,46 @@ namespace v4 v4 &a, v4 &b ) { - // __m128 a_v, b_v, t; + float32x4_t r, s, t, u, a2_v, a3_v; - // b_v = _mm_setzero_ps(); + a.v = vld1q_f32( (const float *) a0 ); + b.v = vld1q_f32( (const float *) a1 ); + a2_v = vld1q_f32( (const float *) a2 ); + a3_v = vld1q_f32( (const float *) a3 ); - // t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a0 ), (__m64 *)a1 ); - // b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a2 ), (__m64 *)a3 ); - - // a_v = _mm_shuffle_ps( t, b_v, 0x88 ); - // b_v = _mm_shuffle_ps( t, b_v, 0xdd ); - - // a.v = a_v; - // b.v = b_v; + r = vtrn1q_f32( a.v, b.v ); + s = vtrn2q_f32( a.v, b.v ); - // Not correct. 
- // float32x4x4_t mat = vld4q_f32( (const float *) a0 ); + t = vtrn1q_f32( a2_v, a3_v ); + u = vtrn2q_f32( a2_v, a3_v ); - // a.v = mat.val[0]; - // b.v = mat.val[1]; + a.v = vtrn1q_f64( r, t ); + b.v = vtrn1q_f64( s, u ); + } + #endif - a.i[0] = ((const int * ALIGNED(8))a0)[0]; - b.i[0] = ((const int * ALIGNED(8))a0)[1]; + #if 0 + // Portable version. + inline void load_4x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + v4 &a, + v4 &b ) + { + a.i[0] = ( ( const int * ALIGNED(8) ) a0 )[0]; + b.i[0] = ( ( const int * ALIGNED(8) ) a0 )[1]; - a.i[1] = ((const int * ALIGNED(8))a1)[0]; - b.i[1] = ((const int * ALIGNED(8))a1)[1]; + a.i[1] = ( ( const int * ALIGNED(8) ) a1 )[0]; + b.i[1] = ( ( const int * ALIGNED(8) ) a1 )[1]; - a.i[2] = ((const int * ALIGNED(8))a2)[0]; - b.i[2] = ((const int * ALIGNED(8))a2)[1]; + a.i[2] = ( ( const int * ALIGNED(8) ) a2 )[0]; + b.i[2] = ( ( const int * ALIGNED(8) ) a2 )[1]; - a.i[3] = ((const int * ALIGNED(8))a3)[0]; - b.i[3] = ((const int * ALIGNED(8))a3)[1]; + a.i[3] = ( ( const int * ALIGNED(8) ) a3 )[0]; + b.i[3] = ( ( const int * ALIGNED(8) ) a3 )[1]; } + #endif inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, @@ -565,6 +573,36 @@ namespace v4 #endif #if 1 + inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, + v4 &b, + v4 &c, + v4 &d ) + { + float32x4_t r, s, t, u; + + a.v = vld1q_f32( (const float *) a0 ); + b.v = vld1q_f32( (const float *) a1 ); + c.v = vld1q_f32( (const float *) a2 ); + d.v = vld1q_f32( (const float *) a3 ); + + r = vtrn1q_f32( a.v, b.v ); + s = vtrn2q_f32( a.v, b.v ); + + t = vtrn1q_f32( c.v, d.v ); + u = vtrn2q_f32( c.v, d.v ); + + a.v = vtrn1q_f64( r, t ); + b.v = vtrn1q_f64( s, u ); + c.v = vtrn2q_f64( r, t ); + d.v = vtrn2q_f64( s, u ); + } + #endif + + #if 0 inline void load_4x4_tr( const void 
* ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, @@ -671,26 +709,39 @@ namespace v4 //----------------------------------------------------------------- // d.v = _mm_movehl_ps( u, t ); //----------------------------------------------------------------- + } + #endif - // a.i[0] = ((const int * ALIGNED(16))a0)[0]; - // b.i[0] = ((const int * ALIGNED(16))a0)[1]; - // c.i[0] = ((const int * ALIGNED(16))a0)[2]; - // d.i[0] = ((const int * ALIGNED(16))a0)[3]; + #if 0 + // Portable version. + inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, + v4 &b, + v4 &c, + v4 &d ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + d.i[0] = ((const int * ALIGNED(16))a0)[3]; - // a.i[1] = ((const int * ALIGNED(16))a1)[0]; - // b.i[1] = ((const int * ALIGNED(16))a1)[1]; - // c.i[1] = ((const int * ALIGNED(16))a1)[2]; - // d.i[1] = ((const int * ALIGNED(16))a1)[3]; + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + d.i[1] = ((const int * ALIGNED(16))a1)[3]; - // a.i[2] = ((const int * ALIGNED(16))a2)[0]; - // b.i[2] = ((const int * ALIGNED(16))a2)[1]; - // c.i[2] = ((const int * ALIGNED(16))a2)[2]; - // d.i[2] = ((const int * ALIGNED(16))a2)[3]; + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + d.i[2] = ((const int * ALIGNED(16))a2)[3]; - // a.i[3] = ((const int * ALIGNED(16))a3)[0]; - // b.i[3] = ((const int * ALIGNED(16))a3)[1]; - // c.i[3] = ((const int * ALIGNED(16))a3)[2]; - // d.i[3] = ((const int * ALIGNED(16))a3)[3]; + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + d.i[3] = ((const int * 
ALIGNED(16))a3)[3]; } #endif @@ -823,6 +874,31 @@ namespace v4 #endif #if 1 + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) + { + float32x4_t r, s, t, u; + + r = vtrn1q_f32( a.v, b.v ); + s = vtrn2q_f32( a.v, b.v ); + + t = vtrn1q_f32( c.v, d.v ); + u = vtrn2q_f32( c.v, d.v ); + + vst1q_f32( (float *) a0, vtrn1q_f64( r, t ) ); + vst1q_f32( (float *) a1, vtrn1q_f64( s, u ) ); + vst1q_f32( (float *) a2, vtrn2q_f64( r, t ) ); + vst1q_f32( (float *) a3, vtrn2q_f64( s, u ) ); + } + #endif + + #if 0 inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, @@ -952,6 +1028,39 @@ namespace v4 } #endif + #if 0 + // Portable version. + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) + { + ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; + ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; + ( ( int * ALIGNED(16) ) a0 )[3] = d.i[0]; + + ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; + ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; + ( ( int * ALIGNED(16) ) a1 )[3] = d.i[1]; + + ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; + ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; + ( ( int * ALIGNED(16) ) a2 )[3] = d.i[2]; + + ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; + ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; + ( ( int * ALIGNED(16) ) a3 )[3] = d.i[3]; + } + #endif + ////////////// // v4int class From 2762aece44025ae6f671fd9ccc62084d20208900 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Tue, 23 Jul 2019 11:41:48 -0600 Subject: [PATCH 34/95] Try another idea for implementing the transpose function. 
--- src/util/v4/v4_neon.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 86fe7446..9d99bbf7 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -207,6 +207,22 @@ namespace v4 #if 1 inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) + { + float32x4x2_t r, s; + + r = vtrnq_f32( a0.v, a1.v ); + s = vtrnq_f32( a2.v, a3.v ); + + a0.v = vtrn1q_f64( r.val[0], s.val[0] ); + a2.v = vtrn2q_f64( r.val[0], s.val[0] ); + + a1.v = vtrn1q_f64( r.val[1], s.val[1] ); + a3.v = vtrn2q_f64( r.val[1], s.val[1] ); + } + #endif + + #if 0 + inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) { float32x4_t r, s, t, u; From d82f07d2bd87127e89f6ec55e3202d75285bdfc4 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Thu, 25 Jul 2019 13:35:23 -0600 Subject: [PATCH 35/95] Add support for using load_4x8_tr and load_4x16_tr for the ARM NEON case. --- .../standard/pipeline/center_p_pipeline_v4.cc | 146 +++++++++++ .../pipeline/uncenter_p_pipeline_v4.cc | 162 +++++++++++- src/util/v4/v4_neon.h | 243 ++++++++++++------ 3 files changed, 461 insertions(+), 90 deletions(-) diff --git a/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc b/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc index dc6d5e18..611bd5d5 100644 --- a/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc +++ b/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc @@ -6,6 +6,150 @@ using namespace v4; +#ifdef V4_NEON_ACCELERATION + +void +center_p_pipeline_v4( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + const interpolator_t * ALIGNED(128) f0 = args->f0; + + particle_t * ALIGNED(128) p; + + const float * ALIGNED(16) vp00; + const float * ALIGNED(16) vp01; + const float * ALIGNED(16) vp02; + const float * ALIGNED(16) vp03; + + const v4float qdt_2mc( args->qdt_2mc); + const v4float qdt_4mc(0.5*args->qdt_2mc); // For half Boris rotate. 
+ const v4float one(1.0); + const v4float one_third(1.0/3.0); + const v4float two_fifteenths(2.0/15.0); + + v4float dx, dy, dz, ux, uy, uz, q; + v4float hax, hay, haz, cbx, cby, cbz; + v4float v00, v01, v02, v03, v04, v05; + v4float v06, v07, v08, v09, v10; + v4int ii; + + int itmp, nq; + + // Determine which particle blocks this pipeline processes. + + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, itmp, nq ); + + p = args->p0 + itmp; + + nq >>= 2; + + // Process the particle blocks for this pipeline. + + for( ; nq; nq--, p+=4 ) + { + //-------------------------------------------------------------------------- + // Load particle position data. + //-------------------------------------------------------------------------- + load_4x8_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, + dx, dy, dz, ii, ux, uy, uz, q ); + + // load_4x4_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, + // dx, dy, dz, ii ); + + //-------------------------------------------------------------------------- + // Set field interpolation pointers. + //-------------------------------------------------------------------------- + vp00 = ( const float * ALIGNED(16) ) ( f0 + ii(0) ); + vp01 = ( const float * ALIGNED(16) ) ( f0 + ii(1) ); + vp02 = ( const float * ALIGNED(16) ) ( f0 + ii(2) ); + vp03 = ( const float * ALIGNED(16) ) ( f0 + ii(3) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_4x16_tr( vp00, vp01, vp02, vp03, + hax, v00, v01, v02, + hay, v03, v04, v05, + haz, v06, v07, v08, + cbx, v09, cby, v10 ); + + // load_4x4_tr( vp00, vp01, vp02, vp03, + // hax, v00, v01, v02 ); + + hax = qdt_2mc * fma( fma( dy, v02, v01 ), dz, fma( dy, v00, hax ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. 
+ //-------------------------------------------------------------------------- + // load_4x4_tr( vp00+4, vp01+4, vp02+4, vp03+4, + // hay, v03, v04, v05 ); + + hay = qdt_2mc * fma( fma( dz, v05, v04 ), dx, fma( dz, v03, hay ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + // load_4x4_tr( vp00+8, vp01+8, vp02+8, vp03+8, + // haz, v00, v01, v02 ); + + haz = qdt_2mc * fma( fma( dx, v08, v07 ), dy, fma( dx, v06, haz ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + // load_4x4_tr( vp00+12, vp01+12, vp02+12, vp03+12, + // cbx, v03, cby, v04 ); + + cbx = fma( v09, dx, cbx ); + cby = fma( v10, dy, cby ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles, final. + //-------------------------------------------------------------------------- + load_4x2_tr( vp00+16, vp01+16, vp02+16, vp03+16, + cbz, v05 ); + + cbz = fma( v05, dz, cbz ); + + //-------------------------------------------------------------------------- + // Load particle momentum data. Could use load_4x3_tr. + //-------------------------------------------------------------------------- + // load_4x4_tr( &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux, + // ux, uy, uz, q ); + + //-------------------------------------------------------------------------- + // Update momentum. 
+ //-------------------------------------------------------------------------- + ux += hax; + uy += hay; + uz += haz; + + v00 = qdt_4mc * rsqrt( one + fma( ux, ux, fma( uy, uy, uz * uz ) ) ); + v01 = fma( cbx, cbx, fma( cby, cby, cbz * cbz ) ); + v02 = ( v00 * v00 ) * v01; + v03 = v00 * fma( v02, fma( v02, two_fifteenths, one_third ), one ); + v04 = v03 * rcp( fma( v03 * v03, v01, one ) ); + v04 += v04; + + v00 = fma( fms( uy, cbz, uz * cby ), v03, ux ); + v01 = fma( fms( uz, cbx, ux * cbz ), v03, uy ); + v02 = fma( fms( ux, cby, uy * cbx ), v03, uz ); + + ux = fma( fms( v01, cbz, v02 * cby ), v04, ux ); + uy = fma( fms( v02, cbx, v00 * cbz ), v04, uy ); + uz = fma( fms( v00, cby, v01 * cbx ), v04, uz ); + + //-------------------------------------------------------------------------- + // Store particle momentum data. Could use store_4x3_tr. + //-------------------------------------------------------------------------- + store_4x4_tr( ux, uy, uz, q, + &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux ); + } +} + +#else + void center_p_pipeline_v4( center_p_pipeline_args_t * args, int pipeline_rank, @@ -136,6 +280,8 @@ center_p_pipeline_v4( center_p_pipeline_args_t * args, } } +#endif + #else void diff --git a/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc b/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc index 908fedec..8d33c7a0 100644 --- a/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc +++ b/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc @@ -6,6 +6,150 @@ using namespace v4; +#ifdef V4_NEON_ACCELERATION + +void +uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + const interpolator_t * ALIGNED(128) f0 = args->f0; + + particle_t * ALIGNED(128) p; + + const float * ALIGNED(16) vp00; + const float * ALIGNED(16) vp01; + const float * ALIGNED(16) vp02; + const float * ALIGNED(16) vp03; + + const v4float qdt_2mc( -args->qdt_2mc); // For backward half advance. 
+ const v4float qdt_4mc(-0.5*args->qdt_2mc); // For backward half Boris rotate. + const v4float one(1.0); + const v4float one_third(1.0/3.0); + const v4float two_fifteenths(2.0/15.0); + + v4float dx, dy, dz, ux, uy, uz, q; + v4float hax, hay, haz, cbx, cby, cbz; + v4float v00, v01, v02, v03, v04, v05; + v4float v06, v07, v08, v09, v10; + v4int ii; + + int first, nq; + + // Determine which particle quads this pipeline processes. + + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, first, nq ); + + p = args->p0 + first; + + nq >>= 2; + + // Process the particle quads for this pipeline. + + for( ; nq; nq--, p+=4 ) + { + //-------------------------------------------------------------------------- + // Load particle position data. + //-------------------------------------------------------------------------- + load_4x8_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, + dx, dy, dz, ii, ux, uy, uz, q ); + + // load_4x4_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, + // dx, dy, dz, ii ); + + //-------------------------------------------------------------------------- + // Set field interpolation pointers. + //-------------------------------------------------------------------------- + vp00 = ( const float * ALIGNED(16) ) ( f0 + ii(0) ); + vp01 = ( const float * ALIGNED(16) ) ( f0 + ii(1) ); + vp02 = ( const float * ALIGNED(16) ) ( f0 + ii(2) ); + vp03 = ( const float * ALIGNED(16) ) ( f0 + ii(3) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. 
+ //-------------------------------------------------------------------------- + load_4x16_tr( vp00, vp01, vp02, vp03, + hax, v00, v01, v02, + hay, v03, v04, v05, + haz, v06, v07, v08, + cbx, v09, cby, v10 ); + + // load_4x4_tr( vp00, vp01, vp02, vp03, + // hax, v00, v01, v02 ); + + hax = qdt_2mc * fma( fma( dy, v02, v01 ), dz, fma( dy, v00, hax ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + // load_4x4_tr( vp00+4, vp01+4, vp02+4, vp03+4, + // hay, v03, v04, v05 ); + + hay = qdt_2mc * fma( fma( dz, v05, v04 ), dx, fma( dz, v03, hay ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + // load_4x4_tr( vp00+8, vp01+8, vp02+8, vp03+8, + // haz, v00, v01, v02 ); + + haz = qdt_2mc * fma( fma( dx, v08, v07 ), dy, fma( dx, v06, haz ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + // load_4x4_tr( vp00+12, vp01+12, vp02+12, vp03+12, + // cbx, v03, cby, v04 ); + + cbx = fma( v09, dx, cbx ); + cby = fma( v10, dy, cby ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles, final. + //-------------------------------------------------------------------------- + load_4x2_tr( vp00+16, vp01+16, vp02+16, vp03+16, + cbz, v05 ); + + cbz = fma( v05, dz, cbz ); + + //-------------------------------------------------------------------------- + // Load particle momentum data. Could use load_4x3_tr. 
+ //-------------------------------------------------------------------------- + // load_4x4_tr( &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux, + // ux, uy, uz, q ); + + //-------------------------------------------------------------------------- + // Update momentum. + //-------------------------------------------------------------------------- + v00 = qdt_4mc * rsqrt( one + fma( ux, ux, fma( uy, uy, uz * uz ) ) ); + v01 = fma( cbx, cbx, fma( cby, cby, cbz * cbz ) ); + v02 = ( v00 * v00 ) * v01; + v03 = v00 * fma( v02, fma( v02, two_fifteenths, one_third ), one ); + v04 = v03 * rcp( fma( v03 * v03, v01, one ) ); + v04 += v04; + + v00 = fma( fms( uy, cbz, uz * cby ), v03, ux ); + v01 = fma( fms( uz, cbx, ux * cbz ), v03, uy ); + v02 = fma( fms( ux, cby, uy * cbx ), v03, uz ); + + ux = fma( fms( v01, cbz, v02 * cby ), v04, ux ); + uy = fma( fms( v02, cbx, v00 * cbz ), v04, uy ); + uz = fma( fms( v00, cby, v01 * cbx ), v04, uz ); + + ux += hax; + uy += hay; + uz += haz; + + //-------------------------------------------------------------------------- + // Store particle data. Could use store_4x3_tr. + //-------------------------------------------------------------------------- + store_4x4_tr( ux, uy, uz, q, + &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux ); + } +} + +#else + void uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, int pipeline_rank, @@ -49,7 +193,7 @@ uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, // Load particle position data. //-------------------------------------------------------------------------- load_4x4_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, - dx, dy, dz, ii ); + dx, dy, dz, ii ); //-------------------------------------------------------------------------- // Set field interpolation pointers. @@ -63,31 +207,31 @@ uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, // Load interpolation data for particles. 
//-------------------------------------------------------------------------- load_4x4_tr( vp00, vp01, vp02, vp03, - hax, v00, v01, v02 ); + hax, v00, v01, v02 ); - hax = qdt_2mc*fma( fma( dy, v02, v01 ), dz, fma( dy, v00, hax ) ); + hax = qdt_2mc * fma( fma( dy, v02, v01 ), dz, fma( dy, v00, hax ) ); //-------------------------------------------------------------------------- // Load interpolation data for particles. //-------------------------------------------------------------------------- load_4x4_tr( vp00+4, vp01+4, vp02+4, vp03+4, - hay, v03, v04, v05 ); + hay, v03, v04, v05 ); - hay = qdt_2mc*fma( fma( dz, v05, v04 ), dx, fma( dz, v03, hay ) ); + hay = qdt_2mc * fma( fma( dz, v05, v04 ), dx, fma( dz, v03, hay ) ); //-------------------------------------------------------------------------- // Load interpolation data for particles. //-------------------------------------------------------------------------- load_4x4_tr( vp00+8, vp01+8, vp02+8, vp03+8, - haz, v00, v01, v02 ); + haz, v00, v01, v02 ); - haz = qdt_2mc*fma( fma( dx, v02, v01 ), dy, fma( dx, v00, haz ) ); + haz = qdt_2mc * fma( fma( dx, v02, v01 ), dy, fma( dx, v00, haz ) ); //-------------------------------------------------------------------------- // Load interpolation data for particles. 
//-------------------------------------------------------------------------- load_4x4_tr( vp00+12, vp01+12, vp02+12, vp03+12, - cbx, v03, cby, v04 ); + cbx, v03, cby, v04 ); cbx = fma( v03, dx, cbx ); cby = fma( v04, dy, cby ); @@ -136,6 +280,8 @@ uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, } } +#endif + #else void diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 9d99bbf7..50142a2f 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -97,6 +97,28 @@ namespace v4 const void * ALIGNED(16) a3, v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; + friend inline void load_4x8_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &b00, v4 &b01, + v4 &b02, v4 &b03, + v4 &b04, v4 &b05, + v4 &b06, v4 &b07 ) ALWAYS_INLINE; + + friend inline void load_4x16_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &b00, v4 &b01, + v4 &b02, v4 &b03, + v4 &b04, v4 &b05, + v4 &b06, v4 &b07, + v4 &b08, v4 &b09, + v4 &b10, v4 &b11, + v4 &b12, v4 &b13, + v4 &b14, v4 &b15 ) ALWAYS_INLINE; + friend inline void store_4x1_tr( const v4 &a, void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; @@ -119,6 +141,15 @@ namespace v4 void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) ALWAYS_INLINE; + friend inline void store_4x8_tr( const v4 &b00, const v4 &b01, + const v4 &b02, const v4 &b03, + const v4 &b04, const v4 &b05, + const v4 &b06, const v4 &b07, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) ALWAYS_INLINE; + protected: union @@ -548,46 +579,6 @@ namespace v4 c.i[3] = ((const int * ALIGNED(16))a3)[2]; } - #if 0 - inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c, - v4 &d ) - { - // Not correct. 
- float32x4x4_t mat = vld4q_f32( (const float *) a0 ); - - a.v = mat.val[0]; - b.v = mat.val[1]; - c.v = mat.val[2]; - d.v = mat.val[3]; - - // a.i[0] = ((const int * ALIGNED(16))a0)[0]; - // b.i[0] = ((const int * ALIGNED(16))a0)[1]; - // c.i[0] = ((const int * ALIGNED(16))a0)[2]; - // d.i[0] = ((const int * ALIGNED(16))a0)[3]; - - // a.i[1] = ((const int * ALIGNED(16))a1)[0]; - // b.i[1] = ((const int * ALIGNED(16))a1)[1]; - // c.i[1] = ((const int * ALIGNED(16))a1)[2]; - // d.i[1] = ((const int * ALIGNED(16))a1)[3]; - - // a.i[2] = ((const int * ALIGNED(16))a2)[0]; - // b.i[2] = ((const int * ALIGNED(16))a2)[1]; - // c.i[2] = ((const int * ALIGNED(16))a2)[2]; - // d.i[2] = ((const int * ALIGNED(16))a2)[3]; - - // a.i[3] = ((const int * ALIGNED(16))a3)[0]; - // b.i[3] = ((const int * ALIGNED(16))a3)[1]; - // c.i[3] = ((const int * ALIGNED(16))a3)[2]; - // d.i[3] = ((const int * ALIGNED(16))a3)[3]; - } - #endif - #if 1 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, @@ -761,6 +752,105 @@ namespace v4 } #endif + #if 1 + inline void load_4x8_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &b00, + v4 &b01, + v4 &b02, + v4 &b03, + v4 &b04, + v4 &b05, + v4 &b06, + v4 &b07 ) + { + float32x4x4_t mat0 = vld4q_f32( (const float *) a0 ); + float32x4x4_t mat2 = vld4q_f32( (const float *) a2 ); + + b00.v = vuzp1q_f32( mat0.val[0], mat2.val[0] ); + b01.v = vuzp1q_f32( mat0.val[1], mat2.val[1] ); + b02.v = vuzp1q_f32( mat0.val[2], mat2.val[2] ); + b03.v = vuzp1q_f32( mat0.val[3], mat2.val[3] ); + + b04.v = vuzp2q_f32( mat0.val[0], mat2.val[0] ); + b05.v = vuzp2q_f32( mat0.val[1], mat2.val[1] ); + b06.v = vuzp2q_f32( mat0.val[2], mat2.val[2] ); + b07.v = vuzp2q_f32( mat0.val[3], mat2.val[3] ); + } + #endif + + #if 1 + inline void load_4x16_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, 
+ v4 &b00, + v4 &b01, + v4 &b02, + v4 &b03, + v4 &b04, + v4 &b05, + v4 &b06, + v4 &b07, + v4 &b08, + v4 &b09, + v4 &b10, + v4 &b11, + v4 &b12, + v4 &b13, + v4 &b14, + v4 &b15 ) + { + float32x4 c00, c01, c02, c03, c04, c05, c06, c07; + float32x4 c08, c09, c10, c11, c12, c13, c14, c15; + + float32x4x4_t mat0 = vld4q_f32( (const float *) a0 ); + float32x4x4_t mat1 = vld4q_f32( (const float *) a1 ); + float32x4x4_t mat2 = vld4q_f32( (const float *) a2 ); + float32x4x4_t mat3 = vld4q_f32( (const float *) a3 ); + + c00 = vuzp1q_f32( mat0.val[0], mat1.val[0] ); + c01 = vuzp1q_f32( mat0.val[1], mat1.val[1] ); + c02 = vuzp1q_f32( mat0.val[2], mat1.val[2] ); + c03 = vuzp1q_f32( mat0.val[3], mat1.val[3] ); + + c04 = vuzp2q_f32( mat0.val[0], mat1.val[0] ); + c05 = vuzp2q_f32( mat0.val[1], mat1.val[1] ); + c06 = vuzp2q_f32( mat0.val[2], mat1.val[2] ); + c07 = vuzp2q_f32( mat0.val[3], mat1.val[3] ); + + c08 = vuzp1q_f32( mat2.val[0], mat3.val[0] ); + c09 = vuzp1q_f32( mat2.val[1], mat3.val[1] ); + c10 = vuzp1q_f32( mat2.val[2], mat3.val[2] ); + c11 = vuzp1q_f32( mat2.val[3], mat3.val[3] ); + + c12 = vuzp2q_f32( mat2.val[0], mat3.val[0] ); + c13 = vuzp2q_f32( mat2.val[1], mat3.val[1] ); + c14 = vuzp2q_f32( mat2.val[2], mat3.val[2] ); + c15 = vuzp2q_f32( mat2.val[3], mat3.val[3] ); + + b00.v = vuzp1q_f32( c00, c08 ); + b01.v = vuzp1q_f32( c01, c09 ); + b02.v = vuzp1q_f32( c02, c10 ); + b03.v = vuzp1q_f32( c03, c11 ); + b04.v = vuzp1q_f32( c04, c12 ); + b05.v = vuzp1q_f32( c05, c13 ); + b06.v = vuzp1q_f32( c06, c14 ); + b07.v = vuzp1q_f32( c07, c15 ); + + b08.v = vuzp2q_f32( c00, c08 ); + b09.v = vuzp2q_f32( c01, c09 ); + b10.v = vuzp2q_f32( c02, c10 ); + b11.v = vuzp2q_f32( c03, c11 ); + b12.v = vuzp2q_f32( c04, c12 ); + b13.v = vuzp2q_f32( c05, c13 ); + b14.v = vuzp2q_f32( c06, c14 ); + b15.v = vuzp2q_f32( c07, c15 ); + } + #endif + inline void store_4x1_tr( const v4 &a, void *a0, void *a1, @@ -847,48 +937,6 @@ namespace v4 ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; } - #if 0 - 
inline void store_4x4_tr( const v4 &a, - const v4 &b, - const v4 &c, - const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { - // Not correct. - float32x4x4_t mat; - - mat.val[0] = a.v; - mat.val[1] = b.v; - mat.val[2] = c.v; - mat.val[3] = d.v; - - vst4q_f32( (const float *) a0, mat ); - - // ((int * ALIGNED(16))a0)[0] = a.i[0]; - // ((int * ALIGNED(16))a0)[1] = b.i[0]; - // ((int * ALIGNED(16))a0)[2] = c.i[0]; - // ((int * ALIGNED(16))a0)[3] = d.i[0]; - - // ((int * ALIGNED(16))a1)[0] = a.i[1]; - // ((int * ALIGNED(16))a1)[1] = b.i[1]; - // ((int * ALIGNED(16))a1)[2] = c.i[1]; - // ((int * ALIGNED(16))a1)[3] = d.i[1]; - - // ((int * ALIGNED(16))a2)[0] = a.i[2]; - // ((int * ALIGNED(16))a2)[1] = b.i[2]; - // ((int * ALIGNED(16))a2)[2] = c.i[2]; - // ((int * ALIGNED(16))a2)[3] = d.i[2]; - - // ((int * ALIGNED(16))a3)[0] = a.i[3]; - // ((int * ALIGNED(16))a3)[1] = b.i[3]; - // ((int * ALIGNED(16))a3)[2] = c.i[3]; - // ((int * ALIGNED(16))a3)[3] = d.i[3]; - } - #endif - #if 1 inline void store_4x4_tr( const v4 &a, const v4 &b, @@ -1077,6 +1125,37 @@ namespace v4 } #endif + #if 1 + inline void store_4x8_tr( const v4 &b00, + const v4 &b01, + const v4 &b02, + const v4 &b03, + const v4 &b04, + const v4 &b05, + const v4 &b06, + const v4 &b07, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) + { + float32x4x4_t mat0, mat2; + + mat0.val[0] = vuzp1q_f32( b00.v, b04.v ); + mat0.val[1] = vuzp1q_f32( b01.v, b05.v ); + mat0.val[2] = vuzp1q_f32( b02.v, b06.v ); + mat0.val[3] = vuzp1q_f32( b03.v, b07.v ); + + mat2.val[0] = vuzp2q_f32( b00.v, b04.v ); + mat2.val[1] = vuzp2q_f32( b01.v, b05.v ); + mat2.val[2] = vuzp2q_f32( b02.v, b06.v ); + mat2.val[3] = vuzp2q_f32( b03.v, b07.v ); + + vst4q_f32( (float *) a0, mat0 ); + vst4q_f32( (float *) a2, mat2 ); + } + #endif + ////////////// // v4int class From b2f9017dbbe341357ee608268d3b4e03ee0fdd31 Mon Sep 17 00:00:00 2001 
From: Dave Nystrom Date: Thu, 25 Jul 2019 15:47:55 -0600 Subject: [PATCH 36/95] Add test cases to V4 unit tests. --- src/util/v4/test/v4.cc | 57 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/src/util/v4/test/v4.cc b/src/util/v4/test/v4.cc index 7e0db0e5..73a51540 100644 --- a/src/util/v4/test/v4.cc +++ b/src/util/v4/test/v4.cc @@ -274,6 +274,63 @@ TEST_CASE("TEST_CASE_load_4x4_tr", "[v4]") { REQUIRE( i==16 ); } // TEST_CASE +#ifdef V4_NEON_ACCELERATION +TEST_CASE("TEST_CASE_load_4x8_tr", "[v4]") { + DECLARE_ALIGNED_ARRAY( int, 64, mem, 32 ); + v4int a0, a1, a2, a3, a4, a5, a6, a7; + int i; + for( i=0; i<32; i++ ) mem[i] = i; + load_4x8_tr(mem,mem+8,mem+16,mem+24,a0,a1,a2,a3,a4,a5,a6,a7); + for( i=0; i<32; i++ ) if( mem[i]!=i ) break; + //ASSERT_FALSE( any(a0!=v4int( 0, 4, 8,12)) || any(a1!=v4int( 1, 5, 9,13)) || + //any(a2!=v4int( 2, 6,10,14)) || any(a3!=v4int( 3, 7,11,15)) || i!=16 ); + + REQUIRE( any(a0==v4int( 0, 8, 16, 24 )) ); + REQUIRE( any(a1==v4int( 1, 9, 17, 25 )) ); + REQUIRE( any(a2==v4int( 2, 10, 18, 26 )) ); + REQUIRE( any(a3==v4int( 3, 11, 19, 27 )) ); + REQUIRE( any(a4==v4int( 4, 12, 20, 28 )) ); + REQUIRE( any(a5==v4int( 5, 13, 21, 29 )) ); + REQUIRE( any(a6==v4int( 6, 14, 22, 30 )) ); + REQUIRE( any(a7==v4int( 7, 15, 23, 31 )) ); + REQUIRE( i==32 ); +} // TEST_CASE +#endif + +#ifdef V4_NEON_ACCELERATION +TEST_CASE("TEST_CASE_load_4x16_tr", "[v4]") { + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + v4int a00, a01, a02, a03, a04, a05, a06, a07; + v4int a08, a09, a10, a11, a12, a13, a14, a15; + int i; + for( i=0; i<64; i++ ) mem[i] = i; + load_4x16_tr(mem,mem+16,mem+32,mem+48, + a00,a01,a02,a03,a04,a05,a06,a07, + a08,a09,a10,a11,a12,a13,a14,a15); + for( i=0; i<64; i++ ) if( mem[i]!=i ) break; + //ASSERT_FALSE( any(a0!=v4int( 0, 4, 8,12)) || any(a1!=v4int( 1, 5, 9,13)) || + //any(a2!=v4int( 2, 6,10,14)) || any(a3!=v4int( 3, 7,11,15)) || i!=16 ); + + REQUIRE( any(a00==v4int( 0, 16, 32, 48 )) ); + REQUIRE( 
any(a01==v4int( 1, 17, 33, 49 )) ); + REQUIRE( any(a02==v4int( 2, 18, 34, 50 )) ); + REQUIRE( any(a03==v4int( 3, 19, 35, 51 )) ); + REQUIRE( any(a04==v4int( 4, 20, 36, 52 )) ); + REQUIRE( any(a05==v4int( 5, 21, 37, 53 )) ); + REQUIRE( any(a06==v4int( 6, 22, 38, 54 )) ); + REQUIRE( any(a07==v4int( 7, 23, 39, 55 )) ); + REQUIRE( any(a08==v4int( 8, 24, 40, 56 )) ); + REQUIRE( any(a09==v4int( 9, 25, 41, 57 )) ); + REQUIRE( any(a10==v4int( 10, 26, 42, 58 )) ); + REQUIRE( any(a11==v4int( 11, 27, 43, 59 )) ); + REQUIRE( any(a12==v4int( 12, 28, 44, 60 )) ); + REQUIRE( any(a13==v4int( 13, 29, 45, 61 )) ); + REQUIRE( any(a14==v4int( 14, 30, 46, 62 )) ); + REQUIRE( any(a15==v4int( 15, 31, 47, 63 )) ); + REQUIRE( i==64 ); +} // TEST_CASE +#endif + TEST_CASE("TEST_CASE_store_4x1_tr", "[v4]") { DECLARE_ALIGNED_ARRAY( int, 16, mem, 16 ); v4int a0( 0, 4, 8,12), a1( 1, 5, 9,13), a2( 2, 6,10,14), a3( 3, 7,11,15); From 15b8da39e69cd1ae7e9c5413e8000e760b1676e1 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Thu, 25 Jul 2019 16:02:31 -0600 Subject: [PATCH 37/95] Fix a declaration error. --- src/util/v4/v4_neon.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 50142a2f..f4c7a8a7 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -803,8 +803,8 @@ namespace v4 v4 &b14, v4 &b15 ) { - float32x4 c00, c01, c02, c03, c04, c05, c06, c07; - float32x4 c08, c09, c10, c11, c12, c13, c14, c15; + float32x4_t c00, c01, c02, c03, c04, c05, c06, c07; + float32x4_t c08, c09, c10, c11, c12, c13, c14, c15; float32x4x4_t mat0 = vld4q_f32( (const float *) a0 ); float32x4x4_t mat1 = vld4q_f32( (const float *) a1 ); From 60703bded811ce00b52a51a194760ee04a3eb2b4 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Thu, 25 Jul 2019 16:17:12 -0600 Subject: [PATCH 38/95] Test different memory alignment for V4 NEON implementation. 
--- src/sf_interface/sf_interface.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/sf_interface/sf_interface.h b/src/sf_interface/sf_interface.h index 6dc86883..f49be9eb 100644 --- a/src/sf_interface/sf_interface.h +++ b/src/sf_interface/sf_interface.h @@ -52,6 +52,15 @@ #endif +// Temporary hack. +#ifdef V4_NEON_ACCELERATION + +#define PAD_SIZE_INTERPOLATOR 14 +#define PAD_SIZE_ACCUMULATOR 4 +#define PAD_SIZE_HYDRO 2 + +#endif + /*****************************************************************************/ // Interpolator arrays shall be a (nx+2) x (ny+2) x (nz+2) allocation From cbdc2ccf0dcd36878512b07b39b95a14191f1622 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Sat, 27 Jul 2019 19:57:52 -0600 Subject: [PATCH 39/95] Format tweak. --- .../standard/pipeline/advance_p_pipeline.cc | 4 ++-- .../standard/pipeline/advance_p_pipeline_v4.cc | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline.cc b/src/species_advance/standard/pipeline/advance_p_pipeline.cc index 3cdc4d10..e2ccfd93 100644 --- a/src/species_advance/standard/pipeline/advance_p_pipeline.cc +++ b/src/species_advance/standard/pipeline/advance_p_pipeline.cc @@ -183,7 +183,7 @@ advance_p_pipeline_scalar( advance_p_pipeline_args_t * args, a = (float *)( a0 + ii ); // Get accumulator -# define ACCUMULATE_J(X,Y,Z,offset) \ + #define ACCUMULATE_J(X,Y,Z,offset) \ v4 = q*u##X; /* v2 = q ux */ \ v1 = v4*d##Y; /* v1 = q ux dy */ \ v0 = v4-v1; /* v0 = q ux (1-dy) */ \ @@ -207,7 +207,7 @@ advance_p_pipeline_scalar( advance_p_pipeline_args_t * args, ACCUMULATE_J( y, z, x, 4 ); ACCUMULATE_J( z, x, y, 8 ); -# undef ACCUMULATE_J + #undef ACCUMULATE_J } else // Unlikely diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline_v4.cc b/src/species_advance/standard/pipeline/advance_p_pipeline_v4.cc index 19d82ade..88f83b34 100644 --- a/src/species_advance/standard/pipeline/advance_p_pipeline_v4.cc +++ 
b/src/species_advance/standard/pipeline/advance_p_pipeline_v4.cc @@ -245,7 +245,7 @@ advance_p_pipeline_v4( advance_p_pipeline_args_t * args, //-------------------------------------------------------------------------- // Accumulate current density. //-------------------------------------------------------------------------- -# define ACCUMULATE_J(X,Y,Z,offset) \ + #define ACCUMULATE_J(X,Y,Z,offset) \ v04 = q*u##X; /* v04 = q ux */ \ v01 = v04*d##Y; /* v01 = q ux dy */ \ v00 = v04-v01; /* v00 = q ux (1-dy) */ \ @@ -270,14 +270,14 @@ advance_p_pipeline_v4( advance_p_pipeline_args_t * args, ACCUMULATE_J( y, z, x, 4 ); ACCUMULATE_J( z, x, y, 8 ); -# undef ACCUMULATE_J + #undef ACCUMULATE_J //-------------------------------------------------------------------------- // Update position and accumulate current density for out of bounds // particles. //-------------------------------------------------------------------------- -# define MOVE_OUTBND(N) \ + #define MOVE_OUTBND(N) \ if ( outbnd(N) ) /* Unlikely */ \ { \ local_pm->dispx = ux(N); \ @@ -302,7 +302,7 @@ advance_p_pipeline_v4( advance_p_pipeline_args_t * args, MOVE_OUTBND( 2); MOVE_OUTBND( 3); -# undef MOVE_OUTBND + #undef MOVE_OUTBND } args->seg[pipeline_rank].pm = pm; From 577b42eefa87c7f0f09ad25c327f66a2c95792f7 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Sun, 28 Jul 2019 22:45:59 -0600 Subject: [PATCH 40/95] Do not use special test version of V4 NEON implementations of center_p and uncenter_p. 
--- src/species_advance/standard/pipeline/center_p_pipeline_v4.cc | 2 +- src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc b/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc index 611bd5d5..2a25611f 100644 --- a/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc +++ b/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc @@ -6,7 +6,7 @@ using namespace v4; -#ifdef V4_NEON_ACCELERATION +#ifdef V4_NEON_ACCELERATION_SNOUT void center_p_pipeline_v4( center_p_pipeline_args_t * args, diff --git a/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc b/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc index 8d33c7a0..d4bfc425 100644 --- a/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc +++ b/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc @@ -6,7 +6,7 @@ using namespace v4; -#ifdef V4_NEON_ACCELERATION +#ifdef V4_NEON_ACCELERATION_SNOUT void uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, From 7c6c53dc40ed33f8f985c3ed96cff7361630ab95 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Sun, 28 Jul 2019 23:24:26 -0600 Subject: [PATCH 41/95] Revert rcp and rsqrt functions back to their portable versions to try to isolate NaN problem. --- src/sf_interface/sf_interface.h | 2 +- src/util/v4/v4_neon.h | 58 ++++++++++++++++----------------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/sf_interface/sf_interface.h b/src/sf_interface/sf_interface.h index f49be9eb..fd9b72eb 100644 --- a/src/sf_interface/sf_interface.h +++ b/src/sf_interface/sf_interface.h @@ -53,7 +53,7 @@ #endif // Temporary hack. 
-#ifdef V4_NEON_ACCELERATION +#ifdef V4_NEON_ACCELERATION_SNOUT #define PAD_SIZE_INTERPOLATOR 14 #define PAD_SIZE_ACCUMULATOR 4 diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index f4c7a8a7..22e8dff6 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -2038,26 +2038,26 @@ namespace v4 { v4float b; - float32x4_t a_v = a.v, b_v; - - b_v = vrsqrteq_f32( a_v ); - - // Note: It is quicker to just call div_ps and sqrt_ps if more - // refinement desired! - b.v = vaddq_f32( b_v, vmulq_f32( vdupq_n_f32( 0.5f ), - vsubq_f32( b_v, - vmulq_f32( a_v, - vmulq_f32( b_v, - vmulq_f32( b_v, b_v ) - ) - ) - ) - ) - ); + // float32x4_t a_v = a.v, b_v; + + // b_v = vrsqrteq_f32( a_v ); + + // // Note: It is quicker to just call div_ps and sqrt_ps if more + // // refinement desired! + // b.v = vaddq_f32( b_v, vmulq_f32( vdupq_n_f32( 0.5f ), + // vsubq_f32( b_v, + // vmulq_f32( a_v, + // vmulq_f32( b_v, + // vmulq_f32( b_v, b_v ) + // ) + // ) + // ) + // ) + // ); - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // b.f[j] = ::sqrt( 1.0f / a.f[j] ); + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = ::sqrt( 1.0f / a.f[j] ); return b; } @@ -2079,19 +2079,19 @@ namespace v4 { v4float b; - float32x4_t a_v = a.v, b_v; + // float32x4_t a_v = a.v, b_v; - b_v = vrecpeq_f32( a_v ); + // b_v = vrecpeq_f32( a_v ); - b.v = vsubq_f32( vaddq_f32( b_v, b_v ), - vmulq_f32( a_v, - vmulq_f32( b_v, b_v ) - ) - ); + // b.v = vsubq_f32( vaddq_f32( b_v, b_v ), + // vmulq_f32( a_v, + // vmulq_f32( b_v, b_v ) + // ) + // ); - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // b.f[j] = 1.0f / a.f[j]; + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = 1.0f / a.f[j]; return b; } From bd000c43119140bb51e936722de8812236dcce36 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 5 Aug 2019 10:01:38 -0600 Subject: [PATCH 42/95] Format tweaks and cleanup before merging into other branches. 
--- src/util/v4/v4_altivec.h | 1241 +++++++++++++++++++--------------- src/util/v4/v4_avx.h | 1041 ++++++++++++++++++---------- src/util/v4/v4_avx2.h | 414 +++++++----- src/util/v4/v4_neon.h | 897 ++++-------------------- src/util/v4/v4_portable.h | 436 ++++++------ src/util/v4/v4_portable_v0.h | 428 ++++++------ src/util/v4/v4_portable_v1.h | 338 ++++----- src/util/v4/v4_sse.h | 1025 ++++++++++++++++++---------- 8 files changed, 3088 insertions(+), 2732 deletions(-) diff --git a/src/util/v4/v4_altivec.h b/src/util/v4/v4_altivec.h index 6ff3f58c..f1361278 100644 --- a/src/util/v4/v4_altivec.h +++ b/src/util/v4/v4_altivec.h @@ -1,10 +1,13 @@ -#ifndef _v4_altivec_h_ + #ifndef _v4_altivec_h_ #define _v4_altivec_h_ #ifndef IN_v4_h #error "Do not include v4_altivec.h directly; use v4.h" #endif +#include +#include + #define V4_ACCELERATION #define V4_ALTIVEC_ACCELERATION @@ -12,31 +15,30 @@ #define ALIGNED(n) #endif -#include -#include - // See if this fixes a problem when compiling with GNU compilers. 
#ifdef __GNUC__ #undef bool #undef vector #endif -namespace v4 { +#define ALWAYS_INLINE __attribute__((always_inline)) +namespace v4 +{ class v4; class v4int; class v4float; -# define _v4_int __vector int -# define _v4_uint __vector unsigned int -# define _v4_float __vector float -# define _v16_uchar __vector unsigned char + #define _v4_int __vector int + #define _v4_uint __vector unsigned int + #define _v4_float __vector float + #define _v16_uchar __vector unsigned char -# define _PERM(i0,i1,i2,i3) \ - ((_v16_uchar){ 4*(i0), 4*(i0)+1, 4*(i0)+2, 4*(i0)+3, \ - 4*(i1), 4*(i1)+1, 4*(i1)+2, 4*(i1)+3, \ - 4*(i2), 4*(i2)+1, 4*(i2)+2, 4*(i2)+3, \ - 4*(i3), 4*(i3)+1, 4*(i3)+2, 4*(i3)+3 }) + #define _PERM(i0,i1,i2,i3) \ + ( (_v16_uchar) { 4*(i0), 4*(i0)+1, 4*(i0)+2, 4*(i0)+3, \ + 4*(i1), 4*(i1)+1, 4*(i1)+2, 4*(i1)+3, \ + 4*(i2), 4*(i2)+1, 4*(i2)+2, 4*(i2)+3, \ + 4*(i3), 4*(i3)+1, 4*(i3)+2, 4*(i3)+3 } ) // FIXME: IS IT FASTER TO SPLAT THESE ON THE FLY @@ -44,123 +46,104 @@ namespace v4 { const _v4_int _true = { -1, -1, -1, -1 }; const _v4_int _ione = { 1, 1, 1, 1 }; - const _v4_float _zero = { 0.0f, 0.0f, 0.0f, 0.0f }; - const _v4_float _half = { 0.5f, 0.5f, 0.5f, 0.5f }; - const _v4_float _one = { 1.0f, 1.0f, 1.0f, 1.0f }; - const _v4_float _sign = {-0.0f,-0.0f,-0.0f,-0.0f }; - const _v4_float _n02 = {-0.0f,+0.0f,-0.0f,+0.0f }; + const _v4_float _zero = { 0.0f, 0.0f, 0.0f, 0.0f }; + const _v4_float _half = { 0.5f, 0.5f, 0.5f, 0.5f }; + const _v4_float _one = { 1.0f, 1.0f, 1.0f, 1.0f }; + const _v4_float _sign = { -0.0f, -0.0f, -0.0f, -0.0f }; + const _v4_float _n02 = { -0.0f, +0.0f, -0.0f, +0.0f }; //////////////// // v4 base class - - class v4 { - + + class v4 + { friend class v4int; friend class v4float; - // ----------------------------------------------------------------------------- - // hacks that need to be resolved more elegantly + // v4 miscellaneous friends -/* friend inline v4 operator *( const v4 &a, const v4 &b ); */ + friend inline int any( const v4 &a ) 
ALWAYS_INLINE; + friend inline int all( const v4 &a ) ALWAYS_INLINE; -/* # define ASSIGN(op,instr) \ */ -/* inline v4 &operator op( const v4 &b ) \ */ -/* { \ */ -/* instr; \ */ -/* return *this; \ */ -/* } */ + template + friend inline v4 splat( const v4 &a ) ALWAYS_INLINE; -/* ASSIGN(=, v = b.v ); */ + template + friend inline v4 shuffle( const v4 &a ) ALWAYS_INLINE; -/* # undef ASSIGN */ + friend inline void swap( v4 &a, v4 &b ) ALWAYS_INLINE; + friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) ALWAYS_INLINE; -/* # define BINARY(op,instr) \ */ -/* inline v4 operator op( const v4 &a, const v4 &b ) \ */ -/* { \ */ -/* v4 c; \ */ -/* instr; \ */ -/* return c; \ */ -/* } */ + // v4int miscellaneous friends -/* BINARY(+, c.v = vec_add( a.v, b.v ) ) */ -/* BINARY(-, c.v = vec_sub( a.v, b.v ) ) */ -/* BINARY(*, c.v = vec_mul( a.v, b.v ) ) */ -/* // BINARY(*, c.v = vec_madd( a.v, b.v, _zero ) ) */ + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; -/* # undef BINARY */ - // end hacks - // ----------------------------------------------------------------------------- + // v4 memory manipulation friends - // v4 miscellenous friends + friend inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) ALWAYS_INLINE; - friend inline int any( const v4 &a ); - friend inline int all( const v4 &a ); - template - friend inline v4 splat( const v4 &a ); - // friend inline v4 splat( const v4 &a, int n ); - template - friend inline v4 shuffle( const v4 &a ); - // friend inline v4 shuffle( const v4 &a, - // int i0, int i1, int i2, int i3 ); - friend inline void swap( v4 &a, v4 &b ); - friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ); + friend inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; - // v4int miscellaneous friends + friend inline void stream_4x1( 
const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline v4 czero( const v4int &c, const v4 &a ); - friend inline v4 notczero( const v4int &c, const v4 &a ); - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ); + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; - // v4 memory manipulation friends - - friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ); - friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ); - friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ); - friend inline void clear_4x1( void * ALIGNED(16) dst ); - friend inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ); - friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ); + friend inline void copy_4x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) ALWAYS_INLINE; + + friend inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends - // Note: Half aligned values are permissible in the 4x2_tr variants! 
friend inline void load_4x1_tr( const void *a0, const void *a1, const void *a2, const void *a3, - v4 &a ); + v4 &a ) ALWAYS_INLINE; + friend inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, - v4 &a, v4 &b ); + v4 &a, v4 &b ) ALWAYS_INLINE; + friend inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ); + v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; + friend inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ); - + v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; + friend inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, void *a2, void *a3 ); + void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; + friend inline void store_4x2_tr( const v4 &a, const v4 &b, void * ALIGNED(8) a0, void * ALIGNED(8) a1, void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ); + void * ALIGNED(8) a3 ) ALWAYS_INLINE; + friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ); + void * ALIGNED(16) a3 ) ALWAYS_INLINE; + friend inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ); + void * ALIGNED(16) a3 ) ALWAYS_INLINE; protected: public: // wdn @@ -170,46 +153,58 @@ namespace v4 { public: v4() {} // Default constructor - v4(const v4 &a) { // Copy constructor + + v4( const v4 &a ) // Copy constructor + { v = a.v; } - ~v4() {} // Default destructor + ~v4() {} // Default destructor }; - + // v4 miscellaneous functions - inline int any( const v4 &a ) { - return vec_any_ne( (_v4_int)a.v, _false ); + inline int any( const v4 &a ) + { + return vec_any_ne( (_v4_int) 
a.v, _false ); } - - inline int all( const v4 &a ) { - return vec_all_ne( (_v4_int)a.v, _false ); + + inline int all( const v4 &a ) + { + return vec_all_ne( (_v4_int) a.v, _false ); } template - inline v4 splat( const v4 & a ) { + inline v4 splat( const v4 & a ) + { v4 b; + b.v = vec_splat( a.v, n ); + return b; } template - inline v4 shuffle( const v4 & a ) { - _v4_float a_v = a.v; + inline v4 shuffle( const v4 & a ) + { v4 b; - b.v = vec_perm( a_v, a_v, _PERM( i0, i1, i2, i3 ) ); + + b.v = vec_perm( a.v, a.v, _PERM( i0, i1, i2, i3 ) ); + return b; } - inline void swap( v4 &a, v4 &b ) { - _v4_float t; - t = a.v; + inline void swap( v4 &a, v4 &b ) + { + _v4_float t = a.v; + a.v = b.v; + b.v = t; } - inline void transpose( v4 &a, v4 &b, v4 &c, v4 &d ) { + inline void transpose( v4 &a, v4 &b, v4 &c, v4 &d ) + { _v4_float a0 = a.v; // a0 = 0 1 2 3 _v4_float b0 = b.v; // b0 = 4 5 6 7 _v4_float c1 = c.v; // c1 = 8 9 10 11 @@ -231,40 +226,38 @@ namespace v4 { } // v4 memory manipulation functions - + inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) + v4 &a ) { a.v = vec_ld( 0, ( const float * ) p ); } inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { vec_st( a.v, 0, ( float * ) p ); } inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { vec_stl( a.v, 0, ( float * ) p ); } - // FIXME: Ordering semantics - inline void clear_4x1( void * ALIGNED(16) d ) + inline void clear_4x1( void * ALIGNED(16) p ) { - vec_st( _zero, 0, ( float * ) d ); + vec_st( _zero, 0, ( float * ) p ); } - // FIXME: Ordering semantics - inline void copy_4x1( void * ALIGNED(16) d, - const void * ALIGNED(16) s ) + inline void copy_4x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) { - vec_st( vec_ld( 0, ( const float * ) s ), 0, ( float * ) d ); + vec_st( vec_ld( 0, ( const float * ) src ), 0, ( float * ) dst ); } inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) + void * ALIGNED(16) b ) { 
_v4_float va = vec_ld( 0, ( float * ) a ); _v4_float vb = vec_ld( 0, ( float * ) b ); @@ -275,285 +268,330 @@ namespace v4 { // v4 transposed memory manipulation functions - inline void load_4x1_tr( const void *pa, - const void *pb, - const void *pc, - const void *pd, - v4 &a ) { - a.v = (_v4_float){ ((const float *)pa)[0], - ((const float *)pb)[0], - ((const float *)pc)[0], - ((const float *)pd)[0] }; - } - - #if 0 - inline void load_4x2_tr( const void * ALIGNED(8) pa, - const void * ALIGNED(8) pb, - const void * ALIGNED(8) pc, - const void * ALIGNED(8) pd, - v4 &a, v4 &b ) { // FIXME: UGLY!! - a.v = (_v4_float){ ((const float *)pa)[0], - ((const float *)pb)[0], - ((const float *)pc)[0], - ((const float *)pd)[0] }; - b.v = (_v4_float){ ((const float *)pa)[1], - ((const float *)pb)[1], - ((const float *)pc)[1], - ((const float *)pd)[1] }; - } - #endif - - inline void load_4x2_tr( const void * ALIGNED(8) pa, - const void * ALIGNED(8) pb, - const void * ALIGNED(8) pc, - const void * ALIGNED(8) pd, - v4 &a, v4 &b ) + inline void load_4x1_tr( const void *a0, + const void *a1, + const void *a2, + const void *a3, + v4 &a ) + { + a.v = (_v4_float){ ( (const float *) a0 )[0], + ( (const float *) a1 )[0], + ( (const float *) a2 )[0], + ( (const float *) a3 )[0] }; + } + + inline void load_4x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + v4 &a, + v4 &b ) { - _v4_float a0 = vec_ld( 0, (const float *)pa ); // a0 = 0 1 2 3 - _v4_float b0 = vec_ld( 0, (const float *)pb ); // b0 = 4 5 6 7 - _v4_float c1 = vec_ld( 0, (const float *)pc ); // c1 = 8 9 10 11 - _v4_float d1 = vec_ld( 0, (const float *)pd ); // d1 = 12 13 14 15 + _v4_float r = vec_ld( 0, (const float *) a0 ); // r = 0 1 2 3 + _v4_float s = vec_ld( 0, (const float *) a1 ); // s = 4 5 6 7 + _v4_float t = vec_ld( 0, (const float *) a2 ); // t = 8 9 10 11 + _v4_float u = vec_ld( 0, (const float *) a3 ); // u = 12 13 14 15 // Step 1: Interleave 
top and bottom half - _v4_float a1 = vec_mergeh( a0, c1 ); // a1 = 0 8 1 9 - _v4_float b1 = vec_mergeh( b0, d1 ); // b1 = 4 12 5 13 + _v4_float v = vec_mergeh( r, t ); // v = 0 8 1 9 + _v4_float w = vec_mergeh( s, u ); // w = 4 12 5 13 // Step 2: Interleave even and odd rows - a.v = vec_mergeh( a1, b1 ); // a = 0 4 8 12 - b.v = vec_mergel( a1, b1 ); // b = 1 5 9 13 + a.v = vec_mergeh( v, w ); // a = 0 4 8 12 + b.v = vec_mergel( v, w ); // b = 1 5 9 13 } - - inline void load_4x3_tr( const void * ALIGNED(16) pa, - const void * ALIGNED(16) pb, - const void * ALIGNED(16) pc, - const void * ALIGNED(16) pd, - v4 &a, v4 &b, v4 &c ) { - _v4_float a0 = vec_ld( 0, (const float *)pa ); // a0 = 0 1 2 x - _v4_float b0 = vec_ld( 0, (const float *)pb ); // b0 = 4 5 6 x - _v4_float c1 = vec_ld( 0, (const float *)pc ); // c1 = 8 9 10 x - _v4_float d1 = vec_ld( 0, (const float *)pd ); // d1 = 12 13 14 x + + inline void load_4x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, + v4 &b, + v4 &c ) + { + _v4_float r, s, t, u, d_v; + + a.v = vec_ld( 0, (const float *) a0 ); // a = 0 1 2 x + b.v = vec_ld( 0, (const float *) a1 ); // b = 4 5 6 x + c.v = vec_ld( 0, (const float *) a2 ); // c = 8 9 10 x + d_v = vec_ld( 0, (const float *) a3 ); // d = 12 13 14 x // Step 1: Interleave top and bottom half - _v4_float a1 = vec_mergeh( a0, c1 ); // a1 = 0 8 1 9 - _v4_float b1 = vec_mergeh( b0, d1 ); // b1 = 4 12 5 13 - c1 = vec_mergel( a0, c1 ); // c1 = 2 10 x x - d1 = vec_mergel( b0, d1 ); // d1 = 6 14 x x + r = vec_mergeh( a.v, c.v ); // r = 0 8 1 9 + s = vec_mergeh( b.v, d_v ); // s = 4 12 5 13 + + t = vec_mergel( a.v, c.v ); // t = 2 10 x x + u = vec_mergel( b.v, d_v ); // u = 6 14 x x // Step 2: Interleave even and odd rows - a.v = vec_mergeh( a1, b1 ); // a = 0 4 8 12 - b.v = vec_mergel( a1, b1 ); // b = 1 5 9 13 - c.v = vec_mergeh( c1, d1 ); // c = 2 6 10 14 + a.v = vec_mergeh( r, s ); // a = 0 4 8 12 + 
b.v = vec_mergel( r, s ); // b = 1 5 9 13 + c.v = vec_mergeh( t, u ); // c = 2 6 10 14 } - inline void load_4x4_tr( const void * ALIGNED(16) pa, - const void * ALIGNED(16) pb, - const void * ALIGNED(16) pc, - const void * ALIGNED(16) pd, - v4 &a, v4 &b, v4 &c, v4 &d ) + inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, + v4 &b, + v4 &c, + v4 &d ) { - _v4_float a0 = vec_ld( 0, (const float *)pa ); // a0 = 0 1 2 3 - _v4_float b0 = vec_ld( 0, (const float *)pb ); // b0 = 4 5 6 7 - _v4_float c1 = vec_ld( 0, (const float *)pc ); // c1 = 8 9 10 11 - _v4_float d1 = vec_ld( 0, (const float *)pd ); // d1 = 12 13 14 15 + _v4_float r, s, t, u; + + a.v = vec_ld( 0, (const float *) a0 ); // a = 0 1 2 3 + b.v = vec_ld( 0, (const float *) a1 ); // b = 4 5 6 7 + c.v = vec_ld( 0, (const float *) a2 ); // c = 8 9 10 11 + d.v = vec_ld( 0, (const float *) a3 ); // d = 12 13 14 15 // Step 1: Interleave top and bottom half - _v4_float a1 = vec_mergeh( a0, c1 ); // a1 = 0 8 1 9 - _v4_float b1 = vec_mergeh( b0, d1 ); // b1 = 4 12 5 13 - c1 = vec_mergel( a0, c1 ); // c1 = 2 10 3 11 - d1 = vec_mergel( b0, d1 ); // d1 = 6 14 7 15 + r = vec_mergeh( a.v, c.v ); // r = 0 8 1 9 + s = vec_mergeh( b.v, d.v ); // s = 4 12 5 13 + + t = vec_mergel( a.v, c.v ); // t = 2 10 3 11 + u = vec_mergel( b.v, d.v ); // u = 6 14 7 15 // Step 2: Interleave even and odd rows - a.v = vec_mergeh( a1, b1 ); // a = 0 4 8 12 - b.v = vec_mergel( a1, b1 ); // b = 1 5 9 13 - c.v = vec_mergeh( c1, d1 ); // c = 2 6 10 14 - d.v = vec_mergel( c1, d1 ); // d = 3 7 11 15 + a.v = vec_mergeh( r, s ); // a = 0 4 8 12 + b.v = vec_mergel( r, s ); // b = 1 5 9 13 + c.v = vec_mergeh( t, u ); // c = 2 6 10 14 + d.v = vec_mergel( t, u ); // d = 3 7 11 15 } inline void store_4x1_tr( const v4 &a, - void * pa, - void * pb, - void * pc, - void * pd ) { - _v4_float a_v = a.v; - vec_ste( vec_splat(a_v,0), 0, (float *)pa ); - vec_ste( 
vec_splat(a_v,1), 0, (float *)pb ); - vec_ste( vec_splat(a_v,2), 0, (float *)pc ); - vec_ste( vec_splat(a_v,3), 0, (float *)pd ); - } - - inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) pa, - void * ALIGNED(8) pb, - void * ALIGNED(8) pc, - void * ALIGNED(8) pd ) { - _v4_float t, a_v = a.v, b_v = b.v; - t = vec_perm( a_v, b_v, _PERM(0,4,0,4) ); vec_ste( t, 0, (float *)pa ); - vec_ste( t, 4, (float *)pa ); - t = vec_perm( a_v, b_v, _PERM(1,5,1,5) ); vec_ste( t, 0, (float *)pb ); - vec_ste( t, 4, (float *)pb ); - t = vec_perm( a_v, b_v, _PERM(2,6,2,6) ); vec_ste( t, 0, (float *)pc ); - vec_ste( t, 4, (float *)pc ); - t = vec_perm( a_v, b_v, _PERM(3,7,3,7) ); vec_ste( t, 0, (float *)pd ); - vec_ste( t, 4, (float *)pd ); - } - - inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) pa, - void * ALIGNED(16) pb, - void * ALIGNED(16) pc, - void * ALIGNED(16) pd ) { + void *a0, + void *a1, + void *a2, + void *a3 ) + { + vec_ste( vec_splat( a.v, 0 ), 0, (float *) a0 ); + vec_ste( vec_splat( a.v, 1 ), 0, (float *) a1 ); + vec_ste( vec_splat( a.v, 2 ), 0, (float *) a2 ); + vec_ste( vec_splat( a.v, 3 ), 0, (float *) a3 ); + } + + inline void store_4x2_tr( const v4 &a, + const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) + { + _v4_float t; + + t = vec_perm( a.v, b.v, _PERM(0,4,0,4) ); + + vec_ste( t, 0, (float *) a0 ); + vec_ste( t, 4, (float *) a0 ); + + t = vec_perm( a.v, b.v, _PERM(1,5,1,5) ); + + vec_ste( t, 0, (float *) a1 ); + vec_ste( t, 4, (float *) a1 ); + + t = vec_perm( a.v, b.v, _PERM(2,6,2,6) ); + + vec_ste( t, 0, (float *) a2 ); + vec_ste( t, 4, (float *) a2 ); + + t = vec_perm( a.v, b.v, _PERM(3,7,3,7) ); + + vec_ste( t, 0, (float *) a3 ); + vec_ste( t, 4, (float *) a3 ); + } + + inline void store_4x3_tr( const v4 &a, + const v4 &b, + const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) + { 
_v4_float a_v = a.v; // a = 0 1 2 3 _v4_float b_v = b.v; // b = 4 5 6 7 _v4_float c_v = c.v; // c = 8 9 10 11 + _v4_float t, u, v; + t = vec_mergeh( a_v, c_v ); // t = 0 8 1 9 u = vec_mergeh( b_v, b_v ); // u = 4 x 5 x - v = vec_mergeh( t, u ); vec_ste( v, 0, (float *)pa ); - vec_ste( v, 4, (float *)pa ); - vec_ste( v, 8, (float *)pa ); - v = vec_mergel( t, u ); vec_ste( v, 0, (float *)pb ); - vec_ste( v, 4, (float *)pb ); - vec_ste( v, 8, (float *)pb ); + + v = vec_mergeh( t, u ); + + vec_ste( v, 0, (float *) a0 ); + vec_ste( v, 4, (float *) a0 ); + vec_ste( v, 8, (float *) a0 ); + + v = vec_mergel( t, u ); + + vec_ste( v, 0, (float *) a1 ); + vec_ste( v, 4, (float *) a1 ); + vec_ste( v, 8, (float *) a1 ); + t = vec_mergel( a_v, c_v ); // t = 2 10 3 11 u = vec_mergel( b_v, b_v ); // u = 6 x 7 x - v = vec_mergeh( t, u ); vec_ste( v, 0, (float *)pc ); - vec_ste( v, 4, (float *)pc ); - vec_ste( v, 8, (float *)pc ); - v = vec_mergel( t, u ); vec_ste( v, 0, (float *)pd ); - vec_ste( v, 4, (float *)pd ); - vec_ste( v, 8, (float *)pd ); + + v = vec_mergeh( t, u ); + + vec_ste( v, 0, (float *) a2 ); + vec_ste( v, 4, (float *) a2 ); + vec_ste( v, 8, (float *) a2 ); + + v = vec_mergel( t, u ); + + vec_ste( v, 0, (float *) a3 ); + vec_ste( v, 4, (float *) a3 ); + vec_ste( v, 8, (float *) a3 ); } - - inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, - void * ALIGNED(16) pa, - void * ALIGNED(16) pb, - void * ALIGNED(16) pc, - void * ALIGNED(16) pd ) { - _v4_float a0 = a.v; // a0 = 0 1 2 3 - _v4_float b0 = b.v; // b0 = 4 5 6 7 - _v4_float c1 = c.v; // c1 = 8 9 10 11 - _v4_float d1 = d.v; // d1 = 12 13 14 15 + + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) + { + _v4_float r, s, t, u; + + // a = 0 1 2 3 + // b = 4 5 6 7 + // c = 8 9 10 11 + // d = 12 13 14 15 // Step 1: Interleave top and bottom half - _v4_float a1 
= vec_mergeh( a0, c1 ); // a1 = 0 8 1 9 - _v4_float b1 = vec_mergeh( b0, d1 ); // b1 = 4 12 5 13 - c1 = vec_mergel( a0, c1 ); // c1 = 2 10 3 11 - d1 = vec_mergel( b0, d1 ); // d1 = 6 14 7 15 + r = vec_mergeh( a.v, c.v ); // r = 0 8 1 9 + s = vec_mergeh( b.v, d.v ); // s = 4 12 5 13 + t = vec_mergel( a.v, c.v ); // t = 2 10 3 11 + u = vec_mergel( b.v, d.v ); // u = 6 14 7 15 // Step 2: Interleave even and odd rows - vec_st( vec_mergeh( a1, b1 ), 0, (float *)pa ); // a = 0 4 8 12 - vec_st( vec_mergel( a1, b1 ), 0, (float *)pb ); // b = 1 5 9 13 - vec_st( vec_mergeh( c1, d1 ), 0, (float *)pc ); // c = 2 6 10 14 - vec_st( vec_mergel( c1, d1 ), 0, (float *)pd ); // d = 3 7 11 15 + vec_st( vec_mergeh( r, s ), 0, (float *) a0 ); // a0 = 0 4 8 12 + vec_st( vec_mergel( r, s ), 0, (float *) a1 ); // a1 = 1 5 9 13 + vec_st( vec_mergeh( t, u ), 0, (float *) a2 ); // a2 = 2 6 10 14 + vec_st( vec_mergel( t, u ), 0, (float *) a3 ); // a3 = 3 7 11 15 } ////////////// // v4int class - class v4int : public v4 { - + class v4int : public v4 + { // v4int prefix unary operator friends - friend inline v4int operator +( const v4int & a ); - friend inline v4int operator -( const v4int & a ); - friend inline v4int operator ~( const v4int & a ); - friend inline v4int operator !( const v4int & a ); + friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator -( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator ~( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator !( const v4int & a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4int prefix increment / decrement operator friends - friend inline v4int operator ++( v4int & a ); - friend inline v4int operator --( v4int & a ); + friend inline v4int operator ++( v4int & a ) ALWAYS_INLINE; + friend inline v4int operator --( v4int & a ) ALWAYS_INLINE; // v4int postfix increment / decrement operator friends - friend inline v4int operator ++( 
v4int & a, int ); - friend inline v4int operator --( v4int & a, int ); + friend inline v4int operator ++( v4int & a, int ) ALWAYS_INLINE; + friend inline v4int operator --( v4int & a, int ) ALWAYS_INLINE; // v4int binary operator friends - friend inline v4int operator +( const v4int &a, const v4int &b ); - friend inline v4int operator -( const v4int &a, const v4int &b ); - friend inline v4int operator *( const v4int &a, const v4int &b ); - friend inline v4int operator /( const v4int &a, const v4int &b ); - friend inline v4int operator %( const v4int &a, const v4int &b ); - friend inline v4int operator ^( const v4int &a, const v4int &b ); - friend inline v4int operator &( const v4int &a, const v4int &b ); - friend inline v4int operator |( const v4int &a, const v4int &b ); - friend inline v4int operator <<( const v4int &a, const v4int &b ); - friend inline v4int operator >>( const v4int &a, const v4int &b ); + friend inline v4int operator +( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator -( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator *( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator /( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator %( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ^( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator &( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator |( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator <<( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >>( const v4int &a, const v4int &b ) ALWAYS_INLINE; // v4int logical operator friends - friend inline v4int operator <( const v4int &a, const v4int &b ); - friend inline v4int operator >( const v4int &a, const v4int &b ); - friend inline v4int operator ==( const v4int &a, const v4int &b ); - friend inline 
v4int operator !=( const v4int &a, const v4int &b ); - friend inline v4int operator <=( const v4int &a, const v4int &b ); - friend inline v4int operator >=( const v4int &a, const v4int &b ); - friend inline v4int operator &&( const v4int &a, const v4int &b ); - friend inline v4int operator ||( const v4int &a, const v4int &b ); + friend inline v4int operator <( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4int &a, const v4int &b ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4int abs( const v4int &a ); - friend inline v4 czero( const v4int &c, const v4 &a ); - friend inline v4 notczero( const v4int &c, const v4 &a ); + friend inline v4int abs( const v4int &a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! 
- friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ); + friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) ALWAYS_INLINE; // v4float unary operator friends - friend inline v4int operator !( const v4float & a ); + friend inline v4int operator !( const v4float & a ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ); - friend inline v4int operator >( const v4float &a, const v4float &b ); - friend inline v4int operator ==( const v4float &a, const v4float &b ); - friend inline v4int operator !=( const v4float &a, const v4float &b ); - friend inline v4int operator <=( const v4float &a, const v4float &b ); - friend inline v4int operator >=( const v4float &a, const v4float &b ); - friend inline v4int operator &&( const v4float &a, const v4float &b ); - friend inline v4int operator ||( const v4float &a, const v4float &b ); + friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; // v4float miscellaneous friends - friend inline v4float clear_bits( const v4int &m, const v4float &a ); - friend inline v4float set_bits( const v4int &m, const v4float &a ); - friend inline v4float toggle_bits( const v4int &m, const v4float &a ); + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const 
v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; public: // v4int constructors / destructors - + v4int() {} // Default constructor - v4int( const v4int &a ) { // Copy constructor + + v4int( const v4int &a ) // Copy constructor + { v = a.v; } - v4int( const v4 &a ) { // Init from mixed + + v4int( const v4 &a ) // Init from mixed + { v = a.v; } - v4int( int a ) { // Init from scalar - v = (_v4_float)((_v4_int){ a, a, a, a }); + + v4int( int a ) // Init from scalar + { + v = (_v4_float) ( (_v4_int) { a, a, a, a } ); } - v4int( int i0, int i1, int i2, int i3 ) { // Init from scalars - v = (_v4_float)((_v4_int){ i0, i1, i2, i3 }); + + v4int( int i0, int i1, int i2, int i3 ) // Init from scalars + { + v = (_v4_float) ( (_v4_int) { i0, i1, i2, i3 } ); } + ~v4int() {} // Destructor - + // v4int assignment operators - -# define ASSIGN(op,instr) \ - inline v4int &operator op( const v4int &b ) { \ + + #define ASSIGN(op,instr) \ + inline v4int &operator op( const v4int &b ) \ + { \ instr; \ return *this; \ } @@ -588,76 +626,103 @@ namespace v4 { ASSIGN(<<=, v = (_v4_float)vec_sl( (_v4_int)v, (_v4_uint)b.v ) ) ASSIGN(>>=, v = (_v4_float)vec_sr( (_v4_int)v, (_v4_uint)b.v ) ) -# undef ASSIGN + #undef ASSIGN // v4int member access operator - - // FIXME: [] operation probably breaks the compiler if used to modify - // a vector! 
- inline int &operator []( int n ) { return ((int *)&v)[n]; } - inline int operator ()( int n ) { - union { int i[4]; _v4_float v; } t; t.v = v; return t.i[n]; + inline int &operator []( int n ) + { + return ( (int *) &v )[n]; } + inline int operator ()( int n ) + { + union + { + int i[4]; + _v4_float v; + } t; + + t.v = v; + + return t.i[n]; + } }; // v4int prefix unary operators -# define PREFIX_UNARY(op,instr) \ - inline v4int operator op( const v4int & a ) { \ + #define PREFIX_UNARY(op,instr) \ + inline v4int operator op( const v4int &a ) \ + { \ v4int b; \ instr; \ return b; \ } - PREFIX_UNARY(+, b.v = a.v ) - PREFIX_UNARY(-, b.v = (_v4_float)vec_sub( _false, (_v4_int)a.v ) ) - PREFIX_UNARY(!, b.v = (_v4_float)vec_cmpeq( _false, (_v4_int)a.v ) ) - PREFIX_UNARY(~, b.v = (_v4_float)vec_xor( _true, (_v4_int)a.v ) ) - -# undef PREFIX_UNARY + PREFIX_UNARY( +, b.v = a.v ) + PREFIX_UNARY( -, b.v = (_v4_float) vec_sub( _false, (_v4_int) a.v ) ) + PREFIX_UNARY( !, b.v = (_v4_float) vec_cmpeq( _false, (_v4_int) a.v ) ) + PREFIX_UNARY( ~, b.v = (_v4_float) vec_xor( _true, (_v4_int) a.v ) ) + + #undef PREFIX_UNARY // v4int prefix increment / decrement operators - inline v4int operator ++( v4int &a ) { - _v4_float a_v = (_v4_float)vec_add( (_v4_int)a.v, _ione ); + inline v4int operator ++( v4int &a ) + { + _v4_float a_v = (_v4_float) vec_add( (_v4_int) a.v, _ione ); + v4int b; + a.v = a_v; b.v = a_v; + return b; } - inline v4int operator --( v4int &a ) { - _v4_float a_v = (_v4_float)vec_sub( (_v4_int)a.v, _ione ); + inline v4int operator --( v4int &a ) + { + _v4_float a_v = (_v4_float) vec_sub( (_v4_int) a.v, _ione ); + v4int b; + a.v = a_v; b.v = a_v; + return b; } // v4int postfix increment / decrement operators - inline v4int operator ++( v4int &a, int ) { + inline v4int operator ++( v4int &a, int ) + { _v4_float a_v = a.v; + v4int b; - a.v = (_v4_float)vec_add( (_v4_int)a_v, _ione ); + + a.v = (_v4_float) vec_add( (_v4_int) a_v, _ione ); b.v = a_v; + return b; } - 
inline v4int operator --( v4int &a, int ) { + inline v4int operator --( v4int &a, int ) + { _v4_float a_v = a.v; + v4int b; - a.v = (_v4_float)vec_sub( (_v4_int)a_v, _ione ); + + a.v = (_v4_float) vec_sub( (_v4_int) a_v, _ione ); b.v = a_v; + return b; } // v4int binary operators - -# define BINARY(op,instr) \ - inline v4int operator op( const v4int &a, const v4int &b ) { \ + + #define BINARY(op,instr) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ v4int c; \ instr; \ return c; \ @@ -692,12 +757,13 @@ namespace v4 { BINARY(<<, c.v = (_v4_float)vec_sl( (_v4_int)a.v, (_v4_uint)b.v ) ) BINARY(>>, c.v = (_v4_float)vec_sr( (_v4_int)a.v, (_v4_uint)b.v ) ) -# undef BINARY + #undef BINARY // v4int logical operators -# define LOGICAL(op,instr) \ - inline v4int operator op( const v4int &a, const v4int &b ) { \ + #define LOGICAL(op,instr) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ v4int c; \ instr; \ return c; \ @@ -723,86 +789,92 @@ namespace v4 { vec_cmpeq( (_v4_int)b.v, _false ) ) ) ) -# undef LOGICAL + #undef LOGICAL // v4int miscellaneous functions - inline v4int abs( const v4int &a ) { + inline v4int abs( const v4int &a ) + { v4int b; - b.v = (_v4_float)vec_abs( (_v4_int)a.v ); + + b.v = (_v4_float) vec_abs( (_v4_int) a.v ); + return b; } - inline v4 czero( const v4int &c, const v4 &a ) { + inline v4 czero( const v4int &c, const v4 &a ) + { v4 b; + b.v = vec_andc( a.v, c.v ); + return b; } - inline v4 notczero( const v4int &c, const v4 &a ) { + inline v4 notczero( const v4int &c, const v4 &a ) + { v4 b; + b.v = vec_and( a.v, c.v ); + return b; } - - inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) { - v4 m; - m.v = vec_sel( f.v, t.v, (_v4_uint)c.v ); - return m; + + inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) + { + v4 tf; + + tf.v = vec_sel( f.v, t.v, (_v4_uint) c.v ); + + return tf; } //////////////// // v4float class - class v4float : public v4 { - + class v4float : public v4 + { // v4float 
prefix unary operator friends - friend inline v4float operator +( const v4float &a ); - friend inline v4float operator -( const v4float &a ); - friend inline v4float operator ~( const v4float &a ); - friend inline v4int operator !( const v4float &a ); + friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator -( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator ~( const v4float &a ) ALWAYS_INLINE; + friend inline v4int operator !( const v4float &a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4float prefix increment / decrement operator friends - friend inline v4float operator ++( v4float &a ); - friend inline v4float operator --( v4float &a ); + friend inline v4float operator ++( v4float &a ) ALWAYS_INLINE; + friend inline v4float operator --( v4float &a ) ALWAYS_INLINE; // v4float postfix increment / decrement operator friends - friend inline v4float operator ++( v4float &a, int ); - friend inline v4float operator --( v4float &a, int ); + friend inline v4float operator ++( v4float &a, int ) ALWAYS_INLINE; + friend inline v4float operator --( v4float &a, int ) ALWAYS_INLINE; // v4float binary operator friends - friend inline v4float operator +( const v4float &a, const v4float &b ); - friend inline v4float operator -( const v4float &a, const v4float &b ); - friend inline v4float operator *( const v4float &a, const v4float &b ); - friend inline v4float operator /( const v4float &a, const v4float &b ); - - // ------------------------------------------------------------------------- - // begin hacks - // friend inline v4float operator *( const v4float &a, const v4 &b ); - // end hacks - // ------------------------------------------------------------------------- + friend inline v4float operator +( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator -( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float 
operator *( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator /( const v4float &a, const v4float &b ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ); - friend inline v4int operator >( const v4float &a, const v4float &b ); - friend inline v4int operator ==( const v4float &a, const v4float &b ); - friend inline v4int operator !=( const v4float &a, const v4float &b ); - friend inline v4int operator <=( const v4float &a, const v4float &b ); - friend inline v4int operator >=( const v4float &a, const v4float &b ); - friend inline v4int operator &&( const v4float &a, const v4float &b ); - friend inline v4int operator ||( const v4float &a, const v4float &b ); + friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; // v4float math library friends -# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) -# define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) + #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE + #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + const v4float &b ) ALWAYS_INLINE CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); @@ -812,61 +884,73 
@@ namespace v4 { CMATH_FR2(copysign); -# undef CMATH_FR1 -# undef CMATH_FR2 + #undef CMATH_FR1 + #undef CMATH_FR2 // v4float miscellaneous friends - friend inline v4float rsqrt_approx( const v4float &a ); - friend inline v4float rsqrt( const v4float &a ); - friend inline v4float rcp_approx( const v4float &a ); - friend inline v4float rcp( const v4float &a ); - friend inline v4float fma( const v4float &a, const v4float &b, const v4float &c ); - friend inline v4float fms( const v4float &a, const v4float &b, const v4float &c ); - friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ); - friend inline v4float clear_bits( const v4int &m, const v4float &a ); - friend inline v4float set_bits( const v4int &m, const v4float &a ); - friend inline v4float toggle_bits( const v4int &m, const v4float &a ); - friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ); - friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ); - friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ); - // FIXME: crack - friend inline void trilinear( v4float & wl, v4float & wh ); - + friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rsqrt ( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rcp ( const v4float &a ) ALWAYS_INLINE; + friend inline v4float fma ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fms ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline void 
increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; + public: // v4float constructors / destructors - + v4float() {} // Default constructor - v4float( const v4float &a ) { // Copy constructor + + v4float( const v4float &a ) // Copy constructor + { v = a.v; } - v4float( const v4 &a ) { // Init from mixed + + v4float( const v4 &a ) // Init from mixed + { v = a.v; } - v4float( float a ) { // Init from scalar - v = (_v4_float){ a, a, a, a }; + + v4float( float a ) // Init from scalar + { + v = (_v4_float) { a, a, a, a }; } - v4float( float f0, float f1, float f2, float f3 ) { // Init from scalars - v = (_v4_float){ f0, f1, f2, f3 }; + + v4float( float f0, float f1, float f2, float f3 ) // Init from scalars + { + v = (_v4_float) { f0, f1, f2, f3 }; } + ~v4float() {} // Destructor // v4float assignment operators -# define ASSIGN(op,instr) \ - inline v4float &operator op( const v4float &b ) { \ + #define ASSIGN(op,instr) \ + inline v4float &operator op( const v4float &b ) \ + { \ instr; \ return *this; \ } - ASSIGN(=, v = b.v ); - ASSIGN(+=, v = vec_add(v,b.v) ); - ASSIGN(-=, v = vec_sub(v,b.v) ); - ASSIGN(*=, v = vec_madd(v,b.v,_zero) ); + ASSIGN( =, v = b.v ); + ASSIGN( +=, v = vec_add( v, b.v ) ); + ASSIGN( -=, v = vec_sub( v, b.v ) ); + ASSIGN( *=, v = vec_madd( v, b.v, _zero ) ); + + #undef ASSIGN // This does one NR iteration and is supposed to be accurate enough. - inline v4float &operator /=( const v4float &a ) { + inline v4float &operator /=( const v4float &a ) + { _v4_float a_v = a.v, b_v; // Compute an estimate of the reciprocal of a (??-bit accurate) @@ -892,7 +976,8 @@ namespace v4 { #if 0 // This is a more accurate version that does two NR iterations. 
- inline v4float &operator /=( const v4float &a ) { + inline v4float &operator /=( const v4float &a ) + { _v4_float a_v = a.v, b_v; // Compute an estimate of the reciprocal of a (??-bit accurate) @@ -918,93 +1003,130 @@ namespace v4 { } #endif -# undef ASSIGN - // v4float member access operator - // FIXME: [] operation probably breaks the compiler if used to modify - // a vector! - - inline float &operator []( int n ) { return ((float *)&v)[n]; } - inline float operator ()( int n ) { - union { float f[4]; _v4_float v; } t; t.v = v; return t.f[n]; + inline float &operator []( int n ) + { + return ( (float *) &v )[n]; } + inline float operator ()( int n ) + { + union + { + float f[4]; + _v4_float v; + } t; + + t.v = v; + + return t.f[n]; + } }; // v4float prefix unary operators - inline v4float operator +( const v4float &a ) { + inline v4float operator +( const v4float &a ) + { v4float b; + b.v = a.v; + return b; } - inline v4float operator -( const v4float &a ) { + inline v4float operator -( const v4float &a ) + { v4float b; + b.v = vec_sub( _zero, a.v ); + return b; } - inline v4int operator !( const v4float &a ) { + inline v4int operator !( const v4float &a ) + { v4int b; - b.v = (_v4_float)vec_cmpeq( a.v, _zero ); + + b.v = (_v4_float) vec_cmpeq( a.v, _zero ); + return b; } // v4float prefix increment / decrement operators - inline v4float operator ++( v4float &a ) { - _v4_float a_v = vec_add( a.v, _one ); + inline v4float operator ++( v4float &a ) + { v4float b; - a.v = a_v; - b.v = a_v; + + _v4_float t = vec_add( a.v, _one ); + + a.v = t; + b.v = t; + return b; } - inline v4float operator --( v4float &a ) { - _v4_float a_v = vec_sub( a.v, _one ); + inline v4float operator --( v4float &a ) + { v4float b; - a.v = a_v; - b.v = a_v; + + _v4_float t = vec_sub( a.v, _one ); + + a.v = t; + b.v = t; + return b; } // v4float postfix increment / decrement operators - inline v4float operator ++( v4float &a, int ) { - _v4_float a_v = a.v; + inline v4float operator ++( 
v4float &a, int ) + { v4float b; + + _v4_float a_v = a.v; + a.v = vec_add( a_v, _one ); b.v = a_v; + return b; } - inline v4float operator --( v4float &a, int ) { - _v4_float a_v = a.v; + inline v4float operator --( v4float &a, int ) + { v4float b; + + _v4_float a_v = a.v; + a.v = vec_sub( a_v, _one ); b.v = a_v; + return b; } // v4float binary operators -# define BINARY(op,instr) \ - inline v4float operator op( const v4float &a, const v4float &b ) { \ + #define BINARY(op,instr) \ + inline v4float operator op( const v4float &a, const v4float &b ) \ + { \ v4float c; \ instr; \ return c; \ } - BINARY(+, c.v = vec_add( a.v, b.v ) ) - BINARY(-, c.v = vec_sub( a.v, b.v ) ) - BINARY(*, c.v = vec_madd( a.v, b.v, _zero ) ) + BINARY( +, c.v = vec_add( a.v, b.v ) ) + BINARY( -, c.v = vec_sub( a.v, b.v ) ) + BINARY( *, c.v = vec_madd( a.v, b.v, _zero ) ) - inline v4float operator /( const v4float &n, const v4float &a ) { - _v4_float a_v = a.v, b_v; + #undef BINARY + + inline v4float operator /( const v4float &n, const v4float &a ) + { v4float c; + _v4_float a_v = a.v, b_v; + // Compute an estimate of the reciprocal of a (??-bit accurate) b_v = vec_re( a_v ); @@ -1028,10 +1150,12 @@ namespace v4 { #if 0 // This is a more accurate version that does two NR iterations. 
- inline v4float operator /( const v4float &n, const v4float &a ) { - _v4_float a_v = a.v, b_v; + inline v4float operator /( const v4float &n, const v4float &a ) + { v4float c; + _v4_float a_v = a.v, b_v; + // Compute an estimate of the reciprocal of a (??-bit accurate) b_v = vec_re( a_v ); @@ -1055,72 +1179,59 @@ namespace v4 { } #endif -# undef BINARY - - // ------------------------------------------------------------------------- - // begin hacks -/* # define BINARY(op,instr) \ */ -/* inline v4float operator op( const v4float &a, const v4 &b ) { \ */ -/* v4float c; \ */ -/* instr; \ */ -/* return c; \ */ -/* } */ - -/* BINARY(*, c.v = vec_madd( a.v, b.v, _zero ) ) */ - -/* # undef BINARY */ - // end hacks - // ------------------------------------------------------------------------- - // v4float logical operators -# define LOGICAL(op,instr) \ - inline v4int operator op( const v4float &a, const v4float &b ) { \ + #define LOGICAL(op,instr) \ + inline v4int operator op( const v4float &a, const v4float &b ) \ + { \ v4int c; \ instr; \ return c; \ } - LOGICAL(<, c.v = (_v4_float)vec_cmplt( a.v, b.v ) ) - LOGICAL(>, c.v = (_v4_float)vec_cmpgt( a.v, b.v ) ) - LOGICAL(==, c.v = (_v4_float)vec_cmpeq( a.v, b.v ) ) - LOGICAL(!=, c.v = (_v4_float)vec_xor( vec_cmpeq( a.v, b.v ), _true ) ) - LOGICAL(<=, c.v = (_v4_float)vec_cmple( a.v, b.v ) ) - LOGICAL(>=, c.v = (_v4_float)vec_cmpge( a.v, b.v ) ) - LOGICAL(&&, c.v = (_v4_float)vec_xor( vec_or( vec_cmpeq( a.v, _zero ), - vec_cmpeq( b.v, _zero ) ), - _true ) ) - LOGICAL(||, c.v = (_v4_float)vec_xor( vec_and( vec_cmpeq( a.v, _zero ), - vec_cmpeq( b.v, _zero ) ), - _true ) ) + LOGICAL( <, c.v = (_v4_float) vec_cmplt( a.v, b.v ) ) + LOGICAL( >, c.v = (_v4_float) vec_cmpgt( a.v, b.v ) ) + LOGICAL( ==, c.v = (_v4_float) vec_cmpeq( a.v, b.v ) ) + LOGICAL( <=, c.v = (_v4_float) vec_cmple( a.v, b.v ) ) + LOGICAL( >=, c.v = (_v4_float) vec_cmpge( a.v, b.v ) ) + LOGICAL( !=, c.v = (_v4_float) vec_xor( vec_cmpeq( a.v, b.v ), + _true ) ) 
+ LOGICAL( &&, c.v = (_v4_float) vec_xor( vec_or( vec_cmpeq( a.v, _zero ), + vec_cmpeq( b.v, _zero ) ), + _true ) ) + LOGICAL( ||, c.v = (_v4_float) vec_xor( vec_and( vec_cmpeq( a.v, _zero ), + vec_cmpeq( b.v, _zero ) ), + _true ) ) -# undef LOGICAL + #undef LOGICAL // v4float math library functions -# define CMATH_FR1(fn) \ - inline v4float fn( const v4float &a ) { \ - union { float f[4]; _v4_float v; } t; \ - v4float b; \ - t.v = a.v; \ - b.v = (_v4_float){ (float) ::fn( t.f[0] ), \ + #define CMATH_FR1(fn) \ + inline v4float fn( const v4float &a ) \ + { \ + union { float f[4]; _v4_float v; } t; \ + v4float b; \ + t.v = a.v; \ + b.v = (_v4_float){ (float) ::fn( t.f[0] ), \ (float) ::fn( t.f[1] ), \ (float) ::fn( t.f[2] ), \ (float) ::fn( t.f[3] ) }; \ - return b; \ + return b; \ } - -# define CMATH_FR2(fn) \ - inline v4float fn( const v4float &a, const v4float &b ) { \ + + #define CMATH_FR2(fn) \ + inline v4float fn( const v4float &a, const v4float &b ) \ + { \ union { float f[4]; _v4_float v; } t; \ union { float f[4]; _v4_float v; } u; \ v4float c; \ t.v = a.v; \ u.v = b.v; \ - c.v = (_v4_float){ (float) ::fn( t.f[0], u.f[0] ), \ - (float) ::fn( t.f[1], u.f[1] ), \ - (float) ::fn( t.f[2], u.f[2] ), \ - (float) ::fn( t.f[3], u.f[3] ) }; \ + c.v = (_v4_float){ (float) ::fn( t.f[0], u.f[0] ), \ + (float) ::fn( t.f[1], u.f[1] ), \ + (float) ::fn( t.f[2], u.f[2] ), \ + (float) ::fn( t.f[3], u.f[3] ) }; \ return c; \ } @@ -1130,17 +1241,25 @@ namespace v4 { CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) /*CMATH_FR1(sqrt)*/ CMATH_FR1(tan) CMATH_FR1(tanh) - inline v4float fabs( const v4float &a ) { + #undef CMATH_FR1 + #undef CMATH_FR2 + + inline v4float fabs( const v4float &a ) + { v4float b; + b.v = vec_andc( a.v, _sign ); + return b; } // This version does one NR iteration and is supposed to be accurate enough. 
- inline v4float sqrt( const v4float &a ) { - _v4_float a_v = a.v, b_v; + inline v4float sqrt( const v4float &a ) + { v4float b; + _v4_float a_v = a.v, b_v; + // Compute an estimate of the rsqrt (??-bit accurate) b_v = vec_rsqrte( a_v ); @@ -1148,7 +1267,7 @@ namespace v4 { // FIXME: CHECK NUMERICS. HOW MANY N-R STEPS NECESSARY? // APPLE'S ALTIVEC PAGE SUGGESTS TWO. - b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), + b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), vec_madd( b_v, _half, _zero ), b_v ); @@ -1161,10 +1280,12 @@ namespace v4 { #if 0 // This is a more accurate version that does two NR iterations. - inline v4float sqrt( const v4float &a ) { - _v4_float a_v = a.v, b_v; + inline v4float sqrt( const v4float &a ) + { v4float b; + _v4_float a_v = a.v, b_v; + // Compute an estimate of the rsqrt (??-bit accurate) b_v = vec_rsqrte( a_v ); @@ -1172,10 +1293,10 @@ namespace v4 { // FIXME: CHECK NUMERICS. HOW MANY N-R STEPS NECESSARY? // APPLE'S ALTIVEC PAGE SUGGESTS TWO. 
- b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), + b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), vec_madd( b_v, _half, _zero ), b_v ); - b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), + b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), vec_madd( b_v, _half, _zero ), b_v ); @@ -1187,28 +1308,33 @@ namespace v4 { } #endif - inline v4float copysign( const v4float &a, const v4float &b ) { + inline v4float copysign( const v4float &a, const v4float &b ) + { v4float c; + c.v = vec_or( vec_andc( a.v, _sign ), vec_and( b.v, _sign ) ); + return c; } -# undef CMATH_FR1 -# undef CMATH_FR2 + // v4float miscellaneous functions - // v4float miscelleanous functions - - inline v4float rsqrt_approx( const v4float &a ) { + inline v4float rsqrt_approx( const v4float &a ) + { v4float b; + b.v = vec_rsqrte( a.v ); + return b; } // This version does one NR iteration and is supposed to be accurate enough. - inline v4float rsqrt( const v4float &a ) { - _v4_float a_v = a.v, b_v; + inline v4float rsqrt( const v4float &a ) + { v4float b; + _v4_float a_v = a.v, b_v; + // Compute an estimate of the rsqrt (??-bit accurate) b_v = vec_rsqrte( a_v ); @@ -1219,7 +1345,8 @@ namespace v4 { // b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), // vec_madd( b_v, _half, _zero ), // b_v ); - b.v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), + + b.v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), vec_madd( b_v, _half, _zero ), b_v ); @@ -1228,10 +1355,12 @@ namespace v4 { #if 0 // This is a more accurate version that does two NR iterations. - inline v4float rsqrt( const v4float &a ) { - _v4_float a_v = a.v, b_v; + inline v4float rsqrt( const v4float &a ) + { v4float b; + _v4_float a_v = a.v, b_v; + // Compute an estimate of the rsqrt (??-bit accurate) b_v = vec_rsqrte( a_v ); @@ -1239,10 +1368,11 @@ namespace v4 { // FIXME: CHECK NUMERICS. HOW MANY N-R STEPS NECESSARY? 
// APPLE'S ALTIVEC PAGE SUGGESTS TWO. - b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), + b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), vec_madd( b_v, _half, _zero ), b_v ); - b.v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), + + b.v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), vec_madd( b_v, _half, _zero ), b_v ); @@ -1250,17 +1380,22 @@ namespace v4 { } #endif - inline v4float rcp_approx( const v4float &a ) { + inline v4float rcp_approx( const v4float &a ) + { v4float b; + b.v = vec_re( a.v ); + return b; } // This version does one NR iteration and is supposed to be accurate enough. - inline v4float rcp( const v4float &a ) { - _v4_float a_v = a.v, b_v; + inline v4float rcp( const v4float &a ) + { v4float b; + _v4_float a_v = a.v, b_v; + // Compute an estimate of the reciprocal of a (??-bit accurate) b_v = vec_re( a_v ); @@ -1273,7 +1408,8 @@ namespace v4 { // THE SPU IMPLEMENTATION HAS AN ALTERNATE THAT MAY BE MORE // ACCURATE (OR AT LEAST USES FEWER CONSTANTS). - // b_v = vec_madd( vec_nmsub( b_v, a_v, _one ), b_v, b_v ); + // b_v = vec_madd( vec_nmsub( b_v, a_v, _one ), b_v, b_v ); + b.v = vec_madd( vec_nmsub( b_v, a_v, _one ), b_v, b_v ); return b; @@ -1281,10 +1417,12 @@ namespace v4 { #if 0 // This is a more accurate version that does two NR iterations. - inline v4float rcp( const v4float &a ) { - _v4_float a_v = a.v, b_v; + inline v4float rcp( const v4float &a ) + { v4float b; + _v4_float a_v = a.v, b_v; + // Compute an estimate of the reciprocal of a (??-bit accurate) b_v = vec_re( a_v ); @@ -1298,76 +1436,113 @@ namespace v4 { // ACCURATE (OR AT LEAST USES FEWER CONSTANTS). 
b_v = vec_madd( vec_nmsub( b_v, a_v, _one ), b_v, b_v ); + b.v = vec_madd( vec_nmsub( b_v, a_v, _one ), b_v, b_v ); return b; } #endif - inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) + { v4float d; + d.v = vec_madd( a.v, b.v, c.v ); + return d; } - inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) + { v4float d; - // d.v = vec_sub( _zero, vec_nmsub( a.v, b.v, c.v ) ); // FIXME: Sigh ... + + // d.v = vec_sub( _zero, vec_nmsub( a.v, b.v, c.v ) ); // FIXME: Sigh ... + d.v = vec_msub( a.v, b.v, c.v ) ; + return d; } - inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) + { v4float d; + d.v = vec_nmsub( a.v, b.v, c.v ); + return d; } - inline v4float clear_bits( const v4int &m, const v4float &a ) { + inline v4float clear_bits( const v4int &m, const v4float &a ) + { v4float b; + b.v = vec_andc( a.v, m.v ); + return b; } - inline v4float set_bits( const v4int &m, const v4float &a ) { + inline v4float set_bits( const v4int &m, const v4float &a ) + { v4float b; + b.v = vec_or( a.v, m.v ); + return b; } - inline v4float toggle_bits( const v4int &m, const v4float &a ) { + inline v4float toggle_bits( const v4int &m, const v4float &a ) + { v4float b; + b.v = vec_xor( a.v, m.v ); + return b; } - - inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) { + + inline void increment_4x1( float * ALIGNED(16) p, + const v4float &a ) + { vec_st( vec_add( vec_ld( 0, p ), a.v ), 0, p ); } - inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) { + inline void decrement_4x1( float * ALIGNED(16) p, + const v4float &a ) + { vec_st( vec_sub( vec_ld( 0, p ), a.v ), 0, p ); } - inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) { + inline 
void scale_4x1( float * ALIGNED(16) p, + const v4float &a ) + { vec_st( vec_madd( vec_ld( 0, p ), a.v, _zero ), 0, p ); } - inline void trilinear( v4float & wl, v4float & wh ) { + // Given wl = x y z w, compute: + // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) + // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) + inline void trilinear( v4float &wl, v4float &wh ) + { _v4_float z = wl.v, xy; - xy = vec_add( _one, vec_xor( _n02, vec_mergeh(z,z) ) ); - z = vec_add( _one, vec_xor( _n02, vec_splat(z,2) ) ); - xy = vec_madd( vec_perm(xy,xy,_PERM(0,1,0,1)), vec_mergel(xy,xy), _zero ); - wl.v = vec_madd( xy, vec_splat(z,0), _zero ); - wh.v = vec_madd( xy, vec_splat(z,1), _zero ); + + xy = vec_add( _one, vec_xor( _n02, vec_mergeh( z, z ) ) ); + + z = vec_add( _one, vec_xor( _n02, vec_splat( z, 2 ) ) ); + + xy = vec_madd( vec_perm( xy, xy, _PERM(0,1,0,1) ), + vec_mergel( xy, xy ), + _zero ); + + wl.v = vec_madd( xy, vec_splat( z, 0 ), _zero ); + + wh.v = vec_madd( xy, vec_splat( z, 1 ), _zero ); } -# undef _v4_int -# undef _v4_uint -# undef _v4_float -# undef _v16_uchar + #undef _v4_int + #undef _v4_uint + #undef _v4_float + #undef _v16_uchar -# undef _PERM + #undef _PERM } // namespace v4 diff --git a/src/util/v4/v4_avx.h b/src/util/v4/v4_avx.h index 3c48096e..29612f45 100644 --- a/src/util/v4/v4_avx.h +++ b/src/util/v4/v4_avx.h @@ -5,47 +5,41 @@ #error "Do not include v4_avx.h directly; use v4.h" #endif -#define V4_ACCELERATION -#define V4_AVX_ACCELERATION - #include #include +#define V4_ACCELERATION +#define V4_AVX_ACCELERATION + #ifndef ALIGNED #define ALIGNED(n) #endif -// FIXME: IN PORTABLE, ALTIVEC, SPU -// - UPDATE V4INT, V4FLOAT - -// This requires gcc-3.3 and up -// Also, Bug 12902 has not been resolved on gcc-3.x.x. See README.patches for -// details. 
gcc-4.x.x does not seem to have this bug but may suffer from -// other problems (use "-fno-strict-aliasing" on these platforms) - #define ALWAYS_INLINE __attribute__((always_inline)) -namespace v4 { - +namespace v4 +{ class v4; class v4int; class v4float; - template struct permute { + template + struct permute + { constexpr static int value = i0 + i1*4 + i2*16 + i3*64; - }; // permute + }; + + #define PERM(i0,i1,i2,i3) ((i0) + (i1)*4 + (i2)*16 + (i3)*64) -# define PERM(i0,i1,i2,i3) ((i0) + (i1)*4 + (i2)*16 + (i3)*64) - //////////////// // v4 base class - - class v4 { - + + class v4 + { friend class v4int; friend class v4float; - - // v4 miscellenous friends + + // v4 miscellaneous friends friend inline int any( const v4 &a ) ALWAYS_INLINE; friend inline int all( const v4 &a ) ALWAYS_INLINE; @@ -61,53 +55,68 @@ namespace v4 { // v4int miscellaneous friends - friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; // v4 memory manipulation friends - - friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; - friend inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + friend inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) ALWAYS_INLINE; + + friend inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + + friend inline void 
stream_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + + friend inline void copy_4x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) ALWAYS_INLINE; + + friend inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends friend inline void load_4x1_tr( const void *a0, const void *a1, const void *a2, const void *a3, v4 &a ) ALWAYS_INLINE; + friend inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, v4 &a, v4 &b ) ALWAYS_INLINE; + friend inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; + friend inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; - + friend inline void store_4x1_tr( const v4 &a, void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; + friend inline void store_4x2_tr( const v4 &a, const v4 &b, void * ALIGNED(8) a0, void * ALIGNED(8) a1, void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) ALWAYS_INLINE; + friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) ALWAYS_INLINE; + friend inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, void * ALIGNED(16) a0, @@ -117,81 +126,102 @@ namespace v4 { protected: - union { + union + { int i[4]; float f[4]; __m128 v; }; - + public: v4() {} // Default constructor - v4(const v4 &a) { v=a.v; } // Copy constructor - ~v4() {} // Default destructor + v4( const v4 &a ) // Copy constructor + { + v = a.v; + } + + ~v4() {} // Default destructor }; - + // v4 miscellaneous functions - inline int 
any( const v4 &a ) { + inline int any( const v4 &a ) + { return a.i[0] || a.i[1] || a.i[2] || a.i[3]; } - - inline int all( const v4 &a ) { + + inline int all( const v4 &a ) + { return a.i[0] && a.i[1] && a.i[2] && a.i[3]; } - // Note: n MUST BE AN IMMEDIATE! template - inline v4 splat(const v4 & a) { - __m128 a_v = a.v; + inline v4 splat( const v4 & a ) + { v4 b; - b.v = _mm_shuffle_ps( a_v, a_v, (n*permute<1,1,1,1>::value)); + + b.v = _mm_shuffle_ps( a.v, a.v, ( n * permute<1,1,1,1>::value ) ); + return b; } - // Note: i0:3 MUST BE IMMEDIATES! */ template - inline v4 shuffle( const v4 & a ) { - __m128 a_v = a.v; + inline v4 shuffle( const v4 & a ) + { v4 b; - b.v = _mm_shuffle_ps( a_v, a_v, (permute::value) ); + + b.v = _mm_shuffle_ps( a.v, a.v, ( permute::value ) ); + return b; } - inline void swap( v4 &a, v4 &b ) { - __m128 a_v = a.v; a.v = b.v; b.v = a_v; + inline void swap( v4 &a, v4 &b ) + { + __m128 t = a.v; + + a.v = b.v; + + b.v = t; } - inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) { + inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) + { __m128 a0_v = a0.v, a1_v = a1.v, a2_v = a2.v, a3_v = a3.v, t, u; + t = _mm_unpackhi_ps( a0_v, a1_v ); a0_v = _mm_unpacklo_ps( a0_v, a1_v ); u = _mm_unpackhi_ps( a2_v, a3_v ); a2_v = _mm_unpacklo_ps( a2_v, a3_v ); + a1_v = _mm_movehl_ps( a2_v, a0_v ); a0_v = _mm_movelh_ps( a0_v, a2_v ); a2_v = _mm_movelh_ps( t, u ); a3_v = _mm_movehl_ps( u, t ); - a0.v = a0_v; a1.v = a1_v; a2.v = a2_v; a3.v = a3_v; + + a0.v = a0_v; + a1.v = a1_v; + a2.v = a2_v; + a3.v = a3_v; } // v4 memory manipulation functions - + inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) + v4 &a ) { a.v = _mm_load_ps( ( float * ) p ); } inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { _mm_store_ps( ( float * ) p, a.v ); } inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { _mm_stream_ps( ( float * ) p, a.v ); } @@ -207,9 +237,8 @@ namespace v4 { _mm_store_ps( 
( float * ) dst, _mm_load_ps( ( const float * ) src ) ); } - /* FIXME: MAKE ROBUST AGAINST ALIASING ISSUES */ inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) + void * ALIGNED(16) b ) { __m128 t = _mm_load_ps( ( float * ) a ); @@ -219,129 +248,180 @@ namespace v4 { // v4 transposed memory manipulation functions - inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, v4 &a ) { - a.v = _mm_setr_ps( ((const float *)a0)[0], - ((const float *)a1)[0], - ((const float *)a2)[0], - ((const float *)a3)[0] ); + inline void load_4x1_tr( const void *a0, + const void *a1, + const void *a2, + const void *a3, + v4 &a ) + { + a.v = _mm_setr_ps( ( (const float *) a0 )[0], + ( (const float *) a1 )[0], + ( (const float *) a2 )[0], + ( (const float *) a3 )[0] ); } inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, - v4 &a, v4 &b ) { + v4 &a, + v4 &b ) + { __m128 a_v, b_v, t; + b_v = _mm_setzero_ps(); - t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a0 ), (__m64 *)a1 ); - b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a2 ), (__m64 *)a3 ); - a_v = _mm_shuffle_ps( t, b_v, 0x88 ); - b_v = _mm_shuffle_ps( t, b_v, 0xdd ); - a.v = a_v; b.v = b_v; + + t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *) a0 ), (__m64 *) a1 ); + b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *) a2 ), (__m64 *) a3 ); + + a.v = _mm_shuffle_ps( t, b_v, 0x88 ); + b.v = _mm_shuffle_ps( t, b_v, 0xdd ); } inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) { - __m128 a_v, b_v, c_v, t, u; - t = _mm_load_ps( (const float *)a0 ); - b_v = _mm_load_ps( (const float *)a1 ); - c_v = _mm_load_ps( (const float *)a2 ); - u = _mm_load_ps( (const float *)a3 ); - a_v = _mm_unpacklo_ps( t, b_v ); - b_v = _mm_unpackhi_ps( t, b_v ); - t = _mm_unpacklo_ps( c_v, u ); - u = _mm_unpackhi_ps( 
c_v, u ); - c_v = _mm_movelh_ps( b_v, u ); - b_v = _mm_movehl_ps( t, a_v ); - a_v = _mm_movelh_ps( a_v, t ); - a.v = a_v; b.v = b_v; c.v = c_v; + v4 &a, + v4 &b, + v4 &c ) + { + __m128 r, s, t, u, d_v; + + a.v = _mm_load_ps( (const float *) a0 ); + b.v = _mm_load_ps( (const float *) a1 ); + c.v = _mm_load_ps( (const float *) a2 ); + d_v = _mm_load_ps( (const float *) a3 ); + + r = _mm_unpacklo_ps( a.v, b.v ); + s = _mm_unpackhi_ps( a.v, b.v ); + + t = _mm_unpacklo_ps( c.v, d_v ); + u = _mm_unpackhi_ps( c.v, d_v ); + + a.v = _mm_movelh_ps( r, t ); + b.v = _mm_movehl_ps( t, r ); + c.v = _mm_movelh_ps( s, u ); } inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) { - __m128 a_v, b_v, c_v, d_v, t, u; - a_v = _mm_load_ps( (const float *)a0 ); - b_v = _mm_load_ps( (const float *)a1 ); - c_v = _mm_load_ps( (const float *)a2 ); - d_v = _mm_load_ps( (const float *)a3 ); - t = _mm_unpackhi_ps( a_v, b_v ); - a_v = _mm_unpacklo_ps( a_v, b_v ); - u = _mm_unpackhi_ps( c_v, d_v ); - c_v = _mm_unpacklo_ps( c_v, d_v ); - b_v = _mm_movehl_ps( c_v, a_v ); - a_v = _mm_movelh_ps( a_v, c_v ); - c_v = _mm_movelh_ps( t, u ); - d_v = _mm_movehl_ps( u, t ); - a.v = a_v; b.v = b_v; c.v = c_v; d.v = d_v; + v4 &a, + v4 &b, + v4 &c, + v4 &d ) + { + __m128 r, s, t, u; + + a.v = _mm_load_ps( (const float *) a0 ); + b.v = _mm_load_ps( (const float *) a1 ); + c.v = _mm_load_ps( (const float *) a2 ); + d.v = _mm_load_ps( (const float *) a3 ); + + r = _mm_unpackhi_ps( a.v, b.v ); + s = _mm_unpacklo_ps( a.v, b.v ); + + t = _mm_unpackhi_ps( c.v, d.v ); + u = _mm_unpacklo_ps( c.v, d.v ); + + a.v = _mm_movelh_ps( s, u ); + b.v = _mm_movehl_ps( u, s ); + c.v = _mm_movelh_ps( r, t ); + d.v = _mm_movehl_ps( t, r ); } inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, void *a2, void *a3 ) { - ((float *)a0)[0] = a.f[0]; - ((float *)a1)[0] = a.f[1]; - ((float *)a2)[0] = a.f[2]; - 
((float *)a3)[0] = a.f[3]; - } - - inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, void * ALIGNED(8) a1, - void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) { - __m128 a_v = a.v, b_v = b.v, t; - t = _mm_unpacklo_ps(a_v,b_v); // a0 b0 a1 b1 -> t - _mm_storel_pi((__m64 *)a0,t); // a0 b0 -> a0 - _mm_storeh_pi((__m64 *)a1,t); // a1 b1 -> a1 - t = _mm_unpackhi_ps(a_v,b_v); // a2 b2 a3 b3 -> t - _mm_storel_pi((__m64 *)a2,t); // a2 b2 -> a2 - _mm_storeh_pi((__m64 *)a3,t); // a3 b3 -> a3 - } - - inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) { - __m128 a_v = a.v, b_v = b.v, t; - t = _mm_unpacklo_ps(a_v,b_v); // a0 b0 a1 b1 -> t - _mm_storel_pi((__m64 *)a0,t); // a0 b0 -> a0 - _mm_storeh_pi((__m64 *)a1,t); // a1 b1 -> a1 - t = _mm_unpackhi_ps(a_v,b_v); // a2 b2 a3 b3 -> t - _mm_storel_pi((__m64 *)a2,t); // a2 b2 -> a2 - _mm_storeh_pi((__m64 *)a3,t); // a3 b3 -> a3 - ((float *)a0)[2] = c.f[0]; - ((float *)a1)[2] = c.f[1]; - ((float *)a2)[2] = c.f[2]; - ((float *)a3)[2] = c.f[3]; - } - - /* FIXME: IS THIS FASTER THAN THE OLD WAY (HAD MORE STORE INSTR) */ - inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) { - __m128 a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; - t = _mm_unpackhi_ps( a_v, b_v ); - a_v = _mm_unpacklo_ps( a_v, b_v ); - u = _mm_unpackhi_ps( c_v, d_v ); - c_v = _mm_unpacklo_ps( c_v, d_v ); + void *a0, + void *a1, + void *a2, + void *a3 ) + { + ( (float *) a0 )[0] = a.f[0]; + ( (float *) a1 )[0] = a.f[1]; + ( (float *) a2 )[0] = a.f[2]; + ( (float *) a3 )[0] = a.f[3]; + } + + inline void store_4x2_tr( const v4 &a, + const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) + { + __m128 t; + + t = _mm_unpacklo_ps( a.v, b.v ); // a0 b0 a1 b1 -> t + + 
_mm_storel_pi( (__m64 *) a0, t ); // a0 b0 -> a0 + _mm_storeh_pi( (__m64 *) a1, t ); // a1 b1 -> a1 + + t = _mm_unpackhi_ps( a.v, b.v ); // a2 b2 a3 b3 -> t + + _mm_storel_pi( (__m64 *) a2, t ); // a2 b2 -> a2 + _mm_storeh_pi( (__m64 *) a3, t ); // a3 b3 -> a3 + } + + inline void store_4x3_tr( const v4 &a, + const v4 &b, + const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) + { + __m128 t; + + t = _mm_unpacklo_ps( a.v, b.v ); // a0 b0 a1 b1 -> t + + _mm_storel_pi( (__m64 *) a0, t ); // a0 b0 -> a0 + _mm_storeh_pi( (__m64 *) a1, t ); // a1 b1 -> a1 + + t = _mm_unpackhi_ps( a.v, b.v ); // a2 b2 a3 b3 -> t + + _mm_storel_pi( (__m64 *) a2, t ); // a2 b2 -> a2 + _mm_storeh_pi( (__m64 *) a3, t ); // a3 b3 -> a3 + + ( (float *) a0 )[2] = c.f[0]; + ( (float *) a1 )[2] = c.f[1]; + ( (float *) a2 )[2] = c.f[2]; + ( (float *) a3 )[2] = c.f[3]; + } + + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) + { + __m128 a_v, b_v, c_v, d_v, t, u; + + t = _mm_unpackhi_ps( a.v, b.v ); + a_v = _mm_unpacklo_ps( a.v, b.v ); + u = _mm_unpackhi_ps( c.v, d.v ); + c_v = _mm_unpacklo_ps( c.v, d.v ); + b_v = _mm_movehl_ps( c_v, a_v ); a_v = _mm_movelh_ps( a_v, c_v ); c_v = _mm_movelh_ps( t, u ); d_v = _mm_movehl_ps( u, t ); - _mm_store_ps( (float *)a0, a_v ); - _mm_store_ps( (float *)a1, b_v ); - _mm_store_ps( (float *)a2, c_v ); - _mm_store_ps( (float *)a3, d_v ); + + _mm_store_ps( (float *) a0, a_v ); + _mm_store_ps( (float *) a1, b_v ); + _mm_store_ps( (float *) a2, c_v ); + _mm_store_ps( (float *) a3, d_v ); } ////////////// // v4int class - class v4int : public v4 { - + class v4int : public v4 + { // v4int prefix unary operator friends friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; @@ -409,33 +489,61 @@ namespace v4 { // v4float miscellaneous friends - friend inline v4float 
clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; public: // v4int constructors / destructors - + v4int() {} // Default constructor - v4int( const v4int &a ) { v = a.v; } // Copy constructor - v4int( const v4 &a ) { v = a.v; } // Init from mixed - v4int( int a ) { // Init from scalar - union { int i; float f; } u; + + v4int( const v4int &a ) // Copy constructor + { + v = a.v; + } + + v4int( const v4 &a ) // Init from mixed + { + v = a.v; + } + + v4int( int a ) // Init from scalar + { + union + { + int i; + float f; + } u; + u.i = a; - v = _mm_set1_ps( u.f ); + v = _mm_set1_ps( u.f ); } - v4int( int i0, int i1, int i2, int i3 ) { // Init from scalars - union { int i; float f; } u0, u1, u2, u3; - u0.i = i0; u1.i = i1; u2.i = i2; u3.i = i3; + + v4int( int i0, int i1, int i2, int i3 ) // Init from scalars + { + union + { + int i; + float f; + } u0, u1, u2, u3; + + u0.i = i0; + u1.i = i1; + u2.i = i2; + u3.i = i3; + v = _mm_setr_ps( u0.f, u1.f, u2.f, u3.f ); } - ~v4int() {}; // Destructor - + + ~v4int() {} // Destructor + // v4int assignment operators - -# define ASSIGN(op) \ - inline v4int &operator op( const v4int &b ) { \ + + #define ASSIGN(op) \ + inline v4int &operator op( const v4int &b ) \ + { \ i[0] op b.i[0]; \ i[1] op b.i[1]; \ i[2] op b.i[2]; \ @@ -443,121 +551,153 @@ namespace v4 { return *this; \ } - inline v4int &operator =(const v4int &b) { - v = b.v; - return *this; - } - ASSIGN(+=) ASSIGN(-=) ASSIGN(*=) ASSIGN(/=) ASSIGN(%=) + ASSIGN(<<=) + ASSIGN(>>=) + + #undef ASSIGN - inline v4int &operator ^=(const v4int &b) { + inline v4int &operator =( const v4int &b ) + { + v = b.v; + + return *this; + } + + inline 
v4int &operator ^=( const v4int &b ) + { v = _mm_xor_ps( v, b.v ); + return *this; } - inline v4int &operator &=(const v4int &b) { + inline v4int &operator &=( const v4int &b ) + { v = _mm_and_ps( v, b.v ); + return *this; } - inline v4int &operator |=(const v4int &b) { + inline v4int &operator |=( const v4int &b ) + { v = _mm_or_ps( v, b.v ); + return *this; } - ASSIGN(<<=) - ASSIGN(>>=) - -# undef ASSIGN - // v4int member access operator - - inline int &operator []( int n ) { return i[n]; } - inline int operator ()( int n ) { return i[n]; } + inline int &operator []( int n ) + { + return i[n]; + } + + inline int operator ()( int n ) + { + return i[n]; + } }; // v4int prefix unary operators -# define PREFIX_UNARY(op) \ - inline v4int operator op( const v4int & a ) { \ + #define PREFIX_UNARY(op) \ + inline v4int operator op( const v4int &a ) \ + { \ v4int b; \ - b.i[0] = (op a.i[0]); \ - b.i[1] = (op a.i[1]); \ - b.i[2] = (op a.i[2]); \ - b.i[3] = (op a.i[3]); \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ return b; \ } - inline v4int operator +( const v4int & a ) { + inline v4int operator +( const v4int &a ) + { v4int b; + b.v = a.v; + return b; } PREFIX_UNARY(-) - inline v4int operator !( const v4int & a ) { + inline v4int operator !( const v4int &a ) + { v4int b; - b.i[0] = -(!a.i[0]); - b.i[1] = -(!a.i[1]); - b.i[2] = -(!a.i[2]); - b.i[3] = -(!a.i[3]); + + b.i[0] = - ( ! a.i[0] ); + b.i[1] = - ( ! a.i[1] ); + b.i[2] = - ( ! a.i[2] ); + b.i[3] = - ( ! 
a.i[3] ); + return b; } - inline v4int operator ~( const v4int & a ) { + inline v4int operator ~( const v4int &a ) + { v4int b; - union { int i; float f; } u; + + union + { + int i; + float f; + } u; + u.i = -1; + b.v = _mm_xor_ps( a.v, _mm_set1_ps( u.f ) ); + return b; } - -# undef PREFIX_UNARY + + #undef PREFIX_UNARY // v4int prefix increment / decrement -# define PREFIX_INCDEC(op) \ - inline v4int operator op( v4int & a ) { \ + #define PREFIX_INCDEC(op) \ + inline v4int operator op( v4int &a ) \ + { \ v4int b; \ - b.i[0] = (op a.i[0]); \ - b.i[1] = (op a.i[1]); \ - b.i[2] = (op a.i[2]); \ - b.i[3] = (op a.i[3]); \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ return b; \ } PREFIX_INCDEC(++) PREFIX_INCDEC(--) -# undef PREFIX_INCDEC + #undef PREFIX_INCDEC // v4int postfix increment / decrement -# define POSTFIX_INCDEC(op) \ - inline v4int operator op( v4int & a, int ) { \ + #define POSTFIX_INCDEC(op) \ + inline v4int operator op( v4int &a, int ) \ + { \ v4int b; \ - b.i[0] = (a.i[0] op); \ - b.i[1] = (a.i[1] op); \ - b.i[2] = (a.i[2] op); \ - b.i[3] = (a.i[3] op); \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ return b; \ } POSTFIX_INCDEC(++) POSTFIX_INCDEC(--) -# undef POSTFIX_INCDEC + #undef POSTFIX_INCDEC // v4int binary operators - -# define BINARY(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) { \ + + #define BINARY(op) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ v4int c; \ c.i[0] = a.i[0] op b.i[0]; \ c.i[1] = a.i[1] op b.i[1]; \ @@ -571,39 +711,48 @@ namespace v4 { BINARY(*) BINARY(/) BINARY(%) + BINARY(<<) + BINARY(>>) - inline v4int operator ^( const v4int &a, const v4int &b ) { + #undef BINARY + + inline v4int operator ^( const v4int &a, const v4int &b ) + { v4int c; + c.v = _mm_xor_ps( a.v, b.v ); + return c; } - inline v4int operator &( const v4int &a, const v4int &b ) { + 
inline v4int operator &( const v4int &a, const v4int &b ) + { v4int c; + c.v = _mm_and_ps( a.v, b.v ); + return c; } - inline v4int operator |( const v4int &a, const v4int &b ) { + inline v4int operator |( const v4int &a, const v4int &b ) + { v4int c; + c.v = _mm_or_ps( a.v, b.v ); + return c; } - BINARY(<<) - BINARY(>>) - -# undef BINARY - // v4int logical operators -# define LOGICAL(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) { \ + #define LOGICAL(op) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ v4int c; \ - c.i[0] = -(a.i[0] op b.i[0]); \ - c.i[1] = -(a.i[1] op b.i[1]); \ - c.i[2] = -(a.i[2] op b.i[2]); \ - c.i[3] = -(a.i[3] op b.i[3]); \ + c.i[0] = - ( a.i[0] op b.i[0] ); \ + c.i[1] = - ( a.i[1] op b.i[1] ); \ + c.i[2] = - ( a.i[2] op b.i[2] ); \ + c.i[3] = - ( a.i[3] op b.i[3] ); \ return c; \ } @@ -615,44 +764,58 @@ namespace v4 { LOGICAL(>=) LOGICAL(&&) LOGICAL(||) - -# undef LOGICAL + + #undef LOGICAL // v4int miscellaneous functions - inline v4int abs( const v4int &a ) { + inline v4int abs( const v4int &a ) + { v4int b; - b.i[0] = (a.i[0]>=0) ? a.i[0] : -a.i[0]; - b.i[1] = (a.i[1]>=0) ? a.i[1] : -a.i[1]; - b.i[2] = (a.i[2]>=0) ? a.i[2] : -a.i[2]; - b.i[3] = (a.i[3]>=0) ? a.i[3] : -a.i[3]; + + b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; + b.i[1] = ( a.i[1] >= 0 ) ? a.i[1] : -a.i[1]; + b.i[2] = ( a.i[2] >= 0 ) ? a.i[2] : -a.i[2]; + b.i[3] = ( a.i[3] >= 0 ) ? 
a.i[3] : -a.i[3]; + return b; } - inline v4 czero( const v4int &c, const v4 &a ) { + inline v4 czero( const v4int &c, const v4 &a ) + { v4 b; - b.v = _mm_andnot_ps(c.v,a.v); + + b.v = _mm_andnot_ps( c.v, a.v ); + return b; } - inline v4 notczero( const v4int &c, const v4 &a ) { + inline v4 notczero( const v4int &c, const v4 &a ) + { v4 b; - b.v = _mm_and_ps(c.v,a.v); + + b.v = _mm_and_ps( c.v, a.v ); + return b; } - - inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) { - __m128 c_v = c.v; + + inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) + { v4 tf; - tf.v = _mm_or_ps(_mm_andnot_ps(c_v,f.v),_mm_and_ps(c_v,t.v)); + + __m128 c_v = c.v; + + tf.v = _mm_or_ps( _mm_andnot_ps( c_v, f.v ), + _mm_and_ps( c_v, t.v ) ); + return tf; } //////////////// // v4float class - class v4float : public v4 { - + class v4float : public v4 + { // v4float prefix unary operator friends friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; @@ -691,9 +854,9 @@ namespace v4 { // v4float math library friends -# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE + #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE + #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + const v4float &b ) ALWAYS_INLINE CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); @@ -703,192 +866,252 @@ namespace v4 { CMATH_FR2(copysign); -# undef CMATH_FR1 -# undef CMATH_FR2 + #undef CMATH_FR1 + #undef CMATH_FR2 // v4float miscellaneous friends friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rsqrt( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rsqrt ( const v4float &a ) ALWAYS_INLINE; friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; - friend inline 
v4float rcp( const v4float &a ) ALWAYS_INLINE; - friend inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float rcp ( const v4float &a ) ALWAYS_INLINE; + friend inline v4float fma ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fms ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - // FIXME: crack + friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; - + public: // v4float constructors / destructors - + v4float() {} // Default constructor - v4float( const v4float &a ) { v = a.v; } // Copy constructor - v4float( const v4 &a ) { v = a.v; } // Init from mixed - v4float( float a ) { // Init from scalar + + v4float( const v4float &a ) // Copy constructor + { + v = a.v; + } + + v4float( const v4 &a ) // Init from mixed + { + v = a.v; + } + + v4float( float a ) // Init from scalar + { v = _mm_set1_ps( a ); } - v4float( float f0, float f1, float f2, float f3 ) { // 
Init from scalars + + v4float( float f0, float f1, float f2, float f3 ) // Init from scalars + { v = _mm_setr_ps( f0, f1, f2, f3 ); } + ~v4float() {} // Destructor // v4float assignment operators -# define ASSIGN(op,intrin) \ - inline v4float &operator op(const v4float &b) { \ - v = intrin(v,b.v); \ - return *this; \ + #define ASSIGN(op,intrin) \ + inline v4float &operator op( const v4float &b ) \ + { \ + v = intrin( v, b.v ); \ + return *this; \ } - inline v4float &operator =(const v4float &b) { + ASSIGN( +=, _mm_add_ps ) + ASSIGN( -=, _mm_sub_ps ) + ASSIGN( *=, _mm_mul_ps ) + ASSIGN( /=, _mm_div_ps ) + + #undef ASSIGN + + inline v4float &operator =( const v4float &b ) + { v = b.v; + return *this; } - ASSIGN(+=,_mm_add_ps) - ASSIGN(-=,_mm_sub_ps) - ASSIGN(*=,_mm_mul_ps) - ASSIGN(/=,_mm_div_ps) - -# undef ASSIGN - // v4float member access operator - inline float &operator []( int n ) { return f[n]; } - inline float operator ()( int n ) { return f[n]; } + inline float &operator []( int n ) + { + return f[n]; + } + inline float operator ()( int n ) + { + return f[n]; + } }; // v4float prefix unary operators - inline v4float operator +( const v4float &a ) { + inline v4float operator +( const v4float &a ) + { v4float b; + b.v = a.v; + return b; } - inline v4float operator -( const v4float &a ) { + inline v4float operator -( const v4float &a ) + { v4float b; - b.v = _mm_sub_ps(_mm_setzero_ps(),a.v); + + b.v = _mm_sub_ps( _mm_setzero_ps(), a.v ); + return b; } - inline v4int operator !( const v4float &a ) { + inline v4int operator !( const v4float &a ) + { v4int b; - b.v = _mm_cmpeq_ps(_mm_setzero_ps(),a.v); + + b.v = _mm_cmpeq_ps( _mm_setzero_ps(), a.v ); + return b; } // v4float prefix increment / decrement operators - inline v4float operator ++( v4float &a ) { + inline v4float operator ++( v4float &a ) + { v4float b; + __m128 t = _mm_add_ps( a.v, _mm_set1_ps( 1 ) ); + a.v = t; b.v = t; + return b; } - inline v4float operator --( v4float &a ) { + inline v4float 
operator --( v4float &a ) + { v4float b; + __m128 t = _mm_sub_ps( a.v, _mm_set1_ps( 1 ) ); + a.v = t; b.v = t; + return b; } // v4float postfix increment / decrement operators - inline v4float operator ++( v4float &a, int ) { + inline v4float operator ++( v4float &a, int ) + { v4float b; + __m128 a_v = a.v; + a.v = _mm_add_ps( a_v, _mm_set1_ps( 1 ) ); b.v = a_v; + return b; } - inline v4float operator --( v4float &a, int ) { + inline v4float operator --( v4float &a, int ) + { v4float b; + __m128 a_v = a.v; - a.v = _mm_sub_ps(a_v, _mm_set1_ps( 1 ) ); + + a.v = _mm_sub_ps( a_v, _mm_set1_ps( 1 ) ); b.v = a_v; + return b; } // v4float binary operators - -# define BINARY(op,intrin) \ - inline v4float operator op( const v4float &a, const v4float &b ) { \ + + #define BINARY(op,intrin) \ + inline v4float operator op( const v4float &a, const v4float &b ) \ + { \ v4float c; \ - c.v = intrin(a.v,b.v); \ + c.v = intrin( a.v, b.v ); \ return c; \ } - BINARY(+,_mm_add_ps) - BINARY(-,_mm_sub_ps) - BINARY(*,_mm_mul_ps) - BINARY(/,_mm_div_ps) + BINARY( +, _mm_add_ps ) + BINARY( -, _mm_sub_ps ) + BINARY( *, _mm_mul_ps ) + BINARY( /, _mm_div_ps ) -# undef BINARY + #undef BINARY // v4float logical operators -# define LOGICAL(op,intrin) \ - inline v4int operator op( const v4float &a, const v4float &b ) { \ + #define LOGICAL(op,intrin) \ + inline v4int operator op( const v4float &a, const v4float &b ) \ + { \ v4int c; \ - c.v = intrin(a.v,b.v); \ + c.v = intrin( a.v, b.v ); \ return c; \ } - LOGICAL(<, _mm_cmplt_ps ) - LOGICAL(>, _mm_cmpgt_ps ) - LOGICAL(==,_mm_cmpeq_ps ) - LOGICAL(!=,_mm_cmpneq_ps) - LOGICAL(<=,_mm_cmple_ps ) - LOGICAL(>=,_mm_cmpge_ps ) + LOGICAL( <, _mm_cmplt_ps ) + LOGICAL( >, _mm_cmpgt_ps ) + LOGICAL( ==, _mm_cmpeq_ps ) + LOGICAL( <=, _mm_cmple_ps ) + LOGICAL( >=, _mm_cmpge_ps ) + LOGICAL( !=, _mm_cmpneq_ps ) - inline v4int operator &&( const v4float &a, const v4float &b ) { + #undef LOGICAL + + inline v4int operator &&( const v4float &a, const v4float &b ) + { 
v4int c; + __m128 vzero = _mm_setzero_ps(); - c.v = _mm_and_ps(_mm_cmpneq_ps(a.v,vzero),_mm_cmpneq_ps(b.v,vzero)); + + c.v = _mm_and_ps( _mm_cmpneq_ps( a.v, vzero ), + _mm_cmpneq_ps( b.v, vzero ) ); + return c; } - inline v4int operator ||( const v4float &a, const v4float &b ) { + inline v4int operator ||( const v4float &a, const v4float &b ) + { v4int c; + __m128 vzero = _mm_setzero_ps(); - c.v = _mm_or_ps(_mm_cmpneq_ps(a.v,vzero),_mm_cmpneq_ps(b.v,vzero)); + + c.v = _mm_or_ps( _mm_cmpneq_ps( a.v, vzero ), + _mm_cmpneq_ps( b.v, vzero ) ); + return c; } -# undef LOGICAL - // v4float math library functions -# define CMATH_FR1(fn) \ - inline v4float fn( const v4float &a ) { \ + #define CMATH_FR1(fn) \ + inline v4float fn( const v4float &a ) \ + { \ v4float b; \ - b.f[0] = ::fn(a.f[0]); \ - b.f[1] = ::fn(a.f[1]); \ - b.f[2] = ::fn(a.f[2]); \ - b.f[3] = ::fn(a.f[3]); \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ return b; \ } -# define CMATH_FR2(fn) \ - inline v4float fn( const v4float &a, const v4float &b ) { \ + #define CMATH_FR2(fn) \ + inline v4float fn( const v4float &a, const v4float &b ) \ + { \ v4float c; \ - c.f[0] = ::fn(a.f[0],b.f[0]); \ - c.f[1] = ::fn(a.f[1],b.f[1]); \ - c.f[2] = ::fn(a.f[2],b.f[2]); \ - c.f[3] = ::fn(a.f[3],b.f[3]); \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ return c; \ } @@ -898,126 +1121,202 @@ namespace v4 { CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) /*CMATH_FR1(sqrt)*/ CMATH_FR1(tan) CMATH_FR1(tanh) - inline v4float fabs( const v4float &a ) { + #undef CMATH_FR1 + #undef CMATH_FR2 + + inline v4float fabs( const v4float &a ) + { v4float b; - b.v = _mm_andnot_ps( _mm_set1_ps( -0.f ), a.v ); + + b.v = _mm_andnot_ps( _mm_set1_ps( -0.0f ), a.v ); + return b; } - inline v4float sqrt( const v4float &a ) { + inline v4float sqrt( const v4float &a ) 
+ { v4float b; - b.v = _mm_sqrt_ps(a.v); + + b.v = _mm_sqrt_ps( a.v ); + return b; } - inline v4float copysign( const v4float &a, const v4float &b ) { + inline v4float copysign( const v4float &a, const v4float &b ) + { v4float c; - __m128 t = _mm_set1_ps( -0.f ); - c.v = _mm_or_ps( _mm_and_ps( t, b.v ), _mm_andnot_ps( t, a.v ) ); + + __m128 t = _mm_set1_ps( -0.0f ); + + c.v = _mm_or_ps( _mm_and_ps( t, b.v ), + _mm_andnot_ps( t, a.v ) ); + return c; } -# undef CMATH_FR1 -# undef CMATH_FR2 + // v4float miscellaneous functions - // v4float miscelleanous functions - - inline v4float rsqrt_approx( const v4float &a ) { + inline v4float rsqrt_approx( const v4float &a ) + { v4float b; - b.v = _mm_rsqrt_ps(a.v); + + b.v = _mm_rsqrt_ps( a.v ); + return b; } - - inline v4float rsqrt( const v4float &a ) { + + inline v4float rsqrt( const v4float &a ) + { v4float b; + __m128 a_v = a.v, b_v; - b_v = _mm_rsqrt_ps(a_v); - // Note: It is quicker to just call div_ps and sqrt_ps if more - // refinement desired! 
- b.v = _mm_add_ps(b_v,_mm_mul_ps(_mm_set1_ps(0.5f), - _mm_sub_ps(b_v,_mm_mul_ps(a_v, - _mm_mul_ps(b_v, - _mm_mul_ps(b_v,b_v)))))); + + b_v = _mm_rsqrt_ps( a_v ); + + b.v = _mm_add_ps( b_v, _mm_mul_ps( _mm_set1_ps( 0.5f ), + _mm_sub_ps( b_v, + _mm_mul_ps( a_v, + _mm_mul_ps( b_v, + _mm_mul_ps( b_v, b_v ) + ) + ) + ) + ) + ); + return b; } - inline v4float rcp_approx( const v4float &a ) { + inline v4float rcp_approx( const v4float &a ) + { v4float b; - b.v = _mm_rcp_ps(a.v); + + b.v = _mm_rcp_ps( a.v ); + return b; } - - inline v4float rcp( const v4float &a ) { + + inline v4float rcp( const v4float &a ) + { v4float b; + __m128 a_v = a.v, b_v; - b_v = _mm_rcp_ps(a_v); - b.v = _mm_sub_ps(_mm_add_ps(b_v,b_v),_mm_mul_ps(a_v,_mm_mul_ps(b_v,b_v))); + + b_v = _mm_rcp_ps( a_v ); + + b.v = _mm_sub_ps( _mm_add_ps( b_v, b_v ), + _mm_mul_ps( a_v, + _mm_mul_ps( b_v, b_v ) + ) + ); + return b; } - inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) + { v4float d; + d.v = _mm_add_ps( _mm_mul_ps( a.v, b.v ), c.v ); + return d; } - inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) + { v4float d; + d.v = _mm_sub_ps( _mm_mul_ps( a.v, b.v ), c.v ); + return d; } - inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) + { v4float d; + d.v = _mm_sub_ps( c.v, _mm_mul_ps( a.v, b.v ) ); + return d; } - inline v4float clear_bits( const v4int &m, const v4float &a ) { + inline v4float clear_bits( const v4int &m, const v4float &a ) + { v4float b; + b.v = _mm_andnot_ps( m.v, a.v ); + return b; } - inline v4float set_bits( const v4int &m, const v4float &a ) { + inline v4float set_bits( const v4int &m, const v4float &a ) + { v4float b; + b.v = _mm_or_ps( m.v, a.v ); + return b; } - inline v4float 
toggle_bits( const v4int &m, const v4float &a ) { + inline v4float toggle_bits( const v4int &m, const v4float &a ) + { v4float b; + b.v = _mm_xor_ps( m.v, a.v ); + return b; } - inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) { + inline void increment_4x1( float * ALIGNED(16) p, + const v4float &a ) + { _mm_store_ps( p, _mm_add_ps( _mm_load_ps( p ), a.v ) ); } - inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) { + inline void decrement_4x1( float * ALIGNED(16) p, + const v4float &a ) + { _mm_store_ps( p, _mm_sub_ps( _mm_load_ps( p ), a.v ) ); } - inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) { + inline void scale_4x1( float * ALIGNED(16) p, + const v4float &a ) + { _mm_store_ps( p, _mm_mul_ps( _mm_load_ps( p ), a.v ) ); } // Given wl = x y z w, compute: // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) - inline void trilinear( v4float &wl, v4float &wh ) { - __m128 l = _mm_set1_ps(1), s = _mm_setr_ps(-0.f,+0.f,-0.f,+0.f); + inline void trilinear( v4float &wl, v4float &wh ) + { + __m128 l = _mm_set1_ps( 1.0f ); + + __m128 s = _mm_setr_ps( -0.0f, +0.0f, -0.0f, +0.0f ); + __m128 z = wl.v, xy; - xy = _mm_add_ps( l, _mm_xor_ps( s, _mm_shuffle_ps( z,z, PERM(0,0,1,1) ) ) ); - z = _mm_add_ps( l, _mm_xor_ps( s, _mm_shuffle_ps( z,z, PERM(2,2,2,2) ) ) ); - xy = _mm_mul_ps( _mm_shuffle_ps( xy,xy, PERM(0,1,0,1) ), - _mm_shuffle_ps( xy,xy, PERM(2,2,3,3) ) ); - wl.v = _mm_mul_ps( xy, _mm_shuffle_ps( z,z, PERM(0,0,0,0) ) ); - wh.v = _mm_mul_ps( xy, _mm_shuffle_ps( z,z, PERM(1,1,1,1) ) ); + + xy = _mm_add_ps( l, + _mm_xor_ps( s, + _mm_shuffle_ps( z, z, PERM(0,0,1,1) ) + ) + ); + + z = _mm_add_ps( l, + _mm_xor_ps( s, + _mm_shuffle_ps( z, z, PERM(2,2,2,2) ) + ) + ); + + xy = _mm_mul_ps( _mm_shuffle_ps( xy, xy, PERM(0,1,0,1) ), + _mm_shuffle_ps( xy, xy, PERM(2,2,3,3) ) ); + + wl.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM(0,0,0,0) 
) ); + + wh.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM(1,1,1,1) ) ); } -# undef PERM + #undef PERM } // namespace v4 diff --git a/src/util/v4/v4_avx2.h b/src/util/v4/v4_avx2.h index 023ba95a..2cab8b9c 100644 --- a/src/util/v4/v4_avx2.h +++ b/src/util/v4/v4_avx2.h @@ -57,17 +57,26 @@ namespace v4 friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; // v4 memory manipulation friends - friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) ALWAYS_INLINE; + + friend inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + + friend inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + friend inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends @@ -152,9 +161,8 @@ namespace v4 inline v4 splat( const v4 & a ) { v4 b; - __m128 a_v = a.v; - b.v = _mm_shuffle_ps( a_v, a_v, ( n*permute<1,1,1,1>::value ) ); + b.v = _mm_shuffle_ps( a.v, a.v, ( n * permute<1,1,1,1>::value ) ); return b; } @@ -163,20 +171,19 @@ namespace v4 inline v4 shuffle( const v4 & a ) { v4 b; - __m128 a_v = a.v; - b.v = _mm_shuffle_ps( a_v, a_v, ( permute::value ) ); + b.v = _mm_shuffle_ps( a.v, a.v, ( 
permute::value ) ); return b; } inline void swap( v4 &a, v4 &b ) - { - __m128 a_v = a.v; + { + __m128 t = a.v; a.v = b.v; - b.v = a_v; + b.v = t; } inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) @@ -230,9 +237,8 @@ namespace v4 _mm_store_ps( ( float * ) dst, _mm_load_ps( ( const float * ) src ) ); } - /* FIXME: MAKE ROBUST AGAINST ALIASING ISSUES */ inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) + void * ALIGNED(16) b ) { __m128 t = _mm_load_ps( ( float * ) a ); @@ -243,9 +249,9 @@ namespace v4 // v4 transposed memory manipulation functions inline void load_4x1_tr( const void *a0, - const void *a1, + const void *a1, const void *a2, - const void *a3, + const void *a3, v4 &a ) { a.v = _mm_setr_ps( ( (const float *) a0 )[0], @@ -259,29 +265,53 @@ namespace v4 const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, v4 &a, - v4 &b ) + v4 &b ) { __m128 a_v, b_v, t; b_v = _mm_setzero_ps(); - t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a0 ), (__m64 *)a1 ); - b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a2 ), (__m64 *)a3 ); + t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *) a0 ), (__m64 *) a1 ); + b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *) a2 ), (__m64 *) a3 ); - a_v = _mm_shuffle_ps( t, b_v, 0x88 ); - b_v = _mm_shuffle_ps( t, b_v, 0xdd ); + a.v = _mm_shuffle_ps( t, b_v, 0x88 ); + b.v = _mm_shuffle_ps( t, b_v, 0xdd ); + } - a.v = a_v; - b.v = b_v; + inline void load_4x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, + v4 &b, + v4 &c ) + { + __m128 r, s, t, u, d_v; + + a.v = _mm_load_ps( (const float *) a0 ); + b.v = _mm_load_ps( (const float *) a1 ); + c.v = _mm_load_ps( (const float *) a2 ); + d_v = _mm_load_ps( (const float *) a3 ); + + r = _mm_unpacklo_ps( a.v, b.v ); + s = _mm_unpackhi_ps( a.v, b.v ); + + t = _mm_unpacklo_ps( c.v, d_v ); + u = _mm_unpackhi_ps( c.v, d_v ); + + a.v = _mm_movelh_ps( r, t ); + b.v = _mm_movehl_ps( t, r ); + c.v = 
_mm_movelh_ps( s, u ); } + #if 0 inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, - v4 &b, - v4 &c ) + v4 &b, + v4 &c ) { __m128 a_v, b_v, c_v, t, u; @@ -303,6 +333,35 @@ namespace v4 b.v = b_v; c.v = c_v; } + #endif + + inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, + v4 &b, + v4 &c, + v4 &d ) + { + __m128 r, s, t, u; + + a.v = _mm_load_ps( (const float *) a0 ); + b.v = _mm_load_ps( (const float *) a1 ); + c.v = _mm_load_ps( (const float *) a2 ); + d.v = _mm_load_ps( (const float *) a3 ); + + r = _mm_unpackhi_ps( a.v, b.v ); + s = _mm_unpacklo_ps( a.v, b.v ); + + t = _mm_unpackhi_ps( c.v, d.v ); + u = _mm_unpacklo_ps( c.v, d.v ); + + a.v = _mm_movelh_ps( s, u ); + b.v = _mm_movehl_ps( u, s ); + c.v = _mm_movelh_ps( r, t ); + d.v = _mm_movehl_ps( t, r ); + } #if 0 inline void load_4x4_tr( const void * ALIGNED(16) a0, @@ -310,24 +369,26 @@ namespace v4 const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, - v4 &b, - v4 &c, - v4 &d ) + v4 &b, + v4 &c, + v4 &d ) { __m128 a_v, b_v, c_v, d_v, t, u; + a_v = _mm_load_ps( (const float *)a0 ); b_v = _mm_load_ps( (const float *)a1 ); c_v = _mm_load_ps( (const float *)a2 ); d_v = _mm_load_ps( (const float *)a3 ); + t = _mm_unpackhi_ps( a_v, b_v ); - a_v = _mm_unpacklo_ps( a_v, b_v ); u = _mm_unpackhi_ps( c_v, d_v ); + a_v = _mm_unpacklo_ps( a_v, b_v ); c_v = _mm_unpacklo_ps( c_v, d_v ); - b_v = _mm_movehl_ps( c_v, a_v ); - a_v = _mm_movelh_ps( a_v, c_v ); - c_v = _mm_movelh_ps( t, u ); - d_v = _mm_movehl_ps( u, t ); - a.v = a_v; b.v = b_v; c.v = c_v; d.v = d_v; + + a.v = _mm_movelh_ps( a_v, c_v ); + c.v = _mm_movelh_ps( t, u ); + b.v = _mm_movehl_ps( c_v, a_v ); + d.v = _mm_movehl_ps( u, t ); } #endif @@ -337,26 +398,24 @@ namespace v4 const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, - v4 &b, - 
v4 &c, - v4 &d ) + v4 &b, + v4 &c, + v4 &d ) { __m128 a_v, b_v, c_v, d_v, t, u; - a_v = _mm_load_ps( (const float *)a0 ); b_v = _mm_load_ps( (const float *)a1 ); c_v = _mm_load_ps( (const float *)a2 ); d_v = _mm_load_ps( (const float *)a3 ); - t = _mm_unpackhi_ps( a_v, b_v ); a_v = _mm_unpacklo_ps( a_v, b_v ); u = _mm_unpackhi_ps( c_v, d_v ); c_v = _mm_unpacklo_ps( c_v, d_v ); - - b.v = _mm_movehl_ps( c_v, a_v ); - a.v = _mm_movelh_ps( a_v, c_v ); - c.v = _mm_movelh_ps( t, u ); - d.v = _mm_movehl_ps( u, t ); + b_v = _mm_movehl_ps( c_v, a_v ); + a_v = _mm_movelh_ps( a_v, c_v ); + c_v = _mm_movelh_ps( t, u ); + d_v = _mm_movehl_ps( u, t ); + a.v = a_v; b.v = b_v; c.v = c_v; d.v = d_v; } #endif @@ -366,9 +425,9 @@ namespace v4 const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, - v4 &b, - v4 &c, - v4 &d ) + v4 &b, + v4 &c, + v4 &d ) { __m128 a_v, b_v, c_v, d_v, t, u; @@ -378,25 +437,26 @@ namespace v4 d_v = _mm_load_ps( (const float *)a3 ); t = _mm_unpackhi_ps( a_v, b_v ); - u = _mm_unpackhi_ps( c_v, d_v ); a_v = _mm_unpacklo_ps( a_v, b_v ); + u = _mm_unpackhi_ps( c_v, d_v ); c_v = _mm_unpacklo_ps( c_v, d_v ); - a.v = _mm_movelh_ps( a_v, c_v ); b.v = _mm_movehl_ps( c_v, a_v ); - d.v = _mm_movehl_ps( u, t ); + a.v = _mm_movelh_ps( a_v, c_v ); c.v = _mm_movelh_ps( t, u ); + d.v = _mm_movehl_ps( u, t ); } #endif + #if 0 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, - v4 &b, - v4 &c, - v4 &d ) + v4 &b, + v4 &c, + v4 &d ) { __m128 a_v, b_v, c_v, d_v, t, u; @@ -411,16 +471,17 @@ namespace v4 c_v = _mm_unpacklo_ps( c_v, d_v ); a.v = _mm_movelh_ps( a_v, c_v ); - c.v = _mm_movelh_ps( t, u ); b.v = _mm_movehl_ps( c_v, a_v ); d.v = _mm_movehl_ps( u, t ); + c.v = _mm_movelh_ps( t, u ); } + #endif inline void store_4x1_tr( const v4 &a, void *a0, - void *a1, + void *a1, void *a2, - void *a3 ) + void *a3 ) { ( (float *) a0 )[0] = a.f[0]; ( (float *) a1 )[0] = a.f[1]; @@ 
-429,77 +490,76 @@ namespace v4 } inline void store_4x2_tr( const v4 &a, - const v4 &b, + const v4 &b, void * ALIGNED(8) a0, - void * ALIGNED(8) a1, + void * ALIGNED(8) a1, void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ) + void * ALIGNED(8) a3 ) { - __m128 a_v = a.v, b_v = b.v, t; + __m128 t; - t = _mm_unpacklo_ps( a_v, b_v ); // a0 b0 a1 b1 -> t + t = _mm_unpacklo_ps( a.v, b.v ); // a0 b0 a1 b1 -> t - _mm_storel_pi( (__m64 *)a0, t ); // a0 b0 -> a0 - _mm_storeh_pi( (__m64 *)a1, t ); // a1 b1 -> a1 + _mm_storel_pi( (__m64 *) a0, t ); // a0 b0 -> a0 + _mm_storeh_pi( (__m64 *) a1, t ); // a1 b1 -> a1 - t = _mm_unpackhi_ps( a_v, b_v ); // a2 b2 a3 b3 -> t + t = _mm_unpackhi_ps( a.v, b.v ); // a2 b2 a3 b3 -> t - _mm_storel_pi( (__m64 *)a2, t ); // a2 b2 -> a2 - _mm_storeh_pi( (__m64 *)a3, t ); // a3 b3 -> a3 + _mm_storel_pi( (__m64 *) a2, t ); // a2 b2 -> a2 + _mm_storeh_pi( (__m64 *) a3, t ); // a3 b3 -> a3 } inline void store_4x3_tr( const v4 &a, - const v4 &b, - const v4 &c, + const v4 &b, + const v4 &c, void * ALIGNED(16) a0, - void * ALIGNED(16) a1, + void * ALIGNED(16) a1, void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) + void * ALIGNED(16) a3 ) { - __m128 a_v = a.v, b_v = b.v, t; + __m128 t; - t = _mm_unpacklo_ps( a_v, b_v ); // a0 b0 a1 b1 -> t + t = _mm_unpacklo_ps( a.v, b.v ); // a0 b0 a1 b1 -> t - _mm_storel_pi( (__m64 *)a0, t ); // a0 b0 -> a0 - _mm_storeh_pi( (__m64 *)a1, t ); // a1 b1 -> a1 + _mm_storel_pi( (__m64 *) a0, t ); // a0 b0 -> a0 + _mm_storeh_pi( (__m64 *) a1, t ); // a1 b1 -> a1 - t = _mm_unpackhi_ps( a_v, b_v ); // a2 b2 a3 b3 -> t + t = _mm_unpackhi_ps( a.v, b.v ); // a2 b2 a3 b3 -> t - _mm_storel_pi( (__m64 *)a2, t ); // a2 b2 -> a2 - _mm_storeh_pi( (__m64 *)a3, t ); // a3 b3 -> a3 + _mm_storel_pi( (__m64 *) a2, t ); // a2 b2 -> a2 + _mm_storeh_pi( (__m64 *) a3, t ); // a3 b3 -> a3 - ((float *)a0)[2] = c.f[0]; - ((float *)a1)[2] = c.f[1]; - ((float *)a2)[2] = c.f[2]; - ((float *)a3)[2] = c.f[3]; + ( (float *) a0 )[2] = c.f[0]; + ( (float *) 
a1 )[2] = c.f[1]; + ( (float *) a2 )[2] = c.f[2]; + ( (float *) a3 )[2] = c.f[3]; } - // FIXME: IS THIS FASTER THAN THE OLD WAY (HAD MORE STORE INSTR) inline void store_4x4_tr( const v4 &a, - const v4 &b, + const v4 &b, const v4 &c, - const v4 &d, + const v4 &d, void * ALIGNED(16) a0, - void * ALIGNED(16) a1, + void * ALIGNED(16) a1, void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) + void * ALIGNED(16) a3 ) { - __m128 a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; + __m128 a_v, b_v, c_v, d_v, t, u; - t = _mm_unpackhi_ps( a_v, b_v ); - a_v = _mm_unpacklo_ps( a_v, b_v ); - u = _mm_unpackhi_ps( c_v, d_v ); - c_v = _mm_unpacklo_ps( c_v, d_v ); + t = _mm_unpackhi_ps( a.v, b.v ); + a_v = _mm_unpacklo_ps( a.v, b.v ); + u = _mm_unpackhi_ps( c.v, d.v ); + c_v = _mm_unpacklo_ps( c.v, d.v ); b_v = _mm_movehl_ps( c_v, a_v ); a_v = _mm_movelh_ps( a_v, c_v ); c_v = _mm_movelh_ps( t, u ); d_v = _mm_movehl_ps( u, t ); - _mm_store_ps( (float *)a0, a_v ); - _mm_store_ps( (float *)a1, b_v ); - _mm_store_ps( (float *)a2, c_v ); - _mm_store_ps( (float *)a3, d_v ); + _mm_store_ps( (float *) a0, a_v ); + _mm_store_ps( (float *) a1, b_v ); + _mm_store_ps( (float *) a2, c_v ); + _mm_store_ps( (float *) a3, d_v ); } ////////////// @@ -644,6 +704,8 @@ namespace v4 ASSIGN(<<=) ASSIGN(>>=) + #undef ASSIGN + inline v4int &operator =( const v4int &b ) { v = b.v; @@ -672,8 +734,6 @@ namespace v4 return *this; } - #undef ASSIGN - // v4int member access operator inline int &operator []( int n ) @@ -690,7 +750,7 @@ namespace v4 // v4int prefix unary operators #define PREFIX_UNARY(op) \ - inline v4int operator op( const v4int & a ) \ + inline v4int operator op( const v4int &a ) \ { \ v4int b; \ b.i[0] = ( op a.i[0] ); \ @@ -700,7 +760,7 @@ namespace v4 return b; \ } - inline v4int operator +( const v4int & a ) + inline v4int operator +( const v4int &a ) { v4int b; @@ -711,19 +771,19 @@ namespace v4 PREFIX_UNARY(-) - inline v4int operator !( const v4int & a ) + inline v4int operator !( const v4int 
&a ) { v4int b; - b.i[0] = - ( !a.i[0] ); - b.i[1] = - ( !a.i[1] ); - b.i[2] = - ( !a.i[2] ); - b.i[3] = - ( !a.i[3] ); + b.i[0] = - ( ! a.i[0] ); + b.i[1] = - ( ! a.i[1] ); + b.i[2] = - ( ! a.i[2] ); + b.i[3] = - ( ! a.i[3] ); return b; } - inline v4int operator ~( const v4int & a ) + inline v4int operator ~( const v4int &a ) { v4int b; @@ -734,6 +794,7 @@ namespace v4 } u; u.i = -1; + b.v = _mm_xor_ps( a.v, _mm_set1_ps( u.f ) ); return b; @@ -744,7 +805,7 @@ namespace v4 // v4int prefix increment / decrement #define PREFIX_INCDEC(op) \ - inline v4int operator op( v4int & a ) \ + inline v4int operator op( v4int &a ) \ { \ v4int b; \ b.i[0] = ( op a.i[0] ); \ @@ -762,7 +823,7 @@ namespace v4 // v4int postfix increment / decrement #define POSTFIX_INCDEC(op) \ - inline v4int operator op( v4int & a, int ) \ + inline v4int operator op( v4int &a, int ) \ { \ v4int b; \ b.i[0] = ( a.i[0] op ); \ @@ -798,6 +859,8 @@ namespace v4 BINARY(<<) BINARY(>>) + #undef BINARY + inline v4int operator ^( const v4int &a, const v4int &b ) { v4int c; @@ -825,8 +888,6 @@ namespace v4 return c; } - #undef BINARY - // v4int logical operators #define LOGICAL(op) \ @@ -885,9 +946,10 @@ namespace v4 inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) { - __m128 c_v = c.v; v4 tf; + __m128 c_v = c.v; + tf.v = _mm_or_ps( _mm_andnot_ps( c_v, f.v ), _mm_and_ps( c_v, t.v ) ); @@ -1011,6 +1073,8 @@ namespace v4 ASSIGN( *=, _mm_mul_ps ) ASSIGN( /=, _mm_div_ps ) + #undef ASSIGN + inline v4float &operator =( const v4float &b ) { v = b.v; @@ -1018,8 +1082,6 @@ namespace v4 return *this; } - #undef ASSIGN - // v4float member access operator inline float &operator []( int n ) @@ -1116,7 +1178,7 @@ namespace v4 // v4float binary operators -# define BINARY(op,intrin) \ + #define BINARY(op,intrin) \ inline v4float operator op( const v4float &a, const v4float &b ) \ { \ v4float c; \ @@ -1129,7 +1191,7 @@ namespace v4 BINARY( *, _mm_mul_ps ) BINARY( /, _mm_div_ps ) -# undef BINARY + #undef BINARY // 
v4float logical operators @@ -1141,12 +1203,14 @@ namespace v4 return c; \ } - LOGICAL( <, _mm_cmplt_ps ) - LOGICAL( >, _mm_cmpgt_ps ) - LOGICAL( ==, _mm_cmpeq_ps ) + LOGICAL( <, _mm_cmplt_ps ) + LOGICAL( >, _mm_cmpgt_ps ) + LOGICAL( ==, _mm_cmpeq_ps ) + LOGICAL( <=, _mm_cmple_ps ) + LOGICAL( >=, _mm_cmpge_ps ) LOGICAL( !=, _mm_cmpneq_ps ) - LOGICAL( <=, _mm_cmple_ps ) - LOGICAL( >=, _mm_cmpge_ps ) + + #undef LOGICAL inline v4int operator &&( const v4float &a, const v4float &b ) { @@ -1172,8 +1236,6 @@ namespace v4 return c; } - #undef LOGICAL - // v4float math library functions #define CMATH_FR1(fn) \ @@ -1204,6 +1266,9 @@ namespace v4 CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) /*CMATH_FR1(sqrt)*/ CMATH_FR1(tan) CMATH_FR1(tanh) + #undef CMATH_FR1 + #undef CMATH_FR2 + inline v4float fabs( const v4float &a ) { v4float b; @@ -1234,13 +1299,10 @@ namespace v4 return c; } - #undef CMATH_FR1 - #undef CMATH_FR2 - - // v4float miscelleanous functions + // v4float miscellaneous functions inline v4float rsqrt_approx( const v4float &a ) - { + { v4float b; b.v = _mm_rsqrt_ps( a.v ); @@ -1248,6 +1310,24 @@ namespace v4 return b; } + inline v4float rsqrt( const v4float &a ) + { + v4float b; + + __m128 b_v; + + b_v = _mm_rsqrt_ps( a.v ); + + b.v = _mm_fmadd_ps( _mm_set1_ps( 0.5f ), + _mm_fnmadd_ps( a.v, + _mm_mul_ps( b_v, + _mm_mul_ps( b_v, b_v ) ), + b_v ), + b_v ); + + return b; + } + #if 0 inline v4float rsqrt( const v4float &a ) { @@ -1257,8 +1337,6 @@ namespace v4 b_v = _mm_rsqrt_ps( a_v ); - // Note: It is quicker to just call div_ps and sqrt_ps if more - // refinement desired! b.v = _mm_add_ps( b_v, _mm_mul_ps( _mm_set1_ps( 0.5f ), _mm_sub_ps( b_v, _mm_mul_ps( a_v, @@ -1283,9 +1361,6 @@ namespace v4 b_v = _mm_rsqrt_ps( a_v ); - // Note: It is quicker to just call div_ps and sqrt_ps if more - // refinement desired! 
- b.v = _mm_fmadd_ps( _mm_set1_ps( 0.5f ), _mm_fnmadd_ps( a_v, _mm_mul_ps( b_v, @@ -1297,32 +1372,26 @@ namespace v4 } #endif - inline v4float rsqrt( const v4float &a ) + inline v4float rcp_approx( const v4float &a ) { v4float b; - __m128 b_v; - - b_v = _mm_rsqrt_ps( a.v ); - - // Note: It is quicker to just call div_ps and sqrt_ps if more - // refinement desired! - - b.v = _mm_fmadd_ps( _mm_set1_ps( 0.5f ), - _mm_fnmadd_ps( a.v, - _mm_mul_ps( b_v, - _mm_mul_ps( b_v, b_v ) ), - b_v ), - b_v ); + b.v = _mm_rcp_ps( a.v ); return b; } - inline v4float rcp_approx( const v4float &a ) + inline v4float rcp( const v4float &a ) { v4float b; - b.v = _mm_rcp_ps( a.v ); + __m128 b_v; + + b_v = _mm_rcp_ps( a.v ); + + b.v = _mm_fnmadd_ps( a.v, + _mm_mul_ps( b_v, b_v ), + _mm_add_ps( b_v, b_v ) ); return b; } @@ -1363,21 +1432,6 @@ namespace v4 } #endif - inline v4float rcp( const v4float &a ) - { - v4float b; - - __m128 b_v; - - b_v = _mm_rcp_ps( a.v ); - - b.v = _mm_fnmadd_ps( a.v, - _mm_mul_ps( b_v, b_v ), - _mm_add_ps( b_v, b_v ) ); - - return b; - } - inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) { v4float d; @@ -1432,17 +1486,20 @@ namespace v4 return b; } - inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void increment_4x1( float * ALIGNED(16) p, + const v4float &a ) { _mm_store_ps( p, _mm_add_ps( _mm_load_ps( p ), a.v ) ); } - inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void decrement_4x1( float * ALIGNED(16) p, + const v4float &a ) { _mm_store_ps( p, _mm_sub_ps( _mm_load_ps( p ), a.v ) ); } - inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void scale_4x1( float * ALIGNED(16) p, + const v4float &a ) { _mm_store_ps( p, _mm_mul_ps( _mm_load_ps( p ), a.v ) ); } @@ -1452,22 +1509,33 @@ namespace v4 // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) inline void trilinear( v4float &wl, v4float &wh ) { - __m128 l = _mm_set1_ps( 1.0f ), s = 
_mm_setr_ps( -0.0f, +0.0f, -0.0f, +0.0f ); + __m128 l = _mm_set1_ps( 1.0f ); + + __m128 s = _mm_setr_ps( -0.0f, +0.0f, -0.0f, +0.0f ); + __m128 z = wl.v, xy; - xy = _mm_add_ps( l, _mm_xor_ps( s, _mm_shuffle_ps( z,z, PERM(0,0,1,1) ) ) ); + xy = _mm_add_ps( l, + _mm_xor_ps( s, + _mm_shuffle_ps( z, z, PERM(0,0,1,1) ) + ) + ); - z = _mm_add_ps( l, _mm_xor_ps( s, _mm_shuffle_ps( z,z, PERM(2,2,2,2) ) ) ); + z = _mm_add_ps( l, + _mm_xor_ps( s, + _mm_shuffle_ps( z, z, PERM(2,2,2,2) ) + ) + ); - xy = _mm_mul_ps( _mm_shuffle_ps( xy,xy, PERM(0,1,0,1) ), - _mm_shuffle_ps( xy,xy, PERM(2,2,3,3) ) ); + xy = _mm_mul_ps( _mm_shuffle_ps( xy, xy, PERM(0,1,0,1) ), + _mm_shuffle_ps( xy, xy, PERM(2,2,3,3) ) ); - wl.v = _mm_mul_ps( xy, _mm_shuffle_ps( z,z, PERM(0,0,0,0) ) ); + wl.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM(0,0,0,0) ) ); - wh.v = _mm_mul_ps( xy, _mm_shuffle_ps( z,z, PERM(1,1,1,1) ) ); + wh.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM(1,1,1,1) ) ); } -# undef PERM + #undef PERM } // namespace v4 diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 22e8dff6..0152ad2b 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -61,17 +61,26 @@ namespace v4 friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; // v4 memory manipulation friends - friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) ALWAYS_INLINE; + + friend inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + + friend inline void stream_4x1( const v4 &a, 
+ void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + friend inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends @@ -168,10 +177,6 @@ namespace v4 v4( const v4 &a ) // Copy constructor { v = a.v; - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // i[j] = a.i[j]; } ~v4() {} // Default destructor @@ -193,10 +198,6 @@ namespace v4 inline v4 splat( const v4 & a ) { v4 b; - // __m128 a_v = a.v; - - // b.v = _mm_shuffle_ps( a_v, a_v, ( n*permute<1,1,1,1>::value ) ); - ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) @@ -209,9 +210,6 @@ namespace v4 inline v4 shuffle( const v4 & a ) { v4 b; - // __m128 a_v = a.v; - - // b.v = _mm_shuffle_ps( a_v, a_v, ( permute::value ) ); b.i[0] = a.i[i0]; b.i[1] = a.i[i1]; @@ -271,93 +269,6 @@ namespace v4 } #endif - #if 0 - inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) - { - float32x4_t a0_v, a2_v, t, u; - - //----------------------------------------------------------------- - float32x2_t a0_vh = vget_high_f32( a0.v ); - float32x2_t a1_vh = vget_high_f32( a1.v ); - - float32x2x2_t res_a0a1_h = vzip_f32( a0_vh, a1_vh ); - - t = vcombine_f32( res_a0a1_h.val[0], res_a0a1_h.val[1] ); - //----------------------------------------------------------------- - // t = _mm_unpackhi_ps( a0.v, a1.v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t a0_vl = vget_low_f32( a0.v ); - float32x2_t a1_vl = vget_low_f32( a1.v ); - - float32x2x2_t res_a0a1_l = vzip_f32( a0_vl, a1_vl ); - - a0_v = vcombine_f32( res_a0a1_l.val[0], res_a0a1_l.val[1] ); - 
//----------------------------------------------------------------- - // a0_v = _mm_unpacklo_ps( a0.v, a1.v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t a2_vh = vget_high_f32( a2.v ); - float32x2_t a3_vh = vget_high_f32( a3.v ); - - float32x2x2_t res_a2a3_h = vzip_f32( a2_vh, a3_vh ); - - u = vcombine_f32( res_a2a3_h.val[0], res_a2a3_h.val[1] ); - //----------------------------------------------------------------- - // u = _mm_unpackhi_ps( a2.v, a3.v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t a2_vl = vget_low_f32( a2.v ); - float32x2_t a3_vl = vget_low_f32( a3.v ); - - float32x2x2_t res_a2a3_l = vzip_f32( a2_vl, a3_vl ); - - a2_v = vcombine_f32( res_a2a3_l.val[0], res_a2a3_l.val[1] ); - //----------------------------------------------------------------- - // a2_v = _mm_unpacklo_ps( a2.v, a3.v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - a0.v[0] = a0_v[0]; - a0.v[1] = a0_v[1]; - a0.v[2] = a2_v[0]; - a0.v[3] = a2_v[1]; - //----------------------------------------------------------------- - // a0.v = _mm_movelh_ps( a0_v, a2_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - a1.v[0] = a0_v[2]; - a1.v[1] = a0_v[3]; - a1.v[2] = a2_v[2]; - a1.v[3] = a2_v[3]; - //----------------------------------------------------------------- - // a1.v = _mm_movehl_ps( a2_v, a0_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - a2.v[0] = t[0]; - a2.v[1] = t[1]; - a2.v[2] = u[0]; - a2.v[3] = u[1]; - //----------------------------------------------------------------- - // 
a2.v = _mm_movelh_ps( t, u ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - a3.v[0] = t[2]; - a3.v[1] = t[3]; - a3.v[2] = u[2]; - a3.v[3] = u[3]; - //----------------------------------------------------------------- - // a3.v = _mm_movehl_ps( u, t ); - //----------------------------------------------------------------- - } - #endif - #if 0 // Portable version. inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) @@ -376,58 +287,31 @@ namespace v4 v4 &a ) { a.v = vld1q_f32( ( float * ) p ); - - // a.v = _mm_load_ps( ( float * ) p ); - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // a.i[j] = ((const int * ALIGNED(16))p)[j]; } inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) { vst1q_f32( ( float * ) p, a.v ); - - // _mm_store_ps( ( float * ) p, a.v ); - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // ((int * ALIGNED(16))p)[j] = a.i[j]; } inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) { - // _mm_stream_ps( ( float * ) p, a.v ); - ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - ((int * ALIGNED(16))p)[j] = a.i[j]; + ( (int * ALIGNED(16) ) p )[j] = a.i[j]; } inline void clear_4x1( void * ALIGNED(16) p ) { vst1q_f32( ( float * ) p, vdupq_n_f32( 0.0f ) ); - - // _mm_store_ps( ( float * ) p, _mm_setzero_ps() ); - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // ((int * ALIGNED(16))p)[j] = 0; } - // FIXME: Ordering semantics inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) src ) { vst1q_f32( ( float * ) dst, vld1q_f32( ( const float * ) src ) ); - - // _mm_store_ps( ( float * ) dst, _mm_load_ps( ( const float * ) src ) ); - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // ((int * ALIGNED(16))dst)[j] = ((const int * ALIGNED(16))src)[j]; } inline void swap_4x1( void * ALIGNED(16) a, @@ -437,54 +321,28 @@ namespace v4 vst1q_f32( ( float * ) a, vld1q_f32( ( float * ) b ) ); vst1q_f32( ( 
float * ) b, t ); - - // __m128 t = _mm_load_ps( ( float * ) a ); - - // _mm_store_ps( ( float * ) a, _mm_load_ps( ( float * ) b ) ); - // _mm_store_ps( ( float * ) b, t ); - - // int t; - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // { - // t = ((int * ALIGNED(16))a)[j]; - // ((int * ALIGNED(16))a)[j] = ((int * ALIGNED(16))b)[j]; - // ((int * ALIGNED(16))b)[j] = t; - // } } // v4 transposed memory manipulation functions inline void load_4x1_tr( const void *a0, - const void *a1, + const void *a1, const void *a2, - const void *a3, + const void *a3, v4 &a ) { - // a.v = _mm_setr_ps( ( (const float *) a0 )[0], - // ( (const float *) a1 )[0], - // ( (const float *) a2 )[0], - // ( (const float *) a3 )[0] ); - - // Not correct. - // float32x4x4_t mat = vld4q_f32( (const float *) a0 ); - - // a.v = mat.val[0]; - - a.i[0] = ((const int *)a0)[0]; - a.i[1] = ((const int *)a1)[0]; - a.i[2] = ((const int *)a2)[0]; - a.i[3] = ((const int *)a3)[0]; + a.i[0] = ( (const int *) a0 )[0]; + a.i[1] = ( (const int *) a1 )[0]; + a.i[2] = ( (const int *) a2 )[0]; + a.i[3] = ( (const int *) a3 )[0]; } - #if 1 inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, v4 &a, - v4 &b ) + v4 &b ) { float32x4_t r, s, t, u, a2_v, a3_v; @@ -502,92 +360,41 @@ namespace v4 a.v = vtrn1q_f64( r, t ); b.v = vtrn1q_f64( s, u ); } - #endif - - #if 0 - // Portable version. 
- inline void load_4x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - v4 &a, - v4 &b ) - { - a.i[0] = ( ( const int * ALIGNED(8) ) a0 )[0]; - b.i[0] = ( ( const int * ALIGNED(8) ) a0 )[1]; - - a.i[1] = ( ( const int * ALIGNED(8) ) a1 )[0]; - b.i[1] = ( ( const int * ALIGNED(8) ) a1 )[1]; - - a.i[2] = ( ( const int * ALIGNED(8) ) a2 )[0]; - b.i[2] = ( ( const int * ALIGNED(8) ) a2 )[1]; - - a.i[3] = ( ( const int * ALIGNED(8) ) a3 )[0]; - b.i[3] = ( ( const int * ALIGNED(8) ) a3 )[1]; - } - #endif inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, - v4 &b, - v4 &c ) + v4 &b, + v4 &c ) { - // __m128 a_v, b_v, c_v, t, u; - - // t = _mm_load_ps( (const float *)a0 ); - // b_v = _mm_load_ps( (const float *)a1 ); - // c_v = _mm_load_ps( (const float *)a2 ); - // u = _mm_load_ps( (const float *)a3 ); - - // a_v = _mm_unpacklo_ps( t, b_v ); - // b_v = _mm_unpackhi_ps( t, b_v ); - // t = _mm_unpacklo_ps( c_v, u ); - // u = _mm_unpackhi_ps( c_v, u ); + float32x4_t r, s, t, u, d_v; - // c_v = _mm_movelh_ps( b_v, u ); - // b_v = _mm_movehl_ps( t, a_v ); - // a_v = _mm_movelh_ps( a_v, t ); - - // a.v = a_v; - // b.v = b_v; - // c.v = c_v; - - // Not correct. 
- // float32x4x4_t mat = vld4q_f32( (const float *) a0 ); - - // a.v = mat.val[0]; - // b.v = mat.val[1]; - // c.v = mat.val[2]; - - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; + a.v = vld1q_f32( (const float *) a0 ); + b.v = vld1q_f32( (const float *) a1 ); + c.v = vld1q_f32( (const float *) a2 ); + d_v = vld1q_f32( (const float *) a3 ); - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; + r = vtrn1q_f32( a.v, b.v ); + s = vtrn2q_f32( a.v, b.v ); - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; + t = vtrn1q_f32( c.v, d_v ); + u = vtrn2q_f32( c.v, d_v ); - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; + a.v = vtrn1q_f64( r, t ); + b.v = vtrn1q_f64( s, u ); + c.v = vtrn2q_f64( r, t ); } - #if 1 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, - v4 &b, - v4 &c, - v4 &d ) + v4 &b, + v4 &c, + v4 &d ) { float32x4_t r, s, t, u; @@ -607,150 +414,6 @@ namespace v4 c.v = vtrn2q_f64( r, t ); d.v = vtrn2q_f64( s, u ); } - #endif - - #if 0 - inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c, - v4 &d ) - { - //----------------------------------------------------------------- - float32x4_t a_v, b_v, c_v, d_v, t, u; - //----------------------------------------------------------------- - // __m128 a_v, b_v, c_v, d_v, t, u; - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - a_v = vld1q_f32( (const float *) a0 ); - b_v = vld1q_f32( 
(const float *) a1 ); - c_v = vld1q_f32( (const float *) a2 ); - d_v = vld1q_f32( (const float *) a3 ); - //----------------------------------------------------------------- - // a_v = _mm_load_ps( (const float *) a0 ); - // b_v = _mm_load_ps( (const float *) a1 ); - // c_v = _mm_load_ps( (const float *) a2 ); - // d_v = _mm_load_ps( (const float *) a3 ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t a_vh = vget_high_f32( a_v ); - float32x2_t b_vh = vget_high_f32( b_v ); - - float32x2x2_t res_ab_h = vzip_f32( a_vh, b_vh ); - - t = vcombine_f32( res_ab_h.val[0], res_ab_h.val[1] ); - //----------------------------------------------------------------- - // t = _mm_unpackhi_ps( a_v, b_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t c_vh = vget_high_f32( c_v ); - float32x2_t d_vh = vget_high_f32( d_v ); - - float32x2x2_t res_cd_h = vzip_f32( c_vh, d_vh ); - - u = vcombine_f32( res_cd_h.val[0], res_cd_h.val[1] ); - //----------------------------------------------------------------- - // u = _mm_unpackhi_ps( c_v, d_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t a_vl = vget_low_f32( a_v ); - float32x2_t b_vl = vget_low_f32( b_v ); - - float32x2x2_t res_ab_l = vzip_f32( a_vl, b_vl ); - - a_v = vcombine_f32( res_ab_l.val[0], res_ab_l.val[1] ); - //----------------------------------------------------------------- - // a_v = _mm_unpacklo_ps( a_v, b_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t c_vl = vget_low_f32( c_v ); - float32x2_t d_vl = vget_low_f32( d_v ); - - float32x2x2_t res_cd_l = vzip_f32( c_vl, d_vl ); - - c_v = 
vcombine_f32( res_cd_l.val[0], res_cd_l.val[1] ); - //----------------------------------------------------------------- - // c_v = _mm_unpacklo_ps( c_v, d_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - a.v[0] = a_v[0]; - a.v[1] = a_v[1]; - a.v[2] = c_v[0]; - a.v[3] = c_v[1]; - //----------------------------------------------------------------- - // a.v = _mm_movelh_ps( a_v, c_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - c.v[0] = t[0]; - c.v[1] = t[1]; - c.v[2] = u[0]; - c.v[3] = u[1]; - //----------------------------------------------------------------- - // c.v = _mm_movelh_ps( t, u ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - b.v[0] = a_v[2]; - b.v[1] = a_v[3]; - b.v[2] = c_v[2]; - b.v[3] = c_v[3]; - //----------------------------------------------------------------- - // b.v = _mm_movehl_ps( c_v, a_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - d.v[0] = t[2]; - d.v[1] = t[3]; - d.v[2] = u[2]; - d.v[3] = u[3]; - //----------------------------------------------------------------- - // d.v = _mm_movehl_ps( u, t ); - //----------------------------------------------------------------- - } - #endif - - #if 0 - // Portable version. 
- inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c, - v4 &d ) - { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - d.i[0] = ((const int * ALIGNED(16))a0)[3]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - d.i[1] = ((const int * ALIGNED(16))a1)[3]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - d.i[2] = ((const int * ALIGNED(16))a2)[3]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - d.i[3] = ((const int * ALIGNED(16))a3)[3]; - } - #endif #if 1 inline void load_4x8_tr( const void * ALIGNED(16) a0, @@ -853,9 +516,9 @@ namespace v4 inline void store_4x1_tr( const v4 &a, void *a0, - void *a1, + void *a1, void *a2, - void *a3 ) + void *a3 ) { ( (int *) a0 )[0] = a.i[0]; ( (int *) a1 )[0] = a.i[1]; @@ -864,11 +527,11 @@ namespace v4 } inline void store_4x2_tr( const v4 &a, - const v4 &b, + const v4 &b, void * ALIGNED(8) a0, - void * ALIGNED(8) a1, + void * ALIGNED(8) a1, void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ) + void * ALIGNED(8) a3 ) { // __m128 a_v = a.v, b_v = b.v, t; @@ -896,12 +559,12 @@ namespace v4 } inline void store_4x3_tr( const v4 &a, - const v4 &b, - const v4 &c, + const v4 &b, + const v4 &c, void * ALIGNED(16) a0, - void * ALIGNED(16) a1, + void * ALIGNED(16) a1, void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) + void * ALIGNED(16) a3 ) { // __m128 a_v = a.v, b_v = b.v, t; @@ -937,15 +600,14 @@ namespace v4 ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; } - #if 1 inline void store_4x4_tr( const v4 &a, - const v4 &b, - const v4 &c, - const v4 &d, + const v4 &b, + const v4 &c, + 
const v4 &d, void * ALIGNED(16) a0, - void * ALIGNED(16) a1, + void * ALIGNED(16) a1, void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) + void * ALIGNED(16) a3 ) { float32x4_t r, s, t, u; @@ -960,184 +622,20 @@ namespace v4 vst1q_f32( (float *) a2, vtrn2q_f64( r, t ) ); vst1q_f32( (float *) a3, vtrn2q_f64( s, u ) ); } - #endif - - #if 0 - inline void store_4x4_tr( const v4 &a, - const v4 &b, - const v4 &c, - const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { - //----------------------------------------------------------------- - float32x4_t a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; - //----------------------------------------------------------------- - // __m128 a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t a_vh = vget_high_f32( a_v ); - float32x2_t b_vh = vget_high_f32( b_v ); - - float32x2x2_t res_ab_h = vzip_f32( a_vh, b_vh ); - - t = vcombine_f32( res_ab_h.val[0], res_ab_h.val[1] ); - //----------------------------------------------------------------- - // t = _mm_unpackhi_ps( a_v, b_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t a_vl = vget_low_f32( a_v ); - float32x2_t b_vl = vget_low_f32( b_v ); - - float32x2x2_t res_ab_l = vzip_f32( a_vl, b_vl ); - - a_v = vcombine_f32( res_ab_l.val[0], res_ab_l.val[1] ); - //----------------------------------------------------------------- - // a_v = _mm_unpacklo_ps( a_v, b_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t c_vh = vget_high_f32( c_v ); - float32x2_t d_vh = vget_high_f32( d_v ); - - float32x2x2_t res_cd_h = vzip_f32( c_vh, d_vh ); - - u = vcombine_f32( 
res_cd_h.val[0], res_cd_h.val[1] ); - //----------------------------------------------------------------- - // u = _mm_unpackhi_ps( c_v, d_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t c_vl = vget_low_f32( c_v ); - float32x2_t d_vl = vget_low_f32( d_v ); - - float32x2x2_t res_cd_l = vzip_f32( c_vl, d_vl ); - - c_v = vcombine_f32( res_cd_l.val[0], res_cd_l.val[1] ); - //----------------------------------------------------------------- - // c_v = _mm_unpacklo_ps( c_v, d_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - b_v[0] = a_v[2]; - b_v[1] = a_v[3]; - b_v[2] = c_v[2]; - b_v[3] = c_v[3]; - //----------------------------------------------------------------- - // b_v = _mm_movehl_ps( c_v, a_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - a_v[0] = a_v[0]; - a_v[1] = a_v[1]; - a_v[2] = c_v[0]; - a_v[3] = c_v[1]; - //----------------------------------------------------------------- - // a_v = _mm_movelh_ps( a_v, c_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - c_v[0] = t[0]; - c_v[1] = t[1]; - c_v[2] = u[0]; - c_v[3] = u[1]; - //----------------------------------------------------------------- - // c_v = _mm_movelh_ps( t, u ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - d_v[0] = t[2]; - d_v[1] = t[3]; - d_v[2] = u[2]; - d_v[3] = u[3]; - //----------------------------------------------------------------- - // d_v = _mm_movehl_ps( u, t ); - //----------------------------------------------------------------- - - 
//----------------------------------------------------------------- - vst1q_f32( (float *) a0, a_v ); - vst1q_f32( (float *) a1, b_v ); - vst1q_f32( (float *) a2, c_v ); - vst1q_f32( (float *) a3, d_v ); - //----------------------------------------------------------------- - // _mm_store_ps( (float *) a0, a_v ); - // _mm_store_ps( (float *) a1, b_v ); - // _mm_store_ps( (float *) a2, c_v ); - // _mm_store_ps( (float *) a3, d_v ); - //----------------------------------------------------------------- - - // ((int * ALIGNED(16))a0)[0] = a.i[0]; - // ((int * ALIGNED(16))a0)[1] = b.i[0]; - // ((int * ALIGNED(16))a0)[2] = c.i[0]; - // ((int * ALIGNED(16))a0)[3] = d.i[0]; - - // ((int * ALIGNED(16))a1)[0] = a.i[1]; - // ((int * ALIGNED(16))a1)[1] = b.i[1]; - // ((int * ALIGNED(16))a1)[2] = c.i[1]; - // ((int * ALIGNED(16))a1)[3] = d.i[1]; - - // ((int * ALIGNED(16))a2)[0] = a.i[2]; - // ((int * ALIGNED(16))a2)[1] = b.i[2]; - // ((int * ALIGNED(16))a2)[2] = c.i[2]; - // ((int * ALIGNED(16))a2)[3] = d.i[2]; - - // ((int * ALIGNED(16))a3)[0] = a.i[3]; - // ((int * ALIGNED(16))a3)[1] = b.i[3]; - // ((int * ALIGNED(16))a3)[2] = c.i[3]; - // ((int * ALIGNED(16))a3)[3] = d.i[3]; - } - #endif - - #if 0 - // Portable version. 
- inline void store_4x4_tr( const v4 &a, - const v4 &b, - const v4 &c, - const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { - ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; - ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; - ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; - ( ( int * ALIGNED(16) ) a0 )[3] = d.i[0]; - - ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; - ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; - ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; - ( ( int * ALIGNED(16) ) a1 )[3] = d.i[1]; - - ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; - ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; - ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; - ( ( int * ALIGNED(16) ) a2 )[3] = d.i[2]; - - ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; - ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; - ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; - ( ( int * ALIGNED(16) ) a3 )[3] = d.i[3]; - } - #endif #if 1 inline void store_4x8_tr( const v4 &b00, - const v4 &b01, - const v4 &b02, - const v4 &b03, - const v4 &b04, - const v4 &b05, - const v4 &b06, - const v4 &b07, + const v4 &b01, + const v4 &b02, + const v4 &b03, + const v4 &b04, + const v4 &b05, + const v4 &b06, + const v4 &b07, void * ALIGNED(16) a0, - void * ALIGNED(16) a1, + void * ALIGNED(16) a1, void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) + void * ALIGNED(16) a3 ) { float32x4x4_t mat0, mat2; @@ -1241,19 +739,11 @@ namespace v4 v4int( const v4int &a ) // Copy constructor { v = a.v; - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // i[j] = a.i[j]; } v4int( const v4 &a ) // Init from mixed { v = a.v; - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // i[j] = a.i[j]; } v4int( int a ) // Init from scalar @@ -1266,10 +756,6 @@ namespace v4 u.i = a; v = vdupq_n_f32( u.f ); - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // i[j] = a; } v4int( int i0, int i1, int i2, int i3 ) // Init from scalars @@ -1313,10 +799,8 @@ namespace v4 ASSIGN(%=) ASSIGN(<<=) ASSIGN(>>=) - // ASSIGN( 
=) - // ASSIGN(^=) - // ASSIGN(&=) - // ASSIGN(|=) + + #undef ASSIGN inline v4int &operator =( const v4int &b ) { @@ -1346,8 +830,6 @@ namespace v4 return *this; } - #undef ASSIGN - // v4int member access operator inline int &operator []( int n ) @@ -1364,7 +846,7 @@ namespace v4 // v4int prefix unary operators #define PREFIX_UNARY(op) \ - inline v4int operator op( const v4int & a ) \ + inline v4int operator op( const v4int &a ) \ { \ v4int b; \ ALWAYS_VECTORIZE \ @@ -1376,7 +858,7 @@ namespace v4 PREFIX_UNARY(+) PREFIX_UNARY(-) - inline v4int operator !( const v4int & a ) + inline v4int operator !( const v4int &a ) { v4int b; @@ -1394,7 +876,7 @@ namespace v4 // v4int prefix increment / decrement #define PREFIX_INCDEC(op) \ - inline v4int operator op( v4int & a ) \ + inline v4int operator op( v4int &a ) \ { \ v4int b; \ ALWAYS_VECTORIZE \ @@ -1411,7 +893,7 @@ namespace v4 // v4int postfix increment / decrement #define POSTFIX_INCDEC(op) \ - inline v4int operator op( v4int & a, int ) \ + inline v4int operator op( v4int &a, int ) \ { \ v4int b; \ ALWAYS_VECTORIZE \ @@ -1444,9 +926,8 @@ namespace v4 BINARY(%) BINARY(<<) BINARY(>>) - // BINARY(^) - // BINARY(&) - // BINARY(|) + + #undef BINARY inline v4int operator ^( const v4int &a, const v4int &b ) { @@ -1475,8 +956,6 @@ namespace v4 return c; } - #undef BINARY - // v4int logical operators #define LOGICAL(op) \ @@ -1517,13 +996,14 @@ namespace v4 { v4 b; - b.vsi = vbicq_s32( c.vsi, a.vsi ); + // This seems broken. + // b.vsi = vbicq_s32( c.vsi, a.vsi ); // b.v = _mm_andnot_ps( c.v, a.v ); - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // b.i[j] = a.i[j] & ~c.i[j]; + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = a.i[j] & ~c.i[j]; return b; } @@ -1545,10 +1025,11 @@ namespace v4 inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) { - v4 m; + v4 tf; - m.vsi = vorrq_s32( vbicq_s32( c.vsi, f.vsi ), - vandq_s32( c.vsi, t.vsi ) ); + // This seems broken. 
+ // tf.vsi = vorrq_s32( vbicq_s32( c.vsi, f.vsi ), + // vandq_s32( c.vsi, t.vsi ) ); // __m128 c_v = c.v; // v4 tf; @@ -1556,11 +1037,11 @@ namespace v4 // tf.v = _mm_or_ps( _mm_andnot_ps( c_v, f.v ), // _mm_and_ps( c_v, t.v ) ); - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); - return m; + return tf; } //////////////// @@ -1608,7 +1089,7 @@ namespace v4 #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE + const v4float &b ) ALWAYS_INLINE CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); @@ -1647,28 +1128,16 @@ namespace v4 v4float( const v4float &a ) // Copy constructor { v = a.v; - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // f[j] = a.f[j]; } v4float( const v4 &a ) // Init from mixed { v = a.v; - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // f[j] = a.f[j]; } v4float( float a ) // Init from scalar { v = vdupq_n_f32( a ); - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // f[j] = a; } v4float( float f0, float f1, float f2, float f3 ) // Init from scalars @@ -1697,6 +1166,8 @@ namespace v4 ASSIGN( *=, vmulq_f32 ) ASSIGN( /=, vdivq_f32 ) + #undef ASSIGN + inline v4float &operator =( const v4float &b ) { v = b.v; @@ -1704,25 +1175,6 @@ namespace v4 return *this; } - #undef ASSIGN - - // #define ASSIGN(op) \ - // inline v4float &operator op( const v4float &b ) \ - // { \ - // ALWAYS_VECTORIZE \ - // for( int j = 0; j < 4; j++ ) \ - // f[j] op b.f[j]; \ - // return *this; \ - // } - - // ASSIGN(=) - // ASSIGN(+=) - // ASSIGN(-=) - // ASSIGN(*=) - // ASSIGN(/=) - - // #undef ASSIGN - // v4float member access operator inline float &operator []( int 
n ) @@ -1744,10 +1196,6 @@ namespace v4 b.v = a.v; - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // b.f[j] = +a.f[j]; - return b; } @@ -1862,23 +1310,6 @@ namespace v4 #undef BINARY - // #define BINARY(op) \ - // inline v4float operator op( const v4float &a, const v4float &b ) \ - // { \ - // v4float c; \ - // ALWAYS_VECTORIZE \ - // for( int j = 0; j < 4; j++ ) \ - // c.f[j] = a.f[j] op b.f[j]; \ - // return c; \ - // } - - // BINARY(+) - // BINARY(-) - // BINARY(*) - // BINARY(/) - - // #undef BINARY - // v4float logical operators #define LOGICAL(op,intrin) \ @@ -1894,7 +1325,8 @@ namespace v4 LOGICAL( ==, vceqq_f32 ) LOGICAL( <=, vcleq_f32 ) LOGICAL( >=, vcgeq_f32 ) - // LOGICAL( !=, _mm_cmpneq_ps ) + + #undef LOGICAL inline v4int operator !=( const v4float &a, const v4float &b ) { @@ -1919,9 +1351,9 @@ namespace v4 // Is there a better way to do this than the SSE way? c.vsi = vandq_s32( vmvnq_u32( vceqq_f32( a.v, - vzero ) ), - vmvnq_u32( vceqq_f32( b.v, - vzero ) ) ); + vzero ) ), + vmvnq_u32( vceqq_f32( b.v, + vzero ) ) ); // c.v = _mm_and_ps( _mm_cmpneq_ps( a.v, vzero ), // _mm_cmpneq_ps( b.v, vzero ) ); @@ -1939,9 +1371,9 @@ namespace v4 // Is there a better way to do this than the SSE way? 
c.vsi = vorrq_s32( vmvnq_u32( vceqq_f32( a.v, - vzero ) ), - vmvnq_u32( vceqq_f32( b.v, - vzero ) ) ); + vzero ) ), + vmvnq_u32( vceqq_f32( b.v, + vzero ) ) ); // c.v = _mm_or_ps( _mm_cmpneq_ps( a.v, vzero ), // _mm_cmpneq_ps( b.v, vzero ) ); @@ -1949,29 +1381,6 @@ namespace v4 return c; } - #undef LOGICAL - - // #define LOGICAL(op) \ - // inline v4int operator op( const v4float &a, const v4float &b ) \ - // { \ - // v4int c; \ - // ALWAYS_VECTORIZE \ - // for( int j = 0; j < 4; j++ ) \ - // c.i[j] = - ( a.f[j] op b.f[j] ); \ - // return c; \ - // } - - // LOGICAL(< ) - // LOGICAL(> ) - // LOGICAL(==) - // LOGICAL(!=) - // LOGICAL(<=) - // LOGICAL(>=) - // LOGICAL(&&) - // LOGICAL(||) - - // #undef LOGICAL - // v4float math library functions #define CMATH_FR1(fn) \ @@ -2000,6 +1409,9 @@ namespace v4 CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) + #undef CMATH_FR1 + #undef CMATH_FR2 + inline v4float copysign( const v4float &a, const v4float &b ) { v4float c; @@ -2016,9 +1428,6 @@ namespace v4 return c; } - #undef CMATH_FR1 - #undef CMATH_FR2 - // v4float miscellaneous functions inline v4float rsqrt_approx( const v4float &a ) @@ -2027,10 +1436,6 @@ namespace v4 b.v = vrsqrteq_f32( a.v ); - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // b.f[j] = ::sqrt( 1.0f / a.f[j] ); - return b; } @@ -2038,26 +1443,24 @@ namespace v4 { v4float b; - // float32x4_t a_v = a.v, b_v; + float32x4_t a_v = a.v, b_v; - // b_v = vrsqrteq_f32( a_v ); + b_v = vrsqrteq_f32( a_v ); - // // Note: It is quicker to just call div_ps and sqrt_ps if more - // // refinement desired! 
- // b.v = vaddq_f32( b_v, vmulq_f32( vdupq_n_f32( 0.5f ), - // vsubq_f32( b_v, - // vmulq_f32( a_v, - // vmulq_f32( b_v, - // vmulq_f32( b_v, b_v ) - // ) - // ) - // ) - // ) - // ); + b.v = vaddq_f32( b_v, vmulq_f32( vdupq_n_f32( 0.5f ), + vsubq_f32( b_v, + vmulq_f32( a_v, + vmulq_f32( b_v, + vmulq_f32( b_v, b_v ) + ) + ) + ) + ) + ); - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = ::sqrt( 1.0f / a.f[j] ); + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.f[j] = ::sqrt( 1.0f / a.f[j] ); return b; } @@ -2068,10 +1471,6 @@ namespace v4 b.v = vrecpeq_f32( a.v ); - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // b.f[j] = 1.0f / a.f[j]; - return b; } @@ -2079,19 +1478,19 @@ namespace v4 { v4float b; - // float32x4_t a_v = a.v, b_v; + float32x4_t a_v = a.v, b_v; - // b_v = vrecpeq_f32( a_v ); + b_v = vrecpeq_f32( a_v ); - // b.v = vsubq_f32( vaddq_f32( b_v, b_v ), - // vmulq_f32( a_v, - // vmulq_f32( b_v, b_v ) - // ) - // ); + b.v = vsubq_f32( vaddq_f32( b_v, b_v ), + vmulq_f32( a_v, + vmulq_f32( b_v, b_v ) + ) + ); - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = 1.0f / a.f[j]; + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.f[j] = 1.0f / a.f[j]; return b; } @@ -2100,11 +1499,10 @@ namespace v4 { v4float d; - d.v = vfmaq_f32( a.v, b.v, c.v ); + d.v = vaddq_f32( vmulq_f32( a.v, b.v ), c.v ); - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // d.f[j] = a.f[j] * b.f[j] + c.f[j]; + // This seems broken. + // d.v = vfmaq_f32( a.v, b.v, c.v ); return d; } @@ -2113,11 +1511,10 @@ namespace v4 { v4float d; - d.v = vfmsq_f32( a.v, b.v, c.v ); + d.v = vsubq_f32( vmulq_f32( a.v, b.v ), c.v ); - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // d.f[j] = a.f[j] * b.f[j] - c.f[j]; + // This seems broken. 
+ // d.v = vfmsq_f32( a.v, b.v, c.v ); return d; } @@ -2126,11 +1523,7 @@ namespace v4 { v4float d; - d.v = vsubq_f32( vdupq_n_f32( 0.0f ), vfmsq_f32( a.v, b.v, c.v ) ); - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // d.f[j] = c.f[j] - a.f[j] * b.f[j]; + d.v = vsubq_f32( c.v, vmulq_f32( a.v, b.v ) ); return d; } @@ -2180,40 +1573,28 @@ namespace v4 return b; } - inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void increment_4x1( float * ALIGNED(16) p, + const v4float &a ) { vst1q_f32( p, vaddq_f32( vld1q_f32( p ), a.v ) ); - - // _mm_store_ps( p, _mm_add_ps( _mm_load_ps( p ), a.v ) ); - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // p[j] += a.f[j]; } - inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void decrement_4x1( float * ALIGNED(16) p, + const v4float &a ) { vst1q_f32( p, vsubq_f32( vld1q_f32( p ), a.v ) ); - - // _mm_store_ps( p, _mm_sub_ps( _mm_load_ps( p ), a.v ) ); - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // p[j] -= a.f[j]; } - inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void scale_4x1( float * ALIGNED(16) p, + const v4float &a ) { vst1q_f32( p, vmulq_f32( vld1q_f32( p ), a.v ) ); - - // _mm_store_ps( p, _mm_mul_ps( _mm_load_ps( p ), a.v ) ); - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // p[j] *= a.f[j]; } - inline void trilinear( v4float & wl, v4float & wh ) + // Given wl = x y z w, compute: + // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) + // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) + inline void trilinear( v4float &wl, v4float &wh ) { float x = wl.f[0], y = wl.f[1], z = wl.f[2]; diff --git a/src/util/v4/v4_portable.h b/src/util/v4/v4_portable.h index 6dbb790b..b192c514 100644 --- a/src/util/v4/v4_portable.h +++ b/src/util/v4/v4_portable.h @@ -48,17 +48,26 @@ namespace v4 friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; friend 
inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; // v4 memory manipulation friends - friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) ALWAYS_INLINE; + + friend inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + + friend inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + friend inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends @@ -167,7 +176,7 @@ namespace v4 return b; } -# define sw(x,y) x^=y, y^=x, x^=y + #define sw(x,y) x^=y, y^=x, x^=y inline void swap( v4 &a, v4 &b ) { @@ -184,53 +193,52 @@ namespace v4 sw( a2.i[3],a3.i[2] ); } -# undef sw + #undef sw // v4 memory manipulation functions inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) { - a.i[0] = ((const int * ALIGNED(16))p)[0]; - a.i[1] = ((const int * ALIGNED(16))p)[1]; - a.i[2] = ((const int * ALIGNED(16))p)[2]; - a.i[3] = ((const int * ALIGNED(16))p)[3]; + a.i[0] = ( ( const int * ALIGNED(16) ) p )[0]; + a.i[1] = ( ( const int * ALIGNED(16) ) p )[1]; + a.i[2] = ( ( const int * ALIGNED(16) ) p )[2]; + a.i[3] = ( ( const int * ALIGNED(16) ) p )[3]; } inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) { - ((int * ALIGNED(16))p)[0] = a.i[0]; - ((int * 
ALIGNED(16))p)[1] = a.i[1]; - ((int * ALIGNED(16))p)[2] = a.i[2]; - ((int * ALIGNED(16))p)[3] = a.i[3]; + ( ( int * ALIGNED(16) ) p )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) p )[1] = a.i[1]; + ( ( int * ALIGNED(16) ) p )[2] = a.i[2]; + ( ( int * ALIGNED(16) ) p )[3] = a.i[3]; } inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) { - ((int * ALIGNED(16))p)[0] = a.i[0]; - ((int * ALIGNED(16))p)[1] = a.i[1]; - ((int * ALIGNED(16))p)[2] = a.i[2]; - ((int * ALIGNED(16))p)[3] = a.i[3]; + ( ( int * ALIGNED(16) ) p )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) p )[1] = a.i[1]; + ( ( int * ALIGNED(16) ) p )[2] = a.i[2]; + ( ( int * ALIGNED(16) ) p )[3] = a.i[3]; } inline void clear_4x1( void * ALIGNED(16) p ) { - ((int * ALIGNED(16))p)[0] = 0; - ((int * ALIGNED(16))p)[1] = 0; - ((int * ALIGNED(16))p)[2] = 0; - ((int * ALIGNED(16))p)[3] = 0; + ( ( int * ALIGNED(16) ) p )[0] = 0; + ( ( int * ALIGNED(16) ) p )[1] = 0; + ( ( int * ALIGNED(16) ) p )[2] = 0; + ( ( int * ALIGNED(16) ) p )[3] = 0; } - // FIXME: Ordering semantics inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) src ) { - ((int * ALIGNED(16))dst)[0] = ((const int * ALIGNED(16))src)[0]; - ((int * ALIGNED(16))dst)[1] = ((const int * ALIGNED(16))src)[1]; - ((int * ALIGNED(16))dst)[2] = ((const int * ALIGNED(16))src)[2]; - ((int * ALIGNED(16))dst)[3] = ((const int * ALIGNED(16))src)[3]; + ( ( int * ALIGNED(16) ) dst )[0] = ( ( const int * ALIGNED(16) ) src )[0]; + ( ( int * ALIGNED(16) ) dst )[1] = ( ( const int * ALIGNED(16) ) src )[1]; + ( ( int * ALIGNED(16) ) dst )[2] = ( ( const int * ALIGNED(16) ) src )[2]; + ( ( int * ALIGNED(16) ) dst )[3] = ( ( const int * ALIGNED(16) ) src )[3]; } inline void swap_4x1( void * ALIGNED(16) a, @@ -238,175 +246,201 @@ namespace v4 { int t; - t = ((int * ALIGNED(16))a)[0]; - ((int * ALIGNED(16))a)[0] = ((int * ALIGNED(16))b)[0]; - ((int * ALIGNED(16))b)[0] = t; + t = ( ( int * ALIGNED(16) ) a )[0]; + + ( ( int * ALIGNED(16) ) a )[0] = ( ( int * ALIGNED(16) ) 
b )[0]; + ( ( int * ALIGNED(16) ) b )[0] = t; - t = ((int * ALIGNED(16))a)[1]; - ((int * ALIGNED(16))a)[1] = ((int * ALIGNED(16))b)[1]; - ((int * ALIGNED(16))b)[1] = t; + t = ( ( int * ALIGNED(16) ) a )[1]; - t = ((int * ALIGNED(16))a)[2]; - ((int * ALIGNED(16))a)[2] = ((int * ALIGNED(16))b)[2]; - ((int * ALIGNED(16))b)[2] = t; + ( ( int * ALIGNED(16) ) a )[1] = ( ( int * ALIGNED(16) ) b )[1]; + ( ( int * ALIGNED(16) ) b )[1] = t; - t = ((int * ALIGNED(16))a)[3]; - ((int * ALIGNED(16))a)[3] = ((int * ALIGNED(16))b)[3]; - ((int * ALIGNED(16))b)[3] = t; + t = ( ( int * ALIGNED(16) ) a )[2]; + + ( ( int * ALIGNED(16) ) a )[2] = ( ( int * ALIGNED(16) ) b )[2]; + ( ( int * ALIGNED(16) ) b )[2] = t; + + t = ( ( int * ALIGNED(16) ) a )[3]; + + ( ( int * ALIGNED(16) ) a )[3] = ( ( int * ALIGNED(16) ) b )[3]; + ( ( int * ALIGNED(16) ) b )[3] = t; } // v4 transposed memory manipulation functions - inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, + inline void load_4x1_tr( const void *a0, + const void *a1, + const void *a2, + const void *a3, v4 &a ) { - a.i[0] = ((const int *)a0)[0]; - a.i[1] = ((const int *)a1)[0]; - a.i[2] = ((const int *)a2)[0]; - a.i[3] = ((const int *)a3)[0]; + a.i[0] = ( (const int *) a0 )[0]; + a.i[1] = ( (const int *) a1 )[0]; + a.i[2] = ( (const int *) a2 )[0]; + a.i[3] = ( (const int *) a3 )[0]; } inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, - v4 &a, v4 &b ) + v4 &a, + v4 &b ) { - a.i[0] = ((const int * ALIGNED(8))a0)[0]; - b.i[0] = ((const int * ALIGNED(8))a0)[1]; + a.i[0] = ( ( const int * ALIGNED(8) ) a0 )[0]; + b.i[0] = ( ( const int * ALIGNED(8) ) a0 )[1]; - a.i[1] = ((const int * ALIGNED(8))a1)[0]; - b.i[1] = ((const int * ALIGNED(8))a1)[1]; + a.i[1] = ( ( const int * ALIGNED(8) ) a1 )[0]; + b.i[1] = ( ( const int * ALIGNED(8) ) a1 )[1]; - a.i[2] = ((const int * ALIGNED(8))a2)[0]; - b.i[2] = ((const int * 
ALIGNED(8))a2)[1]; + a.i[2] = ( ( const int * ALIGNED(8) ) a2 )[0]; + b.i[2] = ( ( const int * ALIGNED(8) ) a2 )[1]; - a.i[3] = ((const int * ALIGNED(8))a3)[0]; - b.i[3] = ((const int * ALIGNED(8))a3)[1]; + a.i[3] = ( ( const int * ALIGNED(8) ) a3 )[0]; + b.i[3] = ( ( const int * ALIGNED(8) ) a3 )[1]; } inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) + v4 &a, + v4 &b, + v4 &c ) { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; + a.i[0] = ( ( const int * ALIGNED(16) ) a0 )[0]; + b.i[0] = ( ( const int * ALIGNED(16) ) a0 )[1]; + c.i[0] = ( ( const int * ALIGNED(16) ) a0 )[2]; - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; + a.i[1] = ( ( const int * ALIGNED(16) ) a1 )[0]; + b.i[1] = ( ( const int * ALIGNED(16) ) a1 )[1]; + c.i[1] = ( ( const int * ALIGNED(16) ) a1 )[2]; - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; + a.i[2] = ( ( const int * ALIGNED(16) ) a2 )[0]; + b.i[2] = ( ( const int * ALIGNED(16) ) a2 )[1]; + c.i[2] = ( ( const int * ALIGNED(16) ) a2 )[2]; - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; + a.i[3] = ( ( const int * ALIGNED(16) ) a3 )[0]; + b.i[3] = ( ( const int * ALIGNED(16) ) a3 )[1]; + c.i[3] = ( ( const int * ALIGNED(16) ) a3 )[2]; } inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) + v4 &a, + v4 &b, + v4 &c, + v4 &d ) { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - d.i[0] = 
((const int * ALIGNED(16))a0)[3]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - d.i[1] = ((const int * ALIGNED(16))a1)[3]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - d.i[2] = ((const int * ALIGNED(16))a2)[3]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - d.i[3] = ((const int * ALIGNED(16))a3)[3]; + a.i[0] = ( ( const int * ALIGNED(16) ) a0 )[0]; + b.i[0] = ( ( const int * ALIGNED(16) ) a0 )[1]; + c.i[0] = ( ( const int * ALIGNED(16) ) a0 )[2]; + d.i[0] = ( ( const int * ALIGNED(16) ) a0 )[3]; + + a.i[1] = ( ( const int * ALIGNED(16) ) a1 )[0]; + b.i[1] = ( ( const int * ALIGNED(16) ) a1 )[1]; + c.i[1] = ( ( const int * ALIGNED(16) ) a1 )[2]; + d.i[1] = ( ( const int * ALIGNED(16) ) a1 )[3]; + + a.i[2] = ( ( const int * ALIGNED(16) ) a2 )[0]; + b.i[2] = ( ( const int * ALIGNED(16) ) a2 )[1]; + c.i[2] = ( ( const int * ALIGNED(16) ) a2 )[2]; + d.i[2] = ( ( const int * ALIGNED(16) ) a2 )[3]; + + a.i[3] = ( ( const int * ALIGNED(16) ) a3 )[0]; + b.i[3] = ( ( const int * ALIGNED(16) ) a3 )[1]; + c.i[3] = ( ( const int * ALIGNED(16) ) a3 )[2]; + d.i[3] = ( ( const int * ALIGNED(16) ) a3 )[3]; } inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, - void *a2, void *a3 ) + void *a0, + void *a1, + void *a2, + void *a3 ) { - ((int *)a0)[0] = a.i[0]; - ((int *)a1)[0] = a.i[1]; - ((int *)a2)[0] = a.i[2]; - ((int *)a3)[0] = a.i[3]; + ( (int *) a0 )[0] = a.i[0]; + ( (int *) a1 )[0] = a.i[1]; + ( (int *) a2 )[0] = a.i[2]; + ( (int *) a3 )[0] = a.i[3]; } - inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, void * ALIGNED(8) a1, - void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) + inline void store_4x2_tr( const v4 &a, + const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) 
a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) { - ((int * ALIGNED(8))a0)[0] = a.i[0]; - ((int * ALIGNED(8))a0)[1] = b.i[0]; + ( ( int * ALIGNED(8) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(8) ) a0 )[1] = b.i[0]; - ((int * ALIGNED(8))a1)[0] = a.i[1]; - ((int * ALIGNED(8))a1)[1] = b.i[1]; + ( ( int * ALIGNED(8) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(8) ) a1 )[1] = b.i[1]; - ((int * ALIGNED(8))a2)[0] = a.i[2]; - ((int * ALIGNED(8))a2)[1] = b.i[2]; + ( ( int * ALIGNED(8) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(8) ) a2 )[1] = b.i[2]; - ((int * ALIGNED(8))a3)[0] = a.i[3]; - ((int * ALIGNED(8))a3)[1] = b.i[3]; + ( ( int * ALIGNED(8) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(8) ) a3 )[1] = b.i[3]; } - inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x3_tr( const v4 &a, + const v4 &b, + const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; + ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; + ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; + ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; + ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; + ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; + ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; + ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; + ( 
( int * ALIGNED(16) ) a3 )[2] = c.i[3]; } - inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; - ((int * ALIGNED(16))a0)[3] = d.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; - ((int * ALIGNED(16))a1)[3] = d.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; - ((int * ALIGNED(16))a2)[3] = d.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; - ((int * ALIGNED(16))a3)[3] = d.i[3]; + ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; + ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; + ( ( int * ALIGNED(16) ) a0 )[3] = d.i[0]; + + ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; + ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; + ( ( int * ALIGNED(16) ) a1 )[3] = d.i[1]; + + ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; + ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; + ( ( int * ALIGNED(16) ) a2 )[3] = d.i[2]; + + ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; + ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; + ( ( int * ALIGNED(16) ) a3 )[3] = d.i[3]; } ////////////// @@ -527,7 +561,7 @@ namespace v4 // v4int assignment operators -# define ASSIGN(op) \ + #define ASSIGN(op) \ inline v4int &operator op( const v4int &b ) \ { \ i[0] op b.i[0]; \ @@ -537,19 +571,19 @@ namespace v4 return *this; \ } - 
ASSIGN( =) ASSIGN(+=) ASSIGN(-=) ASSIGN(*=) ASSIGN(/=) ASSIGN(%=) + ASSIGN(<<=) + ASSIGN(>>=) + ASSIGN( =) ASSIGN(^=) ASSIGN(&=) ASSIGN(|=) - ASSIGN(<<=) - ASSIGN(>>=) -# undef ASSIGN + #undef ASSIGN // v4int member access operator @@ -566,7 +600,7 @@ namespace v4 // v4int prefix unary operators -# define PREFIX_UNARY(op) \ + #define PREFIX_UNARY(op) \ inline v4int operator op( const v4int & a ) \ { \ v4int b; \ @@ -584,21 +618,21 @@ namespace v4 { v4int b; - b.i[0] = - ( !a.i[0] ); - b.i[1] = - ( !a.i[1] ); - b.i[2] = - ( !a.i[2] ); - b.i[3] = - ( !a.i[3] ); + b.i[0] = - ( ! a.i[0] ); + b.i[1] = - ( ! a.i[1] ); + b.i[2] = - ( ! a.i[2] ); + b.i[3] = - ( ! a.i[3] ); return b; } PREFIX_UNARY(~) -# undef PREFIX_UNARY + #undef PREFIX_UNARY // v4int prefix increment / decrement -# define PREFIX_INCDEC(op) \ + #define PREFIX_INCDEC(op) \ inline v4int operator op( v4int & a ) \ { \ v4int b; \ @@ -612,11 +646,11 @@ namespace v4 PREFIX_INCDEC(++) PREFIX_INCDEC(--) -# undef PREFIX_INCDEC + #undef PREFIX_INCDEC // v4int postfix increment / decrement -# define POSTFIX_INCDEC(op) \ + #define POSTFIX_INCDEC(op) \ inline v4int operator op( v4int & a, int ) \ { \ v4int b; \ @@ -630,11 +664,11 @@ namespace v4 POSTFIX_INCDEC(++) POSTFIX_INCDEC(--) -# undef POSTFIX_INCDEC + #undef POSTFIX_INCDEC // v4int binary operators -# define BINARY(op) \ + #define BINARY(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ { \ v4int c; \ @@ -650,24 +684,24 @@ namespace v4 BINARY(*) BINARY(/) BINARY(%) + BINARY(<<) + BINARY(>>) BINARY(^) BINARY(&) BINARY(|) - BINARY(<<) - BINARY(>>) -# undef BINARY + #undef BINARY // v4int logical operators -# define LOGICAL(op) \ + #define LOGICAL(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ { \ v4int c; \ - c.i[0] = -(a.i[0] op b.i[0]); \ - c.i[1] = -(a.i[1] op b.i[1]); \ - c.i[2] = -(a.i[2] op b.i[2]); \ - c.i[3] = -(a.i[3] op b.i[3]); \ + c.i[0] = - ( a.i[0] op b.i[0] ); \ + c.i[1] = - ( a.i[1] op b.i[1] ); \ + c.i[2] = - 
( a.i[2] op b.i[2] ); \ + c.i[3] = - ( a.i[3] op b.i[3] ); \ return c; \ } @@ -680,7 +714,7 @@ namespace v4 LOGICAL(&&) LOGICAL(||) -# undef LOGICAL + #undef LOGICAL // v4int miscellaneous functions @@ -722,14 +756,14 @@ namespace v4 inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) { - v4 m; + v4 tf; - m.i[0] = ( f.i[0] & ~c.i[0] ) | ( t.i[0] & c.i[0] ); - m.i[1] = ( f.i[1] & ~c.i[1] ) | ( t.i[1] & c.i[1] ); - m.i[2] = ( f.i[2] & ~c.i[2] ) | ( t.i[2] & c.i[2] ); - m.i[3] = ( f.i[3] & ~c.i[3] ) | ( t.i[3] & c.i[3] ); + tf.i[0] = ( f.i[0] & ~c.i[0] ) | ( t.i[0] & c.i[0] ); + tf.i[1] = ( f.i[1] & ~c.i[1] ) | ( t.i[1] & c.i[1] ); + tf.i[2] = ( f.i[2] & ~c.i[2] ) | ( t.i[2] & c.i[2] ); + tf.i[3] = ( f.i[3] & ~c.i[3] ) | ( t.i[3] & c.i[3] ); - return m; + return tf; } //////////////// @@ -775,9 +809,9 @@ namespace v4 // v4float math library friends -# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE + #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE + #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + const v4float &b ) ALWAYS_INLINE CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); @@ -787,8 +821,8 @@ namespace v4 CMATH_FR2(copysign); -# undef CMATH_FR1 -# undef CMATH_FR2 + #undef CMATH_FR1 + #undef CMATH_FR2 // v4float miscellaneous friends @@ -849,7 +883,7 @@ namespace v4 // v4float assignment operators -# define ASSIGN(op) \ + #define ASSIGN(op) \ inline v4float &operator op( const v4float &b ) \ { \ f[0] op b.f[0]; \ @@ -865,7 +899,7 @@ namespace v4 ASSIGN(*=) ASSIGN(/=) -# undef ASSIGN + #undef ASSIGN // v4float member access operator @@ -972,7 +1006,7 @@ namespace v4 // v4float binary operators -# define BINARY(op) \ + #define BINARY(op) \ inline v4float operator op( const v4float &a, 
const v4float &b ) \ { \ v4float c; \ @@ -988,11 +1022,11 @@ namespace v4 BINARY(*) BINARY(/) -# undef BINARY + #undef BINARY // v4float logical operators -# define LOGICAL(op) \ + #define LOGICAL(op) \ inline v4int operator op( const v4float &a, const v4float &b ) \ { \ v4int c; \ @@ -1006,17 +1040,17 @@ namespace v4 LOGICAL(< ) LOGICAL(> ) LOGICAL(==) - LOGICAL(!=) LOGICAL(<=) LOGICAL(>=) + LOGICAL(!=) LOGICAL(&&) LOGICAL(||) -# undef LOGICAL + #undef LOGICAL // v4float math library functions -# define CMATH_FR1(fn) \ + #define CMATH_FR1(fn) \ inline v4float fn( const v4float &a ) \ { \ v4float b; \ @@ -1027,7 +1061,7 @@ namespace v4 return b; \ } -# define CMATH_FR2(fn) \ + #define CMATH_FR2(fn) \ inline v4float fn( const v4float &a, const v4float &b ) \ { \ v4float c; \ @@ -1044,33 +1078,33 @@ namespace v4 CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) + #undef CMATH_FR1 + #undef CMATH_FR2 + inline v4float copysign( const v4float &a, const v4float &b ) { v4float c; float t; t = ::fabs( a.f[0] ); - if( b.f[0] < 0 ) t = -t; + if ( b.f[0] < 0 ) t = -t; c.f[0] = t; t = ::fabs( a.f[1] ); - if( b.f[1] < 0 ) t = -t; + if ( b.f[1] < 0 ) t = -t; c.f[1] = t; t = ::fabs( a.f[2] ); - if( b.f[2] < 0 ) t = -t; + if ( b.f[2] < 0 ) t = -t; c.f[2] = t; t = ::fabs( a.f[3] ); - if( b.f[3] < 0 ) t = -t; + if ( b.f[3] < 0 ) t = -t; c.f[3] = t; return c; } -# undef CMATH_FR1 -# undef CMATH_FR2 - // v4float miscellaneous functions inline v4float rsqrt_approx( const v4float &a ) @@ -1193,7 +1227,8 @@ namespace v4 return b; } - inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void increment_4x1( float * ALIGNED(16) p, + const v4float &a ) { p[0] += a.f[0]; p[1] += a.f[1]; @@ -1201,7 +1236,8 @@ namespace v4 p[3] += a.f[3]; } - inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void decrement_4x1( float * ALIGNED(16) p, + const v4float &a ) { p[0] -= a.f[0]; p[1] -= 
a.f[1]; @@ -1209,7 +1245,8 @@ namespace v4 p[3] -= a.f[3]; } - inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void scale_4x1( float * ALIGNED(16) p, + const v4float &a ) { p[0] *= a.f[0]; p[1] *= a.f[1]; @@ -1217,7 +1254,10 @@ namespace v4 p[3] *= a.f[3]; } - inline void trilinear( v4float & wl, v4float & wh ) + // Given wl = x y z w, compute: + // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) + // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) + inline void trilinear( v4float &wl, v4float &wh ) { float x = wl.f[0], y = wl.f[1], z = wl.f[2]; diff --git a/src/util/v4/v4_portable_v0.h b/src/util/v4/v4_portable_v0.h index 6a89939e..b192c514 100644 --- a/src/util/v4/v4_portable_v0.h +++ b/src/util/v4/v4_portable_v0.h @@ -48,17 +48,26 @@ namespace v4 friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; // v4 memory manipulation friends - friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) ALWAYS_INLINE; + + friend inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + + friend inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + friend inline void swap_4x1( void * ALIGNED(16) a, + 
void * ALIGNED(16) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends @@ -167,7 +176,7 @@ namespace v4 return b; } -# define sw(x,y) x^=y, y^=x, x^=y + #define sw(x,y) x^=y, y^=x, x^=y inline void swap( v4 &a, v4 &b ) { @@ -184,53 +193,52 @@ namespace v4 sw( a2.i[3],a3.i[2] ); } -# undef sw + #undef sw // v4 memory manipulation functions inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) { - a.i[0] = ((const int * ALIGNED(16))p)[0]; - a.i[1] = ((const int * ALIGNED(16))p)[1]; - a.i[2] = ((const int * ALIGNED(16))p)[2]; - a.i[3] = ((const int * ALIGNED(16))p)[3]; + a.i[0] = ( ( const int * ALIGNED(16) ) p )[0]; + a.i[1] = ( ( const int * ALIGNED(16) ) p )[1]; + a.i[2] = ( ( const int * ALIGNED(16) ) p )[2]; + a.i[3] = ( ( const int * ALIGNED(16) ) p )[3]; } inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) { - ((int * ALIGNED(16))p)[0] = a.i[0]; - ((int * ALIGNED(16))p)[1] = a.i[1]; - ((int * ALIGNED(16))p)[2] = a.i[2]; - ((int * ALIGNED(16))p)[3] = a.i[3]; + ( ( int * ALIGNED(16) ) p )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) p )[1] = a.i[1]; + ( ( int * ALIGNED(16) ) p )[2] = a.i[2]; + ( ( int * ALIGNED(16) ) p )[3] = a.i[3]; } inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) { - ((int * ALIGNED(16))p)[0] = a.i[0]; - ((int * ALIGNED(16))p)[1] = a.i[1]; - ((int * ALIGNED(16))p)[2] = a.i[2]; - ((int * ALIGNED(16))p)[3] = a.i[3]; + ( ( int * ALIGNED(16) ) p )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) p )[1] = a.i[1]; + ( ( int * ALIGNED(16) ) p )[2] = a.i[2]; + ( ( int * ALIGNED(16) ) p )[3] = a.i[3]; } inline void clear_4x1( void * ALIGNED(16) p ) { - ((int * ALIGNED(16))p)[0] = 0; - ((int * ALIGNED(16))p)[1] = 0; - ((int * ALIGNED(16))p)[2] = 0; - ((int * ALIGNED(16))p)[3] = 0; + ( ( int * ALIGNED(16) ) p )[0] = 0; + ( ( int * ALIGNED(16) ) p )[1] = 0; + ( ( int * ALIGNED(16) ) p )[2] = 0; + ( ( int * ALIGNED(16) ) p )[3] = 0; } - // FIXME: Ordering semantics inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) 
src ) { - ((int * ALIGNED(16))dst)[0] = ((const int * ALIGNED(16))src)[0]; - ((int * ALIGNED(16))dst)[1] = ((const int * ALIGNED(16))src)[1]; - ((int * ALIGNED(16))dst)[2] = ((const int * ALIGNED(16))src)[2]; - ((int * ALIGNED(16))dst)[3] = ((const int * ALIGNED(16))src)[3]; + ( ( int * ALIGNED(16) ) dst )[0] = ( ( const int * ALIGNED(16) ) src )[0]; + ( ( int * ALIGNED(16) ) dst )[1] = ( ( const int * ALIGNED(16) ) src )[1]; + ( ( int * ALIGNED(16) ) dst )[2] = ( ( const int * ALIGNED(16) ) src )[2]; + ( ( int * ALIGNED(16) ) dst )[3] = ( ( const int * ALIGNED(16) ) src )[3]; } inline void swap_4x1( void * ALIGNED(16) a, @@ -238,175 +246,201 @@ namespace v4 { int t; - t = ((int * ALIGNED(16))a)[0]; - ((int * ALIGNED(16))a)[0] = ((int * ALIGNED(16))b)[0]; - ((int * ALIGNED(16))b)[0] = t; + t = ( ( int * ALIGNED(16) ) a )[0]; + + ( ( int * ALIGNED(16) ) a )[0] = ( ( int * ALIGNED(16) ) b )[0]; + ( ( int * ALIGNED(16) ) b )[0] = t; - t = ((int * ALIGNED(16))a)[1]; - ((int * ALIGNED(16))a)[1] = ((int * ALIGNED(16))b)[1]; - ((int * ALIGNED(16))b)[1] = t; + t = ( ( int * ALIGNED(16) ) a )[1]; - t = ((int * ALIGNED(16))a)[2]; - ((int * ALIGNED(16))a)[2] = ((int * ALIGNED(16))b)[2]; - ((int * ALIGNED(16))b)[2] = t; + ( ( int * ALIGNED(16) ) a )[1] = ( ( int * ALIGNED(16) ) b )[1]; + ( ( int * ALIGNED(16) ) b )[1] = t; - t = ((int * ALIGNED(16))a)[3]; - ((int * ALIGNED(16))a)[3] = ((int * ALIGNED(16))b)[3]; - ((int * ALIGNED(16))b)[3] = t; + t = ( ( int * ALIGNED(16) ) a )[2]; + + ( ( int * ALIGNED(16) ) a )[2] = ( ( int * ALIGNED(16) ) b )[2]; + ( ( int * ALIGNED(16) ) b )[2] = t; + + t = ( ( int * ALIGNED(16) ) a )[3]; + + ( ( int * ALIGNED(16) ) a )[3] = ( ( int * ALIGNED(16) ) b )[3]; + ( ( int * ALIGNED(16) ) b )[3] = t; } // v4 transposed memory manipulation functions - inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, + inline void load_4x1_tr( const void *a0, + const void *a1, + const void *a2, + const void *a3, v4 &a ) { - 
a.i[0] = ((const int *)a0)[0]; - a.i[1] = ((const int *)a1)[0]; - a.i[2] = ((const int *)a2)[0]; - a.i[3] = ((const int *)a3)[0]; + a.i[0] = ( (const int *) a0 )[0]; + a.i[1] = ( (const int *) a1 )[0]; + a.i[2] = ( (const int *) a2 )[0]; + a.i[3] = ( (const int *) a3 )[0]; } inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, - v4 &a, v4 &b ) + v4 &a, + v4 &b ) { - a.i[0] = ((const int * ALIGNED(8))a0)[0]; - b.i[0] = ((const int * ALIGNED(8))a0)[1]; + a.i[0] = ( ( const int * ALIGNED(8) ) a0 )[0]; + b.i[0] = ( ( const int * ALIGNED(8) ) a0 )[1]; - a.i[1] = ((const int * ALIGNED(8))a1)[0]; - b.i[1] = ((const int * ALIGNED(8))a1)[1]; + a.i[1] = ( ( const int * ALIGNED(8) ) a1 )[0]; + b.i[1] = ( ( const int * ALIGNED(8) ) a1 )[1]; - a.i[2] = ((const int * ALIGNED(8))a2)[0]; - b.i[2] = ((const int * ALIGNED(8))a2)[1]; + a.i[2] = ( ( const int * ALIGNED(8) ) a2 )[0]; + b.i[2] = ( ( const int * ALIGNED(8) ) a2 )[1]; - a.i[3] = ((const int * ALIGNED(8))a3)[0]; - b.i[3] = ((const int * ALIGNED(8))a3)[1]; + a.i[3] = ( ( const int * ALIGNED(8) ) a3 )[0]; + b.i[3] = ( ( const int * ALIGNED(8) ) a3 )[1]; } inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) + v4 &a, + v4 &b, + v4 &c ) { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; + a.i[0] = ( ( const int * ALIGNED(16) ) a0 )[0]; + b.i[0] = ( ( const int * ALIGNED(16) ) a0 )[1]; + c.i[0] = ( ( const int * ALIGNED(16) ) a0 )[2]; - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; + a.i[1] = ( ( const int * ALIGNED(16) ) a1 )[0]; + b.i[1] = ( ( const int * ALIGNED(16) ) a1 )[1]; + c.i[1] = ( ( const int * ALIGNED(16) ) a1 )[2]; - a.i[2] = ((const int * 
ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; + a.i[2] = ( ( const int * ALIGNED(16) ) a2 )[0]; + b.i[2] = ( ( const int * ALIGNED(16) ) a2 )[1]; + c.i[2] = ( ( const int * ALIGNED(16) ) a2 )[2]; - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; + a.i[3] = ( ( const int * ALIGNED(16) ) a3 )[0]; + b.i[3] = ( ( const int * ALIGNED(16) ) a3 )[1]; + c.i[3] = ( ( const int * ALIGNED(16) ) a3 )[2]; } inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) + v4 &a, + v4 &b, + v4 &c, + v4 &d ) { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - d.i[0] = ((const int * ALIGNED(16))a0)[3]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - d.i[1] = ((const int * ALIGNED(16))a1)[3]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - d.i[2] = ((const int * ALIGNED(16))a2)[3]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - d.i[3] = ((const int * ALIGNED(16))a3)[3]; + a.i[0] = ( ( const int * ALIGNED(16) ) a0 )[0]; + b.i[0] = ( ( const int * ALIGNED(16) ) a0 )[1]; + c.i[0] = ( ( const int * ALIGNED(16) ) a0 )[2]; + d.i[0] = ( ( const int * ALIGNED(16) ) a0 )[3]; + + a.i[1] = ( ( const int * ALIGNED(16) ) a1 )[0]; + b.i[1] = ( ( const int * ALIGNED(16) ) a1 )[1]; + c.i[1] = ( ( const int * ALIGNED(16) ) a1 )[2]; + d.i[1] = ( ( const int * ALIGNED(16) ) a1 )[3]; + + a.i[2] = ( ( const int * ALIGNED(16) ) a2 )[0]; + b.i[2] = ( ( const int * ALIGNED(16) ) a2 )[1]; + c.i[2] = 
( ( const int * ALIGNED(16) ) a2 )[2]; + d.i[2] = ( ( const int * ALIGNED(16) ) a2 )[3]; + + a.i[3] = ( ( const int * ALIGNED(16) ) a3 )[0]; + b.i[3] = ( ( const int * ALIGNED(16) ) a3 )[1]; + c.i[3] = ( ( const int * ALIGNED(16) ) a3 )[2]; + d.i[3] = ( ( const int * ALIGNED(16) ) a3 )[3]; } inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, - void *a2, void *a3 ) + void *a0, + void *a1, + void *a2, + void *a3 ) { - ((int *)a0)[0] = a.i[0]; - ((int *)a1)[0] = a.i[1]; - ((int *)a2)[0] = a.i[2]; - ((int *)a3)[0] = a.i[3]; + ( (int *) a0 )[0] = a.i[0]; + ( (int *) a1 )[0] = a.i[1]; + ( (int *) a2 )[0] = a.i[2]; + ( (int *) a3 )[0] = a.i[3]; } - inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, void * ALIGNED(8) a1, - void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) + inline void store_4x2_tr( const v4 &a, + const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) { - ((int * ALIGNED(8))a0)[0] = a.i[0]; - ((int * ALIGNED(8))a0)[1] = b.i[0]; + ( ( int * ALIGNED(8) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(8) ) a0 )[1] = b.i[0]; - ((int * ALIGNED(8))a1)[0] = a.i[1]; - ((int * ALIGNED(8))a1)[1] = b.i[1]; + ( ( int * ALIGNED(8) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(8) ) a1 )[1] = b.i[1]; - ((int * ALIGNED(8))a2)[0] = a.i[2]; - ((int * ALIGNED(8))a2)[1] = b.i[2]; + ( ( int * ALIGNED(8) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(8) ) a2 )[1] = b.i[2]; - ((int * ALIGNED(8))a3)[0] = a.i[3]; - ((int * ALIGNED(8))a3)[1] = b.i[3]; + ( ( int * ALIGNED(8) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(8) ) a3 )[1] = b.i[3]; } - inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x3_tr( const v4 &a, + const v4 &b, + const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * 
ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; + ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; + ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; + ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; + ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; + ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; + ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; + ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; + ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; } - inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; - ((int * ALIGNED(16))a0)[3] = d.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; - ((int * ALIGNED(16))a1)[3] = d.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; - ((int * ALIGNED(16))a2)[3] = d.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; - ((int * ALIGNED(16))a3)[3] = d.i[3]; + ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; + ( ( int 
* ALIGNED(16) ) a0 )[1] = b.i[0]; + ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; + ( ( int * ALIGNED(16) ) a0 )[3] = d.i[0]; + + ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; + ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; + ( ( int * ALIGNED(16) ) a1 )[3] = d.i[1]; + + ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; + ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; + ( ( int * ALIGNED(16) ) a2 )[3] = d.i[2]; + + ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; + ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; + ( ( int * ALIGNED(16) ) a3 )[3] = d.i[3]; } ////////////// @@ -527,7 +561,7 @@ namespace v4 // v4int assignment operators -# define ASSIGN(op) \ + #define ASSIGN(op) \ inline v4int &operator op( const v4int &b ) \ { \ i[0] op b.i[0]; \ @@ -537,19 +571,19 @@ namespace v4 return *this; \ } - ASSIGN( =) ASSIGN(+=) ASSIGN(-=) ASSIGN(*=) ASSIGN(/=) ASSIGN(%=) + ASSIGN(<<=) + ASSIGN(>>=) + ASSIGN( =) ASSIGN(^=) ASSIGN(&=) ASSIGN(|=) - ASSIGN(<<=) - ASSIGN(>>=) -# undef ASSIGN + #undef ASSIGN // v4int member access operator @@ -566,7 +600,7 @@ namespace v4 // v4int prefix unary operators -# define PREFIX_UNARY(op) \ + #define PREFIX_UNARY(op) \ inline v4int operator op( const v4int & a ) \ { \ v4int b; \ @@ -584,21 +618,21 @@ namespace v4 { v4int b; - b.i[0] = - ( !a.i[0] ); - b.i[1] = - ( !a.i[1] ); - b.i[2] = - ( !a.i[2] ); - b.i[3] = - ( !a.i[3] ); + b.i[0] = - ( ! a.i[0] ); + b.i[1] = - ( ! a.i[1] ); + b.i[2] = - ( ! a.i[2] ); + b.i[3] = - ( ! 
a.i[3] ); return b; } PREFIX_UNARY(~) -# undef PREFIX_UNARY + #undef PREFIX_UNARY // v4int prefix increment / decrement -# define PREFIX_INCDEC(op) \ + #define PREFIX_INCDEC(op) \ inline v4int operator op( v4int & a ) \ { \ v4int b; \ @@ -612,11 +646,11 @@ namespace v4 PREFIX_INCDEC(++) PREFIX_INCDEC(--) -# undef PREFIX_INCDEC + #undef PREFIX_INCDEC // v4int postfix increment / decrement -# define POSTFIX_INCDEC(op) \ + #define POSTFIX_INCDEC(op) \ inline v4int operator op( v4int & a, int ) \ { \ v4int b; \ @@ -630,11 +664,11 @@ namespace v4 POSTFIX_INCDEC(++) POSTFIX_INCDEC(--) -# undef POSTFIX_INCDEC + #undef POSTFIX_INCDEC // v4int binary operators -# define BINARY(op) \ + #define BINARY(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ { \ v4int c; \ @@ -650,17 +684,17 @@ namespace v4 BINARY(*) BINARY(/) BINARY(%) + BINARY(<<) + BINARY(>>) BINARY(^) BINARY(&) BINARY(|) - BINARY(<<) - BINARY(>>) -# undef BINARY + #undef BINARY // v4int logical operators -# define LOGICAL(op) \ + #define LOGICAL(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ { \ v4int c; \ @@ -680,7 +714,7 @@ namespace v4 LOGICAL(&&) LOGICAL(||) -# undef LOGICAL + #undef LOGICAL // v4int miscellaneous functions @@ -722,14 +756,14 @@ namespace v4 inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) { - v4 m; + v4 tf; - m.i[0] = ( f.i[0] & ~c.i[0] ) | ( t.i[0] & c.i[0] ); - m.i[1] = ( f.i[1] & ~c.i[1] ) | ( t.i[1] & c.i[1] ); - m.i[2] = ( f.i[2] & ~c.i[2] ) | ( t.i[2] & c.i[2] ); - m.i[3] = ( f.i[3] & ~c.i[3] ) | ( t.i[3] & c.i[3] ); + tf.i[0] = ( f.i[0] & ~c.i[0] ) | ( t.i[0] & c.i[0] ); + tf.i[1] = ( f.i[1] & ~c.i[1] ) | ( t.i[1] & c.i[1] ); + tf.i[2] = ( f.i[2] & ~c.i[2] ) | ( t.i[2] & c.i[2] ); + tf.i[3] = ( f.i[3] & ~c.i[3] ) | ( t.i[3] & c.i[3] ); - return m; + return tf; } //////////////// @@ -775,9 +809,9 @@ namespace v4 // v4float math library friends -# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE -# define 
CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE + #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE + #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + const v4float &b ) ALWAYS_INLINE CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); @@ -787,8 +821,8 @@ namespace v4 CMATH_FR2(copysign); -# undef CMATH_FR1 -# undef CMATH_FR2 + #undef CMATH_FR1 + #undef CMATH_FR2 // v4float miscellaneous friends @@ -849,7 +883,7 @@ namespace v4 // v4float assignment operators -# define ASSIGN(op) \ + #define ASSIGN(op) \ inline v4float &operator op( const v4float &b ) \ { \ f[0] op b.f[0]; \ @@ -865,7 +899,7 @@ namespace v4 ASSIGN(*=) ASSIGN(/=) -# undef ASSIGN + #undef ASSIGN // v4float member access operator @@ -972,7 +1006,7 @@ namespace v4 // v4float binary operators -# define BINARY(op) \ + #define BINARY(op) \ inline v4float operator op( const v4float &a, const v4float &b ) \ { \ v4float c; \ @@ -988,11 +1022,11 @@ namespace v4 BINARY(*) BINARY(/) -# undef BINARY + #undef BINARY // v4float logical operators -# define LOGICAL(op) \ + #define LOGICAL(op) \ inline v4int operator op( const v4float &a, const v4float &b ) \ { \ v4int c; \ @@ -1006,17 +1040,17 @@ namespace v4 LOGICAL(< ) LOGICAL(> ) LOGICAL(==) - LOGICAL(!=) LOGICAL(<=) LOGICAL(>=) + LOGICAL(!=) LOGICAL(&&) LOGICAL(||) -# undef LOGICAL + #undef LOGICAL // v4float math library functions -# define CMATH_FR1(fn) \ + #define CMATH_FR1(fn) \ inline v4float fn( const v4float &a ) \ { \ v4float b; \ @@ -1027,7 +1061,7 @@ namespace v4 return b; \ } -# define CMATH_FR2(fn) \ + #define CMATH_FR2(fn) \ inline v4float fn( const v4float &a, const v4float &b ) \ { \ v4float c; \ @@ -1044,33 +1078,33 @@ namespace v4 CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) + #undef CMATH_FR1 + #undef 
CMATH_FR2 + inline v4float copysign( const v4float &a, const v4float &b ) { v4float c; float t; t = ::fabs( a.f[0] ); - if( b.f[0] < 0 ) t = -t; + if ( b.f[0] < 0 ) t = -t; c.f[0] = t; t = ::fabs( a.f[1] ); - if( b.f[1] < 0 ) t = -t; + if ( b.f[1] < 0 ) t = -t; c.f[1] = t; t = ::fabs( a.f[2] ); - if( b.f[2] < 0 ) t = -t; + if ( b.f[2] < 0 ) t = -t; c.f[2] = t; t = ::fabs( a.f[3] ); - if( b.f[3] < 0 ) t = -t; + if ( b.f[3] < 0 ) t = -t; c.f[3] = t; return c; } -# undef CMATH_FR1 -# undef CMATH_FR2 - // v4float miscellaneous functions inline v4float rsqrt_approx( const v4float &a ) @@ -1193,7 +1227,8 @@ namespace v4 return b; } - inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void increment_4x1( float * ALIGNED(16) p, + const v4float &a ) { p[0] += a.f[0]; p[1] += a.f[1]; @@ -1201,7 +1236,8 @@ namespace v4 p[3] += a.f[3]; } - inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void decrement_4x1( float * ALIGNED(16) p, + const v4float &a ) { p[0] -= a.f[0]; p[1] -= a.f[1]; @@ -1209,7 +1245,8 @@ namespace v4 p[3] -= a.f[3]; } - inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void scale_4x1( float * ALIGNED(16) p, + const v4float &a ) { p[0] *= a.f[0]; p[1] *= a.f[1]; @@ -1217,7 +1254,10 @@ namespace v4 p[3] *= a.f[3]; } - inline void trilinear( v4float & wl, v4float & wh ) + // Given wl = x y z w, compute: + // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) + // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) + inline void trilinear( v4float &wl, v4float &wh ) { float x = wl.f[0], y = wl.f[1], z = wl.f[2]; diff --git a/src/util/v4/v4_portable_v1.h b/src/util/v4/v4_portable_v1.h index d67bf4b8..9a6cca87 100644 --- a/src/util/v4/v4_portable_v1.h +++ b/src/util/v4/v4_portable_v1.h @@ -60,17 +60,26 @@ namespace v4 friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; friend inline v4 notczero( const v4int &c, const v4 &a ) 
ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; // v4 memory manipulation friends - friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) ALWAYS_INLINE; + + friend inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + + friend inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + friend inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends @@ -177,7 +186,7 @@ namespace v4 return b; } -# define sw(x,y) x^=y, y^=x, x^=y + #define sw(x,y) x^=y, y^=x, x^=y inline void swap( v4 &a, v4 &b ) { @@ -193,7 +202,7 @@ namespace v4 sw( a2.i[3],a3.i[2] ); } -# undef sw + #undef sw // v4 memory manipulation functions @@ -228,7 +237,6 @@ namespace v4 ((int * ALIGNED(16))p)[j] = 0; } - // FIXME: Ordering semantics inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) src ) { @@ -253,156 +261,178 @@ namespace v4 // v4 transposed memory manipulation functions - inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, + inline void load_4x1_tr( const void *a0, + const void *a1, + const void *a2, + const void *a3, v4 &a ) { - a.i[0] = ((const int *)a0)[0]; - a.i[1] = ((const int *)a1)[0]; - a.i[2] = ((const int *)a2)[0]; - a.i[3] = ((const int *)a3)[0]; + a.i[0] = 
( (const int *) a0 )[0]; + a.i[1] = ( (const int *) a1 )[0]; + a.i[2] = ( (const int *) a2 )[0]; + a.i[3] = ( (const int *) a3 )[0]; } inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, - v4 &a, v4 &b ) + v4 &a, + v4 &b ) { - a.i[0] = ((const int * ALIGNED(8))a0)[0]; - b.i[0] = ((const int * ALIGNED(8))a0)[1]; + a.i[0] = ( ( const int * ALIGNED(8) ) a0 )[0]; + b.i[0] = ( ( const int * ALIGNED(8) ) a0 )[1]; - a.i[1] = ((const int * ALIGNED(8))a1)[0]; - b.i[1] = ((const int * ALIGNED(8))a1)[1]; + a.i[1] = ( ( const int * ALIGNED(8) ) a1 )[0]; + b.i[1] = ( ( const int * ALIGNED(8) ) a1 )[1]; - a.i[2] = ((const int * ALIGNED(8))a2)[0]; - b.i[2] = ((const int * ALIGNED(8))a2)[1]; + a.i[2] = ( ( const int * ALIGNED(8) ) a2 )[0]; + b.i[2] = ( ( const int * ALIGNED(8) ) a2 )[1]; - a.i[3] = ((const int * ALIGNED(8))a3)[0]; - b.i[3] = ((const int * ALIGNED(8))a3)[1]; + a.i[3] = ( ( const int * ALIGNED(8) ) a3 )[0]; + b.i[3] = ( ( const int * ALIGNED(8) ) a3 )[1]; } inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) + v4 &a, + v4 &b, + v4 &c ) { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; + a.i[0] = ( ( const int * ALIGNED(16) ) a0 )[0]; + b.i[0] = ( ( const int * ALIGNED(16) ) a0 )[1]; + c.i[0] = ( ( const int * ALIGNED(16) ) a0 )[2]; - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; + a.i[1] = ( ( const int * ALIGNED(16) ) a1 )[0]; + b.i[1] = ( ( const int * ALIGNED(16) ) a1 )[1]; + c.i[1] = ( ( const int * ALIGNED(16) ) a1 )[2]; - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; + a.i[2] = ( ( const int * ALIGNED(16) ) a2 
)[0]; + b.i[2] = ( ( const int * ALIGNED(16) ) a2 )[1]; + c.i[2] = ( ( const int * ALIGNED(16) ) a2 )[2]; - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; + a.i[3] = ( ( const int * ALIGNED(16) ) a3 )[0]; + b.i[3] = ( ( const int * ALIGNED(16) ) a3 )[1]; + c.i[3] = ( ( const int * ALIGNED(16) ) a3 )[2]; } inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) + v4 &a, + v4 &b, + v4 &c, + v4 &d ) { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - d.i[0] = ((const int * ALIGNED(16))a0)[3]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - d.i[1] = ((const int * ALIGNED(16))a1)[3]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - d.i[2] = ((const int * ALIGNED(16))a2)[3]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - d.i[3] = ((const int * ALIGNED(16))a3)[3]; + a.i[0] = ( ( const int * ALIGNED(16) ) a0 )[0]; + b.i[0] = ( ( const int * ALIGNED(16) ) a0 )[1]; + c.i[0] = ( ( const int * ALIGNED(16) ) a0 )[2]; + d.i[0] = ( ( const int * ALIGNED(16) ) a0 )[3]; + + a.i[1] = ( ( const int * ALIGNED(16) ) a1 )[0]; + b.i[1] = ( ( const int * ALIGNED(16) ) a1 )[1]; + c.i[1] = ( ( const int * ALIGNED(16) ) a1 )[2]; + d.i[1] = ( ( const int * ALIGNED(16) ) a1 )[3]; + + a.i[2] = ( ( const int * ALIGNED(16) ) a2 )[0]; + b.i[2] = ( ( const int * ALIGNED(16) ) a2 )[1]; + c.i[2] = ( ( const int * ALIGNED(16) ) a2 )[2]; + d.i[2] = ( ( const int * ALIGNED(16) ) a2 )[3]; + + a.i[3] = ( ( const int * ALIGNED(16) ) a3 )[0]; + b.i[3] = ( 
( const int * ALIGNED(16) ) a3 )[1]; + c.i[3] = ( ( const int * ALIGNED(16) ) a3 )[2]; + d.i[3] = ( ( const int * ALIGNED(16) ) a3 )[3]; } inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, - void *a2, void *a3 ) + void *a0, + void *a1, + void *a2, + void *a3 ) { - ((int *)a0)[0] = a.i[0]; - ((int *)a1)[0] = a.i[1]; - ((int *)a2)[0] = a.i[2]; - ((int *)a3)[0] = a.i[3]; + ( (int *) a0 )[0] = a.i[0]; + ( (int *) a1 )[0] = a.i[1]; + ( (int *) a2 )[0] = a.i[2]; + ( (int *) a3 )[0] = a.i[3]; } - inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, void * ALIGNED(8) a1, - void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) + inline void store_4x2_tr( const v4 &a, + const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) { - ((int * ALIGNED(8))a0)[0] = a.i[0]; - ((int * ALIGNED(8))a0)[1] = b.i[0]; + ( ( int * ALIGNED(8) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(8) ) a0 )[1] = b.i[0]; - ((int * ALIGNED(8))a1)[0] = a.i[1]; - ((int * ALIGNED(8))a1)[1] = b.i[1]; + ( ( int * ALIGNED(8) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(8) ) a1 )[1] = b.i[1]; - ((int * ALIGNED(8))a2)[0] = a.i[2]; - ((int * ALIGNED(8))a2)[1] = b.i[2]; + ( ( int * ALIGNED(8) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(8) ) a2 )[1] = b.i[2]; - ((int * ALIGNED(8))a3)[0] = a.i[3]; - ((int * ALIGNED(8))a3)[1] = b.i[3]; + ( ( int * ALIGNED(8) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(8) ) a3 )[1] = b.i[3]; } - inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x3_tr( const v4 &a, + const v4 &b, + const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; + ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; 
+ ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; + ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; + ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; + ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; + ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; + ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; + ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; } - inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; - ((int * ALIGNED(16))a0)[3] = d.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; - ((int * ALIGNED(16))a1)[3] = d.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; - ((int * ALIGNED(16))a2)[3] = d.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; - ((int * ALIGNED(16))a3)[3] = d.i[3]; + ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; + ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; + ( ( int * ALIGNED(16) ) a0 )[3] = d.i[0]; + + ( ( int * ALIGNED(16) ) a1 
)[0] = a.i[1]; + ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; + ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; + ( ( int * ALIGNED(16) ) a1 )[3] = d.i[1]; + + ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; + ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; + ( ( int * ALIGNED(16) ) a2 )[3] = d.i[2]; + + ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; + ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; + ( ( int * ALIGNED(16) ) a3 )[3] = d.i[3]; } ////////////// @@ -520,7 +550,7 @@ namespace v4 // v4int assignment operators -# define ASSIGN(op) \ + #define ASSIGN(op) \ inline v4int &operator op( const v4int &b ) \ { \ ALWAYS_VECTORIZE \ @@ -529,19 +559,19 @@ namespace v4 return *this; \ } - ASSIGN( =) ASSIGN(+=) ASSIGN(-=) ASSIGN(*=) ASSIGN(/=) ASSIGN(%=) + ASSIGN(<<=) + ASSIGN(>>=) + ASSIGN( =) ASSIGN(^=) ASSIGN(&=) ASSIGN(|=) - ASSIGN(<<=) - ASSIGN(>>=) -# undef ASSIGN + #undef ASSIGN // v4int member access operator @@ -558,7 +588,7 @@ namespace v4 // v4int prefix unary operators -# define PREFIX_UNARY(op) \ + #define PREFIX_UNARY(op) \ inline v4int operator op( const v4int & a ) \ { \ v4int b; \ @@ -584,11 +614,11 @@ namespace v4 PREFIX_UNARY(~) -# undef PREFIX_UNARY + #undef PREFIX_UNARY // v4int prefix increment / decrement -# define PREFIX_INCDEC(op) \ + #define PREFIX_INCDEC(op) \ inline v4int operator op( v4int & a ) \ { \ v4int b; \ @@ -601,11 +631,11 @@ namespace v4 PREFIX_INCDEC(++) PREFIX_INCDEC(--) -# undef PREFIX_INCDEC + #undef PREFIX_INCDEC // v4int postfix increment / decrement -# define POSTFIX_INCDEC(op) \ + #define POSTFIX_INCDEC(op) \ inline v4int operator op( v4int & a, int ) \ { \ v4int b; \ @@ -618,11 +648,11 @@ namespace v4 POSTFIX_INCDEC(++) POSTFIX_INCDEC(--) -# undef POSTFIX_INCDEC + #undef POSTFIX_INCDEC // v4int binary operators -# define BINARY(op) \ + #define BINARY(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ { \ v4int c; \ @@ -637,17 +667,17 @@ namespace v4 
BINARY(*) BINARY(/) BINARY(%) + BINARY(<<) + BINARY(>>) BINARY(^) BINARY(&) BINARY(|) - BINARY(<<) - BINARY(>>) -# undef BINARY + #undef BINARY // v4int logical operators -# define LOGICAL(op) \ + #define LOGICAL(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ { \ v4int c; \ @@ -666,7 +696,7 @@ namespace v4 LOGICAL(&&) LOGICAL(||) -# undef LOGICAL + #undef LOGICAL // v4int miscellaneous functions @@ -705,13 +735,13 @@ namespace v4 inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) { - v4 m; + v4 tf; ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); + tf.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); - return m; + return tf; } //////////////// @@ -757,9 +787,9 @@ namespace v4 // v4float math library friends -# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE + #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE + #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + const v4float &b ) ALWAYS_INLINE CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); @@ -769,8 +799,8 @@ namespace v4 CMATH_FR2(copysign); -# undef CMATH_FR1 -# undef CMATH_FR2 + #undef CMATH_FR1 + #undef CMATH_FR2 // v4float miscellaneous friends @@ -828,7 +858,7 @@ namespace v4 // v4float assignment operators -# define ASSIGN(op) \ + #define ASSIGN(op) \ inline v4float &operator op( const v4float &b ) \ { \ ALWAYS_VECTORIZE \ @@ -843,7 +873,7 @@ namespace v4 ASSIGN(*=) ASSIGN(/=) -# undef ASSIGN + #undef ASSIGN // v4float member access operator @@ -943,7 +973,7 @@ namespace v4 // v4float binary operators -# define BINARY(op) \ + #define BINARY(op) \ inline v4float operator op( const v4float &a, const v4float &b ) \ { \ v4float c; \ @@ -958,11 +988,11 @@ 
namespace v4 BINARY(*) BINARY(/) -# undef BINARY + #undef BINARY // v4float logical operators -# define LOGICAL(op) \ + #define LOGICAL(op) \ inline v4int operator op( const v4float &a, const v4float &b ) \ { \ v4int c; \ @@ -975,17 +1005,17 @@ namespace v4 LOGICAL(< ) LOGICAL(> ) LOGICAL(==) - LOGICAL(!=) LOGICAL(<=) LOGICAL(>=) + LOGICAL(!=) LOGICAL(&&) LOGICAL(||) -# undef LOGICAL + #undef LOGICAL // v4float math library functions -# define CMATH_FR1(fn) \ + #define CMATH_FR1(fn) \ inline v4float fn( const v4float &a ) \ { \ v4float b; \ @@ -995,7 +1025,7 @@ namespace v4 return b; \ } -# define CMATH_FR2(fn) \ + #define CMATH_FR2(fn) \ inline v4float fn( const v4float &a, const v4float &b ) \ { \ v4float c; \ @@ -1011,6 +1041,9 @@ namespace v4 CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) + #undef CMATH_FR1 + #undef CMATH_FR2 + inline v4float copysign( const v4float &a, const v4float &b ) { v4float c; @@ -1027,9 +1060,6 @@ namespace v4 return c; } -# undef CMATH_FR1 -# undef CMATH_FR2 - // v4float miscellaneous functions inline v4float rsqrt_approx( const v4float &a ) @@ -1142,28 +1172,34 @@ namespace v4 return b; } - inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void increment_4x1( float * ALIGNED(16) p, + const v4float &a ) { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) p[j] += a.f[j]; } - inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void decrement_4x1( float * ALIGNED(16) p, + const v4float &a ) { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) p[j] -= a.f[j]; } - inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void scale_4x1( float * ALIGNED(16) p, + const v4float &a ) { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) p[j] *= a.f[j]; } - inline void trilinear( v4float & wl, v4float & wh ) + // Given wl = x y z w, compute: + // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) + // wh = 
(1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) + inline void trilinear( v4float &wl, v4float &wh ) { float x = wl.f[0], y = wl.f[1], z = wl.f[2]; diff --git a/src/util/v4/v4_sse.h b/src/util/v4/v4_sse.h index b2ed5dcb..5f9e7d9d 100644 --- a/src/util/v4/v4_sse.h +++ b/src/util/v4/v4_sse.h @@ -5,47 +5,41 @@ #error "Do not include v4_sse.h directly; use v4.h" #endif -#define V4_ACCELERATION -#define V4_SSE_ACCELERATION - #include #include +#define V4_ACCELERATION +#define V4_SSE_ACCELERATION + #ifndef ALIGNED #define ALIGNED(n) #endif -// FIXME: IN PORTABLE, ALTIVEC, SPU -// - UPDATE V4INT, V4FLOAT - -// This requires gcc-3.3 and up -// Also, Bug 12902 has not been resolved on gcc-3.x.x. See README.patches for -// details. gcc-4.x.x does not seem to have this bug but may suffer from -// other problems (use "-fno-strict-aliasing" on these platforms) - #define ALWAYS_INLINE __attribute__((always_inline)) -namespace v4 { - +namespace v4 +{ class v4; class v4int; class v4float; - template struct permute { + template + struct permute + { constexpr static int value = i0 + i1*4 + i2*16 + i3*64; - }; // permute + }; + + #define PERM(i0,i1,i2,i3) ((i0) + (i1)*4 + (i2)*16 + (i3)*64) -# define PERM(i0,i1,i2,i3) ((i0) + (i1)*4 + (i2)*16 + (i3)*64) - //////////////// // v4 base class - - class v4 { - + + class v4 + { friend class v4int; friend class v4float; - - // v4 miscellenous friends + + // v4 miscellaneous friends friend inline int any( const v4 &a ) ALWAYS_INLINE; friend inline int all( const v4 &a ) ALWAYS_INLINE; @@ -61,53 +55,68 @@ namespace v4 { // v4int miscellaneous friends - friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; // 
v4 memory manipulation friends - - friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; - friend inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + friend inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) ALWAYS_INLINE; + + friend inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + + friend inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + + friend inline void copy_4x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) ALWAYS_INLINE; + + friend inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends friend inline void load_4x1_tr( const void *a0, const void *a1, const void *a2, const void *a3, v4 &a ) ALWAYS_INLINE; + friend inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, v4 &a, v4 &b ) ALWAYS_INLINE; + friend inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; + friend inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; - + friend inline void store_4x1_tr( const v4 &a, void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; + friend inline void store_4x2_tr( const v4 &a, const v4 &b, void * ALIGNED(8) a0, void * 
ALIGNED(8) a1, void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) ALWAYS_INLINE; + friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) ALWAYS_INLINE; + friend inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, void * ALIGNED(16) a0, @@ -117,81 +126,102 @@ namespace v4 { protected: - union { + union + { int i[4]; float f[4]; __m128 v; }; - + public: v4() {} // Default constructor - v4(const v4 &a) { v=a.v; } // Copy constructor - ~v4() {} // Default destructor + v4( const v4 &a ) // Copy constructor + { + v = a.v; + } + + ~v4() {} // Default destructor }; - + // v4 miscellaneous functions - inline int any( const v4 &a ) { + inline int any( const v4 &a ) + { return a.i[0] || a.i[1] || a.i[2] || a.i[3]; } - - inline int all( const v4 &a ) { + + inline int all( const v4 &a ) + { return a.i[0] && a.i[1] && a.i[2] && a.i[3]; } - // Note: n MUST BE AN IMMEDIATE! template - inline v4 splat(const v4 & a) { - __m128 a_v = a.v; + inline v4 splat( const v4 & a ) + { v4 b; - b.v = _mm_shuffle_ps( a_v, a_v, (n*permute<1,1,1,1>::value)); + + b.v = _mm_shuffle_ps( a.v, a.v, ( n * permute<1,1,1,1>::value ) ); + return b; } - // Note: i0:3 MUST BE IMMEDIATES! 
*/ template - inline v4 shuffle( const v4 & a ) { - __m128 a_v = a.v; + inline v4 shuffle( const v4 & a ) + { v4 b; - b.v = _mm_shuffle_ps( a_v, a_v, (permute::value) ); + + b.v = _mm_shuffle_ps( a.v, a.v, ( permute::value ) ); + return b; } - inline void swap( v4 &a, v4 &b ) { - __m128 a_v = a.v; a.v = b.v; b.v = a_v; + inline void swap( v4 &a, v4 &b ) + { + __m128 t = a.v; + + a.v = b.v; + + b.v = t; } - inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) { + inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) + { __m128 a0_v = a0.v, a1_v = a1.v, a2_v = a2.v, a3_v = a3.v, t, u; + t = _mm_unpackhi_ps( a0_v, a1_v ); a0_v = _mm_unpacklo_ps( a0_v, a1_v ); u = _mm_unpackhi_ps( a2_v, a3_v ); a2_v = _mm_unpacklo_ps( a2_v, a3_v ); + a1_v = _mm_movehl_ps( a2_v, a0_v ); a0_v = _mm_movelh_ps( a0_v, a2_v ); a2_v = _mm_movelh_ps( t, u ); a3_v = _mm_movehl_ps( u, t ); - a0.v = a0_v; a1.v = a1_v; a2.v = a2_v; a3.v = a3_v; + + a0.v = a0_v; + a1.v = a1_v; + a2.v = a2_v; + a3.v = a3_v; } // v4 memory manipulation functions - + inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) + v4 &a ) { a.v = _mm_load_ps( ( float * ) p ); } inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { _mm_store_ps( ( float * ) p, a.v ); } inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { _mm_stream_ps( ( float * ) p, a.v ); } @@ -207,9 +237,8 @@ namespace v4 { _mm_store_ps( ( float * ) dst, _mm_load_ps( ( const float * ) src ) ); } - /* FIXME: MAKE ROBUST AGAINST ALIASING ISSUES */ inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) + void * ALIGNED(16) b ) { __m128 t = _mm_load_ps( ( float * ) a ); @@ -219,129 +248,192 @@ namespace v4 { // v4 transposed memory manipulation functions - inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, v4 &a ) { - a.v = _mm_setr_ps( ((const float *)a0)[0], - ((const float *)a1)[0], - ((const float *)a2)[0], - ((const float 
*)a3)[0] ); + inline void load_4x1_tr( const void *a0, + const void *a1, + const void *a2, + const void *a3, + v4 &a ) + { + a.v = _mm_setr_ps( ( (const float *) a0 )[0], + ( (const float *) a1 )[0], + ( (const float *) a2 )[0], + ( (const float *) a3 )[0] ); } inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, - v4 &a, v4 &b ) { + v4 &a, + v4 &b ) + { __m128 a_v, b_v, t; + b_v = _mm_setzero_ps(); - t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a0 ), (__m64 *)a1 ); - b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a2 ), (__m64 *)a3 ); + + t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *) a0 ), (__m64 *) a1 ); + b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *) a2 ), (__m64 *) a3 ); + a_v = _mm_shuffle_ps( t, b_v, 0x88 ); b_v = _mm_shuffle_ps( t, b_v, 0xdd ); - a.v = a_v; b.v = b_v; + + a.v = a_v; + b.v = b_v; } inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) { + v4 &a, + v4 &b, + v4 &c ) + { __m128 a_v, b_v, c_v, t, u; - t = _mm_load_ps( (const float *)a0 ); - b_v = _mm_load_ps( (const float *)a1 ); - c_v = _mm_load_ps( (const float *)a2 ); - u = _mm_load_ps( (const float *)a3 ); + + t = _mm_load_ps( (const float *) a0 ); + b_v = _mm_load_ps( (const float *) a1 ); + c_v = _mm_load_ps( (const float *) a2 ); + u = _mm_load_ps( (const float *) a3 ); + a_v = _mm_unpacklo_ps( t, b_v ); b_v = _mm_unpackhi_ps( t, b_v ); + t = _mm_unpacklo_ps( c_v, u ); u = _mm_unpackhi_ps( c_v, u ); + c_v = _mm_movelh_ps( b_v, u ); b_v = _mm_movehl_ps( t, a_v ); a_v = _mm_movelh_ps( a_v, t ); - a.v = a_v; b.v = b_v; c.v = c_v; + + a.v = a_v; + b.v = b_v; + c.v = c_v; } inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) { + v4 &a, + v4 &b, + v4 &c, + v4 &d ) + { __m128 a_v, 
b_v, c_v, d_v, t, u; - a_v = _mm_load_ps( (const float *)a0 ); - b_v = _mm_load_ps( (const float *)a1 ); - c_v = _mm_load_ps( (const float *)a2 ); - d_v = _mm_load_ps( (const float *)a3 ); + + a_v = _mm_load_ps( (const float *) a0 ); + b_v = _mm_load_ps( (const float *) a1 ); + c_v = _mm_load_ps( (const float *) a2 ); + d_v = _mm_load_ps( (const float *) a3 ); + t = _mm_unpackhi_ps( a_v, b_v ); a_v = _mm_unpacklo_ps( a_v, b_v ); + u = _mm_unpackhi_ps( c_v, d_v ); c_v = _mm_unpacklo_ps( c_v, d_v ); + b_v = _mm_movehl_ps( c_v, a_v ); a_v = _mm_movelh_ps( a_v, c_v ); c_v = _mm_movelh_ps( t, u ); d_v = _mm_movehl_ps( u, t ); - a.v = a_v; b.v = b_v; c.v = c_v; d.v = d_v; + + a.v = a_v; + b.v = b_v; + c.v = c_v; + d.v = d_v; } inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, void *a2, void *a3 ) { - ((float *)a0)[0] = a.f[0]; - ((float *)a1)[0] = a.f[1]; - ((float *)a2)[0] = a.f[2]; - ((float *)a3)[0] = a.f[3]; + void *a0, + void *a1, + void *a2, + void *a3 ) + { + ( (float *) a0 )[0] = a.f[0]; + ( (float *) a1 )[0] = a.f[1]; + ( (float *) a2 )[0] = a.f[2]; + ( (float *) a3 )[0] = a.f[3]; + } + + inline void store_4x2_tr( const v4 &a, + const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) + { + __m128 a_v = a.v, b_v = b.v, t; + + t = _mm_unpacklo_ps( a_v, b_v ); // a0 b0 a1 b1 -> t + + _mm_storel_pi( (__m64 *) a0, t ); // a0 b0 -> a0 + _mm_storeh_pi( (__m64 *) a1, t ); // a1 b1 -> a1 + + t = _mm_unpackhi_ps( a_v, b_v ); // a2 b2 a3 b3 -> t + + _mm_storel_pi( (__m64 *) a2, t ); // a2 b2 -> a2 + _mm_storeh_pi( (__m64 *) a3, t ); // a3 b3 -> a3 } - inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, void * ALIGNED(8) a1, - void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) { + inline void store_4x3_tr( const v4 &a, + const v4 &b, + const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) + { __m128 a_v = a.v, b_v = b.v, t; - t = 
_mm_unpacklo_ps(a_v,b_v); // a0 b0 a1 b1 -> t - _mm_storel_pi((__m64 *)a0,t); // a0 b0 -> a0 - _mm_storeh_pi((__m64 *)a1,t); // a1 b1 -> a1 - t = _mm_unpackhi_ps(a_v,b_v); // a2 b2 a3 b3 -> t - _mm_storel_pi((__m64 *)a2,t); // a2 b2 -> a2 - _mm_storeh_pi((__m64 *)a3,t); // a3 b3 -> a3 + + t = _mm_unpacklo_ps( a_v, b_v ); // a0 b0 a1 b1 -> t + + _mm_storel_pi( (__m64 *) a0, t ); // a0 b0 -> a0 + _mm_storeh_pi( (__m64 *) a1, t ); // a1 b1 -> a1 + + t = _mm_unpackhi_ps( a_v, b_v ); // a2 b2 a3 b3 -> t + + _mm_storel_pi( (__m64 *) a2, t ); // a2 b2 -> a2 + _mm_storeh_pi( (__m64 *) a3, t ); // a3 b3 -> a3 + + ( (float *) a0 )[2] = c.f[0]; + ( (float *) a1 )[2] = c.f[1]; + ( (float *) a2 )[2] = c.f[2]; + ( (float *) a3 )[2] = c.f[3]; } - inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) { - __m128 a_v = a.v, b_v = b.v, t; - t = _mm_unpacklo_ps(a_v,b_v); // a0 b0 a1 b1 -> t - _mm_storel_pi((__m64 *)a0,t); // a0 b0 -> a0 - _mm_storeh_pi((__m64 *)a1,t); // a1 b1 -> a1 - t = _mm_unpackhi_ps(a_v,b_v); // a2 b2 a3 b3 -> t - _mm_storel_pi((__m64 *)a2,t); // a2 b2 -> a2 - _mm_storeh_pi((__m64 *)a3,t); // a3 b3 -> a3 - ((float *)a0)[2] = c.f[0]; - ((float *)a1)[2] = c.f[1]; - ((float *)a2)[2] = c.f[2]; - ((float *)a3)[2] = c.f[3]; - } - - /* FIXME: IS THIS FASTER THAN THE OLD WAY (HAD MORE STORE INSTR) */ - inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) { + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) + { __m128 a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; + t = _mm_unpackhi_ps( a_v, b_v ); a_v = _mm_unpacklo_ps( a_v, b_v ); u = _mm_unpackhi_ps( c_v, d_v ); c_v = _mm_unpacklo_ps( c_v, d_v ); + b_v = 
_mm_movehl_ps( c_v, a_v ); a_v = _mm_movelh_ps( a_v, c_v ); c_v = _mm_movelh_ps( t, u ); d_v = _mm_movehl_ps( u, t ); - _mm_store_ps( (float *)a0, a_v ); - _mm_store_ps( (float *)a1, b_v ); - _mm_store_ps( (float *)a2, c_v ); - _mm_store_ps( (float *)a3, d_v ); + + _mm_store_ps( (float *) a0, a_v ); + _mm_store_ps( (float *) a1, b_v ); + _mm_store_ps( (float *) a2, c_v ); + _mm_store_ps( (float *) a3, d_v ); } ////////////// // v4int class - class v4int : public v4 { - + class v4int : public v4 + { // v4int prefix unary operator friends friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; @@ -409,33 +501,61 @@ namespace v4 { // v4float miscellaneous friends - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; public: // v4int constructors / destructors - + v4int() {} // Default constructor - v4int( const v4int &a ) { v = a.v; } // Copy constructor - v4int( const v4 &a ) { v = a.v; } // Init from mixed - v4int( int a ) { // Init from scalar - union { int i; float f; } u; + + v4int( const v4int &a ) // Copy constructor + { + v = a.v; + } + + v4int( const v4 &a ) // Init from mixed + { + v = a.v; + } + + v4int( int a ) // Init from scalar + { + union + { + int i; + float f; + } u; + u.i = a; - v = _mm_set1_ps( u.f ); + v = _mm_set1_ps( u.f ); } - v4int( int i0, int i1, int i2, int i3 ) { // Init from scalars - union { int i; float f; } u0, u1, u2, u3; - u0.i = i0; u1.i = i1; u2.i = i2; u3.i = i3; + + v4int( int i0, int i1, int i2, int i3 ) // Init from scalars + { + union + { + int i; + float f; + } u0, u1, u2, u3; + + u0.i = i0; + u1.i = i1; + u2.i = i2; + u3.i = i3; + v = _mm_setr_ps( 
u0.f, u1.f, u2.f, u3.f ); } - ~v4int() {}; // Destructor - + + ~v4int() {} // Destructor + // v4int assignment operators - -# define ASSIGN(op) \ - inline v4int &operator op( const v4int &b ) { \ + + #define ASSIGN(op) \ + inline v4int &operator op( const v4int &b ) \ + { \ i[0] op b.i[0]; \ i[1] op b.i[1]; \ i[2] op b.i[2]; \ @@ -443,121 +563,153 @@ namespace v4 { return *this; \ } - inline v4int &operator =(const v4int &b) { - v = b.v; - return *this; - } - ASSIGN(+=) ASSIGN(-=) ASSIGN(*=) ASSIGN(/=) ASSIGN(%=) + ASSIGN(<<=) + ASSIGN(>>=) - inline v4int &operator ^=(const v4int &b) { + #undef ASSIGN + + inline v4int &operator =( const v4int &b ) + { + v = b.v; + + return *this; + } + + inline v4int &operator ^=( const v4int &b ) + { v = _mm_xor_ps( v, b.v ); + return *this; } - inline v4int &operator &=(const v4int &b) { + inline v4int &operator &=( const v4int &b ) + { v = _mm_and_ps( v, b.v ); + return *this; } - inline v4int &operator |=(const v4int &b) { + inline v4int &operator |=( const v4int &b ) + { v = _mm_or_ps( v, b.v ); + return *this; } - ASSIGN(<<=) - ASSIGN(>>=) - -# undef ASSIGN - // v4int member access operator - - inline int &operator []( int n ) { return i[n]; } - inline int operator ()( int n ) { return i[n]; } + inline int &operator []( int n ) + { + return i[n]; + } + + inline int operator ()( int n ) + { + return i[n]; + } }; // v4int prefix unary operators -# define PREFIX_UNARY(op) \ - inline v4int operator op( const v4int & a ) { \ + #define PREFIX_UNARY(op) \ + inline v4int operator op( const v4int &a ) \ + { \ v4int b; \ - b.i[0] = (op a.i[0]); \ - b.i[1] = (op a.i[1]); \ - b.i[2] = (op a.i[2]); \ - b.i[3] = (op a.i[3]); \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ return b; \ } - inline v4int operator +( const v4int & a ) { + inline v4int operator +( const v4int &a ) + { v4int b; + b.v = a.v; + return b; } PREFIX_UNARY(-) - inline v4int operator !( const v4int & a ) 
{ + inline v4int operator !( const v4int &a ) + { v4int b; - b.i[0] = -(!a.i[0]); - b.i[1] = -(!a.i[1]); - b.i[2] = -(!a.i[2]); - b.i[3] = -(!a.i[3]); + + b.i[0] = - ( ! a.i[0] ); + b.i[1] = - ( ! a.i[1] ); + b.i[2] = - ( ! a.i[2] ); + b.i[3] = - ( ! a.i[3] ); + return b; } - inline v4int operator ~( const v4int & a ) { + inline v4int operator ~( const v4int &a ) + { v4int b; - union { int i; float f; } u; + + union + { + int i; + float f; + } u; + u.i = -1; + b.v = _mm_xor_ps( a.v, _mm_set1_ps( u.f ) ); + return b; } - -# undef PREFIX_UNARY + + #undef PREFIX_UNARY // v4int prefix increment / decrement -# define PREFIX_INCDEC(op) \ - inline v4int operator op( v4int & a ) { \ + #define PREFIX_INCDEC(op) \ + inline v4int operator op( v4int &a ) \ + { \ v4int b; \ - b.i[0] = (op a.i[0]); \ - b.i[1] = (op a.i[1]); \ - b.i[2] = (op a.i[2]); \ - b.i[3] = (op a.i[3]); \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ return b; \ } PREFIX_INCDEC(++) PREFIX_INCDEC(--) -# undef PREFIX_INCDEC + #undef PREFIX_INCDEC // v4int postfix increment / decrement -# define POSTFIX_INCDEC(op) \ - inline v4int operator op( v4int & a, int ) { \ + #define POSTFIX_INCDEC(op) \ + inline v4int operator op( v4int &a, int ) \ + { \ v4int b; \ - b.i[0] = (a.i[0] op); \ - b.i[1] = (a.i[1] op); \ - b.i[2] = (a.i[2] op); \ - b.i[3] = (a.i[3] op); \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ return b; \ } POSTFIX_INCDEC(++) POSTFIX_INCDEC(--) -# undef POSTFIX_INCDEC + #undef POSTFIX_INCDEC // v4int binary operators - -# define BINARY(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) { \ + + #define BINARY(op) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ v4int c; \ c.i[0] = a.i[0] op b.i[0]; \ c.i[1] = a.i[1] op b.i[1]; \ @@ -571,39 +723,48 @@ namespace v4 { BINARY(*) BINARY(/) BINARY(%) + BINARY(<<) + BINARY(>>) - inline v4int 
operator ^( const v4int &a, const v4int &b ) { + #undef BINARY + + inline v4int operator ^( const v4int &a, const v4int &b ) + { v4int c; + c.v = _mm_xor_ps( a.v, b.v ); + return c; } - inline v4int operator &( const v4int &a, const v4int &b ) { + inline v4int operator &( const v4int &a, const v4int &b ) + { v4int c; + c.v = _mm_and_ps( a.v, b.v ); + return c; } - inline v4int operator |( const v4int &a, const v4int &b ) { + inline v4int operator |( const v4int &a, const v4int &b ) + { v4int c; + c.v = _mm_or_ps( a.v, b.v ); + return c; } - BINARY(<<) - BINARY(>>) - -# undef BINARY - // v4int logical operators -# define LOGICAL(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) { \ + #define LOGICAL(op) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ v4int c; \ - c.i[0] = -(a.i[0] op b.i[0]); \ - c.i[1] = -(a.i[1] op b.i[1]); \ - c.i[2] = -(a.i[2] op b.i[2]); \ - c.i[3] = -(a.i[3] op b.i[3]); \ + c.i[0] = - ( a.i[0] op b.i[0] ); \ + c.i[1] = - ( a.i[1] op b.i[1] ); \ + c.i[2] = - ( a.i[2] op b.i[2] ); \ + c.i[3] = - ( a.i[3] op b.i[3] ); \ return c; \ } @@ -615,44 +776,58 @@ namespace v4 { LOGICAL(>=) LOGICAL(&&) LOGICAL(||) - -# undef LOGICAL + + #undef LOGICAL // v4int miscellaneous functions - inline v4int abs( const v4int &a ) { + inline v4int abs( const v4int &a ) + { v4int b; - b.i[0] = (a.i[0]>=0) ? a.i[0] : -a.i[0]; - b.i[1] = (a.i[1]>=0) ? a.i[1] : -a.i[1]; - b.i[2] = (a.i[2]>=0) ? a.i[2] : -a.i[2]; - b.i[3] = (a.i[3]>=0) ? a.i[3] : -a.i[3]; + + b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; + b.i[1] = ( a.i[1] >= 0 ) ? a.i[1] : -a.i[1]; + b.i[2] = ( a.i[2] >= 0 ) ? a.i[2] : -a.i[2]; + b.i[3] = ( a.i[3] >= 0 ) ? 
a.i[3] : -a.i[3]; + return b; } - inline v4 czero( const v4int &c, const v4 &a ) { + inline v4 czero( const v4int &c, const v4 &a ) + { v4 b; - b.v = _mm_andnot_ps(c.v,a.v); + + b.v = _mm_andnot_ps( c.v, a.v ); + return b; } - inline v4 notczero( const v4int &c, const v4 &a ) { + inline v4 notczero( const v4int &c, const v4 &a ) + { v4 b; - b.v = _mm_and_ps(c.v,a.v); + + b.v = _mm_and_ps( c.v, a.v ); + return b; } - - inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) { - __m128 c_v = c.v; + + inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) + { v4 tf; - tf.v = _mm_or_ps(_mm_andnot_ps(c_v,f.v),_mm_and_ps(c_v,t.v)); + + __m128 c_v = c.v; + + tf.v = _mm_or_ps( _mm_andnot_ps( c_v, f.v ), + _mm_and_ps( c_v, t.v ) ); + return tf; } //////////////// // v4float class - class v4float : public v4 { - + class v4float : public v4 + { // v4float prefix unary operator friends friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; @@ -691,9 +866,9 @@ namespace v4 { // v4float math library friends -# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE + #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE + #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + const v4float &b ) ALWAYS_INLINE CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); @@ -703,192 +878,252 @@ namespace v4 { CMATH_FR2(copysign); -# undef CMATH_FR1 -# undef CMATH_FR2 + #undef CMATH_FR1 + #undef CMATH_FR2 // v4float miscellaneous friends friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rsqrt( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rsqrt ( const v4float &a ) ALWAYS_INLINE; friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; - friend inline 
v4float rcp( const v4float &a ) ALWAYS_INLINE; - friend inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float rcp ( const v4float &a ) ALWAYS_INLINE; + friend inline v4float fma ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fms ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - // FIXME: crack + friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; - + public: // v4float constructors / destructors - + v4float() {} // Default constructor - v4float( const v4float &a ) { v = a.v; } // Copy constructor - v4float( const v4 &a ) { v = a.v; } // Init from mixed - v4float( float a ) { // Init from scalar + + v4float( const v4float &a ) // Copy constructor + { + v = a.v; + } + + v4float( const v4 &a ) // Init from mixed + { + v = a.v; + } + + v4float( float a ) // Init from scalar + { v = _mm_set1_ps( a ); } - v4float( float f0, float f1, float f2, float f3 ) { // 
Init from scalars + + v4float( float f0, float f1, float f2, float f3 ) // Init from scalars + { v = _mm_setr_ps( f0, f1, f2, f3 ); } + ~v4float() {} // Destructor // v4float assignment operators -# define ASSIGN(op,intrin) \ - inline v4float &operator op(const v4float &b) { \ - v = intrin(v,b.v); \ - return *this; \ + #define ASSIGN(op,intrin) \ + inline v4float &operator op( const v4float &b ) \ + { \ + v = intrin( v, b.v ); \ + return *this; \ } - inline v4float &operator =(const v4float &b) { + ASSIGN( +=, _mm_add_ps ) + ASSIGN( -=, _mm_sub_ps ) + ASSIGN( *=, _mm_mul_ps ) + ASSIGN( /=, _mm_div_ps ) + + #undef ASSIGN + + inline v4float &operator =( const v4float &b ) + { v = b.v; + return *this; } - ASSIGN(+=,_mm_add_ps) - ASSIGN(-=,_mm_sub_ps) - ASSIGN(*=,_mm_mul_ps) - ASSIGN(/=,_mm_div_ps) - -# undef ASSIGN - // v4float member access operator - inline float &operator []( int n ) { return f[n]; } - inline float operator ()( int n ) { return f[n]; } + inline float &operator []( int n ) + { + return f[n]; + } + inline float operator ()( int n ) + { + return f[n]; + } }; // v4float prefix unary operators - inline v4float operator +( const v4float &a ) { + inline v4float operator +( const v4float &a ) + { v4float b; + b.v = a.v; + return b; } - inline v4float operator -( const v4float &a ) { + inline v4float operator -( const v4float &a ) + { v4float b; - b.v = _mm_sub_ps(_mm_setzero_ps(),a.v); + + b.v = _mm_sub_ps( _mm_setzero_ps(), a.v ); + return b; } - inline v4int operator !( const v4float &a ) { + inline v4int operator !( const v4float &a ) + { v4int b; - b.v = _mm_cmpeq_ps(_mm_setzero_ps(),a.v); + + b.v = _mm_cmpeq_ps( _mm_setzero_ps(), a.v ); + return b; } // v4float prefix increment / decrement operators - inline v4float operator ++( v4float &a ) { + inline v4float operator ++( v4float &a ) + { v4float b; + __m128 t = _mm_add_ps( a.v, _mm_set1_ps( 1 ) ); + a.v = t; b.v = t; + return b; } - inline v4float operator --( v4float &a ) { + inline v4float 
operator --( v4float &a ) + { v4float b; + __m128 t = _mm_sub_ps( a.v, _mm_set1_ps( 1 ) ); + a.v = t; b.v = t; + return b; } // v4float postfix increment / decrement operators - inline v4float operator ++( v4float &a, int ) { + inline v4float operator ++( v4float &a, int ) + { v4float b; + __m128 a_v = a.v; + a.v = _mm_add_ps( a_v, _mm_set1_ps( 1 ) ); b.v = a_v; + return b; } - inline v4float operator --( v4float &a, int ) { + inline v4float operator --( v4float &a, int ) + { v4float b; + __m128 a_v = a.v; - a.v = _mm_sub_ps(a_v, _mm_set1_ps( 1 ) ); + + a.v = _mm_sub_ps( a_v, _mm_set1_ps( 1 ) ); b.v = a_v; + return b; } // v4float binary operators - -# define BINARY(op,intrin) \ - inline v4float operator op( const v4float &a, const v4float &b ) { \ + + #define BINARY(op,intrin) \ + inline v4float operator op( const v4float &a, const v4float &b ) \ + { \ v4float c; \ - c.v = intrin(a.v,b.v); \ + c.v = intrin( a.v, b.v ); \ return c; \ } - BINARY(+,_mm_add_ps) - BINARY(-,_mm_sub_ps) - BINARY(*,_mm_mul_ps) - BINARY(/,_mm_div_ps) + BINARY( +, _mm_add_ps ) + BINARY( -, _mm_sub_ps ) + BINARY( *, _mm_mul_ps ) + BINARY( /, _mm_div_ps ) -# undef BINARY + #undef BINARY // v4float logical operators -# define LOGICAL(op,intrin) \ - inline v4int operator op( const v4float &a, const v4float &b ) { \ + #define LOGICAL(op,intrin) \ + inline v4int operator op( const v4float &a, const v4float &b ) \ + { \ v4int c; \ - c.v = intrin(a.v,b.v); \ + c.v = intrin( a.v, b.v ); \ return c; \ } - LOGICAL(<, _mm_cmplt_ps ) - LOGICAL(>, _mm_cmpgt_ps ) - LOGICAL(==,_mm_cmpeq_ps ) - LOGICAL(!=,_mm_cmpneq_ps) - LOGICAL(<=,_mm_cmple_ps ) - LOGICAL(>=,_mm_cmpge_ps ) + LOGICAL( <, _mm_cmplt_ps ) + LOGICAL( >, _mm_cmpgt_ps ) + LOGICAL( ==, _mm_cmpeq_ps ) + LOGICAL( <=, _mm_cmple_ps ) + LOGICAL( >=, _mm_cmpge_ps ) + LOGICAL( !=, _mm_cmpneq_ps ) - inline v4int operator &&( const v4float &a, const v4float &b ) { + #undef LOGICAL + + inline v4int operator &&( const v4float &a, const v4float &b ) + { 
v4int c; + __m128 vzero = _mm_setzero_ps(); - c.v = _mm_and_ps(_mm_cmpneq_ps(a.v,vzero),_mm_cmpneq_ps(b.v,vzero)); + + c.v = _mm_and_ps( _mm_cmpneq_ps( a.v, vzero ), + _mm_cmpneq_ps( b.v, vzero ) ); + return c; } - inline v4int operator ||( const v4float &a, const v4float &b ) { + inline v4int operator ||( const v4float &a, const v4float &b ) + { v4int c; + __m128 vzero = _mm_setzero_ps(); - c.v = _mm_or_ps(_mm_cmpneq_ps(a.v,vzero),_mm_cmpneq_ps(b.v,vzero)); + + c.v = _mm_or_ps( _mm_cmpneq_ps( a.v, vzero ), + _mm_cmpneq_ps( b.v, vzero ) ); + return c; } -# undef LOGICAL - // v4float math library functions -# define CMATH_FR1(fn) \ - inline v4float fn( const v4float &a ) { \ + #define CMATH_FR1(fn) \ + inline v4float fn( const v4float &a ) \ + { \ v4float b; \ - b.f[0] = ::fn(a.f[0]); \ - b.f[1] = ::fn(a.f[1]); \ - b.f[2] = ::fn(a.f[2]); \ - b.f[3] = ::fn(a.f[3]); \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ return b; \ } -# define CMATH_FR2(fn) \ - inline v4float fn( const v4float &a, const v4float &b ) { \ + #define CMATH_FR2(fn) \ + inline v4float fn( const v4float &a, const v4float &b ) \ + { \ v4float c; \ - c.f[0] = ::fn(a.f[0],b.f[0]); \ - c.f[1] = ::fn(a.f[1],b.f[1]); \ - c.f[2] = ::fn(a.f[2],b.f[2]); \ - c.f[3] = ::fn(a.f[3],b.f[3]); \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ return c; \ } @@ -898,148 +1133,230 @@ namespace v4 { CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) /*CMATH_FR1(sqrt)*/ CMATH_FR1(tan) CMATH_FR1(tanh) - inline v4float fabs( const v4float &a ) { + #undef CMATH_FR1 + #undef CMATH_FR2 + + inline v4float fabs( const v4float &a ) + { v4float b; - b.v = _mm_andnot_ps( _mm_set1_ps( -0.f ), a.v ); + + b.v = _mm_andnot_ps( _mm_set1_ps( -0.0f ), a.v ); + return b; } - inline v4float sqrt( const v4float &a ) { + inline v4float sqrt( const v4float &a ) 
+ { v4float b; - b.v = _mm_sqrt_ps(a.v); + + b.v = _mm_sqrt_ps( a.v ); + return b; } - inline v4float copysign( const v4float &a, const v4float &b ) { + inline v4float copysign( const v4float &a, const v4float &b ) + { v4float c; - __m128 t = _mm_set1_ps( -0.f ); - c.v = _mm_or_ps( _mm_and_ps( t, b.v ), _mm_andnot_ps( t, a.v ) ); + + __m128 t = _mm_set1_ps( -0.0f ); + + c.v = _mm_or_ps( _mm_and_ps( t, b.v ), + _mm_andnot_ps( t, a.v ) ); + return c; } -# undef CMATH_FR1 -# undef CMATH_FR2 + // v4float miscellaneous functions - // v4float miscelleanous functions - - inline v4float rsqrt_approx( const v4float &a ) { + inline v4float rsqrt_approx( const v4float &a ) + { v4float b; - b.v = _mm_rsqrt_ps(a.v); + + b.v = _mm_rsqrt_ps( a.v ); + return b; } - #if 0 - inline v4float rsqrt( const v4float &a ) { + inline v4float rsqrt( const v4float &a ) + { v4float b; + __m128 a_v = a.v, b_v; - b_v = _mm_rsqrt_ps(a_v); - // Note: It is quicker to just call div_ps and sqrt_ps if more - // refinement desired! 
- b.v = _mm_add_ps(b_v,_mm_mul_ps(_mm_set1_ps(0.5f), - _mm_sub_ps(b_v,_mm_mul_ps(a_v, - _mm_mul_ps(b_v, - _mm_mul_ps(b_v,b_v)))))); + + b_v = _mm_rsqrt_ps( a_v ); + + b.v = _mm_add_ps( b_v, _mm_mul_ps( _mm_set1_ps( 0.5f ), + _mm_sub_ps( b_v, + _mm_mul_ps( a_v, + _mm_mul_ps( b_v, + _mm_mul_ps( b_v, b_v ) + ) + ) + ) + ) + ); + return b; } - #endif - inline v4float rsqrt( const v4float &a ) { + #if 0 + inline v4float rsqrt( const v4float &a ) + { v4float b; - b.f[0] = ::sqrt( 1/a.f[0] ); - b.f[1] = ::sqrt( 1/a.f[1] ); - b.f[2] = ::sqrt( 1/a.f[2] ); - b.f[3] = ::sqrt( 1/a.f[3] ); + + b.f[0] = ::sqrt( 1 / a.f[0] ); + b.f[1] = ::sqrt( 1 / a.f[1] ); + b.f[2] = ::sqrt( 1 / a.f[2] ); + b.f[3] = ::sqrt( 1 / a.f[3] ); + return b; } + #endif - inline v4float rcp_approx( const v4float &a ) { + inline v4float rcp_approx( const v4float &a ) + { v4float b; - b.v = _mm_rcp_ps(a.v); + + b.v = _mm_rcp_ps( a.v ); + return b; } - #if 0 - inline v4float rcp( const v4float &a ) { + inline v4float rcp( const v4float &a ) + { v4float b; + __m128 a_v = a.v, b_v; - b_v = _mm_rcp_ps(a_v); - b.v = _mm_sub_ps(_mm_add_ps(b_v,b_v),_mm_mul_ps(a_v,_mm_mul_ps(b_v,b_v))); + + b_v = _mm_rcp_ps( a_v ); + + b.v = _mm_sub_ps( _mm_add_ps( b_v, b_v ), + _mm_mul_ps( a_v, + _mm_mul_ps( b_v, b_v ) + ) + ); + return b; } - #endif - inline v4float rcp( const v4float &a ) { + #if 0 + inline v4float rcp( const v4float &a ) + { v4float b; - b.f[0] = 1/a.f[0]; - b.f[1] = 1/a.f[1]; - b.f[2] = 1/a.f[2]; - b.f[3] = 1/a.f[3]; + + b.f[0] = 1 / a.f[0]; + b.f[1] = 1 / a.f[1]; + b.f[2] = 1 / a.f[2]; + b.f[3] = 1 / a.f[3]; + return b; } + #endif - inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) + { v4float d; + d.v = _mm_add_ps( _mm_mul_ps( a.v, b.v ), c.v ); + return d; } - inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fms( const v4float &a, const v4float &b, const 
v4float &c ) + { v4float d; + d.v = _mm_sub_ps( _mm_mul_ps( a.v, b.v ), c.v ); + return d; } - inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) + { v4float d; + d.v = _mm_sub_ps( c.v, _mm_mul_ps( a.v, b.v ) ); + return d; } - inline v4float clear_bits( const v4int &m, const v4float &a ) { + inline v4float clear_bits( const v4int &m, const v4float &a ) + { v4float b; + b.v = _mm_andnot_ps( m.v, a.v ); + return b; } - inline v4float set_bits( const v4int &m, const v4float &a ) { + inline v4float set_bits( const v4int &m, const v4float &a ) + { v4float b; + b.v = _mm_or_ps( m.v, a.v ); + return b; } - inline v4float toggle_bits( const v4int &m, const v4float &a ) { + inline v4float toggle_bits( const v4int &m, const v4float &a ) + { v4float b; + b.v = _mm_xor_ps( m.v, a.v ); + return b; } - inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) { + inline void increment_4x1( float * ALIGNED(16) p, + const v4float &a ) + { _mm_store_ps( p, _mm_add_ps( _mm_load_ps( p ), a.v ) ); } - inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) { + inline void decrement_4x1( float * ALIGNED(16) p, + const v4float &a ) + { _mm_store_ps( p, _mm_sub_ps( _mm_load_ps( p ), a.v ) ); } - inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) { + inline void scale_4x1( float * ALIGNED(16) p, + const v4float &a ) + { _mm_store_ps( p, _mm_mul_ps( _mm_load_ps( p ), a.v ) ); } // Given wl = x y z w, compute: // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) - inline void trilinear( v4float &wl, v4float &wh ) { - __m128 l = _mm_set1_ps(1), s = _mm_setr_ps(-0.f,+0.f,-0.f,+0.f); + inline void trilinear( v4float &wl, v4float &wh ) + { + __m128 l = _mm_set1_ps( 1.0f ); + + __m128 s = _mm_setr_ps( -0.0f, +0.0f, -0.0f, +0.0f ); + __m128 z = wl.v, xy; - xy = 
_mm_add_ps( l, _mm_xor_ps( s, _mm_shuffle_ps( z,z, PERM(0,0,1,1) ) ) ); - z = _mm_add_ps( l, _mm_xor_ps( s, _mm_shuffle_ps( z,z, PERM(2,2,2,2) ) ) ); - xy = _mm_mul_ps( _mm_shuffle_ps( xy,xy, PERM(0,1,0,1) ), - _mm_shuffle_ps( xy,xy, PERM(2,2,3,3) ) ); - wl.v = _mm_mul_ps( xy, _mm_shuffle_ps( z,z, PERM(0,0,0,0) ) ); - wh.v = _mm_mul_ps( xy, _mm_shuffle_ps( z,z, PERM(1,1,1,1) ) ); + + xy = _mm_add_ps( l, + _mm_xor_ps( s, + _mm_shuffle_ps( z, z, PERM(0,0,1,1) ) + ) + ); + + z = _mm_add_ps( l, + _mm_xor_ps( s, + _mm_shuffle_ps( z, z, PERM(2,2,2,2) ) + ) + ); + + xy = _mm_mul_ps( _mm_shuffle_ps( xy, xy, PERM(0,1,0,1) ), + _mm_shuffle_ps( xy, xy, PERM(2,2,3,3) ) ); + + wl.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM(0,0,0,0) ) ); + + wh.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM(1,1,1,1) ) ); } -# undef PERM + #undef PERM } // namespace v4 From d938848c6e942aa2871f6c81a3b79a1c5ed0f3f2 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 5 Aug 2019 10:48:42 -0600 Subject: [PATCH 43/95] Fix issue with undefined symbol. --- src/util/v4/v4_neon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 0152ad2b..d20dfc67 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -1039,7 +1039,7 @@ namespace v4 ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); + tf.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); return tf; } From 78aa8a4f00f85f95b17dc3ec4f8a94e2650c94e1 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 5 Aug 2019 12:18:07 -0600 Subject: [PATCH 44/95] Try a fix to a unit test failure for the Altivec case. 
--- src/util/v4/v4_altivec.h | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/src/util/v4/v4_altivec.h b/src/util/v4/v4_altivec.h index f1361278..d9438fc1 100644 --- a/src/util/v4/v4_altivec.h +++ b/src/util/v4/v4_altivec.h @@ -287,20 +287,37 @@ namespace v4 v4 &a, v4 &b ) { - _v4_float r = vec_ld( 0, (const float *) a0 ); // r = 0 1 2 3 - _v4_float s = vec_ld( 0, (const float *) a1 ); // s = 4 5 6 7 - _v4_float t = vec_ld( 0, (const float *) a2 ); // t = 8 9 10 11 - _v4_float u = vec_ld( 0, (const float *) a3 ); // u = 12 13 14 15 + _v4_float r, s, t, u; + + a.v = vec_ld( 0, (const float *) a0 ); // a = 0 1 2 3 + b.v = vec_ld( 0, (const float *) a1 ); // b = 4 5 6 7 + t = vec_ld( 0, (const float *) a2 ); // c = 8 9 10 11 + u = vec_ld( 0, (const float *) a3 ); // d = 12 13 14 15 + + // Step 1: Interleave top and bottom half + + r = vec_mergeh( a.v, t ); // r = 0 8 1 9 + s = vec_mergeh( b.v, u ); // s = 4 12 5 13 + + // Step 2: Interleave even and odd rows + + a.v = vec_mergeh( r, s ); // a = 0 4 8 12 + b.v = vec_mergel( r, s ); // b = 1 5 9 13 + + // _v4_float r = vec_ld( 0, (const float *) a0 ); // r = 0 1 2 3 + // _v4_float s = vec_ld( 0, (const float *) a1 ); // s = 4 5 6 7 + // _v4_float t = vec_ld( 0, (const float *) a2 ); // t = 8 9 10 11 + // _v4_float u = vec_ld( 0, (const float *) a3 ); // u = 12 13 14 15 // Step 1: Interleave top and bottom half - _v4_float v = vec_mergeh( r, t ); // v = 0 8 1 9 - _v4_float w = vec_mergeh( s, u ); // w = 4 12 5 13 + // _v4_float w = vec_mergeh( r, t ); // v = 0 8 1 9 + // _v4_float x = vec_mergeh( s, u ); // w = 4 12 5 13 // Step 2: Interleave even and odd rows - a.v = vec_mergeh( v, w ); // a = 0 4 8 12 - b.v = vec_mergel( v, w ); // b = 1 5 9 13 + // a.v = vec_mergeh( w, x ); // a = 0 4 8 12 + // b.v = vec_mergel( w, x ); // b = 1 5 9 13 } inline void load_4x3_tr( const void * ALIGNED(16) a0, From 562ec80a5e286fae704545653d37fde1ea8f88ef Mon Sep 17 00:00:00 2001 From: 
Dave Nystrom Date: Mon, 5 Aug 2019 14:52:06 -0600 Subject: [PATCH 45/95] Modify TEST_CASE_load_4x2_tr to see if I can get it to pass on IBM with Altivec. --- src/util/v4/test/v4.cc | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/util/v4/test/v4.cc b/src/util/v4/test/v4.cc index 73a51540..d2945cea 100644 --- a/src/util/v4/test/v4.cc +++ b/src/util/v4/test/v4.cc @@ -222,22 +222,24 @@ TEST_CASE("TEST_CASE_load_4x1_tr", "[v4]") { } // TEST_CASE TEST_CASE("TEST_CASE_load_4x2_tr", "[v4]") { - DECLARE_ALIGNED_ARRAY( int, 16, mem, 16 ); + DECLARE_ALIGNED_ARRAY( int, 16, mem, 32 ); v4int a0, a1, a2, a3; int i; - for( i=0; i<16; i++ ) mem[i] = i; - load_4x2_tr(mem, mem+4,mem+8, mem+12,a0,a1); - load_4x2_tr(mem+2,mem+6,mem+10,mem+14,a2,a3); - for( i=0; i<16; i++ ) if( mem[i]!=i ) break; + for( i=0; i<32; i++ ) mem[i] = i; + load_4x2_tr(mem, mem+4, mem+8, mem+12,a0,a1); + load_4x2_tr(mem+16,mem+20,mem+24,mem+28,a2,a3); + for( i=0; i<32; i++ ) if( mem[i]!=i ) break; //ASSERT_FALSE( any(a0!=v4int( 0, 4, 8,12)) || any(a1!=v4int( 1, 5, 9,13)) || //any(a2!=v4int( 2, 6,10,14)) || any(a3!=v4int( 3, 7,11,15)) || i!=16 ); REQUIRE( any(a0==v4int( 0, 4, 8,12)) ); REQUIRE( any(a1==v4int( 1, 5, 9,13)) ); - REQUIRE( any(a2==v4int( 2, 6,10,14)) ); - REQUIRE( any(a3==v4int( 3, 7,11,15)) ); - REQUIRE( i==16 ); + REQUIRE( any(a2==v4int(16,20,24,28)) ); + REQUIRE( any(a3==v4int(17,21,25,29)) ); + // REQUIRE( any(a2==v4int( 2, 6,10,14)) ); + // REQUIRE( any(a3==v4int( 3, 7,11,15)) ); + REQUIRE( i==32 ); } // TEST_CASE From cbcc9582a265747fae6fa3459b4c1a805d687ce9 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 5 Aug 2019 18:09:17 -0600 Subject: [PATCH 46/95] Remove a test hack. 
--- src/sf_interface/sf_interface.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/sf_interface/sf_interface.h b/src/sf_interface/sf_interface.h index fd9b72eb..6dc86883 100644 --- a/src/sf_interface/sf_interface.h +++ b/src/sf_interface/sf_interface.h @@ -52,15 +52,6 @@ #endif -// Temporary hack. -#ifdef V4_NEON_ACCELERATION_SNOUT - -#define PAD_SIZE_INTERPOLATOR 14 -#define PAD_SIZE_ACCUMULATOR 4 -#define PAD_SIZE_HYDRO 2 - -#endif - /*****************************************************************************/ // Interpolator arrays shall be a (nx+2) x (ny+2) x (nz+2) allocation From 10c2adba8404b2a0fefad398b8b72094e5ec2f81 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 5 Aug 2019 19:30:41 -0600 Subject: [PATCH 47/95] Remove some experimental NEON intrinsic code. --- .../standard/pipeline/center_p_pipeline_v4.cc | 146 ------------------ .../pipeline/uncenter_p_pipeline_v4.cc | 146 ------------------ 2 files changed, 292 deletions(-) diff --git a/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc b/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc index 2a25611f..dc6d5e18 100644 --- a/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc +++ b/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc @@ -6,150 +6,6 @@ using namespace v4; -#ifdef V4_NEON_ACCELERATION_SNOUT - -void -center_p_pipeline_v4( center_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) -{ - const interpolator_t * ALIGNED(128) f0 = args->f0; - - particle_t * ALIGNED(128) p; - - const float * ALIGNED(16) vp00; - const float * ALIGNED(16) vp01; - const float * ALIGNED(16) vp02; - const float * ALIGNED(16) vp03; - - const v4float qdt_2mc( args->qdt_2mc); - const v4float qdt_4mc(0.5*args->qdt_2mc); // For half Boris rotate. 
- const v4float one(1.0); - const v4float one_third(1.0/3.0); - const v4float two_fifteenths(2.0/15.0); - - v4float dx, dy, dz, ux, uy, uz, q; - v4float hax, hay, haz, cbx, cby, cbz; - v4float v00, v01, v02, v03, v04, v05; - v4float v06, v07, v08, v09, v10; - v4int ii; - - int itmp, nq; - - // Determine which particle blocks this pipeline processes. - - DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, itmp, nq ); - - p = args->p0 + itmp; - - nq >>= 2; - - // Process the particle blocks for this pipeline. - - for( ; nq; nq--, p+=4 ) - { - //-------------------------------------------------------------------------- - // Load particle position data. - //-------------------------------------------------------------------------- - load_4x8_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, - dx, dy, dz, ii, ux, uy, uz, q ); - - // load_4x4_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, - // dx, dy, dz, ii ); - - //-------------------------------------------------------------------------- - // Set field interpolation pointers. - //-------------------------------------------------------------------------- - vp00 = ( const float * ALIGNED(16) ) ( f0 + ii(0) ); - vp01 = ( const float * ALIGNED(16) ) ( f0 + ii(1) ); - vp02 = ( const float * ALIGNED(16) ) ( f0 + ii(2) ); - vp03 = ( const float * ALIGNED(16) ) ( f0 + ii(3) ); - - //-------------------------------------------------------------------------- - // Load interpolation data for particles. - //-------------------------------------------------------------------------- - load_4x16_tr( vp00, vp01, vp02, vp03, - hax, v00, v01, v02, - hay, v03, v04, v05, - haz, v06, v07, v08, - cbx, v09, cby, v10 ); - - // load_4x4_tr( vp00, vp01, vp02, vp03, - // hax, v00, v01, v02 ); - - hax = qdt_2mc * fma( fma( dy, v02, v01 ), dz, fma( dy, v00, hax ) ); - - //-------------------------------------------------------------------------- - // Load interpolation data for particles. 
- //-------------------------------------------------------------------------- - // load_4x4_tr( vp00+4, vp01+4, vp02+4, vp03+4, - // hay, v03, v04, v05 ); - - hay = qdt_2mc * fma( fma( dz, v05, v04 ), dx, fma( dz, v03, hay ) ); - - //-------------------------------------------------------------------------- - // Load interpolation data for particles. - //-------------------------------------------------------------------------- - // load_4x4_tr( vp00+8, vp01+8, vp02+8, vp03+8, - // haz, v00, v01, v02 ); - - haz = qdt_2mc * fma( fma( dx, v08, v07 ), dy, fma( dx, v06, haz ) ); - - //-------------------------------------------------------------------------- - // Load interpolation data for particles. - //-------------------------------------------------------------------------- - // load_4x4_tr( vp00+12, vp01+12, vp02+12, vp03+12, - // cbx, v03, cby, v04 ); - - cbx = fma( v09, dx, cbx ); - cby = fma( v10, dy, cby ); - - //-------------------------------------------------------------------------- - // Load interpolation data for particles, final. - //-------------------------------------------------------------------------- - load_4x2_tr( vp00+16, vp01+16, vp02+16, vp03+16, - cbz, v05 ); - - cbz = fma( v05, dz, cbz ); - - //-------------------------------------------------------------------------- - // Load particle momentum data. Could use load_4x3_tr. - //-------------------------------------------------------------------------- - // load_4x4_tr( &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux, - // ux, uy, uz, q ); - - //-------------------------------------------------------------------------- - // Update momentum. 
- //-------------------------------------------------------------------------- - ux += hax; - uy += hay; - uz += haz; - - v00 = qdt_4mc * rsqrt( one + fma( ux, ux, fma( uy, uy, uz * uz ) ) ); - v01 = fma( cbx, cbx, fma( cby, cby, cbz * cbz ) ); - v02 = ( v00 * v00 ) * v01; - v03 = v00 * fma( v02, fma( v02, two_fifteenths, one_third ), one ); - v04 = v03 * rcp( fma( v03 * v03, v01, one ) ); - v04 += v04; - - v00 = fma( fms( uy, cbz, uz * cby ), v03, ux ); - v01 = fma( fms( uz, cbx, ux * cbz ), v03, uy ); - v02 = fma( fms( ux, cby, uy * cbx ), v03, uz ); - - ux = fma( fms( v01, cbz, v02 * cby ), v04, ux ); - uy = fma( fms( v02, cbx, v00 * cbz ), v04, uy ); - uz = fma( fms( v00, cby, v01 * cbx ), v04, uz ); - - //-------------------------------------------------------------------------- - // Store particle momentum data. Could use store_4x3_tr. - //-------------------------------------------------------------------------- - store_4x4_tr( ux, uy, uz, q, - &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux ); - } -} - -#else - void center_p_pipeline_v4( center_p_pipeline_args_t * args, int pipeline_rank, @@ -280,8 +136,6 @@ center_p_pipeline_v4( center_p_pipeline_args_t * args, } } -#endif - #else void diff --git a/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc b/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc index d4bfc425..3be32773 100644 --- a/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc +++ b/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc @@ -6,150 +6,6 @@ using namespace v4; -#ifdef V4_NEON_ACCELERATION_SNOUT - -void -uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) -{ - const interpolator_t * ALIGNED(128) f0 = args->f0; - - particle_t * ALIGNED(128) p; - - const float * ALIGNED(16) vp00; - const float * ALIGNED(16) vp01; - const float * ALIGNED(16) vp02; - const float * ALIGNED(16) vp03; - - const v4float qdt_2mc( -args->qdt_2mc); // For backward half advance. 
- const v4float qdt_4mc(-0.5*args->qdt_2mc); // For backward half Boris rotate. - const v4float one(1.0); - const v4float one_third(1.0/3.0); - const v4float two_fifteenths(2.0/15.0); - - v4float dx, dy, dz, ux, uy, uz, q; - v4float hax, hay, haz, cbx, cby, cbz; - v4float v00, v01, v02, v03, v04, v05; - v4float v06, v07, v08, v09, v10; - v4int ii; - - int first, nq; - - // Determine which particle quads this pipeline processes. - - DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, first, nq ); - - p = args->p0 + first; - - nq >>= 2; - - // Process the particle quads for this pipeline. - - for( ; nq; nq--, p+=4 ) - { - //-------------------------------------------------------------------------- - // Load particle position data. - //-------------------------------------------------------------------------- - load_4x8_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, - dx, dy, dz, ii, ux, uy, uz, q ); - - // load_4x4_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, - // dx, dy, dz, ii ); - - //-------------------------------------------------------------------------- - // Set field interpolation pointers. - //-------------------------------------------------------------------------- - vp00 = ( const float * ALIGNED(16) ) ( f0 + ii(0) ); - vp01 = ( const float * ALIGNED(16) ) ( f0 + ii(1) ); - vp02 = ( const float * ALIGNED(16) ) ( f0 + ii(2) ); - vp03 = ( const float * ALIGNED(16) ) ( f0 + ii(3) ); - - //-------------------------------------------------------------------------- - // Load interpolation data for particles. 
- //-------------------------------------------------------------------------- - load_4x16_tr( vp00, vp01, vp02, vp03, - hax, v00, v01, v02, - hay, v03, v04, v05, - haz, v06, v07, v08, - cbx, v09, cby, v10 ); - - // load_4x4_tr( vp00, vp01, vp02, vp03, - // hax, v00, v01, v02 ); - - hax = qdt_2mc * fma( fma( dy, v02, v01 ), dz, fma( dy, v00, hax ) ); - - //-------------------------------------------------------------------------- - // Load interpolation data for particles. - //-------------------------------------------------------------------------- - // load_4x4_tr( vp00+4, vp01+4, vp02+4, vp03+4, - // hay, v03, v04, v05 ); - - hay = qdt_2mc * fma( fma( dz, v05, v04 ), dx, fma( dz, v03, hay ) ); - - //-------------------------------------------------------------------------- - // Load interpolation data for particles. - //-------------------------------------------------------------------------- - // load_4x4_tr( vp00+8, vp01+8, vp02+8, vp03+8, - // haz, v00, v01, v02 ); - - haz = qdt_2mc * fma( fma( dx, v08, v07 ), dy, fma( dx, v06, haz ) ); - - //-------------------------------------------------------------------------- - // Load interpolation data for particles. - //-------------------------------------------------------------------------- - // load_4x4_tr( vp00+12, vp01+12, vp02+12, vp03+12, - // cbx, v03, cby, v04 ); - - cbx = fma( v09, dx, cbx ); - cby = fma( v10, dy, cby ); - - //-------------------------------------------------------------------------- - // Load interpolation data for particles, final. - //-------------------------------------------------------------------------- - load_4x2_tr( vp00+16, vp01+16, vp02+16, vp03+16, - cbz, v05 ); - - cbz = fma( v05, dz, cbz ); - - //-------------------------------------------------------------------------- - // Load particle momentum data. Could use load_4x3_tr. 
- //-------------------------------------------------------------------------- - // load_4x4_tr( &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux, - // ux, uy, uz, q ); - - //-------------------------------------------------------------------------- - // Update momentum. - //-------------------------------------------------------------------------- - v00 = qdt_4mc * rsqrt( one + fma( ux, ux, fma( uy, uy, uz * uz ) ) ); - v01 = fma( cbx, cbx, fma( cby, cby, cbz * cbz ) ); - v02 = ( v00 * v00 ) * v01; - v03 = v00 * fma( v02, fma( v02, two_fifteenths, one_third ), one ); - v04 = v03 * rcp( fma( v03 * v03, v01, one ) ); - v04 += v04; - - v00 = fma( fms( uy, cbz, uz * cby ), v03, ux ); - v01 = fma( fms( uz, cbx, ux * cbz ), v03, uy ); - v02 = fma( fms( ux, cby, uy * cbx ), v03, uz ); - - ux = fma( fms( v01, cbz, v02 * cby ), v04, ux ); - uy = fma( fms( v02, cbx, v00 * cbz ), v04, uy ); - uz = fma( fms( v00, cby, v01 * cbx ), v04, uz ); - - ux += hax; - uy += hay; - uz += haz; - - //-------------------------------------------------------------------------- - // Store particle data. Could use store_4x3_tr. - //-------------------------------------------------------------------------- - store_4x4_tr( ux, uy, uz, q, - &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux ); - } -} - -#else - void uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, int pipeline_rank, @@ -280,8 +136,6 @@ uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, } } -#endif - #else void From 23144b8ecb82e1b8803b566232053b4e25dd46d5 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 5 Aug 2019 19:43:06 -0600 Subject: [PATCH 48/95] Remove more code that was used for testing ARM NEON intrinsics version. 
--- src/vpic/initialize.cc | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/vpic/initialize.cc b/src/vpic/initialize.cc index 4961559b..8cc28da0 100644 --- a/src/vpic/initialize.cc +++ b/src/vpic/initialize.cc @@ -51,12 +51,6 @@ vpic_simulation::initialize( int argc, if( rank()==0 ) MESSAGE(( "Uncentering particles" )); TIC load_interpolator_array( interpolator_array, field_array ); TOC( load_interpolator, 1 ); } - LIST_FOR_EACH( sp, species_list ) TIC sort_p( sp ); TOC( sort_p, 1 ); - for( int iwdn = 0; iwdn < 100; iwdn++ ) - { - LIST_FOR_EACH( sp, species_list ) TIC uncenter_p( sp, interpolator_array ); TOC( uncenter_p, 1 ); - LIST_FOR_EACH( sp, species_list ) TIC center_p( sp, interpolator_array ); TOC( center_p, 1 ); - } LIST_FOR_EACH( sp, species_list ) TIC uncenter_p( sp, interpolator_array ); TOC( uncenter_p, 1 ); if( rank()==0 ) MESSAGE(( "Performing initial diagnostics" )); From 679b6bbaee09b99e3aef15606d3c4097e93d08b1 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 5 Aug 2019 19:52:16 -0600 Subject: [PATCH 49/95] Remove some dead code. 
--- src/util/v4/test/v4.cc | 59 ------------------------------------------ 1 file changed, 59 deletions(-) diff --git a/src/util/v4/test/v4.cc b/src/util/v4/test/v4.cc index d2945cea..b43ee5af 100644 --- a/src/util/v4/test/v4.cc +++ b/src/util/v4/test/v4.cc @@ -237,8 +237,6 @@ TEST_CASE("TEST_CASE_load_4x2_tr", "[v4]") { REQUIRE( any(a1==v4int( 1, 5, 9,13)) ); REQUIRE( any(a2==v4int(16,20,24,28)) ); REQUIRE( any(a3==v4int(17,21,25,29)) ); - // REQUIRE( any(a2==v4int( 2, 6,10,14)) ); - // REQUIRE( any(a3==v4int( 3, 7,11,15)) ); REQUIRE( i==32 ); } // TEST_CASE @@ -276,63 +274,6 @@ TEST_CASE("TEST_CASE_load_4x4_tr", "[v4]") { REQUIRE( i==16 ); } // TEST_CASE -#ifdef V4_NEON_ACCELERATION -TEST_CASE("TEST_CASE_load_4x8_tr", "[v4]") { - DECLARE_ALIGNED_ARRAY( int, 64, mem, 32 ); - v4int a0, a1, a2, a3, a4, a5, a6, a7; - int i; - for( i=0; i<32; i++ ) mem[i] = i; - load_4x8_tr(mem,mem+8,mem+16,mem+24,a0,a1,a2,a3,a4,a5,a6,a7); - for( i=0; i<32; i++ ) if( mem[i]!=i ) break; - //ASSERT_FALSE( any(a0!=v4int( 0, 4, 8,12)) || any(a1!=v4int( 1, 5, 9,13)) || - //any(a2!=v4int( 2, 6,10,14)) || any(a3!=v4int( 3, 7,11,15)) || i!=16 ); - - REQUIRE( any(a0==v4int( 0, 8, 16, 24 )) ); - REQUIRE( any(a1==v4int( 1, 9, 17, 25 )) ); - REQUIRE( any(a2==v4int( 2, 10, 18, 26 )) ); - REQUIRE( any(a3==v4int( 3, 11, 19, 27 )) ); - REQUIRE( any(a4==v4int( 4, 12, 20, 28 )) ); - REQUIRE( any(a5==v4int( 5, 13, 21, 29 )) ); - REQUIRE( any(a6==v4int( 6, 14, 22, 30 )) ); - REQUIRE( any(a7==v4int( 7, 15, 23, 31 )) ); - REQUIRE( i==32 ); -} // TEST_CASE -#endif - -#ifdef V4_NEON_ACCELERATION -TEST_CASE("TEST_CASE_load_4x16_tr", "[v4]") { - DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); - v4int a00, a01, a02, a03, a04, a05, a06, a07; - v4int a08, a09, a10, a11, a12, a13, a14, a15; - int i; - for( i=0; i<64; i++ ) mem[i] = i; - load_4x16_tr(mem,mem+16,mem+32,mem+48, - a00,a01,a02,a03,a04,a05,a06,a07, - a08,a09,a10,a11,a12,a13,a14,a15); - for( i=0; i<64; i++ ) if( mem[i]!=i ) break; - //ASSERT_FALSE( 
any(a0!=v4int( 0, 4, 8,12)) || any(a1!=v4int( 1, 5, 9,13)) || - //any(a2!=v4int( 2, 6,10,14)) || any(a3!=v4int( 3, 7,11,15)) || i!=16 ); - - REQUIRE( any(a00==v4int( 0, 16, 32, 48 )) ); - REQUIRE( any(a01==v4int( 1, 17, 33, 49 )) ); - REQUIRE( any(a02==v4int( 2, 18, 34, 50 )) ); - REQUIRE( any(a03==v4int( 3, 19, 35, 51 )) ); - REQUIRE( any(a04==v4int( 4, 20, 36, 52 )) ); - REQUIRE( any(a05==v4int( 5, 21, 37, 53 )) ); - REQUIRE( any(a06==v4int( 6, 22, 38, 54 )) ); - REQUIRE( any(a07==v4int( 7, 23, 39, 55 )) ); - REQUIRE( any(a08==v4int( 8, 24, 40, 56 )) ); - REQUIRE( any(a09==v4int( 9, 25, 41, 57 )) ); - REQUIRE( any(a10==v4int( 10, 26, 42, 58 )) ); - REQUIRE( any(a11==v4int( 11, 27, 43, 59 )) ); - REQUIRE( any(a12==v4int( 12, 28, 44, 60 )) ); - REQUIRE( any(a13==v4int( 13, 29, 45, 61 )) ); - REQUIRE( any(a14==v4int( 14, 30, 46, 62 )) ); - REQUIRE( any(a15==v4int( 15, 31, 47, 63 )) ); - REQUIRE( i==64 ); -} // TEST_CASE -#endif - TEST_CASE("TEST_CASE_store_4x1_tr", "[v4]") { DECLARE_ALIGNED_ARRAY( int, 16, mem, 16 ); v4int a0( 0, 4, 8,12), a1( 1, 5, 9,13), a2( 2, 6,10,14), a3( 3, 7,11,15); From 41b7d3ee27992d17e90ebcd533ab14ff98bd2abc Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 5 Aug 2019 20:13:01 -0600 Subject: [PATCH 50/95] Do not disable dynamic resizing since it is not disabled in github. --- src/boundary/boundary_p.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/boundary/boundary_p.cc b/src/boundary/boundary_p.cc index a50d6657..cfcb699e 100644 --- a/src/boundary/boundary_p.cc +++ b/src/boundary/boundary_p.cc @@ -5,7 +5,7 @@ // If this is defined particle and mover buffers will not resize dynamically. // This is the common case for the users. -#define DISABLE_DYNAMIC_RESIZING +// #define DISABLE_DYNAMIC_RESIZING // FIXME: ARCHITECTURAL FLAW! CUSTOM BCS AND SHARED FACES CANNOT // COEXIST ON THE SAME FACE! 
THIS MEANS THAT CUSTOM BOUNDARYS MUST From 8599273b5cc242fa6643443ec47535e94010b0ed Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 13 Aug 2019 10:40:43 -0600 Subject: [PATCH 51/95] added global partition data to grid --- src/grid/grid.h | 13 ++++++++----- src/grid/partition.cc | 25 +++++++++++++++---------- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/src/grid/grid.h b/src/grid/grid.h index 7654fe94..3167c7e6 100644 --- a/src/grid/grid.h +++ b/src/grid/grid.h @@ -1,4 +1,4 @@ -/* +/* * Written by: * Kevin J. Bowers, Ph.D. * Plasma Physics Group (X-1) @@ -46,7 +46,7 @@ enum grid_enums { // B_tang -> Symmetric | B_tang -> Anti-symmetric // E_norm -> Symmetric | E_norm -> Anti-symmetric (see note) // div B -> Symmetric | div B -> Anti-symmetric - // + // // Note: B_norm is tricky. For a symmetry plane, B_norm on the // boundary must be zero as there are no magnetic charges (a // non-zero B_norm would imply an infinitesimal layer of magnetic @@ -80,7 +80,7 @@ typedef struct grid { int64_t step; // Current timestep double t0; // Simulation time corresponding to step 0 - // Phase 2 grid data structures + // Phase 2 grid data structures float x0, y0, z0; // Min corner local domain (must be coherent) float x1, y1, z1; // Max corner local domain (must be coherent) int nx, ny, nz; // Local voxel mesh resolution. Voxels are @@ -99,6 +99,9 @@ typedef struct grid { // 0 ... nproc-1 ... comm boundary condition // <0 ... locally applied boundary condition + int gpx, gpy, gpz = -1; // Store global processor decomposition to let us figure + // out where we are in the global decomposition + // Phase 3 grid data structures // NOTE: VOXEL INDEXING LIMITS NUMBER OF VOXELS TO 2^31 (INCLUDING // GHOSTS) PER NODE. NEIGHBOR INDEXING FURTHER LIMITS TO @@ -147,7 +150,7 @@ typedef struct grid { // inner loops.) 
// // This is written with seeming extraneously if tests in order to get -// the compiler to generate branceless conditional move and add +// the compiler to generate branceless conditional move and add // instructions (none of the branches below are actual branches in // assembly). @@ -311,7 +314,7 @@ end_send_port( int i, // x port coord ([-1,0,1]) // ordering (e.g. inner loop increments x-index). // // jobs are indexed from 0 to n_job-1. jobs are _always_ have the -// number of voxels an integer multiple of the bundle size. If job +// number of voxels an integer multiple of the bundle size. If job // is set to n_job, this function will determine the parameters of // the final incomplete bundle. diff --git a/src/grid/partition.cc b/src/grid/partition.cc index 96664b78..fc554c2d 100644 --- a/src/grid/partition.cc +++ b/src/grid/partition.cc @@ -1,4 +1,4 @@ -/* +/* * Written by: * Kevin J. Bowers, Ph.D. * Plasma Physics Group (X-1) @@ -39,7 +39,7 @@ partition_periodic_box( grid_t * g, int gnx, int gny, int gnz, int gpx, int gpy, int gpz ) { double f; - int rank, px, py, pz; + int rank, px, py, pz; // Make sure the grid can be setup @@ -55,6 +55,11 @@ partition_periodic_box( grid_t * g, // Setup basic variables RANK_TO_INDEX( world_rank, px,py,pz ); + // Capture global processor decomposition + g->gpx = gpx; + g->gpy = gpy; + g->gpz = gpz; + g->dx = (gx1-gx0)/(double)gnx; g->dy = (gy1-gy0)/(double)gny; g->dz = (gz1-gz0)/(double)gnz; @@ -96,7 +101,7 @@ partition_absorbing_box( grid_t * g, int gnx, int gny, int gnz, int gpx, int gpy, int gpz, int pbc ) { - int px, py, pz; + int px, py, pz; partition_periodic_box( g, gx0, gy0, gz0, @@ -108,30 +113,30 @@ partition_absorbing_box( grid_t * g, RANK_TO_INDEX( world_rank, px,py,pz ); - if( px==0 && gnx>1 ) { + if( px==0 && gnx>1 ) { set_fbc(g,BOUNDARY(-1,0,0),absorb_fields); set_pbc(g,BOUNDARY(-1,0,0),pbc); - } + } if( px==gpx-1 && gnx>1 ) { set_fbc(g,BOUNDARY( 1,0,0),absorb_fields); set_pbc(g,BOUNDARY( 1,0,0),pbc); } - if( 
py==0 && gny>1 ) { + if( py==0 && gny>1 ) { set_fbc(g,BOUNDARY(0,-1,0),absorb_fields); set_pbc(g,BOUNDARY(0,-1,0),pbc); - } + } if( py==gpy-1 && gny>1 ) { set_fbc(g,BOUNDARY(0, 1,0),absorb_fields); set_pbc(g,BOUNDARY(0, 1,0),pbc); } - if( pz==0 && gnz>1 ) { + if( pz==0 && gnz>1 ) { set_fbc(g,BOUNDARY(0,0,-1),absorb_fields); set_pbc(g,BOUNDARY(0,0,-1),pbc); - } + } if( pz==gpz-1 && gnz>1 ) { set_fbc(g,BOUNDARY(0,0, 1),absorb_fields); @@ -148,7 +153,7 @@ partition_metal_box( grid_t * g, double gx1, double gy1, double gz1, int gnx, int gny, int gnz, int gpx, int gpy, int gpz ) { - int px, py, pz; + int px, py, pz; partition_periodic_box( g, gx0, gy0, gz0, From 08a2d2d84596d16240119ec64239066b3a43c1c5 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 13 Aug 2019 12:11:40 -0600 Subject: [PATCH 52/95] first pass adding a compiling port of viou for HDF5 vpic IO --- CMakeLists.txt | 56 +-- sample/harrisHDF5 | 432 +++++++++++++++++ src/vpic/dump.cc | 899 +++++++++++++++++++++++++++++++++++- src/vpic/hdf5_header_info.h | 259 +++++++++++ src/vpic/vpic.h | 8 + 5 files changed, 1619 insertions(+), 35 deletions(-) create mode 100644 sample/harrisHDF5 create mode 100644 src/vpic/hdf5_header_info.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 7fcc1027..f7fd9d84 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,10 +73,10 @@ option(USE_LEGACY_SORT "Enable Legacy Sort Implementation" OFF) option(VPIC_PRINT_MORE_DIGITS "Print more digits in VPIC timer info" OFF) -option(ENABLE_OPENSSL "Enable OpenSSL support for checksums" OFF) - option(DISABLE_DYNAMIC_RESIZING "Prevent particle arrays from dynamically resizing during a run" OFF) +option(USE_HDF5 "Enable HDF5 for use during IO. 
VPIC does not help you install HDF5" OFF) + # option to set minimum number of particles set(SET_MIN_NUM_PARTICLES AUTO CACHE STRING "Select minimum number of particles to use, if using dynamic particle array resizing") @@ -114,24 +114,11 @@ if(NOT SET_MIN_NUM_PARTICLES STREQUAL "AUTO") add_definitions(-DMIN_NP=${SET_MIN_NUM_PARTICLES}) endif() -#------------------------------------------------------------------------------# -# OpenSSL -#------------------------------------------------------------------------------# - -if(ENABLE_OPENSSL) - find_package(OpenSSL REQUIRED) - - include_directories(${OPENSSL_INCLUDE_DIR}) - string(REPLACE ";" " " string_libraries "${OPENSSL_LIBRARIES}") - set(VPIC_CXX_LIBRARIES "${VPIC_CXX_LIBRARIES} ${string_libraries}") -endif(ENABLE_OPENSSL) - find_package(Threads REQUIRED) #------------------------------------------------------------------------------# # Act on build options set in project.cmake #------------------------------------------------------------------------------# - #------------------------------------------------------------------------------# # Add options for building with the legacy particle sort implementation. #------------------------------------------------------------------------------# @@ -277,10 +264,6 @@ endif() # Miscellaneous options. 
#------------------------------------------------------------------------------# -if(ENABLE_OPENSSL) - add_definitions(-DENABLE_OPENSSL) -endif(ENABLE_OPENSSL) - if(VPIC_PRINT_MORE_DIGITS) add_definitions(-DVPIC_PRINT_MORE_DIGITS) set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DVPIC_PRINT_MORE_DIGITS") @@ -323,18 +306,6 @@ install(FILES ${CMAKE_SOURCE_DIR}/deck/main.cc install(FILES ${CMAKE_SOURCE_DIR}/deck/wrapper.cc DESTINATION share/vpic) -# local script -configure_file(${CMAKE_SOURCE_DIR}/bin/vpic-local.in - ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic) - -file(COPY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic - DESTINATION ${CMAKE_BINARY_DIR}/bin - FILE_PERMISSIONS - OWNER_READ OWNER_WRITE OWNER_EXECUTE - GROUP_READ GROUP_EXECUTE - WORLD_READ WORLD_EXECUTE -) - #------------------------------------------------------------------------------# # Add library target #------------------------------------------------------------------------------# @@ -358,8 +329,29 @@ else() set(VPIC_SRC) install(TARGETS vpic LIBRARY DESTINATION lib ARCHIVE DESTINATION lib) endif() + +if(USE_HDF5) + # Enable HDF5, and the relevant defines + find_package(HDF5 REQUIRED) + add_definitions(-DVPIC_ENABLE_HDF5) + string(REPLACE ";" " " string_libraries "${HDF5_C_LIBRARIES}") + set(VPIC_CXX_LIBRARIES "${VPIC_CXX_LIBRARIES} ${string_libraries}") +endif(USE_HDF5) + +# Configure local script to generate bin/vpic +configure_file(${CMAKE_SOURCE_DIR}/bin/vpic-local.in + ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic) + +file(COPY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic + DESTINATION ${CMAKE_BINARY_DIR}/bin + FILE_PERMISSIONS + OWNER_READ OWNER_WRITE OWNER_EXECUTE + GROUP_READ GROUP_EXECUTE + WORLD_READ WORLD_EXECUTE +) + target_include_directories(vpic INTERFACE ${CMAKE_SOURCE_DIR}/src) -target_link_libraries(vpic ${VPIC_EXPOSE} ${MPI_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ${OPENSSL_LIBRARIES} ${CMAKE_DL_LIBS}) +target_link_libraries(vpic ${VPIC_EXPOSE} ${MPI_CXX_LIBRARIES} 
${CMAKE_THREAD_LIBS_INIT} ${CMAKE_DL_LIBS} ${HDF5_C_LIBRARIES}) target_compile_options(vpic ${VPIC_EXPOSE} ${MPI_C_COMPILE_FLAGS}) macro(build_a_vpic name deck) diff --git a/sample/harrisHDF5 b/sample/harrisHDF5 new file mode 100644 index 00000000..ff565f33 --- /dev/null +++ b/sample/harrisHDF5 @@ -0,0 +1,432 @@ +// Magnetic reconnection in a Harris equilibrium thin current sheet +// +// This input deck reproduces the PIC simulations found in: +// William Daughton. "Nonlinear dynamics of thin current sheets." Phys. +// Plasmas. 9(9): 3668-3678. September 2002. +// +// This input deck was written by: +// Kevin J Bowers, Ph.D. +// Plasma Physics Group (X-1) +// Applied Physics Division +// Los Alamos National Lab +// August 2003 - original version +// October 2003 - heavily revised to utilize input deck syntactic sugar +// March/April 2004 - rewritten for domain decomposition V4PIC + +// If you want to use global variables (for example, to store the dump +// intervals for your diagnostics section), it must be done in the globals +// section. Variables declared the globals section will be preserved across +// restart dumps. For example, if the globals section is: +// begin_globals { +// double variable; +// } end_globals +// the double "variable" will be visible to other input deck sections as +// "global->variable". Note: Variables declared in the globals section are set +// to zero before the user's initialization block is executed. Up to 16K +// of global variables can be defined. + +begin_globals { + double energies_interval; + double fields_interval; + double ehydro_interval; + double ihydro_interval; + double eparticle_interval; + double iparticle_interval; + double restart_interval; +}; + +begin_initialization { + // At this point, there is an empty grid and the random number generator is + // seeded with the rank. The grid, materials, species need to be defined. 
+ // Then the initial non-zero fields need to be loaded at time level 0 and the + // particles (position and momentum both) need to be loaded at time level 0. + + double input_mass_ratio; + int input_seed; + + // Arguments can be passed from the command line to the input deck + if( num_cmdline_arguments!=3 ) { + // Set sensible defaults + input_mass_ratio = 1.0; + input_seed = 0; + + sim_log( "Defaulting to mass_ratio of " << input_mass_ratio << " and seed of " << input_seed ); + sim_log( "For Custom Usage: " << cmdline_argument[0] << " mass_ratio seed" ); + } + else { + input_mass_ratio = atof(cmdline_argument[1]); // Ion mass / electron mass + input_seed = atof(cmdline_argument[2]); // Ion mass / electron mass + sim_log( "Detected input mass_ratio of " << input_mass_ratio << " and seed of " << input_seed ); + } + seed_entropy( input_seed ); + + // Diagnostic messages can be passed written (usually to stderr) + sim_log( "Computing simulation parameters"); + + // Define the system of units for this problem (natural units) + double L = 1; // Length normalization (sheet thickness) + double ec = 1; // Charge normalization + double me = 1; // Mass normalization + double c = 1; // Speed of light + double eps0 = 1; // Permittivity of space + + // Physics parameters + double mi_me = input_mass_ratio; // Ion mass / electron mass + double rhoi_L = 1; // Ion thermal gyroradius / Sheet thickness + double Ti_Te = 1; // Ion temperature / electron temperature + double wpe_wce = 3; // Electron plasma freq / electron cycltron freq + double theta = 0; // Orientation of the simulation wrt current sheet + double taui = 100; // Simulation wci's to run + + // Numerical parameters + double Lx = 16*L; // How big should the box be in the x direction + double Ly = 16*L; // How big should the box be in the y direction + double Lz = 16*L; // How big should the box be in the z direction + double nx = 64; // Global resolution in the x direction + double ny = 64; // Global resolution in the y 
direction + double nz = 1; // Global resolution in the z direction + double nppc = 64; // Average number of macro particles per cell (both species combined!) + double cfl_req = 0.99; // How close to Courant should we try to run + double wpedt_max = 0.36; // How big a timestep is allowed if Courant is not too restrictive + double damp = 0.001; // Level of radiation damping + + // Derived quantities + double mi = me*mi_me; // Ion mass + double kTe = me*c*c/(2*wpe_wce*wpe_wce*(1+Ti_Te)); // Electron temperature + double kTi = kTe*Ti_Te; // Ion temperature + double vthe = sqrt(2*kTe/me); // Electron thermal velocity (B.D. convention) + double vthi = sqrt(2*kTi/mi); // Ion thermal velocity (B.D. convention) + double wci = vthi/(rhoi_L*L); // Ion cyclotron frequency + double wce = wci*mi_me; // Electron cyclotron frequency + double wpe = wce*wpe_wce; // Electron plasma frequency + double wpi = wpe/sqrt(mi_me); // Ion plasma frequency + double vdre = c*c*wce/(wpe*wpe*L*(1+Ti_Te)); // Electron drift velocity + double vdri = -Ti_Te*vdre; // Ion drift velocity + double b0 = me*wce/ec; // Asymptotic magnetic field strength + double n0 = me*eps0*wpe*wpe/(ec*ec); // Peak electron density (also peak ion density) + double Npe = 2*n0*Ly*Lz*L*tanh(0.5*Lx/L); // Number of physical electrons in box + double Npi = Npe; // Number of physical ions in box + double Ne = 0.5*nppc*nx*ny*nz; // Total macro electrons in box + Ne = trunc_granular(Ne,nproc()); // Make it divisible by number of processors + double Ni = Ne; // Total macro ions in box + double we = Npe/Ne; // Weight of a macro electron + double wi = Npi/Ni; // Weight of a macro ion + double gdri = 1/sqrt(1-vdri*vdri/(c*c)); // gamma of ion drift frame + double gdre = 1/sqrt(1-vdre*vdre/(c*c)); // gamma of electron drift frame + double udri = vdri*gdri; // 4-velocity of ion drift frame + double udre = vdre*gdre; // 4-velocity of electron drift frame + double uthi = sqrt(kTi/mi)/c; // Normalized ion thermal velocity (K.B. 
convention) + double uthe = sqrt(kTe/me)/c; // Normalized electron thermal velocity (K.B. convention) + double cs = cos(theta); + double sn = sin(theta); + + // Determine the timestep + double dg = courant_length(Lx,Ly,Lz,nx,ny,nz); // Courant length + double dt = cfl_req*dg/c; // Courant limited time step + if( wpe*dt>wpedt_max ) dt=wpedt_max/wpe; // Override time step if plasma frequency limited + + //////////////////////////////////////// + // Setup high level simulation parmeters + + num_step = int(0.2*taui/(wci*dt)); + status_interval = int(1./(wci*dt)); + sync_shared_interval = status_interval; + clean_div_e_interval = status_interval; + clean_div_b_interval = status_interval; + + global->energies_interval = status_interval; + global->fields_interval = status_interval; + global->ehydro_interval = status_interval; + global->ihydro_interval = status_interval; + global->eparticle_interval = status_interval; + global->iparticle_interval = status_interval; + global->restart_interval = status_interval; + + /////////////////////////// + // Setup the space and time + + // Setup basic grid parameters + define_units( c, eps0 ); + define_timestep( dt ); + + // Parition a periodic box among the processors sliced uniformly along y + define_periodic_grid( -0.5*Lx, 0, 0, // Low corner + 0.5*Lx, Ly, Lz, // High corner + nx, ny, nz, // Resolution + 1, nproc(), 1 ); // Topology + + // Override some of the boundary conditions to put a particle reflecting + // perfect electrical conductor on the -x and +x boundaries + set_domain_field_bc( BOUNDARY(-1,0,0), pec_fields ); + set_domain_field_bc( BOUNDARY( 1,0,0), pec_fields ); + set_domain_particle_bc( BOUNDARY(-1,0,0), reflect_particles ); + set_domain_particle_bc( BOUNDARY( 1,0,0), reflect_particles ); + + define_material( "vacuum", 1 ); + // Note: define_material defaults to isotropic materials with mu=1,sigma=0 + // Tensor electronic, magnetic and conductive materials are supported + // though. 
See "shapes" for how to define them and assign them to regions. + // Also, space is initially filled with the first material defined. + + // If you pass NULL to define field array, the standard field array will + // be used (if damp is not provided, no radiation damping will be used). + define_field_array( NULL, damp ); + + //////////////////// + // Setup the species + + // Allow 50% more local_particles in case of non-uniformity + // VPIC will pick the number of movers to use for each species + // Both species use out-of-place sorting + species_t * ion = define_species( "ion", ec, mi, 1.5*Ni/nproc(), -1, 40, 1 ); + species_t * electron = define_species( "electron", -ec, me, 1.5*Ne/nproc(), -1, 20, 1 ); + + /////////////////////////////////////////////////// + // Log diagnostic information about this simulation + + sim_log( "" ); + sim_log( "System of units" ); + sim_log( "L = " << L ); + sim_log( "ec = " << ec ); + sim_log( "me = " << me ); + sim_log( "c = " << c ); + sim_log( "eps0 = " << eps0 ); + sim_log( "" ); + sim_log( "Physics parameters" ); + sim_log( "rhoi/L = " << rhoi_L ); + sim_log( "Ti/Te = " << Ti_Te ); + sim_log( "wpe/wce = " << wpe_wce ); + sim_log( "mi/me = " << mi_me ); + sim_log( "theta = " << theta ); + sim_log( "taui = " << taui ); + sim_log( "" ); + sim_log( "Numerical parameters" ); + sim_log( "num_step = " << num_step ); + sim_log( "dt = " << dt ); + sim_log( "Lx = " << Lx << ", Lx/L = " << Lx/L ); + sim_log( "Ly = " << Ly << ", Ly/L = " << Ly/L ); + sim_log( "Lz = " << Lz << ", Lz/L = " << Lz/L ); + sim_log( "nx = " << nx << ", dx = " << Lx/nx << ", L/dx = " << L*nx/Lx ); + sim_log( "ny = " << ny << ", dy = " << Ly/ny << ", L/dy = " << L*ny/Ly ); + sim_log( "nz = " << nz << ", dz = " << Lz/nz << ", L/dz = " << L*nz/Lz ); + sim_log( "nppc = " << nppc ); + sim_log( "courant = " << c*dt/dg ); + sim_log( "damp = " << damp ); + sim_log( "" ); + sim_log( "Ion parameters" ); + sim_log( "qpi = " << ec << ", mi = " << mi << ", qpi/mi = " << ec/mi 
); + sim_log( "vthi = " << vthi << ", vthi/c = " << vthi/c << ", kTi = " << kTi ); + sim_log( "vdri = " << vdri << ", vdri/c = " << vdri/c ); + sim_log( "wpi = " << wpi << ", wpi dt = " << wpi*dt << ", n0 = " << n0 ); + sim_log( "wci = " << wci << ", wci dt = " << wci*dt ); + sim_log( "rhoi = " << vthi/wci << ", L/rhoi = " << L/(vthi/wci) << ", dx/rhoi = " << (Lx/nx)/(vthi/wci) ); + sim_log( "debyei = " << vthi/wpi << ", L/debyei = " << L/(vthi/wpi) << ", dx/debyei = " << (Lx/nx)/(vthi/wpi) ); + sim_log( "Npi = " << Npi << ", Ni = " << Ni << ", Npi/Ni = " << Npi/Ni << ", wi = " << wi ); + sim_log( "" ); + sim_log( "Electron parameters" ); + sim_log( "qpe = " << -ec << ", me = " << me << ", qpe/me = " << -ec/me ); + sim_log( "vthe = " << vthe << ", vthe/c = " << vthe/c << ", kTe = " << kTe ); + sim_log( "vdre = " << vdre << ", vdre/c = " << vdre/c ); + sim_log( "wpe = " << wpe << ", wpe dt = " << wpe*dt << ", n0 = " << n0 ); + sim_log( "wce = " << wce << ", wce dt = " << wce*dt ); + sim_log( "rhoe = " << vthe/wce << ", L/rhoe = " << L/(vthe/wce) << ", dx/rhoe = " << (Lx/nx)/(vthe/wce) ); + sim_log( "debyee = " << vthe/wpe << ", L/debyee = " << L/(vthe/wpe) << ", dx/debyee = " << (Lx/nx)/(vthe/wpe) ); + sim_log( "Npe = " << Npe << ", Ne = " << Ne << ", Npe/Ne = " << Npe/Ne << ", we = " << we ); + sim_log( "" ); + sim_log( "Miscellaneous" ); + sim_log( "nptotal = " << Ni + Ne ); + sim_log( "nproc = " << nproc() ); + sim_log( "" ); + + //////////////////////////// + // Load fields and particles + + sim_log( "Loading fields" ); + + set_region_field( everywhere, 0, 0, 0, // Electric field + 0, -sn*b0*tanh(x/L), cs*b0*tanh(x/L) ); // Magnetic field + // Note: everywhere is a region that encompasses the entire simulation + // In general, regions are specied as logical equations (i.e. 
x>0 && x+y<2) + + sim_log( "Loading particles" ); + + double ymin = rank()*Ly/nproc(), ymax = (rank()+1)*Ly/nproc(); + + repeat( Ni/nproc() ) { + double x, y, z, ux, uy, uz, d0; + + // Pick an appropriately distributed random location for the pair + do { + x = L*atanh( uniform( rng(0), -1, 1 ) ); + } while( x<=-0.5*Lx || x>=0.5*Lx ); + y = uniform( rng(0), ymin, ymax ); + z = uniform( rng(0), 0, Lz ); + + // For the ion, pick an isothermal normalized momentum in the drift frame + // (this is a proper thermal equilibrium in the non-relativistic limit), + // boost it from the drift frame to the frame with the magnetic field + // along z and then rotate it into the lab frame. Then load the particle. + // Repeat the process for the electron. + + ux = normal( rng(0), 0, uthi ); + uy = normal( rng(0), 0, uthi ); + uz = normal( rng(0), 0, uthi ); + d0 = gdri*uy + sqrt(ux*ux+uy*uy+uz*uz+1)*udri; + uy = d0*cs - uz*sn; + uz = d0*sn + uz*cs; + inject_particle( ion, x, y, z, ux, uy, uz, wi, 0, 0 ); + + ux = normal( rng(0), 0, uthe ); + uy = normal( rng(0), 0, uthe ); + uz = normal( rng(0), 0, uthe ); + d0 = gdre*uy + sqrt(ux*ux+uy*uy+uz*uz+1)*udre; + uy = d0*cs - uz*sn; + uz = d0*sn + uz*cs; + inject_particle( electron, x, y, z, ux, uy, uz, we, 0, 0 ); + } + + // Upon completion of the initialization, the following occurs: + // - The synchronization error (tang E, norm B) is computed between domains + // and tang E / norm B are synchronized by averaging where discrepancies + // are encountered. + // - The initial divergence error of the magnetic field is computed and + // one pass of cleaning is done (for good measure) + // - The bound charge density necessary to give the simulation an initially + // clean divergence e is computed. 
+ // - The particle momentum is uncentered from u_0 to u_{-1/2} + // - The user diagnostics are called on the initial state + // - The physics loop is started + // + // The physics loop consists of: + // - Advance particles from x_0,u_{-1/2} to x_1,u_{1/2} + // - User particle injection at x_{1-age}, u_{1/2} (use inject_particles) + // - User current injection (adjust field(x,y,z).jfx, jfy, jfz) + // - Advance B from B_0 to B_{1/2} + // - Advance E from E_0 to E_1 + // - User field injection to E_1 (adjust field(x,y,z).ex,ey,ez,cbx,cby,cbz) + // - Advance B from B_{1/2} to B_1 + // - (periodically) Divergence clean electric field + // - (periodically) Divergence clean magnetic field + // - (periodically) Synchronize shared tang e and norm b + // - Increment the time step + // - Call user diagnostics + // - (periodically) Print a status message +} + +begin_diagnostics { + +# define should_dump(x) (global->x##_interval>0 && remainder(step(),global->x##_interval)==0) + + if( step()==-10 ) { + // A grid dump contains all grid parameters, field boundary conditions, + // particle boundary conditions and domain connectivity information. This + // is stored in a binary format. Each rank makes a grid dump + dump_grid("grid"); + + // A materials dump contains all the materials parameters. This is in a + // text format. Only rank 0 makes the materials dump + dump_materials("materials"); + + // A species dump contains the physics parameters of a species. This is in + // a text format. Only rank 0 makes the species dump + dump_species("species"); + } + + // Energy dumps store all the energies in various directions of E and B + // and the total kinetic (not including rest mass) energies of each species + // species in a simple text format. By default, the energies are appended to + // the file. However, if a "0" is added to the dump_energies call, a new + // energies dump file will be created. 
The energies are in the units of the + // problem and are all time centered appropriately. Note: When restarting a + // simulation from a restart dump made at a prior time step to the last + // energies dump, the energies file will have a "hiccup" of intervening + // time levels. This "hiccup" will not occur if the simulation is aborted + // immediately following a restart dump. Energies dumps are in a text + // format and the layout is documented at the top of the file. Only rank 0 + // makes makes an energies dump. + if( should_dump(energies) ) dump_energies( "energies", step()==0 ? 0 : 1 ); + + // Field dumps store the raw electromagnetic fields, sources and material + // placement and a number of auxilliary fields. E, B and RHOB are + // timecentered, JF and TCA are half a step old. Material fields are static + // and the remaining fields (DIV E ERR, DIV B ERR and RHOF) are for + // debugging purposes. By default, field dump filenames are tagged with + // step(). However, if a "0" is added to the call, the filename will not be + // tagged. The JF that gets stored is accumulated with a charge-conserving + // algorithm. As a result, JF is not valid until at least one timestep has + // been completed. Field dumps are in a binary format. Each rank makes a + // field dump. + if( step()==-10 ) dump_fields_hdf5("fields"); // Get first valid total J + if( should_dump(fields) ) dump_fields_hdf5("fields"); + + // Hydro dumps store particle charge density, current density and + // stress-energy tensor. All these quantities are known at the time + // t = time(). All these quantities are accumulated trilinear + // node-centered. By default, species dump filenames are tagged with + // step(). However, if a "0" is added to the call, the filename will not + // be tagged. Note that the current density accumulated by this routine is + // purely diagnostic. It is not used by the simulation and it is not + // accumulated using a self-consistent charge-conserving method. 
Hydro dumps + // are in a binary format. Each rank makes a hydro dump. + if( should_dump(ehydro) ) dump_hydro_hdf5("electron","ehydro"); + if( should_dump(ihydro) ) dump_hydro_hdf5("ion", "ihydro"); + + // Particle dumps store the particle data for a given species. The data + // written is known at the time t = time(). By default, particle dumps + // are tagged with step(). However, if a "0" is added to the call, the + // filename will not be tagged. Particle dumps are in a binary format. + // Each rank makes a particle dump. + if( should_dump(eparticle) ) dump_particles_hdf5("electron","eparticle"); + if( should_dump(iparticle) ) dump_particles_hdf5("ion", "iparticle"); + + // A checkpt is made by calling checkpt( fbase, tag ) where fname is a string + // and tag is an integer. A typical usage is: + // checkpt( "checkpt", step() ). + // This will cause each process to write their simulation state to a file + // whose name is based on fbase, tag and the node's rank. For the above + // usage, if called on step 314 on a 4 process run, the four files: + // checkpt.314.0, checkpt.314.1, checkpt.314.2, checkpt.314.3 + // to be written. The simulation can then be restarted from this point by + // invoking the application with "--restore checkpt.314". checkpt must be + // the _VERY_ LAST_ diagnostic called. If not, diagnostics performed after + // the checkpt but before the next timestep will be missed on restore. + // Restart dumps are in a binary format unique to the each simulation. + + if( should_dump(restart) ) checkpt( "checkpt", step() ); + + // If you want to write a checkpt after a certain amount of simulation time, + // use uptime() in conjunction with checkpt. For example, this will cause + // the simulation state to be written after 7.5 hours of running to the + // same file every time (useful for dealing with quotas on big machines). 
+ //if( uptime()>=27000 ) { + // checkpt( "timeout", 0 ); + // abort(0); + //} + +# undef should_dump + +} + +begin_particle_injection { + + // No particle injection for this simulation + +} + +begin_current_injection { + + // No current injection for this simulation + +} + +begin_field_injection { + + // No field injection for this simulation + +} + +begin_particle_collisions{ + + // No collisions for this simulation + +} diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 62505147..1639a044 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -15,6 +15,11 @@ #include "dumpmacros.h" #include "../util/io/FileUtils.h" +#ifdef VPIC_ENABLE_HDF5 +#include "hdf5.h" // from the lib +#include "hdf5_header_info.h" // from vpic +#endif + /* -1 means no ranks talk */ #define VERBOSE_rank -1 @@ -256,6 +261,897 @@ vpic_simulation::dump_hydro( const char *sp_name, if( fileIO.close() ) ERROR(( "File close failed on dump hydro!!!" )); } +#ifdef VPIC_ENABLE_HDF5 +#define DUMP_DIR_FORMAT "./%s" + +/* define to do C-style indexing */ +#define hydro(x, y, z) hydro_array->h[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] + +// TODO: make function? 
+#define invert_field_xml_item(xml_file_name, speciesname_p, time_step, dims_4d, dims_3d, add_footer_flag) \ + { \ + FILE *fp; \ + fp = fopen(xml_file_name, "a"); \ + fprintf(fp, main_body_head, time_step); \ + if (field_dump_flag.enabledE()) \ + write_main_body_attribute(fp, main_body_attributeV, "E", dims_4d, dims_3d, speciesname_p, time_step, "ex", "ey", "ez"); \ + if (field_dump_flag.div_e_err) \ + fprintf(fp, main_body_attributeS, "div_e_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_e_err"); \ + if (field_dump_flag.enabledCB()) \ + write_main_body_attribute(fp, main_body_attributeV, "B", dims_4d, dims_3d, speciesname_p, time_step, "cbx", "cby", "cbz"); \ + if (field_dump_flag.div_b_err) \ + fprintf(fp, main_body_attributeS, "div_b_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_b_err"); \ + if (field_dump_flag.enabledTCA()) \ + write_main_body_attribute(fp, main_body_attributeV, "TCA", dims_4d, dims_3d, speciesname_p, time_step, "tcax", "tcay", "tcaz"); \ + if (field_dump_flag.rhob) \ + fprintf(fp, main_body_attributeS, "rhob", dims_3d, time_step, speciesname_p, time_step, time_step, "rhob"); \ + if (field_dump_flag.enabledJF()) \ + write_main_body_attribute(fp, main_body_attributeV, "JF", dims_4d, dims_3d, speciesname_p, time_step, "jfx", "jfy", "jfz"); \ + if (field_dump_flag.rhof) \ + fprintf(fp, main_body_attributeS, "rhof", dims_3d, time_step, speciesname_p, time_step, time_step, "rhof"); \ + if (field_dump_flag.enabledEMAT()) \ + write_main_body_attribute(fp, main_body_attributeV, "EMAT", dims_4d, dims_3d, speciesname_p, time_step, "ematx", "ematy", "ematz"); \ + if (field_dump_flag.nmat) \ + fprintf(fp, main_body_attributeS, "nmat", dims_3d, time_step, speciesname_p, time_step, time_step, "nmat"); \ + if (field_dump_flag.enabledFMAT()) \ + write_main_body_attribute(fp, main_body_attributeV, "FMAT", dims_4d, dims_3d, speciesname_p, time_step, "fmatx", "fmaty", "fmatz"); \ + if (field_dump_flag.cmat) \ + 
fprintf(fp, main_body_attributeS, "cmat", dims_3d, time_step, speciesname_p, time_step, time_step, "cmat"); \ + fprintf(fp, "%s", main_body_foot); \ + if (add_footer_flag) \ + fputs(footer, fp); \ + fclose(fp); \ + } +void +vpic_simulation::dump_fields_hdf5( const char *fbase, int ftag ) +{ + size_t step_for_viou = step(); + + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + +#ifdef DUMP_INFO_DEBUG + printf("MPI rank = %d, size = %d \n", mpi_rank, mpi_size); + printf("base dir for field: %s \n", global->fdParams.baseDir); + printf("stride x y z = (%ld, %ld, %ld)\n", global->fdParams.stride_x, global->fdParams.stride_y, global->fdParams.stride_z); + printf("grid x, y z = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); + printf("domain loc (x0, y0, z0) -> (x1, y1, z1) = (%f, %f, %f) -> (%f, %f, %f) \n", grid->x0, grid->y0, grid->z0, grid->x1, grid->y1, grid->z1); + printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, global->topology_y, global->topology_z); + printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid->sz, grid->nv); +#endif + +#define DUMP_FIELD_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ + { \ + dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ + temp_buf_index = 0; \ + for (size_t i(1); i < grid->nx + 1; i++) \ + { \ + for (size_t j(1); j < grid->ny + 1; j++) \ + { \ + for (size_t k(1); k < grid->nz + 1; k++) \ + { \ + temp_buf[temp_buf_index] = FIELD_ARRAY_NAME->fpp(i, j, k).ATTRIBUTE_NAME; \ + temp_buf_index = temp_buf_index + 1; \ + } \ + } \ + } \ + dataspace_id = H5Dget_space(dset_id); \ + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ + H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ + H5Sclose(dataspace_id); \ + H5Dclose(dset_id); \ + } + + char fname[256]; + char field_scratch[128]; + char 
subfield_scratch[128]; + + sprintf(field_scratch, DUMP_DIR_FORMAT, "field_hdf5"); + dump_mkdir(field_scratch); + sprintf(subfield_scratch, "%s/T.%lld/", field_scratch, step_for_viou); + dump_mkdir(subfield_scratch); + + sprintf(fname, "%s/%s_%lld.h5", subfield_scratch, "fields", step_for_viou); + double el1 = uptime(); + hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); + H5Pclose(plist_id); + + sprintf(fname, "Timestep_%lld", step_for_viou); + hid_t group_id = H5Gcreate(file_id, fname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + + el1 = uptime() - el1; + //sim_log("TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts + double el2 = uptime(); + + /* +// Create a variable list of field values to output. +size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); +size_t * varlist = new size_t[numvars]; + +for(size_t i(0), c(0); ifdParams.output_vars.bitset(i)) varlist[c++] = i; + +printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ + +#define fpp(x, y, z) f[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] + /* + typedef struct field { + float ex, ey, ez, div_e_err; // Electric field and div E error + float cbx, cby, cbz, div_b_err; // Magnetic field and div B error + float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density + float jfx, jfy, jfz, rhof; // Free current and charge density + material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes + material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers + } field_t;*/ + // Local voxel mesh resolution. Voxels are + // indexed FORTRAN style 0:nx+1,0:ny+1,0:nz+1 + // with voxels 1:nx,1:ny,1:nz being non-ghost + // voxels. 
+ + float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); + hsize_t temp_buf_index; + hid_t dset_id; + //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; + plist_id = H5Pcreate(H5P_DATASET_XFER); + //Comment out for test only + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); + //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); + + //global->topology_x + + hsize_t field_global_size[3], field_local_size[3], global_offset[3], global_count[3]; + field_global_size[0] = (grid->nx * grid->gpx); + field_global_size[1] = (grid->ny * grid->gpy); + field_global_size[2] = (grid->nz * grid->gpz); + + field_local_size[0] = grid->nx; + field_local_size[1] = grid->ny; + field_local_size[2] = grid->nz; + + // TODO: delete this +#define RANK_TO_INDEX2(rank, ix, iy, iz) \ + BEGIN_PRIMITIVE \ + { \ + int _ix, _iy, _iz; \ + _ix = (rank); /* ix = ix+gpx*( iy+gpy*iz ) */ \ + _iy = _ix / int(grid->gpx); /* iy = iy+gpy*iz */ \ + _ix -= _iy * int(grid->gpx); /* ix = ix */ \ + _iz = _iy / int(grid->gpy); /* iz = iz */ \ + _iy -= _iz * int(grid->gpy); /* iy = iy */ \ + (ix) = _ix; \ + (iy) = _iy; \ + (iz) = _iz; \ + } \ + END_PRIMITIVE + + int mpi_rank_x, mpi_rank_y, mpi_rank_z; + RANK_TO_INDEX2(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + + printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + + global_offset[0] = (grid->nx) * mpi_rank_x; + global_offset[1] = (grid->ny) * mpi_rank_y; + global_offset[2] = (grid->nz) * mpi_rank_z; + + global_count[0] = (grid->nx); + global_count[1] = (grid->ny); + global_count[2] = (grid->nz); + +#ifdef DUMP_INFO_DEBUG + printf("global size = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", field_global_size[0], field_global_size[1], field_global_size[2]); + printf("global_offset = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", 
global_offset[0], global_offset[1], global_offset[2]); + printf("global_count = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", global_count[0], global_count[1], global_count[2]); + printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + fflush(stdout); +#endif + + hid_t filespace = H5Screate_simple(3, field_global_size, NULL); + hid_t memspace = H5Screate_simple(3, field_local_size, NULL); + hid_t dataspace_id; + + /* + typedef struct field { + float ex, ey, ez, div_e_err; // Electric field and div E error + float cbx, cby, cbz, div_b_err; // Magnetic field and div B error + float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density + float jfx, jfy, jfz, rhof; // Free current and charge density + material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes + material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers + } field_t;*/ + + if (field_dump_flag.ex) + DUMP_FIELD_TO_HDF5("ex", ex, H5T_NATIVE_FLOAT); + if (field_dump_flag.ey) + DUMP_FIELD_TO_HDF5("ey", ey, H5T_NATIVE_FLOAT); + if (field_dump_flag.ez) + DUMP_FIELD_TO_HDF5("ez", ez, H5T_NATIVE_FLOAT); + if (field_dump_flag.div_e_err) + DUMP_FIELD_TO_HDF5("div_e_err", div_e_err, H5T_NATIVE_FLOAT); + + if (field_dump_flag.cbx) + DUMP_FIELD_TO_HDF5("cbx", cbx, H5T_NATIVE_FLOAT); + if (field_dump_flag.cby) + DUMP_FIELD_TO_HDF5("cby", cby, H5T_NATIVE_FLOAT); + if (field_dump_flag.cbz) + DUMP_FIELD_TO_HDF5("cbz", cbz, H5T_NATIVE_FLOAT); + if (field_dump_flag.div_b_err) + DUMP_FIELD_TO_HDF5("div_b_err", div_b_err, H5T_NATIVE_FLOAT); + + if (field_dump_flag.tcax) + DUMP_FIELD_TO_HDF5("tcax", tcax, H5T_NATIVE_FLOAT); + if (field_dump_flag.tcay) + DUMP_FIELD_TO_HDF5("tcay", tcay, H5T_NATIVE_FLOAT); + if (field_dump_flag.tcaz) + DUMP_FIELD_TO_HDF5("tcaz", tcaz, H5T_NATIVE_FLOAT); + if (field_dump_flag.rhob) + DUMP_FIELD_TO_HDF5("rhob", rhob, H5T_NATIVE_FLOAT); + + if (field_dump_flag.jfx) + DUMP_FIELD_TO_HDF5("jfx", jfx, 
H5T_NATIVE_FLOAT); + if (field_dump_flag.jfy) + DUMP_FIELD_TO_HDF5("jfy", jfy, H5T_NATIVE_FLOAT); + if (field_dump_flag.jfz) + DUMP_FIELD_TO_HDF5("jfz", jfz, H5T_NATIVE_FLOAT); + if (field_dump_flag.rhof) + DUMP_FIELD_TO_HDF5("rhof", rhof, H5T_NATIVE_FLOAT); + + //H5T_NATIVE_SHORT for material_id (typedef int16_t material_id) + if (field_dump_flag.ematx) + DUMP_FIELD_TO_HDF5("ematx", ematx, H5T_NATIVE_SHORT); + if (field_dump_flag.ematy) + DUMP_FIELD_TO_HDF5("ematy", ematy, H5T_NATIVE_SHORT); + if (field_dump_flag.ematz) + DUMP_FIELD_TO_HDF5("ematz", ematz, H5T_NATIVE_SHORT); + if (field_dump_flag.nmat) + DUMP_FIELD_TO_HDF5("nmat", nmat, H5T_NATIVE_SHORT); + + if (field_dump_flag.fmatx) + DUMP_FIELD_TO_HDF5("fmatx", fmatx, H5T_NATIVE_SHORT); + if (field_dump_flag.fmaty) + DUMP_FIELD_TO_HDF5("fmaty", fmaty, H5T_NATIVE_SHORT); + if (field_dump_flag.fmatz) + DUMP_FIELD_TO_HDF5("fmatz", fmatz, H5T_NATIVE_SHORT); + if (field_dump_flag.cmat) + DUMP_FIELD_TO_HDF5("cmat", cmat, H5T_NATIVE_SHORT); + + el2 = uptime() - el2; + //sim_log("TimeHDF5Write: " << el2 << " s"); + + double el3 = uptime(); + + //Write metadata (geo original and geo dx/dy/dz) for ArrayUDF + float attr_data[2][3]; + attr_data[0][0] = grid->x0; + attr_data[0][1] = grid->y0; + attr_data[0][2] = grid->z0; + attr_data[1][0] = grid->dx; + attr_data[1][1] = grid->dy; + attr_data[1][2] = grid->dz; + hsize_t dims[2]; + dims[0] = 2; + dims[1] = 3; + hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); + hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); + H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); + H5Sclose(va_geo_dataspace_id); + H5Aclose(va_geo_attribute_id); + + free(temp_buf); + H5Sclose(filespace); + H5Sclose(memspace); + H5Pclose(plist_id); + H5Gclose(group_id); + H5Fclose(file_id); + + el3 = uptime() - el3; + //sim_log("TimeHDF5Close: " << el3 << " s"); + + if (mpi_rank == 0) + { + char const 
*output_xml_file = "./field_hdf5/hdf5_field.xdmf"; + char dimensions_3d[128]; + sprintf(dimensions_3d, "%lld %lld %lld", field_global_size[0], field_global_size[1], field_global_size[2]); + char dimensions_4d[128]; + sprintf(dimensions_4d, "%lld %lld %lld %d", field_global_size[0], field_global_size[1], field_global_size[2], 3); + char orignal[128]; + sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); + char dxdydz[128]; + sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); + + //int fields_interval = global->fields_interval; + // TODO: make sure field interval is set + int nframes = num_step / field_interval + 1; + static int field_tframe = 0; + +#ifdef DUMP_INFO_DEBUG + printf(" meta file : %s \n", output_xml_file); + printf(" array dims per var: %s \n", dimensions_3d); + printf("array dims all vars: %s \n", dimensions_4d); + printf(" orignal: %s \n", orignal); + printf(" dxdydz: %s \n", dxdydz); + printf(" nframes: %d \n", nframes); + printf(" field_interval: %d \n", field_interval); + printf(" current step: %lld \n", step_for_viou); + printf(" current step: %lld \n", step_for_viou); + + //printf(" Simulation time: %f \n", grid->t0); + printf(" tframe: %d \n", field_tframe); +#endif + + if (field_tframe >= 1) + { + if (field_tframe == (nframes - 1)) + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 0); + } + } + else + { + create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, field_interval); + if (field_tframe == (nframes - 1)) + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 0); + } + } + field_tframe++; + } +} +void vpic_simulation::dump_hydro_hdf5( const char *speciesname, + const char 
*fbase, + int ftag ) +{ + size_t step_for_viou = step(); + +#define DUMP_HYDRO_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ + { \ + dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ + temp_buf_index = 0; \ + for (size_t i(1); i < grid->nx + 1; i++) \ + { \ + for (size_t j(1); j < grid->ny + 1; j++) \ + { \ + for (size_t k(1); k < grid->nz + 1; k++) \ + { \ + temp_buf[temp_buf_index] = hydro(i, j, k).ATTRIBUTE_NAME; \ + temp_buf_index = temp_buf_index + 1; \ + } \ + } \ + } \ + dataspace_id = H5Dget_space(dset_id); \ + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ + H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ + H5Sclose(dataspace_id); \ + H5Dclose(dset_id); \ + } + //#define DUMP_INFO_DEBUG 1 + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + species_t *sp = find_species_name(speciesname, species_list); + if (!sp) + ERROR(("Invalid species name: %s", speciesname)); + +#ifdef ENABLE_V407_SCIDAC + clear_hydro( hydro, grid ); + accumulate_hydro_p( hydro, sp->p, sp->np, sp->q_m, interpolator, grid ); + synchronize_hydro( hydro, grid ); +#else + clear_hydro_array(hydro_array); + accumulate_hydro_p(hydro_array, sp, interpolator_array); + synchronize_hydro_array(hydro_array); +#endif + /*#ifdef DUMP_INFO_DEBUG +printf("MPI rank = %d, size = %d \n", mpi_rank, mpi_size); +printf("base dir for field: %s \n", global->fdParams.baseDir); +printf("stride x y z = (%ld, %ld, %ld)\n", global->fdParams.stride_x, global->fdParams.stride_y, global->fdParams.stride_z); +printf("grid x, y z = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); +printf("domain loc (x0, y0, z0) -> (x1, y1, z1) = (%f, %f, %f) -> (%f, %f, %f) \n", grid->x0, grid->y0, grid->z0, grid->x1, grid->y1, grid->z1); +printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, global->topology_y, 
global->topology_z); +printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid->sz, grid->nv); +#endif*/ + + char hname[256]; + char hydro_scratch[128]; + char subhydro_scratch[128]; + + sprintf(hydro_scratch, "./%s", "hydro_hdf5"); + dump_mkdir(hydro_scratch); + sprintf(subhydro_scratch, "%s/T.%lld/", hydro_scratch, step_for_viou); + dump_mkdir(subhydro_scratch); + + sprintf(hname, "%s/hydro_%s_%lld.h5", subhydro_scratch, speciesname, step_for_viou); + double el1 = uptime(); + hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + hid_t file_id = H5Fcreate(hname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); + H5Pclose(plist_id); + + sprintf(hname, "Timestep_%lld", step_for_viou); + hid_t group_id = H5Gcreate(file_id, hname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + + el1 = uptime() - el1; + //sim_log("TimeHDF5Open: " << el1 << " s"); //Easy to handle results for scripts + double el2 = uptime(); + + // Create a variable list of field values to output. + //size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); + //size_t *varlist = new size_t[numvars]; + + //for (size_t i(0), c(0); i < total_field_variables; i++) + // if (global->fdParams.output_vars.bitset(i)) + // varlist[c++] = i; + + //printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars); + + + //typedef struct hydro { + // float jx, jy, jz, rho; // Current and charge density => , + // float px, py, pz, ke; // Momentum and K.E. 
density => , + // float txx, tyy, tzz; // Stress diagonal => , i==j + // float tyz, tzx, txy; // Stress off-diagonal => , i!=j + // float _pad[2]; // 16-byte align + //} hydro_t; + + //typedef struct hydro_array { + // hydro_t * ALIGNED(128) h; + // grid_t * g; + //} hydro_array_t; + + float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); + hsize_t temp_buf_index; + hid_t dset_id; + //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; + plist_id = H5Pcreate(H5P_DATASET_XFER); + //Comment out for test only + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); + //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); + + //global->topology_x + + hsize_t hydro_global_size[3], hydro_local_size[3], global_offset[3], global_count[3]; + hydro_global_size[0] = (grid->nx * grid->gpx); + hydro_global_size[1] = (grid->ny * grid->gpy); + hydro_global_size[2] = (grid->nz * grid->gpz); + + hydro_local_size[0] = grid->nx; + hydro_local_size[1] = grid->ny; + hydro_local_size[2] = grid->nz; + +#define RANK_TO_INDEX2(rank, ix, iy, iz) \ + BEGIN_PRIMITIVE \ + { \ + int _ix, _iy, _iz; \ + _ix = (rank); /* ix = ix+gpx*( iy+gpy*iz ) */ \ + _iy = _ix / int(grid->gpx); /* iy = iy+gpy*iz */ \ + _ix -= _iy * int(grid->gpx); /* ix = ix */ \ + _iz = _iy / int(grid->gpy); /* iz = iz */ \ + _iy -= _iz * int(grid->gpy); /* iy = iy */ \ + (ix) = _ix; \ + (iy) = _iy; \ + (iz) = _iz; \ + } \ + END_PRIMITIVE + + int mpi_rank_x, mpi_rank_y, mpi_rank_z; + RANK_TO_INDEX2(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + + global_offset[0] = (grid->nx) * mpi_rank_x; + global_offset[1] = (grid->ny) * mpi_rank_y; + global_offset[2] = (grid->nz) * mpi_rank_z; + + global_count[0] = (grid->nx); + global_count[1] = (grid->ny); + global_count[2] = (grid->nz); + +#ifdef DUMP_INFO_DEBUG + printf("global size = " HSIZE_T ", " HSIZE_T ", " 
HSIZE_T "\n", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); + printf("global_offset = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", global_offset[0], global_offset[1], global_offset[2]); + printf("global_count = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", global_count[0], global_count[1], global_count[2]); + printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + fflush(stdout); +#endif + + hid_t filespace = H5Screate_simple(3, hydro_global_size, NULL); + hid_t memspace = H5Screate_simple(3, hydro_local_size, NULL); + hid_t dataspace_id; + + //typedef struct hydro { + // float jx, jy, jz, rho; // Current and charge density => , + // float px, py, pz, ke; // Momentum and K.E. density => , + // float txx, tyy, tzz; // Stress diagonal => , i==j + // float tyz, tzx, txy; // Stress off-diagonal => , i!=j + // float _pad[2]; // 16-byte align + //} hydro_t; + + if (hydro_dump_flag.jx) + DUMP_HYDRO_TO_HDF5("jx", jx, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.jy) + DUMP_HYDRO_TO_HDF5("jy", jy, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.jz) + DUMP_HYDRO_TO_HDF5("jz", jz, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.rho) + DUMP_HYDRO_TO_HDF5("rho", rho, H5T_NATIVE_FLOAT); + + if (hydro_dump_flag.px) + DUMP_HYDRO_TO_HDF5("px", px, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.py) + DUMP_HYDRO_TO_HDF5("py", py, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.pz) + DUMP_HYDRO_TO_HDF5("pz", pz, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.ke) + DUMP_HYDRO_TO_HDF5("ke", ke, H5T_NATIVE_FLOAT); + + if (hydro_dump_flag.txx) + DUMP_HYDRO_TO_HDF5("txx", txx, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.tyy) + DUMP_HYDRO_TO_HDF5("tyy", tyy, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.tzz) + DUMP_HYDRO_TO_HDF5("tzz", tzz, H5T_NATIVE_FLOAT); + + if (hydro_dump_flag.tyz) + DUMP_HYDRO_TO_HDF5("tyz", tyz, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.tzx) + DUMP_HYDRO_TO_HDF5("tzx", tzx, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.txy) + DUMP_HYDRO_TO_HDF5("txy", txy, 
H5T_NATIVE_FLOAT); + + el2 = uptime() - el2; + //sim_log("TimeHDF5Write: " << el2 << " s"); + + double el3 = uptime(); + + //Write metadata (geo original and geo dx/dy/dz) for ArrayUDF + float attr_data[2][3]; + attr_data[0][0] = grid->x0; + attr_data[0][1] = grid->y0; + attr_data[0][2] = grid->z0; + attr_data[1][0] = grid->dx; + attr_data[1][1] = grid->dy; + attr_data[1][2] = grid->dz; + hsize_t dims[2]; + dims[0] = 2; + dims[1] = 3; + hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); + hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); + H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); + H5Sclose(va_geo_dataspace_id); + H5Aclose(va_geo_attribute_id); + + free(temp_buf); + H5Sclose(filespace); + H5Sclose(memspace); + H5Pclose(plist_id); + H5Gclose(group_id); + H5Fclose(file_id); + + el3 = uptime() - el3; + //sim_log("TimeHDF5Close: " << el3 << " s"); + + if (mpi_rank == 0) + { + char output_xml_file[128]; + sprintf(output_xml_file, "./%s/%s%s%s", "hydro_hdf5", "hydro-", speciesname, ".xdmf"); + char dimensions_3d[128]; + sprintf(dimensions_3d, "%lld %lld %lld", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); + char dimensions_4d[128]; + sprintf(dimensions_4d, "%lld %lld %lld %d", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2], 3); + char orignal[128]; + sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); + char dxdydz[128]; + sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); + + int nframes = num_step / field_interval + 1; + int fields_interval = field_interval; + static int tframe = 0; + +#ifdef DUMP_INFO_DEBUG + printf(" meta file : %s \n", output_xml_file); + printf(" array dims per var: %s \n", dimensions_3d); + printf("array dims all vars: %s \n", dimensions_4d); + printf(" orignal: %s \n", orignal); + printf(" dxdydz: %s \n", dxdydz); + printf(" nframes: %d \n", nframes); + printf(" fields_interval: %d 
\n", fields_interval); + printf(" current step: %lld \n", step_for_viou); + printf(" Simulation time: %f \n", grid->t0); + printf(" tframe: %d \n", tframe); +#endif + + char speciesname_new[128]; + sprintf(speciesname_new, "hydro_%s", speciesname); + if (tframe >= 1) + { + if (tframe == (nframes - 1)) + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); + } + } + else + { + create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, fields_interval); + if (tframe == (nframes - 1)) + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); + } + } + tframe++; + } +} + +// TODO": make the sp_name and speciesname varailbe naming consistent +void +vpic_simulation::dump_particles_hdf5( const char *sp_name, + const char *fbase, + int ftag ) +{ + size_t step_for_viou = step(); + char fname[256]; + char group_name[256]; + char particle_scratch[128]; + char subparticle_scratch[128]; + + int np_local; + species_t *sp; + + float *Pf; + int *Pi; + + // get the total number of particles. 
in this example, output only electrons + sp = species_list; + sprintf(particle_scratch, DUMP_DIR_FORMAT, "particle_hdf5"); + dump_mkdir(particle_scratch); + sprintf(subparticle_scratch, "%s/T.%ld/", particle_scratch, step_for_viou); + dump_mkdir(subparticle_scratch); + + // TODO: Allow the user to set this + + int stride_particle_dump = 1; + while (sp) + { + np_local = (sp->np + stride_particle_dump - 1) / stride_particle_dump; + + // make a copy of the part of particle data to be dumped + double ec1 = uptime(); + + int sp_np = sp->np; + int sp_max_np = sp->max_np; + particle_t *ALIGNED(128) p_buf = NULL; + if (!p_buf) + MALLOC_ALIGNED(p_buf, np_local, 128); + particle_t *sp_p = sp->p; + sp->p = p_buf; + sp->np = np_local; + sp->max_np = np_local; + + for (long long iptl = 0, i = 0; iptl < sp_np; iptl += stride_particle_dump, ++i) + { + COPY(&sp->p[i], &sp_p[iptl], 1); + } + #ifdef ENABLE_V407_SCIDAC + # define PBUF_SIZE 32768 // 1MB of particles + for( int buf_start=0; buf_start np_local ) n_buf = np_local - buf_start; + COPY( p_buf, &sp->p[buf_start], n_buf ); + center_p( p_buf, n_buf, sp->q_m, interpolator, grid ); + } + #else + center_p(sp, interpolator_array); + #endif + ec1 = uptime() - ec1; + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + //std::cout << "on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local << std::endl; + //sim_log("on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local); + + Pf = (float *)sp->p; + Pi = (int *)sp->p; + + // open HDF5 file in "particle/T./" subdirectory + // filename: eparticle.h5p + sprintf(fname, "%s/%s_%ld.h5", subparticle_scratch, sp->name, step_for_viou); + sprintf(group_name, "/Timestep_%ld", step_for_viou); + double el1 = uptime(); + + hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, 
H5P_DEFAULT, plist_id); + hid_t group_id = H5Gcreate(file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + + H5Pclose(plist_id); + + long long total_particles, offset; + long long numparticles = np_local; + MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + offset -= numparticles; + + hid_t filespace = H5Screate_simple(1, (hsize_t *)&total_particles, NULL); + + hsize_t memspace_count_temp = numparticles * 8; + hid_t memspace = H5Screate_simple(1, &memspace_count_temp, NULL); + plist_id = H5Pcreate(H5P_DATASET_XFER); + + //Comment out for test only + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); + H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *)&offset, NULL, (hsize_t *)&numparticles, NULL); + + hsize_t memspace_start = 0, memspace_stride = 8, memspace_count = np_local; + H5Sselect_hyperslab(memspace, H5S_SELECT_SET, &memspace_start, &memspace_stride, &memspace_count, NULL); + + el1 = uptime() - el1; + //sim_log("Particle TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts + + double el2 = uptime(); + + hid_t dset_id = H5Dcreate(group_id, "dX", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + int ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf); + H5Dclose(dset_id); + //if (rank == 0) printf ("Written variable dX \n"); + + dset_id = H5Dcreate(group_id, "dY", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 1); + H5Dclose(dset_id); + //if (rank == 0) printf ("Written variable dY \n"); + + dset_id = H5Dcreate(group_id, "dZ", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 2); + H5Dclose(dset_id); + //if (rank == 0) printf ("Written variable dZ \n"); + + 
dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, memspace, filespace, plist_id, Pi + 3); + H5Dclose(dset_id); + //if (rank == 0) printf ("Written variable i \n"); + + dset_id = H5Dcreate(group_id, "Ux", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 4); + H5Dclose(dset_id); + //if (rank == 0) printf ("Written variable Ux \n"); + + dset_id = H5Dcreate(group_id, "Uy", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 5); + H5Dclose(dset_id); + //if (rank == 0) printf ("Written variable Uy \n"); + + dset_id = H5Dcreate(group_id, "Uz", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 6); + H5Dclose(dset_id); + //if (rank == 0) printf ("Written variable Uz \n"); + + dset_id = H5Dcreate(group_id, "q", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 7); + H5Dclose(dset_id); + //if (rank == 0) printf ("Written variable q \n"); + + el2 = uptime() - el2; + //sim_log("Particle TimeHDF5Write: " << el2 << " s"); + + double el3 = uptime(); + H5Sclose(memspace); + H5Sclose(filespace); + H5Pclose(plist_id); + H5Gclose(group_id); + H5Fclose(file_id); + el3 = uptime() - el3; + //sim_log("Particle TimeHDF5Close: " << el3 << " s"); + + sp->p = sp_p; + sp->np = sp_np; + sp->max_np = sp_max_np; + FREE_ALIGNED(p_buf); + + // Write metadata if step() == 0 + char meta_fname[256]; + + sprintf(meta_fname, "%s/grid_metadata_%s_%ld.h5", subparticle_scratch, sp->name, step_for_viou); + + double meta_el1 = uptime(); + + hid_t meta_plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(meta_plist_id, 
MPI_COMM_WORLD, MPI_INFO_NULL); + hid_t meta_file_id = H5Fcreate(meta_fname, H5F_ACC_TRUNC, H5P_DEFAULT, meta_plist_id); + hid_t meta_group_id = H5Gcreate(meta_file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + H5Pclose(meta_plist_id); + + long long meta_total_particles, meta_offset; + long long meta_numparticles = 1; + MPI_Allreduce(&meta_numparticles, &meta_total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + MPI_Scan(&meta_numparticles, &meta_offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + meta_offset -= meta_numparticles; + + hid_t meta_filespace = H5Screate_simple(1, (hsize_t *)&meta_total_particles, NULL); + hid_t meta_memspace = H5Screate_simple(1, (hsize_t *)&meta_numparticles, NULL); + meta_plist_id = H5Pcreate(H5P_DATASET_XFER); + H5Pset_dxpl_mpio(meta_plist_id, H5FD_MPIO_COLLECTIVE); + H5Sselect_hyperslab(meta_filespace, H5S_SELECT_SET, (hsize_t *)&meta_offset, NULL, (hsize_t *)&meta_numparticles, NULL); + meta_el1 = uptime() - meta_el1; + //sim_log("Metafile TimeHDF5Open): " << meta_el1 << " s"); //Easy to handle results for scripts + + double meta_el2 = uptime(); + + hid_t meta_dset_id = H5Dcreate(meta_group_id, "np_local", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, (int32_t *)&np_local); + H5Dclose(meta_dset_id); + //if (rank == 0) printf ("Written variable dX \n"); + + meta_dset_id = H5Dcreate(meta_group_id, "nx", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->nx); + H5Dclose(meta_dset_id); + //if (rank == 0) printf ("Written variable dY \n"); + + meta_dset_id = H5Dcreate(meta_group_id, "ny", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->ny); + 
H5Dclose(meta_dset_id); + //if (rank == 0) printf ("Written variable dZ \n"); + + meta_dset_id = H5Dcreate(meta_group_id, "nz", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->nz); + H5Dclose(meta_dset_id); + //if (rank == 0) printf ("Written variable i \n"); + + meta_dset_id = H5Dcreate(meta_group_id, "x0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->x0); + H5Dclose(meta_dset_id); + + meta_dset_id = H5Dcreate(meta_group_id, "y0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->y0); + H5Dclose(meta_dset_id); + + meta_dset_id = H5Dcreate(meta_group_id, "z0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->z0); + H5Dclose(meta_dset_id); + + meta_dset_id = H5Dcreate(meta_group_id, "dx", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dx); + H5Dclose(meta_dset_id); + + meta_dset_id = H5Dcreate(meta_group_id, "dy", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dy); + H5Dclose(meta_dset_id); + + meta_dset_id = H5Dcreate(meta_group_id, "dz", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dz); + H5Dclose(meta_dset_id); + + meta_el2 = uptime() - meta_el2; + //sim_log("Metafile 
TimeHDF5Write: " << meta_el2 << " s"); + double meta_el3 = uptime(); + H5Sclose(meta_memspace); + H5Sclose(meta_filespace); + H5Pclose(meta_plist_id); + H5Gclose(meta_group_id); + H5Fclose(meta_file_id); + meta_el3 = uptime() - meta_el3; + //sim_log("Metafile TimeHDF5Close: " << meta_el3 << " s"); + + sp = sp->next; + } +} +#endif + void vpic_simulation::dump_particles( const char *sp_name, const char *fbase, @@ -699,9 +1595,6 @@ vpic_simulation::hydro_dump( const char * speciesname, int dim[3]; - /* define to do C-style indexing */ -# define hydro(x,y,z) hydro_array->h[VOXEL(x,y,z, grid->nx,grid->ny,grid->nz)] - /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ nxout = (grid->nx)/istride; nyout = (grid->ny)/jstride; diff --git a/src/vpic/hdf5_header_info.h b/src/vpic/hdf5_header_info.h new file mode 100644 index 00000000..4f1ee934 --- /dev/null +++ b/src/vpic/hdf5_header_info.h @@ -0,0 +1,259 @@ +#ifndef VPIC_HDF5_HEAD_INFO +#define VPIC_HDF5_HEAD_INFO + +#define FIELD_ARRAY_NAME field_array +struct field_dump_flag_t +{ + bool ex = true, ey = true, ez = true, div_e_err = true; + bool cbx = true, cby = true, cbz = true, div_b_err = true; + bool tcax = true, tcay = true, tcaz = true, rhob = true; + bool jfx = true, jfy = true, jfz = true, rhof = true; + bool ematx = true, ematy = true, ematz = true, nmat = true; + bool fmatx = true, fmaty = true, fmatz = true, cmat = true; + void disableE() + { + ex = false, ey = false, ez = false, div_e_err = false; + } + + void disableCB() + { + cbx = false, cby = false, cbz = false, div_b_err = false; + } + + void disableTCA() + { + tcax = false, tcay = false, tcaz = false, rhob = false; + } + + void disableJF() + { + jfx = false, jfy = false, jfz = false, rhof = false; + } + + void disableEMAT() + { + ematx = false, ematy = false, ematz = false, nmat = false; + } + + void disableFMAT() + { + fmatx = false, fmaty = false, fmatz = false, cmat = false; + } + + void resetToDefaults() + { + ex = true, ey = true, ez = true, 
div_e_err = true; + cbx = true, cby = true, cbz = true, div_b_err = true; + tcax = true, tcay = true, tcaz = true, rhob = true; + jfx = true, jfy = true, jfz = true, rhof = true; + ematx = true, ematy = true, ematz = true, nmat = true; + fmatx = true, fmaty = true, fmatz = true, cmat = true; + } + + bool enabledE() + { + return ex && ey && ez; + } + + bool enabledCB() + { + return cbx && cby && cbz; + } + + bool enabledTCA() + { + return tcax && tcay && tcaz; + } + + bool enabledJF() + { + return jfx && jfy && jfz; + } + + bool enabledEMAT() + { + return ematx && ematy && ematz; + } + + bool enabledFMAT() + { + return fmatx && fmaty && fmatz; + } +}; + +struct hydro_dump_flag_t +{ + bool jx = true, jy = true, jz = true, rho = true; + bool px = true, py = true, pz = true, ke = true; + bool txx = true, tyy = true, tzz = true; + bool tyz = true, tzx = true, txy = true; + + void disableJ() + { + jx = false, jy = false, jz = false, rho = false; + } + + void disableP() + { + px = false, py = false, pz = false, ke = false; + } + + void disableTD() //Stress diagonal + { + txx = false, tyy = false, tzz = false; + } + + void disableTOD() //Stress off-diagonal + { + tyz = false, tzx = false, txy = false; + } + void resetToDefaults() + { + jx = true, jy = true, jz = true, rho = true; + px = true, py = true, pz = true, ke = true; + txx = true, tyy = true, tzz = true; + tyz = true, tzx = true, txy = true; + } + + bool enabledJ() + { + return jx && jy && jz; + } + + bool enabledP() + { + return px && py && pz; + } + + bool enabledTD() + { + return txx && tyy && tzz; + } + + bool enabledTOD() + { + return tyz && tzx && txy; + } +}; + +// Declare vars to use +hydro_dump_flag_t hydro_dump_flag; +field_dump_flag_t field_dump_flag; + +// XML header stuff +const char *header = "\n\n\n\t\n"; +const char *header_topology = "\t\t\n"; +const char *header_geom = "\t\t\n"; +const char *header_origin = "\t\t\t \n\t\t\t%s\n"; +const char *header_dxdydz = "\t\t\t \n\t\t\t%s\n"; +const char 
*footer_geom = "\t\t\n"; +const char *grid_line = "\t\t \n \ +\t\t\t\n"; +const char *footer = "\t\t\n\t\n\n"; + +const char *main_body_head = "\t\t\t \n \ +\t\t\t\t \n \ +\t\t\t\t \n"; +const char *main_body_foot = "\t\t\t\n"; + +const char *main_body_attributeV = "\ + \t\t\t\t \n \ + \t\t\t\t\t \n \ + \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ + \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ + \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ + \t\t\t\t\t \n \ + \t\t\t\t \n "; + +const char *main_body_attributeS = "\ + \t\t\t\t \n \ + \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ + \t\t\t\t \n "; + +#define create_file_with_header(xml_file_name, dimensions, orignal, dxdydz, nframes, fields_interval) \ + { \ + FILE *fp; \ + fp = fopen(xml_file_name, "w"); \ + fputs(header, fp); \ + fprintf(fp, header_topology, dimensions); \ + fputs(header_geom, fp); \ + fprintf(fp, header_origin, orignal); \ + fprintf(fp, header_dxdydz, dxdydz); \ + fputs(footer_geom, fp); \ + fprintf(fp, grid_line, nframes); \ + int i; \ + for (i = 0; i < nframes; i++) \ + fprintf(fp, "%d ", i*fields_interval); \ + fputs(grid_line_footer, fp); \ + fclose(fp); \ + } +#define write_main_body_attribute(fpp, main_body_attribute_p, attribute_name, dims_4d_p, dims_3d_p, file_name_pre_p, time_step_p, a1, a2, a3) \ + { \ + fprintf(fpp, main_body_attribute_p, attribute_name, dims_4d_p, \ + dims_3d_p, time_step_p, file_name_pre_p, time_step_p, time_step_p, a1, \ + dims_3d_p, time_step_p, file_name_pre_p, time_step_p, time_step_p, a2, \ + dims_3d_p, time_step_p, file_name_pre_p, time_step_p, time_step_p, a3); \ + } + +#define invert_field_xml_item(xml_file_name, speciesname_p, time_step, dims_4d, dims_3d, add_footer_flag) \ + { \ + FILE *fp; \ + fp = fopen(xml_file_name, "a"); \ + fprintf(fp, main_body_head, time_step); \ + if (field_dump_flag.enabledE()) \ + write_main_body_attribute(fp, main_body_attributeV, "E", dims_4d, dims_3d, speciesname_p, time_step, "ex", "ey", "ez"); \ + if 
(field_dump_flag.div_e_err) \ + fprintf(fp, main_body_attributeS, "div_e_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_e_err"); \ + if (field_dump_flag.enabledCB()) \ + write_main_body_attribute(fp, main_body_attributeV, "B", dims_4d, dims_3d, speciesname_p, time_step, "cbx", "cby", "cbz"); \ + if (field_dump_flag.div_b_err) \ + fprintf(fp, main_body_attributeS, "div_b_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_b_err"); \ + if (field_dump_flag.enabledTCA()) \ + write_main_body_attribute(fp, main_body_attributeV, "TCA", dims_4d, dims_3d, speciesname_p, time_step, "tcax", "tcay", "tcaz"); \ + if (field_dump_flag.rhob) \ + fprintf(fp, main_body_attributeS, "rhob", dims_3d, time_step, speciesname_p, time_step, time_step, "rhob"); \ + if (field_dump_flag.enabledJF()) \ + write_main_body_attribute(fp, main_body_attributeV, "JF", dims_4d, dims_3d, speciesname_p, time_step, "jfx", "jfy", "jfz"); \ + if (field_dump_flag.rhof) \ + fprintf(fp, main_body_attributeS, "rhof", dims_3d, time_step, speciesname_p, time_step, time_step, "rhof"); \ + if (field_dump_flag.enabledEMAT()) \ + write_main_body_attribute(fp, main_body_attributeV, "EMAT", dims_4d, dims_3d, speciesname_p, time_step, "ematx", "ematy", "ematz"); \ + if (field_dump_flag.nmat) \ + fprintf(fp, main_body_attributeS, "nmat", dims_3d, time_step, speciesname_p, time_step, time_step, "nmat"); \ + if (field_dump_flag.enabledFMAT()) \ + write_main_body_attribute(fp, main_body_attributeV, "FMAT", dims_4d, dims_3d, speciesname_p, time_step, "fmatx", "fmaty", "fmatz"); \ + if (field_dump_flag.cmat) \ + fprintf(fp, main_body_attributeS, "cmat", dims_3d, time_step, speciesname_p, time_step, time_step, "cmat"); \ + fprintf(fp, "%s", main_body_foot); \ + if (add_footer_flag) \ + fputs(footer, fp); \ + fclose(fp); \ + } +#define invert_hydro_xml_item(xml_file_name, speciesname_p, time_step, dims_4d, dims_3d, add_footer_flag) \ + { \ + FILE *fp; \ + fp = fopen(xml_file_name, "a"); \ + 
fprintf(fp, main_body_head, time_step); \ + if (hydro_dump_flag.enabledJ()) \ + write_main_body_attribute(fp, main_body_attributeV, "J", dims_4d, dims_3d, speciesname_p, time_step, "jx", "jy", "jz"); \ + if (hydro_dump_flag.rho) \ + fprintf(fp, main_body_attributeS, "rho", dims_3d, time_step, speciesname_p, time_step, time_step, "rho"); \ + if (hydro_dump_flag.enabledP()) \ + write_main_body_attribute(fp, main_body_attributeV, "P", dims_4d, dims_3d, speciesname_p, time_step, "px", "py", "pz"); \ + if (hydro_dump_flag.ke) \ + fprintf(fp, main_body_attributeS, "ke", dims_3d, time_step, speciesname_p, time_step, time_step, "ke"); \ + if (hydro_dump_flag.enabledTD()) \ + write_main_body_attribute(fp, main_body_attributeV, "TD", dims_4d, dims_3d, speciesname_p, time_step, "txx", "tyy", "tzz"); \ + if (hydro_dump_flag.enabledTOD()) \ + write_main_body_attribute(fp, main_body_attributeV, "TOD", dims_4d, dims_3d, speciesname_p, time_step, "tyz", "tzx", "txy"); \ + fprintf(fp, "%s", main_body_foot); \ + if (add_footer_flag) \ + fputs(footer, fp); \ + fclose(fp); \ + } + + +#endif // VPIC_HDF5_HEAD_INFO diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index f7518836..80c2aaca 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -233,11 +233,19 @@ class vpic_simulation { // Binary dumps void dump_grid( const char *fbase ); + void dump_fields( const char *fbase, int fname_tag = 1 ); + void dump_fields_hdf5( const char *fbase, int fname_tag = 1 ); + void dump_hydro( const char *sp_name, const char *fbase, int fname_tag = 1 ); + void dump_hydro_hdf5( const char *sp_name, const char *fbase, + int fname_tag = 1 ); + void dump_particles( const char *sp_name, const char *fbase, int fname_tag = 1 ); + void dump_particles_hdf5( const char *sp_name, const char *fbase, + int fname_tag = 1 ); // convenience functions for simlog output void create_field_list(char * strlist, DumpParameters & dumpParams); From d1ba88c976c90fc279d54f27f8c673e8dd5f7021 Mon Sep 17 00:00:00 2001 From: Robert 
Bird Date: Tue, 13 Aug 2019 14:01:34 -0600 Subject: [PATCH 53/95] fixed small bug in global topoly setting and add field_interval value to test deck --- sample/harrisHDF5 | 1 + src/grid/partition.cc | 4 +-- src/vpic/dump.cc | 69 ++++++++++++++++++++++--------------------- src/vpic/vpic.h | 2 +- 4 files changed, 39 insertions(+), 37 deletions(-) diff --git a/sample/harrisHDF5 b/sample/harrisHDF5 index ff565f33..2b43e3d2 100644 --- a/sample/harrisHDF5 +++ b/sample/harrisHDF5 @@ -130,6 +130,7 @@ begin_initialization { num_step = int(0.2*taui/(wci*dt)); status_interval = int(1./(wci*dt)); + field_interval = 1; sync_shared_interval = status_interval; clean_div_e_interval = status_interval; clean_div_b_interval = status_interval; diff --git a/src/grid/partition.cc b/src/grid/partition.cc index fc554c2d..ff9b09f4 100644 --- a/src/grid/partition.cc +++ b/src/grid/partition.cc @@ -57,8 +57,8 @@ partition_periodic_box( grid_t * g, // Capture global processor decomposition g->gpx = gpx; - g->gpx = gpy; - g->gpx = gpz; + g->gpy = gpy; + g->gpz = gpz; g->dx = (gx1-gx0)/(double)gnx; g->dy = (gy1-gy0)/(double)gny; diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 1639a044..cbdb1289 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -264,6 +264,22 @@ vpic_simulation::dump_hydro( const char *sp_name, #ifdef VPIC_ENABLE_HDF5 #define DUMP_DIR_FORMAT "./%s" +// TODO: rename or remove this +#define RANK_TO_INDEX2(rank, ix, iy, iz) \ + BEGIN_PRIMITIVE \ + { \ + int _ix, _iy, _iz; \ + _ix = (rank); /* ix = ix+gpx*( iy+gpy*iz ) */ \ + _iy = _ix / grid->gpx; /* iy = iy+gpy*iz */ \ + _ix -= _iy * grid->gpx; /* ix = ix */ \ + _iz = _iy / grid->gpy; /* iz = iz */ \ + _iy -= _iz * grid->gpy; /* iy = iy */ \ + (ix) = _ix; \ + (iy) = _iy; \ + (iz) = _iz; \ + } \ + END_PRIMITIVE + /* define to do C-style indexing */ #define hydro(x, y, z) hydro_array->h[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] @@ -412,26 +428,26 @@ printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ 
field_local_size[1] = grid->ny; field_local_size[2] = grid->nz; - // TODO: delete this -#define RANK_TO_INDEX2(rank, ix, iy, iz) \ - BEGIN_PRIMITIVE \ - { \ - int _ix, _iy, _iz; \ - _ix = (rank); /* ix = ix+gpx*( iy+gpy*iz ) */ \ - _iy = _ix / int(grid->gpx); /* iy = iy+gpy*iz */ \ - _ix -= _iy * int(grid->gpx); /* ix = ix */ \ - _iz = _iy / int(grid->gpy); /* iz = iz */ \ - _iy -= _iz * int(grid->gpy); /* iy = iy */ \ - (ix) = _ix; \ - (iy) = _iy; \ - (iz) = _iz; \ - } \ - END_PRIMITIVE + int gpx = grid->gpx; + int gpy = grid->gpy; + int gpz = grid->gpz; int mpi_rank_x, mpi_rank_y, mpi_rank_z; - RANK_TO_INDEX2(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); - - printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + //RANK_TO_INDEX2(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + + int _ix, _iy, _iz; + _ix = (mpi_rank); + _iy = _ix / grid->gpx; + _ix -= _iy * grid->gpx; + _iz = _iy / grid->gpy; + _iy -= _iz * grid->gpy; + int ix = _ix; + int iy = _iy; + int iz = _iz; + + mpi_rank_x = ix; + mpi_rank_y = iy; + mpi_rank_z = iz; global_offset[0] = (grid->nx) * mpi_rank_x; global_offset[1] = (grid->ny) * mpi_rank_y; @@ -730,21 +746,6 @@ printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid- hydro_local_size[1] = grid->ny; hydro_local_size[2] = grid->nz; -#define RANK_TO_INDEX2(rank, ix, iy, iz) \ - BEGIN_PRIMITIVE \ - { \ - int _ix, _iy, _iz; \ - _ix = (rank); /* ix = ix+gpx*( iy+gpy*iz ) */ \ - _iy = _ix / int(grid->gpx); /* iy = iy+gpy*iz */ \ - _ix -= _iy * int(grid->gpx); /* ix = ix */ \ - _iz = _iy / int(grid->gpy); /* iz = iz */ \ - _iy -= _iz * int(grid->gpy); /* iy = iy */ \ - (ix) = _ix; \ - (iy) = _iy; \ - (iz) = _iz; \ - } \ - END_PRIMITIVE - int mpi_rank_x, mpi_rank_y, mpi_rank_z; RANK_TO_INDEX2(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); @@ -948,7 +949,7 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, { COPY(&sp->p[i], &sp_p[iptl], 1); } - #ifdef ENABLE_V407_SCIDAC + 
#ifdef ENABLE_V407_SCIDAC # define PBUF_SIZE 32768 // 1MB of particles for( int buf_start=0; buf_start Date: Tue, 13 Aug 2019 14:37:05 -0600 Subject: [PATCH 54/95] updated field interval to be the correct value --- sample/harrisHDF5 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sample/harrisHDF5 b/sample/harrisHDF5 index 2b43e3d2..e8bf115d 100644 --- a/sample/harrisHDF5 +++ b/sample/harrisHDF5 @@ -130,7 +130,7 @@ begin_initialization { num_step = int(0.2*taui/(wci*dt)); status_interval = int(1./(wci*dt)); - field_interval = 1; + field_interval = status_interval; sync_shared_interval = status_interval; clean_div_e_interval = status_interval; clean_div_b_interval = status_interval; From 7a10247e5617000c5249341c3b46d4bded434cfc Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Thu, 10 Oct 2019 09:56:23 -0600 Subject: [PATCH 55/95] clean up hdf5 build system. require parallel, and includes and guard example deck against no hdf5 --- CMakeLists.txt | 4 ++++ sample/harrisHDF5 | 6 ++++++ src/vpic/dump.cc | 2 ++ 3 files changed, 12 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index f7fd9d84..46b15741 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -333,9 +333,13 @@ endif() if(USE_HDF5) # Enable HDF5, and the relevant defines find_package(HDF5 REQUIRED) + if (NOT HDF5_IS_PARALLEL) + message(FATAL_ERROR "HDF5 Parallel support is required: ${HDF5_IS_PARALLEL}") + endif() add_definitions(-DVPIC_ENABLE_HDF5) string(REPLACE ";" " " string_libraries "${HDF5_C_LIBRARIES}") set(VPIC_CXX_LIBRARIES "${VPIC_CXX_LIBRARIES} ${string_libraries}") + include_directories(${HDF5_INCLUDE_DIRS}) endif(USE_HDF5) # Configure local script to generate bin/vpic diff --git a/sample/harrisHDF5 b/sample/harrisHDF5 index e8bf115d..6beedeed 100644 --- a/sample/harrisHDF5 +++ b/sample/harrisHDF5 @@ -25,6 +25,12 @@ // to zero before the user's initialization block is executed. Up to 16K // of global variables can be defined.
+ +// Deck only works if VPIC was build with HDF support. Check for that: +#ifndef VPIC_ENABLE_HDF5 +#error "VPIC_ENABLE_HDF5" is required +#endif + begin_globals { double energies_interval; double fields_interval; diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index cbdb1289..837ddda3 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -15,6 +15,8 @@ #include "dumpmacros.h" #include "../util/io/FileUtils.h" +#include + #ifdef VPIC_ENABLE_HDF5 #include "hdf5.h" // from the lib #include "hdf5_header_info.h" // from vpic From 5f8211b0c7c699ffda7ea63af7c89a0c433895fa Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Thu, 10 Oct 2019 10:22:34 -0600 Subject: [PATCH 56/95] allow user to change dumping flags in the deck --- sample/harrisHDF5 | 5 ++ src/vpic/dump.cc | 2 - src/vpic/hdf5_header_info.h | 137 -------------------------------- src/vpic/vpic.h | 150 +++++++++++++++++++++++++++++++++++- 4 files changed, 152 insertions(+), 142 deletions(-) diff --git a/sample/harrisHDF5 b/sample/harrisHDF5 index 6beedeed..0c084b07 100644 --- a/sample/harrisHDF5 +++ b/sample/harrisHDF5 @@ -47,6 +47,11 @@ begin_initialization { // Then the initial non-zero fields need to be loaded at time level 0 and the // particles (position and momentum both) need to be loaded at time level 0. 
+ + // Example of how to call / set dumping + field_dump_flag.disableEMAT(); + + double input_mass_ratio; int input_seed; diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 837ddda3..cbdb1289 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -15,8 +15,6 @@ #include "dumpmacros.h" #include "../util/io/FileUtils.h" -#include - #ifdef VPIC_ENABLE_HDF5 #include "hdf5.h" // from the lib #include "hdf5_header_info.h" // from vpic diff --git a/src/vpic/hdf5_header_info.h b/src/vpic/hdf5_header_info.h index 4f1ee934..baed8f7d 100644 --- a/src/vpic/hdf5_header_info.h +++ b/src/vpic/hdf5_header_info.h @@ -2,143 +2,6 @@ #define VPIC_HDF5_HEAD_INFO #define FIELD_ARRAY_NAME field_array -struct field_dump_flag_t -{ - bool ex = true, ey = true, ez = true, div_e_err = true; - bool cbx = true, cby = true, cbz = true, div_b_err = true; - bool tcax = true, tcay = true, tcaz = true, rhob = true; - bool jfx = true, jfy = true, jfz = true, rhof = true; - bool ematx = true, ematy = true, ematz = true, nmat = true; - bool fmatx = true, fmaty = true, fmatz = true, cmat = true; - void disableE() - { - ex = false, ey = false, ez = false, div_e_err = false; - } - - void disableCB() - { - cbx = false, cby = false, cbz = false, div_b_err = false; - } - - void disableTCA() - { - tcax = false, tcay = false, tcaz = false, rhob = false; - } - - void disableJF() - { - jfx = false, jfy = false, jfz = false, rhof = false; - } - - void disableEMAT() - { - ematx = false, ematy = false, ematz = false, nmat = false; - } - - void disableFMAT() - { - fmatx = false, fmaty = false, fmatz = false, cmat = false; - } - - void resetToDefaults() - { - ex = true, ey = true, ez = true, div_e_err = true; - cbx = true, cby = true, cbz = true, div_b_err = true; - tcax = true, tcay = true, tcaz = true, rhob = true; - jfx = true, jfy = true, jfz = true, rhof = true; - ematx = true, ematy = true, ematz = true, nmat = true; - fmatx = true, fmaty = true, fmatz = true, cmat = true; - } - - bool enabledE() - { - 
return ex && ey && ez; - } - - bool enabledCB() - { - return cbx && cby && cbz; - } - - bool enabledTCA() - { - return tcax && tcay && tcaz; - } - - bool enabledJF() - { - return jfx && jfy && jfz; - } - - bool enabledEMAT() - { - return ematx && ematy && ematz; - } - - bool enabledFMAT() - { - return fmatx && fmaty && fmatz; - } -}; - -struct hydro_dump_flag_t -{ - bool jx = true, jy = true, jz = true, rho = true; - bool px = true, py = true, pz = true, ke = true; - bool txx = true, tyy = true, tzz = true; - bool tyz = true, tzx = true, txy = true; - - void disableJ() - { - jx = false, jy = false, jz = false, rho = false; - } - - void disableP() - { - px = false, py = false, pz = false, ke = false; - } - - void disableTD() //Stress diagonal - { - txx = false, tyy = false, tzz = false; - } - - void disableTOD() //Stress off-diagonal - { - tyz = false, tzx = false, txy = false; - } - void resetToDefaults() - { - jx = true, jy = true, jz = true, rho = true; - px = true, py = true, pz = true, ke = true; - txx = true, tyy = true, tzz = true; - tyz = true, tzx = true, txy = true; - } - - bool enabledJ() - { - return jx && jy && jz; - } - - bool enabledP() - { - return px && py && pz; - } - - bool enabledTD() - { - return txx && tyy && tzz; - } - - bool enabledTOD() - { - return tyz && tzx && txy; - } -}; - -// Declare vars to use -hydro_dump_flag_t hydro_dump_flag; -field_dump_flag_t field_dump_flag; // XML header stuff const char *header = "\n\n\n\t\n"; diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index bed8ab97..e90808bb 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -34,6 +34,144 @@ #endif // #include "dumpvars.h" + +// TODO: move these to a better header? 
+#ifdef VPIC_ENABLE_HDF5 +struct field_dump_flag_t +{ + bool ex = true, ey = true, ez = true, div_e_err = true; + bool cbx = true, cby = true, cbz = true, div_b_err = true; + bool tcax = true, tcay = true, tcaz = true, rhob = true; + bool jfx = true, jfy = true, jfz = true, rhof = true; + bool ematx = true, ematy = true, ematz = true, nmat = true; + bool fmatx = true, fmaty = true, fmatz = true, cmat = true; + void disableE() + { + ex = false, ey = false, ez = false, div_e_err = false; + } + + void disableCB() + { + cbx = false, cby = false, cbz = false, div_b_err = false; + } + + void disableTCA() + { + tcax = false, tcay = false, tcaz = false, rhob = false; + } + + void disableJF() + { + jfx = false, jfy = false, jfz = false, rhof = false; + } + + void disableEMAT() + { + ematx = false, ematy = false, ematz = false, nmat = false; + } + + void disableFMAT() + { + fmatx = false, fmaty = false, fmatz = false, cmat = false; + } + + void resetToDefaults() + { + ex = true, ey = true, ez = true, div_e_err = true; + cbx = true, cby = true, cbz = true, div_b_err = true; + tcax = true, tcay = true, tcaz = true, rhob = true; + jfx = true, jfy = true, jfz = true, rhof = true; + ematx = true, ematy = true, ematz = true, nmat = true; + fmatx = true, fmaty = true, fmatz = true, cmat = true; + } + + bool enabledE() + { + return ex && ey && ez; + } + + bool enabledCB() + { + return cbx && cby && cbz; + } + + bool enabledTCA() + { + return tcax && tcay && tcaz; + } + + bool enabledJF() + { + return jfx && jfy && jfz; + } + + bool enabledEMAT() + { + return ematx && ematy && ematz; + } + + bool enabledFMAT() + { + return fmatx && fmaty && fmatz; + } +}; + +struct hydro_dump_flag_t +{ + bool jx = true, jy = true, jz = true, rho = true; + bool px = true, py = true, pz = true, ke = true; + bool txx = true, tyy = true, tzz = true; + bool tyz = true, tzx = true, txy = true; + + void disableJ() + { + jx = false, jy = false, jz = false, rho = false; + } + + void disableP() + { + px = 
false, py = false, pz = false, ke = false; + } + + void disableTD() //Stress diagonal + { + txx = false, tyy = false, tzz = false; + } + + void disableTOD() //Stress off-diagonal + { + tyz = false, tzx = false, txy = false; + } + void resetToDefaults() + { + jx = true, jy = true, jz = true, rho = true; + px = true, py = true, pz = true, ke = true; + txx = true, tyy = true, tzz = true; + tyz = true, tzx = true, txy = true; + } + + bool enabledJ() + { + return jx && jy && jz; + } + + bool enabledP() + { + return px && py && pz; + } + + bool enabledTD() + { + return txx && tyy && tzz; + } + + bool enabledTOD() + { + return tyz && tzx && txy; + } +}; +#endif + typedef FileIO FILETYPE; const uint32_t all (0xffffffff); @@ -235,17 +373,23 @@ class vpic_simulation { void dump_grid( const char *fbase ); void dump_fields( const char *fbase, int fname_tag = 1 ); - void dump_fields_hdf5( const char *fbase, int fname_tag = 1 ); void dump_hydro( const char *sp_name, const char *fbase, int fname_tag = 1 ); - void dump_hydro_hdf5( const char *sp_name, const char *fbase, - int fname_tag = 1 ); void dump_particles( const char *sp_name, const char *fbase, int fname_tag = 1 ); +#ifdef VPIC_ENABLE_HDF5 void dump_particles_hdf5( const char *sp_name, const char *fbase, int fname_tag = 1 ); + void dump_hydro_hdf5( const char *sp_name, const char *fbase, + int fname_tag = 1 ); + void dump_fields_hdf5( const char *fbase, int fname_tag = 1 ); + + // Declare vars to use + hydro_dump_flag_t hydro_dump_flag; + field_dump_flag_t field_dump_flag; +#endif // convenience functions for simlog output void create_field_list(char * strlist, DumpParameters & dumpParams); From 15d8b89a5f8058d79d5a8a716f7fb8815abe30b1 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Mon, 14 Oct 2019 17:40:05 -0600 Subject: [PATCH 57/95] default init hdf5 dumping structs --- src/vpic/vpic.cc | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/vpic/vpic.cc b/src/vpic/vpic.cc index 
4f150afc..a97d7978 100644 --- a/src/vpic/vpic.cc +++ b/src/vpic/vpic.cc @@ -7,9 +7,9 @@ * March/April 2004 - Heavily revised and extended from earlier V4PIC versions * */ - + #include "vpic.h" - + /* Note that, when a vpic_simulation is created (and thus registered with the checkpt service), it is created empty; none of the simulation objects on which it depends have been created yet. (These get created @@ -99,7 +99,7 @@ vpic_simulation::vpic_simulation() { // if( n_rng Date: Mon, 14 Oct 2019 18:39:44 -0600 Subject: [PATCH 58/95] temporarily disable second hydro dumping species as it breaks the current ported approach --- sample/harrisHDF5 | 3 ++- src/vpic/dump.cc | 21 ++++++++++++--------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/sample/harrisHDF5 b/sample/harrisHDF5 index 0c084b07..d6cb208e 100644 --- a/sample/harrisHDF5 +++ b/sample/harrisHDF5 @@ -142,6 +142,7 @@ begin_initialization { num_step = int(0.2*taui/(wci*dt)); status_interval = int(1./(wci*dt)); field_interval = status_interval; + hydro_interval = status_interval; sync_shared_interval = status_interval; clean_div_e_interval = status_interval; clean_div_b_interval = status_interval; @@ -381,7 +382,7 @@ begin_diagnostics { // accumulated using a self-consistent charge-conserving method. Hydro dumps // are in a binary format. Each rank makes a hydro dump. if( should_dump(ehydro) ) dump_hydro_hdf5("electron","ehydro"); - if( should_dump(ihydro) ) dump_hydro_hdf5("ion", "ihydro"); + //if( should_dump(ihydro) ) dump_hydro_hdf5("ion", "ihydro"); // Particle dumps store the particle data for a given species. The data // written is known at the time t = time().
By default, particle dumps diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index cbdb1289..698c2283 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -330,11 +330,11 @@ vpic_simulation::dump_fields_hdf5( const char *fbase, int ftag ) #ifdef DUMP_INFO_DEBUG printf("MPI rank = %d, size = %d \n", mpi_rank, mpi_size); - printf("base dir for field: %s \n", global->fdParams.baseDir); - printf("stride x y z = (%ld, %ld, %ld)\n", global->fdParams.stride_x, global->fdParams.stride_y, global->fdParams.stride_z); + //printf("base dir for field: %s \n", fdParams.baseDir); + //printf("stride x y z = (%ld, %ld, %ld)\n", fdParams.stride_x, fdParams.stride_y, fdParams.stride_z); printf("grid x, y z = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); printf("domain loc (x0, y0, z0) -> (x1, y1, z1) = (%f, %f, %f) -> (%f, %f, %f) \n", grid->x0, grid->y0, grid->z0, grid->x1, grid->y1, grid->z1); - printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, global->topology_y, global->topology_z); + //printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, global->topology_y, global->topology_z); printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid->sz, grid->nv); #endif @@ -458,9 +458,9 @@ printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ global_count[2] = (grid->nz); #ifdef DUMP_INFO_DEBUG - printf("global size = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", field_global_size[0], field_global_size[1], field_global_size[2]); - printf("global_offset = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", global_offset[0], global_offset[1], global_offset[2]); - printf("global_count = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", global_count[0], global_count[1], global_count[2]); + printf("global size = %d %d %d \n", field_global_size[0], field_global_size[1], field_global_size[2]); + printf("global_offset = %d %d %d \n", global_offset[0], global_offset[1], global_offset[2]); + printf("global_count = %d %d %d \n", global_count[0], 
global_count[1], global_count[2]); printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); fflush(stdout); #endif @@ -624,6 +624,9 @@ printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ field_tframe++; } } + +// TODO: fix this, it currently uses a static global and the logic only +// supports 1 species otherwise things get out of sync void vpic_simulation::dump_hydro_hdf5( const char *speciesname, const char *fbase, int ftag ) @@ -758,9 +761,9 @@ printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid- global_count[2] = (grid->nz); #ifdef DUMP_INFO_DEBUG - printf("global size = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); - printf("global_offset = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", global_offset[0], global_offset[1], global_offset[2]); - printf("global_count = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", global_count[0], global_count[1], global_count[2]); + printf("global size = %d %d %d \n", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); + printf("global_offset = %d %d %d \n", global_offset[0], global_offset[1], global_offset[2]); + printf("global_count = %d %d %d \n", global_count[0], global_count[1], global_count[2]); printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); fflush(stdout); #endif From 8143ab3e69cfb375b8ed2c9b498d3a1375ad4f28 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 15 Oct 2019 13:03:17 -0600 Subject: [PATCH 59/95] modify sprinf of size_t to be the correct zu --- src/vpic/dump.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 698c2283..eb3be0ba 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -366,17 +366,17 @@ vpic_simulation::dump_fields_hdf5( const char *fbase, int ftag ) sprintf(field_scratch, DUMP_DIR_FORMAT, "field_hdf5"); 
dump_mkdir(field_scratch); - sprintf(subfield_scratch, "%s/T.%lld/", field_scratch, step_for_viou); + sprintf(subfield_scratch, "%s/T.%zu/", field_scratch, step_for_viou); dump_mkdir(subfield_scratch); - sprintf(fname, "%s/%s_%lld.h5", subfield_scratch, "fields", step_for_viou); + sprintf(fname, "%s/%s_%zu.h5", subfield_scratch, "fields", step_for_viou); double el1 = uptime(); hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); H5Pclose(plist_id); - sprintf(fname, "Timestep_%lld", step_for_viou); + sprintf(fname, "Timestep_%zu", step_for_viou); hid_t group_id = H5Gcreate(file_id, fname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); el1 = uptime() - el1; @@ -688,17 +688,17 @@ printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid- sprintf(hydro_scratch, "./%s", "hydro_hdf5"); dump_mkdir(hydro_scratch); - sprintf(subhydro_scratch, "%s/T.%lld/", hydro_scratch, step_for_viou); + sprintf(subhydro_scratch, "%s/T.%zu/", hydro_scratch, step_for_viou); dump_mkdir(subhydro_scratch); - sprintf(hname, "%s/hydro_%s_%lld.h5", subhydro_scratch, speciesname, step_for_viou); + sprintf(hname, "%s/hydro_%s_%zu.h5", subhydro_scratch, speciesname, step_for_viou); double el1 = uptime(); hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); hid_t file_id = H5Fcreate(hname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); H5Pclose(plist_id); - sprintf(hname, "Timestep_%lld", step_for_viou); + sprintf(hname, "Timestep_%zu", step_for_viou); hid_t group_id = H5Gcreate(file_id, hname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); el1 = uptime() - el1; @@ -713,7 +713,7 @@ printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid- // if (global->fdParams.output_vars.bitset(i)) // varlist[c++] = i; - //printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars); + //printf("\nBEGIN_OUTPUT: numvars = %zu \n", 
numvars); //typedef struct hydro { From eaa4c3cd96be699ed85fcccb0f92e8965d05866d Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 15 Oct 2019 13:30:21 -0600 Subject: [PATCH 60/95] remove scidac 407 ifdefs --- src/vpic/dump.cc | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index eb3be0ba..2b990533 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -663,24 +663,9 @@ void vpic_simulation::dump_hydro_hdf5( const char *speciesname, if (!sp) ERROR(("Invalid species name: %s", speciesname)); -#ifdef ENABLE_V407_SCIDAC - clear_hydro( hydro, grid ); - accumulate_hydro_p( hydro, sp->p, sp->np, sp->q_m, interpolator, grid ); - synchronize_hydro( hydro, grid ); -#else clear_hydro_array(hydro_array); accumulate_hydro_p(hydro_array, sp, interpolator_array); synchronize_hydro_array(hydro_array); -#endif - /*#ifdef DUMP_INFO_DEBUG -printf("MPI rank = %d, size = %d \n", mpi_rank, mpi_size); -printf("base dir for field: %s \n", global->fdParams.baseDir); -printf("stride x y z = (%ld, %ld, %ld)\n", global->fdParams.stride_x, global->fdParams.stride_y, global->fdParams.stride_z); -printf("grid x, y z = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); -printf("domain loc (x0, y0, z0) -> (x1, y1, z1) = (%f, %f, %f) -> (%f, %f, %f) \n", grid->x0, grid->y0, grid->z0, grid->x1, grid->y1, grid->z1); -printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, global->topology_y, global->topology_z); -printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid->sz, grid->nv); -#endif*/ char hname[256]; char hydro_scratch[128]; @@ -952,17 +937,9 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, { COPY(&sp->p[i], &sp_p[iptl], 1); } - #ifdef ENABLE_V407_SCIDAC - # define PBUF_SIZE 32768 // 1MB of particles - for( int buf_start=0; buf_start np_local ) n_buf = np_local - buf_start; - COPY( p_buf, &sp->p[buf_start], n_buf ); - center_p( p_buf, n_buf, sp->q_m, 
interpolator, grid ); - } - #else + center_p(sp, interpolator_array); - #endif + ec1 = uptime() - ec1; int mpi_rank; MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); From 6ad27e80d0585fa785696739902a69edefda3bb1 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 15 Oct 2019 16:36:14 -0600 Subject: [PATCH 61/95] fix global static and re-enable multiple species dumping --- sample/harrisHDF5 | 4 ++-- src/vpic/dump.cc | 21 +++++++++++++-------- src/vpic/vpic.cc | 2 ++ src/vpic/vpic.h | 2 +- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/sample/harrisHDF5 b/sample/harrisHDF5 index d6cb208e..2b3b21bf 100644 --- a/sample/harrisHDF5 +++ b/sample/harrisHDF5 @@ -381,8 +381,8 @@ begin_diagnostics { // purely diagnostic. It is not used by the simulation and it is not // accumulated using a self-consistent charge-conserving method. Hydro dumps // are in a binary format. Each rank makes a hydro dump. - if( should_dump(ehydro) ) dump_hydro_hdf5("electron","ehydro"); - //if( should_dump(ihydro) ) dump_hydro_hdf5("ion", "ihydro"); + if(should_dump(ehydro) ) dump_hydro_hdf5("electron","ehydro"); + if( should_dump(ihydro) ) dump_hydro_hdf5("ion", "ihydro"); // Particle dumps store the particle data for a given species. The data // written is known at the time t = time(). By default, particle dumps diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 2b990533..8ed36449 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -10,6 +10,7 @@ */ #include +#include #include "vpic.h" #include "dumpmacros.h" @@ -27,6 +28,10 @@ // COMPATIBLE WITH EXISTING EXTERNAL 3RD PARTY VISUALIZATION SOFTWARE. // IN THE LONG RUN, THIS EXTERNAL SOFTWARE WILL NEED TO BE UPDATED. 
+// TODO: this should live somewhere more sensible, but it's better than the +// global static it replaces +std::unordered_map tframe_map; + int vpic_simulation::dump_mkdir(const char * dname) { return FileUtils::makeDirectory(dname); } // dump_mkdir @@ -578,8 +583,6 @@ printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ char dxdydz[128]; sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); - //int fields_interval = global->fields_interval; - // TODO: make sure field interval is set int nframes = num_step / field_interval + 1; static int field_tframe = 0; @@ -598,6 +601,8 @@ printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ printf(" tframe: %d \n", field_tframe); #endif + // TODO: this footer dumping is more likely better done in a + // destructor, rather than hoping a multiple division works out if (field_tframe >= 1) { if (field_tframe == (nframes - 1)) @@ -842,9 +847,9 @@ void vpic_simulation::dump_hydro_hdf5( const char *speciesname, char dxdydz[128]; sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); - int nframes = num_step / field_interval + 1; - int fields_interval = field_interval; - static int tframe = 0; + int nframes = num_step / hydro_interval + 1; + + const int tframe = tframe_map[sp->id]; #ifdef DUMP_INFO_DEBUG printf(" meta file : %s \n", output_xml_file); @@ -853,7 +858,7 @@ void vpic_simulation::dump_hydro_hdf5( const char *speciesname, printf(" orignal: %s \n", orignal); printf(" dxdydz: %s \n", dxdydz); printf(" nframes: %d \n", nframes); - printf(" fields_interval: %d \n", fields_interval); + printf(" hydro_fields_interval: %d \n", hydro_interval); printf(" current step: %lld \n", step_for_viou); printf(" Simulation time: %f \n", grid->t0); printf(" tframe: %d \n", tframe); @@ -874,7 +879,7 @@ void vpic_simulation::dump_hydro_hdf5( const char *speciesname, } else { - create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, fields_interval); + create_file_with_header(output_xml_file, dimensions_3d, 
orignal, dxdydz, nframes, hydro_interval); if (tframe == (nframes - 1)) { invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); @@ -884,7 +889,7 @@ void vpic_simulation::dump_hydro_hdf5( const char *speciesname, invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); } } - tframe++; + tframe_map[sp->id]++; } } diff --git a/src/vpic/vpic.cc b/src/vpic/vpic.cc index a97d7978..bfd18767 100644 --- a/src/vpic/vpic.cc +++ b/src/vpic/vpic.cc @@ -110,6 +110,8 @@ vpic_simulation::vpic_simulation() { #ifdef VPIC_ENABLE_HDF5 // Default init hdf5 dump flags + field_interval = 1; + hydro_interval = 1; field_dump_flag = field_dump_flag_t(); hydro_dump_flag = hydro_dump_flag_t(); #endif diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index e90808bb..ce1a2454 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -284,7 +284,7 @@ class vpic_simulation { double quota; int checkpt_interval; int hydro_interval; - int field_interval = 1; + int field_interval; int particle_interval; size_t nxout, nyout, nzout; From f81849115cb20b1538b127eeac183b27c4e450a0 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Wed, 16 Oct 2019 12:33:22 -0600 Subject: [PATCH 62/95] first pass adding support for converting p->i to a global i --- src/vpic/dump.cc | 56 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 8ed36449..97253c79 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -693,7 +693,7 @@ void vpic_simulation::dump_hydro_hdf5( const char *speciesname, el1 = uptime() - el1; //sim_log("TimeHDF5Open: " << el1 << " s"); //Easy to handle results for scripts - double el2 = uptime(); + //double el2 = uptime(); // Create a variable list of field values to output. 
//size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); @@ -802,7 +802,7 @@ void vpic_simulation::dump_hydro_hdf5( const char *speciesname, if (hydro_dump_flag.txy) DUMP_HYDRO_TO_HDF5("txy", txy, H5T_NATIVE_FLOAT); - el2 = uptime() - el2; + //el2 = uptime() - el2; //sim_log("TimeHDF5Write: " << el2 << " s"); double el3 = uptime(); @@ -990,49 +990,81 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, el1 = uptime() - el1; //sim_log("Particle TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts - double el2 = uptime(); + //double el2 = uptime(); + // This point offset is silly, and loses the type safety (pf+1) hid_t dset_id = H5Dcreate(group_id, "dX", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); int ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf); H5Dclose(dset_id); - //if (rank == 0) printf ("Written variable dX \n"); dset_id = H5Dcreate(group_id, "dY", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 1); H5Dclose(dset_id); - //if (rank == 0) printf ("Written variable dY \n"); dset_id = H5Dcreate(group_id, "dZ", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 2); H5Dclose(dset_id); - //if (rank == 0) printf ("Written variable dZ \n"); + + int local_i = *(Pi + 3); + int write_i = local_i; + +#ifdef OUTPUT_CONVERT_GLOBAL_ID +# define UNVOXEL(rank, ix, iy, iz, nx, ny, nz) BEGIN_PRIMITIVE { \ + int _ix, _iy, _iz; \ + _ix = (rank); /* ix = ix+gpx*( iy+gpy*iz ) */ \ + _iy = _ix/int(nx); /* iy = iy+gpy*iz */ \ + _ix -= _iy*int(nx); /* ix = ix */ \ + _iz = _iy/int(ny); /* iz = iz */ \ + _iy -= _iz*int(ny); /* iy = iy */ \ + (ix) = _ix; \ + (iy) = _iy; \ + (iz) = _iz; \ + } END_PRIMITIVE + int ix, iy, iz, rx, ry, rz; + // Convert rank to local x/y/z + 
UNVOXEL(rank(), rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + // Calculate local ix/iy/iz + UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); + + // Convert ix/iy/iz to global + int gix = ix + (grid->nx * (rx)); + int giy = iy + (grid->ny * (ry)); + int giz = iz + (grid->nz * (rz)); + + // calculate global grid sizes + int gnx = grid->nx * grid->gpx; + int gny = grid->ny * grid->gpy; + int gnz = grid->nz * grid->gpz; + + int global_i = VOXEL(gix, giy, giz, gnx, gny, gnz); + + write_i = global_i; + // TODO: update the address written below, it requires something more stable than a statced int +#undef UNVOXEL +#endif dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, memspace, filespace, plist_id, Pi + 3); H5Dclose(dset_id); - //if (rank == 0) printf ("Written variable i \n"); dset_id = H5Dcreate(group_id, "Ux", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 4); H5Dclose(dset_id); - //if (rank == 0) printf ("Written variable Ux \n"); dset_id = H5Dcreate(group_id, "Uy", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 5); H5Dclose(dset_id); - //if (rank == 0) printf ("Written variable Uy \n"); dset_id = H5Dcreate(group_id, "Uz", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 6); H5Dclose(dset_id); - //if (rank == 0) printf ("Written variable Uz \n"); dset_id = H5Dcreate(group_id, "q", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 7); H5Dclose(dset_id); - //if (rank == 0) printf ("Written variable q \n"); - el2 = uptime() - el2; + //el2 = uptime() - el2; 
//sim_log("Particle TimeHDF5Write: " << el2 << " s"); double el3 = uptime(); From 73e071690489ec7269bf7cd415881be672204626 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Wed, 16 Oct 2019 15:32:07 -0600 Subject: [PATCH 63/95] add loop ovr particles and ability to write custom global pi to file --- src/vpic/dump.cc | 58 +++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 97253c79..733a2913 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -978,6 +978,11 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, hsize_t memspace_count_temp = numparticles * 8; hid_t memspace = H5Screate_simple(1, &memspace_count_temp, NULL); + + // Don't need, can just use H5S_ALL + //hsize_t linearspace_count_temp = numparticles; + //hid_t linearspace = H5Screate_simple(1, &linearspace_count_temp, NULL); + plist_id = H5Pcreate(H5P_DATASET_XFER); //Comment out for test only @@ -1005,9 +1010,6 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 2); H5Dclose(dset_id); - int local_i = *(Pi + 3); - int write_i = local_i; - #ifdef OUTPUT_CONVERT_GLOBAL_ID # define UNVOXEL(rank, ix, iy, iz, nx, ny, nz) BEGIN_PRIMITIVE { \ int _ix, _iy, _iz; \ @@ -1020,33 +1022,49 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, (iy) = _iy; \ (iz) = _iz; \ } END_PRIMITIVE - int ix, iy, iz, rx, ry, rz; - // Convert rank to local x/y/z - UNVOXEL(rank(), rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); - // Calculate local ix/iy/iz - UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); + std::vector global_pi; + global_pi.reserve(numparticles); + // TODO: this could be parallel + for (int i = 0; i < numparticles; i++) + { + int local_i = *(Pi + 3); + int write_i = local_i; + + int ix, iy, iz, rx, ry, rz; + // Convert rank to local x/y/z + UNVOXEL(rank(), rx, ry, rz, grid->gpx, grid->gpy, 
grid->gpz); + + // Calculate local ix/iy/iz + UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); - // Convert ix/iy/iz to global - int gix = ix + (grid->nx * (rx)); - int giy = iy + (grid->ny * (ry)); - int giz = iz + (grid->nz * (rz)); + // Convert ix/iy/iz to global + int gix = ix + (grid->nx * (rx)); + int giy = iy + (grid->ny * (ry)); + int giz = iz + (grid->nz * (rz)); - // calculate global grid sizes - int gnx = grid->nx * grid->gpx; - int gny = grid->ny * grid->gpy; - int gnz = grid->nz * grid->gpz; + // calculate global grid sizes + int gnx = grid->nx * grid->gpx; + int gny = grid->ny * grid->gpy; + int gnz = grid->nz * grid->gpz; - int global_i = VOXEL(gix, giy, giz, gnx, gny, gnz); + int global_i = VOXEL(gix, giy, giz, gnx, gny, gnz); + int* hmm = new int(); + *hmm = 10; + + global_pi[i] = global_i; + } - write_i = global_i; - // TODO: update the address written below, it requires something more stable than a statced int #undef UNVOXEL -#endif + dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, H5S_ALL, filespace, plist_id, global_pi.data()); + H5Dclose(dset_id); +#else dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, memspace, filespace, plist_id, Pi + 3); H5Dclose(dset_id); +#endif dset_id = H5Dcreate(group_id, "Ux", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 4); From 843babb51429a496279371821c8f0e84a7844c78 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 22 Oct 2019 09:16:04 -0600 Subject: [PATCH 64/95] tidied up global particle id convert, close to correct now --- src/vpic/dump.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 733a2913..a63a13a2 100644 --- 
a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -1010,6 +1010,7 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 2); H5Dclose(dset_id); +#define OUTPUT_CONVERT_GLOBAL_ID 1 #ifdef OUTPUT_CONVERT_GLOBAL_ID # define UNVOXEL(rank, ix, iy, iz, nx, ny, nz) BEGIN_PRIMITIVE { \ int _ix, _iy, _iz; \ @@ -1028,8 +1029,7 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, // TODO: this could be parallel for (int i = 0; i < numparticles; i++) { - int local_i = *(Pi + 3); - int write_i = local_i; + int local_i = sp->p[i].i; int ix, iy, iz, rx, ry, rz; // Convert rank to local x/y/z @@ -1048,10 +1048,10 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, int gny = grid->ny * grid->gpy; int gnz = grid->nz * grid->gpz; - int global_i = VOXEL(gix, giy, giz, gnx, gny, gnz); - int* hmm = new int(); - *hmm = 10; + // TODO: find a better way to account for the hard coded ghosts in VOXEL + int global_i = VOXEL(gix, giy, giz, gnx-2, gny-2, gnz-2); + //std::cout << rank() << " local i " << local_i << " becomes " << global_i << std::endl; global_pi[i] = global_i; } From 351a785d46bb8088191ac90e2a5711aa897600f4 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 29 Oct 2019 14:27:46 -0600 Subject: [PATCH 65/95] first pass where build system does something useful --- CMakeLists.txt | 94 +++++---- bin/vpic-local.in | 2 +- sample/harrisOpenPMD | 443 +++++++++++++++++++++++++++++++++++++++++++ src/vpic/dump.cc | 16 ++ src/vpic/vpic.h | 3 + 5 files changed, 516 insertions(+), 42 deletions(-) create mode 100644 sample/harrisOpenPMD diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c354de5..57f1ed13 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,6 +88,8 @@ option(DISABLE_DYNAMIC_RESIZING "Prevent particle arrays from dynamically resizi option(USE_HDF5 "Enable HDF5 for use during IO. VPIC does not help you install HDF5" OFF) +option(USE_OPENPMD "Enable OpenPMD for use during IO. 
VPIC does not help you install OpenPM" OFF) + # option to set minimum number of particles set(SET_MIN_NUM_PARTICLES AUTO CACHE STRING "Select minimum number of particles to use, if using dynamic particle array resizing") @@ -136,7 +138,7 @@ find_package(Threads REQUIRED) if(USE_LEGACY_SORT) add_definitions(-DVPIC_USE_LEGACY_SORT) - set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DVPIC_USE_LEGACY_SORT") + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DVPIC_USE_LEGACY_SORT") endif(USE_LEGACY_SORT) #------------------------------------------------------------------------------# @@ -151,7 +153,7 @@ endif() if(USE_PTHREADS) add_definitions(-DVPIC_USE_PTHREADS) - set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DVPIC_USE_PTHREADS") + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DVPIC_USE_PTHREADS") endif(USE_PTHREADS) if(USE_OPENMP) @@ -307,30 +309,6 @@ if(ENABLE_COVERAGE_BUILD) set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} --coverage") endif(ENABLE_COVERAGE_BUILD) -# process Makefile.run.in to get a simple Makefile.run for a run. Points to -# local built exe wrapper, and has example deck/platform. 
-configure_file(${CMAKE_SOURCE_DIR}/sample/Makefile.run.in - ${CMAKE_BINARY_DIR}/bin/Makefile.run) - -# Append all defines to VPIC_DEFINES, so it can be seen during input deck building -get_directory_property(ALL_DEFINES DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS) -#string(REPLACE ";" " -D" EEK "${ALL_DEFINES}") -foreach(d ${ALL_DEFINES}) - set(VPIC_DEFINES "${VPIC_DEFINES} -D${d}") -endforeach() - -# install script -configure_file(${CMAKE_SOURCE_DIR}/bin/vpic.in - ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic-install) -install(FILES ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic-install - DESTINATION bin - RENAME vpic - PERMISSIONS - OWNER_READ OWNER_WRITE OWNER_EXECUTE - GROUP_READ GROUP_EXECUTE - WORLD_READ WORLD_EXECUTE - ) - install(FILES deck/main.cc deck/wrapper.cc DESTINATION share/vpic) install(FILES deck/wrapper.h DESTINATION include/vpic) install(DIRECTORY src/ DESTINATION include/vpic @@ -361,22 +339,22 @@ else() install(TARGETS vpic LIBRARY DESTINATION lib ARCHIVE DESTINATION lib) endif() -# Configure local script to generate bin/vpic -configure_file(${CMAKE_SOURCE_DIR}/bin/vpic-local.in - ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic) - -file(COPY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic - DESTINATION ${CMAKE_BINARY_DIR}/bin - FILE_PERMISSIONS - OWNER_READ OWNER_WRITE OWNER_EXECUTE - GROUP_READ GROUP_EXECUTE - WORLD_READ WORLD_EXECUTE -) - target_include_directories(vpic INTERFACE ${CMAKE_SOURCE_DIR}/src) target_link_libraries(vpic ${VPIC_EXPOSE} ${MPI_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ${CMAKE_DL_LIBS} ${HDF5_C_LIBRARIES}) target_compile_options(vpic ${VPIC_EXPOSE} ${MPI_C_COMPILE_FLAGS}) +if(USE_OPENPMD) + # Enable OpenPMD, and the relevant defines + message("Using OpenPMD") + find_package(openPMD REQUIRED) + add_definitions(-DVPIC_ENABLE_OPENPMD) + link_libraries(openPMD::openPMD) + get_target_property(openPMD_LIBRARIES openPMD::openPMD LOCATION) + string(REPLACE ";" " " string_libraries ${openPMD_LIBRARIES}) + 
set(VPIC_CXX_LIBRARIES "${VPIC_CXX_LIBRARIES} ${string_libraries}") +endif(USE_OPENPMD) +message(${VPIC_CXX_LIBRARIES}) + macro(build_a_vpic name deck) if(NOT EXISTS ${deck}) message(FATAL_ERROR "Could not find deck '${deck}'") @@ -434,6 +412,40 @@ if(ENABLE_PERFORMANCE_TESTS) include_directories(${CATCH_DIR}) add_subdirectory(test/performance) endif(ENABLE_PERFORMANCE_TESTS) -#~---------------------------------------------------------------------------~-# -# vim: set tabstop=2 shiftwidth=2 expandtab : -#~---------------------------------------------------------------------------~-# + +# process Makefile.run.in to get a simple Makefile.run for a run. Points to +# local built exe wrapper, and has example deck/platform. +configure_file(${CMAKE_SOURCE_DIR}/sample/Makefile.run.in + ${CMAKE_BINARY_DIR}/bin/Makefile.run) + +# Append all defines to VPIC_DEFINES, so it can be seen during input deck building +get_directory_property(ALL_DEFINES DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS) +#string(REPLACE ";" " -D" EEK "${ALL_DEFINES}") +foreach(d ${ALL_DEFINES}) + set(VPIC_DEFINES "${VPIC_DEFINES} -D${d}") +endforeach() + +# install script +configure_file(${CMAKE_SOURCE_DIR}/bin/vpic.in + ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic-install) +install(FILES ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic-install + DESTINATION bin + RENAME vpic + PERMISSIONS + OWNER_READ OWNER_WRITE OWNER_EXECUTE + GROUP_READ GROUP_EXECUTE + WORLD_READ WORLD_EXECUTE +) + +# Configure local script to generate bin/vpic +configure_file(${CMAKE_SOURCE_DIR}/bin/vpic-local.in + ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic) + +file(COPY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic + DESTINATION ${CMAKE_BINARY_DIR}/bin + FILE_PERMISSIONS + OWNER_READ OWNER_WRITE OWNER_EXECUTE + GROUP_READ GROUP_EXECUTE + WORLD_READ WORLD_EXECUTE +) + diff --git a/bin/vpic-local.in b/bin/vpic-local.in index f0e64ded..e372c91f 100644 --- a/bin/vpic-local.in +++ b/bin/vpic-local.in @@ -4,4 +4,4 @@ 
deck=`echo $1 | sed 's,\.cxx,,g;s,\.cc,,g;s,\.cpp,,g;s,.*\/,,g'` echo "${CMAKE_CXX_COMPILER} ${VPIC_DEFINES} ${CMAKE_CXX_FLAGS} -I. -I${CMAKE_SOURCE_DIR}/src ${VPIC_CXX_FLAGS} -DINPUT_DECK=$1 ${CMAKE_SOURCE_DIR}/deck/main.cc ${CMAKE_SOURCE_DIR}/deck/wrapper.cc -o $deck.${CMAKE_SYSTEM_NAME} -Wl,-rpath,${CMAKE_BINARY_DIR} -L${CMAKE_BINARY_DIR} -lvpic ${VPIC_CXX_LIBRARIES} -lpthread -ldl" -${CMAKE_CXX_COMPILER} ${VPIC_DEFINES}${CMAKE_CXX_FLAGS} -I. -I${CMAKE_SOURCE_DIR}/src ${VPIC_CXX_FLAGS} -DINPUT_DECK=$1 ${CMAKE_SOURCE_DIR}/deck/main.cc ${CMAKE_SOURCE_DIR}/deck/wrapper.cc -o $deck.${CMAKE_SYSTEM_NAME} -Wl,-rpath,${CMAKE_BINARY_DIR} -L${CMAKE_BINARY_DIR} -lvpic ${VPIC_CXX_LIBRARIES} -lpthread -ldl +${CMAKE_CXX_COMPILER} ${VPIC_DEFINES} ${CMAKE_CXX_FLAGS} -I. -I${CMAKE_SOURCE_DIR}/src ${VPIC_CXX_FLAGS} -DINPUT_DECK=$1 ${CMAKE_SOURCE_DIR}/deck/main.cc ${CMAKE_SOURCE_DIR}/deck/wrapper.cc -o $deck.${CMAKE_SYSTEM_NAME} -Wl,-rpath,${CMAKE_BINARY_DIR} -L${CMAKE_BINARY_DIR} -lvpic ${VPIC_CXX_LIBRARIES} -lpthread -ldl diff --git a/sample/harrisOpenPMD b/sample/harrisOpenPMD new file mode 100644 index 00000000..f8c932e9 --- /dev/null +++ b/sample/harrisOpenPMD @@ -0,0 +1,443 @@ +// Magnetic reconnection in a Harris equilibrium thin current sheet +// +// This input deck reproduces the PIC simulations found in: +// William Daughton. "Nonlinear dynamics of thin current sheets." Phys. +// Plasmas. 9(9): 3668-3678. September 2002. +// +// This input deck was written by: +// Kevin J Bowers, Ph.D. +// Plasma Physics Group (X-1) +// Applied Physics Division +// Los Alamos National Lab +// August 2003 - original version +// October 2003 - heavily revised to utilize input deck syntactic sugar +// March/April 2004 - rewritten for domain decomposition V4PIC + +// If you want to use global variables (for example, to store the dump +// intervals for your diagnostics section), it must be done in the globals +// section. 
Variables declared the globals section will be preserved across +// restart dumps. For example, if the globals section is: +// begin_globals { +// double variable; +// } end_globals +// the double "variable" will be visible to other input deck sections as +// "global->variable". Note: Variables declared in the globals section are set +// to zero before the user's initialization block is executed. Up to 16K +// of global variables can be defined. + + +// Deck only works if VPIC was build with HDF support. Check for that: +#ifndef VPIC_ENABLE_OPENPMD +#error "VPIC_ENABLE_OPENPMD" is required +#endif + +begin_globals { + double energies_interval; + double fields_interval; + double ehydro_interval; + double ihydro_interval; + double eparticle_interval; + double iparticle_interval; + double restart_interval; +}; + +begin_initialization { + // At this point, there is an empty grid and the random number generator is + // seeded with the rank. The grid, materials, species need to be defined. + // Then the initial non-zero fields need to be loaded at time level 0 and the + // particles (position and momentum both) need to be loaded at time level 0. 
+ + // Example of how to call / set dumping + //field_dump_flag.disableEMAT(); + + double input_mass_ratio; + int input_seed; + + // Arguments can be passed from the command line to the input deck + if( num_cmdline_arguments!=3 ) { + // Set sensible defaults + input_mass_ratio = 1.0; + input_seed = 0; + + sim_log( "Defaulting to mass_ratio of " << input_mass_ratio << " and seed of " << input_seed ); + sim_log( "For Custom Usage: " << cmdline_argument[0] << " mass_ratio seed" ); + } + else { + input_mass_ratio = atof(cmdline_argument[1]); // Ion mass / electron mass + input_seed = atof(cmdline_argument[2]); // Ion mass / electron mass + sim_log( "Detected input mass_ratio of " << input_mass_ratio << " and seed of " << input_seed ); + } + seed_entropy( input_seed ); + + // Diagnostic messages can be passed written (usually to stderr) + sim_log( "Computing simulation parameters"); + + // Define the system of units for this problem (natural units) + double L = 1; // Length normalization (sheet thickness) + double ec = 1; // Charge normalization + double me = 1; // Mass normalization + double c = 1; // Speed of light + double eps0 = 1; // Permittivity of space + + // Physics parameters + double mi_me = input_mass_ratio; // Ion mass / electron mass + double rhoi_L = 1; // Ion thermal gyroradius / Sheet thickness + double Ti_Te = 1; // Ion temperature / electron temperature + double wpe_wce = 3; // Electron plasma freq / electron cycltron freq + double theta = 0; // Orientation of the simulation wrt current sheet + double taui = 100; // Simulation wci's to run + + // Numerical parameters + double Lx = 16*L; // How big should the box be in the x direction + double Ly = 16*L; // How big should the box be in the y direction + double Lz = 16*L; // How big should the box be in the z direction + double nx = 64; // Global resolution in the x direction + double ny = 64; // Global resolution in the y direction + double nz = 1; // Global resolution in the z direction + double nppc 
= 64; // Average number of macro particles per cell (both species combined!) + double cfl_req = 0.99; // How close to Courant should we try to run + double wpedt_max = 0.36; // How big a timestep is allowed if Courant is not too restrictive + double damp = 0.001; // Level of radiation damping + + // Derived quantities + double mi = me*mi_me; // Ion mass + double kTe = me*c*c/(2*wpe_wce*wpe_wce*(1+Ti_Te)); // Electron temperature + double kTi = kTe*Ti_Te; // Ion temperature + double vthe = sqrt(2*kTe/me); // Electron thermal velocity (B.D. convention) + double vthi = sqrt(2*kTi/mi); // Ion thermal velocity (B.D. convention) + double wci = vthi/(rhoi_L*L); // Ion cyclotron frequency + double wce = wci*mi_me; // Electron cyclotron frequency + double wpe = wce*wpe_wce; // Electron plasma frequency + double wpi = wpe/sqrt(mi_me); // Ion plasma frequency + double vdre = c*c*wce/(wpe*wpe*L*(1+Ti_Te)); // Electron drift velocity + double vdri = -Ti_Te*vdre; // Ion drift velocity + double b0 = me*wce/ec; // Asymptotic magnetic field strength + double n0 = me*eps0*wpe*wpe/(ec*ec); // Peak electron density (also peak ion density) + double Npe = 2*n0*Ly*Lz*L*tanh(0.5*Lx/L); // Number of physical electrons in box + double Npi = Npe; // Number of physical ions in box + double Ne = 0.5*nppc*nx*ny*nz; // Total macro electrons in box + Ne = trunc_granular(Ne,nproc()); // Make it divisible by number of processors + double Ni = Ne; // Total macro ions in box + double we = Npe/Ne; // Weight of a macro electron + double wi = Npi/Ni; // Weight of a macro ion + double gdri = 1/sqrt(1-vdri*vdri/(c*c)); // gamma of ion drift frame + double gdre = 1/sqrt(1-vdre*vdre/(c*c)); // gamma of electron drift frame + double udri = vdri*gdri; // 4-velocity of ion drift frame + double udre = vdre*gdre; // 4-velocity of electron drift frame + double uthi = sqrt(kTi/mi)/c; // Normalized ion thermal velocity (K.B. convention) + double uthe = sqrt(kTe/me)/c; // Normalized electron thermal velocity (K.B. 
convention) + double cs = cos(theta); + double sn = sin(theta); + + // Determine the timestep + double dg = courant_length(Lx,Ly,Lz,nx,ny,nz); // Courant length + double dt = cfl_req*dg/c; // Courant limited time step + if( wpe*dt>wpedt_max ) dt=wpedt_max/wpe; // Override time step if plasma frequency limited + + //////////////////////////////////////// + // Setup high level simulation parmeters + + num_step = int(0.2*taui/(wci*dt)); + status_interval = int(1./(wci*dt)); + field_interval = status_interval; + hydro_interval = status_interval; + sync_shared_interval = status_interval; + clean_div_e_interval = status_interval; + clean_div_b_interval = status_interval; + + global->energies_interval = status_interval; + global->fields_interval = status_interval; + global->ehydro_interval = status_interval; + global->ihydro_interval = status_interval; + global->eparticle_interval = status_interval; + global->iparticle_interval = status_interval; + global->restart_interval = status_interval; + + /////////////////////////// + // Setup the space and time + + // Setup basic grid parameters + define_units( c, eps0 ); + define_timestep( dt ); + + // Parition a periodic box among the processors sliced uniformly along y + define_periodic_grid( -0.5*Lx, 0, 0, // Low corner + 0.5*Lx, Ly, Lz, // High corner + nx, ny, nz, // Resolution + 1, nproc(), 1 ); // Topology + + // Override some of the boundary conditions to put a particle reflecting + // perfect electrical conductor on the -x and +x boundaries + set_domain_field_bc( BOUNDARY(-1,0,0), pec_fields ); + set_domain_field_bc( BOUNDARY( 1,0,0), pec_fields ); + set_domain_particle_bc( BOUNDARY(-1,0,0), reflect_particles ); + set_domain_particle_bc( BOUNDARY( 1,0,0), reflect_particles ); + + define_material( "vacuum", 1 ); + // Note: define_material defaults to isotropic materials with mu=1,sigma=0 + // Tensor electronic, magnetic and conductive materials are supported + // though. 
See "shapes" for how to define them and assign them to regions. + // Also, space is initially filled with the first material defined. + + // If you pass NULL to define field array, the standard field array will + // be used (if damp is not provided, no radiation damping will be used). + define_field_array( NULL, damp ); + + //////////////////// + // Setup the species + + // Allow 50% more local_particles in case of non-uniformity + // VPIC will pick the number of movers to use for each species + // Both species use out-of-place sorting + species_t * ion = define_species( "ion", ec, mi, 1.5*Ni/nproc(), -1, 40, 1 ); + species_t * electron = define_species( "electron", -ec, me, 1.5*Ne/nproc(), -1, 20, 1 ); + + /////////////////////////////////////////////////// + // Log diagnostic information about this simulation + + sim_log( "" ); + sim_log( "System of units" ); + sim_log( "L = " << L ); + sim_log( "ec = " << ec ); + sim_log( "me = " << me ); + sim_log( "c = " << c ); + sim_log( "eps0 = " << eps0 ); + sim_log( "" ); + sim_log( "Physics parameters" ); + sim_log( "rhoi/L = " << rhoi_L ); + sim_log( "Ti/Te = " << Ti_Te ); + sim_log( "wpe/wce = " << wpe_wce ); + sim_log( "mi/me = " << mi_me ); + sim_log( "theta = " << theta ); + sim_log( "taui = " << taui ); + sim_log( "" ); + sim_log( "Numerical parameters" ); + sim_log( "num_step = " << num_step ); + sim_log( "dt = " << dt ); + sim_log( "Lx = " << Lx << ", Lx/L = " << Lx/L ); + sim_log( "Ly = " << Ly << ", Ly/L = " << Ly/L ); + sim_log( "Lz = " << Lz << ", Lz/L = " << Lz/L ); + sim_log( "nx = " << nx << ", dx = " << Lx/nx << ", L/dx = " << L*nx/Lx ); + sim_log( "ny = " << ny << ", dy = " << Ly/ny << ", L/dy = " << L*ny/Ly ); + sim_log( "nz = " << nz << ", dz = " << Lz/nz << ", L/dz = " << L*nz/Lz ); + sim_log( "nppc = " << nppc ); + sim_log( "courant = " << c*dt/dg ); + sim_log( "damp = " << damp ); + sim_log( "" ); + sim_log( "Ion parameters" ); + sim_log( "qpi = " << ec << ", mi = " << mi << ", qpi/mi = " << ec/mi 
); + sim_log( "vthi = " << vthi << ", vthi/c = " << vthi/c << ", kTi = " << kTi ); + sim_log( "vdri = " << vdri << ", vdri/c = " << vdri/c ); + sim_log( "wpi = " << wpi << ", wpi dt = " << wpi*dt << ", n0 = " << n0 ); + sim_log( "wci = " << wci << ", wci dt = " << wci*dt ); + sim_log( "rhoi = " << vthi/wci << ", L/rhoi = " << L/(vthi/wci) << ", dx/rhoi = " << (Lx/nx)/(vthi/wci) ); + sim_log( "debyei = " << vthi/wpi << ", L/debyei = " << L/(vthi/wpi) << ", dx/debyei = " << (Lx/nx)/(vthi/wpi) ); + sim_log( "Npi = " << Npi << ", Ni = " << Ni << ", Npi/Ni = " << Npi/Ni << ", wi = " << wi ); + sim_log( "" ); + sim_log( "Electron parameters" ); + sim_log( "qpe = " << -ec << ", me = " << me << ", qpe/me = " << -ec/me ); + sim_log( "vthe = " << vthe << ", vthe/c = " << vthe/c << ", kTe = " << kTe ); + sim_log( "vdre = " << vdre << ", vdre/c = " << vdre/c ); + sim_log( "wpe = " << wpe << ", wpe dt = " << wpe*dt << ", n0 = " << n0 ); + sim_log( "wce = " << wce << ", wce dt = " << wce*dt ); + sim_log( "rhoe = " << vthe/wce << ", L/rhoe = " << L/(vthe/wce) << ", dx/rhoe = " << (Lx/nx)/(vthe/wce) ); + sim_log( "debyee = " << vthe/wpe << ", L/debyee = " << L/(vthe/wpe) << ", dx/debyee = " << (Lx/nx)/(vthe/wpe) ); + sim_log( "Npe = " << Npe << ", Ne = " << Ne << ", Npe/Ne = " << Npe/Ne << ", we = " << we ); + sim_log( "" ); + sim_log( "Miscellaneous" ); + sim_log( "nptotal = " << Ni + Ne ); + sim_log( "nproc = " << nproc() ); + sim_log( "" ); + + //////////////////////////// + // Load fields and particles + + sim_log( "Loading fields" ); + + set_region_field( everywhere, 0, 0, 0, // Electric field + 0, -sn*b0*tanh(x/L), cs*b0*tanh(x/L) ); // Magnetic field + // Note: everywhere is a region that encompasses the entire simulation + // In general, regions are specied as logical equations (i.e. 
x>0 && x+y<2) + + sim_log( "Loading particles" ); + + double ymin = rank()*Ly/nproc(), ymax = (rank()+1)*Ly/nproc(); + + repeat( Ni/nproc() ) { + double x, y, z, ux, uy, uz, d0; + + // Pick an appropriately distributed random location for the pair + do { + x = L*atanh( uniform( rng(0), -1, 1 ) ); + } while( x<=-0.5*Lx || x>=0.5*Lx ); + y = uniform( rng(0), ymin, ymax ); + z = uniform( rng(0), 0, Lz ); + + // For the ion, pick an isothermal normalized momentum in the drift frame + // (this is a proper thermal equilibrium in the non-relativistic limit), + // boost it from the drift frame to the frame with the magnetic field + // along z and then rotate it into the lab frame. Then load the particle. + // Repeat the process for the electron. + + ux = normal( rng(0), 0, uthi ); + uy = normal( rng(0), 0, uthi ); + uz = normal( rng(0), 0, uthi ); + d0 = gdri*uy + sqrt(ux*ux+uy*uy+uz*uz+1)*udri; + uy = d0*cs - uz*sn; + uz = d0*sn + uz*cs; + inject_particle( ion, x, y, z, ux, uy, uz, wi, 0, 0 ); + + ux = normal( rng(0), 0, uthe ); + uy = normal( rng(0), 0, uthe ); + uz = normal( rng(0), 0, uthe ); + d0 = gdre*uy + sqrt(ux*ux+uy*uy+uz*uz+1)*udre; + uy = d0*cs - uz*sn; + uz = d0*sn + uz*cs; + inject_particle( electron, x, y, z, ux, uy, uz, we, 0, 0 ); + } + + // Upon completion of the initialization, the following occurs: + // - The synchronization error (tang E, norm B) is computed between domains + // and tang E / norm B are synchronized by averaging where discrepancies + // are encountered. + // - The initial divergence error of the magnetic field is computed and + // one pass of cleaning is done (for good measure) + // - The bound charge density necessary to give the simulation an initially + // clean divergence e is computed. 
+ // - The particle momentum is uncentered from u_0 to u_{-1/2} + // - The user diagnostics are called on the initial state + // - The physics loop is started + // + // The physics loop consists of: + // - Advance particles from x_0,u_{-1/2} to x_1,u_{1/2} + // - User particle injection at x_{1-age}, u_{1/2} (use inject_particles) + // - User current injection (adjust field(x,y,z).jfx, jfy, jfz) + // - Advance B from B_0 to B_{1/2} + // - Advance E from E_0 to E_1 + // - User field injection to E_1 (adjust field(x,y,z).ex,ey,ez,cbx,cby,cbz) + // - Advance B from B_{1/2} to B_1 + // - (periodically) Divergence clean electric field + // - (periodically) Divergence clean magnetic field + // - (periodically) Synchronize shared tang e and norm b + // - Increment the time step + // - Call user diagnostics + // - (periodically) Print a status message +} + +begin_diagnostics { + +# define should_dump(x) (global->x##_interval>0 && remainder(step(),global->x##_interval)==0) + + if( step()==-10 ) { + // A grid dump contains all grid parameters, field boundary conditions, + // particle boundary conditions and domain connectivity information. This + // is stored in a binary format. Each rank makes a grid dump + dump_grid("grid"); + + // A materials dump contains all the materials parameters. This is in a + // text format. Only rank 0 makes the materials dump + dump_materials("materials"); + + // A species dump contains the physics parameters of a species. This is in + // a text format. Only rank 0 makes the species dump + dump_species("species"); + } + + // Energy dumps store all the energies in various directions of E and B + // and the total kinetic (not including rest mass) energies of each species + // species in a simple text format. By default, the energies are appended to + // the file. However, if a "0" is added to the dump_energies call, a new + // energies dump file will be created. 
The energies are in the units of the + // problem and are all time centered appropriately. Note: When restarting a + // simulation from a restart dump made at a prior time step to the last + // energies dump, the energies file will have a "hiccup" of intervening + // time levels. This "hiccup" will not occur if the simulation is aborted + // immediately following a restart dump. Energies dumps are in a text + // format and the layout is documented at the top of the file. Only rank 0 + // makes makes an energies dump. + if( should_dump(energies) ) dump_energies( "energies", step()==0 ? 0 : 1 ); + + // Field dumps store the raw electromagnetic fields, sources and material + // placement and a number of auxilliary fields. E, B and RHOB are + // timecentered, JF and TCA are half a step old. Material fields are static + // and the remaining fields (DIV E ERR, DIV B ERR and RHOF) are for + // debugging purposes. By default, field dump filenames are tagged with + // step(). However, if a "0" is added to the call, the filename will not be + // tagged. The JF that gets stored is accumulated with a charge-conserving + // algorithm. As a result, JF is not valid until at least one timestep has + // been completed. Field dumps are in a binary format. Each rank makes a + // field dump. + if( step()==-10 ) dump_fields_openpmd("fields"); // Get first valid total J + if( should_dump(fields) ) dump_fields_openpmd("fields"); + + // Hydro dumps store particle charge density, current density and + // stress-energy tensor. All these quantities are known at the time + // t = time(). All these quantities are accumulated trilinear + // node-centered. By default, species dump filenames are tagged with + // step(). However, if a "0" is added to the call, the filename will not + // be tagged. Note that the current density accumulated by this routine is + // purely diagnostic. It is not used by the simulation and it is not + // accumulated using a self-consistent charge-conserving method. 
Hydro dumps + // are in a binary format. Each rank makes a hydro dump. + //if(should_dump(ehydro) ) dump_hydro_hdf5("electron","ehydro"); + //if( should_dump(ihydro) ) dump_hydro_hdf5("ion", "ihydro"); + + // Particle dumps store the particle data for a given species. The data + // written is known at the time t = time(). By default, particle dumps + // are tagged with step(). However, if a "0" is added to the call, the + // filename will not be tagged. Particle dumps are in a binary format. + // Each rank makes a particle dump. + //if( should_dump(eparticle) ) dump_particles_hdf5("electron","eparticle"); + //if( should_dump(iparticle) ) dump_particles_hdf5("ion", "iparticle"); + + // A checkpt is made by calling checkpt( fbase, tag ) where fname is a string + // and tag is an integer. A typical usage is: + // checkpt( "checkpt", step() ). + // This will cause each process to write their simulation state to a file + // whose name is based on fbase, tag and the node's rank. For the above + // usage, if called on step 314 on a 4 process run, the four files: + // checkpt.314.0, checkpt.314.1, checkpt.314.2, checkpt.314.3 + // to be written. The simulation can then be restarted from this point by + // invoking the application with "--restore checkpt.314". checkpt must be + // the _VERY_ LAST_ diagnostic called. If not, diagnostics performed after + // the checkpt but before the next timestep will be missed on restore. + // Restart dumps are in a binary format unique to the each simulation. + + if( should_dump(restart) ) checkpt( "checkpt", step() ); + + // If you want to write a checkpt after a certain amount of simulation time, + // use uptime() in conjunction with checkpt. For example, this will cause + // the simulation state to be written after 7.5 hours of running to the + // same file every time (useful for dealing with quotas on big machines). 
+ //if( uptime()>=27000 ) { + // checkpt( "timeout", 0 ); + // abort(0); + //} + +# undef should_dump + +} + +begin_particle_injection { + + // No particle injection for this simulation + +} + +begin_current_injection { + + // No current injection for this simulation + +} + +begin_field_injection { + + // No field injection for this simulation + +} + +begin_particle_collisions{ + + // No collisions for this simulation + +} diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index a63a13a2..86749b5f 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -21,6 +21,11 @@ #include "hdf5_header_info.h" // from vpic #endif +#define VPIC_ENABLE_OPENPMD 1 +#ifdef VPIC_ENABLE_OPENPMD +#include +#endif + /* -1 means no ranks talk */ #define VERBOSE_rank -1 @@ -266,6 +271,17 @@ vpic_simulation::dump_hydro( const char *sp_name, if( fileIO.close() ) ERROR(( "File close failed on dump hydro!!!" )); } +#ifdef VPIC_ENABLE_OPENPMD +void vpic_simulation::dump_fields_openpmd( const char *fbase, int ftag ) +{ + openPMD::Series series = openPMD::Series( + "../samples/5_parallel_write.h5", + openPMD::AccessType::CREATE, + MPI_COMM_WORLD + ); +} +#endif + #ifdef VPIC_ENABLE_HDF5 #define DUMP_DIR_FORMAT "./%s" diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index ce1a2454..ead1b631 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -379,6 +379,9 @@ class vpic_simulation { void dump_particles( const char *sp_name, const char *fbase, int fname_tag = 1 ); + +//#ifdef // TODO: add ifdef + void dump_fields_openpmd( const char *fbase, int fname_tag = 1 ); #ifdef VPIC_ENABLE_HDF5 void dump_particles_hdf5( const char *sp_name, const char *fbase, int fname_tag = 1 ); From 9dc9c401cb4a421e99263801599affb0b3ba4104 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Wed, 30 Oct 2019 17:57:28 -0600 Subject: [PATCH 66/95] First pass where data dumps cbx --- CMakeLists.txt | 34 ++++++++-- sample/harrisOpenPMD | 5 +- sample/read_openpmd.py | 39 +++++++++++ src/grid/grid.h | 14 ++++ src/vpic/dump.cc | 147 
+++++++++++++++++++++++++++-------------- 5 files changed, 181 insertions(+), 58 deletions(-) create mode 100644 sample/read_openpmd.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 57f1ed13..2c912f52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -343,17 +343,39 @@ target_include_directories(vpic INTERFACE ${CMAKE_SOURCE_DIR}/src) target_link_libraries(vpic ${VPIC_EXPOSE} ${MPI_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ${CMAKE_DL_LIBS} ${HDF5_C_LIBRARIES}) target_compile_options(vpic ${VPIC_EXPOSE} ${MPI_C_COMPILE_FLAGS}) +# get absolute paths to linked libraries, and their transitive dependencies +function(openpmdreclibs tgtname outname) + get_target_property(PC_PRIVATE_LIBS_TGT ${tgtname} INTERFACE_LINK_LIBRARIES) + foreach(PC_LIB IN LISTS PC_PRIVATE_LIBS_TGT) + if(TARGET ${PC_LIB}) + openpmdreclibs(${PC_LIB} ${outname}) + else() + if(PC_LIB) + string(APPEND ${outname} " ${PC_LIB}") + endif() + endif() + endforeach() + set(${outname} ${${outname}} PARENT_SCOPE) +endfunction() + if(USE_OPENPMD) - # Enable OpenPMD, and the relevant defines - message("Using OpenPMD") - find_package(openPMD REQUIRED) + # Enable openPMD, and the relevant defines + find_package(openPMD REQUIRED CONFIG COMPONENTS MPI) + target_link_libraries(vpic PRIVATE openPMD::openPMD) + target_compile_definitions(vpic PRIVATE "-DVPIC_ENABLE_OPENPMD") + add_definitions(-DVPIC_ENABLE_OPENPMD) - link_libraries(openPMD::openPMD) + + # legacy stuff for 2-phase compile get_target_property(openPMD_LIBRARIES openPMD::openPMD LOCATION) - string(REPLACE ";" " " string_libraries ${openPMD_LIBRARIES}) + string(REPLACE ";" " " string_libraries "${openPMD_LIBRARIES}") set(VPIC_CXX_LIBRARIES "${VPIC_CXX_LIBRARIES} ${string_libraries}") + get_target_property(openPMD_TYPE openPMD::openPMD TYPE) + if("${openPMD_TYPE}" STREQUAL "STATIC_LIBRARY") + openpmdreclibs(openPMD openPMD_TRANSITIVE_LIBS) + set(VPIC_CXX_LIBRARIES "${VPIC_CXX_LIBRARIES} ${openPMD_TRANSITIVE_LIBS}") + endif() endif(USE_OPENPMD) 
-message(${VPIC_CXX_LIBRARIES}) macro(build_a_vpic name deck) if(NOT EXISTS ${deck}) diff --git a/sample/harrisOpenPMD b/sample/harrisOpenPMD index f8c932e9..a93965f8 100644 --- a/sample/harrisOpenPMD +++ b/sample/harrisOpenPMD @@ -367,8 +367,9 @@ begin_diagnostics { // algorithm. As a result, JF is not valid until at least one timestep has // been completed. Field dumps are in a binary format. Each rank makes a // field dump. - if( step()==-10 ) dump_fields_openpmd("fields"); // Get first valid total J - if( should_dump(fields) ) dump_fields_openpmd("fields"); + std::string openpm_field_name = "fields.h5"; + if( step()==-10 ) dump_fields_openpmd(openpm_field_name.c_str()); // Get first valid total J + if( should_dump(fields) ) dump_fields_openpmd(openpm_field_name.c_str()); // Hydro dumps store particle charge density, current density and // stress-energy tensor. All these quantities are known at the time diff --git a/sample/read_openpmd.py b/sample/read_openpmd.py new file mode 100644 index 00000000..84b19009 --- /dev/null +++ b/sample/read_openpmd.py @@ -0,0 +1,39 @@ + +import openpmd_api as api + +# example: data handling +import numpy as np + +file_name = "./fields.h5" +series = api.Series( file_name, api.Access_Type.read_only) + +print(list(series.iterations)) + +from pprint import pprint +#pprint(vars(series)) +#pprint(vars(series.iterations)) + +i = series.iterations[1]; + +print("openPMD version: ", + series.openPMD) + +# record +cB = i.meshes["B"] + +# record components +cbx = cB["x"] + +x_data = cbx.load_chunk() + +series.flush() + +extent = cbx.shape + +print( + "First values in E_x " + "of shape: ", + extent) + + +print(x_data) diff --git a/src/grid/grid.h b/src/grid/grid.h index 3167c7e6..b95a034e 100644 --- a/src/grid/grid.h +++ b/src/grid/grid.h @@ -138,6 +138,20 @@ typedef struct grid { #define VOXEL(x,y,z, nx,ny,nz) ((x) + ((nx)+2)*((y) + ((ny)+2)*(z))) +// TODO: make the asymmetry in how nx+2 is handled more obvious +#define UNVOXEL(rank, ix, iy, 
iz, nx, ny, nz) BEGIN_PRIMITIVE { \ + int _ix, _iy, _iz; \ + _ix = (rank); /* ix = ix+gpx*( iy+gpy*iz ) */ \ + _iy = _ix/int(nx); /* iy = iy+gpy*iz */ \ + _ix -= _iy*int(nx); /* ix = ix */ \ + _iz = _iy/int(ny); /* iz = iz */ \ + _iy -= _iz*int(ny); /* iy = iy */ \ + (ix) = _ix; \ + (iy) = _iy; \ + (iz) = _iz; \ + } END_PRIMITIVE + + // Advance the voxel mesh index (v) and corresponding voxel mesh // coordinates (x,y,z) in a region with min- and max-corners of // (xl,yl,zl) and (xh,yh,zh) of a (nx,ny,nz) resolution voxel mesh in diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 86749b5f..8fa2a4de 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -271,36 +271,103 @@ vpic_simulation::dump_hydro( const char *sp_name, if( fileIO.close() ) ERROR(( "File close failed on dump hydro!!!" )); } +// TODO: remove this hack +static openPMD::Series* series; + #ifdef VPIC_ENABLE_OPENPMD -void vpic_simulation::dump_fields_openpmd( const char *fbase, int ftag ) +void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) { - openPMD::Series series = openPMD::Series( - "../samples/5_parallel_write.h5", - openPMD::AccessType::CREATE, - MPI_COMM_WORLD - ); + + // TODO: recreating the series every time is probably not what we want? + std::cout << "Writing openPMD data" << std::endl; + + if (series == nullptr) { + std::cout << "init series" << std::endl; + series = new openPMD::Series( + fbase, + //"test_parallel_write.h5", + //"test_parallel_write.bp", + openPMD::AccessType::CREATE, + MPI_COMM_WORLD + ); + } + + std::cout << "Writing itration " << step() << std::endl; + auto i = series->iterations[ step() + 0 ]; + // TODO: it would be nice to set these... 
+ //series.setAuthor( "Axel Huebl "); + //series.setMachine( "Hall Probe 5000, Model 3"); + i.setAttribute( "vacuum", true); + + auto cB = i.meshes["B"]; + + // record components + auto cbx = cB["x"]; + //auto B_y = B["y"]; + //auto B_z = B["z"]; + + // TODO: set unitDimension so the anaylsis software knows what fields + // things are + + //auto dataset = api::Dataset( api::determineDatatype(), {150, 300}); + size_t gnx = (grid->nx * grid->gpx); + size_t gny = (grid->ny * grid->gpy); + size_t gnz = (grid->nz * grid->gpz); + openPMD::Extent global_extent = {gny, gny, gnz}; + + openPMD::Datatype datatype = openPMD::determineDatatype(); + openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); + + cbx.resetDataset(dataset); + //B_y.resetDataset(dataset); + //B_z.resetDataset(dataset); + + // Convert rank to local x/y/z + int rx, ry, rz; + UNVOXEL(rank(), rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + size_t nx = grid->nx; + size_t ny = grid->ny; + size_t nz = grid->nz; + + // NOTE: this assumes a static mesh decomposition in nx/ny/nz + size_t global_offset_x = (nx) * rx; + size_t global_offset_y = (ny) * ry; + size_t global_offset_z = (nz) * rz; + + openPMD::Offset chunk_offset = {global_offset_x, global_offset_y, global_offset_z}; + openPMD::Extent chunk_extent = {nx, ny, nz}; + + // Store a local copy of the data which we pull out of the AoS + std::vector cbx_data; + cbx_data.reserve(nx * ny * nz); + + // We could do 1D here, but we don't really care about the ghosts, and we + // can thread over nz/ny (collapsed?) 
+ // Go over non-ghosts and grab just that data into a dense array + for (size_t k = 1; k < grid->nz + 1; k++) + { + for (size_t j = 1; j < grid->ny + 1; j++) + { + for (size_t i = 1; i < grid->nx + 1; i++) + { + int local_index = VOXEL(i-1, j-1, k-1, grid->nx-2, grid->ny-2, grid->nz-2); + int global_index = VOXEL(i, j, k, grid->nx, grid->ny, grid->nz); + cbx_data[local_index] = field_array->f[global_index].cbx; + } + } + } + + cbx.storeChunk( cbx_data, chunk_offset, chunk_extent); + //B_y.storeChunk( y_data, chunk_offset, chunk_extent); + //B_z.storeChunk( z_data, chunk_offset, chunk_extent); + series->flush(); } #endif #ifdef VPIC_ENABLE_HDF5 #define DUMP_DIR_FORMAT "./%s" -// TODO: rename or remove this -#define RANK_TO_INDEX2(rank, ix, iy, iz) \ - BEGIN_PRIMITIVE \ - { \ - int _ix, _iy, _iz; \ - _ix = (rank); /* ix = ix+gpx*( iy+gpy*iz ) */ \ - _iy = _ix / grid->gpx; /* iy = iy+gpy*iz */ \ - _ix -= _iy * grid->gpx; /* ix = ix */ \ - _iz = _iy / grid->gpy; /* iz = iz */ \ - _iy -= _iz * grid->gpy; /* iy = iy */ \ - (ix) = _ix; \ - (iy) = _iy; \ - (iz) = _iz; \ - } \ - END_PRIMITIVE - /* define to do C-style indexing */ #define hydro(x, y, z) hydro_array->h[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] @@ -453,22 +520,13 @@ printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ int gpy = grid->gpy; int gpz = grid->gpz; - int mpi_rank_x, mpi_rank_y, mpi_rank_z; - //RANK_TO_INDEX2(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); - - int _ix, _iy, _iz; - _ix = (mpi_rank); - _iy = _ix / grid->gpx; - _ix -= _iy * grid->gpx; - _iz = _iy / grid->gpy; - _iy -= _iz * grid->gpy; - int ix = _ix; - int iy = _iy; - int iz = _iz; - - mpi_rank_x = ix; - mpi_rank_y = iy; - mpi_rank_z = iz; + // Convert rank to local decomposition + int rx, ry, rz; + UNVOXEL(mpi_rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + mpi_rank_x = rx; + mpi_rank_y = ry; + mpi_rank_z = rz; global_offset[0] = (grid->nx) * mpi_rank_x; global_offset[1] = (grid->ny) * mpi_rank_y; @@ -1028,18 +1086,6 @@ 
vpic_simulation::dump_particles_hdf5( const char *sp_name, #define OUTPUT_CONVERT_GLOBAL_ID 1 #ifdef OUTPUT_CONVERT_GLOBAL_ID -# define UNVOXEL(rank, ix, iy, iz, nx, ny, nz) BEGIN_PRIMITIVE { \ - int _ix, _iy, _iz; \ - _ix = (rank); /* ix = ix+gpx*( iy+gpy*iz ) */ \ - _iy = _ix/int(nx); /* iy = iy+gpy*iz */ \ - _ix -= _iy*int(nx); /* ix = ix */ \ - _iz = _iy/int(ny); /* iz = iz */ \ - _iy -= _iz*int(ny); /* iy = iy */ \ - (ix) = _ix; \ - (iy) = _iy; \ - (iz) = _iz; \ - } END_PRIMITIVE - std::vector global_pi; global_pi.reserve(numparticles); // TODO: this could be parallel @@ -1048,6 +1094,7 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, int local_i = sp->p[i].i; int ix, iy, iz, rx, ry, rz; + // Convert rank to local x/y/z UNVOXEL(rank(), rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); From 0662a19ed71ef909516414c4624af13d572b44d4 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Mon, 4 Nov 2019 11:09:18 -0700 Subject: [PATCH 67/95] data verified using python --- sample/harrisOpenPMD | 2 ++ src/vpic/dump.cc | 19 +++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/sample/harrisOpenPMD b/sample/harrisOpenPMD index a93965f8..b60b6fdd 100644 --- a/sample/harrisOpenPMD +++ b/sample/harrisOpenPMD @@ -367,7 +367,9 @@ begin_diagnostics { // algorithm. As a result, JF is not valid until at least one timestep has // been completed. Field dumps are in a binary format. Each rank makes a // field dump. 
+ std::string openpm_field_name = "fields.h5"; + //std::string openpm_field_name = "fields.bp"; if( step()==-10 ) dump_fields_openpmd(openpm_field_name.c_str()); // Get first valid total J if( should_dump(fields) ) dump_fields_openpmd(openpm_field_name.c_str()); diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 8fa2a4de..6671496e 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -303,8 +303,8 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) // record components auto cbx = cB["x"]; - //auto B_y = B["y"]; - //auto B_z = B["z"]; + auto cby = cB["y"]; + auto cbz = cB["z"]; // TODO: set unitDimension so the anaylsis software knows what fields // things are @@ -319,6 +319,8 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); cbx.resetDataset(dataset); + cby.resetDataset(dataset); + cbz.resetDataset(dataset); //B_y.resetDataset(dataset); //B_z.resetDataset(dataset); @@ -340,7 +342,12 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) // Store a local copy of the data which we pull out of the AoS std::vector cbx_data; + std::vector cby_data; + std::vector cbz_data; + cbx_data.reserve(nx * ny * nz); + cby_data.reserve(nx * ny * nz); + cbz_data.reserve(nx * ny * nz); // We could do 1D here, but we don't really care about the ghosts, and we // can thread over nz/ny (collapsed?) 
@@ -353,14 +360,18 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) { int local_index = VOXEL(i-1, j-1, k-1, grid->nx-2, grid->ny-2, grid->nz-2); int global_index = VOXEL(i, j, k, grid->nx, grid->ny, grid->nz); + cbx_data[local_index] = field_array->f[global_index].cbx; + cby_data[local_index] = field_array->f[global_index].cby; + cbz_data[local_index] = field_array->f[global_index].cbz; } } } cbx.storeChunk( cbx_data, chunk_offset, chunk_extent); - //B_y.storeChunk( y_data, chunk_offset, chunk_extent); - //B_z.storeChunk( z_data, chunk_offset, chunk_extent); + cby.storeChunk( cby_data, chunk_offset, chunk_extent); + cbz.storeChunk( cbz_data, chunk_offset, chunk_extent); + series->flush(); } #endif From 6b8416372db93081ceae03ac03ddf8b7b49a44ac Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Mon, 4 Nov 2019 11:27:38 -0700 Subject: [PATCH 68/95] quick tidy up and add j and e --- src/vpic/dump.cc | 78 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 62 insertions(+), 16 deletions(-) diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 6671496e..fbbd31d3 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -271,25 +271,22 @@ vpic_simulation::dump_hydro( const char *sp_name, if( fileIO.close() ) ERROR(( "File close failed on dump hydro!!!" )); } -// TODO: remove this hack +#ifdef VPIC_ENABLE_OPENPMD + +// TODO: remove this hack, and actually store the state properly static openPMD::Series* series; -#ifdef VPIC_ENABLE_OPENPMD void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) { - - // TODO: recreating the series every time is probably not what we want? 
std::cout << "Writing openPMD data" << std::endl; if (series == nullptr) { std::cout << "init series" << std::endl; series = new openPMD::Series( - fbase, - //"test_parallel_write.h5", - //"test_parallel_write.bp", - openPMD::AccessType::CREATE, - MPI_COMM_WORLD - ); + fbase, + openPMD::AccessType::CREATE, + MPI_COMM_WORLD + ); } std::cout << "Writing itration " << step() << std::endl; @@ -300,16 +297,25 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) i.setAttribute( "vacuum", true); auto cB = i.meshes["B"]; + auto E = i.meshes["E"]; + auto J = i.meshes["J"]; // record components auto cbx = cB["x"]; auto cby = cB["y"]; auto cbz = cB["z"]; + auto Ex = E["x"]; + auto Ey = E["y"]; + auto Ez = E["z"]; + + auto Jx = J["x"]; + auto Jy = J["y"]; + auto Jz = J["z"]; + // TODO: set unitDimension so the anaylsis software knows what fields // things are - //auto dataset = api::Dataset( api::determineDatatype(), {150, 300}); size_t gnx = (grid->nx * grid->gpx); size_t gny = (grid->ny * grid->gpy); size_t gnz = (grid->nz * grid->gpz); @@ -321,8 +327,14 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) cbx.resetDataset(dataset); cby.resetDataset(dataset); cbz.resetDataset(dataset); - //B_y.resetDataset(dataset); - //B_z.resetDataset(dataset); + + Ex.resetDataset(dataset); + Ey.resetDataset(dataset); + Ez.resetDataset(dataset); + + Jx.resetDataset(dataset); + Jy.resetDataset(dataset); + Jz.resetDataset(dataset); // Convert rank to local x/y/z int rx, ry, rz; @@ -345,9 +357,27 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) std::vector cby_data; std::vector cbz_data; - cbx_data.reserve(nx * ny * nz); - cby_data.reserve(nx * ny * nz); - cbz_data.reserve(nx * ny * nz); + std::vector ex_data; + std::vector ey_data; + std::vector ez_data; + + std::vector jx_data; + std::vector jy_data; + std::vector jz_data; + + size_t nv = nx * ny * nz; + + cbx_data.reserve(nv); + cby_data.reserve(nv); + cbz_data.reserve(nv); 
+ + ex_data.reserve(nv); + ey_data.reserve(nv); + ez_data.reserve(nv); + + jx_data.reserve(nv); + jy_data.reserve(nv); + jz_data.reserve(nv); // We could do 1D here, but we don't really care about the ghosts, and we // can thread over nz/ny (collapsed?) @@ -364,6 +394,14 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) cbx_data[local_index] = field_array->f[global_index].cbx; cby_data[local_index] = field_array->f[global_index].cby; cbz_data[local_index] = field_array->f[global_index].cbz; + + ex_data[local_index] = field_array->f[global_index].ex; + ey_data[local_index] = field_array->f[global_index].ey; + ez_data[local_index] = field_array->f[global_index].ez; + + jx_data[local_index] = field_array->f[global_index].jfx; + jy_data[local_index] = field_array->f[global_index].jfy; + jz_data[local_index] = field_array->f[global_index].jfz; } } } @@ -372,6 +410,14 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) cby.storeChunk( cby_data, chunk_offset, chunk_extent); cbz.storeChunk( cbz_data, chunk_offset, chunk_extent); + Ex.storeChunk( ex_data, chunk_offset, chunk_extent); + Ey.storeChunk( ey_data, chunk_offset, chunk_extent); + Ez.storeChunk( ez_data, chunk_offset, chunk_extent); + + Jx.storeChunk( jx_data, chunk_offset, chunk_extent); + Jy.storeChunk( jy_data, chunk_offset, chunk_extent); + Jz.storeChunk( jz_data, chunk_offset, chunk_extent); + series->flush(); } #endif From 572ee50f5670540c3075c8fe7d731627bbddd510 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 5 Nov 2019 11:01:52 -0700 Subject: [PATCH 69/95] add particle dumping via openpmd (close but not passing openpmd file validator becasue of missing timeOffset) --- sample/harrisOpenPMD | 2 +- src/vpic/dump.cc | 119 ++++++++++++++++++++++++++++++++++++++++++- src/vpic/vpic.h | 9 +++- 3 files changed, 126 insertions(+), 4 deletions(-) diff --git a/sample/harrisOpenPMD b/sample/harrisOpenPMD index b60b6fdd..16abb432 100644 --- a/sample/harrisOpenPMD +++ 
b/sample/harrisOpenPMD @@ -390,7 +390,7 @@ begin_diagnostics { // are tagged with step(). However, if a "0" is added to the call, the // filename will not be tagged. Particle dumps are in a binary format. // Each rank makes a particle dump. - //if( should_dump(eparticle) ) dump_particles_hdf5("electron","eparticle"); + if( should_dump(eparticle) ) dump_particles_openpmd("electron","eparticle"); //if( should_dump(iparticle) ) dump_particles_hdf5("ion", "iparticle"); // A checkpt is made by calling checkpt( fbase, tag ) where fname is a string diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index fbbd31d3..45ebf853 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -21,7 +21,6 @@ #include "hdf5_header_info.h" // from vpic #endif -#define VPIC_ENABLE_OPENPMD 1 #ifdef VPIC_ENABLE_OPENPMD #include #endif @@ -45,6 +44,39 @@ int vpic_simulation::dump_cwd(char * dname, size_t size) { return FileUtils::getCurrentWorkingDirectory(dname, size); } // dump_mkdir + +// TODO: move this somewhere more sensible +std::array global_particle_index(int local_i, grid_t* grid, int rank) +{ + int ix, iy, iz, rx, ry, rz; + // Convert rank to local x/y/z + UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + // Calculate local ix/iy/iz + UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); + + // Account for the "first" ghost cell + ix = ix - 1; + iy = iy - 1; + iz = iz - 1; + + // Convert ix/iy/iz to global + int gix = ix + (grid->nx * (rx)); + int giy = iy + (grid->ny * (ry)); + int giz = iz + (grid->nz * (rz)); + + // calculate global grid sizes + int gnx = grid->nx * grid->gpx; + int gny = grid->ny * grid->gpy; + int gnz = grid->nz * grid->gpz; + + // TODO: find a better way to account for the hard coded ghosts in VOXEL + int global_i = VOXEL(gix, giy, giz, gnx-2, gny-2, gnz-2); + + return { global_i, gix, giy, giz }; +} + + /***************************************************************************** * ASCII dump IO 
*****************************************************************************/ @@ -276,6 +308,86 @@ vpic_simulation::dump_hydro( const char *sp_name, // TODO: remove this hack, and actually store the state properly static openPMD::Series* series; +void +vpic_simulation::dump_particles_openpmd( const char *sp_name, + const char *fbase, + int ftag ) +{ + + species_t *sp = find_species_name( sp_name, species_list ); + + if (series == nullptr) { + std::cout << "init series" << std::endl; + series = new openPMD::Series( + fbase, + openPMD::AccessType::CREATE, + MPI_COMM_WORLD + ); + } + + auto i = series->iterations[ step() ]; + + // TODO: set these + i.setTime( (float)step() ); + i.setDt(1.0); + i.setTimeUnitSI(1.0); + + auto& p = i.particles[sp_name]; + //openPMD::ParticleSpecies& p = i.particles[sp_name]; + + const int np = sp->np; + + // TODO: this could be a function call as it's used elsewhere (in hdf5) + unsigned long long total_particles, offset; + unsigned long long numparticles = np; + MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + offset -= numparticles; + + openPMD::Extent global_extent = {total_particles}; + openPMD::Datatype datatype = openPMD::determineDatatype(); + openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); + + auto px = p["position"]["x"]; + auto pxo = p["positionOffset"]["x"]; + + px.resetDataset(dataset); + pxo.resetDataset(dataset); + + // convert data to SoA, allowing the user to chunk the operation + const int max_chunk = 32768*8; // 1MB SoA + // Loop over all particles in chunks + for (int i = 0; i < np; i += max_chunk) + { + // We have to be careful as the last chunk may not be full + // Find how many are left and do that many + size_t to_write = std::min(np-i, max_chunk); + + // Convert the chunk ready to write + std::vector x_pos; + std::vector x_off; + x_pos.reserve(to_write); + 
x_off.reserve(to_write); + + for (int j = 0; j < to_write; j++) + { + // TODO: do I need to center the particles? + auto& particle = sp->p[i+j]; + x_pos[j] = particle.dx; + std::array gi = global_particle_index(particle.i, grid, rank()); + x_off[j] = (float)gi[1]; + } + + // Base offset plus i to account for chunks + auto o = openPMD::Offset{offset + i}; + auto e = openPMD::Extent{to_write}; + px.storeChunk(x_pos, o, e); + pxo.storeChunk(x_off, o, e); + } + + +} + void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) { std::cout << "Writing openPMD data" << std::endl; @@ -290,7 +402,7 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) } std::cout << "Writing itration " << step() << std::endl; - auto i = series->iterations[ step() + 0 ]; + auto i = series->iterations[ step() ]; // TODO: it would be nice to set these... //series.setAuthor( "Axel Huebl "); //series.setMachine( "Hall Probe 5000, Model 3"); @@ -379,6 +491,8 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) jy_data.reserve(nv); jz_data.reserve(nv); + // TODO: make this AoS to SoA conversion a function + // We could do 1D here, but we don't really care about the ghosts, and we // can thread over nz/ny (collapsed?) 
// Go over non-ghosts and grab just that data into a dense array @@ -1143,6 +1257,7 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, #define OUTPUT_CONVERT_GLOBAL_ID 1 #ifdef OUTPUT_CONVERT_GLOBAL_ID + // TODO: make a function out of this too, its used in openpmd std::vector global_pi; global_pi.reserve(numparticles); // TODO: this could be parallel diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index ead1b631..6b657a16 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -380,8 +380,15 @@ class vpic_simulation { void dump_particles( const char *sp_name, const char *fbase, int fname_tag = 1 ); -//#ifdef // TODO: add ifdef +#ifdef VPIC_ENABLE_OPENPMD void dump_fields_openpmd( const char *fbase, int fname_tag = 1 ); + void dump_particles_openpmd( + const char *sp_name, + const char *fbase, + int ftag = 1 + ); +#endif + #ifdef VPIC_ENABLE_HDF5 void dump_particles_hdf5( const char *sp_name, const char *fbase, int fname_tag = 1 ); From f197e3ea35694f22e389cefdee1bb80d36e60f32 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 5 Nov 2019 19:01:22 -0700 Subject: [PATCH 70/95] start refactoring dump into seperate class to better support user selection --- src/vpic/dump.cc | 1393 ++------------------------------------ src/vpic/dump.h | 46 ++ src/vpic/dump_strategy.h | 1365 +++++++++++++++++++++++++++++++++++++ src/vpic/dumpmacros.h | 197 +++--- src/vpic/vpic.cc | 8 +- src/vpic/vpic.h | 25 +- 6 files changed, 1570 insertions(+), 1464 deletions(-) create mode 100644 src/vpic/dump.h create mode 100644 src/vpic/dump_strategy.h diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 45ebf853..35d7441d 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -12,19 +12,10 @@ #include #include -#include "vpic.h" #include "dumpmacros.h" +#include "vpic.h" #include "../util/io/FileUtils.h" -#ifdef VPIC_ENABLE_HDF5 -#include "hdf5.h" // from the lib -#include "hdf5_header_info.h" // from vpic -#endif - -#ifdef VPIC_ENABLE_OPENPMD -#include -#endif - /* -1 means no 
ranks talk */ #define VERBOSE_rank -1 @@ -44,42 +35,32 @@ int vpic_simulation::dump_cwd(char * dname, size_t size) { return FileUtils::getCurrentWorkingDirectory(dname, size); } // dump_mkdir +/***************************************************************************** + * ASCII dump IO + *****************************************************************************/ -// TODO: move this somewhere more sensible -std::array global_particle_index(int local_i, grid_t* grid, int rank) +void vpic_simulation::dump_particles( const char *sp_name, + const char *fbase, + int ftag ) { - int ix, iy, iz, rx, ry, rz; - // Convert rank to local x/y/z - UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); - - // Calculate local ix/iy/iz - UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); - - // Account for the "first" ghost cell - ix = ix - 1; - iy = iy - 1; - iz = iz - 1; - - // Convert ix/iy/iz to global - int gix = ix + (grid->nx * (rx)); - int giy = iy + (grid->ny * (ry)); - int giz = iz + (grid->nz * (rz)); - - // calculate global grid sizes - int gnx = grid->nx * grid->gpx; - int gny = grid->ny * grid->gpy; - int gnz = grid->nz * grid->gpz; - - // TODO: find a better way to account for the hard coded ghosts in VOXEL - int global_i = VOXEL(gix, giy, giz, gnx-2, gny-2, gnz-2); - - return { global_i, gix, giy, giz }; + species_t * sp = find_species_name(sp_name, species_list); + dump_strategy.dump_particles( + fbase, + sp, + grid, + step(), + interpolator_array, + ftag + ); } +void dump_fields( const char *fbase, int fname_tag = 1 ) +{ +} -/***************************************************************************** - * ASCII dump IO - *****************************************************************************/ +void dump_hydro( const char *sp_name, const char *fbase, int fname_tag = 1 ) +{ +} void vpic_simulation::dump_energies( const char *fname, @@ -162,26 +143,6 @@ vpic_simulation::dump_materials( const char *fname ) { * Binary dump IO 
*****************************************************************************/ -/* -enum dump_types { - grid_dump = 0, - field_dump = 1, - hydro_dump = 2, - particle_dump = 3, - restart_dump = 4 -}; -*/ - -// TODO: should this be an enum? -namespace dump_type { - const int grid_dump = 0; - const int field_dump = 1; - const int hydro_dump = 2; - const int particle_dump = 3; - const int restart_dump = 4; - const int history_dump = 5; -} // namespace - void vpic_simulation::dump_grid( const char *fbase ) { char fname[256]; @@ -196,14 +157,14 @@ vpic_simulation::dump_grid( const char *fbase ) { if( status==fail ) ERROR(( "Could not open \"%s\".", fname )); /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ - nxout = grid->nx; - nyout = grid->ny; - nzout = grid->nz; - dxout = grid->dx; - dyout = grid->dy; - dzout = grid->dz; + size_t nxout = grid->nx; + size_t nyout = grid->ny; + size_t nzout = grid->nz; + float dxout = grid->dx; + float dyout = grid->dy; + float dzout = grid->dz; - WRITE_HEADER_V0( dump_type::grid_dump, -1, 0, fileIO ); + WRITE_HEADER_V0( dump_type::grid_dump, -1, 0, fileIO, step(), rank(), nproc()); dim[0] = 3; dim[1] = 3; @@ -225,1268 +186,6 @@ vpic_simulation::dump_grid( const char *fbase ) { if( fileIO.close() ) ERROR(( "File close failed on dump grid!!!" 
)); } -void -vpic_simulation::dump_fields( const char *fbase, int ftag ) { - char fname[256]; - FileIO fileIO; - int dim[3]; - - if( !fbase ) ERROR(( "Invalid filename" )); - - if( rank()==0 ) MESSAGE(( "Dumping fields to \"%s\"", fbase )); - - if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step(), rank() ); - else sprintf( fname, "%s.%i", fbase, rank() ); - - FileIOStatus status = fileIO.open(fname, io_write); - if( status==fail ) ERROR(( "Could not open \"%s\".", fname )); - - /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ - nxout = grid->nx; - nyout = grid->ny; - nzout = grid->nz; - dxout = grid->dx; - dyout = grid->dy; - dzout = grid->dz; - - WRITE_HEADER_V0( dump_type::field_dump, -1, 0, fileIO ); - - dim[0] = grid->nx+2; - dim[1] = grid->ny+2; - dim[2] = grid->nz+2; - WRITE_ARRAY_HEADER( field_array->f, 3, dim, fileIO ); - fileIO.write( field_array->f, dim[0]*dim[1]*dim[2] ); - if( fileIO.close() ) ERROR(( "File close failed on dump fields!!!" )); -} - -void -vpic_simulation::dump_hydro( const char *sp_name, - const char *fbase, - int ftag ) { - species_t *sp; - char fname[256]; - FileIO fileIO; - int dim[3]; - - sp = find_species_name( sp_name, species_list ); - if( !sp ) ERROR(( "Invalid species \"%s\"", sp_name )); - - clear_hydro_array( hydro_array ); - accumulate_hydro_p( hydro_array, sp, interpolator_array ); - synchronize_hydro_array( hydro_array ); - - if( !fbase ) ERROR(( "Invalid filename" )); - - if( rank()==0 ) - MESSAGE(("Dumping \"%s\" hydro fields to \"%s\"",sp->name,fbase)); - - if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step(), rank() ); - else sprintf( fname, "%s.%i", fbase, rank() ); - FileIOStatus status = fileIO.open(fname, io_write); - if( status==fail) ERROR(( "Could not open \"%s\".", fname )); - - /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ - nxout = grid->nx; - nyout = grid->ny; - nzout = grid->nz; - dxout = grid->dx; - dyout = grid->dy; - dzout = grid->dz; - - WRITE_HEADER_V0( 
dump_type::hydro_dump,sp->id,sp->q/sp->m,fileIO); - - dim[0] = grid->nx+2; - dim[1] = grid->ny+2; - dim[2] = grid->nz+2; - WRITE_ARRAY_HEADER( hydro_array->h, 3, dim, fileIO ); - fileIO.write( hydro_array->h, dim[0]*dim[1]*dim[2] ); - if( fileIO.close() ) ERROR(( "File close failed on dump hydro!!!" )); -} - -#ifdef VPIC_ENABLE_OPENPMD - -// TODO: remove this hack, and actually store the state properly -static openPMD::Series* series; - -void -vpic_simulation::dump_particles_openpmd( const char *sp_name, - const char *fbase, - int ftag ) -{ - - species_t *sp = find_species_name( sp_name, species_list ); - - if (series == nullptr) { - std::cout << "init series" << std::endl; - series = new openPMD::Series( - fbase, - openPMD::AccessType::CREATE, - MPI_COMM_WORLD - ); - } - - auto i = series->iterations[ step() ]; - - // TODO: set these - i.setTime( (float)step() ); - i.setDt(1.0); - i.setTimeUnitSI(1.0); - - auto& p = i.particles[sp_name]; - //openPMD::ParticleSpecies& p = i.particles[sp_name]; - - const int np = sp->np; - - // TODO: this could be a function call as it's used elsewhere (in hdf5) - unsigned long long total_particles, offset; - unsigned long long numparticles = np; - MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - offset -= numparticles; - - openPMD::Extent global_extent = {total_particles}; - openPMD::Datatype datatype = openPMD::determineDatatype(); - openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); - - auto px = p["position"]["x"]; - auto pxo = p["positionOffset"]["x"]; - - px.resetDataset(dataset); - pxo.resetDataset(dataset); - - // convert data to SoA, allowing the user to chunk the operation - const int max_chunk = 32768*8; // 1MB SoA - // Loop over all particles in chunks - for (int i = 0; i < np; i += max_chunk) - { - // We have to be careful as the last chunk may not be full - // Find how many are 
left and do that many - size_t to_write = std::min(np-i, max_chunk); - - // Convert the chunk ready to write - std::vector x_pos; - std::vector x_off; - x_pos.reserve(to_write); - x_off.reserve(to_write); - - for (int j = 0; j < to_write; j++) - { - // TODO: do I need to center the particles? - auto& particle = sp->p[i+j]; - x_pos[j] = particle.dx; - std::array gi = global_particle_index(particle.i, grid, rank()); - x_off[j] = (float)gi[1]; - } - - // Base offset plus i to account for chunks - auto o = openPMD::Offset{offset + i}; - auto e = openPMD::Extent{to_write}; - px.storeChunk(x_pos, o, e); - pxo.storeChunk(x_off, o, e); - } - - -} - -void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) -{ - std::cout << "Writing openPMD data" << std::endl; - - if (series == nullptr) { - std::cout << "init series" << std::endl; - series = new openPMD::Series( - fbase, - openPMD::AccessType::CREATE, - MPI_COMM_WORLD - ); - } - - std::cout << "Writing itration " << step() << std::endl; - auto i = series->iterations[ step() ]; - // TODO: it would be nice to set these... 
- //series.setAuthor( "Axel Huebl "); - //series.setMachine( "Hall Probe 5000, Model 3"); - i.setAttribute( "vacuum", true); - - auto cB = i.meshes["B"]; - auto E = i.meshes["E"]; - auto J = i.meshes["J"]; - - // record components - auto cbx = cB["x"]; - auto cby = cB["y"]; - auto cbz = cB["z"]; - - auto Ex = E["x"]; - auto Ey = E["y"]; - auto Ez = E["z"]; - - auto Jx = J["x"]; - auto Jy = J["y"]; - auto Jz = J["z"]; - - // TODO: set unitDimension so the anaylsis software knows what fields - // things are - - size_t gnx = (grid->nx * grid->gpx); - size_t gny = (grid->ny * grid->gpy); - size_t gnz = (grid->nz * grid->gpz); - openPMD::Extent global_extent = {gny, gny, gnz}; - - openPMD::Datatype datatype = openPMD::determineDatatype(); - openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); - - cbx.resetDataset(dataset); - cby.resetDataset(dataset); - cbz.resetDataset(dataset); - - Ex.resetDataset(dataset); - Ey.resetDataset(dataset); - Ez.resetDataset(dataset); - - Jx.resetDataset(dataset); - Jy.resetDataset(dataset); - Jz.resetDataset(dataset); - - // Convert rank to local x/y/z - int rx, ry, rz; - UNVOXEL(rank(), rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); - - size_t nx = grid->nx; - size_t ny = grid->ny; - size_t nz = grid->nz; - - // NOTE: this assumes a static mesh decomposition in nx/ny/nz - size_t global_offset_x = (nx) * rx; - size_t global_offset_y = (ny) * ry; - size_t global_offset_z = (nz) * rz; - - openPMD::Offset chunk_offset = {global_offset_x, global_offset_y, global_offset_z}; - openPMD::Extent chunk_extent = {nx, ny, nz}; - - // Store a local copy of the data which we pull out of the AoS - std::vector cbx_data; - std::vector cby_data; - std::vector cbz_data; - - std::vector ex_data; - std::vector ey_data; - std::vector ez_data; - - std::vector jx_data; - std::vector jy_data; - std::vector jz_data; - - size_t nv = nx * ny * nz; - - cbx_data.reserve(nv); - cby_data.reserve(nv); - cbz_data.reserve(nv); - - ex_data.reserve(nv); - 
ey_data.reserve(nv); - ez_data.reserve(nv); - - jx_data.reserve(nv); - jy_data.reserve(nv); - jz_data.reserve(nv); - - // TODO: make this AoS to SoA conversion a function - - // We could do 1D here, but we don't really care about the ghosts, and we - // can thread over nz/ny (collapsed?) - // Go over non-ghosts and grab just that data into a dense array - for (size_t k = 1; k < grid->nz + 1; k++) - { - for (size_t j = 1; j < grid->ny + 1; j++) - { - for (size_t i = 1; i < grid->nx + 1; i++) - { - int local_index = VOXEL(i-1, j-1, k-1, grid->nx-2, grid->ny-2, grid->nz-2); - int global_index = VOXEL(i, j, k, grid->nx, grid->ny, grid->nz); - - cbx_data[local_index] = field_array->f[global_index].cbx; - cby_data[local_index] = field_array->f[global_index].cby; - cbz_data[local_index] = field_array->f[global_index].cbz; - - ex_data[local_index] = field_array->f[global_index].ex; - ey_data[local_index] = field_array->f[global_index].ey; - ez_data[local_index] = field_array->f[global_index].ez; - - jx_data[local_index] = field_array->f[global_index].jfx; - jy_data[local_index] = field_array->f[global_index].jfy; - jz_data[local_index] = field_array->f[global_index].jfz; - } - } - } - - cbx.storeChunk( cbx_data, chunk_offset, chunk_extent); - cby.storeChunk( cby_data, chunk_offset, chunk_extent); - cbz.storeChunk( cbz_data, chunk_offset, chunk_extent); - - Ex.storeChunk( ex_data, chunk_offset, chunk_extent); - Ey.storeChunk( ey_data, chunk_offset, chunk_extent); - Ez.storeChunk( ez_data, chunk_offset, chunk_extent); - - Jx.storeChunk( jx_data, chunk_offset, chunk_extent); - Jy.storeChunk( jy_data, chunk_offset, chunk_extent); - Jz.storeChunk( jz_data, chunk_offset, chunk_extent); - - series->flush(); -} -#endif - -#ifdef VPIC_ENABLE_HDF5 -#define DUMP_DIR_FORMAT "./%s" - -/* define to do C-style indexing */ -#define hydro(x, y, z) hydro_array->h[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] - -// TODO: make function? 
-#define invert_field_xml_item(xml_file_name, speciesname_p, time_step, dims_4d, dims_3d, add_footer_flag) \ - { \ - FILE *fp; \ - fp = fopen(xml_file_name, "a"); \ - fprintf(fp, main_body_head, time_step); \ - if (field_dump_flag.enabledE()) \ - write_main_body_attribute(fp, main_body_attributeV, "E", dims_4d, dims_3d, speciesname_p, time_step, "ex", "ey", "ez"); \ - if (field_dump_flag.div_e_err) \ - fprintf(fp, main_body_attributeS, "div_e_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_e_err"); \ - if (field_dump_flag.enabledCB()) \ - write_main_body_attribute(fp, main_body_attributeV, "B", dims_4d, dims_3d, speciesname_p, time_step, "cbx", "cby", "cbz"); \ - if (field_dump_flag.div_b_err) \ - fprintf(fp, main_body_attributeS, "div_b_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_b_err"); \ - if (field_dump_flag.enabledTCA()) \ - write_main_body_attribute(fp, main_body_attributeV, "TCA", dims_4d, dims_3d, speciesname_p, time_step, "tcax", "tcay", "tcaz"); \ - if (field_dump_flag.rhob) \ - fprintf(fp, main_body_attributeS, "rhob", dims_3d, time_step, speciesname_p, time_step, time_step, "rhob"); \ - if (field_dump_flag.enabledJF()) \ - write_main_body_attribute(fp, main_body_attributeV, "JF", dims_4d, dims_3d, speciesname_p, time_step, "jfx", "jfy", "jfz"); \ - if (field_dump_flag.rhof) \ - fprintf(fp, main_body_attributeS, "rhof", dims_3d, time_step, speciesname_p, time_step, time_step, "rhof"); \ - if (field_dump_flag.enabledEMAT()) \ - write_main_body_attribute(fp, main_body_attributeV, "EMAT", dims_4d, dims_3d, speciesname_p, time_step, "ematx", "ematy", "ematz"); \ - if (field_dump_flag.nmat) \ - fprintf(fp, main_body_attributeS, "nmat", dims_3d, time_step, speciesname_p, time_step, time_step, "nmat"); \ - if (field_dump_flag.enabledFMAT()) \ - write_main_body_attribute(fp, main_body_attributeV, "FMAT", dims_4d, dims_3d, speciesname_p, time_step, "fmatx", "fmaty", "fmatz"); \ - if (field_dump_flag.cmat) \ - 
fprintf(fp, main_body_attributeS, "cmat", dims_3d, time_step, speciesname_p, time_step, time_step, "cmat"); \ - fprintf(fp, "%s", main_body_foot); \ - if (add_footer_flag) \ - fputs(footer, fp); \ - fclose(fp); \ - } -void -vpic_simulation::dump_fields_hdf5( const char *fbase, int ftag ) -{ - size_t step_for_viou = step(); - - int mpi_size, mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - -#ifdef DUMP_INFO_DEBUG - printf("MPI rank = %d, size = %d \n", mpi_rank, mpi_size); - //printf("base dir for field: %s \n", fdParams.baseDir); - //printf("stride x y z = (%ld, %ld, %ld)\n", fdParams.stride_x, fdParams.stride_y, fdParams.stride_z); - printf("grid x, y z = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); - printf("domain loc (x0, y0, z0) -> (x1, y1, z1) = (%f, %f, %f) -> (%f, %f, %f) \n", grid->x0, grid->y0, grid->z0, grid->x1, grid->y1, grid->z1); - //printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, global->topology_y, global->topology_z); - printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid->sz, grid->nv); -#endif - -#define DUMP_FIELD_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ - { \ - dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ - temp_buf_index = 0; \ - for (size_t i(1); i < grid->nx + 1; i++) \ - { \ - for (size_t j(1); j < grid->ny + 1; j++) \ - { \ - for (size_t k(1); k < grid->nz + 1; k++) \ - { \ - temp_buf[temp_buf_index] = FIELD_ARRAY_NAME->fpp(i, j, k).ATTRIBUTE_NAME; \ - temp_buf_index = temp_buf_index + 1; \ - } \ - } \ - } \ - dataspace_id = H5Dget_space(dset_id); \ - H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ - H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ - H5Sclose(dataspace_id); \ - H5Dclose(dset_id); \ - } - - char fname[256]; - char field_scratch[128]; - char subfield_scratch[128]; - - 
sprintf(field_scratch, DUMP_DIR_FORMAT, "field_hdf5"); - dump_mkdir(field_scratch); - sprintf(subfield_scratch, "%s/T.%zu/", field_scratch, step_for_viou); - dump_mkdir(subfield_scratch); - - sprintf(fname, "%s/%s_%zu.h5", subfield_scratch, "fields", step_for_viou); - double el1 = uptime(); - hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); - H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); - hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); - H5Pclose(plist_id); - - sprintf(fname, "Timestep_%zu", step_for_viou); - hid_t group_id = H5Gcreate(file_id, fname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - - el1 = uptime() - el1; - //sim_log("TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts - double el2 = uptime(); - - /* -// Create a variable list of field values to output. -size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); -size_t * varlist = new size_t[numvars]; - -for(size_t i(0), c(0); ifdParams.output_vars.bitset(i)) varlist[c++] = i; - -printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ - -#define fpp(x, y, z) f[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] - /* - typedef struct field { - float ex, ey, ez, div_e_err; // Electric field and div E error - float cbx, cby, cbz, div_b_err; // Magnetic field and div B error - float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density - float jfx, jfy, jfz, rhof; // Free current and charge density - material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes - material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers - } field_t;*/ - // Local voxel mesh resolution. Voxels are - // indexed FORTRAN style 0:nx+1,0:ny+1,0:nz+1 - // with voxels 1:nx,1:ny,1:nz being non-ghost - // voxels. 
- - float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); - hsize_t temp_buf_index; - hid_t dset_id; - //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; - plist_id = H5Pcreate(H5P_DATASET_XFER); - //Comment out for test only - H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); - //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); - - //global->topology_x - - hsize_t field_global_size[3], field_local_size[3], global_offset[3], global_count[3]; - field_global_size[0] = (grid->nx * grid->gpx); - field_global_size[1] = (grid->ny * grid->gpy); - field_global_size[2] = (grid->nz * grid->gpz); - - field_local_size[0] = grid->nx; - field_local_size[1] = grid->ny; - field_local_size[2] = grid->nz; - - int gpx = grid->gpx; - int gpy = grid->gpy; - int gpz = grid->gpz; - - // Convert rank to local decomposition - int rx, ry, rz; - UNVOXEL(mpi_rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); - - mpi_rank_x = rx; - mpi_rank_y = ry; - mpi_rank_z = rz; - - global_offset[0] = (grid->nx) * mpi_rank_x; - global_offset[1] = (grid->ny) * mpi_rank_y; - global_offset[2] = (grid->nz) * mpi_rank_z; - - global_count[0] = (grid->nx); - global_count[1] = (grid->ny); - global_count[2] = (grid->nz); - -#ifdef DUMP_INFO_DEBUG - printf("global size = %d %d %d \n", field_global_size[0], field_global_size[1], field_global_size[2]); - printf("global_offset = %d %d %d \n", global_offset[0], global_offset[1], global_offset[2]); - printf("global_count = %d %d %d \n", global_count[0], global_count[1], global_count[2]); - printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); - fflush(stdout); -#endif - - hid_t filespace = H5Screate_simple(3, field_global_size, NULL); - hid_t memspace = H5Screate_simple(3, field_local_size, NULL); - hid_t dataspace_id; - - /* - typedef struct 
field { - float ex, ey, ez, div_e_err; // Electric field and div E error - float cbx, cby, cbz, div_b_err; // Magnetic field and div B error - float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density - float jfx, jfy, jfz, rhof; // Free current and charge density - material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes - material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers - } field_t;*/ - - if (field_dump_flag.ex) - DUMP_FIELD_TO_HDF5("ex", ex, H5T_NATIVE_FLOAT); - if (field_dump_flag.ey) - DUMP_FIELD_TO_HDF5("ey", ey, H5T_NATIVE_FLOAT); - if (field_dump_flag.ez) - DUMP_FIELD_TO_HDF5("ez", ez, H5T_NATIVE_FLOAT); - if (field_dump_flag.div_e_err) - DUMP_FIELD_TO_HDF5("div_e_err", div_e_err, H5T_NATIVE_FLOAT); - - if (field_dump_flag.cbx) - DUMP_FIELD_TO_HDF5("cbx", cbx, H5T_NATIVE_FLOAT); - if (field_dump_flag.cby) - DUMP_FIELD_TO_HDF5("cby", cby, H5T_NATIVE_FLOAT); - if (field_dump_flag.cbz) - DUMP_FIELD_TO_HDF5("cbz", cbz, H5T_NATIVE_FLOAT); - if (field_dump_flag.div_b_err) - DUMP_FIELD_TO_HDF5("div_b_err", div_b_err, H5T_NATIVE_FLOAT); - - if (field_dump_flag.tcax) - DUMP_FIELD_TO_HDF5("tcax", tcax, H5T_NATIVE_FLOAT); - if (field_dump_flag.tcay) - DUMP_FIELD_TO_HDF5("tcay", tcay, H5T_NATIVE_FLOAT); - if (field_dump_flag.tcaz) - DUMP_FIELD_TO_HDF5("tcaz", tcaz, H5T_NATIVE_FLOAT); - if (field_dump_flag.rhob) - DUMP_FIELD_TO_HDF5("rhob", rhob, H5T_NATIVE_FLOAT); - - if (field_dump_flag.jfx) - DUMP_FIELD_TO_HDF5("jfx", jfx, H5T_NATIVE_FLOAT); - if (field_dump_flag.jfy) - DUMP_FIELD_TO_HDF5("jfy", jfy, H5T_NATIVE_FLOAT); - if (field_dump_flag.jfz) - DUMP_FIELD_TO_HDF5("jfz", jfz, H5T_NATIVE_FLOAT); - if (field_dump_flag.rhof) - DUMP_FIELD_TO_HDF5("rhof", rhof, H5T_NATIVE_FLOAT); - - //H5T_NATIVE_SHORT for material_id (typedef int16_t material_id) - if (field_dump_flag.ematx) - DUMP_FIELD_TO_HDF5("ematx", ematx, H5T_NATIVE_SHORT); - if (field_dump_flag.ematy) - DUMP_FIELD_TO_HDF5("ematy", ematy, 
H5T_NATIVE_SHORT); - if (field_dump_flag.ematz) - DUMP_FIELD_TO_HDF5("ematz", ematz, H5T_NATIVE_SHORT); - if (field_dump_flag.nmat) - DUMP_FIELD_TO_HDF5("nmat", nmat, H5T_NATIVE_SHORT); - - if (field_dump_flag.fmatx) - DUMP_FIELD_TO_HDF5("fmatx", fmatx, H5T_NATIVE_SHORT); - if (field_dump_flag.fmaty) - DUMP_FIELD_TO_HDF5("fmaty", fmaty, H5T_NATIVE_SHORT); - if (field_dump_flag.fmatz) - DUMP_FIELD_TO_HDF5("fmatz", fmatz, H5T_NATIVE_SHORT); - if (field_dump_flag.cmat) - DUMP_FIELD_TO_HDF5("cmat", cmat, H5T_NATIVE_SHORT); - - el2 = uptime() - el2; - //sim_log("TimeHDF5Write: " << el2 << " s"); - - double el3 = uptime(); - - //Write metadata (geo original and geo dx/dy/dz) for ArrayUDF - float attr_data[2][3]; - attr_data[0][0] = grid->x0; - attr_data[0][1] = grid->y0; - attr_data[0][2] = grid->z0; - attr_data[1][0] = grid->dx; - attr_data[1][1] = grid->dy; - attr_data[1][2] = grid->dz; - hsize_t dims[2]; - dims[0] = 2; - dims[1] = 3; - hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); - hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); - H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); - H5Sclose(va_geo_dataspace_id); - H5Aclose(va_geo_attribute_id); - - free(temp_buf); - H5Sclose(filespace); - H5Sclose(memspace); - H5Pclose(plist_id); - H5Gclose(group_id); - H5Fclose(file_id); - - el3 = uptime() - el3; - //sim_log("TimeHDF5Close: " << el3 << " s"); - - if (mpi_rank == 0) - { - char const *output_xml_file = "./field_hdf5/hdf5_field.xdmf"; - char dimensions_3d[128]; - sprintf(dimensions_3d, "%lld %lld %lld", field_global_size[0], field_global_size[1], field_global_size[2]); - char dimensions_4d[128]; - sprintf(dimensions_4d, "%lld %lld %lld %d", field_global_size[0], field_global_size[1], field_global_size[2], 3); - char orignal[128]; - sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); - char dxdydz[128]; - sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, 
grid->dz); - - int nframes = num_step / field_interval + 1; - static int field_tframe = 0; - -#ifdef DUMP_INFO_DEBUG - printf(" meta file : %s \n", output_xml_file); - printf(" array dims per var: %s \n", dimensions_3d); - printf("array dims all vars: %s \n", dimensions_4d); - printf(" orignal: %s \n", orignal); - printf(" dxdydz: %s \n", dxdydz); - printf(" nframes: %d \n", nframes); - printf(" field_interval: %d \n", field_interval); - printf(" current step: %lld \n", step_for_viou); - printf(" current step: %lld \n", step_for_viou); - - //printf(" Simulation time: %f \n", grid->t0); - printf(" tframe: %d \n", field_tframe); -#endif - - // TODO: this footer dumping is more likely better done in a - // destructor, rather than hoping a multiple division works out - if (field_tframe >= 1) - { - if (field_tframe == (nframes - 1)) - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 0); - } - } - else - { - create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, field_interval); - if (field_tframe == (nframes - 1)) - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 0); - } - } - field_tframe++; - } -} - -// TODO: fix this, it currently uses a static global and the logic only -// supports 1 species otherwise things get out of sync -void vpic_simulation::dump_hydro_hdf5( const char *speciesname, - const char *fbase, - int ftag ) -{ - size_t step_for_viou = step(); - -#define DUMP_HYDRO_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ - { \ - dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ - temp_buf_index = 0; \ - for (size_t i(1); i < grid->nx + 1; i++) \ - { \ - 
for (size_t j(1); j < grid->ny + 1; j++) \ - { \ - for (size_t k(1); k < grid->nz + 1; k++) \ - { \ - temp_buf[temp_buf_index] = hydro(i, j, k).ATTRIBUTE_NAME; \ - temp_buf_index = temp_buf_index + 1; \ - } \ - } \ - } \ - dataspace_id = H5Dget_space(dset_id); \ - H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ - H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ - H5Sclose(dataspace_id); \ - H5Dclose(dset_id); \ - } - //#define DUMP_INFO_DEBUG 1 - int mpi_size, mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - species_t *sp = find_species_name(speciesname, species_list); - if (!sp) - ERROR(("Invalid species name: %s", speciesname)); - - clear_hydro_array(hydro_array); - accumulate_hydro_p(hydro_array, sp, interpolator_array); - synchronize_hydro_array(hydro_array); - - char hname[256]; - char hydro_scratch[128]; - char subhydro_scratch[128]; - - sprintf(hydro_scratch, "./%s", "hydro_hdf5"); - dump_mkdir(hydro_scratch); - sprintf(subhydro_scratch, "%s/T.%zu/", hydro_scratch, step_for_viou); - dump_mkdir(subhydro_scratch); - - sprintf(hname, "%s/hydro_%s_%zu.h5", subhydro_scratch, speciesname, step_for_viou); - double el1 = uptime(); - hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); - H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); - hid_t file_id = H5Fcreate(hname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); - H5Pclose(plist_id); - - sprintf(hname, "Timestep_%zu", step_for_viou); - hid_t group_id = H5Gcreate(file_id, hname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - - el1 = uptime() - el1; - //sim_log("TimeHDF5Open: " << el1 << " s"); //Easy to handle results for scripts - //double el2 = uptime(); - - // Create a variable list of field values to output. 
- //size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); - //size_t *varlist = new size_t[numvars]; - - //for (size_t i(0), c(0); i < total_field_variables; i++) - // if (global->fdParams.output_vars.bitset(i)) - // varlist[c++] = i; - - //printf("\nBEGIN_OUTPUT: numvars = %zu \n", numvars); - - - //typedef struct hydro { - // float jx, jy, jz, rho; // Current and charge density => , - // float px, py, pz, ke; // Momentum and K.E. density => , - // float txx, tyy, tzz; // Stress diagonal => , i==j - // float tyz, tzx, txy; // Stress off-diagonal => , i!=j - // float _pad[2]; // 16-byte align - //} hydro_t; - - //typedef struct hydro_array { - // hydro_t * ALIGNED(128) h; - // grid_t * g; - //} hydro_array_t; - - float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); - hsize_t temp_buf_index; - hid_t dset_id; - //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; - plist_id = H5Pcreate(H5P_DATASET_XFER); - //Comment out for test only - H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); - //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); - - //global->topology_x - - hsize_t hydro_global_size[3], hydro_local_size[3], global_offset[3], global_count[3]; - hydro_global_size[0] = (grid->nx * grid->gpx); - hydro_global_size[1] = (grid->ny * grid->gpy); - hydro_global_size[2] = (grid->nz * grid->gpz); - - hydro_local_size[0] = grid->nx; - hydro_local_size[1] = grid->ny; - hydro_local_size[2] = grid->nz; - - int mpi_rank_x, mpi_rank_y, mpi_rank_z; - RANK_TO_INDEX2(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); - - global_offset[0] = (grid->nx) * mpi_rank_x; - global_offset[1] = (grid->ny) * mpi_rank_y; - global_offset[2] = (grid->nz) * mpi_rank_z; - - global_count[0] = (grid->nx); - global_count[1] = (grid->ny); - global_count[2] = (grid->nz); - -#ifdef 
DUMP_INFO_DEBUG - printf("global size = %d %d %d \n", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); - printf("global_offset = %d %d %d \n", global_offset[0], global_offset[1], global_offset[2]); - printf("global_count = %d %d %d \n", global_count[0], global_count[1], global_count[2]); - printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); - fflush(stdout); -#endif - - hid_t filespace = H5Screate_simple(3, hydro_global_size, NULL); - hid_t memspace = H5Screate_simple(3, hydro_local_size, NULL); - hid_t dataspace_id; - - //typedef struct hydro { - // float jx, jy, jz, rho; // Current and charge density => , - // float px, py, pz, ke; // Momentum and K.E. density => , - // float txx, tyy, tzz; // Stress diagonal => , i==j - // float tyz, tzx, txy; // Stress off-diagonal => , i!=j - // float _pad[2]; // 16-byte align - //} hydro_t; - - if (hydro_dump_flag.jx) - DUMP_HYDRO_TO_HDF5("jx", jx, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.jy) - DUMP_HYDRO_TO_HDF5("jy", jy, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.jz) - DUMP_HYDRO_TO_HDF5("jz", jz, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.rho) - DUMP_HYDRO_TO_HDF5("rho", rho, H5T_NATIVE_FLOAT); - - if (hydro_dump_flag.px) - DUMP_HYDRO_TO_HDF5("px", px, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.py) - DUMP_HYDRO_TO_HDF5("py", py, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.pz) - DUMP_HYDRO_TO_HDF5("pz", pz, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.ke) - DUMP_HYDRO_TO_HDF5("ke", ke, H5T_NATIVE_FLOAT); - - if (hydro_dump_flag.txx) - DUMP_HYDRO_TO_HDF5("txx", txx, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.tyy) - DUMP_HYDRO_TO_HDF5("tyy", tyy, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.tzz) - DUMP_HYDRO_TO_HDF5("tzz", tzz, H5T_NATIVE_FLOAT); - - if (hydro_dump_flag.tyz) - DUMP_HYDRO_TO_HDF5("tyz", tyz, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.tzx) - DUMP_HYDRO_TO_HDF5("tzx", tzx, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.txy) - DUMP_HYDRO_TO_HDF5("txy", txy, 
H5T_NATIVE_FLOAT); - - //el2 = uptime() - el2; - //sim_log("TimeHDF5Write: " << el2 << " s"); - - double el3 = uptime(); - - //Write metadata (geo original and geo dx/dy/dz) for ArrayUDF - float attr_data[2][3]; - attr_data[0][0] = grid->x0; - attr_data[0][1] = grid->y0; - attr_data[0][2] = grid->z0; - attr_data[1][0] = grid->dx; - attr_data[1][1] = grid->dy; - attr_data[1][2] = grid->dz; - hsize_t dims[2]; - dims[0] = 2; - dims[1] = 3; - hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); - hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); - H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); - H5Sclose(va_geo_dataspace_id); - H5Aclose(va_geo_attribute_id); - - free(temp_buf); - H5Sclose(filespace); - H5Sclose(memspace); - H5Pclose(plist_id); - H5Gclose(group_id); - H5Fclose(file_id); - - el3 = uptime() - el3; - //sim_log("TimeHDF5Close: " << el3 << " s"); - - if (mpi_rank == 0) - { - char output_xml_file[128]; - sprintf(output_xml_file, "./%s/%s%s%s", "hydro_hdf5", "hydro-", speciesname, ".xdmf"); - char dimensions_3d[128]; - sprintf(dimensions_3d, "%lld %lld %lld", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); - char dimensions_4d[128]; - sprintf(dimensions_4d, "%lld %lld %lld %d", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2], 3); - char orignal[128]; - sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); - char dxdydz[128]; - sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); - - int nframes = num_step / hydro_interval + 1; - - const int tframe = tframe_map[sp->id]; - -#ifdef DUMP_INFO_DEBUG - printf(" meta file : %s \n", output_xml_file); - printf(" array dims per var: %s \n", dimensions_3d); - printf("array dims all vars: %s \n", dimensions_4d); - printf(" orignal: %s \n", orignal); - printf(" dxdydz: %s \n", dxdydz); - printf(" nframes: %d \n", nframes); - printf(" hydro_fields_interval: %d \n", 
hydro_interval); - printf(" current step: %lld \n", step_for_viou); - printf(" Simulation time: %f \n", grid->t0); - printf(" tframe: %d \n", tframe); -#endif - - char speciesname_new[128]; - sprintf(speciesname_new, "hydro_%s", speciesname); - if (tframe >= 1) - { - if (tframe == (nframes - 1)) - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); - } - } - else - { - create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, hydro_interval); - if (tframe == (nframes - 1)) - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); - } - } - tframe_map[sp->id]++; - } -} - -// TODO": make the sp_name and speciesname varailbe naming consistent -void -vpic_simulation::dump_particles_hdf5( const char *sp_name, - const char *fbase, - int ftag ) -{ - size_t step_for_viou = step(); - char fname[256]; - char group_name[256]; - char particle_scratch[128]; - char subparticle_scratch[128]; - - int np_local; - species_t *sp; - - float *Pf; - int *Pi; - - // get the total number of particles. 
in this example, output only electrons - sp = species_list; - sprintf(particle_scratch, DUMP_DIR_FORMAT, "particle_hdf5"); - dump_mkdir(particle_scratch); - sprintf(subparticle_scratch, "%s/T.%ld/", particle_scratch, step_for_viou); - dump_mkdir(subparticle_scratch); - - // TODO: Allow the user to set this - - int stride_particle_dump = 1; - while (sp) - { - np_local = (sp->np + stride_particle_dump - 1) / stride_particle_dump; - - // make a copy of the part of particle data to be dumped - double ec1 = uptime(); - - int sp_np = sp->np; - int sp_max_np = sp->max_np; - particle_t *ALIGNED(128) p_buf = NULL; - if (!p_buf) - MALLOC_ALIGNED(p_buf, np_local, 128); - particle_t *sp_p = sp->p; - sp->p = p_buf; - sp->np = np_local; - sp->max_np = np_local; - - for (long long iptl = 0, i = 0; iptl < sp_np; iptl += stride_particle_dump, ++i) - { - COPY(&sp->p[i], &sp_p[iptl], 1); - } - - center_p(sp, interpolator_array); - - ec1 = uptime() - ec1; - int mpi_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - //std::cout << "on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local << std::endl; - //sim_log("on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local); - - Pf = (float *)sp->p; - Pi = (int *)sp->p; - - // open HDF5 file in "particle/T./" subdirectory - // filename: eparticle.h5p - sprintf(fname, "%s/%s_%ld.h5", subparticle_scratch, sp->name, step_for_viou); - sprintf(group_name, "/Timestep_%ld", step_for_viou); - double el1 = uptime(); - - hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); - H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); - hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); - hid_t group_id = H5Gcreate(file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - - H5Pclose(plist_id); - - long long total_particles, offset; - long long numparticles = np_local; - MPI_Allreduce(&numparticles, &total_particles, 1, 
MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - offset -= numparticles; - - hid_t filespace = H5Screate_simple(1, (hsize_t *)&total_particles, NULL); - - hsize_t memspace_count_temp = numparticles * 8; - hid_t memspace = H5Screate_simple(1, &memspace_count_temp, NULL); - - // Don't need, can just use H5S_ALL - //hsize_t linearspace_count_temp = numparticles; - //hid_t linearspace = H5Screate_simple(1, &linearspace_count_temp, NULL); - - plist_id = H5Pcreate(H5P_DATASET_XFER); - - //Comment out for test only - H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); - H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *)&offset, NULL, (hsize_t *)&numparticles, NULL); - - hsize_t memspace_start = 0, memspace_stride = 8, memspace_count = np_local; - H5Sselect_hyperslab(memspace, H5S_SELECT_SET, &memspace_start, &memspace_stride, &memspace_count, NULL); - - el1 = uptime() - el1; - //sim_log("Particle TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts - - //double el2 = uptime(); - - // This point offset is silly, and loses the type safety (pf+1) - hid_t dset_id = H5Dcreate(group_id, "dX", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - int ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf); - H5Dclose(dset_id); - - dset_id = H5Dcreate(group_id, "dY", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 1); - H5Dclose(dset_id); - - dset_id = H5Dcreate(group_id, "dZ", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 2); - H5Dclose(dset_id); - -#define OUTPUT_CONVERT_GLOBAL_ID 1 -#ifdef OUTPUT_CONVERT_GLOBAL_ID - // TODO: make a function out of this too, its used in openpmd - std::vector global_pi; - global_pi.reserve(numparticles); - // 
TODO: this could be parallel - for (int i = 0; i < numparticles; i++) - { - int local_i = sp->p[i].i; - - int ix, iy, iz, rx, ry, rz; - - // Convert rank to local x/y/z - UNVOXEL(rank(), rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); - - // Calculate local ix/iy/iz - UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); - - // Convert ix/iy/iz to global - int gix = ix + (grid->nx * (rx)); - int giy = iy + (grid->ny * (ry)); - int giz = iz + (grid->nz * (rz)); - - // calculate global grid sizes - int gnx = grid->nx * grid->gpx; - int gny = grid->ny * grid->gpy; - int gnz = grid->nz * grid->gpz; - - // TODO: find a better way to account for the hard coded ghosts in VOXEL - int global_i = VOXEL(gix, giy, giz, gnx-2, gny-2, gnz-2); - - //std::cout << rank() << " local i " << local_i << " becomes " << global_i << std::endl; - global_pi[i] = global_i; - } - -#undef UNVOXEL - dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, H5S_ALL, filespace, plist_id, global_pi.data()); - H5Dclose(dset_id); - -#else - dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, memspace, filespace, plist_id, Pi + 3); - H5Dclose(dset_id); -#endif - - dset_id = H5Dcreate(group_id, "Ux", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 4); - H5Dclose(dset_id); - - dset_id = H5Dcreate(group_id, "Uy", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 5); - H5Dclose(dset_id); - - dset_id = H5Dcreate(group_id, "Uz", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 6); - H5Dclose(dset_id); - - dset_id = 
H5Dcreate(group_id, "q", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 7); - H5Dclose(dset_id); - - //el2 = uptime() - el2; - //sim_log("Particle TimeHDF5Write: " << el2 << " s"); - - double el3 = uptime(); - H5Sclose(memspace); - H5Sclose(filespace); - H5Pclose(plist_id); - H5Gclose(group_id); - H5Fclose(file_id); - el3 = uptime() - el3; - //sim_log("Particle TimeHDF5Close: " << el3 << " s"); - - sp->p = sp_p; - sp->np = sp_np; - sp->max_np = sp_max_np; - FREE_ALIGNED(p_buf); - - // Write metadata if step() == 0 - char meta_fname[256]; - - sprintf(meta_fname, "%s/grid_metadata_%s_%ld.h5", subparticle_scratch, sp->name, step_for_viou); - - double meta_el1 = uptime(); - - hid_t meta_plist_id = H5Pcreate(H5P_FILE_ACCESS); - H5Pset_fapl_mpio(meta_plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); - hid_t meta_file_id = H5Fcreate(meta_fname, H5F_ACC_TRUNC, H5P_DEFAULT, meta_plist_id); - hid_t meta_group_id = H5Gcreate(meta_file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - H5Pclose(meta_plist_id); - - long long meta_total_particles, meta_offset; - long long meta_numparticles = 1; - MPI_Allreduce(&meta_numparticles, &meta_total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - MPI_Scan(&meta_numparticles, &meta_offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - meta_offset -= meta_numparticles; - - hid_t meta_filespace = H5Screate_simple(1, (hsize_t *)&meta_total_particles, NULL); - hid_t meta_memspace = H5Screate_simple(1, (hsize_t *)&meta_numparticles, NULL); - meta_plist_id = H5Pcreate(H5P_DATASET_XFER); - H5Pset_dxpl_mpio(meta_plist_id, H5FD_MPIO_COLLECTIVE); - H5Sselect_hyperslab(meta_filespace, H5S_SELECT_SET, (hsize_t *)&meta_offset, NULL, (hsize_t *)&meta_numparticles, NULL); - meta_el1 = uptime() - meta_el1; - //sim_log("Metafile TimeHDF5Open): " << meta_el1 << " s"); //Easy to handle results for scripts - - double meta_el2 = uptime(); - - hid_t 
meta_dset_id = H5Dcreate(meta_group_id, "np_local", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, (int32_t *)&np_local); - H5Dclose(meta_dset_id); - //if (rank == 0) printf ("Written variable dX \n"); - - meta_dset_id = H5Dcreate(meta_group_id, "nx", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->nx); - H5Dclose(meta_dset_id); - //if (rank == 0) printf ("Written variable dY \n"); - - meta_dset_id = H5Dcreate(meta_group_id, "ny", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->ny); - H5Dclose(meta_dset_id); - //if (rank == 0) printf ("Written variable dZ \n"); - - meta_dset_id = H5Dcreate(meta_group_id, "nz", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->nz); - H5Dclose(meta_dset_id); - //if (rank == 0) printf ("Written variable i \n"); - - meta_dset_id = H5Dcreate(meta_group_id, "x0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->x0); - H5Dclose(meta_dset_id); - - meta_dset_id = H5Dcreate(meta_group_id, "y0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->y0); - H5Dclose(meta_dset_id); - - meta_dset_id = H5Dcreate(meta_group_id, "z0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->z0); - 
H5Dclose(meta_dset_id); - - meta_dset_id = H5Dcreate(meta_group_id, "dx", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dx); - H5Dclose(meta_dset_id); - - meta_dset_id = H5Dcreate(meta_group_id, "dy", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dy); - H5Dclose(meta_dset_id); - - meta_dset_id = H5Dcreate(meta_group_id, "dz", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dz); - H5Dclose(meta_dset_id); - - meta_el2 = uptime() - meta_el2; - //sim_log("Metafile TimeHDF5Write: " << meta_el2 << " s"); - double meta_el3 = uptime(); - H5Sclose(meta_memspace); - H5Sclose(meta_filespace); - H5Pclose(meta_plist_id); - H5Gclose(meta_group_id); - H5Fclose(meta_file_id); - meta_el3 = uptime() - meta_el3; - //sim_log("Metafile TimeHDF5Close: " << meta_el3 << " s"); - - sp = sp->next; - } -} -#endif - -void -vpic_simulation::dump_particles( const char *sp_name, - const char *fbase, - int ftag ) -{ - species_t *sp; - char fname[256]; - FileIO fileIO; - int dim[1], buf_start; - static particle_t * ALIGNED(128) p_buf = NULL; -# define PBUF_SIZE 32768 // 1MB of particles - - sp = find_species_name( sp_name, species_list ); - if( !sp ) ERROR(( "Invalid species name \"%s\".", sp_name )); - - if( !fbase ) ERROR(( "Invalid filename" )); - - if( !p_buf ) MALLOC_ALIGNED( p_buf, PBUF_SIZE, 128 ); - - if( rank()==0 ) - MESSAGE(("Dumping \"%s\" particles to \"%s\"",sp->name,fbase)); - - if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step(), rank() ); - else sprintf( fname, "%s.%i", fbase, rank() ); - FileIOStatus status = fileIO.open(fname, io_write); - if( status==fail ) ERROR(( "Could not open 
\"%s\"", fname )); - - /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ - nxout = grid->nx; - nyout = grid->ny; - nzout = grid->nz; - dxout = grid->dx; - dyout = grid->dy; - dzout = grid->dz; - - WRITE_HEADER_V0( dump_type::particle_dump, sp->id, sp->q/sp->m, fileIO ); - - dim[0] = sp->np; - WRITE_ARRAY_HEADER( p_buf, 1, dim, fileIO ); - - // Copy a PBUF_SIZE hunk of the particle list into the particle - // buffer, timecenter it and write it out. This is done this way to - // guarantee the particle list unchanged while not requiring too - // much memory. - - // FIXME: WITH A PIPELINED CENTER_P, PBUF NOMINALLY SHOULD BE QUITE - // LARGE. - - particle_t * sp_p = sp->p; sp->p = p_buf; - int sp_np = sp->np; sp->np = 0; - int sp_max_np = sp->max_np; sp->max_np = PBUF_SIZE; - for( buf_start=0; buf_startnp = sp_np-buf_start; if( sp->np > PBUF_SIZE ) sp->np = PBUF_SIZE; - COPY( sp->p, &sp_p[buf_start], sp->np ); - center_p( sp, interpolator_array ); - fileIO.write( sp->p, sp->np ); - } - sp->p = sp_p; - sp->np = sp_np; - sp->max_np = sp_max_np; - - if( fileIO.close() ) ERROR(("File close failed on dump particles!!!")); -} - /*------------------------------------------------------------------------------ * New dump logic *---------------------------------------------------------------------------*/ @@ -1683,6 +382,8 @@ vpic_simulation::global_header( const char * base, if( fileIO.close() ) ERROR(( "File close failed on global header!!!" )); } +// TODO: why is there field_dump and dump_fields? 
+// TODO: this could probably move into the dump_strategy void vpic_simulation::field_dump( DumpParameters & dumpParams ) { @@ -1721,12 +422,12 @@ vpic_simulation::field_dump( DumpParameters & dumpParams ) { # define f(x,y,z) f[ VOXEL(x,y,z, grid->nx,grid->ny,grid->nz) ] /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ - nxout = (grid->nx)/istride; - nyout = (grid->ny)/jstride; - nzout = (grid->nz)/kstride; - dxout = (grid->dx)*istride; - dyout = (grid->dy)*jstride; - dzout = (grid->dz)*kstride; + size_t nxout = (grid->nx)/istride; + size_t nyout = (grid->ny)/jstride; + size_t nzout = (grid->nz)/kstride; + float dxout = (grid->dx)*istride; + float dyout = (grid->dy)*jstride; + float dzout = (grid->dz)*kstride; /* Banded output will write data as a single block-array as opposed to * the Array-of-Structure format that is used for native storage. @@ -1738,7 +439,7 @@ vpic_simulation::field_dump( DumpParameters & dumpParams ) { if(dumpParams.format == band) { - WRITE_HEADER_V0(dump_type::field_dump, -1, 0, fileIO); + WRITE_HEADER_V0(dump_type::field_dump, -1, 0, fileIO, step(), rank(), nproc()); dim[0] = nxout+2; dim[1] = nyout+2; @@ -1799,7 +500,7 @@ vpic_simulation::field_dump( DumpParameters & dumpParams ) { } else { // band_interleave - WRITE_HEADER_V0(dump_type::field_dump, -1, 0, fileIO); + WRITE_HEADER_V0(dump_type::field_dump, -1, 0, fileIO, step(), rank(), nproc()); dim[0] = nxout+2; dim[1] = nyout+2; @@ -1867,12 +568,12 @@ vpic_simulation::hydro_dump( const char * speciesname, int dim[3]; /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ - nxout = (grid->nx)/istride; - nyout = (grid->ny)/jstride; - nzout = (grid->nz)/kstride; - dxout = (grid->dx)*istride; - dyout = (grid->dy)*jstride; - dzout = (grid->dz)*kstride; + size_t nxout = (grid->nx)/istride; + size_t nyout = (grid->ny)/jstride; + size_t nzout = (grid->nz)/kstride; + float dxout = (grid->dx)*istride; + float dyout = (grid->dy)*jstride; + float dzout = (grid->dz)*kstride; /* 
Banded output will write data as a single block-array as opposed to * the Array-of-Structure format that is used for native storage. @@ -1884,7 +585,7 @@ vpic_simulation::hydro_dump( const char * speciesname, */ if(dumpParams.format == band) { - WRITE_HEADER_V0(dump_type::hydro_dump, sp->id, sp->q/sp->m, fileIO); + WRITE_HEADER_V0(dump_type::hydro_dump, sp->id, sp->q/sp->m, fileIO, step(), rank(), nproc()); dim[0] = nxout+2; dim[1] = nyout+2; @@ -1928,7 +629,7 @@ vpic_simulation::hydro_dump( const char * speciesname, } else { // band_interleave - WRITE_HEADER_V0(dump_type::hydro_dump, sp->id, sp->q/sp->m, fileIO); + WRITE_HEADER_V0(dump_type::hydro_dump, sp->id, sp->q/sp->m, fileIO, step(), rank(), nproc()); dim[0] = nxout; dim[1] = nyout; diff --git a/src/vpic/dump.h b/src/vpic/dump.h new file mode 100644 index 00000000..966e627e --- /dev/null +++ b/src/vpic/dump.h @@ -0,0 +1,46 @@ +#ifndef dump_h +#define dump_h + +#include + +// TODO: should this be an enum? +namespace dump_type { + const int grid_dump = 0; + const int field_dump = 1; + const int hydro_dump = 2; + const int particle_dump = 3; + const int restart_dump = 4; + const int history_dump = 5; +} // namespace + +// TODO: namesapce? 
+std::array global_particle_index(int local_i, grid_t* grid, int rank) +{ + int ix, iy, iz, rx, ry, rz; + // Convert rank to local x/y/z + UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + // Calculate local ix/iy/iz + UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); + + // Account for the "first" ghost cell + ix = ix - 1; + iy = iy - 1; + iz = iz - 1; + + // Convert ix/iy/iz to global + int gix = ix + (grid->nx * (rx)); + int giy = iy + (grid->ny * (ry)); + int giz = iz + (grid->nz * (rz)); + + // calculate global grid sizes + int gnx = grid->nx * grid->gpx; + int gny = grid->ny * grid->gpy; + int gnz = grid->nz * grid->gpz; + + // TODO: find a better way to account for the hard coded ghosts in VOXEL + int global_i = VOXEL(gix, giy, giz, gnx-2, gny-2, gnz-2); + + return { global_i, gix, giy, giz }; +} +#endif diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h new file mode 100644 index 00000000..c745df7c --- /dev/null +++ b/src/vpic/dump_strategy.h @@ -0,0 +1,1365 @@ +#ifndef Dump_Strategy_h +#define Dump_Strategy_h + +#include "../util/io/FileIO.h" +#include "../util/util_base.h" +#include "../util/io/FileUtils.h" +#include "dump.h" +#include "dumpmacros.h" + +#ifdef VPIC_ENABLE_HDF5 +#include "hdf5.h" // from the lib +#include "hdf5_header_info.h" // from vpic +#endif + +#ifdef VPIC_ENABLE_OPENPMD +#include +#endif + +class Dump_Strategy { + public: + int rank, nproc; + + Dump_Strategy(int _rank, int _nproc) : rank(_rank), nproc(_nproc) { } // empty + + virtual void dump_fields( + const char *fbase, + int step, + grid_t* grid, + field_array_t* field_array, + int ftag + ); + virtual void dump_hydro( + const char *fbase, + int step, + hydro_array_t* hydro_array, + species_t* sp, + interpolator_array_t* interpolator_array, + grid_t* grid, + int ftag + ); + virtual void dump_particles( + const char *fbase, + species_t* sp, + grid_t* grid, + int step, + interpolator_array_t* interpolator_array, + int ftag + ); +}; + +class 
BinaryDump : public Dump_Strategy { + public: + using Dump_Strategy::Dump_Strategy; // inherit constructor + + // TODO: now we pass rank and step, ftag has odd semanticds + void dump_fields( + const char *fbase, + int step, + grid_t* grid, + field_array_t* field_array, + int ftag + ) + { + char fname[256]; + FileIO fileIO; + int dim[3]; + + if( !fbase ) ERROR(( "Invalid filename" )); + + if( rank==0 ) MESSAGE(( "Dumping fields to \"%s\"", fbase )); + + if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step, rank ); + else sprintf( fname, "%s.%i", fbase, rank ); + + FileIOStatus status = fileIO.open(fname, io_write); + if( status==fail ) ERROR(( "Could not open \"%s\".", fname )); + + /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ + size_t nxout = grid->nx; + size_t nyout = grid->ny; + size_t nzout = grid->nz; + float dxout = grid->dx; + float dyout = grid->dy; + float dzout = grid->dz; + + WRITE_HEADER_V0( dump_type::field_dump, -1, 0, fileIO, step , rank, nproc); + + dim[0] = grid->nx+2; + dim[1] = grid->ny+2; + dim[2] = grid->nz+2; + WRITE_ARRAY_HEADER( field_array->f, 3, dim, fileIO ); + fileIO.write( field_array->f, dim[0]*dim[1]*dim[2] ); + if( fileIO.close() ) ERROR(( "File close failed on dump fields!!!" 
)); + } + void dump_particles( + const char *fbase, + species_t* sp, + grid_t* grid, + int step, + interpolator_array_t* interpolator_array, + int ftag + ) + { + char fname[256]; + FileIO fileIO; + int dim[1], buf_start; + static particle_t * ALIGNED(128) p_buf = NULL; +# define PBUF_SIZE 32768 // 1MB of particles + + if( !sp ) ERROR(( "Invalid species name \"%s\".", sp->name )); + + if( !fbase ) ERROR(( "Invalid filename" )); + + if( !p_buf ) MALLOC_ALIGNED( p_buf, PBUF_SIZE, 128 ); + + if( rank==0 ) + MESSAGE(("Dumping \"%s\" particles to \"%s\"",sp->name,fbase)); + + if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step, rank ); + else sprintf( fname, "%s.%i", fbase, rank ); + FileIOStatus status = fileIO.open(fname, io_write); + if( status==fail ) ERROR(( "Could not open \"%s\"", fname )); + + /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ + size_t nxout = grid->nx; + size_t nyout = grid->ny; + size_t nzout = grid->nz; + float dxout = grid->dx; + float dyout = grid->dy; + float dzout = grid->dz; + + WRITE_HEADER_V0( dump_type::particle_dump, sp->id, sp->q/sp->m, fileIO, step, rank, nproc); + + dim[0] = sp->np; + WRITE_ARRAY_HEADER( p_buf, 1, dim, fileIO ); + + // Copy a PBUF_SIZE hunk of the particle list into the particle + // buffer, timecenter it and write it out. This is done this way to + // guarantee the particle list unchanged while not requiring too + // much memory. + + // FIXME: WITH A PIPELINED CENTER_P, PBUF NOMINALLY SHOULD BE QUITE + // LARGE. 
+ + particle_t * sp_p = sp->p; sp->p = p_buf; + int sp_np = sp->np; sp->np = 0; + int sp_max_np = sp->max_np; sp->max_np = PBUF_SIZE; + for( buf_start=0; buf_startnp = sp_np-buf_start; if( sp->np > PBUF_SIZE ) sp->np = PBUF_SIZE; + COPY( sp->p, &sp_p[buf_start], sp->np ); + center_p( sp, interpolator_array ); + fileIO.write( sp->p, sp->np ); + } + sp->p = sp_p; + sp->np = sp_np; + sp->max_np = sp_max_np; + + if( fileIO.close() ) ERROR(("File close failed on dump particles!!!")); + } + void dump_hydro( + const char *fbase, + int step, + hydro_array_t* hydro_array, + species_t* sp, + interpolator_array_t* interpolator_array, + grid_t* grid, + int ftag + ) + { + char fname[256]; + FileIO fileIO; + int dim[3]; + + if( !sp ) ERROR(( "Invalid species \"%s\"", sp->name )); + + clear_hydro_array( hydro_array ); + accumulate_hydro_p( hydro_array, sp, interpolator_array ); + synchronize_hydro_array( hydro_array ); + + if( !fbase ) ERROR(( "Invalid filename" )); + + if( rank==0 ) + MESSAGE(("Dumping \"%s\" hydro fields to \"%s\"",sp->name,fbase)); + + if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step, rank ); + else sprintf( fname, "%s.%i", fbase, rank ); + FileIOStatus status = fileIO.open(fname, io_write); + if( status==fail) ERROR(( "Could not open \"%s\".", fname )); + + /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ + size_t nxout = grid->nx; + size_t nyout = grid->ny; + size_t nzout = grid->nz; + float dxout = grid->dx; + float dyout = grid->dy; + float dzout = grid->dz; + + WRITE_HEADER_V0(dump_type::hydro_dump, sp->id, sp->q/sp->m, fileIO, step, rank, nproc); + + dim[0] = grid->nx+2; + dim[1] = grid->ny+2; + dim[2] = grid->nz+2; + WRITE_ARRAY_HEADER( hydro_array->h, 3, dim, fileIO ); + fileIO.write( hydro_array->h, dim[0]*dim[1]*dim[2] ); + if( fileIO.close() ) ERROR(( "File close failed on dump hydro!!!" 
)); + } +}; + +#ifdef VPIC_ENABLE_HDF5 +class HDF5Dump : public Dump_Strategy { + public: + using Dump_Strategy::Dump_Strategy; // inherit constructor +#define DUMP_DIR_FORMAT "./%s" + + /* define to do C-style indexing */ +#define hydro(x, y, z) hydro_array->h[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] + + // TODO: make function? +#define invert_field_xml_item(xml_file_name, speciesname_p, time_step, dims_4d, dims_3d, add_footer_flag) \ + { \ + FILE *fp; \ + fp = fopen(xml_file_name, "a"); \ + fprintf(fp, main_body_head, time_step); \ + if (field_dump_flag.enabledE()) \ + write_main_body_attribute(fp, main_body_attributeV, "E", dims_4d, dims_3d, speciesname_p, time_step, "ex", "ey", "ez"); \ + if (field_dump_flag.div_e_err) \ + fprintf(fp, main_body_attributeS, "div_e_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_e_err"); \ + if (field_dump_flag.enabledCB()) \ + write_main_body_attribute(fp, main_body_attributeV, "B", dims_4d, dims_3d, speciesname_p, time_step, "cbx", "cby", "cbz"); \ + if (field_dump_flag.div_b_err) \ + fprintf(fp, main_body_attributeS, "div_b_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_b_err"); \ + if (field_dump_flag.enabledTCA()) \ + write_main_body_attribute(fp, main_body_attributeV, "TCA", dims_4d, dims_3d, speciesname_p, time_step, "tcax", "tcay", "tcaz"); \ + if (field_dump_flag.rhob) \ + fprintf(fp, main_body_attributeS, "rhob", dims_3d, time_step, speciesname_p, time_step, time_step, "rhob"); \ + if (field_dump_flag.enabledJF()) \ + write_main_body_attribute(fp, main_body_attributeV, "JF", dims_4d, dims_3d, speciesname_p, time_step, "jfx", "jfy", "jfz"); \ + if (field_dump_flag.rhof) \ + fprintf(fp, main_body_attributeS, "rhof", dims_3d, time_step, speciesname_p, time_step, time_step, "rhof"); \ + if (field_dump_flag.enabledEMAT()) \ + write_main_body_attribute(fp, main_body_attributeV, "EMAT", dims_4d, dims_3d, speciesname_p, time_step, "ematx", "ematy", "ematz"); \ + if 
(field_dump_flag.nmat) \ + fprintf(fp, main_body_attributeS, "nmat", dims_3d, time_step, speciesname_p, time_step, time_step, "nmat"); \ + if (field_dump_flag.enabledFMAT()) \ + write_main_body_attribute(fp, main_body_attributeV, "FMAT", dims_4d, dims_3d, speciesname_p, time_step, "fmatx", "fmaty", "fmatz"); \ + if (field_dump_flag.cmat) \ + fprintf(fp, main_body_attributeS, "cmat", dims_3d, time_step, speciesname_p, time_step, time_step, "cmat"); \ + fprintf(fp, "%s", main_body_foot); \ + if (add_footer_flag) \ + fputs(footer, fp); \ + fclose(fp); \ + } + void dump_fields( + const char *fbase, + int step, + grid_t* grid, + field_array_t* field_array, + int ftag + ) + { + size_t step_for_viou = step; + + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + +#ifdef DUMP_INFO_DEBUG + printf("MPI rank = %d, size = %d \n", mpi_rank, mpi_size); + //printf("base dir for field: %s \n", fdParams.baseDir); + //printf("stride x y z = (%ld, %ld, %ld)\n", fdParams.stride_x, fdParams.stride_y, fdParams.stride_z); + printf("grid x, y z = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); + printf("domain loc (x0, y0, z0) -> (x1, y1, z1) = (%f, %f, %f) -> (%f, %f, %f) \n", grid->x0, grid->y0, grid->z0, grid->x1, grid->y1, grid->z1); + //printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, global->topology_y, global->topology_z); + printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid->sz, grid->nv); +#endif + +#define DUMP_FIELD_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ + { \ + dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ + temp_buf_index = 0; \ + for (size_t i(1); i < grid->nx + 1; i++) \ + { \ + for (size_t j(1); j < grid->ny + 1; j++) \ + { \ + for (size_t k(1); k < grid->nz + 1; k++) \ + { \ + temp_buf[temp_buf_index] = FIELD_ARRAY_NAME->fpp(i, j, k).ATTRIBUTE_NAME; \ + temp_buf_index = temp_buf_index 
+ 1; \ + } \ + } \ + } \ + dataspace_id = H5Dget_space(dset_id); \ + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ + H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ + H5Sclose(dataspace_id); \ + H5Dclose(dset_id); \ + } + + char fname[256]; + char field_scratch[128]; + char subfield_scratch[128]; + + sprintf(field_scratch, DUMP_DIR_FORMAT, "field_hdf5"); + FileUtils::makeDirectory(field_scratch); + sprintf(subfield_scratch, "%s/T.%zu/", field_scratch, step_for_viou); + FileUtils::makeDirectory(subfield_scratch); + + sprintf(fname, "%s/%s_%zu.h5", subfield_scratch, "fields", step_for_viou); + double el1 = uptime(); + hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); + H5Pclose(plist_id); + + sprintf(fname, "Timestep_%zu", step_for_viou); + hid_t group_id = H5Gcreate(file_id, fname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + + el1 = uptime() - el1; + //sim_log("TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts + double el2 = uptime(); + + /* + // Create a variable list of field values to output. 
+ size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); + size_t * varlist = new size_t[numvars]; + + for(size_t i(0), c(0); ifdParams.output_vars.bitset(i)) varlist[c++] = i; + + printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ + +#define fpp(x, y, z) f[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] + /* + typedef struct field { + float ex, ey, ez, div_e_err; // Electric field and div E error + float cbx, cby, cbz, div_b_err; // Magnetic field and div B error + float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density + float jfx, jfy, jfz, rhof; // Free current and charge density + material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes + material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers + } field_t;*/ + // Local voxel mesh resolution. Voxels are + // indexed FORTRAN style 0:nx+1,0:ny+1,0:nz+1 + // with voxels 1:nx,1:ny,1:nz being non-ghost + // voxels. + + float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); + hsize_t temp_buf_index; + hid_t dset_id; + //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; + plist_id = H5Pcreate(H5P_DATASET_XFER); + //Comment out for test only + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); + //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); + + //global->topology_x + + hsize_t field_global_size[3], field_local_size[3], global_offset[3], global_count[3]; + field_global_size[0] = (grid->nx * grid->gpx); + field_global_size[1] = (grid->ny * grid->gpy); + field_global_size[2] = (grid->nz * grid->gpz); + + field_local_size[0] = grid->nx; + field_local_size[1] = grid->ny; + field_local_size[2] = grid->nz; + + int gpx = grid->gpx; + int gpy = grid->gpy; + int gpz = grid->gpz; + + // Convert rank to local decomposition + int rx, ry, rz; + UNVOXEL(mpi_rank, 
rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + mpi_rank_x = rx; + mpi_rank_y = ry; + mpi_rank_z = rz; + + global_offset[0] = (grid->nx) * mpi_rank_x; + global_offset[1] = (grid->ny) * mpi_rank_y; + global_offset[2] = (grid->nz) * mpi_rank_z; + + global_count[0] = (grid->nx); + global_count[1] = (grid->ny); + global_count[2] = (grid->nz); + +#ifdef DUMP_INFO_DEBUG + printf("global size = %d %d %d \n", field_global_size[0], field_global_size[1], field_global_size[2]); + printf("global_offset = %d %d %d \n", global_offset[0], global_offset[1], global_offset[2]); + printf("global_count = %d %d %d \n", global_count[0], global_count[1], global_count[2]); + printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + fflush(stdout); +#endif + + hid_t filespace = H5Screate_simple(3, field_global_size, NULL); + hid_t memspace = H5Screate_simple(3, field_local_size, NULL); + hid_t dataspace_id; + + /* + typedef struct field { + float ex, ey, ez, div_e_err; // Electric field and div E error + float cbx, cby, cbz, div_b_err; // Magnetic field and div B error + float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density + float jfx, jfy, jfz, rhof; // Free current and charge density + material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes + material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers + } field_t;*/ + + if (field_dump_flag.ex) + DUMP_FIELD_TO_HDF5("ex", ex, H5T_NATIVE_FLOAT); + if (field_dump_flag.ey) + DUMP_FIELD_TO_HDF5("ey", ey, H5T_NATIVE_FLOAT); + if (field_dump_flag.ez) + DUMP_FIELD_TO_HDF5("ez", ez, H5T_NATIVE_FLOAT); + if (field_dump_flag.div_e_err) + DUMP_FIELD_TO_HDF5("div_e_err", div_e_err, H5T_NATIVE_FLOAT); + + if (field_dump_flag.cbx) + DUMP_FIELD_TO_HDF5("cbx", cbx, H5T_NATIVE_FLOAT); + if (field_dump_flag.cby) + DUMP_FIELD_TO_HDF5("cby", cby, H5T_NATIVE_FLOAT); + if (field_dump_flag.cbz) + DUMP_FIELD_TO_HDF5("cbz", cbz, H5T_NATIVE_FLOAT); + if 
(field_dump_flag.div_b_err) + DUMP_FIELD_TO_HDF5("div_b_err", div_b_err, H5T_NATIVE_FLOAT); + + if (field_dump_flag.tcax) + DUMP_FIELD_TO_HDF5("tcax", tcax, H5T_NATIVE_FLOAT); + if (field_dump_flag.tcay) + DUMP_FIELD_TO_HDF5("tcay", tcay, H5T_NATIVE_FLOAT); + if (field_dump_flag.tcaz) + DUMP_FIELD_TO_HDF5("tcaz", tcaz, H5T_NATIVE_FLOAT); + if (field_dump_flag.rhob) + DUMP_FIELD_TO_HDF5("rhob", rhob, H5T_NATIVE_FLOAT); + + if (field_dump_flag.jfx) + DUMP_FIELD_TO_HDF5("jfx", jfx, H5T_NATIVE_FLOAT); + if (field_dump_flag.jfy) + DUMP_FIELD_TO_HDF5("jfy", jfy, H5T_NATIVE_FLOAT); + if (field_dump_flag.jfz) + DUMP_FIELD_TO_HDF5("jfz", jfz, H5T_NATIVE_FLOAT); + if (field_dump_flag.rhof) + DUMP_FIELD_TO_HDF5("rhof", rhof, H5T_NATIVE_FLOAT); + + //H5T_NATIVE_SHORT for material_id (typedef int16_t material_id) + if (field_dump_flag.ematx) + DUMP_FIELD_TO_HDF5("ematx", ematx, H5T_NATIVE_SHORT); + if (field_dump_flag.ematy) + DUMP_FIELD_TO_HDF5("ematy", ematy, H5T_NATIVE_SHORT); + if (field_dump_flag.ematz) + DUMP_FIELD_TO_HDF5("ematz", ematz, H5T_NATIVE_SHORT); + if (field_dump_flag.nmat) + DUMP_FIELD_TO_HDF5("nmat", nmat, H5T_NATIVE_SHORT); + + if (field_dump_flag.fmatx) + DUMP_FIELD_TO_HDF5("fmatx", fmatx, H5T_NATIVE_SHORT); + if (field_dump_flag.fmaty) + DUMP_FIELD_TO_HDF5("fmaty", fmaty, H5T_NATIVE_SHORT); + if (field_dump_flag.fmatz) + DUMP_FIELD_TO_HDF5("fmatz", fmatz, H5T_NATIVE_SHORT); + if (field_dump_flag.cmat) + DUMP_FIELD_TO_HDF5("cmat", cmat, H5T_NATIVE_SHORT); + + el2 = uptime() - el2; + //sim_log("TimeHDF5Write: " << el2 << " s"); + + double el3 = uptime(); + + //Write metadata (geo original and geo dx/dy/dz) for ArrayUDF + float attr_data[2][3]; + attr_data[0][0] = grid->x0; + attr_data[0][1] = grid->y0; + attr_data[0][2] = grid->z0; + attr_data[1][0] = grid->dx; + attr_data[1][1] = grid->dy; + attr_data[1][2] = grid->dz; + hsize_t dims[2]; + dims[0] = 2; + dims[1] = 3; + hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); + hid_t va_geo_attribute_id 
= H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); + H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); + H5Sclose(va_geo_dataspace_id); + H5Aclose(va_geo_attribute_id); + + free(temp_buf); + H5Sclose(filespace); + H5Sclose(memspace); + H5Pclose(plist_id); + H5Gclose(group_id); + H5Fclose(file_id); + + el3 = uptime() - el3; + //sim_log("TimeHDF5Close: " << el3 << " s"); + + if (mpi_rank == 0) + { + char const *output_xml_file = "./field_hdf5/hdf5_field.xdmf"; + char dimensions_3d[128]; + sprintf(dimensions_3d, "%lld %lld %lld", field_global_size[0], field_global_size[1], field_global_size[2]); + char dimensions_4d[128]; + sprintf(dimensions_4d, "%lld %lld %lld %d", field_global_size[0], field_global_size[1], field_global_size[2], 3); + char orignal[128]; + sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); + char dxdydz[128]; + sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); + + int nframes = num_step / field_interval + 1; + static int field_tframe = 0; + +#ifdef DUMP_INFO_DEBUG + printf(" meta file : %s \n", output_xml_file); + printf(" array dims per var: %s \n", dimensions_3d); + printf("array dims all vars: %s \n", dimensions_4d); + printf(" orignal: %s \n", orignal); + printf(" dxdydz: %s \n", dxdydz); + printf(" nframes: %d \n", nframes); + printf(" field_interval: %d \n", field_interval); + printf(" current step: %lld \n", step_for_viou); + printf(" current step: %lld \n", step_for_viou); + + //printf(" Simulation time: %f \n", grid->t0); + printf(" tframe: %d \n", field_tframe); +#endif + + // TODO: this footer dumping is more likely better done in a + // destructor, rather than hoping a multiple division works out + if (field_tframe >= 1) + { + if (field_tframe == (nframes - 1)) + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, 
dimensions_4d, dimensions_3d, 0); + } + } + else + { + create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, field_interval); + if (field_tframe == (nframes - 1)) + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 0); + } + } + field_tframe++; + } + } + void dump_particles( + const char *fbase, + species_t* sp, + grid_t* grid, + int step, + interpolator_array_t* interpolator_array, + int ftag + ) + { + size_t step_for_viou = step; + char fname[256]; + char group_name[256]; + char particle_scratch[128]; + char subparticle_scratch[128]; + + int np_local; + + float *Pf; + int *Pi; + + // get the total number of particles. in this example, output only electrons + //sp = species_list; + sprintf(particle_scratch, DUMP_DIR_FORMAT, "particle_hdf5"); + FileUtils::makeDirector(particle_scratch); + sprintf(subparticle_scratch, "%s/T.%ld/", particle_scratch, step_for_viou); + FileUtils::makeDirector(subparticle_scratch); + + // TODO: Allow the user to set this + int stride_particle_dump = 1; + + np_local = (sp->np + stride_particle_dump - 1) / stride_particle_dump; + + // make a copy of the part of particle data to be dumped + double ec1 = uptime(); + + int sp_np = sp->np; + int sp_max_np = sp->max_np; + particle_t *ALIGNED(128) p_buf = NULL; + if (!p_buf) + MALLOC_ALIGNED(p_buf, np_local, 128); + particle_t *sp_p = sp->p; + sp->p = p_buf; + sp->np = np_local; + sp->max_np = np_local; + + for (long long iptl = 0, i = 0; iptl < sp_np; iptl += stride_particle_dump, ++i) + { + COPY(&sp->p[i], &sp_p[iptl], 1); + } + + center_p(sp, interpolator_array); + + ec1 = uptime() - ec1; + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + //std::cout << "on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local << std::endl; + 
//sim_log("on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local); + + Pf = (float *)sp->p; + Pi = (int *)sp->p; + + // open HDF5 file in "particle/T./" subdirectory + // filename: eparticle.h5p + sprintf(fname, "%s/%s_%ld.h5", subparticle_scratch, sp->name, step_for_viou); + sprintf(group_name, "/Timestep_%ld", step_for_viou); + double el1 = uptime(); + + hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); + hid_t group_id = H5Gcreate(file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + + H5Pclose(plist_id); + + long long total_particles, offset; + long long numparticles = np_local; + MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + offset -= numparticles; + + hid_t filespace = H5Screate_simple(1, (hsize_t *)&total_particles, NULL); + + hsize_t memspace_count_temp = numparticles * 8; + hid_t memspace = H5Screate_simple(1, &memspace_count_temp, NULL); + + // Don't need, can just use H5S_ALL + //hsize_t linearspace_count_temp = numparticles; + //hid_t linearspace = H5Screate_simple(1, &linearspace_count_temp, NULL); + + plist_id = H5Pcreate(H5P_DATASET_XFER); + + //Comment out for test only + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); + H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *)&offset, NULL, (hsize_t *)&numparticles, NULL); + + hsize_t memspace_start = 0, memspace_stride = 8, memspace_count = np_local; + H5Sselect_hyperslab(memspace, H5S_SELECT_SET, &memspace_start, &memspace_stride, &memspace_count, NULL); + + el1 = uptime() - el1; + //sim_log("Particle TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts + + //double el2 = uptime(); + + // This point offset is silly, and loses the type safety (pf+1) + hid_t 
dset_id = H5Dcreate(group_id, "dX", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + int ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf); + H5Dclose(dset_id); + + dset_id = H5Dcreate(group_id, "dY", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 1); + H5Dclose(dset_id); + + dset_id = H5Dcreate(group_id, "dZ", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 2); + H5Dclose(dset_id); + +#define OUTPUT_CONVERT_GLOBAL_ID 1 +#ifdef OUTPUT_CONVERT_GLOBAL_ID + // TODO: make a function out of this too, its used in openpmd + std::vector global_pi; + global_pi.reserve(numparticles); + // TODO: this could be parallel + for (int i = 0; i < numparticles; i++) + { + int local_i = sp->p[i].i; + + int ix, iy, iz, rx, ry, rz; + + // Convert rank to local x/y/z + UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + // Calculate local ix/iy/iz + UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); + + // Convert ix/iy/iz to global + int gix = ix + (grid->nx * (rx)); + int giy = iy + (grid->ny * (ry)); + int giz = iz + (grid->nz * (rz)); + + // calculate global grid sizes + int gnx = grid->nx * grid->gpx; + int gny = grid->ny * grid->gpy; + int gnz = grid->nz * grid->gpz; + + // TODO: find a better way to account for the hard coded ghosts in VOXEL + int global_i = VOXEL(gix, giy, giz, gnx-2, gny-2, gnz-2); + + //std::cout << rank << " local i " << local_i << " becomes " << global_i << std::endl; + global_pi[i] = global_i; + } + +#undef UNVOXEL + dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, H5S_ALL, filespace, plist_id, global_pi.data()); + H5Dclose(dset_id); + +#else + dset_id = H5Dcreate(group_id, "i", 
H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, memspace, filespace, plist_id, Pi + 3); + H5Dclose(dset_id); +#endif + + dset_id = H5Dcreate(group_id, "Ux", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 4); + H5Dclose(dset_id); + + dset_id = H5Dcreate(group_id, "Uy", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 5); + H5Dclose(dset_id); + + dset_id = H5Dcreate(group_id, "Uz", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 6); + H5Dclose(dset_id); + + dset_id = H5Dcreate(group_id, "q", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 7); + H5Dclose(dset_id); + + //el2 = uptime() - el2; + //sim_log("Particle TimeHDF5Write: " << el2 << " s"); + + double el3 = uptime(); + H5Sclose(memspace); + H5Sclose(filespace); + H5Pclose(plist_id); + H5Gclose(group_id); + H5Fclose(file_id); + el3 = uptime() - el3; + //sim_log("Particle TimeHDF5Close: " << el3 << " s"); + + sp->p = sp_p; + sp->np = sp_np; + sp->max_np = sp_max_np; + FREE_ALIGNED(p_buf); + + // Write metadata if step() == 0 + char meta_fname[256]; + + sprintf(meta_fname, "%s/grid_metadata_%s_%ld.h5", subparticle_scratch, sp->name, step_for_viou); + + double meta_el1 = uptime(); + + hid_t meta_plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(meta_plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + hid_t meta_file_id = H5Fcreate(meta_fname, H5F_ACC_TRUNC, H5P_DEFAULT, meta_plist_id); + hid_t meta_group_id = H5Gcreate(meta_file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + H5Pclose(meta_plist_id); + + long long meta_total_particles, 
meta_offset; + long long meta_numparticles = 1; + MPI_Allreduce(&meta_numparticles, &meta_total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + MPI_Scan(&meta_numparticles, &meta_offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + meta_offset -= meta_numparticles; + + hid_t meta_filespace = H5Screate_simple(1, (hsize_t *)&meta_total_particles, NULL); + hid_t meta_memspace = H5Screate_simple(1, (hsize_t *)&meta_numparticles, NULL); + meta_plist_id = H5Pcreate(H5P_DATASET_XFER); + H5Pset_dxpl_mpio(meta_plist_id, H5FD_MPIO_COLLECTIVE); + H5Sselect_hyperslab(meta_filespace, H5S_SELECT_SET, (hsize_t *)&meta_offset, NULL, (hsize_t *)&meta_numparticles, NULL); + meta_el1 = uptime() - meta_el1; + //sim_log("Metafile TimeHDF5Open): " << meta_el1 << " s"); //Easy to handle results for scripts + + double meta_el2 = uptime(); + + hid_t meta_dset_id = H5Dcreate(meta_group_id, "np_local", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, (int32_t *)&np_local); + H5Dclose(meta_dset_id); + //if (rank == 0) printf ("Written variable dX \n"); + + meta_dset_id = H5Dcreate(meta_group_id, "nx", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->nx); + H5Dclose(meta_dset_id); + //if (rank == 0) printf ("Written variable dY \n"); + + meta_dset_id = H5Dcreate(meta_group_id, "ny", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->ny); + H5Dclose(meta_dset_id); + //if (rank == 0) printf ("Written variable dZ \n"); + + meta_dset_id = H5Dcreate(meta_group_id, "nz", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, 
&grid->nz); + H5Dclose(meta_dset_id); + //if (rank == 0) printf ("Written variable i \n"); + + meta_dset_id = H5Dcreate(meta_group_id, "x0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->x0); + H5Dclose(meta_dset_id); + + meta_dset_id = H5Dcreate(meta_group_id, "y0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->y0); + H5Dclose(meta_dset_id); + + meta_dset_id = H5Dcreate(meta_group_id, "z0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->z0); + H5Dclose(meta_dset_id); + + meta_dset_id = H5Dcreate(meta_group_id, "dx", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dx); + H5Dclose(meta_dset_id); + + meta_dset_id = H5Dcreate(meta_group_id, "dy", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dy); + H5Dclose(meta_dset_id); + + meta_dset_id = H5Dcreate(meta_group_id, "dz", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dz); + H5Dclose(meta_dset_id); + + meta_el2 = uptime() - meta_el2; + //sim_log("Metafile TimeHDF5Write: " << meta_el2 << " s"); + double meta_el3 = uptime(); + H5Sclose(meta_memspace); + H5Sclose(meta_filespace); + H5Pclose(meta_plist_id); + H5Gclose(meta_group_id); + H5Fclose(meta_file_id); + meta_el3 = uptime() - meta_el3; + //sim_log("Metafile TimeHDF5Close: " << meta_el3 << " s"); + + 
} + + void dump_hydro( + const char *fbase, + int step, + hydro_array_t* hydro_array, + species_t* sp, + interpolator_array_t* interpolator_array, + grid_t* grid, + int ftag + ) + { + size_t step_for_viou = step; + +#define DUMP_HYDRO_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ + { \ + dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ + temp_buf_index = 0; \ + for (size_t i(1); i < grid->nx + 1; i++) \ + { \ + for (size_t j(1); j < grid->ny + 1; j++) \ + { \ + for (size_t k(1); k < grid->nz + 1; k++) \ + { \ + temp_buf[temp_buf_index] = hydro(i, j, k).ATTRIBUTE_NAME; \ + temp_buf_index = temp_buf_index + 1; \ + } \ + } \ + } \ + dataspace_id = H5Dget_space(dset_id); \ + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ + H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ + H5Sclose(dataspace_id); \ + H5Dclose(dset_id); \ + } + //#define DUMP_INFO_DEBUG 1 + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + species_t *sp = find_species_name(speciesname, species_list); + if (!sp) + ERROR(("Invalid species name: %s", speciesname)); + + clear_hydro_array(hydro_array); + accumulate_hydro_p(hydro_array, sp, interpolator_array); + synchronize_hydro_array(hydro_array); + + char hname[256]; + char hydro_scratch[128]; + char subhydro_scratch[128]; + + sprintf(hydro_scratch, "./%s", "hydro_hdf5"); + FileUtils::makeDirector(hydro_scratch); + sprintf(subhydro_scratch, "%s/T.%zu/", hydro_scratch, step_for_viou); + FileUtils::makeDirector(subhydro_scratch); + + sprintf(hname, "%s/hydro_%s_%zu.h5", subhydro_scratch, speciesname, step_for_viou); + double el1 = uptime(); + hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + hid_t file_id = H5Fcreate(hname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); + H5Pclose(plist_id); + + 
sprintf(hname, "Timestep_%zu", step_for_viou); + hid_t group_id = H5Gcreate(file_id, hname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + + el1 = uptime() - el1; + //sim_log("TimeHDF5Open: " << el1 << " s"); //Easy to handle results for scripts + //double el2 = uptime(); + + // Create a variable list of field values to output. + //size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); + //size_t *varlist = new size_t[numvars]; + + //for (size_t i(0), c(0); i < total_field_variables; i++) + // if (global->fdParams.output_vars.bitset(i)) + // varlist[c++] = i; + + //printf("\nBEGIN_OUTPUT: numvars = %zu \n", numvars); + + + //typedef struct hydro { + // float jx, jy, jz, rho; // Current and charge density => , + // float px, py, pz, ke; // Momentum and K.E. density => , + // float txx, tyy, tzz; // Stress diagonal => , i==j + // float tyz, tzx, txy; // Stress off-diagonal => , i!=j + // float _pad[2]; // 16-byte align + //} hydro_t; + + //typedef struct hydro_array { + // hydro_t * ALIGNED(128) h; + // grid_t * g; + //} hydro_array_t; + + float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); + hsize_t temp_buf_index; + hid_t dset_id; + //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; + plist_id = H5Pcreate(H5P_DATASET_XFER); + //Comment out for test only + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); + //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); + + //global->topology_x + + hsize_t hydro_global_size[3], hydro_local_size[3], global_offset[3], global_count[3]; + hydro_global_size[0] = (grid->nx * grid->gpx); + hydro_global_size[1] = (grid->ny * grid->gpy); + hydro_global_size[2] = (grid->nz * grid->gpz); + + hydro_local_size[0] = grid->nx; + hydro_local_size[1] = grid->ny; + hydro_local_size[2] = grid->nz; + + int mpi_rank_x, mpi_rank_y, 
mpi_rank_z; + RANK_TO_INDEX2(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + + global_offset[0] = (grid->nx) * mpi_rank_x; + global_offset[1] = (grid->ny) * mpi_rank_y; + global_offset[2] = (grid->nz) * mpi_rank_z; + + global_count[0] = (grid->nx); + global_count[1] = (grid->ny); + global_count[2] = (grid->nz); + +#ifdef DUMP_INFO_DEBUG + printf("global size = %d %d %d \n", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); + printf("global_offset = %d %d %d \n", global_offset[0], global_offset[1], global_offset[2]); + printf("global_count = %d %d %d \n", global_count[0], global_count[1], global_count[2]); + printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + fflush(stdout); +#endif + + hid_t filespace = H5Screate_simple(3, hydro_global_size, NULL); + hid_t memspace = H5Screate_simple(3, hydro_local_size, NULL); + hid_t dataspace_id; + + //typedef struct hydro { + // float jx, jy, jz, rho; // Current and charge density => , + // float px, py, pz, ke; // Momentum and K.E. 
density => , + // float txx, tyy, tzz; // Stress diagonal => , i==j + // float tyz, tzx, txy; // Stress off-diagonal => , i!=j + // float _pad[2]; // 16-byte align + //} hydro_t; + + if (hydro_dump_flag.jx) + DUMP_HYDRO_TO_HDF5("jx", jx, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.jy) + DUMP_HYDRO_TO_HDF5("jy", jy, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.jz) + DUMP_HYDRO_TO_HDF5("jz", jz, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.rho) + DUMP_HYDRO_TO_HDF5("rho", rho, H5T_NATIVE_FLOAT); + + if (hydro_dump_flag.px) + DUMP_HYDRO_TO_HDF5("px", px, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.py) + DUMP_HYDRO_TO_HDF5("py", py, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.pz) + DUMP_HYDRO_TO_HDF5("pz", pz, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.ke) + DUMP_HYDRO_TO_HDF5("ke", ke, H5T_NATIVE_FLOAT); + + if (hydro_dump_flag.txx) + DUMP_HYDRO_TO_HDF5("txx", txx, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.tyy) + DUMP_HYDRO_TO_HDF5("tyy", tyy, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.tzz) + DUMP_HYDRO_TO_HDF5("tzz", tzz, H5T_NATIVE_FLOAT); + + if (hydro_dump_flag.tyz) + DUMP_HYDRO_TO_HDF5("tyz", tyz, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.tzx) + DUMP_HYDRO_TO_HDF5("tzx", tzx, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.txy) + DUMP_HYDRO_TO_HDF5("txy", txy, H5T_NATIVE_FLOAT); + + //el2 = uptime() - el2; + //sim_log("TimeHDF5Write: " << el2 << " s"); + + double el3 = uptime(); + + //Write metadata (geo original and geo dx/dy/dz) for ArrayUDF + float attr_data[2][3]; + attr_data[0][0] = grid->x0; + attr_data[0][1] = grid->y0; + attr_data[0][2] = grid->z0; + attr_data[1][0] = grid->dx; + attr_data[1][1] = grid->dy; + attr_data[1][2] = grid->dz; + hsize_t dims[2]; + dims[0] = 2; + dims[1] = 3; + hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); + hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); + H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); + H5Sclose(va_geo_dataspace_id); + 
H5Aclose(va_geo_attribute_id); + + free(temp_buf); + H5Sclose(filespace); + H5Sclose(memspace); + H5Pclose(plist_id); + H5Gclose(group_id); + H5Fclose(file_id); + + el3 = uptime() - el3; + //sim_log("TimeHDF5Close: " << el3 << " s"); + + if (mpi_rank == 0) + { + char output_xml_file[128]; + sprintf(output_xml_file, "./%s/%s%s%s", "hydro_hdf5", "hydro-", speciesname, ".xdmf"); + char dimensions_3d[128]; + sprintf(dimensions_3d, "%lld %lld %lld", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); + char dimensions_4d[128]; + sprintf(dimensions_4d, "%lld %lld %lld %d", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2], 3); + char orignal[128]; + sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); + char dxdydz[128]; + sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); + + int nframes = num_step / hydro_interval + 1; + + const int tframe = tframe_map[sp->id]; + +#ifdef DUMP_INFO_DEBUG + printf(" meta file : %s \n", output_xml_file); + printf(" array dims per var: %s \n", dimensions_3d); + printf("array dims all vars: %s \n", dimensions_4d); + printf(" orignal: %s \n", orignal); + printf(" dxdydz: %s \n", dxdydz); + printf(" nframes: %d \n", nframes); + printf(" hydro_fields_interval: %d \n", hydro_interval); + printf(" current step: %lld \n", step_for_viou); + printf(" Simulation time: %f \n", grid->t0); + printf(" tframe: %d \n", tframe); +#endif + + char speciesname_new[128]; + sprintf(speciesname_new, "hydro_%s", speciesname); + if (tframe >= 1) + { + if (tframe == (nframes - 1)) + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); + } + } + else + { + create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, hydro_interval); + if (tframe == (nframes - 1)) + { + invert_hydro_xml_item(output_xml_file, speciesname_new, 
step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); + } + } + tframe_map[sp->id]++; + } + } +}; +#endif + +#ifdef VPIC_ENABLE_OPENPMD +class OpenPMDDump : public Dump_Strategy { + public: + static openPMD::Series* series; + using Dump_Strategy::Dump_Strategy; // inherit constructor + void dump_fields( + const char *fbase, + int step, + grid_t* grid, + field_array_t* field_array, + int ftag + ) + { + std::cout << "Writing openPMD data" << std::endl; + + if (series == nullptr) { + std::cout << "init series" << std::endl; + series = new openPMD::Series( + fbase, + openPMD::AccessType::CREATE, + MPI_COMM_WORLD + ); + } + + std::cout << "Writing itration " << step << std::endl; + auto i = series->iterations[ step ]; + // TODO: it would be nice to set these... + //series.setAuthor( "Axel Huebl "); + //series.setMachine( "Hall Probe 5000, Model 3"); + i.setAttribute( "vacuum", true); + + auto cB = i.meshes["B"]; + auto E = i.meshes["E"]; + auto J = i.meshes["J"]; + + // record components + auto cbx = cB["x"]; + auto cby = cB["y"]; + auto cbz = cB["z"]; + + auto Ex = E["x"]; + auto Ey = E["y"]; + auto Ez = E["z"]; + + auto Jx = J["x"]; + auto Jy = J["y"]; + auto Jz = J["z"]; + + // TODO: set unitDimension so the anaylsis software knows what fields + // things are + + size_t gnx = (grid->nx * grid->gpx); + size_t gny = (grid->ny * grid->gpy); + size_t gnz = (grid->nz * grid->gpz); + openPMD::Extent global_extent = {gny, gny, gnz}; + + openPMD::Datatype datatype = openPMD::determineDatatype(); + openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); + + cbx.resetDataset(dataset); + cby.resetDataset(dataset); + cbz.resetDataset(dataset); + + Ex.resetDataset(dataset); + Ey.resetDataset(dataset); + Ez.resetDataset(dataset); + + Jx.resetDataset(dataset); + Jy.resetDataset(dataset); + Jz.resetDataset(dataset); + + // Convert rank to local x/y/z + 
int rx, ry, rz; + UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + size_t nx = grid->nx; + size_t ny = grid->ny; + size_t nz = grid->nz; + + // NOTE: this assumes a static mesh decomposition in nx/ny/nz + size_t global_offset_x = (nx) * rx; + size_t global_offset_y = (ny) * ry; + size_t global_offset_z = (nz) * rz; + + openPMD::Offset chunk_offset = {global_offset_x, global_offset_y, global_offset_z}; + openPMD::Extent chunk_extent = {nx, ny, nz}; + + // Store a local copy of the data which we pull out of the AoS + std::vector cbx_data; + std::vector cby_data; + std::vector cbz_data; + + std::vector ex_data; + std::vector ey_data; + std::vector ez_data; + + std::vector jx_data; + std::vector jy_data; + std::vector jz_data; + + size_t nv = nx * ny * nz; + + cbx_data.reserve(nv); + cby_data.reserve(nv); + cbz_data.reserve(nv); + + ex_data.reserve(nv); + ey_data.reserve(nv); + ez_data.reserve(nv); + + jx_data.reserve(nv); + jy_data.reserve(nv); + jz_data.reserve(nv); + + // TODO: make this AoS to SoA conversion a function + + // We could do 1D here, but we don't really care about the ghosts, and we + // can thread over nz/ny (collapsed?) 
+ // Go over non-ghosts and grab just that data into a dense array + for (size_t k = 1; k < grid->nz + 1; k++) + { + for (size_t j = 1; j < grid->ny + 1; j++) + { + for (size_t i = 1; i < grid->nx + 1; i++) + { + int local_index = VOXEL(i-1, j-1, k-1, grid->nx-2, grid->ny-2, grid->nz-2); + int global_index = VOXEL(i, j, k, grid->nx, grid->ny, grid->nz); + + cbx_data[local_index] = field_array->f[global_index].cbx; + cby_data[local_index] = field_array->f[global_index].cby; + cbz_data[local_index] = field_array->f[global_index].cbz; + + ex_data[local_index] = field_array->f[global_index].ex; + ey_data[local_index] = field_array->f[global_index].ey; + ez_data[local_index] = field_array->f[global_index].ez; + + jx_data[local_index] = field_array->f[global_index].jfx; + jy_data[local_index] = field_array->f[global_index].jfy; + jz_data[local_index] = field_array->f[global_index].jfz; + } + } + } + + cbx.storeChunk( cbx_data, chunk_offset, chunk_extent); + cby.storeChunk( cby_data, chunk_offset, chunk_extent); + cbz.storeChunk( cbz_data, chunk_offset, chunk_extent); + + Ex.storeChunk( ex_data, chunk_offset, chunk_extent); + Ey.storeChunk( ey_data, chunk_offset, chunk_extent); + Ez.storeChunk( ez_data, chunk_offset, chunk_extent); + + Jx.storeChunk( jx_data, chunk_offset, chunk_extent); + Jy.storeChunk( jy_data, chunk_offset, chunk_extent); + Jz.storeChunk( jz_data, chunk_offset, chunk_extent); + + series->flush(); + } + void dump_particles( + const char *fbase, + species_t* sp, + grid_t* grid, + int step, + interpolator_array_t* interpolator_array, + int ftag + ) + { + if (series == nullptr) { + std::cout << "init series" << std::endl; + series = new openPMD::Series( + fbase, + openPMD::AccessType::CREATE, + MPI_COMM_WORLD + ); + } + + auto i = series->iterations[ step ]; + + // TODO: set these + i.setTime( (float)step ); + i.setDt(1.0); + i.setTimeUnitSI(1.0); + + auto& p = i.particles[sp->name]; + + const int np = sp->np; + + // TODO: this could be a function call as 
it's used elsewhere (in hdf5) + unsigned long long total_particles, offset; + unsigned long long numparticles = np; + MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + offset -= numparticles; + + openPMD::Extent global_extent = {total_particles}; + openPMD::Datatype datatype = openPMD::determineDatatype(); + openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); + + auto px = p["position"]["x"]; + auto pxo = p["positionOffset"]["x"]; + + px.resetDataset(dataset); + pxo.resetDataset(dataset); + + // convert data to SoA, allowing the user to chunk the operation + const int max_chunk = 32768*8; // 1MB SoA + // Loop over all particles in chunks + for (int i = 0; i < np; i += max_chunk) + { + // We have to be careful as the last chunk may not be full + // Find how many are left and do that many + size_t to_write = std::min(np-i, max_chunk); + + // Convert the chunk ready to write + std::vector x_pos; + std::vector x_off; + x_pos.reserve(to_write); + x_off.reserve(to_write); + + for (int j = 0; j < to_write; j++) + { + // TODO: do I need to center the particles? 
+ auto& particle = sp->p[i+j]; + x_pos[j] = particle.dx; + std::array gi = global_particle_index(particle.i, grid, rank); + x_off[j] = (float)gi[1]; + } + + // Base offset plus i to account for chunks + auto o = openPMD::Offset{offset + i}; + auto e = openPMD::Extent{to_write}; + px.storeChunk(x_pos, o, e); + pxo.storeChunk(x_off, o, e); + } + + + } + void dump_hydro( + const char *fbase, + int step, + hydro_array_t* hydro_array, + species_t* sp, + interpolator_array_t* interpolator_array, + grid_t* grid, + int ftag + ) + { + } +}; +#endif + +/* + template + struct IODump : private Policy { + using Policy::dump_particles; + using Policy::dump_fields; + using Policy::dump_hydro; + }; + */ + +#endif diff --git a/src/vpic/dumpmacros.h b/src/vpic/dumpmacros.h index 9e46bf6b..bbb1b743 100644 --- a/src/vpic/dumpmacros.h +++ b/src/vpic/dumpmacros.h @@ -4,7 +4,7 @@ /* FIXME: WHEN THESE MACROS WERE HOISTED AND VARIOUS HACKS DONE TO THEM THEY BECAME _VERY_ _DANGEROUS. */ -#define WRITE_HEADER_V0(dump_type,sp_id,q_m,fileIO) do { \ +#define WRITE_HEADER_V0(dump_type,sp_id,q_m,fileIO,step,rank,nproc) do { \ /* Binary compatibility information */ \ WRITE( char, CHAR_BIT, fileIO ); \ WRITE( char, sizeof(short int), fileIO ); \ @@ -19,7 +19,7 @@ WRITE( int, 0 /* Version */, fileIO ); \ WRITE( int, dump_type, fileIO ); \ /* High level information */ \ - WRITE( int, step(), fileIO ); \ + WRITE( int, step, fileIO ); \ WRITE( int, nxout, fileIO ); \ WRITE( int, nyout, fileIO ); \ WRITE( int, nzout, fileIO ); \ @@ -33,21 +33,21 @@ WRITE( float, grid->cvac, fileIO ); \ WRITE( float, grid->eps0, fileIO ); \ WRITE( float, 0 /* damp */, fileIO ); \ - WRITE( int, rank(), fileIO ); \ - WRITE( int, nproc(), fileIO ); \ + WRITE( int, rank, fileIO ); \ + WRITE( int, nproc, fileIO ); \ /* Species parameters */ \ WRITE( int, sp_id, fileIO ); \ WRITE( float, q_m, fileIO ); \ } while(0) - + // Note dim _MUST_ be a pointer to an int - + #define WRITE_ARRAY_HEADER(p,ndim,dim,fileIO) do { \ WRITE( 
int, sizeof(p[0]), fileIO ); \ WRITE( int, ndim, fileIO ); \ fileIO.write( dim, ndim ); \ } while(0) - + // The WRITE macro copies the output "value" into a temporary variable // of the requested output "type" so that the write to the "file" // occurs from a known binary data type. For example, if grid.dx were @@ -60,12 +60,12 @@ // single precision write copies. However, specialty types could be // created so that the type cast __WRITE_tmp = (type)(value) // automatically does the underlying conversion in C++ - + #define WRITE(type,value,fileIO) do { \ type __WRITE_tmp = (type)(value); \ fileIO.write( &__WRITE_tmp, 1 ); \ } while(0) - + // Note: strlen does not include the terminating \0 #define WRITE_STRING(string,fileIO) do { \ int __WRITE_STRING_len = 0; \ @@ -74,103 +74,102 @@ if( __WRITE_STRING_len>0 ) \ fileIO.write( string, __WRITE_STRING_len ); \ } while(0) - + #define READ(type,value,fileIO) do { \ type __READ_tmp; \ fileIO.read(&__READ_tmp, 1 ); \ (value) = __READ_tmp; \ } while(0) -#define F_WRITE_HEADER_V0(dump_type,sp_id,q_m,fileIO) do { \ - /* Binary compatibility information */ \ - F_WRITE( char, CHAR_BIT, fileIO ); \ - F_WRITE( char, sizeof(short int), fileIO ); \ - F_WRITE( char, sizeof(int), fileIO ); \ - F_WRITE( char, sizeof(float), fileIO ); \ - F_WRITE( char, sizeof(double), fileIO ); \ - F_WRITE( short int, 0xcafe, fileIO ); \ - F_WRITE( int, 0xdeadbeef, fileIO ); \ - F_WRITE( float, 1.0, fileIO ); \ - F_WRITE( double, 1.0, fileIO ); \ - /* Dump type and header format version */ \ - F_WRITE( int, 0 /* Version */, fileIO ); \ - F_WRITE( int, dump_type, fileIO ); \ - /* High level information */ \ - F_WRITE( int, step(), fileIO ); \ - F_WRITE( int, imxstr-2, fileIO ); \ - F_WRITE( int, jmxstr-2, fileIO ); \ - F_WRITE( int, kmxstr-2, fileIO ); \ - F_WRITE( float, grid->dt, fileIO ); \ - F_WRITE( float, dxstr, fileIO ); \ - F_WRITE( float, dystr, fileIO ); \ - F_WRITE( float, dzstr, fileIO ); \ - F_WRITE( float, grid->x0, fileIO ); \ - F_WRITE( 
float, grid->y0, fileIO ); \ - F_WRITE( float, grid->z0, fileIO ); \ - F_WRITE( float, grid->cvac, fileIO ); \ - F_WRITE( float, grid->eps0, fileIO ); \ - F_WRITE( float, 0 /*damp*/, fileIO ); \ - F_WRITE( int, rank(), fileIO ); \ - F_WRITE( int, nproc(), fileIO ); \ - /* Species parameters */ \ - F_WRITE( int, sp_id, fileIO ); \ - F_WRITE( float, q_m, fileIO ); \ - } while(0) - -#define F_WRITE_HEADER_PAR(dump_type,sp_id,q_m,fileIO) do { \ - /* Binary compatibility information */ \ - F_WRITE( char, CHAR_BIT, fileIO ); \ - F_WRITE( char, sizeof(short int), fileIO ); \ - F_WRITE( char, sizeof(int), fileIO ); \ - F_WRITE( char, sizeof(float), fileIO ); \ - F_WRITE( char, sizeof(double), fileIO ); \ - F_WRITE( short int, 0xcafe, fileIO ); \ - F_WRITE( int, 0xdeadbeef, fileIO ); \ - F_WRITE( float, 1.0, fileIO ); \ - F_WRITE( double, 1.0, fileIO ); \ - /* Dump type and header format version */ \ - F_WRITE( int, 0 /* Version */, fileIO ); \ - F_WRITE( int, dump_type, fileIO ); \ - /* High level information */ \ - F_WRITE( int, step(), fileIO ); \ - F_WRITE( int, grid->nx, fileIO ); \ - F_WRITE( int, grid->ny, fileIO ); \ - F_WRITE( int, grid->nz, fileIO ); \ - F_WRITE( float, grid->dt, fileIO ); \ - F_WRITE( float, grid->dx, fileIO ); \ - F_WRITE( float, grid->dy, fileIO ); \ - F_WRITE( float, grid->dz, fileIO ); \ - F_WRITE( float, grid->x0, fileIO ); \ - F_WRITE( float, grid->y0, fileIO ); \ - F_WRITE( float, grid->z0, fileIO ); \ - F_WRITE( float, grid->cvac, fileIO ); \ - F_WRITE( float, grid->eps0, fileIO ); \ - F_WRITE( float, 0 /*damp*/, fileIO ); \ - F_WRITE( int, rank(), fileIO ); \ - F_WRITE( int, nproc(), fileIO ); \ - /* Species parameters */ \ - F_WRITE( int, sp_id, fileIO ); \ - F_WRITE( float, q_m, fileIO ); \ - } while(0) - +//#define F_WRITE_HEADER_V0(dump_type,sp_id,q_m,fileIO) do { \ + ///* Binary compatibility information */ \ + //F_WRITE( char, CHAR_BIT, fileIO ); \ + //F_WRITE( char, sizeof(short int), fileIO ); \ + //F_WRITE( char, sizeof(int), 
fileIO ); \ + //F_WRITE( char, sizeof(float), fileIO ); \ + //F_WRITE( char, sizeof(double), fileIO ); \ + //F_WRITE( short int, 0xcafe, fileIO ); \ + //F_WRITE( int, 0xdeadbeef, fileIO ); \ + //F_WRITE( float, 1.0, fileIO ); \ + //F_WRITE( double, 1.0, fileIO ); \ + ///* Dump type and header format version */ \ + //F_WRITE( int, 0 /* Version */, fileIO ); \ + //F_WRITE( int, dump_type, fileIO ); \ + ///* High level information */ \ + //F_WRITE( int, step(), fileIO ); \ + //F_WRITE( int, imxstr-2, fileIO ); \ + //F_WRITE( int, jmxstr-2, fileIO ); \ + //F_WRITE( int, kmxstr-2, fileIO ); \ + //F_WRITE( float, grid->dt, fileIO ); \ + //F_WRITE( float, dxstr, fileIO ); \ + //F_WRITE( float, dystr, fileIO ); \ + //F_WRITE( float, dzstr, fileIO ); \ + //F_WRITE( float, grid->x0, fileIO ); \ + //F_WRITE( float, grid->y0, fileIO ); \ + //F_WRITE( float, grid->z0, fileIO ); \ + //F_WRITE( float, grid->cvac, fileIO ); \ + //F_WRITE( float, grid->eps0, fileIO ); \ + //F_WRITE( float, 0 /*damp*/, fileIO ); \ + //F_WRITE( int, rank(), fileIO ); \ + //F_WRITE( int, nproc(), fileIO ); \ + ///* Species parameters */ \ + //F_WRITE( int, sp_id, fileIO ); \ + //F_WRITE( float, q_m, fileIO ); \ + //} while(0) + +//#define F_WRITE_HEADER_PAR(dump_type,sp_id,q_m,fileIO) do { \ + ///* Binary compatibility information */ \ + //F_WRITE( char, CHAR_BIT, fileIO ); \ + //F_WRITE( char, sizeof(short int), fileIO ); \ + //F_WRITE( char, sizeof(int), fileIO ); \ + //F_WRITE( char, sizeof(float), fileIO ); \ + //F_WRITE( char, sizeof(double), fileIO ); \ + //F_WRITE( short int, 0xcafe, fileIO ); \ + //F_WRITE( int, 0xdeadbeef, fileIO ); \ + //F_WRITE( float, 1.0, fileIO ); \ + //F_WRITE( double, 1.0, fileIO ); \ + ///* Dump type and header format version */ \ + //F_WRITE( int, 0 /* Version */, fileIO ); \ + //F_WRITE( int, dump_type, fileIO ); \ + ///* High level information */ \ + //F_WRITE( int, step(), fileIO ); \ + //F_WRITE( int, grid->nx, fileIO ); \ + //F_WRITE( int, grid->ny, fileIO ); \ 
+ //F_WRITE( int, grid->nz, fileIO ); \ + //F_WRITE( float, grid->dt, fileIO ); \ + //F_WRITE( float, grid->dx, fileIO ); \ + //F_WRITE( float, grid->dy, fileIO ); \ + //F_WRITE( float, grid->dz, fileIO ); \ + //F_WRITE( float, grid->x0, fileIO ); \ + //F_WRITE( float, grid->y0, fileIO ); \ + //F_WRITE( float, grid->z0, fileIO ); \ + //F_WRITE( float, grid->cvac, fileIO ); \ + //F_WRITE( float, grid->eps0, fileIO ); \ + //F_WRITE( float, 0 /*damp*/, fileIO ); \ + //F_WRITE( int, rank(), fileIO ); \ + //F_WRITE( int, nproc(), fileIO ); \ + ///* Species parameters */ \ + //F_WRITE( int, sp_id, fileIO ); \ + //F_WRITE( float, q_m, fileIO ); \ + //} while(0) + // Note dim _MUST_ be a pointer to an int - -#define F_WRITE_ARRAY_HEADER(psiz,ndim,dim,fileIO) do { \ - F_WRITE( int, psiz, fileIO ); \ - F_WRITE( int, ndim, fileIO ); \ - fileIO.write( dim, ndim ); \ - } while(0) - -#define F_WRITE(type,value,fileIO) do { \ - type __F_WRITE_tmp = (type)(value); \ - fileIO.write( &__F_WRITE_tmp, 1 ); \ - } while(0) - -#define F_READ(type,value,fileIO) do { \ - type __F_READ_tmp; \ - fileIO.read( &__F_READ_tmp, 1 ); \ - (value) = __F_READ_tmp; \ - } while(0) +//#define F_WRITE_ARRAY_HEADER(psiz,ndim,dim,fileIO) do { \ + //F_WRITE( int, psiz, fileIO ); \ + //F_WRITE( int, ndim, fileIO ); \ + //fileIO.write( dim, ndim ); \ + //} while(0) + +//#define F_WRITE(type,value,fileIO) do { \ + //type __F_WRITE_tmp = (type)(value); \ + //fileIO.write( &__F_WRITE_tmp, 1 ); \ + //} while(0) + +//#define F_READ(type,value,fileIO) do { \ + //type __F_READ_tmp; \ + //fileIO.read( &__F_READ_tmp, 1 ); \ + //(value) = __F_READ_tmp; \ + //} while(0) #define ABORT(cond) if( cond ) ERROR(( #cond )) diff --git a/src/vpic/vpic.cc b/src/vpic/vpic.cc index bfd18767..3e3b3812 100644 --- a/src/vpic/vpic.cc +++ b/src/vpic/vpic.cc @@ -9,6 +9,7 @@ */ #include "vpic.h" +#include "dump_strategy.h" /* Note that, when a vpic_simulation is created (and thus registered with the checkpt service), it is created empty; 
none of the simulation @@ -71,8 +72,11 @@ reanimate_vpic_simulation( vpic_simulation * vpic ) { } -vpic_simulation::vpic_simulation() { - CLEAR( this, 1 ); +vpic_simulation::vpic_simulation() : dump_strategy(BinaryDump( rank(), nproc() )) +{ + // TODO: why is this a good idea? + // Is this just trying to 0 initialize everything? + // CLEAR( this, 1 ); /* Set non-zero defaults */ verbose = 1; diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index 6b657a16..73dfab29 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -24,6 +24,7 @@ #include "../util/bitfield.h" #include "../util/checksum.h" #include "../util/system.h" +#include "dump_strategy.h" #ifndef USER_GLOBAL_SIZE #define USER_GLOBAL_SIZE 16384 @@ -287,9 +288,12 @@ class vpic_simulation { int field_interval; int particle_interval; - size_t nxout, nyout, nzout; + // TODO: these can probably now be removed, as they should only be used by dump? + // TODO: check if any decks used them + //size_t nxout, nyout, nzout; + //float dxout, dyout, dzout; + size_t px, py, pz; - float dxout, dyout, dzout; int ndfld; int ndhyd; @@ -361,7 +365,7 @@ class vpic_simulation { /////////////// // Dump helpers - int dump_mkdir(const char * dname); + static int dump_mkdir(const char * dname); int dump_cwd(char * dname, size_t size); // Text dumps @@ -380,22 +384,9 @@ class vpic_simulation { void dump_particles( const char *sp_name, const char *fbase, int fname_tag = 1 ); -#ifdef VPIC_ENABLE_OPENPMD - void dump_fields_openpmd( const char *fbase, int fname_tag = 1 ); - void dump_particles_openpmd( - const char *sp_name, - const char *fbase, - int ftag = 1 - ); -#endif + Dump_Strategy dump_strategy; #ifdef VPIC_ENABLE_HDF5 - void dump_particles_hdf5( const char *sp_name, const char *fbase, - int fname_tag = 1 ); - void dump_hydro_hdf5( const char *sp_name, const char *fbase, - int fname_tag = 1 ); - void dump_fields_hdf5( const char *fbase, int fname_tag = 1 ); - // Declare vars to use hydro_dump_flag_t hydro_dump_flag; field_dump_flag_t 
field_dump_flag; From 686c143c8dfefb18fcea92c0675ebc0714fdede2 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Wed, 6 Nov 2019 09:50:09 -0700 Subject: [PATCH 71/95] first compile and linking build of dump strategy --- src/field_advance/field_advance.h | 12 +- src/species_advance/species_advance.cc | 4 +- src/species_advance/species_advance_aos.h | 2 + src/util/io/FileIO.h | 1 + src/util/util_base.h | 12 +- src/vpic/dump.cc | 22 ++- src/vpic/dump.h | 31 +--- src/vpic/dump_strategy.cc | 158 +++++++++++++++++++++ src/vpic/dump_strategy.h | 163 +++------------------- src/vpic/vpic.cc | 7 +- src/vpic/vpic.h | 4 +- 11 files changed, 227 insertions(+), 189 deletions(-) create mode 100644 src/vpic/dump_strategy.cc diff --git a/src/field_advance/field_advance.h b/src/field_advance/field_advance.h index d1cee710..0d435e8a 100644 --- a/src/field_advance/field_advance.h +++ b/src/field_advance/field_advance.h @@ -13,7 +13,7 @@ // // This module implements the following the difference equations on a // superhexahedral domain decomposed Yee-mesh: -// +// // advance_b -> Finite Differenced Faraday // cB_new = cB_old - frac c dt curl E // @@ -32,7 +32,7 @@ // rapidly reduce RMS divergence error assuming divergences errors // are due to accumulation of numerical roundoff when integrating // Faraday. See clean_div.c for details. -// +// // div_clean_e -> Modified Marder pass on electric fields // E_new = E_old + drive D dt grad err_mul div ( epsr E_old - rho/eps0 ) // Since the total rho may not be known everywhere (for example in @@ -65,7 +65,7 @@ // fmatx,fmaty,fmatz are all on the "face // mesh". rhof,rhob,div_e_err,nmat are on the "nodes mesh". // div_b_err,cmat are on the "cell mesh". -// +// // Above, for "edge mesh" quantities, interior means that the // component is not a tangential field directly on the surface of the // domain. For "face mesh" quantities, interior means that the @@ -97,7 +97,7 @@ // ... 
// material_coefficients = new_material_coefficients(grid,material_list); // fields = new_fields(grid); -// +// // ... Set the initial field values and place materials ... // // synchronize_fields(fields,grid); @@ -107,7 +107,7 @@ // initial fields or errors in the source terms or different floating // point properties on different nodes cause the shared faces to have // different fields). -// +// // To advance the fields in a PIC simulation with TCA radation damping // and periodic divergence cleaning, the following sequence is // suggested: @@ -118,7 +118,7 @@ // if( should_clean_div_e ) { // ... adjust rho_f, rho_b and/or rho_c as necessary // do { -// rms_err = clean_div_e( fields, material_coefficients, grid ); +// rms_err = clean_div_e( fields, material_coefficients, grid ); // } while( rms_err_too_high ); // } // if( should_clean_div_b ) { diff --git a/src/species_advance/species_advance.cc b/src/species_advance/species_advance.cc index 0e85a646..2ed53cbb 100644 --- a/src/species_advance/species_advance.cc +++ b/src/species_advance/species_advance.cc @@ -1,4 +1,4 @@ -/* +/* * Written by: * Kevin J. Bowers, Ph.D. * Plasma Physics Group (X-1) @@ -146,7 +146,7 @@ species( const char * name, sp->sort_out_of_place = sort_out_of_place; MALLOC_ALIGNED( sp->partition, g->nv+1, 128 ); - sp->g = g; + sp->g = g; /* id, next are set by append species */ diff --git a/src/species_advance/species_advance_aos.h b/src/species_advance/species_advance_aos.h index 3e1af9ad..47fa1a78 100644 --- a/src/species_advance/species_advance_aos.h +++ b/src/species_advance/species_advance_aos.h @@ -12,6 +12,8 @@ #ifndef _species_advance_aos_h_ #define _species_advance_aos_h_ +// TODO: should we restrict the direct include of this header? 
+ typedef int32_t species_id; // Must be 32-bit wide for particle_injector_t // FIXME: Eventually particle_t (definitely) and their other formats diff --git a/src/util/io/FileIO.h b/src/util/io/FileIO.h index 0d8ed6da..74221451 100644 --- a/src/util/io/FileIO.h +++ b/src/util/io/FileIO.h @@ -13,6 +13,7 @@ #define FileIO_h #include +#include #include "FileIOData.h" /*! diff --git a/src/util/util_base.h b/src/util/util_base.h index bc9db329..4f2ada6c 100644 --- a/src/util/util_base.h +++ b/src/util/util_base.h @@ -1,4 +1,4 @@ -/* +/* * Written by: * Kevin J. Bowers, Ph.D. * Plasma Physics Group (X-1) @@ -21,7 +21,7 @@ #endif // C99 does requires some key macros of stdint to only be defined in -// C++ implementations if explicitly requested. +// C++ implementations if explicitly requested. #define __STDC_LIMIT_MACROS @@ -102,7 +102,7 @@ typedef struct collective collective_t; #ifndef RESTRICT #define RESTRICT __restrict -#endif +#endif // Normal pointers (e.g. a *) are in whatever address space the given // compile unit uses. However, sometimes it is necessary to declare @@ -154,7 +154,7 @@ typedef struct collective collective_t; // allow correct autogeneration when no alignment necessary ... sigh // ... -#define PAD(s,a) ( (a) - ( (s) & ( (a)-1 ) ) ) +#define PAD(s,a) ( (a) - ( (s) & ( (a)-1 ) ) ) // POW2_CEIL rounds "u" up to the nearest multiple of the power of two // "a". If u is a multiple of "a", its value is unchanged. 
"a" should @@ -344,7 +344,7 @@ void detect_old_style_arguments(int* pargc, char *** pargv); #define MALLOC(x,n) \ util_malloc( "MALLOC( "#x", "#n" (%lu bytes) ) at " \ __FILE__ "(" EXPAND_AND_STRINGIFY(__LINE__) ") failed", \ - &(x), (n)*sizeof(*(x)) ) + &(x), (n)*sizeof(*(x)) ) void util_malloc( const char * err_fmt, // Has exactly one %lu in it @@ -370,7 +370,7 @@ util_free( void * mem_ref ); #n" (%lu bytes), " \ #a" (%lu bytes) ) at " \ __FILE__ "(" EXPAND_AND_STRINGIFY(__LINE__) ") failed", \ - &(x), (n)*sizeof(*(x)), (a) ) + &(x), (n)*sizeof(*(x)), (a) ) void diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 35d7441d..e4fd86ae 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -44,7 +44,7 @@ void vpic_simulation::dump_particles( const char *sp_name, int ftag ) { species_t * sp = find_species_name(sp_name, species_list); - dump_strategy.dump_particles( + dump_strategy->dump_particles( fbase, sp, grid, @@ -54,12 +54,28 @@ void vpic_simulation::dump_particles( const char *sp_name, ); } -void dump_fields( const char *fbase, int fname_tag = 1 ) +void vpic_simulation::dump_fields( const char *fbase, int ftag ) { + dump_strategy->dump_fields( + fbase, + step(),grid, + field_array, + ftag + ); } -void dump_hydro( const char *sp_name, const char *fbase, int fname_tag = 1 ) +void vpic_simulation::dump_hydro( const char *sp_name, const char *fbase, int ftag ) { + species_t * sp = find_species_name(sp_name, species_list); + dump_strategy->dump_hydro( + fbase, + step(), + hydro_array, + sp, + interpolator_array, + grid, + ftag + ); } void diff --git a/src/vpic/dump.h b/src/vpic/dump.h index 966e627e..1d17ee8a 100644 --- a/src/vpic/dump.h +++ b/src/vpic/dump.h @@ -2,6 +2,7 @@ #define dump_h #include +#include "../grid/grid.h" // TODO: should this be an enum? namespace dump_type { @@ -14,33 +15,5 @@ namespace dump_type { } // namespace // TODO: namesapce? 
-std::array global_particle_index(int local_i, grid_t* grid, int rank) -{ - int ix, iy, iz, rx, ry, rz; - // Convert rank to local x/y/z - UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); - - // Calculate local ix/iy/iz - UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); - - // Account for the "first" ghost cell - ix = ix - 1; - iy = iy - 1; - iz = iz - 1; - - // Convert ix/iy/iz to global - int gix = ix + (grid->nx * (rx)); - int giy = iy + (grid->ny * (ry)); - int giz = iz + (grid->nz * (rz)); - - // calculate global grid sizes - int gnx = grid->nx * grid->gpx; - int gny = grid->ny * grid->gpy; - int gnz = grid->nz * grid->gpz; - - // TODO: find a better way to account for the hard coded ghosts in VOXEL - int global_i = VOXEL(gix, giy, giz, gnx-2, gny-2, gnz-2); - - return { global_i, gix, giy, giz }; -} +std::array global_particle_index(int local_i, grid_t* grid, int rank); #endif diff --git a/src/vpic/dump_strategy.cc b/src/vpic/dump_strategy.cc new file mode 100644 index 00000000..adea2714 --- /dev/null +++ b/src/vpic/dump_strategy.cc @@ -0,0 +1,158 @@ +//BinaryDump::BinaryDump(int _rank, int _nproc) : Dump_Strategy(_rank, _nproc) +//{ + //// empty +//} +#include "dump_strategy.h" + +void BinaryDump::dump_fields( + const char *fbase, + int step, + grid_t* grid, + field_array_t* field_array, + int ftag + ) +{ + char fname[256]; + FileIO fileIO; + int dim[3]; + + if( !fbase ) ERROR(( "Invalid filename" )); + + if( rank==0 ) MESSAGE(( "Dumping fields to \"%s\"", fbase )); + + if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step, rank ); + else sprintf( fname, "%s.%i", fbase, rank ); + + FileIOStatus status = fileIO.open(fname, io_write); + if( status==fail ) ERROR(( "Could not open \"%s\".", fname )); + + /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ + size_t nxout = grid->nx; + size_t nyout = grid->ny; + size_t nzout = grid->nz; + float dxout = grid->dx; + float dyout = grid->dy; + float dzout = grid->dz; + + 
WRITE_HEADER_V0( dump_type::field_dump, -1, 0, fileIO, step , rank, nproc); + + dim[0] = grid->nx+2; + dim[1] = grid->ny+2; + dim[2] = grid->nz+2; + WRITE_ARRAY_HEADER( field_array->f, 3, dim, fileIO ); + fileIO.write( field_array->f, dim[0]*dim[1]*dim[2] ); + if( fileIO.close() ) ERROR(( "File close failed on dump fields!!!" )); +} + +void BinaryDump::dump_particles( + const char *fbase, + species_t* sp, + grid_t* grid, + int step, + interpolator_array_t* interpolator_array, + int ftag + ) +{ + char fname[256]; + FileIO fileIO; + int dim[1], buf_start; + static particle_t * ALIGNED(128) p_buf = NULL; +# define PBUF_SIZE 32768 // 1MB of particles + + if( !sp ) ERROR(( "Invalid species name \"%s\".", sp->name )); + + if( !fbase ) ERROR(( "Invalid filename" )); + + if( !p_buf ) MALLOC_ALIGNED( p_buf, PBUF_SIZE, 128 ); + + if( rank==0 ) + MESSAGE(("Dumping \"%s\" particles to \"%s\"",sp->name,fbase)); + + if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step, rank ); + else sprintf( fname, "%s.%i", fbase, rank ); + FileIOStatus status = fileIO.open(fname, io_write); + if( status==fail ) ERROR(( "Could not open \"%s\"", fname )); + + /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ + size_t nxout = grid->nx; + size_t nyout = grid->ny; + size_t nzout = grid->nz; + float dxout = grid->dx; + float dyout = grid->dy; + float dzout = grid->dz; + + WRITE_HEADER_V0( dump_type::particle_dump, sp->id, sp->q/sp->m, fileIO, step, rank, nproc); + + dim[0] = sp->np; + WRITE_ARRAY_HEADER( p_buf, 1, dim, fileIO ); + + // Copy a PBUF_SIZE hunk of the particle list into the particle + // buffer, timecenter it and write it out. This is done this way to + // guarantee the particle list unchanged while not requiring too + // much memory. + + // FIXME: WITH A PIPELINED CENTER_P, PBUF NOMINALLY SHOULD BE QUITE + // LARGE. 
+ + particle_t * sp_p = sp->p; sp->p = p_buf; + int sp_np = sp->np; sp->np = 0; + int sp_max_np = sp->max_np; sp->max_np = PBUF_SIZE; + for( buf_start=0; buf_startnp = sp_np-buf_start; if( sp->np > PBUF_SIZE ) sp->np = PBUF_SIZE; + COPY( sp->p, &sp_p[buf_start], sp->np ); + center_p( sp, interpolator_array ); + fileIO.write( sp->p, sp->np ); + } + sp->p = sp_p; + sp->np = sp_np; + sp->max_np = sp_max_np; + + if( fileIO.close() ) ERROR(("File close failed on dump particles!!!")); +} +void BinaryDump::dump_hydro( + const char *fbase, + int step, + hydro_array_t* hydro_array, + species_t* sp, + interpolator_array_t* interpolator_array, + grid_t* grid, + int ftag + ) +{ + char fname[256]; + FileIO fileIO; + int dim[3]; + + if( !sp ) ERROR(( "Invalid species \"%s\"", sp->name )); + + clear_hydro_array( hydro_array ); + accumulate_hydro_p( hydro_array, sp, interpolator_array ); + synchronize_hydro_array( hydro_array ); + + if( !fbase ) ERROR(( "Invalid filename" )); + + if( rank==0 ) + MESSAGE(("Dumping \"%s\" hydro fields to \"%s\"",sp->name,fbase)); + + if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step, rank ); + else sprintf( fname, "%s.%i", fbase, rank ); + FileIOStatus status = fileIO.open(fname, io_write); + if( status==fail) ERROR(( "Could not open \"%s\".", fname )); + + /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ + size_t nxout = grid->nx; + size_t nyout = grid->ny; + size_t nzout = grid->nz; + float dxout = grid->dx; + float dyout = grid->dy; + float dzout = grid->dz; + + WRITE_HEADER_V0(dump_type::hydro_dump, sp->id, sp->q/sp->m, fileIO, step, rank, nproc); + + dim[0] = grid->nx+2; + dim[1] = grid->ny+2; + dim[2] = grid->nz+2; + WRITE_ARRAY_HEADER( hydro_array->h, 3, dim, fileIO ); + fileIO.write( hydro_array->h, dim[0]*dim[1]*dim[2] ); + if( fileIO.close() ) ERROR(( "File close failed on dump hydro!!!" 
)); +} diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index c745df7c..6a5ba92c 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1,12 +1,18 @@ #ifndef Dump_Strategy_h #define Dump_Strategy_h +// TODO: should I drop the ./src here? #include "../util/io/FileIO.h" #include "../util/util_base.h" #include "../util/io/FileUtils.h" +#include "../field_advance/field_advance.h" +#include "../sf_interface/sf_interface.h" +#include "../species_advance/species_advance.h" + #include "dump.h" #include "dumpmacros.h" + #ifdef VPIC_ENABLE_HDF5 #include "hdf5.h" // from the lib #include "hdf5_header_info.h" // from vpic @@ -21,6 +27,7 @@ class Dump_Strategy { int rank, nproc; Dump_Strategy(int _rank, int _nproc) : rank(_rank), nproc(_nproc) { } // empty + virtual ~Dump_Strategy() { }; virtual void dump_fields( const char *fbase, @@ -28,7 +35,7 @@ class Dump_Strategy { grid_t* grid, field_array_t* field_array, int ftag - ); + ) = 0; virtual void dump_hydro( const char *fbase, int step, @@ -37,7 +44,7 @@ class Dump_Strategy { interpolator_array_t* interpolator_array, grid_t* grid, int ftag - ); + ) = 0; virtual void dump_particles( const char *fbase, species_t* sp, @@ -45,12 +52,13 @@ class Dump_Strategy { int step, interpolator_array_t* interpolator_array, int ftag - ); + ) = 0; }; class BinaryDump : public Dump_Strategy { public: using Dump_Strategy::Dump_Strategy; // inherit constructor + BinaryDump(int _rank, int _nproc) : Dump_Strategy(_rank, _nproc){ } // empty // TODO: now we pass rank and step, ftag has odd semanticds void dump_fields( @@ -59,39 +67,16 @@ class BinaryDump : public Dump_Strategy { grid_t* grid, field_array_t* field_array, int ftag - ) - { - char fname[256]; - FileIO fileIO; - int dim[3]; - - if( !fbase ) ERROR(( "Invalid filename" )); - - if( rank==0 ) MESSAGE(( "Dumping fields to \"%s\"", fbase )); - - if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step, rank ); - else sprintf( fname, "%s.%i", fbase, rank ); - - 
FileIOStatus status = fileIO.open(fname, io_write); - if( status==fail ) ERROR(( "Could not open \"%s\".", fname )); - - /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ - size_t nxout = grid->nx; - size_t nyout = grid->ny; - size_t nzout = grid->nz; - float dxout = grid->dx; - float dyout = grid->dy; - float dzout = grid->dz; - - WRITE_HEADER_V0( dump_type::field_dump, -1, 0, fileIO, step , rank, nproc); - - dim[0] = grid->nx+2; - dim[1] = grid->ny+2; - dim[2] = grid->nz+2; - WRITE_ARRAY_HEADER( field_array->f, 3, dim, fileIO ); - fileIO.write( field_array->f, dim[0]*dim[1]*dim[2] ); - if( fileIO.close() ) ERROR(( "File close failed on dump fields!!!" )); - } + ); + void dump_hydro( + const char *fbase, + int step, + hydro_array_t* hydro_array, + species_t* sp, + interpolator_array_t* interpolator_array, + grid_t* grid, + int ftag + ); void dump_particles( const char *fbase, species_t* sp, @@ -99,111 +84,7 @@ class BinaryDump : public Dump_Strategy { int step, interpolator_array_t* interpolator_array, int ftag - ) - { - char fname[256]; - FileIO fileIO; - int dim[1], buf_start; - static particle_t * ALIGNED(128) p_buf = NULL; -# define PBUF_SIZE 32768 // 1MB of particles - - if( !sp ) ERROR(( "Invalid species name \"%s\".", sp->name )); - - if( !fbase ) ERROR(( "Invalid filename" )); - - if( !p_buf ) MALLOC_ALIGNED( p_buf, PBUF_SIZE, 128 ); - - if( rank==0 ) - MESSAGE(("Dumping \"%s\" particles to \"%s\"",sp->name,fbase)); - - if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step, rank ); - else sprintf( fname, "%s.%i", fbase, rank ); - FileIOStatus status = fileIO.open(fname, io_write); - if( status==fail ) ERROR(( "Could not open \"%s\"", fname )); - - /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ - size_t nxout = grid->nx; - size_t nyout = grid->ny; - size_t nzout = grid->nz; - float dxout = grid->dx; - float dyout = grid->dy; - float dzout = grid->dz; - - WRITE_HEADER_V0( dump_type::particle_dump, sp->id, sp->q/sp->m, fileIO, 
step, rank, nproc); - - dim[0] = sp->np; - WRITE_ARRAY_HEADER( p_buf, 1, dim, fileIO ); - - // Copy a PBUF_SIZE hunk of the particle list into the particle - // buffer, timecenter it and write it out. This is done this way to - // guarantee the particle list unchanged while not requiring too - // much memory. - - // FIXME: WITH A PIPELINED CENTER_P, PBUF NOMINALLY SHOULD BE QUITE - // LARGE. - - particle_t * sp_p = sp->p; sp->p = p_buf; - int sp_np = sp->np; sp->np = 0; - int sp_max_np = sp->max_np; sp->max_np = PBUF_SIZE; - for( buf_start=0; buf_startnp = sp_np-buf_start; if( sp->np > PBUF_SIZE ) sp->np = PBUF_SIZE; - COPY( sp->p, &sp_p[buf_start], sp->np ); - center_p( sp, interpolator_array ); - fileIO.write( sp->p, sp->np ); - } - sp->p = sp_p; - sp->np = sp_np; - sp->max_np = sp_max_np; - - if( fileIO.close() ) ERROR(("File close failed on dump particles!!!")); - } - void dump_hydro( - const char *fbase, - int step, - hydro_array_t* hydro_array, - species_t* sp, - interpolator_array_t* interpolator_array, - grid_t* grid, - int ftag - ) - { - char fname[256]; - FileIO fileIO; - int dim[3]; - - if( !sp ) ERROR(( "Invalid species \"%s\"", sp->name )); - - clear_hydro_array( hydro_array ); - accumulate_hydro_p( hydro_array, sp, interpolator_array ); - synchronize_hydro_array( hydro_array ); - - if( !fbase ) ERROR(( "Invalid filename" )); - - if( rank==0 ) - MESSAGE(("Dumping \"%s\" hydro fields to \"%s\"",sp->name,fbase)); - - if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step, rank ); - else sprintf( fname, "%s.%i", fbase, rank ); - FileIOStatus status = fileIO.open(fname, io_write); - if( status==fail) ERROR(( "Could not open \"%s\".", fname )); - - /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ - size_t nxout = grid->nx; - size_t nyout = grid->ny; - size_t nzout = grid->nz; - float dxout = grid->dx; - float dyout = grid->dy; - float dzout = grid->dz; - - WRITE_HEADER_V0(dump_type::hydro_dump, sp->id, sp->q/sp->m, fileIO, step, rank, nproc); 
- - dim[0] = grid->nx+2; - dim[1] = grid->ny+2; - dim[2] = grid->nz+2; - WRITE_ARRAY_HEADER( hydro_array->h, 3, dim, fileIO ); - fileIO.write( hydro_array->h, dim[0]*dim[1]*dim[2] ); - if( fileIO.close() ) ERROR(( "File close failed on dump hydro!!!" )); - } + ); }; #ifdef VPIC_ENABLE_HDF5 diff --git a/src/vpic/vpic.cc b/src/vpic/vpic.cc index 3e3b3812..0dd2a418 100644 --- a/src/vpic/vpic.cc +++ b/src/vpic/vpic.cc @@ -72,7 +72,7 @@ reanimate_vpic_simulation( vpic_simulation * vpic ) { } -vpic_simulation::vpic_simulation() : dump_strategy(BinaryDump( rank(), nproc() )) +vpic_simulation::vpic_simulation() { // TODO: why is this a good idea? // Is this just trying to 0 initialize everything? @@ -112,6 +112,11 @@ vpic_simulation::vpic_simulation() : dump_strategy(BinaryDump( rank(), nproc() ) REGISTER_OBJECT( this, checkpt_vpic_simulation, restore_vpic_simulation, reanimate_vpic_simulation ); + // Initialize the dump strategy to use the binary dumpin, assuming the user + // may overwrite this later + dump_strategy = std::unique_ptr(new BinaryDump( rank(), nproc() )); + + // TODO: this this still makes sense now we have a dump strategy #ifdef VPIC_ENABLE_HDF5 // Default init hdf5 dump flags field_interval = 1; diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index 73dfab29..939480f7 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -384,7 +384,9 @@ class vpic_simulation { void dump_particles( const char *sp_name, const char *fbase, int fname_tag = 1 ); - Dump_Strategy dump_strategy; + // Very likely a user will forgot to delete this if they change the strategy, + // a smart ptr will save us from the small leak + std::unique_ptr dump_strategy; #ifdef VPIC_ENABLE_HDF5 // Declare vars to use From 461598513e39a21c4f4482dbb8a1391a94da5037 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Wed, 6 Nov 2019 10:45:28 -0700 Subject: [PATCH 72/95] fixed include for unique pointer and add comment on inheritance --- src/vpic/dump.cc | 29 +++++++++++++++++++++++++++++ 
src/vpic/dump_strategy.h | 3 +++ src/vpic/vpic.h | 1 + 3 files changed, 33 insertions(+) diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index e4fd86ae..005bb048 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -23,6 +23,35 @@ // COMPATIBLE WITH EXISTING EXTERNAL 3RD PARTY VISUALIZATION SOFTWARE. // IN THE LONG RUN, THIS EXTERNAL SOFTWARE WILL NEED TO BE UPDATED. +std::array global_particle_index(int local_i, grid_t* grid, int rank) +{ + int ix, iy, iz, rx, ry, rz; + // Convert rank to local x/y/z + UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + // Calculate local ix/iy/iz + UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); + + // Account for the "first" ghost cell + ix = ix - 1; + iy = iy - 1; + iz = iz - 1; + + // Convert ix/iy/iz to global + int gix = ix + (grid->nx * (rx)); + int giy = iy + (grid->ny * (ry)); + int giz = iz + (grid->nz * (rz)); + + // calculate global grid sizes + int gnx = grid->nx * grid->gpx; + int gny = grid->ny * grid->gpy; + int gnz = grid->nz * grid->gpz; + + // TODO: find a better way to account for the hard coded ghosts in VOXEL + int global_i = VOXEL(gix, giy, giz, gnx-2, gny-2, gnz-2); + + return { global_i, gix, giy, giz }; +} // TODO: this should live somewhere more sensible, but it's better than the // global static it replaces std::unordered_map tframe_map; diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 6a5ba92c..d3c3586a 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -22,6 +22,9 @@ #include #endif +// Runtime inheritance is obviously not very "VPIC like", as we will [probably] +// incur a penalty for the vtable lookup, but given we're about to do IO this +// is very negligible. 
class Dump_Strategy { public: int rank, nproc; diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index 939480f7..5c6c72c3 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -15,6 +15,7 @@ #include #include +#include // unique_ptr #include "../boundary/boundary.h" #include "../collision/collision.h" From acd77ce95801abc905a00e7afb9d28e93c759dfb Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 12 Nov 2019 12:34:34 -0700 Subject: [PATCH 73/95] hdf5 backend seems to be working, including example of how to enable --- sample/harrisHDF5 | 29 +++-- src/vpic/dump.cc | 21 ++- src/vpic/dump_strategy.h | 253 +++++++++++++++++++++++++++--------- src/vpic/hdf5_header_info.h | 215 +++++++++++++++--------------- src/vpic/vpic.cc | 14 +- src/vpic/vpic.h | 154 ++-------------------- 6 files changed, 356 insertions(+), 330 deletions(-) diff --git a/sample/harrisHDF5 b/sample/harrisHDF5 index 2b3b21bf..c6c326de 100644 --- a/sample/harrisHDF5 +++ b/sample/harrisHDF5 @@ -49,7 +49,7 @@ begin_initialization { // Example of how to call / set dumping - field_dump_flag.disableEMAT(); + //field_dump_flag.disableEMAT(); double input_mass_ratio; @@ -141,16 +141,16 @@ begin_initialization { num_step = int(0.2*taui/(wci*dt)); status_interval = int(1./(wci*dt)); - field_interval = status_interval; - hydro_interval = status_interval; + field_interval = 1; //status_interval; + hydro_interval = 1; //status_interval; sync_shared_interval = status_interval; clean_div_e_interval = status_interval; clean_div_b_interval = status_interval; global->energies_interval = status_interval; - global->fields_interval = status_interval; - global->ehydro_interval = status_interval; - global->ihydro_interval = status_interval; + global->fields_interval = 1; //status_interval; + global->ehydro_interval = 1; //status_interval; + global->ihydro_interval = 1; //status_interval; global->eparticle_interval = status_interval; global->iparticle_interval = status_interval; global->restart_interval = status_interval; @@ 
-324,6 +324,11 @@ begin_initialization { // - Increment the time step // - Call user diagnostics // - (periodically) Print a status message + + // Explicitly enable HDF5 backend for IO dump + // WARNING: Call this after you have set `num_step` (for now.. soon fixed) + + enable_hdf5_dump(); } begin_diagnostics { @@ -369,8 +374,8 @@ begin_diagnostics { // algorithm. As a result, JF is not valid until at least one timestep has // been completed. Field dumps are in a binary format. Each rank makes a // field dump. - if( step()==-10 ) dump_fields_hdf5("fields"); // Get first valid total J - if( should_dump(fields) ) dump_fields_hdf5("fields"); + if( step()==-10 ) dump_fields("fields"); // Get first valid total J + if( should_dump(fields) ) dump_fields("fields"); // Hydro dumps store particle charge density, current density and // stress-energy tensor. All these quantities are known at the time @@ -381,16 +386,16 @@ begin_diagnostics { // purely diagnostic. It is not used by the simulation and it is not // accumulated using a self-consistent charge-conserving method. Hydro dumps // are in a binary format. Each rank makes a hydro dump. - if(should_dump(ehydro) ) dump_hydro_hdf5("electron","ehydro"); - if( should_dump(ihydro) ) dump_hydro_hdf5("ion", "ihydro"); + if(should_dump(ehydro) ) dump_hydro("electron","ehydro"); + if( should_dump(ihydro) ) dump_hydro("ion", "ihydro"); // Particle dumps store the particle data for a given species. The data // written is known at the time t = time(). By default, particle dumps // are tagged with step(). However, if a "0" is added to the call, the // filename will not be tagged. Particle dumps are in a binary format. // Each rank makes a particle dump. 
- if( should_dump(eparticle) ) dump_particles_hdf5("electron","eparticle"); - if( should_dump(iparticle) ) dump_particles_hdf5("ion", "iparticle"); + if( should_dump(eparticle) ) dump_particles("electron","eparticle"); + if( should_dump(iparticle) ) dump_particles("ion", "iparticle"); // A checkpt is made by calling checkpt( fbase, tag ) where fname is a string // and tag is an integer. A typical usage is: diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 005bb048..9ac120a2 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -52,9 +52,6 @@ std::array global_particle_index(int local_i, grid_t* grid, int rank) return { global_i, gix, giy, giz }; } -// TODO: this should live somewhere more sensible, but it's better than the -// global static it replaces -std::unordered_map tframe_map; int vpic_simulation::dump_mkdir(const char * dname) { return FileUtils::makeDirectory(dname); @@ -68,6 +65,24 @@ int vpic_simulation::dump_cwd(char * dname, size_t size) { * ASCII dump IO *****************************************************************************/ +void vpic_simulation::enable_binary_dump() { + dump_strategy = std::unique_ptr(new BinaryDump( rank(), nproc(), num_step )); +} + +#ifdef VPIC_ENABLE_HDF5 +void vpic_simulation::enable_hdf5_dump() { + std::cout << "Enabling HDF5 IO backend" << std::endl; + dump_strategy = std::unique_ptr(new HDF5Dump( rank(), nproc(), num_step )); +} +#endif + +#ifdef VPIC_ENABLE_OPENPMD +void vpic_simulation::enable_openpmd_dump() { + std::cout << "Enabling openPMD IO backend" << std::endl; + dump_strategy = std::unique_ptr(new OpenPMDDump( rank(), nproc(), num_step )); +} +#endif + void vpic_simulation::dump_particles( const char *sp_name, const char *fbase, int ftag ) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index d3c3586a..08fe6e60 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1,6 +1,13 @@ #ifndef Dump_Strategy_h #define Dump_Strategy_h +#include +#include + +#include // TODO: 
it would be good if this didn't have to know about MPI + +#define DUMP_INFO_DEBUG 1 + // TODO: should I drop the ./src here? #include "../util/io/FileIO.h" #include "../util/util_base.h" @@ -27,9 +34,14 @@ // is very negligible. class Dump_Strategy { public: - int rank, nproc; + int rank, nproc, num_step; + + Dump_Strategy(int _rank, int _nproc, int total_steps) : + rank(_rank), + nproc(_nproc), + num_step(total_steps) // TODO: remove the need for this + { } // empty - Dump_Strategy(int _rank, int _nproc) : rank(_rank), nproc(_nproc) { } // empty virtual ~Dump_Strategy() { }; virtual void dump_fields( @@ -61,7 +73,7 @@ class Dump_Strategy { class BinaryDump : public Dump_Strategy { public: using Dump_Strategy::Dump_Strategy; // inherit constructor - BinaryDump(int _rank, int _nproc) : Dump_Strategy(_rank, _nproc){ } // empty + BinaryDump(int _rank, int _nproc, int total_steps) : Dump_Strategy(_rank, _nproc, total_steps){ } // empty // TODO: now we pass rank and step, ftag has odd semanticds void dump_fields( @@ -91,49 +103,157 @@ class BinaryDump : public Dump_Strategy { }; #ifdef VPIC_ENABLE_HDF5 + +struct field_dump_flag_t +{ + bool ex = true, ey = true, ez = true, div_e_err = true; + bool cbx = true, cby = true, cbz = true, div_b_err = true; + bool tcax = true, tcay = true, tcaz = true, rhob = true; + bool jfx = true, jfy = true, jfz = true, rhof = true; + bool ematx = true, ematy = true, ematz = true, nmat = true; + bool fmatx = true, fmaty = true, fmatz = true, cmat = true; + void disableE() + { + ex = false, ey = false, ez = false, div_e_err = false; + } + + void disableCB() + { + cbx = false, cby = false, cbz = false, div_b_err = false; + } + + void disableTCA() + { + tcax = false, tcay = false, tcaz = false, rhob = false; + } + + void disableJF() + { + jfx = false, jfy = false, jfz = false, rhof = false; + } + + void disableEMAT() + { + ematx = false, ematy = false, ematz = false, nmat = false; + } + + void disableFMAT() + { + fmatx = false, fmaty = false, 
fmatz = false, cmat = false; + } + + void resetToDefaults() + { + ex = true, ey = true, ez = true, div_e_err = true; + cbx = true, cby = true, cbz = true, div_b_err = true; + tcax = true, tcay = true, tcaz = true, rhob = true; + jfx = true, jfy = true, jfz = true, rhof = true; + ematx = true, ematy = true, ematz = true, nmat = true; + fmatx = true, fmaty = true, fmatz = true, cmat = true; + } + + bool enabledE() + { + return ex && ey && ez; + } + + bool enabledCB() + { + return cbx && cby && cbz; + } + + bool enabledTCA() + { + return tcax && tcay && tcaz; + } + + bool enabledJF() + { + return jfx && jfy && jfz; + } + + bool enabledEMAT() + { + return ematx && ematy && ematz; + } + + bool enabledFMAT() + { + return fmatx && fmaty && fmatz; + } +}; + +struct hydro_dump_flag_t +{ + bool jx = true, jy = true, jz = true, rho = true; + bool px = true, py = true, pz = true, ke = true; + bool txx = true, tyy = true, tzz = true; + bool tyz = true, tzx = true, txy = true; + + void disableJ() + { + jx = false, jy = false, jz = false, rho = false; + } + + void disableP() + { + px = false, py = false, pz = false, ke = false; + } + + void disableTD() //Stress diagonal + { + txx = false, tyy = false, tzz = false; + } + + void disableTOD() //Stress off-diagonal + { + tyz = false, tzx = false, txy = false; + } + void resetToDefaults() + { + jx = true, jy = true, jz = true, rho = true; + px = true, py = true, pz = true, ke = true; + txx = true, tyy = true, tzz = true; + tyz = true, tzx = true, txy = true; + } + + bool enabledJ() + { + return jx && jy && jz; + } + + bool enabledP() + { + return px && py && pz; + } + + bool enabledTD() + { + return txx && tyy && tzz; + } + + bool enabledTOD() + { + return tyz && tzx && txy; + } +}; class HDF5Dump : public Dump_Strategy { + std::unordered_map tframe_map; public: using Dump_Strategy::Dump_Strategy; // inherit constructor + + // TODO: replace these with a common dump interface + // Declare vars to use + hydro_dump_flag_t 
hydro_dump_flag; + field_dump_flag_t field_dump_flag; + #define DUMP_DIR_FORMAT "./%s" - /* define to do C-style indexing */ -#define hydro(x, y, z) hydro_array->h[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] +// TODO: naming a macro so close to existing functions AND data is not a good +// define to do C-style indexing +#define _hydro(x, y, z) hydro_array->h[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] // TODO: make function? -#define invert_field_xml_item(xml_file_name, speciesname_p, time_step, dims_4d, dims_3d, add_footer_flag) \ - { \ - FILE *fp; \ - fp = fopen(xml_file_name, "a"); \ - fprintf(fp, main_body_head, time_step); \ - if (field_dump_flag.enabledE()) \ - write_main_body_attribute(fp, main_body_attributeV, "E", dims_4d, dims_3d, speciesname_p, time_step, "ex", "ey", "ez"); \ - if (field_dump_flag.div_e_err) \ - fprintf(fp, main_body_attributeS, "div_e_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_e_err"); \ - if (field_dump_flag.enabledCB()) \ - write_main_body_attribute(fp, main_body_attributeV, "B", dims_4d, dims_3d, speciesname_p, time_step, "cbx", "cby", "cbz"); \ - if (field_dump_flag.div_b_err) \ - fprintf(fp, main_body_attributeS, "div_b_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_b_err"); \ - if (field_dump_flag.enabledTCA()) \ - write_main_body_attribute(fp, main_body_attributeV, "TCA", dims_4d, dims_3d, speciesname_p, time_step, "tcax", "tcay", "tcaz"); \ - if (field_dump_flag.rhob) \ - fprintf(fp, main_body_attributeS, "rhob", dims_3d, time_step, speciesname_p, time_step, time_step, "rhob"); \ - if (field_dump_flag.enabledJF()) \ - write_main_body_attribute(fp, main_body_attributeV, "JF", dims_4d, dims_3d, speciesname_p, time_step, "jfx", "jfy", "jfz"); \ - if (field_dump_flag.rhof) \ - fprintf(fp, main_body_attributeS, "rhof", dims_3d, time_step, speciesname_p, time_step, time_step, "rhof"); \ - if (field_dump_flag.enabledEMAT()) \ - write_main_body_attribute(fp, main_body_attributeV, "EMAT", 
dims_4d, dims_3d, speciesname_p, time_step, "ematx", "ematy", "ematz"); \ - if (field_dump_flag.nmat) \ - fprintf(fp, main_body_attributeS, "nmat", dims_3d, time_step, speciesname_p, time_step, time_step, "nmat"); \ - if (field_dump_flag.enabledFMAT()) \ - write_main_body_attribute(fp, main_body_attributeV, "FMAT", dims_4d, dims_3d, speciesname_p, time_step, "fmatx", "fmaty", "fmatz"); \ - if (field_dump_flag.cmat) \ - fprintf(fp, main_body_attributeS, "cmat", dims_3d, time_step, speciesname_p, time_step, time_step, "cmat"); \ - fprintf(fp, "%s", main_body_foot); \ - if (add_footer_flag) \ - fputs(footer, fp); \ - fclose(fp); \ - } void dump_fields( const char *fbase, int step, @@ -257,6 +377,7 @@ class HDF5Dump : public Dump_Strategy { int rx, ry, rz; UNVOXEL(mpi_rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + int mpi_rank_x, mpi_rank_y, mpi_rank_z; mpi_rank_x = rx; mpi_rank_y = ry; mpi_rank_z = rz; @@ -270,9 +391,9 @@ class HDF5Dump : public Dump_Strategy { global_count[2] = (grid->nz); #ifdef DUMP_INFO_DEBUG - printf("global size = %d %d %d \n", field_global_size[0], field_global_size[1], field_global_size[2]); - printf("global_offset = %d %d %d \n", global_offset[0], global_offset[1], global_offset[2]); - printf("global_count = %d %d %d \n", global_count[0], global_count[1], global_count[2]); + printf("global size = %llu %llu %llu \n", field_global_size[0], field_global_size[1], field_global_size[2]); + printf("global_offset = %llu %llu %llu \n", global_offset[0], global_offset[1], global_offset[2]); + printf("global_count = %llu %llu %llu \n", global_count[0], global_count[1], global_count[2]); printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); fflush(stdout); #endif @@ -390,6 +511,13 @@ class HDF5Dump : public Dump_Strategy { char dxdydz[128]; sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); + + // TODO: remove or let the user set + int field_interval = 1; + + // TODO: remove this dependence on 
number of steps + std::cout << "num_step " << num_step << std::endl; + int nframes = num_step / field_interval + 1; static int field_tframe = 0; @@ -401,8 +529,8 @@ class HDF5Dump : public Dump_Strategy { printf(" dxdydz: %s \n", dxdydz); printf(" nframes: %d \n", nframes); printf(" field_interval: %d \n", field_interval); - printf(" current step: %lld \n", step_for_viou); - printf(" current step: %lld \n", step_for_viou); + printf(" current step: %zd \n", step_for_viou); + printf(" current step: %zd \n", step_for_viou); //printf(" Simulation time: %f \n", grid->t0); printf(" tframe: %d \n", field_tframe); @@ -459,9 +587,9 @@ class HDF5Dump : public Dump_Strategy { // get the total number of particles. in this example, output only electrons //sp = species_list; sprintf(particle_scratch, DUMP_DIR_FORMAT, "particle_hdf5"); - FileUtils::makeDirector(particle_scratch); + FileUtils::makeDirectory(particle_scratch); sprintf(subparticle_scratch, "%s/T.%ld/", particle_scratch, step_for_viou); - FileUtils::makeDirector(subparticle_scratch); + FileUtils::makeDirectory(subparticle_scratch); // TODO: Allow the user to set this int stride_particle_dump = 1; @@ -588,7 +716,6 @@ class HDF5Dump : public Dump_Strategy { global_pi[i] = global_i; } -#undef UNVOXEL dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, H5S_ALL, filespace, plist_id, global_pi.data()); H5Dclose(dset_id); @@ -740,7 +867,7 @@ class HDF5Dump : public Dump_Strategy { { \ for (size_t k(1); k < grid->nz + 1; k++) \ { \ - temp_buf[temp_buf_index] = hydro(i, j, k).ATTRIBUTE_NAME; \ + temp_buf[temp_buf_index] = _hydro(i, j, k).ATTRIBUTE_NAME; \ temp_buf_index = temp_buf_index + 1; \ } \ } \ @@ -756,9 +883,10 @@ class HDF5Dump : public Dump_Strategy { MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - species_t *sp = find_species_name(speciesname, species_list); if (!sp) - ERROR(("Invalid 
species name: %s", speciesname)); + { + ERROR(("Invalid species")); + } clear_hydro_array(hydro_array); accumulate_hydro_p(hydro_array, sp, interpolator_array); @@ -769,11 +897,11 @@ class HDF5Dump : public Dump_Strategy { char subhydro_scratch[128]; sprintf(hydro_scratch, "./%s", "hydro_hdf5"); - FileUtils::makeDirector(hydro_scratch); + FileUtils::makeDirectory(hydro_scratch); sprintf(subhydro_scratch, "%s/T.%zu/", hydro_scratch, step_for_viou); - FileUtils::makeDirector(subhydro_scratch); + FileUtils::makeDirectory(subhydro_scratch); - sprintf(hname, "%s/hydro_%s_%zu.h5", subhydro_scratch, speciesname, step_for_viou); + sprintf(hname, "%s/hydro_%s_%zu.h5", subhydro_scratch, sp->name, step_for_viou); double el1 = uptime(); hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); @@ -832,7 +960,7 @@ class HDF5Dump : public Dump_Strategy { hydro_local_size[2] = grid->nz; int mpi_rank_x, mpi_rank_y, mpi_rank_z; - RANK_TO_INDEX2(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + UNVOXEL(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z, grid->gpx, grid->gpy, grid->gpz); global_offset[0] = (grid->nx) * mpi_rank_x; global_offset[1] = (grid->ny) * mpi_rank_y; @@ -843,9 +971,9 @@ class HDF5Dump : public Dump_Strategy { global_count[2] = (grid->nz); #ifdef DUMP_INFO_DEBUG - printf("global size = %d %d %d \n", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); - printf("global_offset = %d %d %d \n", global_offset[0], global_offset[1], global_offset[2]); - printf("global_count = %d %d %d \n", global_count[0], global_count[1], global_count[2]); + printf("global size = %llu %llu %llu \n", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); + printf("global_offset = %llu %llu %llu \n", global_offset[0], global_offset[1], global_offset[2]); + printf("global_count = %llu %llu %llu \n", global_count[0], global_count[1], global_count[2]); printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, 
mpi_rank_x, mpi_rank_y, mpi_rank_z); fflush(stdout); #endif @@ -929,7 +1057,7 @@ class HDF5Dump : public Dump_Strategy { if (mpi_rank == 0) { char output_xml_file[128]; - sprintf(output_xml_file, "./%s/%s%s%s", "hydro_hdf5", "hydro-", speciesname, ".xdmf"); + sprintf(output_xml_file, "./%s/%s%s%s", "hydro_hdf5", "hydro-", sp->name, ".xdmf"); char dimensions_3d[128]; sprintf(dimensions_3d, "%lld %lld %lld", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); char dimensions_4d[128]; @@ -939,6 +1067,10 @@ class HDF5Dump : public Dump_Strategy { char dxdydz[128]; sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); + // TODO: remove or let user set + int hydro_interval = 1; + + // TODO: remove this dependence on number of steps int nframes = num_step / hydro_interval + 1; const int tframe = tframe_map[sp->id]; @@ -951,13 +1083,14 @@ class HDF5Dump : public Dump_Strategy { printf(" dxdydz: %s \n", dxdydz); printf(" nframes: %d \n", nframes); printf(" hydro_fields_interval: %d \n", hydro_interval); - printf(" current step: %lld \n", step_for_viou); + printf(" current step: %zu \n", step_for_viou); printf(" Simulation time: %f \n", grid->t0); printf(" tframe: %d \n", tframe); #endif + // TODO: why doesnt this just use the cstr? 
char speciesname_new[128]; - sprintf(speciesname_new, "hydro_%s", speciesname); + sprintf(speciesname_new, "hydro_%s", sp->name); if (tframe >= 1) { if (tframe == (nframes - 1)) diff --git a/src/vpic/hdf5_header_info.h b/src/vpic/hdf5_header_info.h index baed8f7d..e3810612 100644 --- a/src/vpic/hdf5_header_info.h +++ b/src/vpic/hdf5_header_info.h @@ -3,120 +3,123 @@ #define FIELD_ARRAY_NAME field_array -// XML header stuff -const char *header = "\n\n\n\t\n"; -const char *header_topology = "\t\t\n"; -const char *header_geom = "\t\t\n"; -const char *header_origin = "\t\t\t \n\t\t\t%s\n"; -const char *header_dxdydz = "\t\t\t \n\t\t\t%s\n"; -const char *footer_geom = "\t\t\n"; -const char *grid_line = "\t\t \n \ -\t\t\t\n"; -const char *footer = "\t\t\n\t\n\n"; +namespace VPIC_HDF { + // XML header stuff + static const char *header = "\n\n\n\t\n"; + static const char *header_topology = "\t\t\n"; + static const char *header_geom = "\t\t\n"; + static const char *header_origin = "\t\t\t \n\t\t\t%s\n"; + static const char *header_dxdydz = "\t\t\t \n\t\t\t%s\n"; + static const char *footer_geom = "\t\t\n"; + static const char *grid_line = "\t\t \n \ + \t\t\t\n"; + static const char *footer = "\t\t\n\t\n\n"; -const char *main_body_head = "\t\t\t \n \ -\t\t\t\t \n \ -\t\t\t\t \n"; -const char *main_body_foot = "\t\t\t\n"; + static const char *main_body_head = "\t\t\t \n \ + \t\t\t\t \n \ + \t\t\t\t \n"; + static const char *main_body_foot = "\t\t\t\n"; -const char *main_body_attributeV = "\ - \t\t\t\t \n \ - \t\t\t\t\t \n \ - \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ - \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ - \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ - \t\t\t\t\t \n \ - \t\t\t\t \n "; + static const char *main_body_attributeV = "\ + \t\t\t\t \n \ + \t\t\t\t\t \n \ + \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ + \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ + \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ + \t\t\t\t\t \n \ + \t\t\t\t \n "; -const 
char *main_body_attributeS = "\ - \t\t\t\t \n \ - \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ - \t\t\t\t \n "; + static const char *main_body_attributeS = "\ + \t\t\t\t \n \ + \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ + \t\t\t\t \n "; + +} // end namespace #define create_file_with_header(xml_file_name, dimensions, orignal, dxdydz, nframes, fields_interval) \ - { \ - FILE *fp; \ - fp = fopen(xml_file_name, "w"); \ - fputs(header, fp); \ - fprintf(fp, header_topology, dimensions); \ - fputs(header_geom, fp); \ - fprintf(fp, header_origin, orignal); \ - fprintf(fp, header_dxdydz, dxdydz); \ - fputs(footer_geom, fp); \ - fprintf(fp, grid_line, nframes); \ - int i; \ - for (i = 0; i < nframes; i++) \ - fprintf(fp, "%d ", i*fields_interval); \ - fputs(grid_line_footer, fp); \ - fclose(fp); \ - } + { \ + FILE *fp; \ + fp = fopen(xml_file_name, "w"); \ + fputs(VPIC_HDF::header, fp); \ + fprintf(fp, VPIC_HDF::header_topology, dimensions); \ + fputs(VPIC_HDF::header_geom, fp); \ + fprintf(fp, VPIC_HDF::header_origin, orignal); \ + fprintf(fp, VPIC_HDF::header_dxdydz, dxdydz); \ + fputs(VPIC_HDF::footer_geom, fp); \ + fprintf(fp, VPIC_HDF::grid_line, nframes); \ + int i; \ + for (i = 0; i < nframes; i++) \ + fprintf(fp, "%d ", i*fields_interval); \ + fputs(VPIC_HDF::grid_line_footer, fp); \ + fclose(fp); \ + } #define write_main_body_attribute(fpp, main_body_attribute_p, attribute_name, dims_4d_p, dims_3d_p, file_name_pre_p, time_step_p, a1, a2, a3) \ - { \ - fprintf(fpp, main_body_attribute_p, attribute_name, dims_4d_p, \ - dims_3d_p, time_step_p, file_name_pre_p, time_step_p, time_step_p, a1, \ - dims_3d_p, time_step_p, file_name_pre_p, time_step_p, time_step_p, a2, \ - dims_3d_p, time_step_p, file_name_pre_p, time_step_p, time_step_p, a3); \ - } + { \ + fprintf(fpp, main_body_attribute_p, attribute_name, dims_4d_p, \ + dims_3d_p, time_step_p, file_name_pre_p, time_step_p, time_step_p, a1, \ + dims_3d_p, time_step_p, file_name_pre_p, time_step_p, time_step_p, a2, 
\ + dims_3d_p, time_step_p, file_name_pre_p, time_step_p, time_step_p, a3); \ + } #define invert_field_xml_item(xml_file_name, speciesname_p, time_step, dims_4d, dims_3d, add_footer_flag) \ - { \ - FILE *fp; \ - fp = fopen(xml_file_name, "a"); \ - fprintf(fp, main_body_head, time_step); \ - if (field_dump_flag.enabledE()) \ - write_main_body_attribute(fp, main_body_attributeV, "E", dims_4d, dims_3d, speciesname_p, time_step, "ex", "ey", "ez"); \ - if (field_dump_flag.div_e_err) \ - fprintf(fp, main_body_attributeS, "div_e_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_e_err"); \ - if (field_dump_flag.enabledCB()) \ - write_main_body_attribute(fp, main_body_attributeV, "B", dims_4d, dims_3d, speciesname_p, time_step, "cbx", "cby", "cbz"); \ - if (field_dump_flag.div_b_err) \ - fprintf(fp, main_body_attributeS, "div_b_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_b_err"); \ - if (field_dump_flag.enabledTCA()) \ - write_main_body_attribute(fp, main_body_attributeV, "TCA", dims_4d, dims_3d, speciesname_p, time_step, "tcax", "tcay", "tcaz"); \ - if (field_dump_flag.rhob) \ - fprintf(fp, main_body_attributeS, "rhob", dims_3d, time_step, speciesname_p, time_step, time_step, "rhob"); \ - if (field_dump_flag.enabledJF()) \ - write_main_body_attribute(fp, main_body_attributeV, "JF", dims_4d, dims_3d, speciesname_p, time_step, "jfx", "jfy", "jfz"); \ - if (field_dump_flag.rhof) \ - fprintf(fp, main_body_attributeS, "rhof", dims_3d, time_step, speciesname_p, time_step, time_step, "rhof"); \ - if (field_dump_flag.enabledEMAT()) \ - write_main_body_attribute(fp, main_body_attributeV, "EMAT", dims_4d, dims_3d, speciesname_p, time_step, "ematx", "ematy", "ematz"); \ - if (field_dump_flag.nmat) \ - fprintf(fp, main_body_attributeS, "nmat", dims_3d, time_step, speciesname_p, time_step, time_step, "nmat"); \ - if (field_dump_flag.enabledFMAT()) \ - write_main_body_attribute(fp, main_body_attributeV, "FMAT", dims_4d, dims_3d, speciesname_p, 
time_step, "fmatx", "fmaty", "fmatz"); \ - if (field_dump_flag.cmat) \ - fprintf(fp, main_body_attributeS, "cmat", dims_3d, time_step, speciesname_p, time_step, time_step, "cmat"); \ - fprintf(fp, "%s", main_body_foot); \ - if (add_footer_flag) \ - fputs(footer, fp); \ - fclose(fp); \ - } + { \ + FILE *fp; \ + fp = fopen(xml_file_name, "a"); \ + fprintf(fp, VPIC_HDF::main_body_head, time_step); \ + if (field_dump_flag.enabledE()) \ + write_main_body_attribute(fp, VPIC_HDF::main_body_attributeV, "E", dims_4d, dims_3d, speciesname_p, time_step, "ex", "ey", "ez"); \ + if (field_dump_flag.div_e_err) \ + fprintf(fp, VPIC_HDF::main_body_attributeS, "div_e_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_e_err"); \ + if (field_dump_flag.enabledCB()) \ + write_main_body_attribute(fp, VPIC_HDF::main_body_attributeV, "B", dims_4d, dims_3d, speciesname_p, time_step, "cbx", "cby", "cbz"); \ + if (field_dump_flag.div_b_err) \ + fprintf(fp, VPIC_HDF::main_body_attributeS, "div_b_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_b_err"); \ + if (field_dump_flag.enabledTCA()) \ + write_main_body_attribute(fp, VPIC_HDF::main_body_attributeV, "TCA", dims_4d, dims_3d, speciesname_p, time_step, "tcax", "tcay", "tcaz"); \ + if (field_dump_flag.rhob) \ + fprintf(fp, VPIC_HDF::main_body_attributeS, "rhob", dims_3d, time_step, speciesname_p, time_step, time_step, "rhob"); \ + if (field_dump_flag.enabledJF()) \ + write_main_body_attribute(fp, VPIC_HDF::main_body_attributeV, "JF", dims_4d, dims_3d, speciesname_p, time_step, "jfx", "jfy", "jfz"); \ + if (field_dump_flag.rhof) \ + fprintf(fp, VPIC_HDF::main_body_attributeS, "rhof", dims_3d, time_step, speciesname_p, time_step, time_step, "rhof"); \ + if (field_dump_flag.enabledEMAT()) \ + write_main_body_attribute(fp, VPIC_HDF::main_body_attributeV, "EMAT", dims_4d, dims_3d, speciesname_p, time_step, "ematx", "ematy", "ematz"); \ + if (field_dump_flag.nmat) \ + fprintf(fp, VPIC_HDF::main_body_attributeS, 
"nmat", dims_3d, time_step, speciesname_p, time_step, time_step, "nmat"); \ + if (field_dump_flag.enabledFMAT()) \ + write_main_body_attribute(fp, VPIC_HDF::main_body_attributeV, "FMAT", dims_4d, dims_3d, speciesname_p, time_step, "fmatx", "fmaty", "fmatz"); \ + if (field_dump_flag.cmat) \ + fprintf(fp, VPIC_HDF::main_body_attributeS, "cmat", dims_3d, time_step, speciesname_p, time_step, time_step, "cmat"); \ + fprintf(fp, "%s", VPIC_HDF::main_body_foot); \ + if (add_footer_flag) \ + fputs(VPIC_HDF::footer, fp); \ + fclose(fp); \ + } #define invert_hydro_xml_item(xml_file_name, speciesname_p, time_step, dims_4d, dims_3d, add_footer_flag) \ - { \ - FILE *fp; \ - fp = fopen(xml_file_name, "a"); \ - fprintf(fp, main_body_head, time_step); \ - if (hydro_dump_flag.enabledJ()) \ - write_main_body_attribute(fp, main_body_attributeV, "J", dims_4d, dims_3d, speciesname_p, time_step, "jx", "jy", "jz"); \ - if (hydro_dump_flag.rho) \ - fprintf(fp, main_body_attributeS, "rho", dims_3d, time_step, speciesname_p, time_step, time_step, "rho"); \ - if (hydro_dump_flag.enabledP()) \ - write_main_body_attribute(fp, main_body_attributeV, "P", dims_4d, dims_3d, speciesname_p, time_step, "px", "py", "pz"); \ - if (hydro_dump_flag.ke) \ - fprintf(fp, main_body_attributeS, "ke", dims_3d, time_step, speciesname_p, time_step, time_step, "ke"); \ - if (hydro_dump_flag.enabledTD()) \ - write_main_body_attribute(fp, main_body_attributeV, "TD", dims_4d, dims_3d, speciesname_p, time_step, "txx", "tyy", "tzz"); \ - if (hydro_dump_flag.enabledTOD()) \ - write_main_body_attribute(fp, main_body_attributeV, "TOD", dims_4d, dims_3d, speciesname_p, time_step, "tyz", "tzx", "txy"); \ - fprintf(fp, "%s", main_body_foot); \ - if (add_footer_flag) \ - fputs(footer, fp); \ - fclose(fp); \ - } - + { \ + FILE *fp; \ + fp = fopen(xml_file_name, "a"); \ + fprintf(fp, VPIC_HDF::main_body_head, time_step); \ + if (hydro_dump_flag.enabledJ()) \ + write_main_body_attribute(fp, VPIC_HDF::main_body_attributeV, "J", 
dims_4d, dims_3d, speciesname_p, time_step, "jx", "jy", "jz"); \ + if (hydro_dump_flag.rho) \ + fprintf(fp, VPIC_HDF::main_body_attributeS, "rho", dims_3d, time_step, speciesname_p, time_step, time_step, "rho"); \ + if (hydro_dump_flag.enabledP()) \ + write_main_body_attribute(fp, VPIC_HDF::main_body_attributeV, "P", dims_4d, dims_3d, speciesname_p, time_step, "px", "py", "pz"); \ + if (hydro_dump_flag.ke) \ + fprintf(fp, VPIC_HDF::main_body_attributeS, "ke", dims_3d, time_step, speciesname_p, time_step, time_step, "ke"); \ + if (hydro_dump_flag.enabledTD()) \ + write_main_body_attribute(fp, VPIC_HDF::main_body_attributeV, "TD", dims_4d, dims_3d, speciesname_p, time_step, "txx", "tyy", "tzz"); \ + if (hydro_dump_flag.enabledTOD()) \ + write_main_body_attribute(fp, VPIC_HDF::main_body_attributeV, "TOD", dims_4d, dims_3d, speciesname_p, time_step, "tyz", "tzx", "txy"); \ + fprintf(fp, "%s", VPIC_HDF::main_body_foot); \ + if (add_footer_flag) { \ + fputs(VPIC_HDF::footer, fp); \ + } \ + fclose(fp); \ + } #endif // VPIC_HDF5_HEAD_INFO diff --git a/src/vpic/vpic.cc b/src/vpic/vpic.cc index 0dd2a418..9d36ff3e 100644 --- a/src/vpic/vpic.cc +++ b/src/vpic/vpic.cc @@ -114,16 +114,20 @@ vpic_simulation::vpic_simulation() // Initialize the dump strategy to use the binary dumpin, assuming the user // may overwrite this later - dump_strategy = std::unique_ptr(new BinaryDump( rank(), nproc() )); + //dump_strategy = std::unique_ptr(new BinaryDump( rank(), nproc() )); + enable_binary_dump(); // TODO: this this still makes sense now we have a dump strategy -#ifdef VPIC_ENABLE_HDF5 +//#ifdef VPIC_ENABLE_HDF5 // Default init hdf5 dump flags + //field_interval = 1; + //hydro_interval = 1; + //field_dump_flag = field_dump_flag_t(); + //hydro_dump_flag = hydro_dump_flag_t(); +//#endif + field_interval = 1; hydro_interval = 1; - field_dump_flag = field_dump_flag_t(); - hydro_dump_flag = hydro_dump_flag_t(); -#endif } vpic_simulation::~vpic_simulation() { diff --git a/src/vpic/vpic.h 
b/src/vpic/vpic.h index 5c6c72c3..118567d0 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -36,144 +36,6 @@ #endif // #include "dumpvars.h" - -// TODO: move these to a better header? -#ifdef VPIC_ENABLE_HDF5 -struct field_dump_flag_t -{ - bool ex = true, ey = true, ez = true, div_e_err = true; - bool cbx = true, cby = true, cbz = true, div_b_err = true; - bool tcax = true, tcay = true, tcaz = true, rhob = true; - bool jfx = true, jfy = true, jfz = true, rhof = true; - bool ematx = true, ematy = true, ematz = true, nmat = true; - bool fmatx = true, fmaty = true, fmatz = true, cmat = true; - void disableE() - { - ex = false, ey = false, ez = false, div_e_err = false; - } - - void disableCB() - { - cbx = false, cby = false, cbz = false, div_b_err = false; - } - - void disableTCA() - { - tcax = false, tcay = false, tcaz = false, rhob = false; - } - - void disableJF() - { - jfx = false, jfy = false, jfz = false, rhof = false; - } - - void disableEMAT() - { - ematx = false, ematy = false, ematz = false, nmat = false; - } - - void disableFMAT() - { - fmatx = false, fmaty = false, fmatz = false, cmat = false; - } - - void resetToDefaults() - { - ex = true, ey = true, ez = true, div_e_err = true; - cbx = true, cby = true, cbz = true, div_b_err = true; - tcax = true, tcay = true, tcaz = true, rhob = true; - jfx = true, jfy = true, jfz = true, rhof = true; - ematx = true, ematy = true, ematz = true, nmat = true; - fmatx = true, fmaty = true, fmatz = true, cmat = true; - } - - bool enabledE() - { - return ex && ey && ez; - } - - bool enabledCB() - { - return cbx && cby && cbz; - } - - bool enabledTCA() - { - return tcax && tcay && tcaz; - } - - bool enabledJF() - { - return jfx && jfy && jfz; - } - - bool enabledEMAT() - { - return ematx && ematy && ematz; - } - - bool enabledFMAT() - { - return fmatx && fmaty && fmatz; - } -}; - -struct hydro_dump_flag_t -{ - bool jx = true, jy = true, jz = true, rho = true; - bool px = true, py = true, pz = true, ke = true; - bool txx 
= true, tyy = true, tzz = true; - bool tyz = true, tzx = true, txy = true; - - void disableJ() - { - jx = false, jy = false, jz = false, rho = false; - } - - void disableP() - { - px = false, py = false, pz = false, ke = false; - } - - void disableTD() //Stress diagonal - { - txx = false, tyy = false, tzz = false; - } - - void disableTOD() //Stress off-diagonal - { - tyz = false, tzx = false, txy = false; - } - void resetToDefaults() - { - jx = true, jy = true, jz = true, rho = true; - px = true, py = true, pz = true, ke = true; - txx = true, tyy = true, tzz = true; - tyz = true, tzx = true, txy = true; - } - - bool enabledJ() - { - return jx && jy && jz; - } - - bool enabledP() - { - return px && py && pz; - } - - bool enabledTD() - { - return txx && tyy && tzz; - } - - bool enabledTOD() - { - return tyz && tzx && txy; - } -}; -#endif - typedef FileIO FILETYPE; const uint32_t all (0xffffffff); @@ -266,6 +128,16 @@ class vpic_simulation { int advance( void ); void finalize( void ); + // TODO: decide if I should collapse this to an enum + // An enum would stop these ifdefs being so leaky + void enable_binary_dump(); +#ifdef VPIC_ENABLE_HDF5 + void enable_hdf5_dump(); +#endif +#ifdef VPIC_ENABLE_OPENPMD + void enable_openpmd_dump(); +#endif + protected: // Directly initialized by user @@ -389,12 +261,6 @@ class vpic_simulation { // a smart ptr will save us from the small leak std::unique_ptr dump_strategy; -#ifdef VPIC_ENABLE_HDF5 - // Declare vars to use - hydro_dump_flag_t hydro_dump_flag; - field_dump_flag_t field_dump_flag; -#endif - // convenience functions for simlog output void create_field_list(char * strlist, DumpParameters & dumpParams); void create_hydro_list(char * strlist, DumpParameters & dumpParams); From a90666677cd2e898606468fe58eb8e63d5130fed Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 12 Nov 2019 12:44:53 -0700 Subject: [PATCH 74/95] get pmd backend working too --- sample/harrisOpenPMD | 9 ++++++--- src/vpic/dump_strategy.h | 2 +- 2 files 
changed, 7 insertions(+), 4 deletions(-) diff --git a/sample/harrisOpenPMD b/sample/harrisOpenPMD index 16abb432..5b3274dc 100644 --- a/sample/harrisOpenPMD +++ b/sample/harrisOpenPMD @@ -42,6 +42,9 @@ begin_globals { }; begin_initialization { + + enable_openpmd_dump(); + // At this point, there is an empty grid and the random number generator is // seeded with the rank. The grid, materials, species need to be defined. // Then the initial non-zero fields need to be loaded at time level 0 and the @@ -370,8 +373,8 @@ begin_diagnostics { std::string openpm_field_name = "fields.h5"; //std::string openpm_field_name = "fields.bp"; - if( step()==-10 ) dump_fields_openpmd(openpm_field_name.c_str()); // Get first valid total J - if( should_dump(fields) ) dump_fields_openpmd(openpm_field_name.c_str()); + if( step()==-10 ) dump_fields(openpm_field_name.c_str()); // Get first valid total J + if( should_dump(fields) ) dump_fields(openpm_field_name.c_str()); // Hydro dumps store particle charge density, current density and // stress-energy tensor. All these quantities are known at the time @@ -390,7 +393,7 @@ begin_diagnostics { // are tagged with step(). However, if a "0" is added to the call, the // filename will not be tagged. Particle dumps are in a binary format. // Each rank makes a particle dump. 
- if( should_dump(eparticle) ) dump_particles_openpmd("electron","eparticle"); + if( should_dump(eparticle) ) dump_particles("electron","eparticle"); //if( should_dump(iparticle) ) dump_particles_hdf5("ion", "iparticle"); // A checkpt is made by calling checkpt( fbase, tag ) where fname is a string diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 08fe6e60..be22689f 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1123,7 +1123,7 @@ class HDF5Dump : public Dump_Strategy { #ifdef VPIC_ENABLE_OPENPMD class OpenPMDDump : public Dump_Strategy { public: - static openPMD::Series* series; + openPMD::Series* series; using Dump_Strategy::Dump_Strategy; // inherit constructor void dump_fields( const char *fbase, From 8fe8ace50c54ae6a7a52279d492ddd85fb93b88a Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 12 Nov 2019 16:49:02 -0700 Subject: [PATCH 75/95] Adding explicitly deleted constructors to better follow rule of 3. Adding missing header too --- src/vpic/dump_strategy.h | 1 + src/vpic/vpic.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index be22689f..aaf0bda6 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -3,6 +3,7 @@ #include #include +#include #include // TODO: it would be good if this didn't have to know about MPI diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index 118567d0..1513de41 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -123,6 +123,9 @@ class vpic_simulation { public: vpic_simulation(); ~vpic_simulation(); + vpic_simulation(const vpic_simulation&) = delete; + vpic_simulation& operator=(const vpic_simulation&) = delete; + void initialize( int argc, char **argv ); void modify( const char *fname ); int advance( void ); From 79faebb9247a8d80fb76cb3ff5d93fd219bfe06e Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 12 Nov 2019 18:00:30 -0700 Subject: [PATCH 76/95] remove needless call to copy constructor in unit tests 
--- test/unit/energy_comparison/3d_test.cc | 2 +- test/unit/energy_comparison/weibel_driver.cc | 2 +- test/unit/grid_heating/gridHeatingTestElec.cxx | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/unit/energy_comparison/3d_test.cc b/test/unit/energy_comparison/3d_test.cc index 4c44736b..d84e4627 100644 --- a/test/unit/energy_comparison/3d_test.cc +++ b/test/unit/energy_comparison/3d_test.cc @@ -312,7 +312,7 @@ TEST_CASE( "Check if Weibel gives correct energy (within tol)", "[energy]" ) ofs.close(); // Init and run sim - vpic_simulation simulation = vpic_simulation(); + vpic_simulation simulation; // TODO: We should do this in a safer manner simulation.initialize( 0, NULL ); diff --git a/test/unit/energy_comparison/weibel_driver.cc b/test/unit/energy_comparison/weibel_driver.cc index eb4702db..cbc64e2c 100644 --- a/test/unit/energy_comparison/weibel_driver.cc +++ b/test/unit/energy_comparison/weibel_driver.cc @@ -310,7 +310,7 @@ TEST_CASE( "Check if Weibel gives correct energy (within tol)", "[energy]" ) ofs.close(); // Init and run sim - vpic_simulation simulation = vpic_simulation(); + vpic_simulation simulation; // TODO: We should do this in a safer manner simulation.initialize( 0, NULL ); diff --git a/test/unit/grid_heating/gridHeatingTestElec.cxx b/test/unit/grid_heating/gridHeatingTestElec.cxx index dc0e1ca5..9e804588 100644 --- a/test/unit/grid_heating/gridHeatingTestElec.cxx +++ b/test/unit/grid_heating/gridHeatingTestElec.cxx @@ -249,7 +249,7 @@ begin_initialization { TEST_CASE( "Check if Weibel gives correct energy (within tol)", "[energy]" ) { // Init and run sim - vpic_simulation simulation = vpic_simulation(); + vpic_simulation simulation; // TODO: We should do this in a safer manner simulation.initialize( 0, NULL ); From 37af4279abd3d7c494b3b74db8174f3b91002c7c Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Thu, 14 Nov 2019 08:49:57 -0700 Subject: [PATCH 77/95] change pmd series object to be a stack object --- 
src/vpic/dump_strategy.h | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index aaf0bda6..2a9356dd 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1124,7 +1124,7 @@ class HDF5Dump : public Dump_Strategy { #ifdef VPIC_ENABLE_OPENPMD class OpenPMDDump : public Dump_Strategy { public: - openPMD::Series* series; + //openPMD::Series* series; using Dump_Strategy::Dump_Strategy; // inherit constructor void dump_fields( const char *fbase, @@ -1136,17 +1136,17 @@ class OpenPMDDump : public Dump_Strategy { { std::cout << "Writing openPMD data" << std::endl; - if (series == nullptr) { + //if (series == nullptr) { std::cout << "init series" << std::endl; - series = new openPMD::Series( + openPMD::Series series = openPMD::Series( fbase, openPMD::AccessType::CREATE, MPI_COMM_WORLD - ); - } + ); + //} std::cout << "Writing itration " << step << std::endl; - auto i = series->iterations[ step ]; + auto i = series.iterations[ step ]; // TODO: it would be nice to set these... 
//series.setAuthor( "Axel Huebl "); //series.setMachine( "Hall Probe 5000, Model 3"); @@ -1175,7 +1175,7 @@ class OpenPMDDump : public Dump_Strategy { size_t gnx = (grid->nx * grid->gpx); size_t gny = (grid->ny * grid->gpy); size_t gnz = (grid->nz * grid->gpz); - openPMD::Extent global_extent = {gny, gny, gnz}; + openPMD::Extent global_extent = {gnx, gny, gnz}; openPMD::Datatype datatype = openPMD::determineDatatype(); openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); @@ -1208,6 +1208,12 @@ class OpenPMDDump : public Dump_Strategy { openPMD::Offset chunk_offset = {global_offset_x, global_offset_y, global_offset_z}; openPMD::Extent chunk_extent = {nx, ny, nz}; + std::cout << "Local offset " << + " x: " << global_offset_x << + " y: " << global_offset_y << + " z: " << global_offset_z << + std::endl; + // Store a local copy of the data which we pull out of the AoS std::vector cbx_data; std::vector cby_data; @@ -1276,7 +1282,7 @@ class OpenPMDDump : public Dump_Strategy { Jy.storeChunk( jy_data, chunk_offset, chunk_extent); Jz.storeChunk( jz_data, chunk_offset, chunk_extent); - series->flush(); + series.flush(); } void dump_particles( const char *fbase, @@ -1287,16 +1293,16 @@ class OpenPMDDump : public Dump_Strategy { int ftag ) { - if (series == nullptr) { + //if (series == nullptr) { std::cout << "init series" << std::endl; - series = new openPMD::Series( + openPMD::Series series = openPMD::Series( fbase, openPMD::AccessType::CREATE, MPI_COMM_WORLD ); - } + //} - auto i = series->iterations[ step ]; + auto i = series.iterations[ step ]; // TODO: set these i.setTime( (float)step ); @@ -1356,6 +1362,7 @@ class OpenPMDDump : public Dump_Strategy { } + series.flush(); } void dump_hydro( const char *fbase, From 208729730c8c5dce23cac8f1444f24918e78f596 Mon Sep 17 00:00:00 2001 From: Robert Francis Bird - 294511 Date: Wed, 15 Jan 2020 11:43:05 -0700 Subject: [PATCH 78/95] fix bug where vpic_simulation class variables were not inited. 
If a deck forgot to do it we ran with UB --- src/vpic/vpic.cc | 11 ++++---- src/vpic/vpic.h | 69 ++++++++++++++++++++++++++---------------------- 2 files changed, 43 insertions(+), 37 deletions(-) diff --git a/src/vpic/vpic.cc b/src/vpic/vpic.cc index 9d36ff3e..cc081358 100644 --- a/src/vpic/vpic.cc +++ b/src/vpic/vpic.cc @@ -78,11 +78,12 @@ vpic_simulation::vpic_simulation() // Is this just trying to 0 initialize everything? // CLEAR( this, 1 ); - /* Set non-zero defaults */ - verbose = 1; - num_comm_round = 3; - num_div_e_round = 2; - num_div_b_round = 2; + // Now done in the class def / header + ///* Set non-zero defaults */ + //verbose = 1; + //num_comm_round = 3; + //num_div_e_round = 2; + //num_div_b_round = 2; #if defined(VPIC_USE_PTHREADS) // Pthreads case. int n_rng = serial.n_pipeline; diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index 1513de41..615b6191 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -145,50 +145,55 @@ class vpic_simulation { // Directly initialized by user - int verbose; // Should system be verbose - int num_step; // Number of steps to take - int num_comm_round; // Num comm round - int status_interval; // How often to print status messages - int clean_div_e_interval; // How often to clean div e - int num_div_e_round; // How many clean div e rounds per div e interval - int clean_div_b_interval; // How often to clean div b - int num_div_b_round; // How many clean div b rounds per div b interval - int sync_shared_interval; // How often to synchronize shared faces + int verbose = 1; // Should system be verbose + int num_step = 1; // Number of steps to take + int num_comm_round = 3; // Num comm round + int status_interval = 0; // How often to print status messages + + int clean_div_e_interval = 0; // How often to clean div e + int num_div_e_round = 2; // How many clean div e rounds per div e interval + + int clean_div_b_interval = 0; // How often to clean div b + int num_div_b_round = 2; // How many clean div b rounds per div b 
interval + + int sync_shared_interval = 0; // How often to synchronize shared faces // FIXME: THESE INTERVALS SHOULDN'T BE PART OF vpic_simulation // THE BIG LIST FOLLOWING IT SHOULD BE CLEANED UP TOO - double quota; - int checkpt_interval; - int hydro_interval; - int field_interval; - int particle_interval; + double quota = 0; + int checkpt_interval = 0; + int hydro_interval = 0; + int field_interval = 0; + int particle_interval = 0; // TODO: these can probably now be removed, as they should only be used by dump? // TODO: check if any decks used them //size_t nxout, nyout, nzout; //float dxout, dyout, dzout; - size_t px, py, pz; - - int ndfld; - int ndhyd; - int ndpar; - int ndhis; - int ndgrd; - int head_option; - int istride; - int jstride; - int kstride; - int stride_option; - int pstride; - int nprobe; + size_t px = 0; + size_t py = 0; + size_t pz = 0; + + int ndfld = 0; + int ndhyd = 0; + int ndpar = 0; + int ndhis = 0; + int ndgrd = 0; + int head_option = 0; + int istride = 0; + int jstride = 0; + int kstride = 0; + int stride_option = 0; + int pstride = 0; + int nprobe = 0; int ijkprobe[NVARHISMX][4]; float xyzprobe[NVARHISMX][3]; - int block_dump; - int stepdigit; - int rankdigit; - int ifenergies; + int block_dump = 0; + int stepdigit = 0; + int rankdigit = 0; + int ifenergies = 0; // Helper initialized by user From 6a753346f5b4edd6dacf901c5ae132105d989575 Mon Sep 17 00:00:00 2001 From: Robert Francis Bird - 294511 Date: Wed, 15 Jan 2020 11:52:20 -0700 Subject: [PATCH 79/95] change dump_Strategy constructor interface to not take num_steps, as it's not known when it's currently constructed. 
I instead now pass it in after user init, but the interface is a mess --- deck/main.cc | 6 ++++++ src/vpic/dump.cc | 6 +++--- src/vpic/dump_strategy.h | 7 +++---- src/vpic/vpic.h | 12 ++++++++---- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/deck/main.cc b/deck/main.cc index f9f7fb1b..8a9d2352 100644 --- a/deck/main.cc +++ b/deck/main.cc @@ -98,6 +98,12 @@ int main(int argc, char** argv) } simulation = new vpic_simulation(); simulation->initialize( argc, argv ); + + // do post init setup to consume deck values + // which includes setting dump starts steps, as we didn't know it sooner + // TODO: make this use sane functions + simulation->dump_strategy->num_step = simulation->num_step; + REGISTER_OBJECT( &simulation, checkpt_main, restore_main, NULL ); } diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 9ac120a2..46c5ad02 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -66,20 +66,20 @@ int vpic_simulation::dump_cwd(char * dname, size_t size) { *****************************************************************************/ void vpic_simulation::enable_binary_dump() { - dump_strategy = std::unique_ptr(new BinaryDump( rank(), nproc(), num_step )); + dump_strategy = std::unique_ptr(new BinaryDump( rank(), nproc() )); } #ifdef VPIC_ENABLE_HDF5 void vpic_simulation::enable_hdf5_dump() { std::cout << "Enabling HDF5 IO backend" << std::endl; - dump_strategy = std::unique_ptr(new HDF5Dump( rank(), nproc(), num_step )); + dump_strategy = std::unique_ptr(new HDF5Dump( rank(), nproc() )); } #endif #ifdef VPIC_ENABLE_OPENPMD void vpic_simulation::enable_openpmd_dump() { std::cout << "Enabling openPMD IO backend" << std::endl; - dump_strategy = std::unique_ptr(new OpenPMDDump( rank(), nproc(), num_step )); + dump_strategy = std::unique_ptr(new OpenPMDDump( rank(), nproc() )); } #endif diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 2a9356dd..f603cb34 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -37,10 
+37,9 @@ class Dump_Strategy { public: int rank, nproc, num_step; - Dump_Strategy(int _rank, int _nproc, int total_steps) : + Dump_Strategy(int _rank, int _nproc ) : rank(_rank), - nproc(_nproc), - num_step(total_steps) // TODO: remove the need for this + nproc(_nproc) { } // empty virtual ~Dump_Strategy() { }; @@ -74,7 +73,7 @@ class Dump_Strategy { class BinaryDump : public Dump_Strategy { public: using Dump_Strategy::Dump_Strategy; // inherit constructor - BinaryDump(int _rank, int _nproc, int total_steps) : Dump_Strategy(_rank, _nproc, total_steps){ } // empty + //BinaryDump(int _rank, int _nproc ) : Dump_Strategy(_rank, _nproc ){ } // empty // TODO: now we pass rank and step, ftag has odd semanticds void dump_fields( diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index 615b6191..bbc75235 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -141,12 +141,19 @@ class vpic_simulation { void enable_openpmd_dump(); #endif + // TODO: remake these protected + + // Very likely a user will forgot to delete this if they change the strategy, + // a smart ptr will save us from the small leak + std::unique_ptr dump_strategy; + + int num_step = 1; // Number of steps to take + protected: // Directly initialized by user int verbose = 1; // Should system be verbose - int num_step = 1; // Number of steps to take int num_comm_round = 3; // Num comm round int status_interval = 0; // How often to print status messages @@ -265,9 +272,6 @@ class vpic_simulation { void dump_particles( const char *sp_name, const char *fbase, int fname_tag = 1 ); - // Very likely a user will forgot to delete this if they change the strategy, - // a smart ptr will save us from the small leak - std::unique_ptr dump_strategy; // convenience functions for simlog output void create_field_list(char * strlist, DumpParameters & dumpParams); From 831b774a72d76d308cd5217c7af4d1719f55651c Mon Sep 17 00:00:00 2001 From: Robert Francis Bird - 294511 Date: Wed, 15 Jan 2020 15:04:14 -0700 Subject: [PATCH 80/95] port 
H5_ALL bug found by Patrick, and replace with linearspace --- src/vpic/dump_strategy.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index f603cb34..014ab0d8 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -650,9 +650,8 @@ class HDF5Dump : public Dump_Strategy { hsize_t memspace_count_temp = numparticles * 8; hid_t memspace = H5Screate_simple(1, &memspace_count_temp, NULL); - // Don't need, can just use H5S_ALL - //hsize_t linearspace_count_temp = numparticles; - //hid_t linearspace = H5Screate_simple(1, &linearspace_count_temp, NULL); + hsize_t linearspace_count_temp = numparticles; + hid_t linearspace = H5Screate_simple(1, &linearspace_count_temp, NULL); plist_id = H5Pcreate(H5P_DATASET_XFER); @@ -717,7 +716,7 @@ class HDF5Dump : public Dump_Strategy { } dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, H5S_ALL, filespace, plist_id, global_pi.data()); + ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, linearspace, filespace, plist_id, global_pi.data()); H5Dclose(dset_id); #else From 2e108b95229e6101e8b105f7eb24f56faead23d6 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 21 Jan 2020 11:45:55 -0700 Subject: [PATCH 81/95] added particle and field dump code for openpmd --- src/vpic/dump_strategy.h | 158 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 157 insertions(+), 1 deletion(-) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 014ab0d8..df3be2a8 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1153,6 +1153,11 @@ class OpenPMDDump : public Dump_Strategy { auto cB = i.meshes["B"]; auto E = i.meshes["E"]; auto J = i.meshes["J"]; + auto Tca = i.meshes["Tca"]; + auto Emat = i.meshes["Emat"]; + auto Fmat = i.meshes["Fmat"]; + auto Rho = i.meshes["Rho"]; + auto DivErr = i.meshes["DivErr"]; // record components auto cbx = 
cB["x"]; @@ -1167,8 +1172,28 @@ class OpenPMDDump : public Dump_Strategy { auto Jy = J["y"]; auto Jz = J["z"]; + auto Tcax = Tca["x"]; + auto Tcay = Tca["y"]; + auto Tcaz = Tca["z"]; + + auto Ematx = Emat["x"]; + auto Ematy = Emat["y"]; + auto Ematz = Emat["z"]; + + auto Fmatx = Fmat["x"]; + auto Fmaty = Fmat["y"]; + auto Fmatz = Fmat["z"]; + + auto RhoB = Rho["B"]; + auto RhoF = Rho["F"]; + + auto DivEErr = DivErr["E"]; + auto DivBErr = DivErr["B"]; + // TODO: set unitDimension so the anaylsis software knows what fields // things are + // + // // TODO: add timers for the convert and for the write size_t gnx = (grid->nx * grid->gpx); size_t gny = (grid->ny * grid->gpy); @@ -1190,6 +1215,24 @@ class OpenPMDDump : public Dump_Strategy { Jy.resetDataset(dataset); Jz.resetDataset(dataset); + Tcax.resetDataset(dataset); + Tcay.resetDataset(dataset); + Tcaz.resetDataset(dataset); + + Ematx.resetDataset(dataset); + Ematy.resetDataset(dataset); + Ematz.resetDataset(dataset); + + Fmatx.resetDataset(dataset); + Fmaty.resetDataset(dataset); + Fmatz.resetDataset(dataset); + + RhoB.resetDataset(dataset); + RhoF.resetDataset(dataset); + + DivEErr.resetDataset(dataset); + DivBErr.resetDataset(dataset); + // Convert rank to local x/y/z int rx, ry, rz; UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); @@ -1225,6 +1268,26 @@ class OpenPMDDump : public Dump_Strategy { std::vector jy_data; std::vector jz_data; + std::vector tcax_data; + std::vector tcay_data; + std::vector tcaz_data; + + // TODO: these are material_id (ints not floats) + std::vector ematx_data; + std::vector ematy_data; + std::vector ematz_data; + + std::vector fmatx_data; + std::vector fmaty_data; + std::vector fmatz_data; + // end todo + + std::vector rhob_data; + std::vector rhof_data; + + std::vector divb_data; + std::vector dive_data; + size_t nv = nx * ny * nz; cbx_data.reserve(nv); @@ -1239,6 +1302,24 @@ class OpenPMDDump : public Dump_Strategy { jy_data.reserve(nv); jz_data.reserve(nv); + 
tcax_data.reserve(nv); + tcay_data.reserve(nv); + tcaz_data.reserve(nv); + + ematx_data.reserve(nv); + ematy_data.reserve(nv); + ematz_data.reserve(nv); + + fmatx_data.reserve(nv); + fmaty_data.reserve(nv); + fmatz_data.reserve(nv); + + rhob_data.reserve(nv); + rhof_data.reserve(nv); + + divb_data.reserve(nv); + dive_data.reserve(nv); + // TODO: make this AoS to SoA conversion a function // We could do 1D here, but we don't really care about the ghosts, and we @@ -1264,6 +1345,24 @@ class OpenPMDDump : public Dump_Strategy { jx_data[local_index] = field_array->f[global_index].jfx; jy_data[local_index] = field_array->f[global_index].jfy; jz_data[local_index] = field_array->f[global_index].jfz; + + tcax_data[local_index] = field_array->f[global_index].tcax; + tcay_data[local_index] = field_array->f[global_index].tcay; + tcaz_data[local_index] = field_array->f[global_index].tcaz; + + ematx_data[local_index] = field_array->f[global_index].ematx; + ematy_data[local_index] = field_array->f[global_index].ematy; + ematz_data[local_index] = field_array->f[global_index].ematz; + + fmatx_data[local_index] = field_array->f[global_index].fmatx; + fmaty_data[local_index] = field_array->f[global_index].fmaty; + fmatz_data[local_index] = field_array->f[global_index].fmatz; + + rhob_data[local_index] = field_array->f[global_index].rhob; + rhof_data[local_index] = field_array->f[global_index].rhof; + + dive_data[local_index] = field_array->f[global_index].dive; + divb_data[local_index] = field_array->f[global_index].divb; } } } @@ -1280,8 +1379,27 @@ class OpenPMDDump : public Dump_Strategy { Jy.storeChunk( jy_data, chunk_offset, chunk_extent); Jz.storeChunk( jz_data, chunk_offset, chunk_extent); + Tcax.storeChunk( tcax_data, chunk_offset, chunk_extent); + Tcay.storeChunk( tcay_data, chunk_offset, chunk_extent); + Tcaz.storeChunk( tcaz_data, chunk_offset, chunk_extent); + + Ematx.storeChunk( ematx_data, chunk_offset, chunk_extent); + Ematy.storeChunk( ematy_data, chunk_offset, 
chunk_extent); + Ematz.storeChunk( ematz_data, chunk_offset, chunk_extent); + + Fmatx.storeChunk( fmatx_data, chunk_offset, chunk_extent); + Fmaty.storeChunk( fmaty_data, chunk_offset, chunk_extent); + Fmatz.storeChunk( fmatz_data, chunk_offset, chunk_extent); + + RhoB.storeChunk( rhob_data, chunk_offset, chunk_extent); + RhoF.storeChunk( rhof, chunk_offset, chunk_extent); + + DivEErr.storeChunk( dive_data, chunk_offset, chunk_extent); + DivBErr.storeChunk( divb_data, chunk_offset, chunk_extent); + series.flush(); } + void dump_particles( const char *fbase, species_t* sp, @@ -1343,13 +1461,42 @@ class OpenPMDDump : public Dump_Strategy { x_pos.reserve(to_write); x_off.reserve(to_write); + std::vector y_pos; + std::vector y_off; + y_pos.reserve(to_write); + y_off.reserve(to_write); + + std::vector z_pos; + std::vector z_off; + z_pos.reserve(to_write); + z_off.reserve(to_write); + + std::vector ux_pos; + ux_pos.reserve(to_write); + + std::vector uy_pos; + uy_pos.reserve(to_write); + + std::vector uz_pos; + uz_pos.reserve(to_write); + for (int j = 0; j < to_write; j++) { // TODO: do I need to center the particles? 
auto& particle = sp->p[i+j]; + x_pos[j] = particle.dx; + y_pos[j] = particle.dy; + z_pos[j] = particle.dz; + + ux_pos[j] = particle.ux; + uy_pos[j] = particle.uy; + uz_pos[j] = particle.uz; + std::array gi = global_particle_index(particle.i, grid, rank); x_off[j] = (float)gi[1]; + y_off[j] = (float)gi[2]; + z_off[j] = (float)gi[3]; } // Base offset plus i to account for chunks @@ -1357,8 +1504,17 @@ class OpenPMDDump : public Dump_Strategy { auto e = openPMD::Extent{to_write}; px.storeChunk(x_pos, o, e); pxo.storeChunk(x_off, o, e); - } + py.storeChunk(y_pos, o, e); + pyo.storeChunk(y_off, o, e); + + pz.storeChunk(z_pos, o, e); + pzo.storeChunk(z_off, o, e); + + ux.storeChunk(ux_pos, o, e); + uy.storeChunk(uy_pos, o, e); + uz.storeChunk(uz_pos, o, e); + } series.flush(); } From 5abef64053efae9cbdff5b37ac1f6d1fff6f9b23 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 21 Jan 2020 11:46:34 -0700 Subject: [PATCH 82/95] add note on compile time option --- src/vpic/dump_strategy.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 2a9356dd..97dadfe5 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -682,8 +682,10 @@ class HDF5Dump : public Dump_Strategy { ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 2); H5Dclose(dset_id); + // TODO: make this a compile time option #define OUTPUT_CONVERT_GLOBAL_ID 1 #ifdef OUTPUT_CONVERT_GLOBAL_ID + // TODO: make a function out of this too, its used in openpmd std::vector global_pi; global_pi.reserve(numparticles); From 6e16fa001eb82174f5ce85f95c1e2fd7af21c6a4 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 21 Jan 2020 11:50:49 -0700 Subject: [PATCH 83/95] merge local code and fix up compilation of new openpmd code --- src/vpic/dump_strategy.h | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 9eaec6d9..65447c04 
100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1363,8 +1363,8 @@ class OpenPMDDump : public Dump_Strategy { rhob_data[local_index] = field_array->f[global_index].rhob; rhof_data[local_index] = field_array->f[global_index].rhof; - dive_data[local_index] = field_array->f[global_index].dive; - divb_data[local_index] = field_array->f[global_index].divb; + dive_data[local_index] = field_array->f[global_index].div_e_err; + divb_data[local_index] = field_array->f[global_index].div_b_err; } } } @@ -1394,7 +1394,7 @@ class OpenPMDDump : public Dump_Strategy { Fmatz.storeChunk( fmatz_data, chunk_offset, chunk_extent); RhoB.storeChunk( rhob_data, chunk_offset, chunk_extent); - RhoF.storeChunk( rhof, chunk_offset, chunk_extent); + RhoF.storeChunk( rhof_data, chunk_offset, chunk_extent); DivEErr.storeChunk( dive_data, chunk_offset, chunk_extent); DivBErr.storeChunk( divb_data, chunk_offset, chunk_extent); @@ -1445,9 +1445,29 @@ class OpenPMDDump : public Dump_Strategy { auto px = p["position"]["x"]; auto pxo = p["positionOffset"]["x"]; + auto py = p["position"]["y"]; + auto pyo = p["positionOffset"]["y"]; + + auto pz = p["position"]["z"]; + auto pzo = p["positionOffset"]["z"]; + + auto ux = p["velocity"]["x"]; + auto uy = p["velocity"]["y"]; + auto uz = p["velocity"]["z"]; + px.resetDataset(dataset); pxo.resetDataset(dataset); + py.resetDataset(dataset); + pyo.resetDataset(dataset); + + pz.resetDataset(dataset); + pzo.resetDataset(dataset); + + ux.resetDataset(dataset); + uy.resetDataset(dataset); + uz.resetDataset(dataset); + // convert data to SoA, allowing the user to chunk the operation const int max_chunk = 32768*8; // 1MB SoA // Loop over all particles in chunks From 3a2e2cc4c0466ad63543db9d86b63cbaccaf8e28 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 21 Jan 2020 12:15:07 -0700 Subject: [PATCH 84/95] particles and fields both pass pmd validator, with no errors. 
Some warnings (such as author) need fixing --- sample/harrisOpenPMD | 2 ++ src/vpic/dump.cc | 2 +- src/vpic/dump_strategy.h | 14 ++++++++++---- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/sample/harrisOpenPMD b/sample/harrisOpenPMD index 5b3274dc..568710d9 100644 --- a/sample/harrisOpenPMD +++ b/sample/harrisOpenPMD @@ -371,6 +371,8 @@ begin_diagnostics { // been completed. Field dumps are in a binary format. Each rank makes a // field dump. + // TODO: passing in the field extension as part of the name doesn't work for + // the other functions, as they use it to look up species std::string openpm_field_name = "fields.h5"; //std::string openpm_field_name = "fields.bp"; if( step()==-10 ) dump_fields(openpm_field_name.c_str()); // Get first valid total J diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 46c5ad02..65d15910 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -87,7 +87,7 @@ void vpic_simulation::dump_particles( const char *sp_name, const char *fbase, int ftag ) { - species_t * sp = find_species_name(sp_name, species_list); + species_t* sp = find_species_name(sp_name, species_list); dump_strategy->dump_particles( fbase, sp, diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 65447c04..5efa5c4f 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1145,7 +1145,7 @@ class OpenPMDDump : public Dump_Strategy { ); //} - std::cout << "Writing itration " << step << std::endl; + std::cout << "Writing iteration " << step << std::endl; auto i = series.iterations[ step ]; // TODO: it would be nice to set these... 
//series.setAuthor( "Axel Huebl "); @@ -1411,13 +1411,19 @@ class OpenPMDDump : public Dump_Strategy { int ftag ) { + // TODO: move this to class level, and make it user settable, so it + // can be used more widely + std::string file_type = ".h5"; + std::string full_file_name = fbase + file_type; + + std::cout << "writing particles to " << full_file_name << std::endl; + //if (series == nullptr) { - std::cout << "init series" << std::endl; openPMD::Series series = openPMD::Series( - fbase, + full_file_name, openPMD::AccessType::CREATE, MPI_COMM_WORLD - ); + ); //} auto i = series.iterations[ step ]; From f8b45c83c5191e293ba039cfcfc21c1bd0324766 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 21 Jan 2020 12:34:44 -0700 Subject: [PATCH 85/95] first pass adding hydro dump for openpmd --- sample/harrisOpenPMD | 2 +- src/vpic/dump_strategy.h | 221 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 213 insertions(+), 10 deletions(-) diff --git a/sample/harrisOpenPMD b/sample/harrisOpenPMD index 568710d9..fdddccab 100644 --- a/sample/harrisOpenPMD +++ b/sample/harrisOpenPMD @@ -387,7 +387,7 @@ begin_diagnostics { // purely diagnostic. It is not used by the simulation and it is not // accumulated using a self-consistent charge-conserving method. Hydro dumps // are in a binary format. Each rank makes a hydro dump. - //if(should_dump(ehydro) ) dump_hydro_hdf5("electron","ehydro"); + if(should_dump(ehydro) ) dump_hydro("electron","ehydro"); //if( should_dump(ihydro) ) dump_hydro_hdf5("ion", "ihydro"); // Particle dumps store the particle data for a given species. 
The data diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 5efa5c4f..56aacb4e 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1162,9 +1162,9 @@ class OpenPMDDump : public Dump_Strategy { auto DivErr = i.meshes["DivErr"]; // record components - auto cbx = cB["x"]; - auto cby = cB["y"]; - auto cbz = cB["z"]; + auto Cbx = cB["x"]; + auto Cby = cB["y"]; + auto Cbz = cB["z"]; auto Ex = E["x"]; auto Ey = E["y"]; @@ -1205,9 +1205,9 @@ class OpenPMDDump : public Dump_Strategy { openPMD::Datatype datatype = openPMD::determineDatatype(); openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); - cbx.resetDataset(dataset); - cby.resetDataset(dataset); - cbz.resetDataset(dataset); + Cbx.resetDataset(dataset); + Cby.resetDataset(dataset); + Cbz.resetDataset(dataset); Ex.resetDataset(dataset); Ey.resetDataset(dataset); @@ -1235,6 +1235,7 @@ class OpenPMDDump : public Dump_Strategy { DivEErr.resetDataset(dataset); DivBErr.resetDataset(dataset); + // TODO: hoist this conversion code, as is it used elsewhere // Convert rank to local x/y/z int rx, ry, rz; UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); @@ -1369,9 +1370,9 @@ class OpenPMDDump : public Dump_Strategy { } } - cbx.storeChunk( cbx_data, chunk_offset, chunk_extent); - cby.storeChunk( cby_data, chunk_offset, chunk_extent); - cbz.storeChunk( cbz_data, chunk_offset, chunk_extent); + Cbx.storeChunk( cbx_data, chunk_offset, chunk_extent); + Cby.storeChunk( cby_data, chunk_offset, chunk_extent); + Cbz.storeChunk( cbz_data, chunk_offset, chunk_extent); Ex.storeChunk( ex_data, chunk_offset, chunk_extent); Ey.storeChunk( ey_data, chunk_offset, chunk_extent); @@ -1556,6 +1557,208 @@ class OpenPMDDump : public Dump_Strategy { int ftag ) { + // TODO: move this to class level, and make it user settable, so it + // can be used more widely + std::string file_type = ".h5"; + std::string full_file_name = fbase + file_type; + + std::cout << "OpenPMD dumping hydro to " << 
full_file_name << std::endl; + + //if (series == nullptr) { + openPMD::Series series = openPMD::Series( + full_file_name, + openPMD::AccessType::CREATE, + MPI_COMM_WORLD + ); + //} + + auto i = series.iterations[ step ]; + + // TODO: set these + i.setTime( (float)step ); + i.setDt(1.0); + i.setTimeUnitSI(1.0); + + if( !sp ) ERROR(( "Invalid species \"%s\"", sp->name )); + + // TODO: do we want each backend to have to explicitly call these + // manually? Or, as it is common, should we hoist it to the VPIC + // call-site + clear_hydro_array( hydro_array ); + accumulate_hydro_p( hydro_array, sp, interpolator_array ); + synchronize_hydro_array( hydro_array ); + + if( !fbase ) ERROR(( "Invalid filename" )); + + if( rank==0 ) + MESSAGE(("Dumping \"%s\" hydro fields to \"%s\"",sp->name,fbase)); + + // Write data + //float jx, jy, jz, rho; // Current and charge density => , + //float px, py, pz, ke; // Momentum and K.E. density => , + //float txx, tyy, tzz; // Stress diagonal => , i==j + //float tyz, tzx, txy; // Stress off-diagonal => , i!=j + auto J = i.meshes["J"]; + auto P = i.meshes["P"]; + auto T = i.meshes["T"]; + auto _Ke = i.meshes["Ke"]; + auto _Rho = i.meshes["Rho"]; + + auto Jx = J["x"]; + auto Jy = J["y"]; + auto Jz = J["z"]; + + auto Px = P["x"]; + auto Py = P["y"]; + auto Pz = P["z"]; + + auto Txx = T["xx"]; + auto Tyy = T["yy"]; + auto Tzz = T["zz"]; + auto Tyz = T["yz"]; + auto Tzx = T["zx"]; + auto Txy = T["xy"]; + + auto Rho = _Rho["rho"]; // TODO: bad name.. + auto Ke = _Ke["ke"]; // TODO: bad name.. 
+ + size_t gnx = (grid->nx * grid->gpx); + size_t gny = (grid->ny * grid->gpy); + size_t gnz = (grid->nz * grid->gpz); + openPMD::Extent global_extent = {gnx, gny, gnz}; + + openPMD::Datatype datatype = openPMD::determineDatatype(); + openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); + + Jx.resetDataset(dataset); + Jy.resetDataset(dataset); + Jz.resetDataset(dataset); + + Px.resetDataset(dataset); + Py.resetDataset(dataset); + Pz.resetDataset(dataset); + + Txx.resetDataset(dataset); + Tyy.resetDataset(dataset); + Tzz.resetDataset(dataset); + Tyz.resetDataset(dataset); + Tzx.resetDataset(dataset); + Txy.resetDataset(dataset); + + Rho.resetDataset(dataset); + Ke.resetDataset(dataset); + + // TODO: hoist this conversion code, as is it used elsewhere + // Convert rank to local x/y/z + int rx, ry, rz; + UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + size_t nx = grid->nx; + size_t ny = grid->ny; + size_t nz = grid->nz; + + // NOTE: this assumes a static mesh decomposition in nx/ny/nz + size_t global_offset_x = (nx) * rx; + size_t global_offset_y = (ny) * ry; + size_t global_offset_z = (nz) * rz; + + openPMD::Offset chunk_offset = {global_offset_x, global_offset_y, global_offset_z}; + openPMD::Extent chunk_extent = {nx, ny, nz}; + + std::cout << "Local offset " << + " x: " << global_offset_x << + " y: " << global_offset_y << + " z: " << global_offset_z << + std::endl; + + std::vector jx_data; + std::vector jy_data; + std::vector jz_data; + + std::vector px_data; + std::vector py_data; + std::vector pz_data; + + std::vector txx_data; + std::vector tyy_data; + std::vector tzz_data; + std::vector tyz_data; + std::vector tzx_data; + std::vector txy_data; + + std::vector rho_data; + std::vector ke_data; + + size_t nv = nx * ny * nz; + + jx_data.reserve(nv); + jy_data.reserve(nv); + jz_data.reserve(nv); + + px_data.reserve(nv); + py_data.reserve(nv); + pz_data.reserve(nv); + + txx_data.reserve(nv); + tyy_data.reserve(nv); + 
tzz_data.reserve(nv); + tyz_data.reserve(nv); + tzx_data.reserve(nv); + txy_data.reserve(nv); + + rho_data.reserve(nv); + ke_data.reserve(nv); + + // Transpose AoS to SoAs + for (size_t k = 1; k < grid->nz + 1; k++) + { + for (size_t j = 1; j < grid->ny + 1; j++) + { + for (size_t i = 1; i < grid->nx + 1; i++) + { + int local_index = VOXEL(i-1, j-1, k-1, grid->nx-2, grid->ny-2, grid->nz-2); + int global_index = VOXEL(i, j, k, grid->nx, grid->ny, grid->nz); + + jx_data[local_index] = hydro_array->h[global_index].jx; + jy_data[local_index] = hydro_array->h[global_index].jy; + jz_data[local_index] = hydro_array->h[global_index].jz; + + px_data[local_index] = hydro_array->h[global_index].px; + py_data[local_index] = hydro_array->h[global_index].py; + pz_data[local_index] = hydro_array->h[global_index].pz; + + txx_data[local_index] = hydro_array->h[global_index].txx; + tyy_data[local_index] = hydro_array->h[global_index].tyy; + tzz_data[local_index] = hydro_array->h[global_index].tzz; + tyz_data[local_index] = hydro_array->h[global_index].tyz; + tzx_data[local_index] = hydro_array->h[global_index].tzx; + txy_data[local_index] = hydro_array->h[global_index].txy; + + rho_data[local_index] = hydro_array->h[global_index].rho; + ke_data[local_index] = hydro_array->h[global_index].ke; + } + } + } + + Jx.storeChunk( jx_data, chunk_offset, chunk_extent); + Jy.storeChunk( jy_data, chunk_offset, chunk_extent); + Jz.storeChunk( jz_data, chunk_offset, chunk_extent); + + Px.storeChunk( px_data, chunk_offset, chunk_extent); + Py.storeChunk( py_data, chunk_offset, chunk_extent); + Pz.storeChunk( pz_data, chunk_offset, chunk_extent); + + Txx.storeChunk( txx_data, chunk_offset, chunk_extent); + Tyy.storeChunk( tyy_data, chunk_offset, chunk_extent); + Tzz.storeChunk( tzz_data, chunk_offset, chunk_extent); + Tyz.storeChunk( tyz_data, chunk_offset, chunk_extent); + Tzx.storeChunk( tzx_data, chunk_offset, chunk_extent); + Txy.storeChunk( txy_data, chunk_offset, chunk_extent); + + 
Rho.storeChunk( rho_data, chunk_offset, chunk_extent); + Ke.storeChunk( ke_data, chunk_offset, chunk_extent); + + series.flush(); } }; #endif From 0ff039587ba199838eba60f067cb2e6a8fe95e4a Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Thu, 30 Jan 2020 11:14:06 -0700 Subject: [PATCH 86/95] fix vector semantic where reserve was used instead of resize and increase write chunk size to 256MB. also fix bug where pmd particles were not being flushed at proper times --- src/vpic/dump_strategy.h | 100 ++++++++++++++++++++------------------- 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 56aacb4e..a1b96199 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -686,7 +686,7 @@ class HDF5Dump : public Dump_Strategy { // TODO: make a function out of this too, its used in openpmd std::vector global_pi; - global_pi.reserve(numparticles); + global_pi.resize(numparticles); // TODO: this could be parallel for (int i = 0; i < numparticles; i++) { @@ -1293,35 +1293,37 @@ class OpenPMDDump : public Dump_Strategy { size_t nv = nx * ny * nz; - cbx_data.reserve(nv); - cby_data.reserve(nv); - cbz_data.reserve(nv); + // TODO: resize here will zero out the data which we don't need, we + // could change to a different semantic to avoid this + cbx_data.resize(nv); + cby_data.resize(nv); + cbz_data.resize(nv); - ex_data.reserve(nv); - ey_data.reserve(nv); - ez_data.reserve(nv); + ex_data.resize(nv); + ey_data.resize(nv); + ez_data.resize(nv); - jx_data.reserve(nv); - jy_data.reserve(nv); - jz_data.reserve(nv); + jx_data.resize(nv); + jy_data.resize(nv); + jz_data.resize(nv); - tcax_data.reserve(nv); - tcay_data.reserve(nv); - tcaz_data.reserve(nv); + tcax_data.resize(nv); + tcay_data.resize(nv); + tcaz_data.resize(nv); - ematx_data.reserve(nv); - ematy_data.reserve(nv); - ematz_data.reserve(nv); + ematx_data.resize(nv); + ematy_data.resize(nv); + ematz_data.resize(nv); - fmatx_data.reserve(nv); - 
fmaty_data.reserve(nv); - fmatz_data.reserve(nv); + fmatx_data.resize(nv); + fmaty_data.resize(nv); + fmatz_data.resize(nv); - rhob_data.reserve(nv); - rhof_data.reserve(nv); + rhob_data.resize(nv); + rhof_data.resize(nv); - divb_data.reserve(nv); - dive_data.reserve(nv); + divb_data.resize(nv); + dive_data.resize(nv); // TODO: make this AoS to SoA conversion a function @@ -1476,7 +1478,7 @@ class OpenPMDDump : public Dump_Strategy { uz.resetDataset(dataset); // convert data to SoA, allowing the user to chunk the operation - const int max_chunk = 32768*8; // 1MB SoA + const int max_chunk = 32768*256*8; // 256MB SoA // Loop over all particles in chunks for (int i = 0; i < np; i += max_chunk) { @@ -1487,27 +1489,27 @@ class OpenPMDDump : public Dump_Strategy { // Convert the chunk ready to write std::vector x_pos; std::vector x_off; - x_pos.reserve(to_write); - x_off.reserve(to_write); + x_pos.resize(to_write); + x_off.resize(to_write); std::vector y_pos; std::vector y_off; - y_pos.reserve(to_write); - y_off.reserve(to_write); + y_pos.resize(to_write); + y_off.resize(to_write); std::vector z_pos; std::vector z_off; - z_pos.reserve(to_write); - z_off.reserve(to_write); + z_pos.resize(to_write); + z_off.resize(to_write); std::vector ux_pos; - ux_pos.reserve(to_write); + ux_pos.resize(to_write); std::vector uy_pos; - uy_pos.reserve(to_write); + uy_pos.resize(to_write); std::vector uz_pos; - uz_pos.reserve(to_write); + uz_pos.resize(to_write); for (int j = 0; j < to_write; j++) { @@ -1543,9 +1545,9 @@ class OpenPMDDump : public Dump_Strategy { ux.storeChunk(ux_pos, o, e); uy.storeChunk(uy_pos, o, e); uz.storeChunk(uz_pos, o, e); - } - series.flush(); + series.flush(); + } } void dump_hydro( const char *fbase, @@ -1691,23 +1693,23 @@ class OpenPMDDump : public Dump_Strategy { size_t nv = nx * ny * nz; - jx_data.reserve(nv); - jy_data.reserve(nv); - jz_data.reserve(nv); + jx_data.resize(nv); + jy_data.resize(nv); + jz_data.resize(nv); - px_data.reserve(nv); - 
py_data.reserve(nv); - pz_data.reserve(nv); + px_data.resize(nv); + py_data.resize(nv); + pz_data.resize(nv); - txx_data.reserve(nv); - tyy_data.reserve(nv); - tzz_data.reserve(nv); - tyz_data.reserve(nv); - tzx_data.reserve(nv); - txy_data.reserve(nv); + txx_data.resize(nv); + tyy_data.resize(nv); + tzz_data.resize(nv); + tyz_data.resize(nv); + tzx_data.resize(nv); + txy_data.resize(nv); - rho_data.reserve(nv); - ke_data.reserve(nv); + rho_data.resize(nv); + ke_data.resize(nv); // Transpose AoS to SoAs for (size_t k = 1; k < grid->nz + 1; k++) From c6d01c668f35686a30498e8156d21d9781cf1793 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Sun, 2 Feb 2020 11:34:20 -0700 Subject: [PATCH 87/95] add code to expose user definable max openpmd particle write chunk size --- CMakeLists.txt | 2 ++ src/vpic/dump_strategy.cc | 3 +++ src/vpic/dump_strategy.h | 10 ++++++++-- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c912f52..8af21869 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,6 +93,8 @@ option(USE_OPENPMD "Enable OpenPMD for use during IO. VPIC does not help you ins # option to set minimum number of particles set(SET_MIN_NUM_PARTICLES AUTO CACHE STRING "Select minimum number of particles to use, if using dynamic particle array resizing") +# TODO: better name for this? +set(PMD_MAX_IO_CHUNK AUTO CACHE STRING "Select the maxiumum IO write size to use when writing -- applies to particles only, and is specified as number of particles. 
currently only honored by OpenPMD backend") #------------------------------------------------------------------------------# # Create include and link aggregates diff --git a/src/vpic/dump_strategy.cc b/src/vpic/dump_strategy.cc index adea2714..e03ca36c 100644 --- a/src/vpic/dump_strategy.cc +++ b/src/vpic/dump_strategy.cc @@ -57,6 +57,9 @@ void BinaryDump::dump_particles( FileIO fileIO; int dim[1], buf_start; static particle_t * ALIGNED(128) p_buf = NULL; + + // TODO: reconcile this with MAX_IO_CHUNK, and update Cmake option + // description to explain what backends use it # define PBUF_SIZE 32768 // 1MB of particles if( !sp ) ERROR(( "Invalid species name \"%s\".", sp->name )); diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index a1b96199..4e823e34 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -680,6 +680,7 @@ class HDF5Dump : public Dump_Strategy { ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 2); H5Dclose(dset_id); + // TODO: should we add the ability to chunk the particle write? 
// TODO: make this a compile time option #define OUTPUT_CONVERT_GLOBAL_ID 1 #ifdef OUTPUT_CONVERT_GLOBAL_ID @@ -1476,9 +1477,14 @@ class OpenPMDDump : public Dump_Strategy { ux.resetDataset(dataset); uy.resetDataset(dataset); uz.resetDataset(dataset); - // convert data to SoA, allowing the user to chunk the operation - const int max_chunk = 32768*256*8; // 256MB SoA + + // TODO: Add code the convert to global offsets +#ifndef PMD_MAX_IO_CHUNK // in particles +#define PMD_MAX_IO_CHUNK 16777216; // 512MB total write +#endif + const int max_chunk = PMD_MAX_IO_CHUNK; + // Loop over all particles in chunks for (int i = 0; i < np; i += max_chunk) { From fd1559a410f2c3e4682fa879984fd8b73e31d3b3 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Sun, 2 Feb 2020 11:43:26 -0700 Subject: [PATCH 88/95] add missing define for max io chunk, and hoist global id output option to cmake level --- CMakeLists.txt | 11 +++++++++++ src/vpic/dump_strategy.h | 3 +-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8af21869..ec4907a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -90,12 +90,15 @@ option(USE_HDF5 "Enable HDF5 for use during IO. VPIC does not help you install H option(USE_OPENPMD "Enable OpenPMD for use during IO. VPIC does not help you install OpenPM" OFF) +option(OUTPUT_CONVERT_GLOBAL_ID "Convert particle cell id to be global, such that it tells you a unique global location instead of a local offset" ON) + # option to set minimum number of particles set(SET_MIN_NUM_PARTICLES AUTO CACHE STRING "Select minimum number of particles to use, if using dynamic particle array resizing") # TODO: better name for this? set(PMD_MAX_IO_CHUNK AUTO CACHE STRING "Select the maxiumum IO write size to use when writing -- applies to particles only, and is specified as number of particles. 
currently only honored by OpenPMD backend") + #------------------------------------------------------------------------------# # Create include and link aggregates # @@ -129,6 +132,10 @@ if(NOT SET_MIN_NUM_PARTICLES STREQUAL "AUTO") add_definitions(-DMIN_NP=${SET_MIN_NUM_PARTICLES}) endif() +if(NOT PMD_MAX_IO_CHUNK STREQUAL "AUTO") + add_definitions(-DPMD_MAX_IO_CHUNK=${PMD_MAX_IO_CHUNK}) +endif() + find_package(Threads REQUIRED) #------------------------------------------------------------------------------# @@ -379,6 +386,10 @@ if(USE_OPENPMD) endif() endif(USE_OPENPMD) +if(OUTPUT_CONVERT_GLOBAL_ID) + add_definitions(-DOUTPUT_CONVERT_GLOBAL_ID) +endif(OUTPUT_CONVERT_GLOBAL_ID) + macro(build_a_vpic name deck) if(NOT EXISTS ${deck}) message(FATAL_ERROR "Could not find deck '${deck}'") diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 4e823e34..0451ce55 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -681,8 +681,7 @@ class HDF5Dump : public Dump_Strategy { H5Dclose(dset_id); // TODO: should we add the ability to chunk the particle write? 
- // TODO: make this a compile time option -#define OUTPUT_CONVERT_GLOBAL_ID 1 + #ifdef OUTPUT_CONVERT_GLOBAL_ID // TODO: make a function out of this too, its used in openpmd From 2cc7b1d8a587eb53dd8f8cd243bed7411cb766f1 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Sun, 2 Feb 2020 11:51:14 -0700 Subject: [PATCH 89/95] first effort to expose user configrable file extension for openmpd --- sample/harrisOpenPMD | 4 ++++ src/vpic/dump_strategy.h | 10 ++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/sample/harrisOpenPMD b/sample/harrisOpenPMD index fdddccab..197dd38a 100644 --- a/sample/harrisOpenPMD +++ b/sample/harrisOpenPMD @@ -45,6 +45,10 @@ begin_initialization { enable_openpmd_dump(); + // TODO: this should be done through a setter once we have a common options + // interface + dump_strategy->file_type = ".bp"; + // At this point, there is an empty grid and the random number generator is // seeded with the rank. The grid, materials, species need to be defined. 
// Then the initial non-zero fields need to be loaded at time level 0 and the diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 0451ce55..560f5813 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1126,6 +1126,10 @@ class OpenPMDDump : public Dump_Strategy { public: //openPMD::Series* series; using Dump_Strategy::Dump_Strategy; // inherit constructor + + //std::string file_type = ".h5"; + std::string file_type = ".bp"; + void dump_fields( const char *fbase, int step, @@ -1414,9 +1418,6 @@ class OpenPMDDump : public Dump_Strategy { int ftag ) { - // TODO: move this to class level, and make it user settable, so it - // can be used more widely - std::string file_type = ".h5"; std::string full_file_name = fbase + file_type; std::cout << "writing particles to " << full_file_name << std::endl; @@ -1564,9 +1565,6 @@ class OpenPMDDump : public Dump_Strategy { int ftag ) { - // TODO: move this to class level, and make it user settable, so it - // can be used more widely - std::string file_type = ".h5"; std::string full_file_name = fbase + file_type; std::cout << "OpenPMD dumping hydro to " << full_file_name << std::endl; From a648e47da34ced2e72bfe617a90beb4dc4c07493 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 4 Feb 2020 15:55:27 -0700 Subject: [PATCH 90/95] renable hdf5 debug timing prints --- sample/harrisOpenPMD | 2 +- src/vpic/dump_strategy.h | 44 ++++++++++++++++++++++++---------------- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/sample/harrisOpenPMD b/sample/harrisOpenPMD index 197dd38a..697e0250 100644 --- a/sample/harrisOpenPMD +++ b/sample/harrisOpenPMD @@ -47,7 +47,7 @@ begin_initialization { // TODO: this should be done through a setter once we have a common options // interface - dump_strategy->file_type = ".bp"; + //dump_strategy->file_type = ".bp"; // At this point, there is an empty grid and the random number generator is // seeded with the rank. 
The grid, materials, species need to be defined. diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 560f5813..3ca057a4 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -30,6 +30,16 @@ #include #endif +// TODO: delete this +#define _LOG_PREFIX \ + __FILE__ "(" EXPAND_AND_STRINGIFY(__LINE__) ")[" << rank << "]: " +#define io_log(x) do { \ + if( rank==0 ) { \ + std::cerr << _LOG_PREFIX << x << std::endl; \ + std::cerr.flush(); \ + } \ + } while(0) + // Runtime inheritance is obviously not very "VPIC like", as we will [probably] // incur a penalty for the vtable lookup, but given we're about to do IO this // is very negligible. @@ -321,7 +331,7 @@ class HDF5Dump : public Dump_Strategy { hid_t group_id = H5Gcreate(file_id, fname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); el1 = uptime() - el1; - //sim_log("TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts + io_log("TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts double el2 = uptime(); /* @@ -468,7 +478,7 @@ class HDF5Dump : public Dump_Strategy { DUMP_FIELD_TO_HDF5("cmat", cmat, H5T_NATIVE_SHORT); el2 = uptime() - el2; - //sim_log("TimeHDF5Write: " << el2 << " s"); + io_log("TimeHDF5Write: " << el2 << " s"); double el3 = uptime(); @@ -497,7 +507,7 @@ class HDF5Dump : public Dump_Strategy { H5Fclose(file_id); el3 = uptime() - el3; - //sim_log("TimeHDF5Close: " << el3 << " s"); + io_log("TimeHDF5Close: " << el3 << " s"); if (mpi_rank == 0) { @@ -621,7 +631,7 @@ class HDF5Dump : public Dump_Strategy { MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); //std::cout << "on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local << std::endl; - //sim_log("on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local); + io_log("on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local); Pf = (float 
*)sp->p; Pi = (int *)sp->p; @@ -663,9 +673,9 @@ class HDF5Dump : public Dump_Strategy { H5Sselect_hyperslab(memspace, H5S_SELECT_SET, &memspace_start, &memspace_stride, &memspace_count, NULL); el1 = uptime() - el1; - //sim_log("Particle TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts + io_log("Particle TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts - //double el2 = uptime(); + double el2 = uptime(); // This point offset is silly, and loses the type safety (pf+1) hid_t dset_id = H5Dcreate(group_id, "dX", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); @@ -743,8 +753,8 @@ class HDF5Dump : public Dump_Strategy { ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 7); H5Dclose(dset_id); - //el2 = uptime() - el2; - //sim_log("Particle TimeHDF5Write: " << el2 << " s"); + el2 = uptime() - el2; + io_log("Particle TimeHDF5Write: " << el2 << " s"); double el3 = uptime(); H5Sclose(memspace); @@ -753,7 +763,7 @@ class HDF5Dump : public Dump_Strategy { H5Gclose(group_id); H5Fclose(file_id); el3 = uptime() - el3; - //sim_log("Particle TimeHDF5Close: " << el3 << " s"); + io_log("Particle TimeHDF5Close: " << el3 << " s"); sp->p = sp_p; sp->np = sp_np; @@ -785,7 +795,7 @@ class HDF5Dump : public Dump_Strategy { H5Pset_dxpl_mpio(meta_plist_id, H5FD_MPIO_COLLECTIVE); H5Sselect_hyperslab(meta_filespace, H5S_SELECT_SET, (hsize_t *)&meta_offset, NULL, (hsize_t *)&meta_numparticles, NULL); meta_el1 = uptime() - meta_el1; - //sim_log("Metafile TimeHDF5Open): " << meta_el1 << " s"); //Easy to handle results for scripts + io_log("Metafile TimeHDF5Open): " << meta_el1 << " s"); //Easy to handle results for scripts double meta_el2 = uptime(); @@ -834,7 +844,7 @@ class HDF5Dump : public Dump_Strategy { H5Dclose(meta_dset_id); meta_el2 = uptime() - meta_el2; - //sim_log("Metafile TimeHDF5Write: " << meta_el2 << " s"); + io_log("Metafile TimeHDF5Write: " << meta_el2 << " s"); double meta_el3 = 
uptime(); H5Sclose(meta_memspace); H5Sclose(meta_filespace); @@ -842,7 +852,7 @@ class HDF5Dump : public Dump_Strategy { H5Gclose(meta_group_id); H5Fclose(meta_file_id); meta_el3 = uptime() - meta_el3; - //sim_log("Metafile TimeHDF5Close: " << meta_el3 << " s"); + io_log("Metafile TimeHDF5Close: " << meta_el3 << " s"); } @@ -913,8 +923,8 @@ class HDF5Dump : public Dump_Strategy { hid_t group_id = H5Gcreate(file_id, hname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); el1 = uptime() - el1; - //sim_log("TimeHDF5Open: " << el1 << " s"); //Easy to handle results for scripts - //double el2 = uptime(); + io_log("TimeHDF5Open: " << el1 << " s"); //Easy to handle results for scripts + double el2 = uptime(); // Create a variable list of field values to output. //size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); @@ -1023,8 +1033,8 @@ class HDF5Dump : public Dump_Strategy { if (hydro_dump_flag.txy) DUMP_HYDRO_TO_HDF5("txy", txy, H5T_NATIVE_FLOAT); - //el2 = uptime() - el2; - //sim_log("TimeHDF5Write: " << el2 << " s"); + el2 = uptime() - el2; + io_log("TimeHDF5Write: " << el2 << " s"); double el3 = uptime(); @@ -1053,7 +1063,7 @@ class HDF5Dump : public Dump_Strategy { H5Fclose(file_id); el3 = uptime() - el3; - //sim_log("TimeHDF5Close: " << el3 << " s"); + io_log("TimeHDF5Close: " << el3 << " s"); if (mpi_rank == 0) { From 46ae346d13e211afb3d655f579bb5501e384fbfe Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 27 Oct 2020 12:18:41 -0600 Subject: [PATCH 91/95] default vpic_simulation ptrs to nullptr --- src/vpic/vpic.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index bbc75235..fd025ee8 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -210,21 +210,21 @@ class vpic_simulation { random numbers. Keeping the synchronous generators in sync is the generator users responsibility. 
*/ - rng_pool_t * entropy; // Local entropy pool - rng_pool_t * sync_entropy; // Synchronous entropy pool - grid_t * grid; // define_*_grid et al - material_t * material_list; // define_material - field_array_t * field_array; // define_field_array - interpolator_array_t * interpolator_array; // define_interpolator_array - accumulator_array_t * accumulator_array; // define_accumulator_array - hydro_array_t * hydro_array; // define_hydro_array - species_t * species_list; // define_species / - // species helpers - particle_bc_t * particle_bc_list; // define_particle_bc / - // boundary helpers - emitter_t * emitter_list; // define_emitter / - // emitter helpers - collision_op_t * collision_op_list; // collision helpers + rng_pool_t * entropy = nullptr; // Local entropy pool + rng_pool_t * sync_entropy = nullptr; // Synchronous entropy pool + grid_t * grid = nullptr; // define_*_grid et al + material_t * material_list = nullptr; // define_material + field_array_t * field_array = nullptr; // define_field_array + interpolator_array_t * interpolator_array = nullptr; // define_interpolator_array + accumulator_array_t * accumulator_array = nullptr; // define_accumulator_array + hydro_array_t * hydro_array = nullptr; // define_hydro_array + species_t * species_list = nullptr; // define_species / + // species helpers + particle_bc_t * particle_bc_list = nullptr; // define_particle_bc / + // boundary helpers + emitter_t * emitter_list = nullptr; // define_emitter / + // emitter helpers + collision_op_t * collision_op_list = nullptr; // collision helpers // User defined checkpt preserved variables // Note: user_global is aliased with user_global_t (see deck_wrapper.cxx) From ade0a0f0ce5f242ecd29c76b31acff76b3404c0d Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 10 Nov 2020 14:35:10 -0700 Subject: [PATCH 92/95] disable restore test until we add logic to perform restore of dump strat --- src/vpic/dump_strategy.h | 4 +++- src/vpic/vpic.h | 1 + 
test/integrated/to_completion/CMakeLists.txt | 12 ++++++------ 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 3ca057a4..0eb5425f 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1150,10 +1150,12 @@ class OpenPMDDump : public Dump_Strategy { { std::cout << "Writing openPMD data" << std::endl; + std::string full_file_name = fbase + file_type; + //if (series == nullptr) { std::cout << "init series" << std::endl; openPMD::Series series = openPMD::Series( - fbase, + full_file_name, openPMD::AccessType::CREATE, MPI_COMM_WORLD ); diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index fd025ee8..2a4b1767 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -145,6 +145,7 @@ class vpic_simulation { // Very likely a user will forgot to delete this if they change the strategy, // a smart ptr will save us from the small leak + // TODO: this does not survive the dump right now std::unique_ptr dump_strategy; int num_step = 1; // Number of steps to take diff --git a/test/integrated/to_completion/CMakeLists.txt b/test/integrated/to_completion/CMakeLists.txt index 5baecf43..690e738c 100644 --- a/test/integrated/to_completion/CMakeLists.txt +++ b/test/integrated/to_completion/CMakeLists.txt @@ -58,11 +58,11 @@ add_test(${generate_restore} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} # Run using the restore file # TODO: caps? 
-set(perform_restore "perform_${RESTART_BINARY}") -build_a_vpic(${perform_restore} ${CMAKE_CURRENT_SOURCE_DIR}/${RESTART_DECK}.deck) -add_test(${perform_restore} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} - ${MPIEXEC_NUMPROC} ${MPIEXEC_PREFLAGS} ${perform_restore} - ${MPIEXEC_POSTFLAGS} ${RESTART_ARGS}) +#set(perform_restore "perform_${RESTART_BINARY}") +#build_a_vpic(${perform_restore} ${CMAKE_CURRENT_SOURCE_DIR}/${RESTART_DECK}.deck) +#add_test(${perform_restore} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} +#${MPIEXEC_NUMPROC} ${MPIEXEC_PREFLAGS} ${perform_restore} +#${MPIEXEC_POSTFLAGS} ${RESTART_ARGS}) # TODO: re-enable modify test #list(APPEND MODIFY_BINARY restore-modify) @@ -76,4 +76,4 @@ add_test(${perform_restore} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} set(RESTORE_LABEL "restore_group") set_tests_properties(${perform_restore} PROPERTIES DEPENDS ${generate_restore}) #set_property(TEST ${generate_restore} PROPERTY FIXTURES_SETUP ${RESTORE_LABEL}) -#set_property(TEST ${perform_restore} PROPERTY FIXTURES_REQUIRED ${RESTORE_LABEL}) \ No newline at end of file +#set_property(TEST ${perform_restore} PROPERTY FIXTURES_REQUIRED ${RESTORE_LABEL}) From 34bdedd1fb234cf42764d2a86f6aef25477dd47f Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Mon, 23 Nov 2020 10:33:05 -0700 Subject: [PATCH 93/95] apply Bin's changed --- src/vpic/dump_strategy.h | 2276 ++++++++++++++++++++++---------------- 1 file changed, 1302 insertions(+), 974 deletions(-) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 0eb5425f..c6fb7613 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -5,9 +5,27 @@ #include #include +//#define DUMP_INFO_DEBUG 1 +//#define H5_ASYNC 1 +#ifdef H5_ASYNC +#include "h5_vol_external_async_native.h" +#endif +//#define CHUNK_FLAG 1 + + +//#define METADATA_COLL_WRITE 1 +//#define TRUE 1 + + +#define HAS_FIELD_COMP 1 +#define HAS_PARTICLE_COMP 1 +#define HAS_HYDRO_COMP 1 + +//#define HAS_INDEPENDENT_IO 1 + #include // TODO: it would be good if this 
didn't have to know about MPI +#include -#define DUMP_INFO_DEBUG 1 // TODO: should I drop the ./src here? #include "../util/io/FileIO.h" @@ -30,54 +48,63 @@ #include #endif + +//#define N_FILE_N_PROCESS 1 +//#define TEST_MPIIO 1 + // TODO: delete this #define _LOG_PREFIX \ - __FILE__ "(" EXPAND_AND_STRINGIFY(__LINE__) ")[" << rank << "]: " + __FILE__ "(" EXPAND_AND_STRINGIFY(__LINE__) ")[" << rank << "]: " + +/* #define io_log(x) do { \ - if( rank==0 ) { \ - std::cerr << _LOG_PREFIX << x << std::endl; \ - std::cerr.flush(); \ - } \ - } while(0) +if( rank==0 ) { \ +std::cerr << _LOG_PREFIX << x << std::endl; \ +std::cerr.flush(); \ +} \ +} while(0) +*/ + + // Runtime inheritance is obviously not very "VPIC like", as we will [probably] // incur a penalty for the vtable lookup, but given we're about to do IO this // is very negligible. class Dump_Strategy { public: - int rank, nproc, num_step; + int rank, nproc, num_step; - Dump_Strategy(int _rank, int _nproc ) : - rank(_rank), - nproc(_nproc) + Dump_Strategy(int _rank, int _nproc ) : + rank(_rank), + nproc(_nproc) { } // empty - virtual ~Dump_Strategy() { }; + virtual ~Dump_Strategy() { }; - virtual void dump_fields( - const char *fbase, - int step, - grid_t* grid, - field_array_t* field_array, - int ftag - ) = 0; - virtual void dump_hydro( - const char *fbase, - int step, - hydro_array_t* hydro_array, - species_t* sp, - interpolator_array_t* interpolator_array, - grid_t* grid, - int ftag - ) = 0; - virtual void dump_particles( - const char *fbase, - species_t* sp, - grid_t* grid, - int step, - interpolator_array_t* interpolator_array, - int ftag - ) = 0; + virtual void dump_fields( + const char *fbase, + int step, + grid_t* grid, + field_array_t* field_array, + int ftag + ) = 0; + virtual void dump_hydro( + const char *fbase, + int step, + hydro_array_t* hydro_array, + species_t* sp, + interpolator_array_t* interpolator_array, + grid_t* grid, + int ftag + ) = 0; + virtual void dump_particles( + const char *fbase, + 
species_t* sp, + grid_t* grid, + int step, + interpolator_array_t* interpolator_array, + int ftag + ) = 0; }; class BinaryDump : public Dump_Strategy { @@ -92,7 +119,7 @@ class BinaryDump : public Dump_Strategy { grid_t* grid, field_array_t* field_array, int ftag - ); + ); void dump_hydro( const char *fbase, int step, @@ -101,7 +128,7 @@ class BinaryDump : public Dump_Strategy { interpolator_array_t* interpolator_array, grid_t* grid, int ftag - ); + ); void dump_particles( const char *fbase, species_t* sp, @@ -109,1025 +136,1326 @@ class BinaryDump : public Dump_Strategy { int step, interpolator_array_t* interpolator_array, int ftag - ); + ); }; #ifdef VPIC_ENABLE_HDF5 struct field_dump_flag_t { - bool ex = true, ey = true, ez = true, div_e_err = true; - bool cbx = true, cby = true, cbz = true, div_b_err = true; - bool tcax = true, tcay = true, tcaz = true, rhob = true; - bool jfx = true, jfy = true, jfz = true, rhof = true; - bool ematx = true, ematy = true, ematz = true, nmat = true; - bool fmatx = true, fmaty = true, fmatz = true, cmat = true; - void disableE() - { - ex = false, ey = false, ez = false, div_e_err = false; - } - - void disableCB() - { - cbx = false, cby = false, cbz = false, div_b_err = false; - } - - void disableTCA() - { - tcax = false, tcay = false, tcaz = false, rhob = false; - } - - void disableJF() - { - jfx = false, jfy = false, jfz = false, rhof = false; - } - - void disableEMAT() - { - ematx = false, ematy = false, ematz = false, nmat = false; - } - - void disableFMAT() - { - fmatx = false, fmaty = false, fmatz = false, cmat = false; - } - - void resetToDefaults() - { - ex = true, ey = true, ez = true, div_e_err = true; - cbx = true, cby = true, cbz = true, div_b_err = true; - tcax = true, tcay = true, tcaz = true, rhob = true; - jfx = true, jfy = true, jfz = true, rhof = true; - ematx = true, ematy = true, ematz = true, nmat = true; - fmatx = true, fmaty = true, fmatz = true, cmat = true; - } - - bool enabledE() - { - return ex && ey && 
ez; - } - - bool enabledCB() - { - return cbx && cby && cbz; - } - - bool enabledTCA() - { - return tcax && tcay && tcaz; - } - - bool enabledJF() - { - return jfx && jfy && jfz; - } - - bool enabledEMAT() - { - return ematx && ematy && ematz; - } - - bool enabledFMAT() - { - return fmatx && fmaty && fmatz; - } + bool ex = true, ey = true, ez = true, div_e_err = true; + bool cbx = true, cby = true, cbz = true, div_b_err = true; + bool tcax = true, tcay = true, tcaz = true, rhob = true; + bool jfx = true, jfy = true, jfz = true, rhof = true; + bool ematx = true, ematy = true, ematz = true, nmat = true; + bool fmatx = true, fmaty = true, fmatz = true, cmat = true; + void disableE() + { + ex = false, ey = false, ez = false, div_e_err = false; + } + + void disableCB() + { + cbx = false, cby = false, cbz = false, div_b_err = false; + } + + void disableTCA() + { + tcax = false, tcay = false, tcaz = false, rhob = false; + } + + void disableJF() + { + jfx = false, jfy = false, jfz = false, rhof = false; + } + + void disableEMAT() + { + ematx = false, ematy = false, ematz = false, nmat = false; + } + + void disableFMAT() + { + fmatx = false, fmaty = false, fmatz = false, cmat = false; + } + + void resetToDefaults() + { + ex = true, ey = true, ez = true, div_e_err = true; + cbx = true, cby = true, cbz = true, div_b_err = true; + tcax = true, tcay = true, tcaz = true, rhob = true; + jfx = true, jfy = true, jfz = true, rhof = true; + ematx = true, ematy = true, ematz = true, nmat = true; + fmatx = true, fmaty = true, fmatz = true, cmat = true; + } + + bool enabledE() + { + return ex && ey && ez; + } + + bool enabledCB() + { + return cbx && cby && cbz; + } + + bool enabledTCA() + { + return tcax && tcay && tcaz; + } + + bool enabledJF() + { + return jfx && jfy && jfz; + } + + bool enabledEMAT() + { + return ematx && ematy && ematz; + } + + bool enabledFMAT() + { + return fmatx && fmaty && fmatz; + } }; struct hydro_dump_flag_t { - bool jx = true, jy = true, jz = true, rho = 
true; - bool px = true, py = true, pz = true, ke = true; - bool txx = true, tyy = true, tzz = true; - bool tyz = true, tzx = true, txy = true; - - void disableJ() - { - jx = false, jy = false, jz = false, rho = false; - } - - void disableP() - { - px = false, py = false, pz = false, ke = false; - } - - void disableTD() //Stress diagonal - { - txx = false, tyy = false, tzz = false; - } - - void disableTOD() //Stress off-diagonal - { - tyz = false, tzx = false, txy = false; - } - void resetToDefaults() - { - jx = true, jy = true, jz = true, rho = true; - px = true, py = true, pz = true, ke = true; - txx = true, tyy = true, tzz = true; - tyz = true, tzx = true, txy = true; - } - - bool enabledJ() - { - return jx && jy && jz; - } - - bool enabledP() - { - return px && py && pz; - } - - bool enabledTD() - { - return txx && tyy && tzz; - } - - bool enabledTOD() - { - return tyz && tzx && txy; - } + bool jx = true, jy = true, jz = true, rho = true; + bool px = true, py = true, pz = true, ke = true; + bool txx = true, tyy = true, tzz = true; + bool tyz = true, tzx = true, txy = true; + + void disableJ() + { + jx = false, jy = false, jz = false, rho = false; + } + + void disableP() + { + px = false, py = false, pz = false, ke = false; + } + + void disableTD() //Stress diagonal + { + txx = false, tyy = false, tzz = false; + } + + void disableTOD() //Stress off-diagonal + { + tyz = false, tzx = false, txy = false; + } + void resetToDefaults() + { + jx = true, jy = true, jz = true, rho = true; + px = true, py = true, pz = true, ke = true; + txx = true, tyy = true, tzz = true; + tyz = true, tzx = true, txy = true; + } + + bool enabledJ() + { + return jx && jy && jz; + } + + bool enabledP() + { + return px && py && pz; + } + + bool enabledTD() + { + return txx && tyy && tzz; + } + + bool enabledTOD() + { + return tyz && tzx && txy; + } }; class HDF5Dump : public Dump_Strategy { std::unordered_map tframe_map; public: - using Dump_Strategy::Dump_Strategy; // inherit constructor + 
using Dump_Strategy::Dump_Strategy; // inherit constructor - // TODO: replace these with a common dump interface - // Declare vars to use - hydro_dump_flag_t hydro_dump_flag; - field_dump_flag_t field_dump_flag; + // TODO: replace these with a common dump interface + // Declare vars to use + hydro_dump_flag_t hydro_dump_flag; + field_dump_flag_t field_dump_flag; #define DUMP_DIR_FORMAT "./%s" -// TODO: naming a macro so close to existing functions AND data is not a good -// define to do C-style indexing + // TODO: naming a macro so close to existing functions AND data is not a good + // define to do C-style indexing #define _hydro(x, y, z) hydro_array->h[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] - // TODO: make function? - void dump_fields( + + /** + * @brief Dump field data to the HDf5 file + * Author: Bin Dong dbin@lbl.gov + * https://crd.lbl.gov/bin-dong + * Nov 2020 + * @param fbase + * @param step + * @param grid + * @param field_array + * @param ftag + */ + void dump_fields( const char *fbase, int step, grid_t* grid, field_array_t* field_array, int ftag - ) - { - size_t step_for_viou = step; + ) + { + size_t step_for_viou = step; - int mpi_size, mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); #ifdef DUMP_INFO_DEBUG - printf("MPI rank = %d, size = %d \n", mpi_rank, mpi_size); - //printf("base dir for field: %s \n", fdParams.baseDir); - //printf("stride x y z = (%ld, %ld, %ld)\n", fdParams.stride_x, fdParams.stride_y, fdParams.stride_z); - printf("grid x, y z = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); - printf("domain loc (x0, y0, z0) -> (x1, y1, z1) = (%f, %f, %f) -> (%f, %f, %f) \n", grid->x0, grid->y0, grid->z0, grid->x1, grid->y1, grid->z1); - //printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, global->topology_y, global->topology_z); - printf("grid -> sx, sy, sz 
= (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid->sz, grid->nv); + printf("MPI rank = %d, size = %d \n", mpi_rank, mpi_size); + //printf("base dir for field: %s \n", fdParams.baseDir); + //printf("stride x y z = (%ld, %ld, %ld)\n", fdParams.stride_x, fdParams.stride_y, fdParams.stride_z); + printf("grid x, y z = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); + printf("domain loc (x0, y0, z0) -> (x1, y1, z1) = (%f, %f, %f) -> (%f, %f, %f) \n", grid->x0, grid->y0, grid->z0, grid->x1, grid->y1, grid->z1); + //printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, global->topology_y, global->topology_z); + printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid->sz, grid->nv); #endif -#define DUMP_FIELD_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ - { \ - dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ - temp_buf_index = 0; \ - for (size_t i(1); i < grid->nx + 1; i++) \ - { \ - for (size_t j(1); j < grid->ny + 1; j++) \ - { \ - for (size_t k(1); k < grid->nz + 1; k++) \ - { \ - temp_buf[temp_buf_index] = FIELD_ARRAY_NAME->fpp(i, j, k).ATTRIBUTE_NAME; \ - temp_buf_index = temp_buf_index + 1; \ - } \ - } \ - } \ - dataspace_id = H5Dget_space(dset_id); \ - H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ - H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ - H5Sclose(dataspace_id); \ - H5Dclose(dset_id); \ - } - - char fname[256]; - char field_scratch[128]; - char subfield_scratch[128]; - sprintf(field_scratch, DUMP_DIR_FORMAT, "field_hdf5"); - FileUtils::makeDirectory(field_scratch); - sprintf(subfield_scratch, "%s/T.%zu/", field_scratch, step_for_viou); - FileUtils::makeDirectory(subfield_scratch); - sprintf(fname, "%s/%s_%zu.h5", subfield_scratch, "fields", step_for_viou); - double el1 = uptime(); - hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); - H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, 
MPI_INFO_NULL); - hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); - H5Pclose(plist_id); + char fname[256]; + char field_scratch[128]; + char subfield_scratch[128]; + + sprintf(field_scratch, DUMP_DIR_FORMAT, "field_hdf5"); + FileUtils::makeDirectory(field_scratch); + sprintf(subfield_scratch, "%s/T.%zu/", field_scratch, step_for_viou); + FileUtils::makeDirectory(subfield_scratch); + + sprintf(fname, "%s/%s_%zu.h5", subfield_scratch, "fields", step_for_viou); + double el1 = uptime(); + + // int file_exist(const char *filename) + //{ + // struct stat buffer; + // return (stat(filename, &buffer) == 0); + //} + + //https://support.hdfgroup.org/ftp/HDF5/current/src/unpacked/examples/h5_compound.c +#ifdef HAS_FIELD_COMP + if(!mpi_rank) + printf("Using Field Compund type !\n"); + hid_t field_comp_type_it = H5Tcreate (H5T_COMPOUND, sizeof(field_t)); + H5Tinsert(field_comp_type_it, "ex", HOFFSET(field_t, ex), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "ey", HOFFSET(field_t, ey), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "ez", HOFFSET(field_t, ez), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "div_e_err", HOFFSET(field_t, div_e_err), H5T_NATIVE_FLOAT); + + H5Tinsert(field_comp_type_it, "cbx", HOFFSET(field_t, cbx), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "cby", HOFFSET(field_t, cby), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "cbz", HOFFSET(field_t, cbz), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "div_b_err", HOFFSET(field_t, div_b_err), H5T_NATIVE_FLOAT); + + H5Tinsert(field_comp_type_it, "tcax", HOFFSET(field_t, tcax), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "tcay", HOFFSET(field_t, tcay), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "tcaz", HOFFSET(field_t, tcaz), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "rhob", HOFFSET(field_t, rhob), H5T_NATIVE_FLOAT); + + H5Tinsert(field_comp_type_it, "jfx", HOFFSET(field_t, jfx), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, 
"jfy", HOFFSET(field_t, jfy), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "jfz", HOFFSET(field_t, jfz), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "rhof", HOFFSET(field_t, rhof), H5T_NATIVE_FLOAT); + + H5Tinsert(field_comp_type_it, "ematx", HOFFSET(field_t, ematx), H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "ematy", HOFFSET(field_t, ematy), H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "ematz", HOFFSET(field_t, ematz), H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "nmat", HOFFSET(field_t, nmat), H5T_NATIVE_SHORT); + + H5Tinsert(field_comp_type_it, "fmatx", HOFFSET(field_t, fmatx), H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "fmaty", HOFFSET(field_t, fmaty), H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "fmatz", HOFFSET(field_t, fmatz), H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "cmat", HOFFSET(field_t, cmat), H5T_NATIVE_SHORT); +#endif - sprintf(fname, "Timestep_%zu", step_for_viou); - hid_t group_id = H5Gcreate(file_id, fname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - el1 = uptime() - el1; - io_log("TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts - double el2 = uptime(); + //struct stat buffer; + //if((stat(fname, &buffer) == 0)){ + // file_exist_flag = 1; + // if(!mpi_rank) + // printf("Write original files /w HDF5! \n"); + // } + // file_exist_flag = 0; + + hid_t plist_id; + hid_t file_id; + plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + //H5Pset_alignment(plist_id, 4194304, 4194304); + /*if(H5Pset_libver_bounds(plist_id, H5F_LIBVER_LATEST, H5F_LIBVER_LATEST) < 0){ + exit(-1); + }*/ + +#ifdef METADATA_COLL_WRITE + if(!mpi_rank) printf("Enable collective metadata write !\n"); + H5Pset_coll_metadata_write(plist_id, TRUE); +#endif + file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); + H5Pclose(plist_id); - /* - // Create a variable list of field values to output. 
- size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); - size_t * varlist = new size_t[numvars]; - for(size_t i(0), c(0); ifdParams.output_vars.bitset(i)) varlist[c++] = i; - printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ + sprintf(fname, "Timestep_%zu", step_for_viou); + hid_t group_id; + group_id = H5Gcreate(file_id, fname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); -#define fpp(x, y, z) f[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] - /* - typedef struct field { - float ex, ey, ez, div_e_err; // Electric field and div E error - float cbx, cby, cbz, div_b_err; // Magnetic field and div B error - float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density - float jfx, jfy, jfz, rhof; // Free current and charge density - material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes - material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers - } field_t;*/ - // Local voxel mesh resolution. Voxels are - // indexed FORTRAN style 0:nx+1,0:ny+1,0:nz+1 - // with voxels 1:nx,1:ny,1:nz being non-ghost - // voxels. 
- - float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); - hsize_t temp_buf_index; - hid_t dset_id; - //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; - plist_id = H5Pcreate(H5P_DATASET_XFER); - //Comment out for test only - H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); - //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); - - //global->topology_x - - hsize_t field_global_size[3], field_local_size[3], global_offset[3], global_count[3]; - field_global_size[0] = (grid->nx * grid->gpx); - field_global_size[1] = (grid->ny * grid->gpy); - field_global_size[2] = (grid->nz * grid->gpz); - - field_local_size[0] = grid->nx; - field_local_size[1] = grid->ny; - field_local_size[2] = grid->nz; - - int gpx = grid->gpx; - int gpy = grid->gpy; - int gpz = grid->gpz; - - // Convert rank to local decomposition - int rx, ry, rz; - UNVOXEL(mpi_rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + el1 = uptime() - el1; + //io_log("TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts + double el2 = uptime(); - int mpi_rank_x, mpi_rank_y, mpi_rank_z; - mpi_rank_x = rx; - mpi_rank_y = ry; - mpi_rank_z = rz; + /* + // Create a variable list of field values to output. 
+ size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); + size_t * varlist = new size_t[numvars]; - global_offset[0] = (grid->nx) * mpi_rank_x; - global_offset[1] = (grid->ny) * mpi_rank_y; - global_offset[2] = (grid->nz) * mpi_rank_z; + for(size_t i(0), c(0); ifdParams.output_vars.bitset(i)) varlist[c++] = i; - global_count[0] = (grid->nx); - global_count[1] = (grid->ny); - global_count[2] = (grid->nz); + printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ -#ifdef DUMP_INFO_DEBUG - printf("global size = %llu %llu %llu \n", field_global_size[0], field_global_size[1], field_global_size[2]); - printf("global_offset = %llu %llu %llu \n", global_offset[0], global_offset[1], global_offset[2]); - printf("global_count = %llu %llu %llu \n", global_count[0], global_count[1], global_count[2]); - printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); - fflush(stdout); +#define fpp(x, y, z) f[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] + /* + typedef struct field { + float ex, ey, ez, div_e_err; // Electric field and div E error + float cbx, cby, cbz, div_b_err; // Magnetic field and div B error + float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density + float jfx, jfy, jfz, rhof; // Free current and charge density + material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes + material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers + } field_t;*/ + // Local voxel mesh resolution. Voxels are + // indexed FORTRAN style 0:nx+1,0:ny+1,0:nz+1 + // with voxels 1:nx,1:ny,1:nz being non-ghost + // voxels. 
+ + float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); + hsize_t temp_buf_index; + hid_t dset_id; + //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; + //Comment out for test only + + plist_id = H5Pcreate(H5P_DATASET_XFER); +#ifdef HAS_INDEPENDENT_IO + if(!mpi_rank) printf("\n ###\n VPIC Independent I/O! \n ###\n"); + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_INDEPENDENT); +#else + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); #endif - hid_t filespace = H5Screate_simple(3, field_global_size, NULL); - hid_t memspace = H5Screate_simple(3, field_local_size, NULL); - hid_t dataspace_id; - - /* - typedef struct field { - float ex, ey, ez, div_e_err; // Electric field and div E error - float cbx, cby, cbz, div_b_err; // Magnetic field and div B error - float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density - float jfx, jfy, jfz, rhof; // Free current and charge density - material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes - material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers - } field_t;*/ - - if (field_dump_flag.ex) - DUMP_FIELD_TO_HDF5("ex", ex, H5T_NATIVE_FLOAT); - if (field_dump_flag.ey) - DUMP_FIELD_TO_HDF5("ey", ey, H5T_NATIVE_FLOAT); - if (field_dump_flag.ez) - DUMP_FIELD_TO_HDF5("ez", ez, H5T_NATIVE_FLOAT); - if (field_dump_flag.div_e_err) - DUMP_FIELD_TO_HDF5("div_e_err", div_e_err, H5T_NATIVE_FLOAT); - - if (field_dump_flag.cbx) - DUMP_FIELD_TO_HDF5("cbx", cbx, H5T_NATIVE_FLOAT); - if (field_dump_flag.cby) - DUMP_FIELD_TO_HDF5("cby", cby, H5T_NATIVE_FLOAT); - if (field_dump_flag.cbz) - DUMP_FIELD_TO_HDF5("cbz", cbz, H5T_NATIVE_FLOAT); - if (field_dump_flag.div_b_err) - DUMP_FIELD_TO_HDF5("div_b_err", div_b_err, H5T_NATIVE_FLOAT); - - if (field_dump_flag.tcax) - DUMP_FIELD_TO_HDF5("tcax", tcax, H5T_NATIVE_FLOAT); - if (field_dump_flag.tcay) - DUMP_FIELD_TO_HDF5("tcay", tcay, 
H5T_NATIVE_FLOAT); - if (field_dump_flag.tcaz) - DUMP_FIELD_TO_HDF5("tcaz", tcaz, H5T_NATIVE_FLOAT); - if (field_dump_flag.rhob) - DUMP_FIELD_TO_HDF5("rhob", rhob, H5T_NATIVE_FLOAT); - - if (field_dump_flag.jfx) - DUMP_FIELD_TO_HDF5("jfx", jfx, H5T_NATIVE_FLOAT); - if (field_dump_flag.jfy) - DUMP_FIELD_TO_HDF5("jfy", jfy, H5T_NATIVE_FLOAT); - if (field_dump_flag.jfz) - DUMP_FIELD_TO_HDF5("jfz", jfz, H5T_NATIVE_FLOAT); - if (field_dump_flag.rhof) - DUMP_FIELD_TO_HDF5("rhof", rhof, H5T_NATIVE_FLOAT); - - //H5T_NATIVE_SHORT for material_id (typedef int16_t material_id) - if (field_dump_flag.ematx) - DUMP_FIELD_TO_HDF5("ematx", ematx, H5T_NATIVE_SHORT); - if (field_dump_flag.ematy) - DUMP_FIELD_TO_HDF5("ematy", ematy, H5T_NATIVE_SHORT); - if (field_dump_flag.ematz) - DUMP_FIELD_TO_HDF5("ematz", ematz, H5T_NATIVE_SHORT); - if (field_dump_flag.nmat) - DUMP_FIELD_TO_HDF5("nmat", nmat, H5T_NATIVE_SHORT); - - if (field_dump_flag.fmatx) - DUMP_FIELD_TO_HDF5("fmatx", fmatx, H5T_NATIVE_SHORT); - if (field_dump_flag.fmaty) - DUMP_FIELD_TO_HDF5("fmaty", fmaty, H5T_NATIVE_SHORT); - if (field_dump_flag.fmatz) - DUMP_FIELD_TO_HDF5("fmatz", fmatz, H5T_NATIVE_SHORT); - if (field_dump_flag.cmat) - DUMP_FIELD_TO_HDF5("cmat", cmat, H5T_NATIVE_SHORT); - - el2 = uptime() - el2; - io_log("TimeHDF5Write: " << el2 << " s"); - - double el3 = uptime(); - - //Write metadata (geo original and geo dx/dy/dz) for ArrayUDF - float attr_data[2][3]; - attr_data[0][0] = grid->x0; - attr_data[0][1] = grid->y0; - attr_data[0][2] = grid->z0; - attr_data[1][0] = grid->dx; - attr_data[1][1] = grid->dy; - attr_data[1][2] = grid->dz; - hsize_t dims[2]; - dims[0] = 2; - dims[1] = 3; - hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); - hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); - H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); - H5Sclose(va_geo_dataspace_id); - H5Aclose(va_geo_attribute_id); - - 
free(temp_buf); - H5Sclose(filespace); - H5Sclose(memspace); - H5Pclose(plist_id); - H5Gclose(group_id); - H5Fclose(file_id); - - el3 = uptime() - el3; - io_log("TimeHDF5Close: " << el3 << " s"); - - if (mpi_rank == 0) - { - char const *output_xml_file = "./field_hdf5/hdf5_field.xdmf"; - char dimensions_3d[128]; - sprintf(dimensions_3d, "%lld %lld %lld", field_global_size[0], field_global_size[1], field_global_size[2]); - char dimensions_4d[128]; - sprintf(dimensions_4d, "%lld %lld %lld %d", field_global_size[0], field_global_size[1], field_global_size[2], 3); - char orignal[128]; - sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); - char dxdydz[128]; - sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); - // TODO: remove or let the user set - int field_interval = 1; - // TODO: remove this dependence on number of steps - std::cout << "num_step " << num_step << std::endl; + //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); - int nframes = num_step / field_interval + 1; - static int field_tframe = 0; + //global->topology_x -#ifdef DUMP_INFO_DEBUG - printf(" meta file : %s \n", output_xml_file); - printf(" array dims per var: %s \n", dimensions_3d); - printf("array dims all vars: %s \n", dimensions_4d); - printf(" orignal: %s \n", orignal); - printf(" dxdydz: %s \n", dxdydz); - printf(" nframes: %d \n", nframes); - printf(" field_interval: %d \n", field_interval); - printf(" current step: %zd \n", step_for_viou); - printf(" current step: %zd \n", step_for_viou); - - //printf(" Simulation time: %f \n", grid->t0); - printf(" tframe: %d \n", field_tframe); -#endif + hsize_t field_global_size[3], field_local_size[3], global_offset[3], global_count[3]; + field_global_size[0] = (grid->nx * grid->gpx); + field_global_size[1] = (grid->ny * grid->gpy); + field_global_size[2] = (grid->nz * grid->gpz); - // TODO: this footer dumping is more likely better done in a - // destructor, rather than hoping a 
multiple division works out - if (field_tframe >= 1) - { - if (field_tframe == (nframes - 1)) - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 0); - } - } - else - { - create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, field_interval); - if (field_tframe == (nframes - 1)) - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 0); - } - } - field_tframe++; - } - } - void dump_particles( - const char *fbase, - species_t* sp, - grid_t* grid, - int step, - interpolator_array_t* interpolator_array, - int ftag - ) - { - size_t step_for_viou = step; - char fname[256]; - char group_name[256]; - char particle_scratch[128]; - char subparticle_scratch[128]; - - int np_local; - - float *Pf; - int *Pi; - - // get the total number of particles. 
in this example, output only electrons - //sp = species_list; - sprintf(particle_scratch, DUMP_DIR_FORMAT, "particle_hdf5"); - FileUtils::makeDirectory(particle_scratch); - sprintf(subparticle_scratch, "%s/T.%ld/", particle_scratch, step_for_viou); - FileUtils::makeDirectory(subparticle_scratch); - - // TODO: Allow the user to set this - int stride_particle_dump = 1; - - np_local = (sp->np + stride_particle_dump - 1) / stride_particle_dump; - - // make a copy of the part of particle data to be dumped - double ec1 = uptime(); - - int sp_np = sp->np; - int sp_max_np = sp->max_np; - particle_t *ALIGNED(128) p_buf = NULL; - if (!p_buf) - MALLOC_ALIGNED(p_buf, np_local, 128); - particle_t *sp_p = sp->p; - sp->p = p_buf; - sp->np = np_local; - sp->max_np = np_local; - - for (long long iptl = 0, i = 0; iptl < sp_np; iptl += stride_particle_dump, ++i) - { - COPY(&sp->p[i], &sp_p[iptl], 1); - } + field_local_size[0] = grid->nx; + field_local_size[1] = grid->ny; + field_local_size[2] = grid->nz; - center_p(sp, interpolator_array); + int gpx = grid->gpx; + int gpy = grid->gpy; + int gpz = grid->gpz; - ec1 = uptime() - ec1; - int mpi_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + // Convert rank to local decomposition + int rx, ry, rz; + UNVOXEL(mpi_rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); - //std::cout << "on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local << std::endl; - io_log("on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local); + int mpi_rank_x, mpi_rank_y, mpi_rank_z; + mpi_rank_x = rx; + mpi_rank_y = ry; + mpi_rank_z = rz; - Pf = (float *)sp->p; - Pi = (int *)sp->p; + global_offset[0] = (grid->nx) * mpi_rank_x; + global_offset[1] = (grid->ny) * mpi_rank_y; + global_offset[2] = (grid->nz) * mpi_rank_z; - // open HDF5 file in "particle/T./" subdirectory - // filename: eparticle.h5p - sprintf(fname, "%s/%s_%ld.h5", subparticle_scratch, 
sp->name, step_for_viou); - sprintf(group_name, "/Timestep_%ld", step_for_viou); - double el1 = uptime(); + global_count[0] = (grid->nx); + global_count[1] = (grid->ny); + global_count[2] = (grid->nz); - hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); - H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); - hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); - hid_t group_id = H5Gcreate(file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); +#ifdef DUMP_INFO_DEBUG + if(mpi_rank < 4){ + printf("grid nx, ny nz = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); + printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + printf("global size = %llu %llu %llu \n", field_global_size[0], field_global_size[1], field_global_size[2]); + printf("global_offset = %llu %llu %llu \n", global_offset[0], global_offset[1], global_offset[2]); + printf("global_count = %llu %llu %llu \n", global_count[0], global_count[1], global_count[2]); + fflush(stdout); + } +#endif - H5Pclose(plist_id); + hid_t filespace; //= H5Screate_simple(3, field_global_size, NULL); + hid_t memspace; // = H5Screate_simple(3, field_local_size, NULL); + //if(!file_exist_flag){ + filespace = H5Screate_simple(3, field_global_size, NULL); + //} + memspace = H5Screate_simple(3, field_local_size, NULL); - long long total_particles, offset; - long long numparticles = np_local; - MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - offset -= numparticles; + hsize_t chunk_dims[3]; + chunk_dims[0] = 288; //grid->nx; //8 x 8 x 8 + chunk_dims[1] = 24; //grid->ny; // + chunk_dims[2] = 24; //grid->nz; - hid_t filespace = H5Screate_simple(1, (hsize_t *)&total_particles, NULL); - hsize_t memspace_count_temp = numparticles * 8; - hid_t memspace = H5Screate_simple(1, &memspace_count_temp, NULL); - hsize_t linearspace_count_temp = 
numparticles; - hid_t linearspace = H5Screate_simple(1, &linearspace_count_temp, NULL); + hid_t dataspace_id; + hid_t dcpl_id = H5Pcreate(H5P_DATASET_CREATE); +#ifdef CHUNK_FLAG + H5Pset_chunk(dcpl_id, 3, chunk_dims); + if(!mpi_rank) printf("Enable chunking !\n"); +#endif - plist_id = H5Pcreate(H5P_DATASET_XFER); +#define DUMP_FIELD_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ + { \ + dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, dcpl_id, H5P_DEFAULT); \ + temp_buf_index = 0; \ + for (size_t i(1); i < grid->nx + 1; i++) \ + { \ + for (size_t j(1); j < grid->ny + 1; j++) \ + { \ + for (size_t k(1); k < grid->nz + 1; k++) \ + { \ + temp_buf[temp_buf_index] = FIELD_ARRAY_NAME->fpp(i, j, k).ATTRIBUTE_NAME; \ + temp_buf_index = temp_buf_index + 1; \ + } \ + } \ + } \ + dataspace_id = H5Dget_space(dset_id); \ + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ + H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ + H5Sclose(dataspace_id); \ + H5Dclose(dset_id); \ + } + /* + typedef struct field { + float ex, ey, ez, div_e_err; // Electric field and div E error + float cbx, cby, cbz, div_b_err; // Magnetic field and div B error + float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density + float jfx, jfy, jfz, rhof; // Free current and charge density + material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes + material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers + } field_t;*/ + + +#ifdef HAS_FIELD_COMP + field_t * field_buf; + temp_buf_index = 0; + int global_index; + field_buf = (field_t *)malloc(sizeof(field_t) * (grid->nx) * (grid->ny) * (grid->nz)); + for (size_t i(1); i < grid->nx + 1; i++){ + for (size_t j(1); j < grid->ny + 1; j++){ + for (size_t k(1); k < grid->nz + 1; k++){ + field_buf[temp_buf_index] = FIELD_ARRAY_NAME->fpp(i, j, k); + temp_buf_index++; + } + } + } + dset_id = H5Dcreate(group_id, 
"field", field_comp_type_it, filespace, H5P_DEFAULT, dcpl_id, H5P_DEFAULT); + dataspace_id = H5Dget_space(dset_id); + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); + H5Dwrite(dset_id, field_comp_type_it, memspace, dataspace_id, plist_id, field_buf); + free(field_buf); + H5Sclose(dataspace_id); + H5Dclose(dset_id); + H5Tclose(field_comp_type_it); +#else - //Comment out for test only - H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); - H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *)&offset, NULL, (hsize_t *)&numparticles, NULL); + if (field_dump_flag.ex) + DUMP_FIELD_TO_HDF5("ex", ex, H5T_NATIVE_FLOAT); + if (field_dump_flag.ey) + DUMP_FIELD_TO_HDF5("ey", ey, H5T_NATIVE_FLOAT); + if (field_dump_flag.ez) + DUMP_FIELD_TO_HDF5("ez", ez, H5T_NATIVE_FLOAT); + if (field_dump_flag.div_e_err) + DUMP_FIELD_TO_HDF5("div_e_err", div_e_err, H5T_NATIVE_FLOAT); + + if (field_dump_flag.cbx) + DUMP_FIELD_TO_HDF5("cbx", cbx, H5T_NATIVE_FLOAT); + if (field_dump_flag.cby) + DUMP_FIELD_TO_HDF5("cby", cby, H5T_NATIVE_FLOAT); + if (field_dump_flag.cbz) + DUMP_FIELD_TO_HDF5("cbz", cbz, H5T_NATIVE_FLOAT); + if (field_dump_flag.div_b_err) + DUMP_FIELD_TO_HDF5("div_b_err", div_b_err, H5T_NATIVE_FLOAT); + + if (field_dump_flag.tcax) + DUMP_FIELD_TO_HDF5("tcax", tcax, H5T_NATIVE_FLOAT); + if (field_dump_flag.tcay) + DUMP_FIELD_TO_HDF5("tcay", tcay, H5T_NATIVE_FLOAT); + if (field_dump_flag.tcaz) + DUMP_FIELD_TO_HDF5("tcaz", tcaz, H5T_NATIVE_FLOAT); + if (field_dump_flag.rhob) + DUMP_FIELD_TO_HDF5("rhob", rhob, H5T_NATIVE_FLOAT); + + if (field_dump_flag.jfx) + DUMP_FIELD_TO_HDF5("jfx", jfx, H5T_NATIVE_FLOAT); + if (field_dump_flag.jfy) + DUMP_FIELD_TO_HDF5("jfy", jfy, H5T_NATIVE_FLOAT); + if (field_dump_flag.jfz) + DUMP_FIELD_TO_HDF5("jfz", jfz, H5T_NATIVE_FLOAT); + if (field_dump_flag.rhof) + DUMP_FIELD_TO_HDF5("rhof", rhof, H5T_NATIVE_FLOAT); + + //H5T_NATIVE_SHORT for material_id (typedef int16_t material_id) + if 
(field_dump_flag.ematx) + DUMP_FIELD_TO_HDF5("ematx", ematx, H5T_NATIVE_SHORT); + if (field_dump_flag.ematy) + DUMP_FIELD_TO_HDF5("ematy", ematy, H5T_NATIVE_SHORT); + if (field_dump_flag.ematz) + DUMP_FIELD_TO_HDF5("ematz", ematz, H5T_NATIVE_SHORT); + if (field_dump_flag.nmat) + DUMP_FIELD_TO_HDF5("nmat", nmat, H5T_NATIVE_SHORT); + + if (field_dump_flag.fmatx) + DUMP_FIELD_TO_HDF5("fmatx", fmatx, H5T_NATIVE_SHORT); + if (field_dump_flag.fmaty) + DUMP_FIELD_TO_HDF5("fmaty", fmaty, H5T_NATIVE_SHORT); + if (field_dump_flag.fmatz) + DUMP_FIELD_TO_HDF5("fmatz", fmatz, H5T_NATIVE_SHORT); + if (field_dump_flag.cmat) + DUMP_FIELD_TO_HDF5("cmat", cmat, H5T_NATIVE_SHORT); - hsize_t memspace_start = 0, memspace_stride = 8, memspace_count = np_local; - H5Sselect_hyperslab(memspace, H5S_SELECT_SET, &memspace_start, &memspace_stride, &memspace_count, NULL); +#endif - el1 = uptime() - el1; - io_log("Particle TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts - double el2 = uptime(); + H5D_mpio_actual_io_mode_t actual_io_mode; + H5Pget_mpio_actual_io_mode(plist_id, &actual_io_mode); + /* + + switch(actual_io_mode){ + case H5D_MPIO_NO_COLLECTIVE: + io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_NO_COLLECTIVE: "); + break; + case H5D_MPIO_CHUNK_INDEPENDENT: + io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CHUNK_INDEPENDENT: "); + break; + case H5D_MPIO_CHUNK_COLLECTIVE: + io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CHUNK_COLLECTIVE: "); + break; + case H5D_MPIO_CHUNK_MIXED: + io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CHUNK_MIXED: "); + break; + case H5D_MPIO_CONTIGUOUS_COLLECTIVE: + io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CONTIGUOUS_COLLECTIVE: "); + break; + default : + io_log("H5Pget_mpio_actual_io_mode: None returend: "); + break; + } + + H5D_mpio_actual_chunk_opt_mode_t actual_chunk_opt_mode; + H5Pget_mpio_actual_chunk_opt_mode(plist_id, &actual_chunk_opt_mode); + switch(actual_chunk_opt_mode){ + case H5D_MPIO_NO_CHUNK_OPTIMIZATION: + 
io_log("H5Pget_mpio_actual_chunk_opt_mode: H5D_MPIO_NO_CHUNK_OPTIMIZATION: "); + break; + case H5D_MPIO_MULTI_CHUNK: + io_log("H5Pget_mpio_actual_chunk_opt_mode: H5D_MPIO_MULTI_CHUNK: "); + break; + // case H5D_MPIO_MULTI_CHUNK_NO_OPT: + // io_log("H5Pget_mpio_actual_chunk_opt_mode: H5D_MPIO_MULTI_CHUNK_NO_OPT: "); + // break; + case H5D_MPIO_LINK_CHUNK: + io_log("H5Pget_mpio_actual_chunk_opt_mode: H5D_MPIO_LINK_CHUNK: "); + break; + default : + io_log("H5Pget_mpio_actual_chunk_opt_mode: None returend: "); + break; + } - // This point offset is silly, and loses the type safety (pf+1) - hid_t dset_id = H5Dcreate(group_id, "dX", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - int ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf); - H5Dclose(dset_id); + uint32_t local_no_collective_cause, global_no_collective_cause; + H5Pget_mpio_no_collective_cause(plist_id, &local_no_collective_cause, &global_no_collective_cause); + + switch(local_no_collective_cause){ + case H5D_MPIO_COLLECTIVE: + io_log("local_no_collective_cause: H5D_MPIO_COLLECTIVE: "); + break; + case H5D_MPIO_SET_INDEPENDENT: + io_log("local_no_collective_cause: H5D_MPIO_SET_INDEPENDENT: "); + break; + case H5D_MPIO_DATA_TRANSFORMS: + io_log("local_no_collective_cause: H5D_MPIO_DATA_TRANSFORMS: "); + break; + //case H5D_MPIO_SET_MPIPOSIX: + // io_log("local_no_collective_cause: H5D_MPIO_SET_MPIPOSIX: "); + // break; + case H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: + io_log("local_no_collective_cause: H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: "); + break; + //case H5D_MPIO_POINT_SELECTIONS: + // io_log("local_no_collective_cause: H5D_MPIO_POINT_SELECTIONS: "); + // break; + // case H5D_MPIO_FILTERS: + // io_log("local_no_collective_cause: H5D_MPIO_FILTERS: "); + // break; + default : + io_log("local_no_collective_cause: None returend: "); + break; + } + + + switch(global_no_collective_cause){ + case H5D_MPIO_COLLECTIVE: + io_log("global_no_collective_cause: 
H5D_MPIO_COLLECTIVE: "); + break; + case H5D_MPIO_SET_INDEPENDENT: + io_log("global_no_collective_cause: H5D_MPIO_SET_INDEPENDENT: "); + break; + case H5D_MPIO_DATA_TRANSFORMS: + io_log("global_no_collective_cause: H5D_MPIO_DATA_TRANSFORMS: "); + break; + //case H5D_MPIO_SET_MPIPOSIX: + // io_log("global_no_collective_cause: H5D_MPIO_SET_MPIPOSIX: "); + // break; + case H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: + io_log("global_no_collective_cause: H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: "); + break; + //case H5D_MPIO_POINT_SELECTIONS: + // io_log("global_no_collective_cause: H5D_MPIO_POINT_SELECTIONS: "); + // break; + // case H5D_MPIO_FILTERS: + // io_log("global_no_collective_cause: H5D_MPIO_FILTERS: "); + // break; + default : + io_log("global_no_collective_cause: None returend: "); + break; + } + */ + + el2 = uptime() - el2; + //io_log("TimeHDF5Write: " << el2 << " s"); + + double el3 = uptime(); + + /* + //Write metadata (geo original and geo dx/dy/dz) for ArrayUDF + float attr_data[2][3]; + attr_data[0][0] = grid->x0; + attr_data[0][1] = grid->y0; + attr_data[0][2] = grid->z0; + attr_data[1][0] = grid->dx; + attr_data[1][1] = grid->dy; + attr_data[1][2] = grid->dz; + hsize_t dims[2]; + dims[0] = 2; + dims[1] = 3; + if(!file_exist_flag){ + hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); + hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); + H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); + H5Sclose(va_geo_dataspace_id); + H5Aclose(va_geo_attribute_id); + } + */ + free(temp_buf); + //if(!file_exist_flag) + H5Sclose(filespace); + H5Sclose(memspace); + H5Pclose(plist_id); + H5Gclose(group_id); + H5Fclose(file_id); + + el3 = uptime() - el3; + //io_log("TimeHDF5Close: " << el3 << " s"); + + if (mpi_rank == 0) + { + char const *output_xml_file = "./field_hdf5/hdf5_field.xdmf"; + char dimensions_3d[128]; + sprintf(dimensions_3d, "%lld %lld %lld", 
field_global_size[0], field_global_size[1], field_global_size[2]); + char dimensions_4d[128]; + sprintf(dimensions_4d, "%lld %lld %lld %d", field_global_size[0], field_global_size[1], field_global_size[2], 3); + char orignal[128]; + sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); + char dxdydz[128]; + sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); + + + // TODO: remove or let the user set + int field_interval = 1; + + // TODO: remove this dependence on number of steps + //std::cout << "num_step " << num_step << std::endl; + + int nframes = num_step / field_interval + 1; + static int field_tframe = 0; - dset_id = H5Dcreate(group_id, "dY", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 1); - H5Dclose(dset_id); +#ifdef DUMP_INFO_DEBUG + printf(" meta file : %s \n", output_xml_file); + printf(" array dims per var: %s \n", dimensions_3d); + printf("array dims all vars: %s \n", dimensions_4d); + printf(" orignal: %s \n", orignal); + printf(" dxdydz: %s \n", dxdydz); + printf(" nframes: %d \n", nframes); + printf(" field_interval: %d \n", field_interval); + printf(" current step: %zd \n", step_for_viou); + printf(" current step: %zd \n", step_for_viou); + + //printf(" Simulation time: %f \n", grid->t0); + printf(" tframe: %d \n", field_tframe); +#endif - dset_id = H5Dcreate(group_id, "dZ", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 2); - H5Dclose(dset_id); + // TODO: this footer dumping is more likely better done in a + // destructor, rather than hoping a multiple division works out + if (field_tframe >= 1) + { + if (field_tframe == (nframes - 1)) + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, 
dimensions_4d, dimensions_3d, 0); + } + } + else + { + create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, field_interval); + if (field_tframe == (nframes - 1)) + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 0); + } + } + field_tframe++; + } + } + /** + * @brief dump_particles to the HDF5 file + * Author: Bin Dong dbin@lbl.gov + * https://crd.lbl.gov/bin-dong + * Nov 2020 + * @param fbase + * @param sp + * @param grid + * @param step + * @param interpolator_array + * @param ftag + */ + void dump_particles( + const char *fbase, + species_t* sp, + grid_t* grid, + int step, + interpolator_array_t* interpolator_array, + int ftag + ) + { + static int file_index = 0; + file_index ++; + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + double dump_particles_uptime = uptime(); + time_t seconds = time(NULL); + // printf("Atrank = %d, file_index = %d, dump_particles_uptime = %f, epoch_seconds = %ld \n ", mpi_rank, file_index, dump_particles_uptime, seconds); + + + size_t step_for_viou = step; + char fname[256]; + char group_name[256]; + char particle_scratch[128]; + char subparticle_scratch[128]; + + int np_local; + + float *Pf; + int *Pi; + + // get the total number of particles. 
in this example, output only electrons + //sp = species_list; + sprintf(particle_scratch, DUMP_DIR_FORMAT, "particle_hdf5"); + FileUtils::makeDirectory(particle_scratch); + sprintf(subparticle_scratch, "%s/T.%ld/", particle_scratch, step_for_viou); + FileUtils::makeDirectory(subparticle_scratch); + + // TODO: Allow the user to set this + int stride_particle_dump = 1; + + np_local = (sp->np + stride_particle_dump - 1) / stride_particle_dump; + + // make a copy of the part of particle data to be dumped + double ec1 = uptime(); + + int sp_np = sp->np; + int sp_max_np = sp->max_np; + particle_t *ALIGNED(128) p_buf = NULL; + if (!p_buf) + MALLOC_ALIGNED(p_buf, np_local, 128); + particle_t *sp_p = sp->p; + sp->p = p_buf; + sp->np = np_local; + sp->max_np = np_local; + + for (long long iptl = 0, i = 0; iptl < sp_np; iptl += stride_particle_dump, ++i) + { + COPY(&sp->p[i], &sp_p[iptl], 1); + } - // TODO: should we add the ability to chunk the particle write? + center_p(sp, interpolator_array); -#ifdef OUTPUT_CONVERT_GLOBAL_ID + ec1 = uptime() - ec1; - // TODO: make a function out of this too, its used in openpmd - std::vector global_pi; - global_pi.resize(numparticles); - // TODO: this could be parallel - for (int i = 0; i < numparticles; i++) - { - int local_i = sp->p[i].i; - int ix, iy, iz, rx, ry, rz; + //if(!mpi_rank || mpi_rank == 2047 ) + // std::cout << "on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local << std::endl; - // Convert rank to local x/y/z - UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); +#ifndef N_FILE_N_PROCESS + int np_local_max, np_local_min; + MPI_Reduce(&np_local, &np_local_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD); + MPI_Reduce(&np_local, &np_local_min, 1, MPI_INT, MPI_MIN, 0, MPI_COMM_WORLD); + //io_log("on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local << ",np_local_max = " << np_local_max << ", local_min = 
"<< np_local_min); +#endif - // Calculate local ix/iy/iz - UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); + Pf = (float *)sp->p; + Pi = (int *)sp->p; - // Convert ix/iy/iz to global - int gix = ix + (grid->nx * (rx)); - int giy = iy + (grid->ny * (ry)); - int giz = iz + (grid->nz * (rz)); + // open HDF5 file in "particle/T./" subdirectory + // filename: eparticle.h5p +#ifndef N_FILE_N_PROCESS + sprintf(fname, "%s/%s_%ld.h5", subparticle_scratch, sp->name, step_for_viou); +#else + sprintf(fname, "%s/%s_%ld_p%d.h5", subparticle_scratch, sp->name, step_for_viou, mpi_rank); +#endif - // calculate global grid sizes - int gnx = grid->nx * grid->gpx; - int gny = grid->ny * grid->gpy; - int gnz = grid->nz * grid->gpz; + sprintf(group_name, "/Timestep_%ld", step_for_viou); + double el1 = uptime(); - // TODO: find a better way to account for the hard coded ghosts in VOXEL - int global_i = VOXEL(gix, giy, giz, gnx-2, gny-2, gnz-2); - //std::cout << rank << " local i " << local_i << " becomes " << global_i << std::endl; - global_pi[i] = global_i; - } - dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, linearspace, filespace, plist_id, global_pi.data()); - H5Dclose(dset_id); + long long total_particles, offset; + long long numparticles = np_local; +#ifndef N_FILE_N_PROCESS + MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + offset -= numparticles; #else - dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, memspace, filespace, plist_id, Pi + 3); - H5Dclose(dset_id); + total_particles = np_local; + offset = 0; #endif - dset_id = H5Dcreate(group_id, "Ux", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, 
H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 4); - H5Dclose(dset_id); - - dset_id = H5Dcreate(group_id, "Uy", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 5); - H5Dclose(dset_id); - - dset_id = H5Dcreate(group_id, "Uz", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 6); - H5Dclose(dset_id); - - dset_id = H5Dcreate(group_id, "q", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 7); - H5Dclose(dset_id); - - el2 = uptime() - el2; - io_log("Particle TimeHDF5Write: " << el2 << " s"); - - double el3 = uptime(); - H5Sclose(memspace); - H5Sclose(filespace); - H5Pclose(plist_id); - H5Gclose(group_id); - H5Fclose(file_id); - el3 = uptime() - el3; - io_log("Particle TimeHDF5Close: " << el3 << " s"); - - sp->p = sp_p; - sp->np = sp_np; - sp->max_np = sp_max_np; - FREE_ALIGNED(p_buf); - - // Write metadata if step() == 0 - char meta_fname[256]; - - sprintf(meta_fname, "%s/grid_metadata_%s_%ld.h5", subparticle_scratch, sp->name, step_for_viou); - - double meta_el1 = uptime(); - - hid_t meta_plist_id = H5Pcreate(H5P_FILE_ACCESS); - H5Pset_fapl_mpio(meta_plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); - hid_t meta_file_id = H5Fcreate(meta_fname, H5F_ACC_TRUNC, H5P_DEFAULT, meta_plist_id); - hid_t meta_group_id = H5Gcreate(meta_file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - H5Pclose(meta_plist_id); - - long long meta_total_particles, meta_offset; - long long meta_numparticles = 1; - MPI_Allreduce(&meta_numparticles, &meta_total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - MPI_Scan(&meta_numparticles, &meta_offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - meta_offset -= meta_numparticles; - - hid_t meta_filespace = H5Screate_simple(1, (hsize_t 
*)&meta_total_particles, NULL); - hid_t meta_memspace = H5Screate_simple(1, (hsize_t *)&meta_numparticles, NULL); - meta_plist_id = H5Pcreate(H5P_DATASET_XFER); - H5Pset_dxpl_mpio(meta_plist_id, H5FD_MPIO_COLLECTIVE); - H5Sselect_hyperslab(meta_filespace, H5S_SELECT_SET, (hsize_t *)&meta_offset, NULL, (hsize_t *)&meta_numparticles, NULL); - meta_el1 = uptime() - meta_el1; - io_log("Metafile TimeHDF5Open): " << meta_el1 << " s"); //Easy to handle results for scripts - - double meta_el2 = uptime(); - - hid_t meta_dset_id = H5Dcreate(meta_group_id, "np_local", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, (int32_t *)&np_local); - H5Dclose(meta_dset_id); - //if (rank == 0) printf ("Written variable dX \n"); - - meta_dset_id = H5Dcreate(meta_group_id, "nx", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->nx); - H5Dclose(meta_dset_id); - //if (rank == 0) printf ("Written variable dY \n"); - - meta_dset_id = H5Dcreate(meta_group_id, "ny", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->ny); - H5Dclose(meta_dset_id); - //if (rank == 0) printf ("Written variable dZ \n"); - - meta_dset_id = H5Dcreate(meta_group_id, "nz", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->nz); - H5Dclose(meta_dset_id); - //if (rank == 0) printf ("Written variable i \n"); - - meta_dset_id = H5Dcreate(meta_group_id, "x0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->x0); - 
H5Dclose(meta_dset_id); - - meta_dset_id = H5Dcreate(meta_group_id, "y0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->y0); - H5Dclose(meta_dset_id); - - meta_dset_id = H5Dcreate(meta_group_id, "z0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->z0); - H5Dclose(meta_dset_id); - - meta_dset_id = H5Dcreate(meta_group_id, "dx", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dx); - H5Dclose(meta_dset_id); - - meta_dset_id = H5Dcreate(meta_group_id, "dy", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dy); - H5Dclose(meta_dset_id); - - meta_dset_id = H5Dcreate(meta_group_id, "dz", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dz); - H5Dclose(meta_dset_id); - - meta_el2 = uptime() - meta_el2; - io_log("Metafile TimeHDF5Write: " << meta_el2 << " s"); - double meta_el3 = uptime(); - H5Sclose(meta_memspace); - H5Sclose(meta_filespace); - H5Pclose(meta_plist_id); - H5Gclose(meta_group_id); - H5Fclose(meta_file_id); - meta_el3 = uptime() - meta_el3; - io_log("Metafile TimeHDF5Close: " << meta_el3 << " s"); - } + hid_t file_plist_id = H5Pcreate(H5P_FILE_ACCESS); - void dump_hydro( - const char *fbase, - int step, - hydro_array_t* hydro_array, - species_t* sp, - interpolator_array_t* interpolator_array, - grid_t* grid, - int ftag - ) - { - size_t step_for_viou = step; +#ifndef N_FILE_N_PROCESS + H5Pset_fapl_mpio(file_plist_id, 
MPI_COMM_WORLD, MPI_INFO_NULL); +#endif -#define DUMP_HYDRO_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ - { \ - dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ - temp_buf_index = 0; \ - for (size_t i(1); i < grid->nx + 1; i++) \ - { \ - for (size_t j(1); j < grid->ny + 1; j++) \ - { \ - for (size_t k(1); k < grid->nz + 1; k++) \ - { \ - temp_buf[temp_buf_index] = _hydro(i, j, k).ATTRIBUTE_NAME; \ - temp_buf_index = temp_buf_index + 1; \ - } \ - } \ - } \ - dataspace_id = H5Dget_space(dset_id); \ - H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ - H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ - H5Sclose(dataspace_id); \ - H5Dclose(dset_id); \ - } - //#define DUMP_INFO_DEBUG 1 - int mpi_size, mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); +#ifdef H5_ASYNC + if(!mpi_rank) printf("Enable async on particle data"); - if (!sp) - { - ERROR(("Invalid species")); - } + assert(H5Pset_vol_async(file_plist_id)); +#endif - clear_hydro_array(hydro_array); - accumulate_hydro_p(hydro_array, sp, interpolator_array); - synchronize_hydro_array(hydro_array); + hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, file_plist_id); + //if(!mpi_rank ) + //io_log("++Particle H5Fcreate) "); - char hname[256]; - char hydro_scratch[128]; - char subhydro_scratch[128]; - sprintf(hydro_scratch, "./%s", "hydro_hdf5"); - FileUtils::makeDirectory(hydro_scratch); - sprintf(subhydro_scratch, "%s/T.%zu/", hydro_scratch, step_for_viou); - FileUtils::makeDirectory(subhydro_scratch); + hid_t group_id = H5Gcreate(file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + //if(!mpi_rank ) + //io_log("++Particle H5Gcreate) "); - sprintf(hname, "%s/hydro_%s_%zu.h5", subhydro_scratch, sp->name, step_for_viou); - double el1 = uptime(); - hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); - 
H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); - hid_t file_id = H5Fcreate(hname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); - H5Pclose(plist_id); +#ifdef HAS_PARTICLE_COMP + if(!mpi_rank) + printf("Using Partilce Compund type !\n"); + hid_t particle_comp_type_it = H5Tcreate(H5T_COMPOUND, sizeof(particle_t)); + H5Tinsert(particle_comp_type_it, "dx", HOFFSET(particle_t, dx), H5T_NATIVE_FLOAT); + H5Tinsert(particle_comp_type_it, "dy", HOFFSET(particle_t, dy), H5T_NATIVE_FLOAT); + H5Tinsert(particle_comp_type_it, "dz", HOFFSET(particle_t, dz), H5T_NATIVE_FLOAT); - sprintf(hname, "Timestep_%zu", step_for_viou); - hid_t group_id = H5Gcreate(file_id, hname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + H5Tinsert(particle_comp_type_it, "i", HOFFSET(particle_t, i), H5T_NATIVE_INT); - el1 = uptime() - el1; - io_log("TimeHDF5Open: " << el1 << " s"); //Easy to handle results for scripts - double el2 = uptime(); + H5Tinsert(particle_comp_type_it, "ux", HOFFSET(particle_t, ux), H5T_NATIVE_FLOAT); + H5Tinsert(particle_comp_type_it, "uy", HOFFSET(particle_t, uy), H5T_NATIVE_FLOAT); + H5Tinsert(particle_comp_type_it, "uz", HOFFSET(particle_t, uz), H5T_NATIVE_FLOAT); + H5Tinsert(particle_comp_type_it, "w", HOFFSET(particle_t, w), H5T_NATIVE_FLOAT); +#endif - // Create a variable list of field values to output. 
- //size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); - //size_t *varlist = new size_t[numvars]; + hid_t filespace = H5Screate_simple(1, (hsize_t *)&total_particles, NULL); + H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *)&offset, NULL, (hsize_t *)&numparticles, NULL); - //for (size_t i(0), c(0); i < total_field_variables; i++) - // if (global->fdParams.output_vars.bitset(i)) - // varlist[c++] = i; + //if(!mpi_rank ) + //io_log("++Particle H5Sselect_hyperslab) "); - //printf("\nBEGIN_OUTPUT: numvars = %zu \n", numvars); + //plist_id = H5P_DEFAULT; + hid_t io_plist_id = H5Pcreate(H5P_DATASET_XFER); +#ifndef N_FILE_N_PROCESS +#ifdef HAS_INDEPENDENT_IO + if(!mpi_rank) { + printf("\n ###\n VPIC Independent I/O! \n ###\n"); + } + H5Pset_dxpl_mpio(io_plist_id, H5FD_MPIO_INDEPENDENT); +#else + H5Pset_dxpl_mpio(io_plist_id, H5FD_MPIO_COLLECTIVE); +#endif +#endif - //typedef struct hydro { - // float jx, jy, jz, rho; // Current and charge density => , - // float px, py, pz, ke; // Momentum and K.E. 
density => , - // float txx, tyy, tzz; // Stress diagonal => , i==j - // float tyz, tzx, txy; // Stress off-diagonal => , i!=j - // float _pad[2]; // 16-byte align - //} hydro_t; +#ifdef H5_ASYNC + H5Pset_dxpl_async(io_plist_id, true); +#endif + hsize_t linearspace_count_temp = numparticles; + hid_t linearspace = H5Screate_simple(1, &linearspace_count_temp, NULL); + + hsize_t memspace_count_temp; + hid_t memspace; +#ifdef HAS_PARTICLE_COMP + memspace_count_temp = numparticles ; + memspace = H5Screate_simple(1, &memspace_count_temp, NULL); +#else + memspace_count_temp = numparticles * 8; + memspace = H5Screate_simple(1, &memspace_count_temp, NULL); + hsize_t memspace_start = 0, memspace_stride = 8, memspace_count = np_local; + H5Sselect_hyperslab(memspace, H5S_SELECT_SET, &memspace_start, &memspace_stride, &memspace_count, NULL); +#endif + el1 = uptime() - el1; + //if(!mpi_rank || mpi_rank == 2047 ) + //io_log("Particle TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts + double el2 = uptime(); + int ierr; + +#define WRITE_H5_FILE(group_id_p, data_buf_p, type_p, dname_p){\ + hid_t dset_id = H5Dcreate(group_id_p, dname_p, type_p, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ + H5Dwrite(dset_id, type_p, memspace, filespace, io_plist_id, data_buf_p);\ + H5Dclose(dset_id);\ +} + + + //MPI_Info_set(info, "romio_cb_write", "disable"); +#define WRITE_MPI_FILE(dname_p, offset_p, data_buf_p, count_p, type_p){\ + MPI_File fh;\ + MPI_Status status;\ + sprintf(fname, "%s/%s_%ld_%s.h5", subparticle_scratch, sp->name, step_for_viou, dname_p);\ + if(mpi_rank == 0) printf("fname= %s \n", fname);\ + MPI_Info info;\ + MPI_Info_create(&info);\ + MPI_File_open(MPI_COMM_WORLD, fname, MPI_MODE_WRONLY | MPI_MODE_CREATE, info, &fh);\ + MPI_File_write_at(fh, offset_p, data_buf_p, count_p,type_p, &status);\ + MPI_Info_free(&info);\ + MPI_File_close(&fh);\ +} + +#ifdef HAS_PARTICLE_COMP + hid_t dset_id = H5Dcreate(group_id, "particle", particle_comp_type_it, 
filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + H5Dwrite(dset_id, particle_comp_type_it, memspace, filespace, io_plist_id, sp->p); + H5Dclose(dset_id); +#else +#ifdef TEST_MPIIO + //Here we don't use the stripe but just for performance test + if(!mpi_rank) printf("Test MPI-IO\n"); + WRITE_MPI_FILE("dX", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("dY", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("dZ", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("i", offset * sizeof(int), Pf, numparticles, MPI_INT); + WRITE_MPI_FILE("ux", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("uy", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("uz", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("q", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); +#else +#ifndef N_FILE_N_PROCESS + if(!mpi_rank) printf("Test HDF5-IO Single \n"); +#else + if(!mpi_rank) printf("Test HDF5-IO N Files N Process\n"); +#endif + //if(!mpi_rank ) + //io_log("++Particle Starting to write ) "); + WRITE_H5_FILE(group_id, Pf, H5T_NATIVE_FLOAT, "dX") + WRITE_H5_FILE(group_id, Pf+1, H5T_NATIVE_FLOAT, "dY") + WRITE_H5_FILE(group_id, Pf+2, H5T_NATIVE_FLOAT, "dZ") + WRITE_H5_FILE(group_id, Pi+3, H5T_NATIVE_INT, "i") + WRITE_H5_FILE(group_id, Pf+4, H5T_NATIVE_FLOAT, "ux") + WRITE_H5_FILE(group_id, Pf+5, H5T_NATIVE_FLOAT, "uy") + WRITE_H5_FILE(group_id, Pf+6, H5T_NATIVE_FLOAT, "uz") + WRITE_H5_FILE(group_id, Pf+7, H5T_NATIVE_FLOAT, "q") +#endif +#endif + el2 = uptime() - el2; + //io_log("Particle TimeHDF5Write: " << el2 << " s"); - //typedef struct hydro_array { - // hydro_t * ALIGNED(128) h; - // grid_t * g; - //} hydro_array_t; + double el3 = uptime(); + H5Sclose(memspace); + H5Sclose(filespace); + H5Pclose(file_plist_id); + H5Pclose(io_plist_id); + H5Gclose(group_id); - float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); - 
hsize_t temp_buf_index; - hid_t dset_id; - //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; - plist_id = H5Pcreate(H5P_DATASET_XFER); - //Comment out for test only - H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); - //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); - //global->topology_x - hsize_t hydro_global_size[3], hydro_local_size[3], global_offset[3], global_count[3]; - hydro_global_size[0] = (grid->nx * grid->gpx); - hydro_global_size[1] = (grid->ny * grid->gpy); - hydro_global_size[2] = (grid->nz * grid->gpz); + H5Fclose(file_id); - hydro_local_size[0] = grid->nx; - hydro_local_size[1] = grid->ny; - hydro_local_size[2] = grid->nz; +#ifdef H5_ASYNC + H5VLasync_finalize(); +#endif + el3 = uptime() - el3; + //io_log("Particle TimeHDF5Close: " << el3 << " s"); - int mpi_rank_x, mpi_rank_y, mpi_rank_z; - UNVOXEL(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z, grid->gpx, grid->gpy, grid->gpz); + } - global_offset[0] = (grid->nx) * mpi_rank_x; - global_offset[1] = (grid->ny) * mpi_rank_y; - global_offset[2] = (grid->nz) * mpi_rank_z; +/** + * @brief Dump hydro data to the HDf5 file + * Author: Bin Dong dbin@lbl.gov + * https://crd.lbl.gov/bin-dong + * Nov 2020 + * @param fbase + * @param step + * @param hydro_array + * @param sp + * @param interpolator_array + * @param grid + * @param ftag + */ +void dump_hydro( + const char *fbase, + int step, + hydro_array_t* hydro_array, + species_t* sp, + interpolator_array_t* interpolator_array, + grid_t* grid, + int ftag + ) +{ + size_t step_for_viou = step; - global_count[0] = (grid->nx); - global_count[1] = (grid->ny); - global_count[2] = (grid->nz); +#define DUMP_HYDRO_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ + { \ + dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ + temp_buf_index = 0; \ + for (size_t 
i(1); i < grid->nx + 1; i++) \ + { \ + for (size_t j(1); j < grid->ny + 1; j++) \ + { \ + for (size_t k(1); k < grid->nz + 1; k++) \ + { \ + temp_buf[temp_buf_index] = _hydro(i, j, k).ATTRIBUTE_NAME; \ + temp_buf_index = temp_buf_index + 1; \ + } \ + } \ + } \ + dataspace_id = H5Dget_space(dset_id); \ + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ + H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ + H5Sclose(dataspace_id); \ + H5Dclose(dset_id); \ + } + //#define DUMP_INFO_DEBUG 1 + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + if (!sp) + { + ERROR(("Invalid species")); + } + + clear_hydro_array(hydro_array); + accumulate_hydro_p(hydro_array, sp, interpolator_array); + synchronize_hydro_array(hydro_array); + + char hname[256]; + char hydro_scratch[128]; + char subhydro_scratch[128]; + + sprintf(hydro_scratch, "./%s", "hydro_hdf5"); + FileUtils::makeDirectory(hydro_scratch); + sprintf(subhydro_scratch, "%s/T.%zu/", hydro_scratch, step_for_viou); + FileUtils::makeDirectory(subhydro_scratch); + + sprintf(hname, "%s/hydro_%s_%zu.h5", subhydro_scratch, sp->name, step_for_viou); + double el1 = uptime(); + hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); + + /* + if(H5Pset_libver_bounds(plist_id, H5F_LIBVER_LATEST, H5F_LIBVER_LATEST) < 0){ + exit(-1); + }*/ + //if((fid = H5Fcreate(FILENAME, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id)) < 0) + // ERROR_RETURN; + + H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + hid_t file_id = H5Fcreate(hname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); + H5Pclose(plist_id); + + sprintf(hname, "Timestep_%zu", step_for_viou); + hid_t group_id = H5Gcreate(file_id, hname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + + el1 = uptime() - el1; + //io_log("TimeHDF5Open: " << el1 << " s"); //Easy to handle results for scripts + double el2 = uptime(); + + // Create a variable list of field values to output. 
+ //size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); + //size_t *varlist = new size_t[numvars]; + + //for (size_t i(0), c(0); i < total_field_variables; i++) + // if (global->fdParams.output_vars.bitset(i)) + // varlist[c++] = i; + + //printf("\nBEGIN_OUTPUT: numvars = %zu \n", numvars); + + + //typedef struct hydro { + // float jx, jy, jz, rho; // Current and charge density => , + // float px, py, pz, ke; // Momentum and K.E. density => , + // float txx, tyy, tzz; // Stress diagonal => , i==j + // float tyz, tzx, txy; // Stress off-diagonal => , i!=j + // float _pad[2]; // 16-byte align + //} hydro_t; +#ifdef HAS_HYDRO_COMP + //if(!mpi_rank) + //printf("Using Field Compund type !\n"); + hid_t hydro_comp_type_it = H5Tcreate (H5T_COMPOUND, sizeof(hydro_t)); + H5Tinsert(hydro_comp_type_it, "jx", HOFFSET(hydro_t, jx), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "jy", HOFFSET(hydro_t, jy), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "jz", HOFFSET(hydro_t, jz), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "rho", HOFFSET(hydro_t, rho), H5T_NATIVE_FLOAT); + + H5Tinsert(hydro_comp_type_it, "px", HOFFSET(hydro_t, px), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "py", HOFFSET(hydro_t, py), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "pz", HOFFSET(hydro_t, pz), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "ke", HOFFSET(hydro_t, ke), H5T_NATIVE_FLOAT); + + H5Tinsert(hydro_comp_type_it, "txx", HOFFSET(hydro_t, txx), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "tyy", HOFFSET(hydro_t, tyy), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "tzz", HOFFSET(hydro_t, tzz), H5T_NATIVE_FLOAT); + + H5Tinsert(hydro_comp_type_it, "tyz", HOFFSET(hydro_t, tyz), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "tzx", HOFFSET(hydro_t, tzx), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "txy", HOFFSET(hydro_t, txy), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "pad", HOFFSET(hydro_t, _pad), 
H5T_NATIVE_DOUBLE); +#endif + //typedef struct hydro_array { + // hydro_t * ALIGNED(128) h; + // grid_t * g; + //} hydro_array_t; + + float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); + hsize_t temp_buf_index; + hid_t dset_id; + //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; + plist_id = H5Pcreate(H5P_DATASET_XFER); + //Comment out for test only + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); + //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); + + //global->topology_x + + hsize_t hydro_global_size[3], hydro_local_size[3], global_offset[3], global_count[3]; + hydro_global_size[0] = (grid->nx * grid->gpx); + hydro_global_size[1] = (grid->ny * grid->gpy); + hydro_global_size[2] = (grid->nz * grid->gpz); + + hydro_local_size[0] = grid->nx; + hydro_local_size[1] = grid->ny; + hydro_local_size[2] = grid->nz; + + int mpi_rank_x, mpi_rank_y, mpi_rank_z; + UNVOXEL(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z, grid->gpx, grid->gpy, grid->gpz); + + global_offset[0] = (grid->nx) * mpi_rank_x; + global_offset[1] = (grid->ny) * mpi_rank_y; + global_offset[2] = (grid->nz) * mpi_rank_z; + + global_count[0] = (grid->nx); + global_count[1] = (grid->ny); + global_count[2] = (grid->nz); #ifdef DUMP_INFO_DEBUG - printf("global size = %llu %llu %llu \n", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); - printf("global_offset = %llu %llu %llu \n", global_offset[0], global_offset[1], global_offset[2]); - printf("global_count = %llu %llu %llu \n", global_count[0], global_count[1], global_count[2]); - printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); - fflush(stdout); + printf("global size = %llu %llu %llu \n", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); + printf("global_offset = %llu %llu %llu \n", 
global_offset[0], global_offset[1], global_offset[2]); + printf("global_count = %llu %llu %llu \n", global_count[0], global_count[1], global_count[2]); + printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + fflush(stdout); #endif - hid_t filespace = H5Screate_simple(3, hydro_global_size, NULL); - hid_t memspace = H5Screate_simple(3, hydro_local_size, NULL); - hid_t dataspace_id; - - //typedef struct hydro { - // float jx, jy, jz, rho; // Current and charge density => , - // float px, py, pz, ke; // Momentum and K.E. density => , - // float txx, tyy, tzz; // Stress diagonal => , i==j - // float tyz, tzx, txy; // Stress off-diagonal => , i!=j - // float _pad[2]; // 16-byte align - //} hydro_t; - - if (hydro_dump_flag.jx) - DUMP_HYDRO_TO_HDF5("jx", jx, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.jy) - DUMP_HYDRO_TO_HDF5("jy", jy, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.jz) - DUMP_HYDRO_TO_HDF5("jz", jz, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.rho) - DUMP_HYDRO_TO_HDF5("rho", rho, H5T_NATIVE_FLOAT); - - if (hydro_dump_flag.px) - DUMP_HYDRO_TO_HDF5("px", px, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.py) - DUMP_HYDRO_TO_HDF5("py", py, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.pz) - DUMP_HYDRO_TO_HDF5("pz", pz, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.ke) - DUMP_HYDRO_TO_HDF5("ke", ke, H5T_NATIVE_FLOAT); - - if (hydro_dump_flag.txx) - DUMP_HYDRO_TO_HDF5("txx", txx, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.tyy) - DUMP_HYDRO_TO_HDF5("tyy", tyy, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.tzz) - DUMP_HYDRO_TO_HDF5("tzz", tzz, H5T_NATIVE_FLOAT); - - if (hydro_dump_flag.tyz) - DUMP_HYDRO_TO_HDF5("tyz", tyz, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.tzx) - DUMP_HYDRO_TO_HDF5("tzx", tzx, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.txy) - DUMP_HYDRO_TO_HDF5("txy", txy, H5T_NATIVE_FLOAT); - - el2 = uptime() - el2; - io_log("TimeHDF5Write: " << el2 << " s"); - - double el3 = uptime(); - - //Write metadata (geo original and geo dx/dy/dz) for 
ArrayUDF - float attr_data[2][3]; - attr_data[0][0] = grid->x0; - attr_data[0][1] = grid->y0; - attr_data[0][2] = grid->z0; - attr_data[1][0] = grid->dx; - attr_data[1][1] = grid->dy; - attr_data[1][2] = grid->dz; - hsize_t dims[2]; - dims[0] = 2; - dims[1] = 3; - hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); - hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); - H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); - H5Sclose(va_geo_dataspace_id); - H5Aclose(va_geo_attribute_id); - - free(temp_buf); - H5Sclose(filespace); - H5Sclose(memspace); - H5Pclose(plist_id); - H5Gclose(group_id); - H5Fclose(file_id); - - el3 = uptime() - el3; - io_log("TimeHDF5Close: " << el3 << " s"); - - if (mpi_rank == 0) - { - char output_xml_file[128]; - sprintf(output_xml_file, "./%s/%s%s%s", "hydro_hdf5", "hydro-", sp->name, ".xdmf"); - char dimensions_3d[128]; - sprintf(dimensions_3d, "%lld %lld %lld", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); - char dimensions_4d[128]; - sprintf(dimensions_4d, "%lld %lld %lld %d", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2], 3); - char orignal[128]; - sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); - char dxdydz[128]; - sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); - - // TODO: remove or let user set - int hydro_interval = 1; + hid_t filespace = H5Screate_simple(3, hydro_global_size, NULL); + hid_t memspace = H5Screate_simple(3, hydro_local_size, NULL); + hid_t dataspace_id; + + //typedef struct hydro { + // float jx, jy, jz, rho; // Current and charge density => , + // float px, py, pz, ke; // Momentum and K.E. 
density => , + // float txx, tyy, tzz; // Stress diagonal => , i==j + // float tyz, tzx, txy; // Stress off-diagonal => , i!=j + // float _pad[2]; // 16-byte align + //} hydro_t; + + +#ifdef HAS_HYDRO_COMP + hydro_t *hydro_buf = (hydro_t *)malloc(sizeof(hydro_t) * (grid->nx) * (grid->ny) * (grid->nz)); + temp_buf_index = 0; + for (size_t i(1); i < grid->nx + 1; i++){ + for (size_t j(1); j < grid->ny + 1; j++){ + for (size_t k(1); k < grid->nz + 1; k++){ + hydro_buf[temp_buf_index] = _hydro(i, j, k); + temp_buf_index = temp_buf_index + 1; + } + } + } + dset_id = H5Dcreate(group_id, "hydro", hydro_comp_type_it, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + + dataspace_id = H5Dget_space(dset_id); + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); + H5Dwrite(dset_id, hydro_comp_type_it, memspace, dataspace_id, plist_id, hydro_buf); + free(hydro_buf); + H5Sclose(dataspace_id); + H5Dclose(dset_id); + H5Tclose(hydro_comp_type_it); +#else - // TODO: remove this dependence on number of steps - int nframes = num_step / hydro_interval + 1; + if (hydro_dump_flag.jx) + DUMP_HYDRO_TO_HDF5("jx", jx, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.jy) + DUMP_HYDRO_TO_HDF5("jy", jy, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.jz) + DUMP_HYDRO_TO_HDF5("jz", jz, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.rho) + DUMP_HYDRO_TO_HDF5("rho", rho, H5T_NATIVE_FLOAT); + + if (hydro_dump_flag.px) + DUMP_HYDRO_TO_HDF5("px", px, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.py) + DUMP_HYDRO_TO_HDF5("py", py, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.pz) + DUMP_HYDRO_TO_HDF5("pz", pz, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.ke) + DUMP_HYDRO_TO_HDF5("ke", ke, H5T_NATIVE_FLOAT); + + if (hydro_dump_flag.txx) + DUMP_HYDRO_TO_HDF5("txx", txx, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.tyy) + DUMP_HYDRO_TO_HDF5("tyy", tyy, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.tzz) + DUMP_HYDRO_TO_HDF5("tzz", tzz, H5T_NATIVE_FLOAT); + + if (hydro_dump_flag.tyz) + 
DUMP_HYDRO_TO_HDF5("tyz", tyz, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.tzx) + DUMP_HYDRO_TO_HDF5("tzx", tzx, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.txy) + DUMP_HYDRO_TO_HDF5("txy", txy, H5T_NATIVE_FLOAT); + + el2 = uptime() - el2; + //io_log("TimeHDF5Write: " << el2 << " s"); - const int tframe = tframe_map[sp->id]; +#endif + double el3 = uptime(); + + //Write metadata (geo original and geo dx/dy/dz) for ArrayUDF + /* + float attr_data[2][3]; + attr_data[0][0] = grid->x0; + attr_data[0][1] = grid->y0; + attr_data[0][2] = grid->z0; + attr_data[1][0] = grid->dx; + attr_data[1][1] = grid->dy; + attr_data[1][2] = grid->dz; + hsize_t dims[2]; + dims[0] = 2; + dims[1] = 3; + hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); + hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); + H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); + H5Sclose(va_geo_dataspace_id); + H5Aclose(va_geo_attribute_id);*/ + + free(temp_buf); + H5Sclose(filespace); + H5Sclose(memspace); + H5Pclose(plist_id); + H5Gclose(group_id); + H5Fclose(file_id); + + el3 = uptime() - el3; + //io_log("TimeHDF5Close: " << el3 << " s"); + + if (mpi_rank == 0) + { + char output_xml_file[128]; + sprintf(output_xml_file, "./%s/%s%s%s", "hydro_hdf5", "hydro-", sp->name, ".xdmf"); + char dimensions_3d[128]; + sprintf(dimensions_3d, "%lld %lld %lld", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); + char dimensions_4d[128]; + sprintf(dimensions_4d, "%lld %lld %lld %d", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2], 3); + char orignal[128]; + sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); + char dxdydz[128]; + sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); + + // TODO: remove or let user set + int hydro_interval = 1; + + // TODO: remove this dependence on number of steps + int nframes = num_step / hydro_interval + 1; + + const int tframe = 
tframe_map[sp->id]; #ifdef DUMP_INFO_DEBUG - printf(" meta file : %s \n", output_xml_file); - printf(" array dims per var: %s \n", dimensions_3d); - printf("array dims all vars: %s \n", dimensions_4d); - printf(" orignal: %s \n", orignal); - printf(" dxdydz: %s \n", dxdydz); - printf(" nframes: %d \n", nframes); - printf(" hydro_fields_interval: %d \n", hydro_interval); - printf(" current step: %zu \n", step_for_viou); - printf(" Simulation time: %f \n", grid->t0); - printf(" tframe: %d \n", tframe); + printf(" meta file : %s \n", output_xml_file); + printf(" array dims per var: %s \n", dimensions_3d); + printf("array dims all vars: %s \n", dimensions_4d); + printf(" orignal: %s \n", orignal); + printf(" dxdydz: %s \n", dxdydz); + printf(" nframes: %d \n", nframes); + printf(" hydro_fields_interval: %d \n", hydro_interval); + printf(" current step: %zu \n", step_for_viou); + printf(" Simulation time: %f \n", grid->t0); + printf(" tframe: %d \n", tframe); #endif - // TODO: why doesnt this just use the cstr? - char speciesname_new[128]; - sprintf(speciesname_new, "hydro_%s", sp->name); - if (tframe >= 1) - { - if (tframe == (nframes - 1)) - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); - } - } - else - { - create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, hydro_interval); - if (tframe == (nframes - 1)) - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); - } - } - tframe_map[sp->id]++; + // TODO: why doesnt this just use the cstr? 
+ char speciesname_new[128]; + sprintf(speciesname_new, "hydro_%s", sp->name); + if (tframe >= 1) + { + if (tframe == (nframes - 1)) + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); + } + } + else + { + create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, hydro_interval); + if (tframe == (nframes - 1)) + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); } } + tframe_map[sp->id]++; + } +} }; #endif @@ -1141,24 +1469,24 @@ class OpenPMDDump : public Dump_Strategy { std::string file_type = ".bp"; void dump_fields( - const char *fbase, - int step, - grid_t* grid, - field_array_t* field_array, - int ftag - ) + const char *fbase, + int step, + grid_t* grid, + field_array_t* field_array, + int ftag + ) { std::cout << "Writing openPMD data" << std::endl; std::string full_file_name = fbase + file_type; //if (series == nullptr) { - std::cout << "init series" << std::endl; - openPMD::Series series = openPMD::Series( - full_file_name, - openPMD::AccessType::CREATE, - MPI_COMM_WORLD - ); + std::cout << "init series" << std::endl; + openPMD::Series series = openPMD::Series( + full_file_name, + openPMD::AccessType::CREATE, + MPI_COMM_WORLD + ); //} std::cout << "Writing iteration " << step << std::endl; @@ -1422,24 +1750,24 @@ class OpenPMDDump : public Dump_Strategy { } void dump_particles( - const char *fbase, - species_t* sp, - grid_t* grid, - int step, - interpolator_array_t* interpolator_array, - int ftag - ) + const char *fbase, + species_t* sp, + grid_t* grid, + int step, + interpolator_array_t* interpolator_array, + int ftag + ) { std::string full_file_name = fbase + file_type; 
std::cout << "writing particles to " << full_file_name << std::endl; //if (series == nullptr) { - openPMD::Series series = openPMD::Series( - full_file_name, - openPMD::AccessType::CREATE, - MPI_COMM_WORLD - ); + openPMD::Series series = openPMD::Series( + full_file_name, + openPMD::AccessType::CREATE, + MPI_COMM_WORLD + ); //} auto i = series.iterations[ step ]; @@ -1568,25 +1896,25 @@ class OpenPMDDump : public Dump_Strategy { } } void dump_hydro( - const char *fbase, - int step, - hydro_array_t* hydro_array, - species_t* sp, - interpolator_array_t* interpolator_array, - grid_t* grid, - int ftag - ) + const char *fbase, + int step, + hydro_array_t* hydro_array, + species_t* sp, + interpolator_array_t* interpolator_array, + grid_t* grid, + int ftag + ) { std::string full_file_name = fbase + file_type; std::cout << "OpenPMD dumping hydro to " << full_file_name << std::endl; //if (series == nullptr) { - openPMD::Series series = openPMD::Series( - full_file_name, - openPMD::AccessType::CREATE, - MPI_COMM_WORLD - ); + openPMD::Series series = openPMD::Series( + full_file_name, + openPMD::AccessType::CREATE, + MPI_COMM_WORLD + ); //} auto i = series.iterations[ step ]; From 657500e86ee36849666d5c57c37bea543889d8a1 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Mon, 23 Nov 2020 10:33:57 -0700 Subject: [PATCH 94/95] apply clang format to dump_strategy.h --- src/vpic/dump_strategy.h | 3239 +++++++++++++++++++------------------- 1 file changed, 1583 insertions(+), 1656 deletions(-) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index c6fb7613..85f70b1e 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1,9 +1,9 @@ #ifndef Dump_Strategy_h #define Dump_Strategy_h +#include #include #include -#include //#define DUMP_INFO_DEBUG 1 //#define H5_ASYNC 1 @@ -12,35 +12,31 @@ #endif //#define CHUNK_FLAG 1 - //#define METADATA_COLL_WRITE 1 //#define TRUE 1 - #define HAS_FIELD_COMP 1 #define HAS_PARTICLE_COMP 1 #define HAS_HYDRO_COMP 1 
//#define HAS_INDEPENDENT_IO 1 -#include // TODO: it would be good if this didn't have to know about MPI #include - +#include // TODO: it would be good if this didn't have to know about MPI // TODO: should I drop the ./src here? -#include "../util/io/FileIO.h" -#include "../util/util_base.h" -#include "../util/io/FileUtils.h" #include "../field_advance/field_advance.h" #include "../sf_interface/sf_interface.h" #include "../species_advance/species_advance.h" +#include "../util/io/FileIO.h" +#include "../util/io/FileUtils.h" +#include "../util/util_base.h" #include "dump.h" #include "dumpmacros.h" - #ifdef VPIC_ENABLE_HDF5 -#include "hdf5.h" // from the lib +#include "hdf5.h" // from the lib #include "hdf5_header_info.h" // from vpic #endif @@ -48,13 +44,12 @@ #include #endif - //#define N_FILE_N_PROCESS 1 //#define TEST_MPIIO 1 // TODO: delete this -#define _LOG_PREFIX \ - __FILE__ "(" EXPAND_AND_STRINGIFY(__LINE__) ")[" << rank << "]: " +#define _LOG_PREFIX \ + __FILE__ "(" EXPAND_AND_STRINGIFY(__LINE__) ")[" << rank << "]: " /* #define io_log(x) do { \ @@ -65,686 +60,623 @@ std::cerr.flush(); \ } while(0) */ - - // Runtime inheritance is obviously not very "VPIC like", as we will [probably] // incur a penalty for the vtable lookup, but given we're about to do IO this // is very negligible. 
class Dump_Strategy { - public: - int rank, nproc, num_step; - - Dump_Strategy(int _rank, int _nproc ) : - rank(_rank), - nproc(_nproc) - { } // empty - - virtual ~Dump_Strategy() { }; - - virtual void dump_fields( - const char *fbase, - int step, - grid_t* grid, - field_array_t* field_array, - int ftag - ) = 0; - virtual void dump_hydro( - const char *fbase, - int step, - hydro_array_t* hydro_array, - species_t* sp, - interpolator_array_t* interpolator_array, - grid_t* grid, - int ftag - ) = 0; - virtual void dump_particles( - const char *fbase, - species_t* sp, - grid_t* grid, - int step, - interpolator_array_t* interpolator_array, - int ftag - ) = 0; +public: + int rank, nproc, num_step; + + Dump_Strategy(int _rank, int _nproc) : rank(_rank), nproc(_nproc) {} // empty + + virtual ~Dump_Strategy(){}; + + virtual void dump_fields(const char *fbase, int step, grid_t *grid, + field_array_t *field_array, int ftag) = 0; + virtual void dump_hydro(const char *fbase, int step, + hydro_array_t *hydro_array, species_t *sp, + interpolator_array_t *interpolator_array, + grid_t *grid, int ftag) = 0; + virtual void dump_particles(const char *fbase, species_t *sp, grid_t *grid, + int step, + interpolator_array_t *interpolator_array, + int ftag) = 0; }; class BinaryDump : public Dump_Strategy { - public: - using Dump_Strategy::Dump_Strategy; // inherit constructor - //BinaryDump(int _rank, int _nproc ) : Dump_Strategy(_rank, _nproc ){ } // empty - - // TODO: now we pass rank and step, ftag has odd semanticds - void dump_fields( - const char *fbase, - int step, - grid_t* grid, - field_array_t* field_array, - int ftag - ); - void dump_hydro( - const char *fbase, - int step, - hydro_array_t* hydro_array, - species_t* sp, - interpolator_array_t* interpolator_array, - grid_t* grid, - int ftag - ); - void dump_particles( - const char *fbase, - species_t* sp, - grid_t* grid, - int step, - interpolator_array_t* interpolator_array, - int ftag - ); +public: + using 
Dump_Strategy::Dump_Strategy; // inherit constructor + // BinaryDump(int _rank, int _nproc ) : Dump_Strategy(_rank, _nproc ){ } // + // empty + + // TODO: now we pass rank and step, ftag has odd semanticds + void dump_fields(const char *fbase, int step, grid_t *grid, + field_array_t *field_array, int ftag); + void dump_hydro(const char *fbase, int step, hydro_array_t *hydro_array, + species_t *sp, interpolator_array_t *interpolator_array, + grid_t *grid, int ftag); + void dump_particles(const char *fbase, species_t *sp, grid_t *grid, int step, + interpolator_array_t *interpolator_array, int ftag); }; #ifdef VPIC_ENABLE_HDF5 -struct field_dump_flag_t -{ - bool ex = true, ey = true, ez = true, div_e_err = true; - bool cbx = true, cby = true, cbz = true, div_b_err = true; - bool tcax = true, tcay = true, tcaz = true, rhob = true; - bool jfx = true, jfy = true, jfz = true, rhof = true; - bool ematx = true, ematy = true, ematz = true, nmat = true; - bool fmatx = true, fmaty = true, fmatz = true, cmat = true; - void disableE() - { - ex = false, ey = false, ez = false, div_e_err = false; - } +struct field_dump_flag_t { + bool ex = true, ey = true, ez = true, div_e_err = true; + bool cbx = true, cby = true, cbz = true, div_b_err = true; + bool tcax = true, tcay = true, tcaz = true, rhob = true; + bool jfx = true, jfy = true, jfz = true, rhof = true; + bool ematx = true, ematy = true, ematz = true, nmat = true; + bool fmatx = true, fmaty = true, fmatz = true, cmat = true; + void disableE() { ex = false, ey = false, ez = false, div_e_err = false; } - void disableCB() - { - cbx = false, cby = false, cbz = false, div_b_err = false; - } + void disableCB() { cbx = false, cby = false, cbz = false, div_b_err = false; } - void disableTCA() - { - tcax = false, tcay = false, tcaz = false, rhob = false; - } + void disableTCA() { tcax = false, tcay = false, tcaz = false, rhob = false; } - void disableJF() - { - jfx = false, jfy = false, jfz = false, rhof = false; - } + void disableJF() 
{ jfx = false, jfy = false, jfz = false, rhof = false; } - void disableEMAT() - { - ematx = false, ematy = false, ematz = false, nmat = false; - } + void disableEMAT() { + ematx = false, ematy = false, ematz = false, nmat = false; + } - void disableFMAT() - { - fmatx = false, fmaty = false, fmatz = false, cmat = false; - } + void disableFMAT() { + fmatx = false, fmaty = false, fmatz = false, cmat = false; + } - void resetToDefaults() - { - ex = true, ey = true, ez = true, div_e_err = true; - cbx = true, cby = true, cbz = true, div_b_err = true; - tcax = true, tcay = true, tcaz = true, rhob = true; - jfx = true, jfy = true, jfz = true, rhof = true; - ematx = true, ematy = true, ematz = true, nmat = true; - fmatx = true, fmaty = true, fmatz = true, cmat = true; - } + void resetToDefaults() { + ex = true, ey = true, ez = true, div_e_err = true; + cbx = true, cby = true, cbz = true, div_b_err = true; + tcax = true, tcay = true, tcaz = true, rhob = true; + jfx = true, jfy = true, jfz = true, rhof = true; + ematx = true, ematy = true, ematz = true, nmat = true; + fmatx = true, fmaty = true, fmatz = true, cmat = true; + } - bool enabledE() - { - return ex && ey && ez; - } + bool enabledE() { return ex && ey && ez; } - bool enabledCB() - { - return cbx && cby && cbz; - } + bool enabledCB() { return cbx && cby && cbz; } - bool enabledTCA() - { - return tcax && tcay && tcaz; - } + bool enabledTCA() { return tcax && tcay && tcaz; } - bool enabledJF() - { - return jfx && jfy && jfz; - } + bool enabledJF() { return jfx && jfy && jfz; } - bool enabledEMAT() - { - return ematx && ematy && ematz; - } + bool enabledEMAT() { return ematx && ematy && ematz; } - bool enabledFMAT() - { - return fmatx && fmaty && fmatz; - } + bool enabledFMAT() { return fmatx && fmaty && fmatz; } }; -struct hydro_dump_flag_t -{ - bool jx = true, jy = true, jz = true, rho = true; - bool px = true, py = true, pz = true, ke = true; - bool txx = true, tyy = true, tzz = true; - bool tyz = true, tzx = true, 
txy = true; +struct hydro_dump_flag_t { + bool jx = true, jy = true, jz = true, rho = true; + bool px = true, py = true, pz = true, ke = true; + bool txx = true, tyy = true, tzz = true; + bool tyz = true, tzx = true, txy = true; - void disableJ() - { - jx = false, jy = false, jz = false, rho = false; - } + void disableJ() { jx = false, jy = false, jz = false, rho = false; } - void disableP() - { - px = false, py = false, pz = false, ke = false; - } + void disableP() { px = false, py = false, pz = false, ke = false; } - void disableTD() //Stress diagonal - { - txx = false, tyy = false, tzz = false; - } + void disableTD() // Stress diagonal + { + txx = false, tyy = false, tzz = false; + } - void disableTOD() //Stress off-diagonal - { - tyz = false, tzx = false, txy = false; - } - void resetToDefaults() - { - jx = true, jy = true, jz = true, rho = true; - px = true, py = true, pz = true, ke = true; - txx = true, tyy = true, tzz = true; - tyz = true, tzx = true, txy = true; - } + void disableTOD() // Stress off-diagonal + { + tyz = false, tzx = false, txy = false; + } + void resetToDefaults() { + jx = true, jy = true, jz = true, rho = true; + px = true, py = true, pz = true, ke = true; + txx = true, tyy = true, tzz = true; + tyz = true, tzx = true, txy = true; + } - bool enabledJ() - { - return jx && jy && jz; - } + bool enabledJ() { return jx && jy && jz; } - bool enabledP() - { - return px && py && pz; - } + bool enabledP() { return px && py && pz; } - bool enabledTD() - { - return txx && tyy && tzz; - } + bool enabledTD() { return txx && tyy && tzz; } - bool enabledTOD() - { - return tyz && tzx && txy; - } + bool enabledTOD() { return tyz && tzx && txy; } }; class HDF5Dump : public Dump_Strategy { - std::unordered_map tframe_map; - public: - using Dump_Strategy::Dump_Strategy; // inherit constructor + std::unordered_map tframe_map; - // TODO: replace these with a common dump interface - // Declare vars to use - hydro_dump_flag_t hydro_dump_flag; - field_dump_flag_t 
field_dump_flag; +public: + using Dump_Strategy::Dump_Strategy; // inherit constructor + + // TODO: replace these with a common dump interface + // Declare vars to use + hydro_dump_flag_t hydro_dump_flag; + field_dump_flag_t field_dump_flag; #define DUMP_DIR_FORMAT "./%s" - // TODO: naming a macro so close to existing functions AND data is not a good - // define to do C-style indexing -#define _hydro(x, y, z) hydro_array->h[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] - - - /** - * @brief Dump field data to the HDf5 file - * Author: Bin Dong dbin@lbl.gov - * https://crd.lbl.gov/bin-dong - * Nov 2020 - * @param fbase - * @param step - * @param grid - * @param field_array - * @param ftag - */ - void dump_fields( - const char *fbase, - int step, - grid_t* grid, - field_array_t* field_array, - int ftag - ) - { - size_t step_for_viou = step; - - int mpi_size, mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + // TODO: naming a macro so close to existing functions AND data is not a good + // define to do C-style indexing +#define _hydro(x, y, z) \ + hydro_array->h[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] + + /** + * @brief Dump field data to the HDf5 file + * Author: Bin Dong dbin@lbl.gov + * https://crd.lbl.gov/bin-dong + * Nov 2020 + * @param fbase + * @param step + * @param grid + * @param field_array + * @param ftag + */ + void dump_fields(const char *fbase, int step, grid_t *grid, + field_array_t *field_array, int ftag) { + size_t step_for_viou = step; + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); #ifdef DUMP_INFO_DEBUG - printf("MPI rank = %d, size = %d \n", mpi_rank, mpi_size); - //printf("base dir for field: %s \n", fdParams.baseDir); - //printf("stride x y z = (%ld, %ld, %ld)\n", fdParams.stride_x, fdParams.stride_y, fdParams.stride_z); - printf("grid x, y z = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); - printf("domain loc (x0, y0, z0) 
-> (x1, y1, z1) = (%f, %f, %f) -> (%f, %f, %f) \n", grid->x0, grid->y0, grid->z0, grid->x1, grid->y1, grid->z1); - //printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, global->topology_y, global->topology_z); - printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid->sz, grid->nv); + printf("MPI rank = %d, size = %d \n", mpi_rank, mpi_size); + // printf("base dir for field: %s \n", fdParams.baseDir); + // printf("stride x y z = (%ld, %ld, %ld)\n", fdParams.stride_x, + // fdParams.stride_y, fdParams.stride_z); + printf("grid x, y z = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); + printf("domain loc (x0, y0, z0) -> (x1, y1, z1) = (%f, %f, %f) -> (%f, %f, " + "%f) \n", + grid->x0, grid->y0, grid->z0, grid->x1, grid->y1, grid->z1); + // printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, + // global->topology_y, global->topology_z); + printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, + grid->sz, grid->nv); #endif + char fname[256]; + char field_scratch[128]; + char subfield_scratch[128]; + sprintf(field_scratch, DUMP_DIR_FORMAT, "field_hdf5"); + FileUtils::makeDirectory(field_scratch); + sprintf(subfield_scratch, "%s/T.%zu/", field_scratch, step_for_viou); + FileUtils::makeDirectory(subfield_scratch); - char fname[256]; - char field_scratch[128]; - char subfield_scratch[128]; - - sprintf(field_scratch, DUMP_DIR_FORMAT, "field_hdf5"); - FileUtils::makeDirectory(field_scratch); - sprintf(subfield_scratch, "%s/T.%zu/", field_scratch, step_for_viou); - FileUtils::makeDirectory(subfield_scratch); - - sprintf(fname, "%s/%s_%zu.h5", subfield_scratch, "fields", step_for_viou); - double el1 = uptime(); - - // int file_exist(const char *filename) - //{ - // struct stat buffer; - // return (stat(filename, &buffer) == 0); - //} - - //https://support.hdfgroup.org/ftp/HDF5/current/src/unpacked/examples/h5_compound.c -#ifdef HAS_FIELD_COMP - if(!mpi_rank) - printf("Using Field Compund type 
!\n"); - hid_t field_comp_type_it = H5Tcreate (H5T_COMPOUND, sizeof(field_t)); - H5Tinsert(field_comp_type_it, "ex", HOFFSET(field_t, ex), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "ey", HOFFSET(field_t, ey), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "ez", HOFFSET(field_t, ez), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "div_e_err", HOFFSET(field_t, div_e_err), H5T_NATIVE_FLOAT); - - H5Tinsert(field_comp_type_it, "cbx", HOFFSET(field_t, cbx), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "cby", HOFFSET(field_t, cby), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "cbz", HOFFSET(field_t, cbz), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "div_b_err", HOFFSET(field_t, div_b_err), H5T_NATIVE_FLOAT); - - H5Tinsert(field_comp_type_it, "tcax", HOFFSET(field_t, tcax), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "tcay", HOFFSET(field_t, tcay), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "tcaz", HOFFSET(field_t, tcaz), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "rhob", HOFFSET(field_t, rhob), H5T_NATIVE_FLOAT); - - H5Tinsert(field_comp_type_it, "jfx", HOFFSET(field_t, jfx), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "jfy", HOFFSET(field_t, jfy), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "jfz", HOFFSET(field_t, jfz), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "rhof", HOFFSET(field_t, rhof), H5T_NATIVE_FLOAT); - - H5Tinsert(field_comp_type_it, "ematx", HOFFSET(field_t, ematx), H5T_NATIVE_SHORT); - H5Tinsert(field_comp_type_it, "ematy", HOFFSET(field_t, ematy), H5T_NATIVE_SHORT); - H5Tinsert(field_comp_type_it, "ematz", HOFFSET(field_t, ematz), H5T_NATIVE_SHORT); - H5Tinsert(field_comp_type_it, "nmat", HOFFSET(field_t, nmat), H5T_NATIVE_SHORT); - - H5Tinsert(field_comp_type_it, "fmatx", HOFFSET(field_t, fmatx), H5T_NATIVE_SHORT); - H5Tinsert(field_comp_type_it, "fmaty", HOFFSET(field_t, fmaty), H5T_NATIVE_SHORT); - H5Tinsert(field_comp_type_it, "fmatz", HOFFSET(field_t, fmatz), 
H5T_NATIVE_SHORT); - H5Tinsert(field_comp_type_it, "cmat", HOFFSET(field_t, cmat), H5T_NATIVE_SHORT); -#endif + sprintf(fname, "%s/%s_%zu.h5", subfield_scratch, "fields", step_for_viou); + double el1 = uptime(); + // int file_exist(const char *filename) + //{ + // struct stat buffer; + // return (stat(filename, &buffer) == 0); + //} - //struct stat buffer; - //if((stat(fname, &buffer) == 0)){ - // file_exist_flag = 1; - // if(!mpi_rank) - // printf("Write original files /w HDF5! \n"); - // } - // file_exist_flag = 0; + // https://support.hdfgroup.org/ftp/HDF5/current/src/unpacked/examples/h5_compound.c +#ifdef HAS_FIELD_COMP + if (!mpi_rank) + printf("Using Field Compund type !\n"); + hid_t field_comp_type_it = H5Tcreate(H5T_COMPOUND, sizeof(field_t)); + H5Tinsert(field_comp_type_it, "ex", HOFFSET(field_t, ex), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "ey", HOFFSET(field_t, ey), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "ez", HOFFSET(field_t, ez), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "div_e_err", HOFFSET(field_t, div_e_err), + H5T_NATIVE_FLOAT); + + H5Tinsert(field_comp_type_it, "cbx", HOFFSET(field_t, cbx), + H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "cby", HOFFSET(field_t, cby), + H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "cbz", HOFFSET(field_t, cbz), + H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "div_b_err", HOFFSET(field_t, div_b_err), + H5T_NATIVE_FLOAT); + + H5Tinsert(field_comp_type_it, "tcax", HOFFSET(field_t, tcax), + H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "tcay", HOFFSET(field_t, tcay), + H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "tcaz", HOFFSET(field_t, tcaz), + H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "rhob", HOFFSET(field_t, rhob), + H5T_NATIVE_FLOAT); + + H5Tinsert(field_comp_type_it, "jfx", HOFFSET(field_t, jfx), + H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "jfy", HOFFSET(field_t, jfy), + H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "jfz", 
HOFFSET(field_t, jfz), + H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "rhof", HOFFSET(field_t, rhof), + H5T_NATIVE_FLOAT); + + H5Tinsert(field_comp_type_it, "ematx", HOFFSET(field_t, ematx), + H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "ematy", HOFFSET(field_t, ematy), + H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "ematz", HOFFSET(field_t, ematz), + H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "nmat", HOFFSET(field_t, nmat), + H5T_NATIVE_SHORT); + + H5Tinsert(field_comp_type_it, "fmatx", HOFFSET(field_t, fmatx), + H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "fmaty", HOFFSET(field_t, fmaty), + H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "fmatz", HOFFSET(field_t, fmatz), + H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "cmat", HOFFSET(field_t, cmat), + H5T_NATIVE_SHORT); +#endif - hid_t plist_id; - hid_t file_id; - plist_id = H5Pcreate(H5P_FILE_ACCESS); - H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); - //H5Pset_alignment(plist_id, 4194304, 4194304); - /*if(H5Pset_libver_bounds(plist_id, H5F_LIBVER_LATEST, H5F_LIBVER_LATEST) < 0){ - exit(-1); - }*/ + // struct stat buffer; + // if((stat(fname, &buffer) == 0)){ + // file_exist_flag = 1; + // if(!mpi_rank) + // printf("Write original files /w HDF5! 
\n"); + // } + // file_exist_flag = 0; + + hid_t plist_id; + hid_t file_id; + plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + // H5Pset_alignment(plist_id, 4194304, 4194304); + /*if(H5Pset_libver_bounds(plist_id, H5F_LIBVER_LATEST, H5F_LIBVER_LATEST) < + 0){ exit(-1); + }*/ #ifdef METADATA_COLL_WRITE - if(!mpi_rank) printf("Enable collective metadata write !\n"); - H5Pset_coll_metadata_write(plist_id, TRUE); + if (!mpi_rank) + printf("Enable collective metadata write !\n"); + H5Pset_coll_metadata_write(plist_id, TRUE); #endif - file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); - H5Pclose(plist_id); - - + file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); + H5Pclose(plist_id); - sprintf(fname, "Timestep_%zu", step_for_viou); - hid_t group_id; - group_id = H5Gcreate(file_id, fname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + sprintf(fname, "Timestep_%zu", step_for_viou); + hid_t group_id; + group_id = H5Gcreate(file_id, fname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - el1 = uptime() - el1; - //io_log("TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts - double el2 = uptime(); + el1 = uptime() - el1; + // io_log("TimeHDF5Open): " << el1 << " s"); //Easy to handle results for + // scripts + double el2 = uptime(); - /* - // Create a variable list of field values to output. - size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); - size_t * varlist = new size_t[numvars]; + /* + // Create a variable list of field values to output. 
+ size_t numvars = std::min(global->fdParams.output_vars.bitsum(), + total_field_variables); size_t * varlist = new size_t[numvars]; - for(size_t i(0), c(0); ifdParams.output_vars.bitset(i)) varlist[c++] = i; + for(size_t i(0), c(0); ifdParams.output_vars.bitset(i)) varlist[c++] = i; - printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ + printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ #define fpp(x, y, z) f[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] - /* - typedef struct field { - float ex, ey, ez, div_e_err; // Electric field and div E error - float cbx, cby, cbz, div_b_err; // Magnetic field and div B error - float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density - float jfx, jfy, jfz, rhof; // Free current and charge density - material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes - material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers - } field_t;*/ - // Local voxel mesh resolution. Voxels are - // indexed FORTRAN style 0:nx+1,0:ny+1,0:nz+1 - // with voxels 1:nx,1:ny,1:nz being non-ghost - // voxels. - - float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); - hsize_t temp_buf_index; - hid_t dset_id; - //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; - //Comment out for test only - - plist_id = H5Pcreate(H5P_DATASET_XFER); + /* + typedef struct field { + float ex, ey, ez, div_e_err; // Electric field and div E error + float cbx, cby, cbz, div_b_err; // Magnetic field and div B error + float tcax, tcay, tcaz, rhob; // TCA fields and bound charge + density float jfx, jfy, jfz, rhof; // Free current and charge + density material_id ematx, ematy, ematz, nmat; // Material at edge + centers and nodes material_id fmatx, fmaty, fmatz, cmat; // Material at + face and cell centers } field_t;*/ + // Local voxel mesh resolution. 
Voxels are + // indexed FORTRAN style 0:nx+1,0:ny+1,0:nz+1 + // with voxels 1:nx,1:ny,1:nz being non-ghost + // voxels. + + float *temp_buf = + (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); + hsize_t temp_buf_index; + hid_t dset_id; + // char *field_var_name[] = + // {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; + // Comment out for test only + + plist_id = H5Pcreate(H5P_DATASET_XFER); #ifdef HAS_INDEPENDENT_IO - if(!mpi_rank) printf("\n ###\n VPIC Independent I/O! \n ###\n"); - H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_INDEPENDENT); + if (!mpi_rank) + printf("\n ###\n VPIC Independent I/O! \n ###\n"); + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_INDEPENDENT); #else - H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); #endif + // H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, + // (hsize_t *) &numparticles, NULL); + // global->topology_x + hsize_t field_global_size[3], field_local_size[3], global_offset[3], + global_count[3]; + field_global_size[0] = (grid->nx * grid->gpx); + field_global_size[1] = (grid->ny * grid->gpy); + field_global_size[2] = (grid->nz * grid->gpz); - //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); - - //global->topology_x - - hsize_t field_global_size[3], field_local_size[3], global_offset[3], global_count[3]; - field_global_size[0] = (grid->nx * grid->gpx); - field_global_size[1] = (grid->ny * grid->gpy); - field_global_size[2] = (grid->nz * grid->gpz); - - field_local_size[0] = grid->nx; - field_local_size[1] = grid->ny; - field_local_size[2] = grid->nz; + field_local_size[0] = grid->nx; + field_local_size[1] = grid->ny; + field_local_size[2] = grid->nz; - int gpx = grid->gpx; - int gpy = grid->gpy; - int gpz = grid->gpz; + int gpx = grid->gpx; + int gpy = grid->gpy; + int gpz = grid->gpz; - // Convert rank to local 
decomposition - int rx, ry, rz; - UNVOXEL(mpi_rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + // Convert rank to local decomposition + int rx, ry, rz; + UNVOXEL(mpi_rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); - int mpi_rank_x, mpi_rank_y, mpi_rank_z; - mpi_rank_x = rx; - mpi_rank_y = ry; - mpi_rank_z = rz; + int mpi_rank_x, mpi_rank_y, mpi_rank_z; + mpi_rank_x = rx; + mpi_rank_y = ry; + mpi_rank_z = rz; - global_offset[0] = (grid->nx) * mpi_rank_x; - global_offset[1] = (grid->ny) * mpi_rank_y; - global_offset[2] = (grid->nz) * mpi_rank_z; + global_offset[0] = (grid->nx) * mpi_rank_x; + global_offset[1] = (grid->ny) * mpi_rank_y; + global_offset[2] = (grid->nz) * mpi_rank_z; - global_count[0] = (grid->nx); - global_count[1] = (grid->ny); - global_count[2] = (grid->nz); + global_count[0] = (grid->nx); + global_count[1] = (grid->ny); + global_count[2] = (grid->nz); #ifdef DUMP_INFO_DEBUG - if(mpi_rank < 4){ - printf("grid nx, ny nz = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); - printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); - printf("global size = %llu %llu %llu \n", field_global_size[0], field_global_size[1], field_global_size[2]); - printf("global_offset = %llu %llu %llu \n", global_offset[0], global_offset[1], global_offset[2]); - printf("global_count = %llu %llu %llu \n", global_count[0], global_count[1], global_count[2]); - fflush(stdout); - } + if (mpi_rank < 4) { + printf("grid nx, ny nz = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); + printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, + mpi_rank_x, mpi_rank_y, mpi_rank_z); + printf("global size = %llu %llu %llu \n", field_global_size[0], + field_global_size[1], field_global_size[2]); + printf("global_offset = %llu %llu %llu \n", global_offset[0], + global_offset[1], global_offset[2]); + printf("global_count = %llu %llu %llu \n", global_count[0], + global_count[1], global_count[2]); + fflush(stdout); + } #endif - hid_t 
filespace; //= H5Screate_simple(3, field_global_size, NULL); - hid_t memspace; // = H5Screate_simple(3, field_local_size, NULL); - //if(!file_exist_flag){ - filespace = H5Screate_simple(3, field_global_size, NULL); - //} - memspace = H5Screate_simple(3, field_local_size, NULL); + hid_t filespace; //= H5Screate_simple(3, field_global_size, NULL); + hid_t memspace; // = H5Screate_simple(3, field_local_size, NULL); + // if(!file_exist_flag){ + filespace = H5Screate_simple(3, field_global_size, NULL); + //} + memspace = H5Screate_simple(3, field_local_size, NULL); - hsize_t chunk_dims[3]; - chunk_dims[0] = 288; //grid->nx; //8 x 8 x 8 - chunk_dims[1] = 24; //grid->ny; // - chunk_dims[2] = 24; //grid->nz; + hsize_t chunk_dims[3]; + chunk_dims[0] = 288; // grid->nx; //8 x 8 x 8 + chunk_dims[1] = 24; // grid->ny; // + chunk_dims[2] = 24; // grid->nz; - - - hid_t dataspace_id; - hid_t dcpl_id = H5Pcreate(H5P_DATASET_CREATE); + hid_t dataspace_id; + hid_t dcpl_id = H5Pcreate(H5P_DATASET_CREATE); #ifdef CHUNK_FLAG - H5Pset_chunk(dcpl_id, 3, chunk_dims); - if(!mpi_rank) printf("Enable chunking !\n"); + H5Pset_chunk(dcpl_id, 3, chunk_dims); + if (!mpi_rank) + printf("Enable chunking !\n"); #endif -#define DUMP_FIELD_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ - { \ - dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, dcpl_id, H5P_DEFAULT); \ - temp_buf_index = 0; \ - for (size_t i(1); i < grid->nx + 1; i++) \ - { \ - for (size_t j(1); j < grid->ny + 1; j++) \ - { \ - for (size_t k(1); k < grid->nz + 1; k++) \ - { \ - temp_buf[temp_buf_index] = FIELD_ARRAY_NAME->fpp(i, j, k).ATTRIBUTE_NAME; \ - temp_buf_index = temp_buf_index + 1; \ - } \ - } \ - } \ - dataspace_id = H5Dget_space(dset_id); \ - H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ - H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ - H5Sclose(dataspace_id); \ - H5Dclose(dset_id); \ - } - /* - typedef struct 
field { - float ex, ey, ez, div_e_err; // Electric field and div E error - float cbx, cby, cbz, div_b_err; // Magnetic field and div B error - float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density - float jfx, jfy, jfz, rhof; // Free current and charge density - material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes - material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers - } field_t;*/ - +#define DUMP_FIELD_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ + { \ + dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, \ + H5P_DEFAULT, dcpl_id, H5P_DEFAULT); \ + temp_buf_index = 0; \ + for (size_t i(1); i < grid->nx + 1; i++) { \ + for (size_t j(1); j < grid->ny + 1; j++) { \ + for (size_t k(1); k < grid->nz + 1; k++) { \ + temp_buf[temp_buf_index] = \ + FIELD_ARRAY_NAME->fpp(i, j, k).ATTRIBUTE_NAME; \ + temp_buf_index = temp_buf_index + 1; \ + } \ + } \ + } \ + dataspace_id = H5Dget_space(dset_id); \ + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, \ + global_count, NULL); \ + H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, \ + temp_buf); \ + H5Sclose(dataspace_id); \ + H5Dclose(dset_id); \ + } + /* + typedef struct field { + float ex, ey, ez, div_e_err; // Electric field and div E + error float cbx, cby, cbz, div_b_err; // Magnetic field and div + B error float tcax, tcay, tcaz, rhob; // TCA fields and bound + charge density float jfx, jfy, jfz, rhof; // Free current + and charge density material_id ematx, ematy, ematz, nmat; // Material + at edge centers and nodes material_id fmatx, fmaty, fmatz, cmat; // + Material at face and cell centers } field_t;*/ #ifdef HAS_FIELD_COMP - field_t * field_buf; - temp_buf_index = 0; - int global_index; - field_buf = (field_t *)malloc(sizeof(field_t) * (grid->nx) * (grid->ny) * (grid->nz)); - for (size_t i(1); i < grid->nx + 1; i++){ - for (size_t j(1); j < grid->ny + 1; j++){ - for (size_t k(1); k < grid->nz + 1; 
k++){ - field_buf[temp_buf_index] = FIELD_ARRAY_NAME->fpp(i, j, k); - temp_buf_index++; - } - } + field_t *field_buf; + temp_buf_index = 0; + int global_index; + field_buf = (field_t *)malloc(sizeof(field_t) * (grid->nx) * (grid->ny) * + (grid->nz)); + for (size_t i(1); i < grid->nx + 1; i++) { + for (size_t j(1); j < grid->ny + 1; j++) { + for (size_t k(1); k < grid->nz + 1; k++) { + field_buf[temp_buf_index] = FIELD_ARRAY_NAME->fpp(i, j, k); + temp_buf_index++; } - dset_id = H5Dcreate(group_id, "field", field_comp_type_it, filespace, H5P_DEFAULT, dcpl_id, H5P_DEFAULT); - dataspace_id = H5Dget_space(dset_id); - H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); - H5Dwrite(dset_id, field_comp_type_it, memspace, dataspace_id, plist_id, field_buf); - free(field_buf); - H5Sclose(dataspace_id); - H5Dclose(dset_id); - H5Tclose(field_comp_type_it); + } + } + dset_id = H5Dcreate(group_id, "field", field_comp_type_it, filespace, + H5P_DEFAULT, dcpl_id, H5P_DEFAULT); + dataspace_id = H5Dget_space(dset_id); + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, + global_count, NULL); + H5Dwrite(dset_id, field_comp_type_it, memspace, dataspace_id, plist_id, + field_buf); + free(field_buf); + H5Sclose(dataspace_id); + H5Dclose(dset_id); + H5Tclose(field_comp_type_it); #else - if (field_dump_flag.ex) - DUMP_FIELD_TO_HDF5("ex", ex, H5T_NATIVE_FLOAT); - if (field_dump_flag.ey) - DUMP_FIELD_TO_HDF5("ey", ey, H5T_NATIVE_FLOAT); - if (field_dump_flag.ez) - DUMP_FIELD_TO_HDF5("ez", ez, H5T_NATIVE_FLOAT); - if (field_dump_flag.div_e_err) - DUMP_FIELD_TO_HDF5("div_e_err", div_e_err, H5T_NATIVE_FLOAT); - - if (field_dump_flag.cbx) - DUMP_FIELD_TO_HDF5("cbx", cbx, H5T_NATIVE_FLOAT); - if (field_dump_flag.cby) - DUMP_FIELD_TO_HDF5("cby", cby, H5T_NATIVE_FLOAT); - if (field_dump_flag.cbz) - DUMP_FIELD_TO_HDF5("cbz", cbz, H5T_NATIVE_FLOAT); - if (field_dump_flag.div_b_err) - DUMP_FIELD_TO_HDF5("div_b_err", div_b_err, 
H5T_NATIVE_FLOAT); - - if (field_dump_flag.tcax) - DUMP_FIELD_TO_HDF5("tcax", tcax, H5T_NATIVE_FLOAT); - if (field_dump_flag.tcay) - DUMP_FIELD_TO_HDF5("tcay", tcay, H5T_NATIVE_FLOAT); - if (field_dump_flag.tcaz) - DUMP_FIELD_TO_HDF5("tcaz", tcaz, H5T_NATIVE_FLOAT); - if (field_dump_flag.rhob) - DUMP_FIELD_TO_HDF5("rhob", rhob, H5T_NATIVE_FLOAT); - - if (field_dump_flag.jfx) - DUMP_FIELD_TO_HDF5("jfx", jfx, H5T_NATIVE_FLOAT); - if (field_dump_flag.jfy) - DUMP_FIELD_TO_HDF5("jfy", jfy, H5T_NATIVE_FLOAT); - if (field_dump_flag.jfz) - DUMP_FIELD_TO_HDF5("jfz", jfz, H5T_NATIVE_FLOAT); - if (field_dump_flag.rhof) - DUMP_FIELD_TO_HDF5("rhof", rhof, H5T_NATIVE_FLOAT); - - //H5T_NATIVE_SHORT for material_id (typedef int16_t material_id) - if (field_dump_flag.ematx) - DUMP_FIELD_TO_HDF5("ematx", ematx, H5T_NATIVE_SHORT); - if (field_dump_flag.ematy) - DUMP_FIELD_TO_HDF5("ematy", ematy, H5T_NATIVE_SHORT); - if (field_dump_flag.ematz) - DUMP_FIELD_TO_HDF5("ematz", ematz, H5T_NATIVE_SHORT); - if (field_dump_flag.nmat) - DUMP_FIELD_TO_HDF5("nmat", nmat, H5T_NATIVE_SHORT); - - if (field_dump_flag.fmatx) - DUMP_FIELD_TO_HDF5("fmatx", fmatx, H5T_NATIVE_SHORT); - if (field_dump_flag.fmaty) - DUMP_FIELD_TO_HDF5("fmaty", fmaty, H5T_NATIVE_SHORT); - if (field_dump_flag.fmatz) - DUMP_FIELD_TO_HDF5("fmatz", fmatz, H5T_NATIVE_SHORT); - if (field_dump_flag.cmat) - DUMP_FIELD_TO_HDF5("cmat", cmat, H5T_NATIVE_SHORT); + if (field_dump_flag.ex) + DUMP_FIELD_TO_HDF5("ex", ex, H5T_NATIVE_FLOAT); + if (field_dump_flag.ey) + DUMP_FIELD_TO_HDF5("ey", ey, H5T_NATIVE_FLOAT); + if (field_dump_flag.ez) + DUMP_FIELD_TO_HDF5("ez", ez, H5T_NATIVE_FLOAT); + if (field_dump_flag.div_e_err) + DUMP_FIELD_TO_HDF5("div_e_err", div_e_err, H5T_NATIVE_FLOAT); + + if (field_dump_flag.cbx) + DUMP_FIELD_TO_HDF5("cbx", cbx, H5T_NATIVE_FLOAT); + if (field_dump_flag.cby) + DUMP_FIELD_TO_HDF5("cby", cby, H5T_NATIVE_FLOAT); + if (field_dump_flag.cbz) + DUMP_FIELD_TO_HDF5("cbz", cbz, H5T_NATIVE_FLOAT); + if 
(field_dump_flag.div_b_err) + DUMP_FIELD_TO_HDF5("div_b_err", div_b_err, H5T_NATIVE_FLOAT); + + if (field_dump_flag.tcax) + DUMP_FIELD_TO_HDF5("tcax", tcax, H5T_NATIVE_FLOAT); + if (field_dump_flag.tcay) + DUMP_FIELD_TO_HDF5("tcay", tcay, H5T_NATIVE_FLOAT); + if (field_dump_flag.tcaz) + DUMP_FIELD_TO_HDF5("tcaz", tcaz, H5T_NATIVE_FLOAT); + if (field_dump_flag.rhob) + DUMP_FIELD_TO_HDF5("rhob", rhob, H5T_NATIVE_FLOAT); + + if (field_dump_flag.jfx) + DUMP_FIELD_TO_HDF5("jfx", jfx, H5T_NATIVE_FLOAT); + if (field_dump_flag.jfy) + DUMP_FIELD_TO_HDF5("jfy", jfy, H5T_NATIVE_FLOAT); + if (field_dump_flag.jfz) + DUMP_FIELD_TO_HDF5("jfz", jfz, H5T_NATIVE_FLOAT); + if (field_dump_flag.rhof) + DUMP_FIELD_TO_HDF5("rhof", rhof, H5T_NATIVE_FLOAT); + + // H5T_NATIVE_SHORT for material_id (typedef int16_t material_id) + if (field_dump_flag.ematx) + DUMP_FIELD_TO_HDF5("ematx", ematx, H5T_NATIVE_SHORT); + if (field_dump_flag.ematy) + DUMP_FIELD_TO_HDF5("ematy", ematy, H5T_NATIVE_SHORT); + if (field_dump_flag.ematz) + DUMP_FIELD_TO_HDF5("ematz", ematz, H5T_NATIVE_SHORT); + if (field_dump_flag.nmat) + DUMP_FIELD_TO_HDF5("nmat", nmat, H5T_NATIVE_SHORT); + + if (field_dump_flag.fmatx) + DUMP_FIELD_TO_HDF5("fmatx", fmatx, H5T_NATIVE_SHORT); + if (field_dump_flag.fmaty) + DUMP_FIELD_TO_HDF5("fmaty", fmaty, H5T_NATIVE_SHORT); + if (field_dump_flag.fmatz) + DUMP_FIELD_TO_HDF5("fmatz", fmatz, H5T_NATIVE_SHORT); + if (field_dump_flag.cmat) + DUMP_FIELD_TO_HDF5("cmat", cmat, H5T_NATIVE_SHORT); #endif + H5D_mpio_actual_io_mode_t actual_io_mode; + H5Pget_mpio_actual_io_mode(plist_id, &actual_io_mode); + /* - H5D_mpio_actual_io_mode_t actual_io_mode; - H5Pget_mpio_actual_io_mode(plist_id, &actual_io_mode); - /* - - switch(actual_io_mode){ - case H5D_MPIO_NO_COLLECTIVE: - io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_NO_COLLECTIVE: "); - break; - case H5D_MPIO_CHUNK_INDEPENDENT: - io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CHUNK_INDEPENDENT: "); - break; - case H5D_MPIO_CHUNK_COLLECTIVE: - 
io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CHUNK_COLLECTIVE: "); - break; - case H5D_MPIO_CHUNK_MIXED: - io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CHUNK_MIXED: "); - break; - case H5D_MPIO_CONTIGUOUS_COLLECTIVE: - io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CONTIGUOUS_COLLECTIVE: "); - break; - default : - io_log("H5Pget_mpio_actual_io_mode: None returend: "); - break; - } - - H5D_mpio_actual_chunk_opt_mode_t actual_chunk_opt_mode; - H5Pget_mpio_actual_chunk_opt_mode(plist_id, &actual_chunk_opt_mode); - switch(actual_chunk_opt_mode){ - case H5D_MPIO_NO_CHUNK_OPTIMIZATION: - io_log("H5Pget_mpio_actual_chunk_opt_mode: H5D_MPIO_NO_CHUNK_OPTIMIZATION: "); - break; - case H5D_MPIO_MULTI_CHUNK: - io_log("H5Pget_mpio_actual_chunk_opt_mode: H5D_MPIO_MULTI_CHUNK: "); - break; - // case H5D_MPIO_MULTI_CHUNK_NO_OPT: - // io_log("H5Pget_mpio_actual_chunk_opt_mode: H5D_MPIO_MULTI_CHUNK_NO_OPT: "); - // break; - case H5D_MPIO_LINK_CHUNK: - io_log("H5Pget_mpio_actual_chunk_opt_mode: H5D_MPIO_LINK_CHUNK: "); - break; - default : - io_log("H5Pget_mpio_actual_chunk_opt_mode: None returend: "); - break; - } + switch(actual_io_mode){ + case H5D_MPIO_NO_COLLECTIVE: + io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_NO_COLLECTIVE: "); + break; + case H5D_MPIO_CHUNK_INDEPENDENT: + io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CHUNK_INDEPENDENT: "); + break; + case H5D_MPIO_CHUNK_COLLECTIVE: + io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CHUNK_COLLECTIVE: "); + break; + case H5D_MPIO_CHUNK_MIXED: + io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CHUNK_MIXED: "); + break; + case H5D_MPIO_CONTIGUOUS_COLLECTIVE: + io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CONTIGUOUS_COLLECTIVE: "); + break; + default : + io_log("H5Pget_mpio_actual_io_mode: None returend: "); + break; + } + + H5D_mpio_actual_chunk_opt_mode_t actual_chunk_opt_mode; + H5Pget_mpio_actual_chunk_opt_mode(plist_id, &actual_chunk_opt_mode); + switch(actual_chunk_opt_mode){ + case H5D_MPIO_NO_CHUNK_OPTIMIZATION: + 
io_log("H5Pget_mpio_actual_chunk_opt_mode: +H5D_MPIO_NO_CHUNK_OPTIMIZATION: "); break; case H5D_MPIO_MULTI_CHUNK: + io_log("H5Pget_mpio_actual_chunk_opt_mode: H5D_MPIO_MULTI_CHUNK: "); + break; + // case H5D_MPIO_MULTI_CHUNK_NO_OPT: + // io_log("H5Pget_mpio_actual_chunk_opt_mode: +H5D_MPIO_MULTI_CHUNK_NO_OPT: "); + // break; + case H5D_MPIO_LINK_CHUNK: + io_log("H5Pget_mpio_actual_chunk_opt_mode: H5D_MPIO_LINK_CHUNK: "); + break; + default : + io_log("H5Pget_mpio_actual_chunk_opt_mode: None returend: "); + break; + } + + uint32_t local_no_collective_cause, global_no_collective_cause; + H5Pget_mpio_no_collective_cause(plist_id, &local_no_collective_cause, +&global_no_collective_cause); + + switch(local_no_collective_cause){ + case H5D_MPIO_COLLECTIVE: + io_log("local_no_collective_cause: H5D_MPIO_COLLECTIVE: "); + break; + case H5D_MPIO_SET_INDEPENDENT: + io_log("local_no_collective_cause: H5D_MPIO_SET_INDEPENDENT: "); + break; + case H5D_MPIO_DATA_TRANSFORMS: + io_log("local_no_collective_cause: H5D_MPIO_DATA_TRANSFORMS: "); + break; + //case H5D_MPIO_SET_MPIPOSIX: + // io_log("local_no_collective_cause: H5D_MPIO_SET_MPIPOSIX: "); + // break; + case H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: + io_log("local_no_collective_cause: H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: +"); break; + //case H5D_MPIO_POINT_SELECTIONS: + // io_log("local_no_collective_cause: H5D_MPIO_POINT_SELECTIONS: "); + // break; + // case H5D_MPIO_FILTERS: + // io_log("local_no_collective_cause: H5D_MPIO_FILTERS: "); + // break; + default : + io_log("local_no_collective_cause: None returend: "); + break; +} - uint32_t local_no_collective_cause, global_no_collective_cause; - H5Pget_mpio_no_collective_cause(plist_id, &local_no_collective_cause, &global_no_collective_cause); - switch(local_no_collective_cause){ - case H5D_MPIO_COLLECTIVE: - io_log("local_no_collective_cause: H5D_MPIO_COLLECTIVE: "); +switch(global_no_collective_cause){ + case H5D_MPIO_COLLECTIVE: + io_log("global_no_collective_cause: 
H5D_MPIO_COLLECTIVE: "); break; - case H5D_MPIO_SET_INDEPENDENT: - io_log("local_no_collective_cause: H5D_MPIO_SET_INDEPENDENT: "); + case H5D_MPIO_SET_INDEPENDENT: + io_log("global_no_collective_cause: H5D_MPIO_SET_INDEPENDENT: "); break; - case H5D_MPIO_DATA_TRANSFORMS: - io_log("local_no_collective_cause: H5D_MPIO_DATA_TRANSFORMS: "); + case H5D_MPIO_DATA_TRANSFORMS: + io_log("global_no_collective_cause: H5D_MPIO_DATA_TRANSFORMS: "); break; //case H5D_MPIO_SET_MPIPOSIX: - // io_log("local_no_collective_cause: H5D_MPIO_SET_MPIPOSIX: "); + // io_log("global_no_collective_cause: H5D_MPIO_SET_MPIPOSIX: "); // break; - case H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: - io_log("local_no_collective_cause: H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: "); - break; + case H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: + io_log("global_no_collective_cause: +H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: "); break; //case H5D_MPIO_POINT_SELECTIONS: - // io_log("local_no_collective_cause: H5D_MPIO_POINT_SELECTIONS: "); + // io_log("global_no_collective_cause: H5D_MPIO_POINT_SELECTIONS: "); // break; // case H5D_MPIO_FILTERS: - // io_log("local_no_collective_cause: H5D_MPIO_FILTERS: "); - // break; - default : - io_log("local_no_collective_cause: None returend: "); + // io_log("global_no_collective_cause: H5D_MPIO_FILTERS: "); + // break; + default : + io_log("global_no_collective_cause: None returend: "); break; - } - - - switch(global_no_collective_cause){ - case H5D_MPIO_COLLECTIVE: - io_log("global_no_collective_cause: H5D_MPIO_COLLECTIVE: "); - break; - case H5D_MPIO_SET_INDEPENDENT: - io_log("global_no_collective_cause: H5D_MPIO_SET_INDEPENDENT: "); - break; - case H5D_MPIO_DATA_TRANSFORMS: - io_log("global_no_collective_cause: H5D_MPIO_DATA_TRANSFORMS: "); - break; - //case H5D_MPIO_SET_MPIPOSIX: - // io_log("global_no_collective_cause: H5D_MPIO_SET_MPIPOSIX: "); - // break; - case H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: - io_log("global_no_collective_cause: 
H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: "); - break; - //case H5D_MPIO_POINT_SELECTIONS: - // io_log("global_no_collective_cause: H5D_MPIO_POINT_SELECTIONS: "); - // break; - // case H5D_MPIO_FILTERS: - // io_log("global_no_collective_cause: H5D_MPIO_FILTERS: "); - // break; - default : - io_log("global_no_collective_cause: None returend: "); - break; - } - */ +} +*/ el2 = uptime() - el2; - //io_log("TimeHDF5Write: " << el2 << " s"); + // io_log("TimeHDF5Write: " << el2 << " s"); double el3 = uptime(); @@ -762,14 +694,15 @@ class HDF5Dump : public Dump_Strategy { dims[1] = 3; if(!file_exist_flag){ hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); - hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); + hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", + H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); H5Sclose(va_geo_dataspace_id); H5Aclose(va_geo_attribute_id); } */ free(temp_buf); - //if(!file_exist_flag) + // if(!file_exist_flag) H5Sclose(filespace); H5Sclose(memspace); H5Pclose(plist_id); @@ -777,399 +710,405 @@ class HDF5Dump : public Dump_Strategy { H5Fclose(file_id); el3 = uptime() - el3; - //io_log("TimeHDF5Close: " << el3 << " s"); - - if (mpi_rank == 0) - { - char const *output_xml_file = "./field_hdf5/hdf5_field.xdmf"; - char dimensions_3d[128]; - sprintf(dimensions_3d, "%lld %lld %lld", field_global_size[0], field_global_size[1], field_global_size[2]); - char dimensions_4d[128]; - sprintf(dimensions_4d, "%lld %lld %lld %d", field_global_size[0], field_global_size[1], field_global_size[2], 3); - char orignal[128]; - sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); - char dxdydz[128]; - sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); - - - // TODO: remove or let the user set - int field_interval = 1; - - // TODO: remove this dependence on number of 
steps - //std::cout << "num_step " << num_step << std::endl; - - int nframes = num_step / field_interval + 1; - static int field_tframe = 0; + // io_log("TimeHDF5Close: " << el3 << " s"); + + if (mpi_rank == 0) { + char const *output_xml_file = "./field_hdf5/hdf5_field.xdmf"; + char dimensions_3d[128]; + sprintf(dimensions_3d, "%lld %lld %lld", field_global_size[0], + field_global_size[1], field_global_size[2]); + char dimensions_4d[128]; + sprintf(dimensions_4d, "%lld %lld %lld %d", field_global_size[0], + field_global_size[1], field_global_size[2], 3); + char orignal[128]; + sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); + char dxdydz[128]; + sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); + + // TODO: remove or let the user set + int field_interval = 1; + + // TODO: remove this dependence on number of steps + // std::cout << "num_step " << num_step << std::endl; + + int nframes = num_step / field_interval + 1; + static int field_tframe = 0; #ifdef DUMP_INFO_DEBUG - printf(" meta file : %s \n", output_xml_file); - printf(" array dims per var: %s \n", dimensions_3d); - printf("array dims all vars: %s \n", dimensions_4d); - printf(" orignal: %s \n", orignal); - printf(" dxdydz: %s \n", dxdydz); - printf(" nframes: %d \n", nframes); - printf(" field_interval: %d \n", field_interval); - printf(" current step: %zd \n", step_for_viou); - printf(" current step: %zd \n", step_for_viou); - - //printf(" Simulation time: %f \n", grid->t0); - printf(" tframe: %d \n", field_tframe); + printf(" meta file : %s \n", output_xml_file); + printf(" array dims per var: %s \n", dimensions_3d); + printf("array dims all vars: %s \n", dimensions_4d); + printf(" orignal: %s \n", orignal); + printf(" dxdydz: %s \n", dxdydz); + printf(" nframes: %d \n", nframes); + printf(" field_interval: %d \n", field_interval); + printf(" current step: %zd \n", step_for_viou); + printf(" current step: %zd \n", step_for_viou); + + // printf(" Simulation time: %f \n", grid->t0); + 
printf(" tframe: %d \n", field_tframe); #endif - // TODO: this footer dumping is more likely better done in a - // destructor, rather than hoping a multiple division works out - if (field_tframe >= 1) - { - if (field_tframe == (nframes - 1)) - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 0); - } + // TODO: this footer dumping is more likely better done in a + // destructor, rather than hoping a multiple division works out + if (field_tframe >= 1) { + if (field_tframe == (nframes - 1)) { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, + dimensions_4d, dimensions_3d, 1); + } else { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, + dimensions_4d, dimensions_3d, 0); } - else - { - create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, field_interval); - if (field_tframe == (nframes - 1)) - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 0); - } + } else { + create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, + nframes, field_interval); + if (field_tframe == (nframes - 1)) { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, + dimensions_4d, dimensions_3d, 1); + } else { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, + dimensions_4d, dimensions_3d, 0); } - field_tframe++; - } + } + field_tframe++; } - /** - * @brief dump_particles to the HDF5 file - * Author: Bin Dong dbin@lbl.gov - * https://crd.lbl.gov/bin-dong - * Nov 2020 - * @param fbase - * @param sp - * @param grid - * @param step - * @param interpolator_array - * @param ftag - */ - void dump_particles( - const char *fbase, - species_t* sp, - grid_t* grid, - int step, - 
interpolator_array_t* interpolator_array, - int ftag - ) - { - static int file_index = 0; - file_index ++; - int mpi_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - double dump_particles_uptime = uptime(); - time_t seconds = time(NULL); - // printf("Atrank = %d, file_index = %d, dump_particles_uptime = %f, epoch_seconds = %ld \n ", mpi_rank, file_index, dump_particles_uptime, seconds); - - - size_t step_for_viou = step; - char fname[256]; - char group_name[256]; - char particle_scratch[128]; - char subparticle_scratch[128]; - - int np_local; - - float *Pf; - int *Pi; - - // get the total number of particles. in this example, output only electrons - //sp = species_list; - sprintf(particle_scratch, DUMP_DIR_FORMAT, "particle_hdf5"); - FileUtils::makeDirectory(particle_scratch); - sprintf(subparticle_scratch, "%s/T.%ld/", particle_scratch, step_for_viou); - FileUtils::makeDirectory(subparticle_scratch); - - // TODO: Allow the user to set this - int stride_particle_dump = 1; - - np_local = (sp->np + stride_particle_dump - 1) / stride_particle_dump; - - // make a copy of the part of particle data to be dumped - double ec1 = uptime(); - - int sp_np = sp->np; - int sp_max_np = sp->max_np; - particle_t *ALIGNED(128) p_buf = NULL; - if (!p_buf) - MALLOC_ALIGNED(p_buf, np_local, 128); - particle_t *sp_p = sp->p; - sp->p = p_buf; - sp->np = np_local; - sp->max_np = np_local; - - for (long long iptl = 0, i = 0; iptl < sp_np; iptl += stride_particle_dump, ++i) - { - COPY(&sp->p[i], &sp_p[iptl], 1); - } + } + /** + * @brief dump_particles to the HDF5 file + * Author: Bin Dong dbin@lbl.gov + * https://crd.lbl.gov/bin-dong + * Nov 2020 + * @param fbase + * @param sp + * @param grid + * @param step + * @param interpolator_array + * @param ftag + */ + void dump_particles(const char *fbase, species_t *sp, grid_t *grid, int step, + interpolator_array_t *interpolator_array, int ftag) { + static int file_index = 0; + file_index++; + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, 
&mpi_rank); + double dump_particles_uptime = uptime(); + time_t seconds = time(NULL); + // printf("Atrank = %d, file_index = %d, dump_particles_uptime = %f, + // epoch_seconds = %ld \n ", mpi_rank, file_index, dump_particles_uptime, + // seconds); - center_p(sp, interpolator_array); + size_t step_for_viou = step; + char fname[256]; + char group_name[256]; + char particle_scratch[128]; + char subparticle_scratch[128]; + + int np_local; + + float *Pf; + int *Pi; + + // get the total number of particles. in this example, output only electrons + // sp = species_list; + sprintf(particle_scratch, DUMP_DIR_FORMAT, "particle_hdf5"); + FileUtils::makeDirectory(particle_scratch); + sprintf(subparticle_scratch, "%s/T.%ld/", particle_scratch, step_for_viou); + FileUtils::makeDirectory(subparticle_scratch); + + // TODO: Allow the user to set this + int stride_particle_dump = 1; + + np_local = (sp->np + stride_particle_dump - 1) / stride_particle_dump; + + // make a copy of the part of particle data to be dumped + double ec1 = uptime(); + + int sp_np = sp->np; + int sp_max_np = sp->max_np; + particle_t *ALIGNED(128) p_buf = NULL; + if (!p_buf) + MALLOC_ALIGNED(p_buf, np_local, 128); + particle_t *sp_p = sp->p; + sp->p = p_buf; + sp->np = np_local; + sp->max_np = np_local; + + for (long long iptl = 0, i = 0; iptl < sp_np; + iptl += stride_particle_dump, ++i) { + COPY(&sp->p[i], &sp_p[iptl], 1); + } - ec1 = uptime() - ec1; + center_p(sp, interpolator_array); + ec1 = uptime() - ec1; - //if(!mpi_rank || mpi_rank == 2047 ) - // std::cout << "on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local << std::endl; + // if(!mpi_rank || mpi_rank == 2047 ) + // std::cout << "on mpi_rank: " << mpi_rank << ", time in copying + // particle data: " << ec1 << " s" << ", np_local = " << np_local << + // std::endl; #ifndef N_FILE_N_PROCESS - int np_local_max, np_local_min; - MPI_Reduce(&np_local, &np_local_max, 1, MPI_INT, MPI_MAX, 0, 
MPI_COMM_WORLD); - MPI_Reduce(&np_local, &np_local_min, 1, MPI_INT, MPI_MIN, 0, MPI_COMM_WORLD); - //io_log("on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local << ",np_local_max = " << np_local_max << ", local_min = "<< np_local_min); + int np_local_max, np_local_min; + MPI_Reduce(&np_local, &np_local_max, 1, MPI_INT, MPI_MAX, 0, + MPI_COMM_WORLD); + MPI_Reduce(&np_local, &np_local_min, 1, MPI_INT, MPI_MIN, 0, + MPI_COMM_WORLD); + // io_log("on mpi_rank: " << mpi_rank << ", time in copying particle data: " + // << ec1 << " s" << ", np_local = " << np_local << ",np_local_max = " << + // np_local_max << ", local_min = "<< np_local_min); #endif - Pf = (float *)sp->p; - Pi = (int *)sp->p; + Pf = (float *)sp->p; + Pi = (int *)sp->p; - // open HDF5 file in "particle/T./" subdirectory - // filename: eparticle.h5p + // open HDF5 file in "particle/T./" subdirectory + // filename: eparticle.h5p #ifndef N_FILE_N_PROCESS - sprintf(fname, "%s/%s_%ld.h5", subparticle_scratch, sp->name, step_for_viou); + sprintf(fname, "%s/%s_%ld.h5", subparticle_scratch, sp->name, + step_for_viou); #else - sprintf(fname, "%s/%s_%ld_p%d.h5", subparticle_scratch, sp->name, step_for_viou, mpi_rank); + sprintf(fname, "%s/%s_%ld_p%d.h5", subparticle_scratch, sp->name, + step_for_viou, mpi_rank); #endif - sprintf(group_name, "/Timestep_%ld", step_for_viou); - double el1 = uptime(); - - + sprintf(group_name, "/Timestep_%ld", step_for_viou); + double el1 = uptime(); - long long total_particles, offset; - long long numparticles = np_local; + long long total_particles, offset; + long long numparticles = np_local; #ifndef N_FILE_N_PROCESS - MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - offset -= numparticles; + MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, + MPI_COMM_WORLD); + 
MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + offset -= numparticles; #else - total_particles = np_local; - offset = 0; + total_particles = np_local; + offset = 0; #endif - - hid_t file_plist_id = H5Pcreate(H5P_FILE_ACCESS); + hid_t file_plist_id = H5Pcreate(H5P_FILE_ACCESS); #ifndef N_FILE_N_PROCESS - H5Pset_fapl_mpio(file_plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + H5Pset_fapl_mpio(file_plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); #endif #ifdef H5_ASYNC - if(!mpi_rank) printf("Enable async on particle data"); + if (!mpi_rank) + printf("Enable async on particle data"); - assert(H5Pset_vol_async(file_plist_id)); + assert(H5Pset_vol_async(file_plist_id)); #endif - hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, file_plist_id); - //if(!mpi_rank ) - //io_log("++Particle H5Fcreate) "); - + hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, file_plist_id); + // if(!mpi_rank ) + // io_log("++Particle H5Fcreate) "); - hid_t group_id = H5Gcreate(file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - //if(!mpi_rank ) - //io_log("++Particle H5Gcreate) "); + hid_t group_id = + H5Gcreate(file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + // if(!mpi_rank ) + // io_log("++Particle H5Gcreate) "); -#ifdef HAS_PARTICLE_COMP - if(!mpi_rank) - printf("Using Partilce Compund type !\n"); - hid_t particle_comp_type_it = H5Tcreate(H5T_COMPOUND, sizeof(particle_t)); - H5Tinsert(particle_comp_type_it, "dx", HOFFSET(particle_t, dx), H5T_NATIVE_FLOAT); - H5Tinsert(particle_comp_type_it, "dy", HOFFSET(particle_t, dy), H5T_NATIVE_FLOAT); - H5Tinsert(particle_comp_type_it, "dz", HOFFSET(particle_t, dz), H5T_NATIVE_FLOAT); - - H5Tinsert(particle_comp_type_it, "i", HOFFSET(particle_t, i), H5T_NATIVE_INT); - - H5Tinsert(particle_comp_type_it, "ux", HOFFSET(particle_t, ux), H5T_NATIVE_FLOAT); - H5Tinsert(particle_comp_type_it, "uy", HOFFSET(particle_t, uy), H5T_NATIVE_FLOAT); - H5Tinsert(particle_comp_type_it, "uz", 
HOFFSET(particle_t, uz), H5T_NATIVE_FLOAT); - H5Tinsert(particle_comp_type_it, "w", HOFFSET(particle_t, w), H5T_NATIVE_FLOAT); +#ifdef HAS_PARTICLE_COMP + if (!mpi_rank) + printf("Using Partilce Compund type !\n"); + hid_t particle_comp_type_it = H5Tcreate(H5T_COMPOUND, sizeof(particle_t)); + H5Tinsert(particle_comp_type_it, "dx", HOFFSET(particle_t, dx), + H5T_NATIVE_FLOAT); + H5Tinsert(particle_comp_type_it, "dy", HOFFSET(particle_t, dy), + H5T_NATIVE_FLOAT); + H5Tinsert(particle_comp_type_it, "dz", HOFFSET(particle_t, dz), + H5T_NATIVE_FLOAT); + + H5Tinsert(particle_comp_type_it, "i", HOFFSET(particle_t, i), + H5T_NATIVE_INT); + + H5Tinsert(particle_comp_type_it, "ux", HOFFSET(particle_t, ux), + H5T_NATIVE_FLOAT); + H5Tinsert(particle_comp_type_it, "uy", HOFFSET(particle_t, uy), + H5T_NATIVE_FLOAT); + H5Tinsert(particle_comp_type_it, "uz", HOFFSET(particle_t, uz), + H5T_NATIVE_FLOAT); + H5Tinsert(particle_comp_type_it, "w", HOFFSET(particle_t, w), + H5T_NATIVE_FLOAT); #endif - hid_t filespace = H5Screate_simple(1, (hsize_t *)&total_particles, NULL); - H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *)&offset, NULL, (hsize_t *)&numparticles, NULL); + hid_t filespace = H5Screate_simple(1, (hsize_t *)&total_particles, NULL); + H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *)&offset, NULL, + (hsize_t *)&numparticles, NULL); - //if(!mpi_rank ) - //io_log("++Particle H5Sselect_hyperslab) "); + // if(!mpi_rank ) + // io_log("++Particle H5Sselect_hyperslab) "); - //plist_id = H5P_DEFAULT; - hid_t io_plist_id = H5Pcreate(H5P_DATASET_XFER); + // plist_id = H5P_DEFAULT; + hid_t io_plist_id = H5Pcreate(H5P_DATASET_XFER); #ifndef N_FILE_N_PROCESS #ifdef HAS_INDEPENDENT_IO - if(!mpi_rank) { - printf("\n ###\n VPIC Independent I/O! \n ###\n"); - } - H5Pset_dxpl_mpio(io_plist_id, H5FD_MPIO_INDEPENDENT); + if (!mpi_rank) { + printf("\n ###\n VPIC Independent I/O! 
\n ###\n"); + } + H5Pset_dxpl_mpio(io_plist_id, H5FD_MPIO_INDEPENDENT); #else - H5Pset_dxpl_mpio(io_plist_id, H5FD_MPIO_COLLECTIVE); + H5Pset_dxpl_mpio(io_plist_id, H5FD_MPIO_COLLECTIVE); #endif #endif #ifdef H5_ASYNC - H5Pset_dxpl_async(io_plist_id, true); + H5Pset_dxpl_async(io_plist_id, true); #endif - hsize_t linearspace_count_temp = numparticles; - hid_t linearspace = H5Screate_simple(1, &linearspace_count_temp, NULL); + hsize_t linearspace_count_temp = numparticles; + hid_t linearspace = H5Screate_simple(1, &linearspace_count_temp, NULL); - hsize_t memspace_count_temp; - hid_t memspace; + hsize_t memspace_count_temp; + hid_t memspace; #ifdef HAS_PARTICLE_COMP - memspace_count_temp = numparticles ; - memspace = H5Screate_simple(1, &memspace_count_temp, NULL); + memspace_count_temp = numparticles; + memspace = H5Screate_simple(1, &memspace_count_temp, NULL); #else - memspace_count_temp = numparticles * 8; - memspace = H5Screate_simple(1, &memspace_count_temp, NULL); - hsize_t memspace_start = 0, memspace_stride = 8, memspace_count = np_local; - H5Sselect_hyperslab(memspace, H5S_SELECT_SET, &memspace_start, &memspace_stride, &memspace_count, NULL); + memspace_count_temp = numparticles * 8; + memspace = H5Screate_simple(1, &memspace_count_temp, NULL); + hsize_t memspace_start = 0, memspace_stride = 8, memspace_count = np_local; + H5Sselect_hyperslab(memspace, H5S_SELECT_SET, &memspace_start, + &memspace_stride, &memspace_count, NULL); #endif - el1 = uptime() - el1; - //if(!mpi_rank || mpi_rank == 2047 ) - //io_log("Particle TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts - double el2 = uptime(); - int ierr; - -#define WRITE_H5_FILE(group_id_p, data_buf_p, type_p, dname_p){\ - hid_t dset_id = H5Dcreate(group_id_p, dname_p, type_p, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ - H5Dwrite(dset_id, type_p, memspace, filespace, io_plist_id, data_buf_p);\ - H5Dclose(dset_id);\ -} - - - //MPI_Info_set(info, "romio_cb_write", "disable"); 
-#define WRITE_MPI_FILE(dname_p, offset_p, data_buf_p, count_p, type_p){\ - MPI_File fh;\ - MPI_Status status;\ - sprintf(fname, "%s/%s_%ld_%s.h5", subparticle_scratch, sp->name, step_for_viou, dname_p);\ - if(mpi_rank == 0) printf("fname= %s \n", fname);\ - MPI_Info info;\ - MPI_Info_create(&info);\ - MPI_File_open(MPI_COMM_WORLD, fname, MPI_MODE_WRONLY | MPI_MODE_CREATE, info, &fh);\ - MPI_File_write_at(fh, offset_p, data_buf_p, count_p,type_p, &status);\ - MPI_Info_free(&info);\ - MPI_File_close(&fh);\ -} + el1 = uptime() - el1; + // if(!mpi_rank || mpi_rank == 2047 ) + // io_log("Particle TimeHDF5Open): " << el1 << " s"); //Easy to handle + // results for scripts + double el2 = uptime(); + int ierr; + +#define WRITE_H5_FILE(group_id_p, data_buf_p, type_p, dname_p) \ + { \ + hid_t dset_id = H5Dcreate(group_id_p, dname_p, type_p, filespace, \ + H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ + H5Dwrite(dset_id, type_p, memspace, filespace, io_plist_id, data_buf_p); \ + H5Dclose(dset_id); \ + } + + // MPI_Info_set(info, "romio_cb_write", "disable"); +#define WRITE_MPI_FILE(dname_p, offset_p, data_buf_p, count_p, type_p) \ + { \ + MPI_File fh; \ + MPI_Status status; \ + sprintf(fname, "%s/%s_%ld_%s.h5", subparticle_scratch, sp->name, \ + step_for_viou, dname_p); \ + if (mpi_rank == 0) \ + printf("fname= %s \n", fname); \ + MPI_Info info; \ + MPI_Info_create(&info); \ + MPI_File_open(MPI_COMM_WORLD, fname, MPI_MODE_WRONLY | MPI_MODE_CREATE, \ + info, &fh); \ + MPI_File_write_at(fh, offset_p, data_buf_p, count_p, type_p, &status); \ + MPI_Info_free(&info); \ + MPI_File_close(&fh); \ + } #ifdef HAS_PARTICLE_COMP - hid_t dset_id = H5Dcreate(group_id, "particle", particle_comp_type_it, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - H5Dwrite(dset_id, particle_comp_type_it, memspace, filespace, io_plist_id, sp->p); - H5Dclose(dset_id); + hid_t dset_id = H5Dcreate(group_id, "particle", particle_comp_type_it, + filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + 
H5Dwrite(dset_id, particle_comp_type_it, memspace, filespace, io_plist_id, + sp->p); + H5Dclose(dset_id); #else #ifdef TEST_MPIIO - //Here we don't use the stripe but just for performance test - if(!mpi_rank) printf("Test MPI-IO\n"); - WRITE_MPI_FILE("dX", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); - WRITE_MPI_FILE("dY", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); - WRITE_MPI_FILE("dZ", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); - WRITE_MPI_FILE("i", offset * sizeof(int), Pf, numparticles, MPI_INT); - WRITE_MPI_FILE("ux", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); - WRITE_MPI_FILE("uy", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); - WRITE_MPI_FILE("uz", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); - WRITE_MPI_FILE("q", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + // Here we don't use the stripe but just for performance test + if (!mpi_rank) + printf("Test MPI-IO\n"); + WRITE_MPI_FILE("dX", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("dY", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("dZ", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("i", offset * sizeof(int), Pf, numparticles, MPI_INT); + WRITE_MPI_FILE("ux", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("uy", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("uz", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("q", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); #else #ifndef N_FILE_N_PROCESS - if(!mpi_rank) printf("Test HDF5-IO Single \n"); + if (!mpi_rank) + printf("Test HDF5-IO Single \n"); #else - if(!mpi_rank) printf("Test HDF5-IO N Files N Process\n"); + if (!mpi_rank) + printf("Test HDF5-IO N Files N Process\n"); #endif - //if(!mpi_rank ) - //io_log("++Particle Starting to write ) "); - WRITE_H5_FILE(group_id, Pf, H5T_NATIVE_FLOAT, "dX") - WRITE_H5_FILE(group_id, Pf+1, 
H5T_NATIVE_FLOAT, "dY") - WRITE_H5_FILE(group_id, Pf+2, H5T_NATIVE_FLOAT, "dZ") - WRITE_H5_FILE(group_id, Pi+3, H5T_NATIVE_INT, "i") - WRITE_H5_FILE(group_id, Pf+4, H5T_NATIVE_FLOAT, "ux") - WRITE_H5_FILE(group_id, Pf+5, H5T_NATIVE_FLOAT, "uy") - WRITE_H5_FILE(group_id, Pf+6, H5T_NATIVE_FLOAT, "uz") - WRITE_H5_FILE(group_id, Pf+7, H5T_NATIVE_FLOAT, "q") + // if(!mpi_rank ) + // io_log("++Particle Starting to write ) "); + WRITE_H5_FILE(group_id, Pf, H5T_NATIVE_FLOAT, "dX") + WRITE_H5_FILE(group_id, Pf + 1, H5T_NATIVE_FLOAT, "dY") + WRITE_H5_FILE(group_id, Pf + 2, H5T_NATIVE_FLOAT, "dZ") + WRITE_H5_FILE(group_id, Pi + 3, H5T_NATIVE_INT, "i") + WRITE_H5_FILE(group_id, Pf + 4, H5T_NATIVE_FLOAT, "ux") + WRITE_H5_FILE(group_id, Pf + 5, H5T_NATIVE_FLOAT, "uy") + WRITE_H5_FILE(group_id, Pf + 6, H5T_NATIVE_FLOAT, "uz") + WRITE_H5_FILE(group_id, Pf + 7, H5T_NATIVE_FLOAT, "q") #endif #endif - el2 = uptime() - el2; - //io_log("Particle TimeHDF5Write: " << el2 << " s"); - - double el3 = uptime(); - H5Sclose(memspace); - H5Sclose(filespace); - H5Pclose(file_plist_id); - H5Pclose(io_plist_id); - H5Gclose(group_id); - + el2 = uptime() - el2; + // io_log("Particle TimeHDF5Write: " << el2 << " s"); + double el3 = uptime(); + H5Sclose(memspace); + H5Sclose(filespace); + H5Pclose(file_plist_id); + H5Pclose(io_plist_id); + H5Gclose(group_id); - H5Fclose(file_id); + H5Fclose(file_id); #ifdef H5_ASYNC - H5VLasync_finalize(); + H5VLasync_finalize(); #endif - el3 = uptime() - el3; - //io_log("Particle TimeHDF5Close: " << el3 << " s"); - - } - -/** - * @brief Dump hydro data to the HDf5 file - * Author: Bin Dong dbin@lbl.gov - * https://crd.lbl.gov/bin-dong - * Nov 2020 - * @param fbase - * @param step - * @param hydro_array - * @param sp - * @param interpolator_array - * @param grid - * @param ftag - */ -void dump_hydro( - const char *fbase, - int step, - hydro_array_t* hydro_array, - species_t* sp, - interpolator_array_t* interpolator_array, - grid_t* grid, - int ftag - ) -{ + el3 = 
uptime() - el3; + // io_log("Particle TimeHDF5Close: " << el3 << " s"); + } + + /** + * @brief Dump hydro data to the HDf5 file + * Author: Bin Dong dbin@lbl.gov + * https://crd.lbl.gov/bin-dong + * Nov 2020 + * @param fbase + * @param step + * @param hydro_array + * @param sp + * @param interpolator_array + * @param grid + * @param ftag + */ + void dump_hydro(const char *fbase, int step, hydro_array_t *hydro_array, + species_t *sp, interpolator_array_t *interpolator_array, + grid_t *grid, int ftag) { size_t step_for_viou = step; -#define DUMP_HYDRO_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ - { \ - dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ - temp_buf_index = 0; \ - for (size_t i(1); i < grid->nx + 1; i++) \ - { \ - for (size_t j(1); j < grid->ny + 1; j++) \ - { \ - for (size_t k(1); k < grid->nz + 1; k++) \ - { \ - temp_buf[temp_buf_index] = _hydro(i, j, k).ATTRIBUTE_NAME; \ - temp_buf_index = temp_buf_index + 1; \ - } \ - } \ - } \ - dataspace_id = H5Dget_space(dset_id); \ - H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ - H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ - H5Sclose(dataspace_id); \ - H5Dclose(dset_id); \ - } +#define DUMP_HYDRO_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ + { \ + dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, \ + H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ + temp_buf_index = 0; \ + for (size_t i(1); i < grid->nx + 1; i++) { \ + for (size_t j(1); j < grid->ny + 1; j++) { \ + for (size_t k(1); k < grid->nz + 1; k++) { \ + temp_buf[temp_buf_index] = _hydro(i, j, k).ATTRIBUTE_NAME; \ + temp_buf_index = temp_buf_index + 1; \ + } \ + } \ + } \ + dataspace_id = H5Dget_space(dset_id); \ + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, \ + global_count, NULL); \ + H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, \ + temp_buf); 
\ + H5Sclose(dataspace_id); \ + H5Dclose(dset_id); \ + } //#define DUMP_INFO_DEBUG 1 int mpi_size, mpi_rank; MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - if (!sp) - { - ERROR(("Invalid species")); + if (!sp) { + ERROR(("Invalid species")); } clear_hydro_array(hydro_array); @@ -1185,15 +1124,16 @@ void dump_hydro( sprintf(subhydro_scratch, "%s/T.%zu/", hydro_scratch, step_for_viou); FileUtils::makeDirectory(subhydro_scratch); - sprintf(hname, "%s/hydro_%s_%zu.h5", subhydro_scratch, sp->name, step_for_viou); + sprintf(hname, "%s/hydro_%s_%zu.h5", subhydro_scratch, sp->name, + step_for_viou); double el1 = uptime(); hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); /* - if(H5Pset_libver_bounds(plist_id, H5F_LIBVER_LATEST, H5F_LIBVER_LATEST) < 0){ - exit(-1); + if(H5Pset_libver_bounds(plist_id, H5F_LIBVER_LATEST, H5F_LIBVER_LATEST) < + 0){ exit(-1); }*/ - //if((fid = H5Fcreate(FILENAME, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id)) < 0) + // if((fid = H5Fcreate(FILENAME, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id)) < 0) // ERROR_RETURN; H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); @@ -1201,70 +1141,83 @@ void dump_hydro( H5Pclose(plist_id); sprintf(hname, "Timestep_%zu", step_for_viou); - hid_t group_id = H5Gcreate(file_id, hname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + hid_t group_id = + H5Gcreate(file_id, hname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); el1 = uptime() - el1; - //io_log("TimeHDF5Open: " << el1 << " s"); //Easy to handle results for scripts + // io_log("TimeHDF5Open: " << el1 << " s"); //Easy to handle results for + // scripts double el2 = uptime(); // Create a variable list of field values to output. 
- //size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); - //size_t *varlist = new size_t[numvars]; + // size_t numvars = std::min(global->fdParams.output_vars.bitsum(), + // total_field_variables); size_t *varlist = new size_t[numvars]; - //for (size_t i(0), c(0); i < total_field_variables; i++) + // for (size_t i(0), c(0); i < total_field_variables; i++) // if (global->fdParams.output_vars.bitset(i)) // varlist[c++] = i; - //printf("\nBEGIN_OUTPUT: numvars = %zu \n", numvars); + // printf("\nBEGIN_OUTPUT: numvars = %zu \n", numvars); - - //typedef struct hydro { + // typedef struct hydro { // float jx, jy, jz, rho; // Current and charge density => , - // float px, py, pz, ke; // Momentum and K.E. density => , - // float txx, tyy, tzz; // Stress diagonal => , i==j - // float tyz, tzx, txy; // Stress off-diagonal => , i!=j - // float _pad[2]; // 16-byte align + // float px, py, pz, ke; // Momentum and K.E. density => , float txx, tyy, tzz; // Stress diagonal => + // , i==j float tyz, tzx, txy; // Stress off-diagonal => , i!=j float _pad[2]; // 16-byte align //} hydro_t; #ifdef HAS_HYDRO_COMP - //if(!mpi_rank) - //printf("Using Field Compund type !\n"); - hid_t hydro_comp_type_it = H5Tcreate (H5T_COMPOUND, sizeof(hydro_t)); + // if(!mpi_rank) + // printf("Using Field Compund type !\n"); + hid_t hydro_comp_type_it = H5Tcreate(H5T_COMPOUND, sizeof(hydro_t)); H5Tinsert(hydro_comp_type_it, "jx", HOFFSET(hydro_t, jx), H5T_NATIVE_FLOAT); H5Tinsert(hydro_comp_type_it, "jy", HOFFSET(hydro_t, jy), H5T_NATIVE_FLOAT); H5Tinsert(hydro_comp_type_it, "jz", HOFFSET(hydro_t, jz), H5T_NATIVE_FLOAT); - H5Tinsert(hydro_comp_type_it, "rho", HOFFSET(hydro_t, rho), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "rho", HOFFSET(hydro_t, rho), + H5T_NATIVE_FLOAT); H5Tinsert(hydro_comp_type_it, "px", HOFFSET(hydro_t, px), H5T_NATIVE_FLOAT); H5Tinsert(hydro_comp_type_it, "py", HOFFSET(hydro_t, py), H5T_NATIVE_FLOAT); H5Tinsert(hydro_comp_type_it, "pz", 
HOFFSET(hydro_t, pz), H5T_NATIVE_FLOAT); H5Tinsert(hydro_comp_type_it, "ke", HOFFSET(hydro_t, ke), H5T_NATIVE_FLOAT); - H5Tinsert(hydro_comp_type_it, "txx", HOFFSET(hydro_t, txx), H5T_NATIVE_FLOAT); - H5Tinsert(hydro_comp_type_it, "tyy", HOFFSET(hydro_t, tyy), H5T_NATIVE_FLOAT); - H5Tinsert(hydro_comp_type_it, "tzz", HOFFSET(hydro_t, tzz), H5T_NATIVE_FLOAT); - - H5Tinsert(hydro_comp_type_it, "tyz", HOFFSET(hydro_t, tyz), H5T_NATIVE_FLOAT); - H5Tinsert(hydro_comp_type_it, "tzx", HOFFSET(hydro_t, tzx), H5T_NATIVE_FLOAT); - H5Tinsert(hydro_comp_type_it, "txy", HOFFSET(hydro_t, txy), H5T_NATIVE_FLOAT); - H5Tinsert(hydro_comp_type_it, "pad", HOFFSET(hydro_t, _pad), H5T_NATIVE_DOUBLE); + H5Tinsert(hydro_comp_type_it, "txx", HOFFSET(hydro_t, txx), + H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "tyy", HOFFSET(hydro_t, tyy), + H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "tzz", HOFFSET(hydro_t, tzz), + H5T_NATIVE_FLOAT); + + H5Tinsert(hydro_comp_type_it, "tyz", HOFFSET(hydro_t, tyz), + H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "tzx", HOFFSET(hydro_t, tzx), + H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "txy", HOFFSET(hydro_t, txy), + H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "pad", HOFFSET(hydro_t, _pad), + H5T_NATIVE_DOUBLE); #endif - //typedef struct hydro_array { + // typedef struct hydro_array { // hydro_t * ALIGNED(128) h; // grid_t * g; //} hydro_array_t; - float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); + float *temp_buf = + (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); hsize_t temp_buf_index; hid_t dset_id; - //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; + // char *field_var_name[] = + // {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; plist_id = H5Pcreate(H5P_DATASET_XFER); - //Comment out for test only + // 
Comment out for test only H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); - //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); + // H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, + // (hsize_t *) &numparticles, NULL); - //global->topology_x + // global->topology_x - hsize_t hydro_global_size[3], hydro_local_size[3], global_offset[3], global_count[3]; + hsize_t hydro_global_size[3], hydro_local_size[3], global_offset[3], + global_count[3]; hydro_global_size[0] = (grid->nx * grid->gpx); hydro_global_size[1] = (grid->ny * grid->gpy); hydro_global_size[2] = (grid->nz * grid->gpz); @@ -1274,7 +1227,8 @@ void dump_hydro( hydro_local_size[2] = grid->nz; int mpi_rank_x, mpi_rank_y, mpi_rank_z; - UNVOXEL(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z, grid->gpx, grid->gpy, grid->gpz); + UNVOXEL(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z, grid->gpx, grid->gpy, + grid->gpz); global_offset[0] = (grid->nx) * mpi_rank_x; global_offset[1] = (grid->ny) * mpi_rank_y; @@ -1285,10 +1239,14 @@ void dump_hydro( global_count[2] = (grid->nz); #ifdef DUMP_INFO_DEBUG - printf("global size = %llu %llu %llu \n", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); - printf("global_offset = %llu %llu %llu \n", global_offset[0], global_offset[1], global_offset[2]); - printf("global_count = %llu %llu %llu \n", global_count[0], global_count[1], global_count[2]); - printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + printf("global size = %llu %llu %llu \n", hydro_global_size[0], + hydro_global_size[1], hydro_global_size[2]); + printf("global_offset = %llu %llu %llu \n", global_offset[0], + global_offset[1], global_offset[2]); + printf("global_count = %llu %llu %llu \n", global_count[0], + global_count[1], global_count[2]); + printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, + mpi_rank_y, mpi_rank_z); fflush(stdout); 
#endif @@ -1296,31 +1254,34 @@ void dump_hydro( hid_t memspace = H5Screate_simple(3, hydro_local_size, NULL); hid_t dataspace_id; - //typedef struct hydro { + // typedef struct hydro { // float jx, jy, jz, rho; // Current and charge density => , - // float px, py, pz, ke; // Momentum and K.E. density => , - // float txx, tyy, tzz; // Stress diagonal => , i==j - // float tyz, tzx, txy; // Stress off-diagonal => , i!=j - // float _pad[2]; // 16-byte align + // float px, py, pz, ke; // Momentum and K.E. density => , float txx, tyy, tzz; // Stress diagonal => + // , i==j float tyz, tzx, txy; // Stress off-diagonal => , i!=j float _pad[2]; // 16-byte align //} hydro_t; - #ifdef HAS_HYDRO_COMP - hydro_t *hydro_buf = (hydro_t *)malloc(sizeof(hydro_t) * (grid->nx) * (grid->ny) * (grid->nz)); + hydro_t *hydro_buf = (hydro_t *)malloc(sizeof(hydro_t) * (grid->nx) * + (grid->ny) * (grid->nz)); temp_buf_index = 0; - for (size_t i(1); i < grid->nx + 1; i++){ - for (size_t j(1); j < grid->ny + 1; j++){ - for (size_t k(1); k < grid->nz + 1; k++){ - hydro_buf[temp_buf_index] = _hydro(i, j, k); - temp_buf_index = temp_buf_index + 1; - } + for (size_t i(1); i < grid->nx + 1; i++) { + for (size_t j(1); j < grid->ny + 1; j++) { + for (size_t k(1); k < grid->nz + 1; k++) { + hydro_buf[temp_buf_index] = _hydro(i, j, k); + temp_buf_index = temp_buf_index + 1; } + } } - dset_id = H5Dcreate(group_id, "hydro", hydro_comp_type_it, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + dset_id = H5Dcreate(group_id, "hydro", hydro_comp_type_it, filespace, + H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); dataspace_id = H5Dget_space(dset_id); - H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); - H5Dwrite(dset_id, hydro_comp_type_it, memspace, dataspace_id, plist_id, hydro_buf); + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, + global_count, NULL); + H5Dwrite(dset_id, hydro_comp_type_it, memspace, dataspace_id, plist_id, + hydro_buf); 
free(hydro_buf); H5Sclose(dataspace_id); H5Dclose(dset_id); @@ -1328,44 +1289,44 @@ void dump_hydro( #else if (hydro_dump_flag.jx) - DUMP_HYDRO_TO_HDF5("jx", jx, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("jx", jx, H5T_NATIVE_FLOAT); if (hydro_dump_flag.jy) - DUMP_HYDRO_TO_HDF5("jy", jy, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("jy", jy, H5T_NATIVE_FLOAT); if (hydro_dump_flag.jz) - DUMP_HYDRO_TO_HDF5("jz", jz, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("jz", jz, H5T_NATIVE_FLOAT); if (hydro_dump_flag.rho) - DUMP_HYDRO_TO_HDF5("rho", rho, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("rho", rho, H5T_NATIVE_FLOAT); if (hydro_dump_flag.px) - DUMP_HYDRO_TO_HDF5("px", px, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("px", px, H5T_NATIVE_FLOAT); if (hydro_dump_flag.py) - DUMP_HYDRO_TO_HDF5("py", py, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("py", py, H5T_NATIVE_FLOAT); if (hydro_dump_flag.pz) - DUMP_HYDRO_TO_HDF5("pz", pz, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("pz", pz, H5T_NATIVE_FLOAT); if (hydro_dump_flag.ke) - DUMP_HYDRO_TO_HDF5("ke", ke, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("ke", ke, H5T_NATIVE_FLOAT); if (hydro_dump_flag.txx) - DUMP_HYDRO_TO_HDF5("txx", txx, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("txx", txx, H5T_NATIVE_FLOAT); if (hydro_dump_flag.tyy) - DUMP_HYDRO_TO_HDF5("tyy", tyy, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("tyy", tyy, H5T_NATIVE_FLOAT); if (hydro_dump_flag.tzz) - DUMP_HYDRO_TO_HDF5("tzz", tzz, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("tzz", tzz, H5T_NATIVE_FLOAT); if (hydro_dump_flag.tyz) - DUMP_HYDRO_TO_HDF5("tyz", tyz, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("tyz", tyz, H5T_NATIVE_FLOAT); if (hydro_dump_flag.tzx) - DUMP_HYDRO_TO_HDF5("tzx", tzx, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("tzx", tzx, H5T_NATIVE_FLOAT); if (hydro_dump_flag.txy) - DUMP_HYDRO_TO_HDF5("txy", txy, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("txy", txy, H5T_NATIVE_FLOAT); el2 = uptime() - el2; - //io_log("TimeHDF5Write: " << el2 << " s"); + // io_log("TimeHDF5Write: " << el2 << " s"); #endif 
double el3 = uptime(); - //Write metadata (geo original and geo dx/dy/dz) for ArrayUDF + // Write metadata (geo original and geo dx/dy/dz) for ArrayUDF /* float attr_data[2][3]; attr_data[0][0] = grid->x0; @@ -1378,7 +1339,8 @@ void dump_hydro( dims[0] = 2; dims[1] = 3; hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); - hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); + hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", + H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); H5Sclose(va_geo_dataspace_id); H5Aclose(va_geo_attribute_id);*/ @@ -1391,720 +1353,685 @@ void dump_hydro( H5Fclose(file_id); el3 = uptime() - el3; - //io_log("TimeHDF5Close: " << el3 << " s"); - - if (mpi_rank == 0) - { - char output_xml_file[128]; - sprintf(output_xml_file, "./%s/%s%s%s", "hydro_hdf5", "hydro-", sp->name, ".xdmf"); - char dimensions_3d[128]; - sprintf(dimensions_3d, "%lld %lld %lld", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); - char dimensions_4d[128]; - sprintf(dimensions_4d, "%lld %lld %lld %d", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2], 3); - char orignal[128]; - sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); - char dxdydz[128]; - sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); - - // TODO: remove or let user set - int hydro_interval = 1; - - // TODO: remove this dependence on number of steps - int nframes = num_step / hydro_interval + 1; - - const int tframe = tframe_map[sp->id]; + // io_log("TimeHDF5Close: " << el3 << " s"); + + if (mpi_rank == 0) { + char output_xml_file[128]; + sprintf(output_xml_file, "./%s/%s%s%s", "hydro_hdf5", "hydro-", sp->name, + ".xdmf"); + char dimensions_3d[128]; + sprintf(dimensions_3d, "%lld %lld %lld", hydro_global_size[0], + hydro_global_size[1], hydro_global_size[2]); + char 
dimensions_4d[128]; + sprintf(dimensions_4d, "%lld %lld %lld %d", hydro_global_size[0], + hydro_global_size[1], hydro_global_size[2], 3); + char orignal[128]; + sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); + char dxdydz[128]; + sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); + + // TODO: remove or let user set + int hydro_interval = 1; + + // TODO: remove this dependence on number of steps + int nframes = num_step / hydro_interval + 1; + + const int tframe = tframe_map[sp->id]; #ifdef DUMP_INFO_DEBUG - printf(" meta file : %s \n", output_xml_file); - printf(" array dims per var: %s \n", dimensions_3d); - printf("array dims all vars: %s \n", dimensions_4d); - printf(" orignal: %s \n", orignal); - printf(" dxdydz: %s \n", dxdydz); - printf(" nframes: %d \n", nframes); - printf(" hydro_fields_interval: %d \n", hydro_interval); - printf(" current step: %zu \n", step_for_viou); - printf(" Simulation time: %f \n", grid->t0); - printf(" tframe: %d \n", tframe); + printf(" meta file : %s \n", output_xml_file); + printf(" array dims per var: %s \n", dimensions_3d); + printf("array dims all vars: %s \n", dimensions_4d); + printf(" orignal: %s \n", orignal); + printf(" dxdydz: %s \n", dxdydz); + printf(" nframes: %d \n", nframes); + printf(" hydro_fields_interval: %d \n", hydro_interval); + printf(" current step: %zu \n", step_for_viou); + printf(" Simulation time: %f \n", grid->t0); + printf(" tframe: %d \n", tframe); #endif - // TODO: why doesnt this just use the cstr? - char speciesname_new[128]; - sprintf(speciesname_new, "hydro_%s", sp->name); - if (tframe >= 1) - { - if (tframe == (nframes - 1)) - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); - } + // TODO: why doesnt this just use the cstr? 
+ char speciesname_new[128]; + sprintf(speciesname_new, "hydro_%s", sp->name); + if (tframe >= 1) { + if (tframe == (nframes - 1)) { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, + dimensions_4d, dimensions_3d, 1); + } else { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, + dimensions_4d, dimensions_3d, 0); } - else - { - create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, hydro_interval); - if (tframe == (nframes - 1)) - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); - } + } else { + create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, + nframes, hydro_interval); + if (tframe == (nframes - 1)) { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, + dimensions_4d, dimensions_3d, 1); + } else { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, + dimensions_4d, dimensions_3d, 0); } - tframe_map[sp->id]++; + } + tframe_map[sp->id]++; } -} + } }; #endif #ifdef VPIC_ENABLE_OPENPMD class OpenPMDDump : public Dump_Strategy { - public: - //openPMD::Series* series; - using Dump_Strategy::Dump_Strategy; // inherit constructor - - //std::string file_type = ".h5"; - std::string file_type = ".bp"; - - void dump_fields( - const char *fbase, - int step, - grid_t* grid, - field_array_t* field_array, - int ftag - ) - { - std::cout << "Writing openPMD data" << std::endl; - - std::string full_file_name = fbase + file_type; - - //if (series == nullptr) { - std::cout << "init series" << std::endl; - openPMD::Series series = openPMD::Series( - full_file_name, - openPMD::AccessType::CREATE, - MPI_COMM_WORLD - ); - //} - - std::cout << "Writing iteration " << step << std::endl; - auto i = series.iterations[ step ]; - // TODO: it would be nice to set these... 
- //series.setAuthor( "Axel Huebl "); - //series.setMachine( "Hall Probe 5000, Model 3"); - i.setAttribute( "vacuum", true); - - auto cB = i.meshes["B"]; - auto E = i.meshes["E"]; - auto J = i.meshes["J"]; - auto Tca = i.meshes["Tca"]; - auto Emat = i.meshes["Emat"]; - auto Fmat = i.meshes["Fmat"]; - auto Rho = i.meshes["Rho"]; - auto DivErr = i.meshes["DivErr"]; - - // record components - auto Cbx = cB["x"]; - auto Cby = cB["y"]; - auto Cbz = cB["z"]; - - auto Ex = E["x"]; - auto Ey = E["y"]; - auto Ez = E["z"]; - - auto Jx = J["x"]; - auto Jy = J["y"]; - auto Jz = J["z"]; - - auto Tcax = Tca["x"]; - auto Tcay = Tca["y"]; - auto Tcaz = Tca["z"]; - - auto Ematx = Emat["x"]; - auto Ematy = Emat["y"]; - auto Ematz = Emat["z"]; - - auto Fmatx = Fmat["x"]; - auto Fmaty = Fmat["y"]; - auto Fmatz = Fmat["z"]; - - auto RhoB = Rho["B"]; - auto RhoF = Rho["F"]; - - auto DivEErr = DivErr["E"]; - auto DivBErr = DivErr["B"]; - - // TODO: set unitDimension so the anaylsis software knows what fields - // things are - // - // // TODO: add timers for the convert and for the write - - size_t gnx = (grid->nx * grid->gpx); - size_t gny = (grid->ny * grid->gpy); - size_t gnz = (grid->nz * grid->gpz); - openPMD::Extent global_extent = {gnx, gny, gnz}; - - openPMD::Datatype datatype = openPMD::determineDatatype(); - openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); - - Cbx.resetDataset(dataset); - Cby.resetDataset(dataset); - Cbz.resetDataset(dataset); - - Ex.resetDataset(dataset); - Ey.resetDataset(dataset); - Ez.resetDataset(dataset); - - Jx.resetDataset(dataset); - Jy.resetDataset(dataset); - Jz.resetDataset(dataset); - - Tcax.resetDataset(dataset); - Tcay.resetDataset(dataset); - Tcaz.resetDataset(dataset); - - Ematx.resetDataset(dataset); - Ematy.resetDataset(dataset); - Ematz.resetDataset(dataset); - - Fmatx.resetDataset(dataset); - Fmaty.resetDataset(dataset); - Fmatz.resetDataset(dataset); - - RhoB.resetDataset(dataset); - RhoF.resetDataset(dataset); - - 
DivEErr.resetDataset(dataset); - DivBErr.resetDataset(dataset); - - // TODO: hoist this conversion code, as is it used elsewhere - // Convert rank to local x/y/z - int rx, ry, rz; - UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); - - size_t nx = grid->nx; - size_t ny = grid->ny; - size_t nz = grid->nz; - - // NOTE: this assumes a static mesh decomposition in nx/ny/nz - size_t global_offset_x = (nx) * rx; - size_t global_offset_y = (ny) * ry; - size_t global_offset_z = (nz) * rz; - - openPMD::Offset chunk_offset = {global_offset_x, global_offset_y, global_offset_z}; - openPMD::Extent chunk_extent = {nx, ny, nz}; - - std::cout << "Local offset " << - " x: " << global_offset_x << - " y: " << global_offset_y << - " z: " << global_offset_z << - std::endl; - - // Store a local copy of the data which we pull out of the AoS - std::vector cbx_data; - std::vector cby_data; - std::vector cbz_data; - - std::vector ex_data; - std::vector ey_data; - std::vector ez_data; - - std::vector jx_data; - std::vector jy_data; - std::vector jz_data; - - std::vector tcax_data; - std::vector tcay_data; - std::vector tcaz_data; - - // TODO: these are material_id (ints not floats) - std::vector ematx_data; - std::vector ematy_data; - std::vector ematz_data; - - std::vector fmatx_data; - std::vector fmaty_data; - std::vector fmatz_data; - // end todo - - std::vector rhob_data; - std::vector rhof_data; - - std::vector divb_data; - std::vector dive_data; - - size_t nv = nx * ny * nz; - - // TODO: resize here will zero out the data which we don't need, we - // could change to a different semantic to avoid this - cbx_data.resize(nv); - cby_data.resize(nv); - cbz_data.resize(nv); - - ex_data.resize(nv); - ey_data.resize(nv); - ez_data.resize(nv); - - jx_data.resize(nv); - jy_data.resize(nv); - jz_data.resize(nv); - - tcax_data.resize(nv); - tcay_data.resize(nv); - tcaz_data.resize(nv); - - ematx_data.resize(nv); - ematy_data.resize(nv); - ematz_data.resize(nv); - - 
fmatx_data.resize(nv); - fmaty_data.resize(nv); - fmatz_data.resize(nv); - - rhob_data.resize(nv); - rhof_data.resize(nv); - - divb_data.resize(nv); - dive_data.resize(nv); +public: + // openPMD::Series* series; + using Dump_Strategy::Dump_Strategy; // inherit constructor + + // std::string file_type = ".h5"; + std::string file_type = ".bp"; + + void dump_fields(const char *fbase, int step, grid_t *grid, + field_array_t *field_array, int ftag) { + std::cout << "Writing openPMD data" << std::endl; + + std::string full_file_name = fbase + file_type; + + // if (series == nullptr) { + std::cout << "init series" << std::endl; + openPMD::Series series = openPMD::Series( + full_file_name, openPMD::AccessType::CREATE, MPI_COMM_WORLD); + //} + + std::cout << "Writing iteration " << step << std::endl; + auto i = series.iterations[step]; + // TODO: it would be nice to set these... + // series.setAuthor( "Axel Huebl "); + // series.setMachine( "Hall Probe 5000, Model 3"); + i.setAttribute("vacuum", true); + + auto cB = i.meshes["B"]; + auto E = i.meshes["E"]; + auto J = i.meshes["J"]; + auto Tca = i.meshes["Tca"]; + auto Emat = i.meshes["Emat"]; + auto Fmat = i.meshes["Fmat"]; + auto Rho = i.meshes["Rho"]; + auto DivErr = i.meshes["DivErr"]; + + // record components + auto Cbx = cB["x"]; + auto Cby = cB["y"]; + auto Cbz = cB["z"]; + + auto Ex = E["x"]; + auto Ey = E["y"]; + auto Ez = E["z"]; + + auto Jx = J["x"]; + auto Jy = J["y"]; + auto Jz = J["z"]; + + auto Tcax = Tca["x"]; + auto Tcay = Tca["y"]; + auto Tcaz = Tca["z"]; + + auto Ematx = Emat["x"]; + auto Ematy = Emat["y"]; + auto Ematz = Emat["z"]; + + auto Fmatx = Fmat["x"]; + auto Fmaty = Fmat["y"]; + auto Fmatz = Fmat["z"]; + + auto RhoB = Rho["B"]; + auto RhoF = Rho["F"]; + + auto DivEErr = DivErr["E"]; + auto DivBErr = DivErr["B"]; + + // TODO: set unitDimension so the anaylsis software knows what fields + // things are + // + // // TODO: add timers for the convert and for the write + + size_t gnx = (grid->nx * 
grid->gpx); + size_t gny = (grid->ny * grid->gpy); + size_t gnz = (grid->nz * grid->gpz); + openPMD::Extent global_extent = {gnx, gny, gnz}; + + openPMD::Datatype datatype = openPMD::determineDatatype(); + openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); + + Cbx.resetDataset(dataset); + Cby.resetDataset(dataset); + Cbz.resetDataset(dataset); + + Ex.resetDataset(dataset); + Ey.resetDataset(dataset); + Ez.resetDataset(dataset); + + Jx.resetDataset(dataset); + Jy.resetDataset(dataset); + Jz.resetDataset(dataset); + + Tcax.resetDataset(dataset); + Tcay.resetDataset(dataset); + Tcaz.resetDataset(dataset); + + Ematx.resetDataset(dataset); + Ematy.resetDataset(dataset); + Ematz.resetDataset(dataset); + + Fmatx.resetDataset(dataset); + Fmaty.resetDataset(dataset); + Fmatz.resetDataset(dataset); + + RhoB.resetDataset(dataset); + RhoF.resetDataset(dataset); + + DivEErr.resetDataset(dataset); + DivBErr.resetDataset(dataset); + + // TODO: hoist this conversion code, as is it used elsewhere + // Convert rank to local x/y/z + int rx, ry, rz; + UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + size_t nx = grid->nx; + size_t ny = grid->ny; + size_t nz = grid->nz; + + // NOTE: this assumes a static mesh decomposition in nx/ny/nz + size_t global_offset_x = (nx)*rx; + size_t global_offset_y = (ny)*ry; + size_t global_offset_z = (nz)*rz; + + openPMD::Offset chunk_offset = {global_offset_x, global_offset_y, + global_offset_z}; + openPMD::Extent chunk_extent = {nx, ny, nz}; + + std::cout << "Local offset " + << " x: " << global_offset_x << " y: " << global_offset_y + << " z: " << global_offset_z << std::endl; + + // Store a local copy of the data which we pull out of the AoS + std::vector cbx_data; + std::vector cby_data; + std::vector cbz_data; + + std::vector ex_data; + std::vector ey_data; + std::vector ez_data; + + std::vector jx_data; + std::vector jy_data; + std::vector jz_data; + + std::vector tcax_data; + std::vector tcay_data; + std::vector 
tcaz_data; + + // TODO: these are material_id (ints not floats) + std::vector ematx_data; + std::vector ematy_data; + std::vector ematz_data; - // TODO: make this AoS to SoA conversion a function - - // We could do 1D here, but we don't really care about the ghosts, and we - // can thread over nz/ny (collapsed?) - // Go over non-ghosts and grab just that data into a dense array - for (size_t k = 1; k < grid->nz + 1; k++) - { - for (size_t j = 1; j < grid->ny + 1; j++) - { - for (size_t i = 1; i < grid->nx + 1; i++) - { - int local_index = VOXEL(i-1, j-1, k-1, grid->nx-2, grid->ny-2, grid->nz-2); - int global_index = VOXEL(i, j, k, grid->nx, grid->ny, grid->nz); - - cbx_data[local_index] = field_array->f[global_index].cbx; - cby_data[local_index] = field_array->f[global_index].cby; - cbz_data[local_index] = field_array->f[global_index].cbz; - - ex_data[local_index] = field_array->f[global_index].ex; - ey_data[local_index] = field_array->f[global_index].ey; - ez_data[local_index] = field_array->f[global_index].ez; - - jx_data[local_index] = field_array->f[global_index].jfx; - jy_data[local_index] = field_array->f[global_index].jfy; - jz_data[local_index] = field_array->f[global_index].jfz; - - tcax_data[local_index] = field_array->f[global_index].tcax; - tcay_data[local_index] = field_array->f[global_index].tcay; - tcaz_data[local_index] = field_array->f[global_index].tcaz; - - ematx_data[local_index] = field_array->f[global_index].ematx; - ematy_data[local_index] = field_array->f[global_index].ematy; - ematz_data[local_index] = field_array->f[global_index].ematz; - - fmatx_data[local_index] = field_array->f[global_index].fmatx; - fmaty_data[local_index] = field_array->f[global_index].fmaty; - fmatz_data[local_index] = field_array->f[global_index].fmatz; + std::vector fmatx_data; + std::vector fmaty_data; + std::vector fmatz_data; + // end todo - rhob_data[local_index] = field_array->f[global_index].rhob; - rhof_data[local_index] = field_array->f[global_index].rhof; 
+ std::vector rhob_data; + std::vector rhof_data; - dive_data[local_index] = field_array->f[global_index].div_e_err; - divb_data[local_index] = field_array->f[global_index].div_b_err; - } - } - } - - Cbx.storeChunk( cbx_data, chunk_offset, chunk_extent); - Cby.storeChunk( cby_data, chunk_offset, chunk_extent); - Cbz.storeChunk( cbz_data, chunk_offset, chunk_extent); - - Ex.storeChunk( ex_data, chunk_offset, chunk_extent); - Ey.storeChunk( ey_data, chunk_offset, chunk_extent); - Ez.storeChunk( ez_data, chunk_offset, chunk_extent); + std::vector divb_data; + std::vector dive_data; + + size_t nv = nx * ny * nz; - Jx.storeChunk( jx_data, chunk_offset, chunk_extent); - Jy.storeChunk( jy_data, chunk_offset, chunk_extent); - Jz.storeChunk( jz_data, chunk_offset, chunk_extent); + // TODO: resize here will zero out the data which we don't need, we + // could change to a different semantic to avoid this + cbx_data.resize(nv); + cby_data.resize(nv); + cbz_data.resize(nv); + + ex_data.resize(nv); + ey_data.resize(nv); + ez_data.resize(nv); - Tcax.storeChunk( tcax_data, chunk_offset, chunk_extent); - Tcay.storeChunk( tcay_data, chunk_offset, chunk_extent); - Tcaz.storeChunk( tcaz_data, chunk_offset, chunk_extent); + jx_data.resize(nv); + jy_data.resize(nv); + jz_data.resize(nv); + + tcax_data.resize(nv); + tcay_data.resize(nv); + tcaz_data.resize(nv); + + ematx_data.resize(nv); + ematy_data.resize(nv); + ematz_data.resize(nv); + + fmatx_data.resize(nv); + fmaty_data.resize(nv); + fmatz_data.resize(nv); + + rhob_data.resize(nv); + rhof_data.resize(nv); - Ematx.storeChunk( ematx_data, chunk_offset, chunk_extent); - Ematy.storeChunk( ematy_data, chunk_offset, chunk_extent); - Ematz.storeChunk( ematz_data, chunk_offset, chunk_extent); - - Fmatx.storeChunk( fmatx_data, chunk_offset, chunk_extent); - Fmaty.storeChunk( fmaty_data, chunk_offset, chunk_extent); - Fmatz.storeChunk( fmatz_data, chunk_offset, chunk_extent); - - RhoB.storeChunk( rhob_data, chunk_offset, chunk_extent); - 
RhoF.storeChunk( rhof_data, chunk_offset, chunk_extent); + divb_data.resize(nv); + dive_data.resize(nv); - DivEErr.storeChunk( dive_data, chunk_offset, chunk_extent); - DivBErr.storeChunk( divb_data, chunk_offset, chunk_extent); + // TODO: make this AoS to SoA conversion a function + + // We could do 1D here, but we don't really care about the ghosts, and we + // can thread over nz/ny (collapsed?) + // Go over non-ghosts and grab just that data into a dense array + for (size_t k = 1; k < grid->nz + 1; k++) { + for (size_t j = 1; j < grid->ny + 1; j++) { + for (size_t i = 1; i < grid->nx + 1; i++) { + int local_index = VOXEL(i - 1, j - 1, k - 1, grid->nx - 2, + grid->ny - 2, grid->nz - 2); + int global_index = VOXEL(i, j, k, grid->nx, grid->ny, grid->nz); + + cbx_data[local_index] = field_array->f[global_index].cbx; + cby_data[local_index] = field_array->f[global_index].cby; + cbz_data[local_index] = field_array->f[global_index].cbz; - series.flush(); + ex_data[local_index] = field_array->f[global_index].ex; + ey_data[local_index] = field_array->f[global_index].ey; + ez_data[local_index] = field_array->f[global_index].ez; + + jx_data[local_index] = field_array->f[global_index].jfx; + jy_data[local_index] = field_array->f[global_index].jfy; + jz_data[local_index] = field_array->f[global_index].jfz; + + tcax_data[local_index] = field_array->f[global_index].tcax; + tcay_data[local_index] = field_array->f[global_index].tcay; + tcaz_data[local_index] = field_array->f[global_index].tcaz; + + ematx_data[local_index] = field_array->f[global_index].ematx; + ematy_data[local_index] = field_array->f[global_index].ematy; + ematz_data[local_index] = field_array->f[global_index].ematz; + + fmatx_data[local_index] = field_array->f[global_index].fmatx; + fmaty_data[local_index] = field_array->f[global_index].fmaty; + fmatz_data[local_index] = field_array->f[global_index].fmatz; + + rhob_data[local_index] = field_array->f[global_index].rhob; + rhof_data[local_index] = 
field_array->f[global_index].rhof; + + dive_data[local_index] = field_array->f[global_index].div_e_err; + divb_data[local_index] = field_array->f[global_index].div_b_err; } + } + } + + Cbx.storeChunk(cbx_data, chunk_offset, chunk_extent); + Cby.storeChunk(cby_data, chunk_offset, chunk_extent); + Cbz.storeChunk(cbz_data, chunk_offset, chunk_extent); + + Ex.storeChunk(ex_data, chunk_offset, chunk_extent); + Ey.storeChunk(ey_data, chunk_offset, chunk_extent); + Ez.storeChunk(ez_data, chunk_offset, chunk_extent); + + Jx.storeChunk(jx_data, chunk_offset, chunk_extent); + Jy.storeChunk(jy_data, chunk_offset, chunk_extent); + Jz.storeChunk(jz_data, chunk_offset, chunk_extent); + + Tcax.storeChunk(tcax_data, chunk_offset, chunk_extent); + Tcay.storeChunk(tcay_data, chunk_offset, chunk_extent); + Tcaz.storeChunk(tcaz_data, chunk_offset, chunk_extent); + + Ematx.storeChunk(ematx_data, chunk_offset, chunk_extent); + Ematy.storeChunk(ematy_data, chunk_offset, chunk_extent); + Ematz.storeChunk(ematz_data, chunk_offset, chunk_extent); - void dump_particles( - const char *fbase, - species_t* sp, - grid_t* grid, - int step, - interpolator_array_t* interpolator_array, - int ftag - ) - { - std::string full_file_name = fbase + file_type; + Fmatx.storeChunk(fmatx_data, chunk_offset, chunk_extent); + Fmaty.storeChunk(fmaty_data, chunk_offset, chunk_extent); + Fmatz.storeChunk(fmatz_data, chunk_offset, chunk_extent); - std::cout << "writing particles to " << full_file_name << std::endl; + RhoB.storeChunk(rhob_data, chunk_offset, chunk_extent); + RhoF.storeChunk(rhof_data, chunk_offset, chunk_extent); - //if (series == nullptr) { - openPMD::Series series = openPMD::Series( - full_file_name, - openPMD::AccessType::CREATE, - MPI_COMM_WORLD - ); - //} + DivEErr.storeChunk(dive_data, chunk_offset, chunk_extent); + DivBErr.storeChunk(divb_data, chunk_offset, chunk_extent); - auto i = series.iterations[ step ]; + series.flush(); + } - // TODO: set these - i.setTime( (float)step ); - 
i.setDt(1.0); - i.setTimeUnitSI(1.0); + void dump_particles(const char *fbase, species_t *sp, grid_t *grid, int step, + interpolator_array_t *interpolator_array, int ftag) { + std::string full_file_name = fbase + file_type; - auto& p = i.particles[sp->name]; + std::cout << "writing particles to " << full_file_name << std::endl; - const int np = sp->np; + // if (series == nullptr) { + openPMD::Series series = openPMD::Series( + full_file_name, openPMD::AccessType::CREATE, MPI_COMM_WORLD); + //} - // TODO: this could be a function call as it's used elsewhere (in hdf5) - unsigned long long total_particles, offset; - unsigned long long numparticles = np; - MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - offset -= numparticles; + auto i = series.iterations[step]; - openPMD::Extent global_extent = {total_particles}; - openPMD::Datatype datatype = openPMD::determineDatatype(); - openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); + // TODO: set these + i.setTime((float)step); + i.setDt(1.0); + i.setTimeUnitSI(1.0); - auto px = p["position"]["x"]; - auto pxo = p["positionOffset"]["x"]; + auto &p = i.particles[sp->name]; - auto py = p["position"]["y"]; - auto pyo = p["positionOffset"]["y"]; + const int np = sp->np; - auto pz = p["position"]["z"]; - auto pzo = p["positionOffset"]["z"]; + // TODO: this could be a function call as it's used elsewhere (in hdf5) + unsigned long long total_particles, offset; + unsigned long long numparticles = np; + MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, + MPI_COMM_WORLD); + MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + offset -= numparticles; - auto ux = p["velocity"]["x"]; - auto uy = p["velocity"]["y"]; - auto uz = p["velocity"]["z"]; + openPMD::Extent global_extent = {total_particles}; + openPMD::Datatype datatype = 
openPMD::determineDatatype(); + openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); - px.resetDataset(dataset); - pxo.resetDataset(dataset); + auto px = p["position"]["x"]; + auto pxo = p["positionOffset"]["x"]; - py.resetDataset(dataset); - pyo.resetDataset(dataset); + auto py = p["position"]["y"]; + auto pyo = p["positionOffset"]["y"]; - pz.resetDataset(dataset); - pzo.resetDataset(dataset); + auto pz = p["position"]["z"]; + auto pzo = p["positionOffset"]["z"]; - ux.resetDataset(dataset); - uy.resetDataset(dataset); - uz.resetDataset(dataset); - // convert data to SoA, allowing the user to chunk the operation + auto ux = p["velocity"]["x"]; + auto uy = p["velocity"]["y"]; + auto uz = p["velocity"]["z"]; - // TODO: Add code the convert to global offsets -#ifndef PMD_MAX_IO_CHUNK // in particles + px.resetDataset(dataset); + pxo.resetDataset(dataset); + + py.resetDataset(dataset); + pyo.resetDataset(dataset); + + pz.resetDataset(dataset); + pzo.resetDataset(dataset); + + ux.resetDataset(dataset); + uy.resetDataset(dataset); + uz.resetDataset(dataset); + // convert data to SoA, allowing the user to chunk the operation + + // TODO: Add code the convert to global offsets +#ifndef PMD_MAX_IO_CHUNK // in particles #define PMD_MAX_IO_CHUNK 16777216; // 512MB total write #endif - const int max_chunk = PMD_MAX_IO_CHUNK; - - // Loop over all particles in chunks - for (int i = 0; i < np; i += max_chunk) - { - // We have to be careful as the last chunk may not be full - // Find how many are left and do that many - size_t to_write = std::min(np-i, max_chunk); - - // Convert the chunk ready to write - std::vector x_pos; - std::vector x_off; - x_pos.resize(to_write); - x_off.resize(to_write); - - std::vector y_pos; - std::vector y_off; - y_pos.resize(to_write); - y_off.resize(to_write); - - std::vector z_pos; - std::vector z_off; - z_pos.resize(to_write); - z_off.resize(to_write); - - std::vector ux_pos; - ux_pos.resize(to_write); - - std::vector uy_pos; - 
uy_pos.resize(to_write); - - std::vector uz_pos; - uz_pos.resize(to_write); - - for (int j = 0; j < to_write; j++) - { - // TODO: do I need to center the particles? - auto& particle = sp->p[i+j]; - - x_pos[j] = particle.dx; - y_pos[j] = particle.dy; - z_pos[j] = particle.dz; - - ux_pos[j] = particle.ux; - uy_pos[j] = particle.uy; - uz_pos[j] = particle.uz; - - std::array gi = global_particle_index(particle.i, grid, rank); - x_off[j] = (float)gi[1]; - y_off[j] = (float)gi[2]; - z_off[j] = (float)gi[3]; - } - - // Base offset plus i to account for chunks - auto o = openPMD::Offset{offset + i}; - auto e = openPMD::Extent{to_write}; - px.storeChunk(x_pos, o, e); - pxo.storeChunk(x_off, o, e); - - py.storeChunk(y_pos, o, e); - pyo.storeChunk(y_off, o, e); - - pz.storeChunk(z_pos, o, e); - pzo.storeChunk(z_off, o, e); - - ux.storeChunk(ux_pos, o, e); - uy.storeChunk(uy_pos, o, e); - uz.storeChunk(uz_pos, o, e); - - series.flush(); - } - } - void dump_hydro( - const char *fbase, - int step, - hydro_array_t* hydro_array, - species_t* sp, - interpolator_array_t* interpolator_array, - grid_t* grid, - int ftag - ) - { - std::string full_file_name = fbase + file_type; - - std::cout << "OpenPMD dumping hydro to " << full_file_name << std::endl; - - //if (series == nullptr) { - openPMD::Series series = openPMD::Series( - full_file_name, - openPMD::AccessType::CREATE, - MPI_COMM_WORLD - ); - //} - - auto i = series.iterations[ step ]; - - // TODO: set these - i.setTime( (float)step ); - i.setDt(1.0); - i.setTimeUnitSI(1.0); - - if( !sp ) ERROR(( "Invalid species \"%s\"", sp->name )); - - // TODO: do we want each backend to have to explicitly call these - // manually? 
Or, as it is common, should we hoist it to the VPIC - // call-site - clear_hydro_array( hydro_array ); - accumulate_hydro_p( hydro_array, sp, interpolator_array ); - synchronize_hydro_array( hydro_array ); - - if( !fbase ) ERROR(( "Invalid filename" )); - - if( rank==0 ) - MESSAGE(("Dumping \"%s\" hydro fields to \"%s\"",sp->name,fbase)); - - // Write data - //float jx, jy, jz, rho; // Current and charge density => , - //float px, py, pz, ke; // Momentum and K.E. density => , - //float txx, tyy, tzz; // Stress diagonal => , i==j - //float tyz, tzx, txy; // Stress off-diagonal => , i!=j - auto J = i.meshes["J"]; - auto P = i.meshes["P"]; - auto T = i.meshes["T"]; - auto _Ke = i.meshes["Ke"]; - auto _Rho = i.meshes["Rho"]; - - auto Jx = J["x"]; - auto Jy = J["y"]; - auto Jz = J["z"]; - - auto Px = P["x"]; - auto Py = P["y"]; - auto Pz = P["z"]; - - auto Txx = T["xx"]; - auto Tyy = T["yy"]; - auto Tzz = T["zz"]; - auto Tyz = T["yz"]; - auto Tzx = T["zx"]; - auto Txy = T["xy"]; - - auto Rho = _Rho["rho"]; // TODO: bad name.. - auto Ke = _Ke["ke"]; // TODO: bad name.. 
- - size_t gnx = (grid->nx * grid->gpx); - size_t gny = (grid->ny * grid->gpy); - size_t gnz = (grid->nz * grid->gpz); - openPMD::Extent global_extent = {gnx, gny, gnz}; - - openPMD::Datatype datatype = openPMD::determineDatatype(); - openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); - - Jx.resetDataset(dataset); - Jy.resetDataset(dataset); - Jz.resetDataset(dataset); - - Px.resetDataset(dataset); - Py.resetDataset(dataset); - Pz.resetDataset(dataset); - - Txx.resetDataset(dataset); - Tyy.resetDataset(dataset); - Tzz.resetDataset(dataset); - Tyz.resetDataset(dataset); - Tzx.resetDataset(dataset); - Txy.resetDataset(dataset); - - Rho.resetDataset(dataset); - Ke.resetDataset(dataset); - - // TODO: hoist this conversion code, as is it used elsewhere - // Convert rank to local x/y/z - int rx, ry, rz; - UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); - - size_t nx = grid->nx; - size_t ny = grid->ny; - size_t nz = grid->nz; - - // NOTE: this assumes a static mesh decomposition in nx/ny/nz - size_t global_offset_x = (nx) * rx; - size_t global_offset_y = (ny) * ry; - size_t global_offset_z = (nz) * rz; - - openPMD::Offset chunk_offset = {global_offset_x, global_offset_y, global_offset_z}; - openPMD::Extent chunk_extent = {nx, ny, nz}; - - std::cout << "Local offset " << - " x: " << global_offset_x << - " y: " << global_offset_y << - " z: " << global_offset_z << - std::endl; - - std::vector jx_data; - std::vector jy_data; - std::vector jz_data; - - std::vector px_data; - std::vector py_data; - std::vector pz_data; - - std::vector txx_data; - std::vector tyy_data; - std::vector tzz_data; - std::vector tyz_data; - std::vector tzx_data; - std::vector txy_data; - - std::vector rho_data; - std::vector ke_data; - - size_t nv = nx * ny * nz; - - jx_data.resize(nv); - jy_data.resize(nv); - jz_data.resize(nv); - - px_data.resize(nv); - py_data.resize(nv); - pz_data.resize(nv); - - txx_data.resize(nv); - tyy_data.resize(nv); - tzz_data.resize(nv); - 
tyz_data.resize(nv); - tzx_data.resize(nv); - txy_data.resize(nv); - - rho_data.resize(nv); - ke_data.resize(nv); - - // Transpose AoS to SoAs - for (size_t k = 1; k < grid->nz + 1; k++) - { - for (size_t j = 1; j < grid->ny + 1; j++) - { - for (size_t i = 1; i < grid->nx + 1; i++) - { - int local_index = VOXEL(i-1, j-1, k-1, grid->nx-2, grid->ny-2, grid->nz-2); - int global_index = VOXEL(i, j, k, grid->nx, grid->ny, grid->nz); - - jx_data[local_index] = hydro_array->h[global_index].jx; - jy_data[local_index] = hydro_array->h[global_index].jy; - jz_data[local_index] = hydro_array->h[global_index].jz; - - px_data[local_index] = hydro_array->h[global_index].px; - py_data[local_index] = hydro_array->h[global_index].py; - pz_data[local_index] = hydro_array->h[global_index].pz; - - txx_data[local_index] = hydro_array->h[global_index].txx; - tyy_data[local_index] = hydro_array->h[global_index].tyy; - tzz_data[local_index] = hydro_array->h[global_index].tzz; - tyz_data[local_index] = hydro_array->h[global_index].tyz; - tzx_data[local_index] = hydro_array->h[global_index].tzx; - txy_data[local_index] = hydro_array->h[global_index].txy; - - rho_data[local_index] = hydro_array->h[global_index].rho; - ke_data[local_index] = hydro_array->h[global_index].ke; - } - } - } - - Jx.storeChunk( jx_data, chunk_offset, chunk_extent); - Jy.storeChunk( jy_data, chunk_offset, chunk_extent); - Jz.storeChunk( jz_data, chunk_offset, chunk_extent); - - Px.storeChunk( px_data, chunk_offset, chunk_extent); - Py.storeChunk( py_data, chunk_offset, chunk_extent); - Pz.storeChunk( pz_data, chunk_offset, chunk_extent); - - Txx.storeChunk( txx_data, chunk_offset, chunk_extent); - Tyy.storeChunk( tyy_data, chunk_offset, chunk_extent); - Tzz.storeChunk( tzz_data, chunk_offset, chunk_extent); - Tyz.storeChunk( tyz_data, chunk_offset, chunk_extent); - Tzx.storeChunk( tzx_data, chunk_offset, chunk_extent); - Txy.storeChunk( txy_data, chunk_offset, chunk_extent); - - Rho.storeChunk( rho_data, chunk_offset, 
chunk_extent);
-  Ke.storeChunk( ke_data, chunk_offset, chunk_extent);
-
-  series.flush();
+    const int max_chunk = PMD_MAX_IO_CHUNK;
+
+    // Loop over all particles in chunks
+    for (int i = 0; i < np; i += max_chunk) {
+      // We have to be careful as the last chunk may not be full
+      // Find how many are left and do that many
+      size_t to_write = std::min(np - i, max_chunk);
+
+      // Convert the chunk ready to write
+      std::vector<float> x_pos;
+      std::vector<float> x_off;
+      x_pos.resize(to_write);
+      x_off.resize(to_write);
+
+      std::vector<float> y_pos;
+      std::vector<float> y_off;
+      y_pos.resize(to_write);
+      y_off.resize(to_write);
+
+      std::vector<float> z_pos;
+      std::vector<float> z_off;
+      z_pos.resize(to_write);
+      z_off.resize(to_write);
+
+      std::vector<float> ux_pos;
+      ux_pos.resize(to_write);
+
+      std::vector<float> uy_pos;
+      uy_pos.resize(to_write);
+
+      std::vector<float> uz_pos;
+      uz_pos.resize(to_write);
+
+      for (int j = 0; j < to_write; j++) {
+        // TODO: do I need to center the particles?
+        auto &particle = sp->p[i + j];
+
+        x_pos[j] = particle.dx;
+        y_pos[j] = particle.dy;
+        z_pos[j] = particle.dz;
+
+        ux_pos[j] = particle.ux;
+        uy_pos[j] = particle.uy;
+        uz_pos[j] = particle.uz;
+
+        std::array<int, 4> gi = global_particle_index(particle.i, grid, rank);
+        x_off[j] = (float)gi[1];
+        y_off[j] = (float)gi[2];
+        z_off[j] = (float)gi[3];
+      }
+
+      // Base offset plus i to account for chunks
+      auto o = openPMD::Offset{offset + i};
+      auto e = openPMD::Extent{to_write};
+      px.storeChunk(x_pos, o, e);
+      pxo.storeChunk(x_off, o, e);
+
+      py.storeChunk(y_pos, o, e);
+      pyo.storeChunk(y_off, o, e);
+
+      pz.storeChunk(z_pos, o, e);
+      pzo.storeChunk(z_off, o, e);
+
+      ux.storeChunk(ux_pos, o, e);
+      uy.storeChunk(uy_pos, o, e);
+      uz.storeChunk(uz_pos, o, e);
+
+      series.flush();
+    }
+  }
+  void dump_hydro(const char *fbase, int step, hydro_array_t *hydro_array,
+                  species_t *sp, interpolator_array_t *interpolator_array,
+                  grid_t *grid, int ftag) {
+    std::string full_file_name = fbase + file_type;
+
+    std::cout << "OpenPMD dumping hydro to " << full_file_name <<
std::endl;
+
+    // if (series == nullptr) {
+    openPMD::Series series = openPMD::Series(
+        full_file_name, openPMD::AccessType::CREATE, MPI_COMM_WORLD);
+    //}
+
+    auto i = series.iterations[step];
+
+    // TODO: set these
+    i.setTime((float)step);
+    i.setDt(1.0);
+    i.setTimeUnitSI(1.0);
+
+    if (!sp)
+      ERROR(("Invalid species \"%s\"", sp->name));
+
+    // TODO: do we want each backend to have to explicitly call these
+    // manually? Or, as it is common, should we hoist it to the VPIC
+    // call-site
+    clear_hydro_array(hydro_array);
+    accumulate_hydro_p(hydro_array, sp, interpolator_array);
+    synchronize_hydro_array(hydro_array);
+
+    if (!fbase)
+      ERROR(("Invalid filename"));
+
+    if (rank == 0)
+      MESSAGE(("Dumping \"%s\" hydro fields to \"%s\"", sp->name, fbase));
+
+    // Write data
+    // float jx, jy, jz, rho; // Current and charge density => <q v_i f>, <q f>
+    // float px, py, pz, ke;  // Momentum and K.E. density  => <p_i f>, <m c^2 (gamma-1) f>
+    // float txx, tyy, tzz;   // Stress diagonal            => <p_i v_j f>, i==j
+    // float tyz, tzx, txy;   // Stress off-diagonal        => <p_i v_j f>, i!=j
+    auto J = i.meshes["J"];
+    auto P = i.meshes["P"];
+    auto T = i.meshes["T"];
+    auto _Ke = i.meshes["Ke"];
+    auto _Rho = i.meshes["Rho"];
+
+    auto Jx = J["x"];
+    auto Jy = J["y"];
+    auto Jz = J["z"];
+
+    auto Px = P["x"];
+    auto Py = P["y"];
+    auto Pz = P["z"];
+
+    auto Txx = T["xx"];
+    auto Tyy = T["yy"];
+    auto Tzz = T["zz"];
+    auto Tyz = T["yz"];
+    auto Tzx = T["zx"];
+    auto Txy = T["xy"];
+
+    auto Rho = _Rho["rho"]; // TODO: bad name..
+    auto Ke = _Ke["ke"];    // TODO: bad name..
+
+    size_t gnx = (grid->nx * grid->gpx);
+    size_t gny = (grid->ny * grid->gpy);
+    size_t gnz = (grid->nz * grid->gpz);
+    openPMD::Extent global_extent = {gnx, gny, gnz};
+
+    openPMD::Datatype datatype = openPMD::determineDatatype<float>();
+    openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent);
+
+    Jx.resetDataset(dataset);
+    Jy.resetDataset(dataset);
+    Jz.resetDataset(dataset);
+
+    Px.resetDataset(dataset);
+    Py.resetDataset(dataset);
+    Pz.resetDataset(dataset);
+
+    Txx.resetDataset(dataset);
+    Tyy.resetDataset(dataset);
+    Tzz.resetDataset(dataset);
+    Tyz.resetDataset(dataset);
+    Tzx.resetDataset(dataset);
+    Txy.resetDataset(dataset);
+
+    Rho.resetDataset(dataset);
+    Ke.resetDataset(dataset);
+
+    // TODO: hoist this conversion code, as is it used elsewhere
+    // Convert rank to local x/y/z
+    int rx, ry, rz;
+    UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz);
+
+    size_t nx = grid->nx;
+    size_t ny = grid->ny;
+    size_t nz = grid->nz;
+
+    // NOTE: this assumes a static mesh decomposition in nx/ny/nz
+    size_t global_offset_x = (nx)*rx;
+    size_t global_offset_y = (ny)*ry;
+    size_t global_offset_z = (nz)*rz;
+
+    openPMD::Offset chunk_offset = {global_offset_x, global_offset_y,
+                                    global_offset_z};
+    openPMD::Extent chunk_extent = {nx, ny, nz};
+
+    std::cout << "Local offset "
+              << " x: " << global_offset_x << " y: " << global_offset_y
+              << " z: " << global_offset_z << std::endl;
+
+    std::vector<float> jx_data;
+    std::vector<float> jy_data;
+    std::vector<float> jz_data;
+
+    std::vector<float> px_data;
+    std::vector<float> py_data;
+    std::vector<float> pz_data;
+
+    std::vector<float> txx_data;
+    std::vector<float> tyy_data;
+    std::vector<float> tzz_data;
+    std::vector<float> tyz_data;
+    std::vector<float> tzx_data;
+    std::vector<float> txy_data;
+
+    std::vector<float> rho_data;
+    std::vector<float> ke_data;
+
+    size_t nv = nx * ny * nz;
+
+    jx_data.resize(nv);
+    jy_data.resize(nv);
+    jz_data.resize(nv);
+
+    px_data.resize(nv);
+    py_data.resize(nv);
+    pz_data.resize(nv);
+
+    txx_data.resize(nv);
+    tyy_data.resize(nv);
+    tzz_data.resize(nv);
+
tyz_data.resize(nv); + tzx_data.resize(nv); + txy_data.resize(nv); + + rho_data.resize(nv); + ke_data.resize(nv); + + // Transpose AoS to SoAs + for (size_t k = 1; k < grid->nz + 1; k++) { + for (size_t j = 1; j < grid->ny + 1; j++) { + for (size_t i = 1; i < grid->nx + 1; i++) { + int local_index = VOXEL(i - 1, j - 1, k - 1, grid->nx - 2, + grid->ny - 2, grid->nz - 2); + int global_index = VOXEL(i, j, k, grid->nx, grid->ny, grid->nz); + + jx_data[local_index] = hydro_array->h[global_index].jx; + jy_data[local_index] = hydro_array->h[global_index].jy; + jz_data[local_index] = hydro_array->h[global_index].jz; + + px_data[local_index] = hydro_array->h[global_index].px; + py_data[local_index] = hydro_array->h[global_index].py; + pz_data[local_index] = hydro_array->h[global_index].pz; + + txx_data[local_index] = hydro_array->h[global_index].txx; + tyy_data[local_index] = hydro_array->h[global_index].tyy; + tzz_data[local_index] = hydro_array->h[global_index].tzz; + tyz_data[local_index] = hydro_array->h[global_index].tyz; + tzx_data[local_index] = hydro_array->h[global_index].tzx; + txy_data[local_index] = hydro_array->h[global_index].txy; + + rho_data[local_index] = hydro_array->h[global_index].rho; + ke_data[local_index] = hydro_array->h[global_index].ke; } + } + } + + Jx.storeChunk(jx_data, chunk_offset, chunk_extent); + Jy.storeChunk(jy_data, chunk_offset, chunk_extent); + Jz.storeChunk(jz_data, chunk_offset, chunk_extent); + + Px.storeChunk(px_data, chunk_offset, chunk_extent); + Py.storeChunk(py_data, chunk_offset, chunk_extent); + Pz.storeChunk(pz_data, chunk_offset, chunk_extent); + + Txx.storeChunk(txx_data, chunk_offset, chunk_extent); + Tyy.storeChunk(tyy_data, chunk_offset, chunk_extent); + Tzz.storeChunk(tzz_data, chunk_offset, chunk_extent); + Tyz.storeChunk(tyz_data, chunk_offset, chunk_extent); + Tzx.storeChunk(tzx_data, chunk_offset, chunk_extent); + Txy.storeChunk(txy_data, chunk_offset, chunk_extent); + + Rho.storeChunk(rho_data, chunk_offset, 
chunk_extent);
+    Ke.storeChunk(ke_data, chunk_offset, chunk_extent);
+
+    series.flush();
+  }
 };
 
 #endif

From c3a63fc1b5edb6753812c6a6d98fb81f9ccd7121 Mon Sep 17 00:00:00 2001
From: Robert Bird
Date: Tue, 1 Dec 2020 12:03:33 -0700
Subject: [PATCH 95/95] revert change where hdf5 backend stopped tracking
 num_steps

---
 src/vpic/dump.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc
index 65d15910..21804882 100644
--- a/src/vpic/dump.cc
+++ b/src/vpic/dump.cc
@@ -73,6 +73,7 @@ void vpic_simulation::enable_binary_dump() {
 void vpic_simulation::enable_hdf5_dump() {
   std::cout << "Enabling HDF5 IO backend" << std::endl;
   dump_strategy = std::unique_ptr<Dump_Strategy>(new HDF5Dump( rank(), nproc() ));
+  dump_strategy->num_step = num_step;
 }
 #endif